From e983f904682c726d1367ad28b68dba0827a97a6e Mon Sep 17 00:00:00 2001 From: ethanglaser <42726565+ethanglaser@users.noreply.github.com> Date: Mon, 12 Aug 2024 15:40:49 -0700 Subject: [PATCH] MAINT: remove DAAL CL kernels and GPU interfaces (#2816) * MAINT: remove daal CL kernels * removal of non oneapi/ files referencing oneapi/ * remove daal/*/oneapi includes * remove oneapi::internal references * alternative include for onedal knn * removal of logic outside of device.isCpu * small knn fixes * restore include sycl table header * opencl cleanup * remove cl kernels * sycl table removal * remove deviceInfo checks * remove cpp/daal/src/sycl/* * knn fixed * removal of all execution context guard * delete sycl table adapter * oops * swap include * DAAL_SYCL_INTERFACE removal initial * clang * remove daal gpu support checker * swap include * re-add bazel build knn daal deps * clang * remove exec ctx, services/internal/sycl, clean BUILDs * broader include swap * forgot threadcomm * remove bazel circular dependency * clang * restore thread comm * remove all daal SYCL container/dispatching macros * remove python double exit code check * dpc bazel test fixes * restore makefile * more opencl restoration * samples restore * restore .ci/env/apt.sh based on discussion * swap ucapi knn header include * restore docs with opencl * remove interop/common_dpc.hpp from new file * address undefined uint error --- .ci/pipeline/ci.yml | 2 - cpp/daal/BUILD | 24 +- .../algorithm_container_base_batch.h | 5 - .../algorithm_container_base_common.h | 5 - cpp/daal/include/daal_sycl.h | 57 - .../data/internal/numeric_table_sycl.h | 74 - .../data/internal/numeric_table_sycl_csr.h | 669 -- .../internal/numeric_table_sycl_homogen.h | 657 -- .../data/internal/numeric_table_sycl_soa.h | 640 -- cpp/daal/include/services/daal_defines.h | 12 - cpp/daal/include/services/env_detect.h | 19 +- .../internal/aarch64/aarch64_kernel_defines.h | 2 - cpp/daal/include/services/internal/buffer.h | 161 - 
.../services/internal/buffer_impl_sycl.h | 429 - .../services/internal/daal_kernel_defines.h | 8 - .../services/internal/execution_context.h | 188 - .../services/internal/gpu_support_checker.h | 150 - .../internal/riscv64/riscv64_kernel_defines.h | 2 - .../services/internal/sycl/buffer_utils.h | 258 - .../internal/sycl/buffer_utils_sycl.h | 308 - .../internal/sycl/error_handling_sycl.h | 236 - .../internal/sycl/execution_context.h | 458 - .../internal/sycl/execution_context_sycl.h | 308 - .../internal/sycl/kernel_scheduler_sycl.h | 662 -- .../internal/sycl/level_zero_common.h | 38 - .../internal/sycl/level_zero_module_sycl.h | 185 - .../services/internal/sycl/level_zero_types.h | 8191 ----------------- .../internal/sycl/math/blas_executor.h | 309 - .../internal/sycl/math/lapack_executor.h | 164 - .../services/internal/sycl/math/mkl_blas.h | 219 - .../services/internal/sycl/math/mkl_dal.h | 36 - .../internal/sycl/math/mkl_dal_utils.h | 65 - .../services/internal/sycl/math/mkl_lapack.h | 170 - .../internal/sycl/math/reference_axpy.h | 68 - .../internal/sycl/math/reference_gemm.h | 74 - .../internal/sycl/math/reference_lapack.h | 85 - .../services/internal/sycl/math/types.h | 63 - .../include/services/internal/sycl/types.h | 272 - .../services/internal/sycl/types_utils.h | 96 - .../internal/x86_64/x86_64_kernel_defines.h | 13 +- cpp/daal/src/algorithms/classifier/BUILD | 3 +- .../classifier/classifier_predict_fpt.cpp | 19 +- cpp/daal/src/algorithms/covariance/BUILD | 3 +- .../covariance/covariance_container.h | 236 +- ...nce_dense_default_batch_fpt_dispatcher.cpp | 2 +- ...ariance_dense_default_batch_oneapi_fpt.cpp | 44 - ...nse_default_distr_step2_fpt_dispatcher.cpp | 2 +- ...e_dense_default_distr_step2_oneapi_fpt.cpp | 44 - ...ce_dense_default_online_fpt_dispatcher.cpp | 2 +- ...riance_dense_default_online_oneapi_fpt.cpp | 44 - .../covariance/covariance_partialresult.h | 20 +- .../algorithms/covariance/covariance_result.h | 39 +- 
.../oneapi/cl_kernels/covariance_kernels.cl | 101 - .../covariance_dense_batch_oneapi_impl.i | 105 - .../covariance_dense_distr_step2_oneapi.h | 61 - ...covariance_dense_distr_step2_oneapi_impl.i | 215 - .../covariance_dense_online_oneapi_impl.i | 195 - .../oneapi/covariance_kernel_oneapi.h | 69 - .../oneapi/covariance_oneapi_impl.i | 432 - cpp/daal/src/algorithms/dbscan/BUILD | 3 +- .../src/algorithms/dbscan/dbscan_container.h | 35 +- ...can_dense_default_batch_fpt_dispatcher.cpp | 2 +- ...n_dense_default_batch_kernel_ucapi_fpt.cpp | 38 - .../oneapi/cl_kernels/dbscan_cl_kernels.cl | 146 - .../dbscan_dense_default_batch_ucapi_impl.i | 481 - .../dbscan/oneapi/dbscan_kernel_ucapi.h | 82 - cpp/daal/src/algorithms/decision_tree/BUILD | 2 +- cpp/daal/src/algorithms/dtrees/forest/BUILD | 3 +- .../dtrees/forest/classification/BUILD | 3 +- ...on_predict_dense_default_batch_container.h | 28 +- ...ict_dense_default_batch_fpt_dispatcher.cpp | 4 +- ...predict_dense_default_batch_oneapi_fpt.cpp | 45 - .../df_classification_train_container.h | 28 +- ...cation_train_hist_batch_fpt_dispatcher.cpp | 4 +- ...sification_train_hist_batch_oneapi_fpt.cpp | 46 - .../df_batch_classification_kernels.cl | 632 -- ...df_batch_predict_classification_kernels.cl | 199 - ...assification_predict_dense_kernel_oneapi.h | 126 - ...classification_predict_dense_oneapi_impl.i | 580 -- ..._classification_train_hist_kernel_oneapi.h | 182 - ...df_classification_train_hist_oneapi_impl.i | 1218 --- .../df_classification_tree_helper_impl.i | 212 - .../oneapi/cl_kernels/df_common_kernels.cl | 81 - .../df_tree_level_build_helper_kernels.cl | 441 - .../oneapi/df_feature_type_helper_oneapi.h | 140 - .../oneapi/df_feature_type_helper_oneapi.i | 467 - .../df_tree_level_build_helper_oneapi.h | 141 - .../df_tree_level_build_helper_oneapi.i | 724 -- .../algorithms/dtrees/forest/regression/BUILD | 3 +- ...on_predict_dense_default_batch_container.h | 28 +- ...ict_dense_default_batch_fpt_dispatcher.cpp | 4 +- 
...predict_dense_default_batch_oneapi_fpt.cpp | 45 - .../df_regression_train_container.h | 28 +- ...ain_dense_default_batch_fpt_dispatcher.cpp | 4 +- ...ession_train_hist_batch_fpt_dispatcher.cpp | 4 +- ...regression_train_hist_batch_oneapi_fpt.cpp | 46 - .../df_batch_predict_regression_kernels.cl | 113 - .../cl_kernels/df_batch_regression_kernels.cl | 607 -- ...f_regression_predict_dense_kernel_oneapi.h | 109 - .../df_regression_predict_dense_oneapi_impl.i | 366 - .../df_regression_train_hist_kernel_oneapi.h | 181 - .../df_regression_train_hist_oneapi_impl.i | 1196 --- .../oneapi/df_regression_tree_helper_impl.i | 202 - cpp/daal/src/algorithms/dtrees/gbt/BUILD | 3 +- .../dtrees/gbt/classification/BUILD | 3 +- .../oneapi/cl_kernels/gbt_common_kernels.cl | 234 - .../oneapi/gbt_feature_type_helper_oneapi.h | 176 - .../oneapi/gbt_feature_type_helper_oneapi.i | 471 - .../algorithms/dtrees/gbt/regression/BUILD | 5 +- ...ict_dense_default_batch_fpt_dispatcher.cpp | 2 +- .../gbt_regression_train_container.h | 28 +- ...ain_dense_default_batch_fpt_dispatcher.cpp | 2 +- ...n_train_dense_default_batch_oneapi_fpt.cpp | 46 - .../gbt_batch_regression_kernels.cl | 371 - ...gression_train_dense_default_oneapi_impl.i | 1136 --- .../gbt_regression_train_kernel_oneapi.h | 165 - .../src/algorithms/k_nearest_neighbors/BUILD | 3 +- .../bf_knn_classification_model_impl.cpp | 2 +- ...l.h => bf_knn_classification_model_impl.h} | 58 +- .../bf_knn_classification_predict_batch.cpp | 2 +- ...on_predict_dense_default_batch_container.h | 28 +- ...ict_dense_default_batch_fpt_dispatcher.cpp | 4 +- ...f_knn_classification_predict_fpt_ucapi.cpp | 37 - .../bf_knn_classification_predict_kernel.h | 2 +- ...f_knn_classification_predict_kernel_impl.i | 2 +- .../bf_knn_classification_predict_result.h | 38 +- .../bf_knn_classification_train_container.h | 31 +- ...ain_dense_default_batch_fpt_dispatcher.cpp | 4 +- .../bf_knn_classification_train_fpt_ucapi.cpp | 37 - .../bf_knn_classification_train_kernel.h | 
2 +- .../bf_knn_classification_train_kernel_impl.i | 2 +- .../k_nearest_neighbors/bf_knn_impl.i | 2 +- ..._knn_classification_predict_kernel_ucapi.h | 90 - ...classification_predict_kernel_ucapi_impl.i | 637 -- ...bf_knn_classification_train_kernel_ucapi.h | 52 - ...n_classification_train_kernel_ucapi_impl.i | 46 - .../oneapi/cl_kernels/bf_knn_cl_kernels.cl | 102 - cpp/daal/src/algorithms/kernel.h | 18 +- cpp/daal/src/algorithms/kernel_config.h | 9 - cpp/daal/src/algorithms/kernel_function/BUILD | 3 +- .../kernel_function/kernel_function_fpt.cpp | 16 +- .../kernel_function_linear_batch_container.h | 29 +- ...n_linear_csr_fast_batch_fpt_dispatcher.cpp | 2 +- ...ction_linear_csr_fast_batch_oneapi_fpt.cpp | 43 - ...ear_dense_default_batch_fpt_dispatcher.cpp | 2 +- ..._linear_dense_default_batch_oneapi_fpt.cpp | 43 - .../kernel_function_rbf_batch_container.h | 24 +- ...rnel_function_rbf_csr_batch_oneapi_fpt.cpp | 43 - ...tion_rbf_csr_fast_batch_fpt_dispatcher.cpp | 2 +- ...rbf_dense_default_batch_fpt_dispatcher.cpp | 2 +- ...ion_rbf_dense_default_batch_oneapi_fpt.cpp | 43 - .../oneapi/cl_kernels/kernel_function.cl | 67 - .../oneapi/kernel_function_helper_oneapi.h | 145 - ...nel_function_linear_csr_fast_oneapi_impl.i | 128 - ...unction_linear_dense_default_oneapi_impl.i | 122 - .../kernel_function_linear_kernel_oneapi.h | 104 - ...kernel_function_rbf_csr_fast_oneapi_impl.i | 135 - ...l_function_rbf_dense_default_oneapi_impl.i | 128 - .../kernel_function_rbf_kernel_oneapi.h | 118 - cpp/daal/src/algorithms/kernel_inst_arm.h | 25 - cpp/daal/src/algorithms/kernel_inst_riscv64.h | 25 - cpp/daal/src/algorithms/kernel_inst_x86.h | 33 - cpp/daal/src/algorithms/kmeans/BUILD | 3 +- .../src/algorithms/kmeans/kmeans_container.h | 107 +- ...eans_dense_lloyd_base_kernel_ucapi_fpt.cpp | 39 - ...means_dense_lloyd_batch_fpt_dispatcher.cpp | 2 +- ...ans_dense_lloyd_batch_kernel_ucapi_fpt.cpp | 39 - ...dense_lloyd_distr_step1_fpt_dispatcher.cpp | 2 +- 
...nse_lloyd_distr_step1_kernel_ucapi_fpt.cpp | 39 - ...dense_lloyd_distr_step2_fpt_dispatcher.cpp | 2 +- ...nse_lloyd_distr_step2_kernel_ucapi_fpt.cpp | 39 - .../algorithms/kmeans/kmeans_init_container.h | 27 +- ...eans_init_dense_batch_kernel_ucapi_fpt.cpp | 49 - ...nse_deterministic_batch_fpt_dispatcher.cpp | 2 +- ...init_dense_random_batch_fpt_dispatcher.cpp | 2 +- ...ense_random_distr_step1_fpt_dispatcher.cpp | 2 +- ...ense_random_distr_step2_fpt_dispatcher.cpp | 2 +- .../algorithms/kmeans/kmeans_init_result.h | 22 +- .../src/algorithms/kmeans/kmeans_result.h | 36 +- .../oneapi/cl_kernels/kmeans_cl_kernels.cl | 337 - .../kmeans_cl_kernels_distr_steps.cl | 87 - .../cl_kernels/kmeans_init_cl_kernels.cl | 54 - .../kmeans_dense_lloyd_batch_kernel_ucapi.h | 56 - ...eans_dense_lloyd_batch_kernel_ucapi_impl.i | 227 - .../kmeans_dense_lloyd_kernel_base_ucapi.h | 125 - ...means_dense_lloyd_kernel_base_ucapi_impl.i | 563 -- .../kmeans_init_dense_batch_kernel_ucapi.h | 72 - ...means_init_dense_batch_kernel_ucapi_impl.i | 237 - .../kmeans_lloyd_distr_step1_kernel_ucapi.h | 52 - .../kmeans_lloyd_distr_step1_ucapi_impl.i | 246 - .../kmeans_lloyd_distr_step2_kernel_ucapi.h | 63 - .../kmeans_lloyd_distr_step2_ucapi_impl.i | 341 - cpp/daal/src/algorithms/linear_model/BUILD | 3 +- .../linear_model/linear_model_model_fpt.cpp | 13 +- .../linear_model_predict_batch_fpt.cpp | 13 +- .../linear_model_predict_container.h | 26 +- ...ict_dense_default_batch_fpt_dispatcher.cpp | 2 +- ...predict_dense_default_batch_oneapi_fpt.cpp | 42 - ...model_train_normeq_finalize_oneapi_fpt.cpp | 39 - ...r_model_train_normeq_update_oneapi_fpt.cpp | 39 - .../cl_kernel/linear_model_prediction.cl | 45 - .../oneapi/cl_kernel/reduce_results.cl | 43 - ..._predict_dense_default_batch_oneapi_impl.i | 174 - .../linear_model_predict_kernel_oneapi.h | 77 - ..._model_train_normeq_finalize_oneapi_impl.i | 193 - .../linear_model_train_normeq_kernel_oneapi.h | 147 - ...ar_model_train_normeq_update_oneapi_impl.i | 
270 - .../src/algorithms/linear_regression/BUILD | 3 +- .../linear_regression_ne_model_fpt.cpp | 24 +- .../linear_regression_train_container.h | 77 +- ...rain_dense_normeq_batch_fpt_dispatcher.cpp | 2 +- ...on_train_dense_normeq_batch_oneapi_fpt.cpp | 44 - ...n_train_dense_normeq_helper_oneapi_fpt.cpp | 44 - ...ain_dense_normeq_online_fpt_dispatcher.cpp | 3 +- ...n_train_dense_normeq_online_oneapi_fpt.cpp | 44 - .../oneapi/cl_kernel/helper_beta_copy.cl | 53 - ...on_train_dense_normeq_helper_oneapi_impl.i | 107 - ...egression_train_dense_normeq_oneapi_impl.i | 70 - .../linear_regression_train_kernel_oneapi.h | 92 - .../src/algorithms/logistic_regression/BUILD | 3 +- .../logistic_regression_predict_container.h | 28 +- ...ict_dense_default_batch_fpt_dispatcher.cpp | 4 +- ...predict_dense_default_batch_oneapi_fpt.cpp | 42 - .../logistic_regression_train_container.h | 29 +- ...ain_dense_default_batch_fpt_dispatcher.cpp | 3 +- ...n_train_dense_default_batch_oneapi_fpt.cpp | 43 - ...ogistic_regression_training_result_fpt.cpp | 15 +- .../logistic_regression_dense_default.cl | 65 - ..._predict_dense_default_batch_oneapi_impl.i | 258 - ...ogistic_regression_predict_kernel_oneapi.h | 76 - ...gression_train_dense_default_oneapi_impl.i | 184 - .../logistic_regression_train_kernel_oneapi.h | 57 - ..._regression_train_kernel_oneapi_instance.h | 30 - .../src/algorithms/low_order_moments/BUILD | 3 +- .../low_order_moments_container.h | 165 +- ...nts_dense_default_batch_fpt_dispatcher.cpp | 2 +- ...moments_dense_default_batch_oneapi_fpt.cpp | 49 - ...nse_default_distr_step2_fpt_dispatcher.cpp | 3 +- ...s_dense_default_distr_step2_oneapi_fpt.cpp | 49 - ...ts_dense_default_online_fpt_dispatcher.cpp | 2 +- ...oments_dense_default_online_oneapi_fpt.cpp | 49 - .../low_order_moments/moments_batch.h | 43 +- .../low_order_moments/moments_online.h | 64 +- .../low_order_moments_kernels_all.cl | 958 -- .../low_order_moments_kernels_all.h | 906 -- .../low_order_moments_kernels_distr.cl | 252 - 
.../low_order_moments_kernels_distr.h | 248 - .../low_order_moments_batch_oneapi_impl.i | 415 - ...ow_order_moments_distributed_oneapi_impl.i | 479 - .../low_order_moments_kernel_batch_oneapi.h | 153 - ..._order_moments_kernel_distributed_oneapi.h | 191 - .../low_order_moments_kernel_online_oneapi.h | 202 - .../low_order_moments_online_oneapi_impl.i | 561 -- .../src/algorithms/objective_function/BUILD | 1 - .../cl_kernel/objective_function_utils.cl | 144 - .../oneapi/objective_function_utils_oneapi.h | 459 - .../cross_entropy_loss/BUILD | 3 +- ...tropy_loss_dense_default_batch_container.h | 33 +- ...oss_dense_default_batch_fpt_dispatcher.cpp | 4 +- ...y_loss_dense_default_kernel_oneapi_fpt.cpp | 43 - .../cross_entropy_loss_dense_default.cl | 94 - ...entropy_loss_dense_default_kernel_oneapi.h | 108 - ...s_entropy_loss_dense_default_oneapi_impl.i | 497 - .../objective_function/logistic_loss/BUILD | 3 +- ...istic_loss_dense_default_batch_container.h | 33 +- ...oss_dense_default_batch_fpt_dispatcher.cpp | 4 +- ...c_loss_dense_default_kernel_oneapi_fpt.cpp | 44 - .../cl_kernel/logistic_loss_dense_default.cl | 146 - ...ogistic_loss_dense_default_kernel_oneapi.h | 108 - .../logistic_loss_dense_default_oneapi_impl.i | 576 -- ...mse_dense_default_batch_fpt_dispatcher.cpp | 4 +- .../objective_function_types_fpt.cpp | 15 +- ...rad_dense_default_batch_fpt_dispatcher.cpp | 3 +- .../iterative_solver_types_fpt.cpp | 14 +- ...fgs_dense_default_batch_fpt_dispatcher.cpp | 2 +- ...aga_dense_default_batch_fpt_dispatcher.cpp | 2 +- .../algorithms/optimization_solver/sgd/BUILD | 1 - .../oneapi/cl_kernel/sgd_dense_minibatch.cl | 83 - .../sgd/oneapi/sgd_dense_kernel_oneapi.h | 88 - .../oneapi/sgd_dense_minibatch_oneapi_impl.i | 416 - .../sgd/sgd_batch_container.h | 31 +- ...sgd_dense_default_batch_fpt_dispatcher.cpp | 2 +- ...d_dense_minibatch_batch_fpt_dispatcher.cpp | 2 +- .../sgd_dense_minibatch_batch_oneapi_fpt.cpp | 40 - ...gd_dense_momentum_batch_fpt_dispatcher.cpp | 2 +- 
.../optimization_solver/sgd/sgd_types_fpt.cpp | 14 +- cpp/daal/src/algorithms/pca/BUILD | 3 +- .../pca/oneapi/cl_kernels/pca_cl_kernels.cl | 43 - ...pca_dense_correlation_batch_kernel_ucapi.h | 71 - ...ense_correlation_batch_kernel_ucapi_impl.i | 264 - ...ca_dense_correlation_online_kernel_ucapi.h | 68 - ...nse_correlation_online_kernel_ucapi_impl.i | 140 - .../pca_dense_correlation_batch_container.h | 33 +- ...dense_correlation_batch_fpt_dispatcher.cpp | 2 +- ...nse_correlation_batch_kernel_ucapi_fpt.cpp | 39 - ...correlation_distr_step2_fpt_dispatcher.cpp | 2 +- .../pca_dense_correlation_online_container.h | 43 +- ...ense_correlation_online_fpt_dispatcher.cpp | 2 +- ...se_correlation_online_kernel_ucapi_fpt.cpp | 39 - .../pca/pca_partialresult_correlation.h | 32 +- .../algorithms/pca/pca_result_impl_fpt.cpp | 15 +- cpp/daal/src/algorithms/pca/transform/BUILD | 3 +- .../cl_kernels/pca_transform_cl_kernels.cl | 96 - ...pca_transform_dense_default_batch_oneapi.h | 103 - ...ransform_dense_default_batch_oneapi_impl.i | 363 - .../pca/transform/pca_transform_batch_fpt.cpp | 15 +- .../pca/transform/pca_transform_container.h | 29 +- ...orm_dense_default_batch_fpt_dispatcher.cpp | 2 +- ...ansform_dense_default_batch_oneapi_fpt.cpp | 46 - cpp/daal/src/algorithms/svm/BUILD | 3 +- .../svm/oneapi/cl_kernels/svm_kernels.cl | 139 - .../cl_kernels/svm_train_block_smo_oneapi.cl | 233 - .../algorithms/svm/oneapi/svm_helper_oneapi.h | 465 - .../svm/oneapi/svm_predict_kernel_oneapi.h | 65 - .../svm/oneapi/svm_predict_oneapi_impl.i | 269 - .../svm/oneapi/svm_train_cache_oneapi.h | 349 - .../svm/oneapi/svm_train_result_oneapi.h | 306 - .../oneapi/svm_train_thunder_kernel_oneapi.h | 93 - .../oneapi/svm_train_thunder_oneapi_impl.i | 274 - .../svm/oneapi/svm_train_workset_oneapi.h | 228 - cpp/daal/src/algorithms/svm/svm_model_fpt.cpp | 38 +- .../svm/svm_predict_batch_container.h | 23 +- ...ict_dense_default_batch_fpt_dispatcher.cpp | 2 +- ...predict_dense_default_batch_oneapi_fpt.cpp | 43 
- .../svm/svm_train_batch_container.h | 45 +- ...svm_train_thunder_batch_fpt_dispatcher.cpp | 4 +- .../svm_train_thunder_batch_oneapi_fpt.cpp | 43 - .../src/data_management/daal_factory_impl.cpp | 4 - .../src/data_management/numeric_table.cpp | 35 - cpp/daal/src/services/env_detect.cpp | 1 - cpp/daal/src/services/execution_context.cpp | 37 - cpp/daal/src/services/types_utils.cpp | 55 - cpp/daal/src/sycl/blas_gpu.cpp | 160 - cpp/daal/src/sycl/blas_gpu.h | 105 - cpp/daal/src/sycl/cl_kernels/kernel_blas.cl | 123 - .../src/sycl/cl_kernels/kernel_sparse_blas.cl | 93 - cpp/daal/src/sycl/cl_kernels/math.cl | 39 - cpp/daal/src/sycl/cl_kernels/op_reducer.cl | 164 - cpp/daal/src/sycl/cl_kernels/partition.cl | 175 - cpp/daal/src/sycl/cl_kernels/radix_sort.cl | 282 - .../src/sycl/cl_kernels/select_indexed.cl | 241 - cpp/daal/src/sycl/cl_kernels/sum_reducer.cl | 199 - cpp/daal/src/sycl/gpu_support_checker.cpp | 45 - cpp/daal/src/sycl/lapack_gpu.cpp | 99 - cpp/daal/src/sycl/lapack_gpu.h | 74 - cpp/daal/src/sycl/math_service_types.h | 82 - cpp/daal/src/sycl/partition.cpp | 290 - cpp/daal/src/sycl/partition.h | 72 - cpp/daal/src/sycl/reducer.cpp | 265 - cpp/daal/src/sycl/reducer.h | 112 - cpp/daal/src/sycl/select_indexed.cpp | 363 - cpp/daal/src/sycl/select_indexed.h | 161 - cpp/daal/src/sycl/sorter.cpp | 287 - cpp/daal/src/sycl/sorter.h | 69 - cpp/daal/src/sycl/spblas_gpu.cpp | 91 - cpp/daal/src/sycl/spblas_gpu.h | 61 - cpp/daal/src/sycl/sum_reducer.cpp | 257 - .../backend/basic_statistics_interop.hpp | 1 - .../kmeans/backend/gpu/kernels_csr_impl.hpp | 1 - .../gpu/train_kernel_lloyd_dense_dpc.cpp | 1 - .../kmeans/detail/train_init_centroids.hpp | 2 +- .../backend/gpu/compute_kernel_dense_dpc.cpp | 1 - .../compute_kernel_distr_random_dense_dpc.cpp | 1 - .../backend/gpu/compute_kernel_sparse_dpc.cpp | 1 - .../gpu/infer_kernel_brute_force_dpc.cpp | 1 - .../knn/backend/gpu/infer_kernel_impl.hpp | 1 - .../knn/backend/gpu/infer_kernel_impl_dpc.hpp | 1 - 
.../gpu/infer_kernel_impl_dpc_distr.hpp | 1 - .../backend/gpu/infer_kernel_kd_tree_dpc.cpp | 1 - .../backend/gpu/train_kernel_kd_tree_dpc.cpp | 1 - .../dal/algo/knn/backend/model_conversion.hpp | 2 +- .../backend/gpu/infer_kernel_norm_eq_dpc.cpp | 1 - .../gpu/infer_kernel_dense_batch_dpc.cpp | 1 - .../backend/gpu/infer_kernel_sparse_dpc.cpp | 1 - cpp/oneapi/dal/backend/interop/common_dpc.cpp | 74 - cpp/oneapi/dal/backend/interop/common_dpc.hpp | 35 - .../dal/backend/interop/table_conversion.hpp | 50 +- cpp/oneapi/dal/backend/primitives/rng/BUILD | 2 +- .../dal/backend/primitives/selection/BUILD | 3 - .../backend/interop/sycl_table_adapter.hpp | 137 - .../interop/sycl_table_adapter_dpc.cpp | 313 - dev/bazel/daal.bzl | 4 +- dev/bazel/dal.bzl | 1 - dev/bazel/deps/micromkldpc.tpl.BUILD | 1 + .../optimization_solvers/custom_obj_func.h | 1 + makefile | 2 - 386 files changed, 359 insertions(+), 52791 deletions(-) delete mode 100644 cpp/daal/include/daal_sycl.h delete mode 100644 cpp/daal/include/data_management/data/internal/numeric_table_sycl.h delete mode 100755 cpp/daal/include/data_management/data/internal/numeric_table_sycl_csr.h delete mode 100644 cpp/daal/include/data_management/data/internal/numeric_table_sycl_homogen.h delete mode 100644 cpp/daal/include/data_management/data/internal/numeric_table_sycl_soa.h delete mode 100644 cpp/daal/include/services/internal/buffer_impl_sycl.h delete mode 100644 cpp/daal/include/services/internal/execution_context.h delete mode 100644 cpp/daal/include/services/internal/gpu_support_checker.h delete mode 100644 cpp/daal/include/services/internal/sycl/buffer_utils.h delete mode 100644 cpp/daal/include/services/internal/sycl/buffer_utils_sycl.h delete mode 100644 cpp/daal/include/services/internal/sycl/error_handling_sycl.h delete mode 100644 cpp/daal/include/services/internal/sycl/execution_context.h delete mode 100644 cpp/daal/include/services/internal/sycl/execution_context_sycl.h delete mode 100644 
cpp/daal/include/services/internal/sycl/kernel_scheduler_sycl.h delete mode 100644 cpp/daal/include/services/internal/sycl/level_zero_common.h delete mode 100644 cpp/daal/include/services/internal/sycl/level_zero_module_sycl.h delete mode 100644 cpp/daal/include/services/internal/sycl/level_zero_types.h delete mode 100644 cpp/daal/include/services/internal/sycl/math/blas_executor.h delete mode 100644 cpp/daal/include/services/internal/sycl/math/lapack_executor.h delete mode 100644 cpp/daal/include/services/internal/sycl/math/mkl_blas.h delete mode 100644 cpp/daal/include/services/internal/sycl/math/mkl_dal.h delete mode 100644 cpp/daal/include/services/internal/sycl/math/mkl_dal_utils.h delete mode 100644 cpp/daal/include/services/internal/sycl/math/mkl_lapack.h delete mode 100644 cpp/daal/include/services/internal/sycl/math/reference_axpy.h delete mode 100644 cpp/daal/include/services/internal/sycl/math/reference_gemm.h delete mode 100644 cpp/daal/include/services/internal/sycl/math/reference_lapack.h delete mode 100644 cpp/daal/include/services/internal/sycl/math/types.h delete mode 100644 cpp/daal/include/services/internal/sycl/types.h delete mode 100644 cpp/daal/include/services/internal/sycl/types_utils.h delete mode 100644 cpp/daal/src/algorithms/covariance/covariance_dense_default_batch_oneapi_fpt.cpp delete mode 100644 cpp/daal/src/algorithms/covariance/covariance_dense_default_distr_step2_oneapi_fpt.cpp delete mode 100644 cpp/daal/src/algorithms/covariance/covariance_dense_default_online_oneapi_fpt.cpp delete mode 100644 cpp/daal/src/algorithms/covariance/oneapi/cl_kernels/covariance_kernels.cl delete mode 100644 cpp/daal/src/algorithms/covariance/oneapi/covariance_dense_batch_oneapi_impl.i delete mode 100644 cpp/daal/src/algorithms/covariance/oneapi/covariance_dense_distr_step2_oneapi.h delete mode 100644 cpp/daal/src/algorithms/covariance/oneapi/covariance_dense_distr_step2_oneapi_impl.i delete mode 100644 
cpp/daal/src/algorithms/covariance/oneapi/covariance_dense_online_oneapi_impl.i delete mode 100644 cpp/daal/src/algorithms/covariance/oneapi/covariance_kernel_oneapi.h delete mode 100644 cpp/daal/src/algorithms/covariance/oneapi/covariance_oneapi_impl.i delete mode 100755 cpp/daal/src/algorithms/dbscan/dbscan_dense_default_batch_kernel_ucapi_fpt.cpp delete mode 100644 cpp/daal/src/algorithms/dbscan/oneapi/cl_kernels/dbscan_cl_kernels.cl delete mode 100644 cpp/daal/src/algorithms/dbscan/oneapi/dbscan_dense_default_batch_ucapi_impl.i delete mode 100644 cpp/daal/src/algorithms/dbscan/oneapi/dbscan_kernel_ucapi.h delete mode 100644 cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_predict_dense_default_batch_oneapi_fpt.cpp mode change 100755 => 100644 cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_train_hist_batch_fpt_dispatcher.cpp delete mode 100644 cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_train_hist_batch_oneapi_fpt.cpp delete mode 100644 cpp/daal/src/algorithms/dtrees/forest/classification/oneapi/cl_kernels/df_batch_classification_kernels.cl delete mode 100644 cpp/daal/src/algorithms/dtrees/forest/classification/oneapi/cl_kernels/df_batch_predict_classification_kernels.cl delete mode 100644 cpp/daal/src/algorithms/dtrees/forest/classification/oneapi/df_classification_predict_dense_kernel_oneapi.h delete mode 100644 cpp/daal/src/algorithms/dtrees/forest/classification/oneapi/df_classification_predict_dense_oneapi_impl.i delete mode 100644 cpp/daal/src/algorithms/dtrees/forest/classification/oneapi/df_classification_train_hist_kernel_oneapi.h delete mode 100644 cpp/daal/src/algorithms/dtrees/forest/classification/oneapi/df_classification_train_hist_oneapi_impl.i delete mode 100644 cpp/daal/src/algorithms/dtrees/forest/classification/oneapi/df_classification_tree_helper_impl.i delete mode 100644 cpp/daal/src/algorithms/dtrees/forest/oneapi/cl_kernels/df_common_kernels.cl delete mode 100644 
cpp/daal/src/algorithms/dtrees/forest/oneapi/cl_kernels/df_tree_level_build_helper_kernels.cl delete mode 100644 cpp/daal/src/algorithms/dtrees/forest/oneapi/df_feature_type_helper_oneapi.h delete mode 100644 cpp/daal/src/algorithms/dtrees/forest/oneapi/df_feature_type_helper_oneapi.i delete mode 100644 cpp/daal/src/algorithms/dtrees/forest/oneapi/df_tree_level_build_helper_oneapi.h delete mode 100644 cpp/daal/src/algorithms/dtrees/forest/oneapi/df_tree_level_build_helper_oneapi.i delete mode 100644 cpp/daal/src/algorithms/dtrees/forest/regression/df_regression_predict_dense_default_batch_oneapi_fpt.cpp mode change 100755 => 100644 cpp/daal/src/algorithms/dtrees/forest/regression/df_regression_train_dense_default_batch_fpt_dispatcher.cpp mode change 100755 => 100644 cpp/daal/src/algorithms/dtrees/forest/regression/df_regression_train_hist_batch_fpt_dispatcher.cpp delete mode 100644 cpp/daal/src/algorithms/dtrees/forest/regression/df_regression_train_hist_batch_oneapi_fpt.cpp delete mode 100644 cpp/daal/src/algorithms/dtrees/forest/regression/oneapi/cl_kernels/df_batch_predict_regression_kernels.cl delete mode 100644 cpp/daal/src/algorithms/dtrees/forest/regression/oneapi/cl_kernels/df_batch_regression_kernels.cl delete mode 100644 cpp/daal/src/algorithms/dtrees/forest/regression/oneapi/df_regression_predict_dense_kernel_oneapi.h delete mode 100644 cpp/daal/src/algorithms/dtrees/forest/regression/oneapi/df_regression_predict_dense_oneapi_impl.i delete mode 100644 cpp/daal/src/algorithms/dtrees/forest/regression/oneapi/df_regression_train_hist_kernel_oneapi.h delete mode 100644 cpp/daal/src/algorithms/dtrees/forest/regression/oneapi/df_regression_train_hist_oneapi_impl.i delete mode 100644 cpp/daal/src/algorithms/dtrees/forest/regression/oneapi/df_regression_tree_helper_impl.i delete mode 100644 cpp/daal/src/algorithms/dtrees/gbt/oneapi/cl_kernels/gbt_common_kernels.cl delete mode 100644 cpp/daal/src/algorithms/dtrees/gbt/oneapi/gbt_feature_type_helper_oneapi.h 
delete mode 100644 cpp/daal/src/algorithms/dtrees/gbt/oneapi/gbt_feature_type_helper_oneapi.i delete mode 100644 cpp/daal/src/algorithms/dtrees/gbt/regression/gbt_regression_train_dense_default_batch_oneapi_fpt.cpp delete mode 100644 cpp/daal/src/algorithms/dtrees/gbt/regression/oneapi/cl_kernels/gbt_batch_regression_kernels.cl delete mode 100644 cpp/daal/src/algorithms/dtrees/gbt/regression/oneapi/gbt_regression_train_dense_default_oneapi_impl.i delete mode 100644 cpp/daal/src/algorithms/dtrees/gbt/regression/oneapi/gbt_regression_train_kernel_oneapi.h rename cpp/daal/src/algorithms/k_nearest_neighbors/{oneapi/bf_knn_classification_model_ucapi_impl.h => bf_knn_classification_model_impl.h} (50%) mode change 100755 => 100644 cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_predict_dense_default_batch_container.h delete mode 100644 cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_predict_fpt_ucapi.cpp mode change 100755 => 100644 cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_train_container.h delete mode 100644 cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_train_fpt_ucapi.cpp delete mode 100755 cpp/daal/src/algorithms/k_nearest_neighbors/oneapi/bf_knn_classification_predict_kernel_ucapi.h delete mode 100644 cpp/daal/src/algorithms/k_nearest_neighbors/oneapi/bf_knn_classification_predict_kernel_ucapi_impl.i delete mode 100644 cpp/daal/src/algorithms/k_nearest_neighbors/oneapi/bf_knn_classification_train_kernel_ucapi.h delete mode 100644 cpp/daal/src/algorithms/k_nearest_neighbors/oneapi/bf_knn_classification_train_kernel_ucapi_impl.i delete mode 100644 cpp/daal/src/algorithms/k_nearest_neighbors/oneapi/cl_kernels/bf_knn_cl_kernels.cl delete mode 100755 cpp/daal/src/algorithms/kernel_function/kernel_function_linear_csr_fast_batch_oneapi_fpt.cpp delete mode 100644 cpp/daal/src/algorithms/kernel_function/kernel_function_linear_dense_default_batch_oneapi_fpt.cpp delete mode 100755 
cpp/daal/src/algorithms/kernel_function/kernel_function_rbf_csr_batch_oneapi_fpt.cpp delete mode 100644 cpp/daal/src/algorithms/kernel_function/kernel_function_rbf_dense_default_batch_oneapi_fpt.cpp delete mode 100644 cpp/daal/src/algorithms/kernel_function/oneapi/cl_kernels/kernel_function.cl delete mode 100755 cpp/daal/src/algorithms/kernel_function/oneapi/kernel_function_helper_oneapi.h delete mode 100755 cpp/daal/src/algorithms/kernel_function/oneapi/kernel_function_linear_csr_fast_oneapi_impl.i delete mode 100644 cpp/daal/src/algorithms/kernel_function/oneapi/kernel_function_linear_dense_default_oneapi_impl.i delete mode 100644 cpp/daal/src/algorithms/kernel_function/oneapi/kernel_function_linear_kernel_oneapi.h delete mode 100644 cpp/daal/src/algorithms/kernel_function/oneapi/kernel_function_rbf_csr_fast_oneapi_impl.i delete mode 100644 cpp/daal/src/algorithms/kernel_function/oneapi/kernel_function_rbf_dense_default_oneapi_impl.i delete mode 100644 cpp/daal/src/algorithms/kernel_function/oneapi/kernel_function_rbf_kernel_oneapi.h delete mode 100755 cpp/daal/src/algorithms/kmeans/kmeans_dense_lloyd_base_kernel_ucapi_fpt.cpp delete mode 100644 cpp/daal/src/algorithms/kmeans/kmeans_dense_lloyd_batch_kernel_ucapi_fpt.cpp delete mode 100755 cpp/daal/src/algorithms/kmeans/kmeans_dense_lloyd_distr_step1_kernel_ucapi_fpt.cpp delete mode 100755 cpp/daal/src/algorithms/kmeans/kmeans_dense_lloyd_distr_step2_kernel_ucapi_fpt.cpp delete mode 100644 cpp/daal/src/algorithms/kmeans/kmeans_init_dense_batch_kernel_ucapi_fpt.cpp delete mode 100644 cpp/daal/src/algorithms/kmeans/oneapi/cl_kernels/kmeans_cl_kernels.cl delete mode 100644 cpp/daal/src/algorithms/kmeans/oneapi/cl_kernels/kmeans_cl_kernels_distr_steps.cl delete mode 100644 cpp/daal/src/algorithms/kmeans/oneapi/cl_kernels/kmeans_init_cl_kernels.cl delete mode 100644 cpp/daal/src/algorithms/kmeans/oneapi/kmeans_dense_lloyd_batch_kernel_ucapi.h delete mode 100644 
cpp/daal/src/algorithms/kmeans/oneapi/kmeans_dense_lloyd_batch_kernel_ucapi_impl.i delete mode 100644 cpp/daal/src/algorithms/kmeans/oneapi/kmeans_dense_lloyd_kernel_base_ucapi.h delete mode 100644 cpp/daal/src/algorithms/kmeans/oneapi/kmeans_dense_lloyd_kernel_base_ucapi_impl.i delete mode 100644 cpp/daal/src/algorithms/kmeans/oneapi/kmeans_init_dense_batch_kernel_ucapi.h delete mode 100644 cpp/daal/src/algorithms/kmeans/oneapi/kmeans_init_dense_batch_kernel_ucapi_impl.i delete mode 100644 cpp/daal/src/algorithms/kmeans/oneapi/kmeans_lloyd_distr_step1_kernel_ucapi.h delete mode 100644 cpp/daal/src/algorithms/kmeans/oneapi/kmeans_lloyd_distr_step1_ucapi_impl.i delete mode 100644 cpp/daal/src/algorithms/kmeans/oneapi/kmeans_lloyd_distr_step2_kernel_ucapi.h delete mode 100644 cpp/daal/src/algorithms/kmeans/oneapi/kmeans_lloyd_distr_step2_ucapi_impl.i delete mode 100644 cpp/daal/src/algorithms/linear_model/linear_model_predict_dense_default_batch_oneapi_fpt.cpp delete mode 100644 cpp/daal/src/algorithms/linear_model/linear_model_train_normeq_finalize_oneapi_fpt.cpp delete mode 100644 cpp/daal/src/algorithms/linear_model/linear_model_train_normeq_update_oneapi_fpt.cpp delete mode 100644 cpp/daal/src/algorithms/linear_model/oneapi/cl_kernel/linear_model_prediction.cl delete mode 100644 cpp/daal/src/algorithms/linear_model/oneapi/cl_kernel/reduce_results.cl delete mode 100644 cpp/daal/src/algorithms/linear_model/oneapi/linear_model_predict_dense_default_batch_oneapi_impl.i delete mode 100644 cpp/daal/src/algorithms/linear_model/oneapi/linear_model_predict_kernel_oneapi.h delete mode 100644 cpp/daal/src/algorithms/linear_model/oneapi/linear_model_train_normeq_finalize_oneapi_impl.i delete mode 100644 cpp/daal/src/algorithms/linear_model/oneapi/linear_model_train_normeq_kernel_oneapi.h delete mode 100644 cpp/daal/src/algorithms/linear_model/oneapi/linear_model_train_normeq_update_oneapi_impl.i delete mode 100644 
cpp/daal/src/algorithms/linear_regression/linear_regression_train_dense_normeq_batch_oneapi_fpt.cpp delete mode 100644 cpp/daal/src/algorithms/linear_regression/linear_regression_train_dense_normeq_helper_oneapi_fpt.cpp delete mode 100644 cpp/daal/src/algorithms/linear_regression/linear_regression_train_dense_normeq_online_oneapi_fpt.cpp delete mode 100644 cpp/daal/src/algorithms/linear_regression/oneapi/cl_kernel/helper_beta_copy.cl delete mode 100644 cpp/daal/src/algorithms/linear_regression/oneapi/linear_regression_train_dense_normeq_helper_oneapi_impl.i delete mode 100644 cpp/daal/src/algorithms/linear_regression/oneapi/linear_regression_train_dense_normeq_oneapi_impl.i delete mode 100644 cpp/daal/src/algorithms/linear_regression/oneapi/linear_regression_train_kernel_oneapi.h delete mode 100755 cpp/daal/src/algorithms/logistic_regression/logistic_regression_predict_dense_default_batch_oneapi_fpt.cpp delete mode 100644 cpp/daal/src/algorithms/logistic_regression/logistic_regression_train_dense_default_batch_oneapi_fpt.cpp delete mode 100644 cpp/daal/src/algorithms/logistic_regression/oneapi/cl_kernel/logistic_regression_dense_default.cl delete mode 100644 cpp/daal/src/algorithms/logistic_regression/oneapi/logistic_regression_predict_dense_default_batch_oneapi_impl.i delete mode 100644 cpp/daal/src/algorithms/logistic_regression/oneapi/logistic_regression_predict_kernel_oneapi.h delete mode 100644 cpp/daal/src/algorithms/logistic_regression/oneapi/logistic_regression_train_dense_default_oneapi_impl.i delete mode 100644 cpp/daal/src/algorithms/logistic_regression/oneapi/logistic_regression_train_kernel_oneapi.h delete mode 100644 cpp/daal/src/algorithms/logistic_regression/oneapi/logistic_regression_train_kernel_oneapi_instance.h delete mode 100644 cpp/daal/src/algorithms/low_order_moments/low_order_moments_dense_default_batch_oneapi_fpt.cpp delete mode 100644 cpp/daal/src/algorithms/low_order_moments/low_order_moments_dense_default_distr_step2_oneapi_fpt.cpp 
delete mode 100644 cpp/daal/src/algorithms/low_order_moments/low_order_moments_dense_default_online_oneapi_fpt.cpp delete mode 100644 cpp/daal/src/algorithms/low_order_moments/oneapi/cl_kernels/low_order_moments_kernels_all.cl delete mode 100644 cpp/daal/src/algorithms/low_order_moments/oneapi/cl_kernels/low_order_moments_kernels_all.h delete mode 100644 cpp/daal/src/algorithms/low_order_moments/oneapi/cl_kernels/low_order_moments_kernels_distr.cl delete mode 100644 cpp/daal/src/algorithms/low_order_moments/oneapi/cl_kernels/low_order_moments_kernels_distr.h delete mode 100644 cpp/daal/src/algorithms/low_order_moments/oneapi/low_order_moments_batch_oneapi_impl.i delete mode 100644 cpp/daal/src/algorithms/low_order_moments/oneapi/low_order_moments_distributed_oneapi_impl.i delete mode 100644 cpp/daal/src/algorithms/low_order_moments/oneapi/low_order_moments_kernel_batch_oneapi.h delete mode 100644 cpp/daal/src/algorithms/low_order_moments/oneapi/low_order_moments_kernel_distributed_oneapi.h delete mode 100644 cpp/daal/src/algorithms/low_order_moments/oneapi/low_order_moments_kernel_online_oneapi.h delete mode 100644 cpp/daal/src/algorithms/low_order_moments/oneapi/low_order_moments_online_oneapi_impl.i delete mode 100644 cpp/daal/src/algorithms/objective_function/common/oneapi/cl_kernel/objective_function_utils.cl delete mode 100644 cpp/daal/src/algorithms/objective_function/common/oneapi/objective_function_utils_oneapi.h delete mode 100644 cpp/daal/src/algorithms/objective_function/cross_entropy_loss/cross_entropy_loss_dense_default_kernel_oneapi_fpt.cpp delete mode 100644 cpp/daal/src/algorithms/objective_function/cross_entropy_loss/oneapi/cl_kernel/cross_entropy_loss_dense_default.cl delete mode 100644 cpp/daal/src/algorithms/objective_function/cross_entropy_loss/oneapi/cross_entropy_loss_dense_default_kernel_oneapi.h delete mode 100644 cpp/daal/src/algorithms/objective_function/cross_entropy_loss/oneapi/cross_entropy_loss_dense_default_oneapi_impl.i delete mode 
100644 cpp/daal/src/algorithms/objective_function/logistic_loss/logistic_loss_dense_default_kernel_oneapi_fpt.cpp delete mode 100644 cpp/daal/src/algorithms/objective_function/logistic_loss/oneapi/cl_kernel/logistic_loss_dense_default.cl delete mode 100644 cpp/daal/src/algorithms/objective_function/logistic_loss/oneapi/logistic_loss_dense_default_kernel_oneapi.h delete mode 100644 cpp/daal/src/algorithms/objective_function/logistic_loss/oneapi/logistic_loss_dense_default_oneapi_impl.i delete mode 100644 cpp/daal/src/algorithms/optimization_solver/sgd/oneapi/cl_kernel/sgd_dense_minibatch.cl delete mode 100644 cpp/daal/src/algorithms/optimization_solver/sgd/oneapi/sgd_dense_kernel_oneapi.h delete mode 100644 cpp/daal/src/algorithms/optimization_solver/sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i delete mode 100644 cpp/daal/src/algorithms/optimization_solver/sgd/sgd_dense_minibatch_batch_oneapi_fpt.cpp delete mode 100644 cpp/daal/src/algorithms/pca/oneapi/cl_kernels/pca_cl_kernels.cl delete mode 100644 cpp/daal/src/algorithms/pca/oneapi/pca_dense_correlation_batch_kernel_ucapi.h delete mode 100644 cpp/daal/src/algorithms/pca/oneapi/pca_dense_correlation_batch_kernel_ucapi_impl.i delete mode 100644 cpp/daal/src/algorithms/pca/oneapi/pca_dense_correlation_online_kernel_ucapi.h delete mode 100644 cpp/daal/src/algorithms/pca/oneapi/pca_dense_correlation_online_kernel_ucapi_impl.i delete mode 100644 cpp/daal/src/algorithms/pca/pca_dense_correlation_batch_kernel_ucapi_fpt.cpp delete mode 100644 cpp/daal/src/algorithms/pca/pca_dense_correlation_online_kernel_ucapi_fpt.cpp delete mode 100644 cpp/daal/src/algorithms/pca/transform/oneapi/cl_kernels/pca_transform_cl_kernels.cl delete mode 100644 cpp/daal/src/algorithms/pca/transform/oneapi/pca_transform_dense_default_batch_oneapi.h delete mode 100644 cpp/daal/src/algorithms/pca/transform/oneapi/pca_transform_dense_default_batch_oneapi_impl.i delete mode 100644 
cpp/daal/src/algorithms/pca/transform/pca_transform_dense_default_batch_oneapi_fpt.cpp delete mode 100644 cpp/daal/src/algorithms/svm/oneapi/cl_kernels/svm_kernels.cl delete mode 100755 cpp/daal/src/algorithms/svm/oneapi/cl_kernels/svm_train_block_smo_oneapi.cl delete mode 100644 cpp/daal/src/algorithms/svm/oneapi/svm_helper_oneapi.h delete mode 100644 cpp/daal/src/algorithms/svm/oneapi/svm_predict_kernel_oneapi.h delete mode 100644 cpp/daal/src/algorithms/svm/oneapi/svm_predict_oneapi_impl.i delete mode 100644 cpp/daal/src/algorithms/svm/oneapi/svm_train_cache_oneapi.h delete mode 100644 cpp/daal/src/algorithms/svm/oneapi/svm_train_result_oneapi.h delete mode 100644 cpp/daal/src/algorithms/svm/oneapi/svm_train_thunder_kernel_oneapi.h delete mode 100644 cpp/daal/src/algorithms/svm/oneapi/svm_train_thunder_oneapi_impl.i delete mode 100644 cpp/daal/src/algorithms/svm/oneapi/svm_train_workset_oneapi.h delete mode 100644 cpp/daal/src/algorithms/svm/svm_predict_dense_default_batch_oneapi_fpt.cpp delete mode 100644 cpp/daal/src/algorithms/svm/svm_train_thunder_batch_oneapi_fpt.cpp delete mode 100644 cpp/daal/src/services/execution_context.cpp delete mode 100644 cpp/daal/src/services/types_utils.cpp delete mode 100644 cpp/daal/src/sycl/blas_gpu.cpp delete mode 100644 cpp/daal/src/sycl/blas_gpu.h delete mode 100644 cpp/daal/src/sycl/cl_kernels/kernel_blas.cl delete mode 100755 cpp/daal/src/sycl/cl_kernels/kernel_sparse_blas.cl delete mode 100644 cpp/daal/src/sycl/cl_kernels/math.cl delete mode 100644 cpp/daal/src/sycl/cl_kernels/op_reducer.cl delete mode 100755 cpp/daal/src/sycl/cl_kernels/partition.cl delete mode 100644 cpp/daal/src/sycl/cl_kernels/radix_sort.cl delete mode 100644 cpp/daal/src/sycl/cl_kernels/select_indexed.cl delete mode 100644 cpp/daal/src/sycl/cl_kernels/sum_reducer.cl delete mode 100644 cpp/daal/src/sycl/gpu_support_checker.cpp delete mode 100644 cpp/daal/src/sycl/lapack_gpu.cpp delete mode 100644 cpp/daal/src/sycl/lapack_gpu.h delete mode 100644 
cpp/daal/src/sycl/math_service_types.h delete mode 100644 cpp/daal/src/sycl/partition.cpp delete mode 100755 cpp/daal/src/sycl/partition.h delete mode 100644 cpp/daal/src/sycl/reducer.cpp delete mode 100644 cpp/daal/src/sycl/reducer.h delete mode 100644 cpp/daal/src/sycl/select_indexed.cpp delete mode 100755 cpp/daal/src/sycl/select_indexed.h delete mode 100644 cpp/daal/src/sycl/sorter.cpp delete mode 100644 cpp/daal/src/sycl/sorter.h delete mode 100755 cpp/daal/src/sycl/spblas_gpu.cpp delete mode 100755 cpp/daal/src/sycl/spblas_gpu.h delete mode 100644 cpp/daal/src/sycl/sum_reducer.cpp delete mode 100644 cpp/oneapi/dal/backend/interop/common_dpc.cpp delete mode 100644 cpp/oneapi/dal/backend/interop/common_dpc.hpp delete mode 100644 cpp/oneapi/dal/table/backend/interop/sycl_table_adapter.hpp delete mode 100644 cpp/oneapi/dal/table/backend/interop/sycl_table_adapter_dpc.cpp diff --git a/.ci/pipeline/ci.yml b/.ci/pipeline/ci.yml index 788839d68b6..188788fa2ed 100755 --- a/.ci/pipeline/ci.yml +++ b/.ci/pipeline/ci.yml @@ -623,8 +623,6 @@ jobs: ret_code=0 python -m sklearnex sklearnex/tests/run_examples.py ret_code=$(($ret_code + $?)) - python -m sklearnex sklearnex/tests/daal4py/sycl/sklearn_sycl.py - ret_code=$(($ret_code + $?)) exit $ret_code displayName: sklearnex examples - script: | diff --git a/cpp/daal/BUILD b/cpp/daal/BUILD index 4f15e2b0c3a..0cbf8a50316 100644 --- a/cpp/daal/BUILD +++ b/cpp/daal/BUILD @@ -122,23 +122,6 @@ daal_module( ], ) -daal_module( - name = "sycl", - hdrs = glob(["src/sycl/**/*.h", "src/sycl/**/*.cl"]), - srcs = glob(["src/sycl/**/*.cpp"]), - deps = select({ - "@config//:backend_ref": [ - ":services", - "@onedal//cpp/daal/src/algorithms/engines:kernel", - ], - "//conditions:default": [ - ":services", - "@onedal//cpp/daal/src/algorithms/engines:kernel", - "@micromkl_dpc//:headers", - ], - }), -) - daal_module( name = "threading_tbb", srcs = glob(["src/threading/**/*.cpp"]), @@ -164,6 +147,13 @@ daal_module( }), ) +daal_module( + name = 
"engines", + deps = [ + "@onedal//cpp/daal/src/algorithms/engines:kernel", + ], +) + daal_module( name = "core", deps = [ diff --git a/cpp/daal/include/algorithms/algorithm_container_base_batch.h b/cpp/daal/include/algorithms/algorithm_container_base_batch.h index c14cdadce8f..94d8fb000cb 100644 --- a/cpp/daal/include/algorithms/algorithm_container_base_batch.h +++ b/cpp/daal/include/algorithms/algorithm_container_base_batch.h @@ -27,8 +27,6 @@ #include "services/daal_memory.h" #include "services/internal/daal_kernel_defines.h" -#include "services/internal/gpu_support_checker.h" -#include "services/internal/execution_context.h" namespace daal { @@ -174,9 +172,6 @@ class DAAL_EXPORT AlgorithmDispatchContainersetArguments(this->_in, this->_res, this->_par, this->_hpar); return _cntr->compute(); } diff --git a/cpp/daal/include/algorithms/algorithm_container_base_common.h b/cpp/daal/include/algorithms/algorithm_container_base_common.h index e07160402d0..e4d99546c35 100644 --- a/cpp/daal/include/algorithms/algorithm_container_base_common.h +++ b/cpp/daal/include/algorithms/algorithm_container_base_common.h @@ -29,8 +29,6 @@ #include "algorithms/algorithm_container_base.h" #include "services/error_handling.h" -#include "services/internal/gpu_support_checker.h" -#include "services/internal/execution_context.h" namespace daal { @@ -78,9 +76,6 @@ class DAAL_EXPORT AlgorithmDispatchContainer : public AlgorithmContainerImplsetArguments(this->_in, this->_pres, this->_par, this->_hpar); return _cntr->compute(); } diff --git a/cpp/daal/include/daal_sycl.h b/cpp/daal/include/daal_sycl.h deleted file mode 100644 index ad9ec6cea3b..00000000000 --- a/cpp/daal/include/daal_sycl.h +++ /dev/null @@ -1,57 +0,0 @@ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#ifndef __DAAL_SYCL_H__ -#define __DAAL_SYCL_H__ - -#include - -#define DAAL_SYCL_INTERFACE -#include "daal.h" - -#include "services/internal/execution_context.h" -#include "data_management/data/internal/numeric_table_sycl_homogen.h" -#include "data_management/data/internal/numeric_table_sycl_soa.h" -#include "data_management/data/internal/numeric_table_sycl_csr.h" - -namespace daal -{ -namespace services -{ -using services::internal::Buffer; -using services::internal::ExecutionContext; -using services::internal::SyclExecutionContext; -using services::internal::CpuExecutionContext; - -} // namespace services -} // namespace daal - -namespace daal -{ -namespace data_management -{ -using data_management::internal::SyclNumericTable; -using data_management::internal::SyclNumericTablePtr; -using data_management::internal::SyclHomogenNumericTable; -using data_management::internal::SyclSOANumericTable; -using data_management::internal::SyclSOANumericTablePtr; -using data_management::internal::SyclCSRNumericTable; -using data_management::internal::SyclCSRNumericTablePtr; - -} // namespace data_management -} // namespace daal - -#endif diff --git a/cpp/daal/include/data_management/data/internal/numeric_table_sycl.h b/cpp/daal/include/data_management/data/internal/numeric_table_sycl.h deleted file mode 100644 index e458449b9cb..00000000000 --- a/cpp/daal/include/data_management/data/internal/numeric_table_sycl.h +++ /dev/null @@ -1,74 +0,0 @@ -/* file: numeric_table_sycl.h */ 
-/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#ifndef __SYCL_NUMERIC_TABLE_H__ -#define __SYCL_NUMERIC_TABLE_H__ - -#include "data_management/data/numeric_table.h" - -namespace daal -{ -namespace data_management -{ -namespace internal -{ -namespace interface1 -{ -/** - * @ingroup sycl - * @{ - */ - -/** - * - * \brief Base class for all numeric tables designed to work with SYCL* runtime. - * These tables avoid unnecessary data transfer between devices. 
- */ -class DAAL_EXPORT SyclNumericTable : public NumericTable -{ -public: - DAAL_CAST_OPERATOR(SyclNumericTable) - -protected: - explicit SyclNumericTable(size_t nColumns, size_t nRows, DictionaryIface::FeaturesEqual featuresEqual, services::Status & st) - : NumericTable(nColumns, nRows, featuresEqual, st) - {} - - explicit SyclNumericTable(size_t nColumns, size_t nRows, DictionaryIface::FeaturesEqual featuresEqual) - : NumericTable(nColumns, nRows, featuresEqual) - {} - - explicit SyclNumericTable(NumericTableDictionaryPtr ddict, services::Status & st) : NumericTable(ddict, st) {} - - virtual ~SyclNumericTable() {} -}; -typedef services::SharedPtr SyclNumericTablePtr; -typedef services::SharedPtr SyclNumericTableConstPtr; - -/** @} */ - -} // namespace interface1 - -using interface1::SyclNumericTable; -using interface1::SyclNumericTablePtr; -using interface1::SyclNumericTableConstPtr; - -} // namespace internal -} // namespace data_management -} // namespace daal - -#endif diff --git a/cpp/daal/include/data_management/data/internal/numeric_table_sycl_csr.h b/cpp/daal/include/data_management/data/internal/numeric_table_sycl_csr.h deleted file mode 100755 index 2caf2f0835d..00000000000 --- a/cpp/daal/include/data_management/data/internal/numeric_table_sycl_csr.h +++ /dev/null @@ -1,669 +0,0 @@ -/* file: numeric_table_sycl_csr.h */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of a compressed sparse row (CSR) numeric table. -//-- -*/ - -#ifndef __SYCL_CSR_NUMERIC_TABLE_H__ -#define __SYCL_CSR_NUMERIC_TABLE_H__ - -#include "services/base.h" -#include "data_management/data/numeric_table.h" -#include "data_management/data/csr_numeric_table.h" -#include "data_management/data/data_serialize.h" -#include "data_management/data/internal/conversion.h" -#include "services/internal/sycl/buffer_utils.h" - -namespace daal -{ -namespace data_management -{ -namespace internal -{ -namespace interface1 -{ -/** - * @ingroup sycl - * @{ - */ - -/** - * - * \brief Class that provides methods to access data stored in the CSR layout. - * Each array is represented by SYCL* buffer. - */ -class DAAL_EXPORT SyclCSRNumericTable : public SyclNumericTable, public CSRNumericTableIface -{ -public: - DECLARE_SERIALIZABLE_TAG() - DECLARE_SERIALIZABLE_IMPL() - - DAAL_CAST_OPERATOR(SyclCSRNumericTable) - - /** - * Constructs SYCL CSR numeric table with user-allocated memory - * \tparam DataType Type of values in the Numeric Table - * \param[in] bufferData Buffer of values in the CSR layout. Let ptr_size denote the size of an array ptr - * \param[in] bufferColIndices Buffer of column indices in the CSR layout. Values of indices are determined by the index base - * \param[in] bufferRowOffsets Buffer of row indices in the CSR layout. Size of the array is nrow+1. The first element is 0/1 - * in zero-/one-based indexing. 
The last element is ptr_size+0/1 in zero-/one-based indexing - * \param[in] nColumns Number of columns in the corresponding dense table - * \param[in] nRows Number of rows in the corresponding dense table - * \param[in] indexing Indexing scheme used to access data in the CSR layout - * \param[out] stat Status of the numeric table construction - * \return SYCL CSR numeric table with user-allocated memory - * \note Present version of Intel(R) oneAPI Data Analytics Library supports 1-based indexing only - */ - template - static services::SharedPtr create(const services::internal::Buffer & bufferData, - const services::internal::Buffer & bufferColIndices, - const services::internal::Buffer & bufferRowOffsets, size_t nColumns, size_t nRows, - CSRIndexing indexing = oneBased, services::Status * stat = NULL) - { - DAAL_DEFAULT_CREATE_IMPL_EX(SyclCSRNumericTable, bufferData, bufferColIndices, bufferRowOffsets, nColumns, nRows, indexing); - } - - virtual ~SyclCSRNumericTable() { freeDataMemoryImpl(); } - - virtual services::Status resize(size_t nrows) DAAL_C11_OVERRIDE { return setNumberOfRowsImpl(nrows); } - - /** - * Returns buffers to a data set stored in the CSR layout - * \param[out] values Buffer of values in the CSR layout - * \param[out] colIndices Buffer of column indices in the CSR layout - * \param[out] rowOffsets Buffer of row indices in the CSR layout - */ - template - services::Status getArrays(services::internal::Buffer & values, services::internal::Buffer & colIndices, - services::internal::Buffer & rowOffsets) const - { - values = _values.get(); - colIndices = _colIndices; - rowOffsets = _rowOffsets; - return services::Status(); - } - /** - * Sets a buffers to a CSR data set - * \param[in] values Buffer of values in the CSR layout - * \param[in] colIndices Buffer of column indices in the CSR layout - * \param[in] rowOffsets Buffer of row indices in the CSR layout - * \param[in] indexing The indexing scheme for access to data in the CSR layout - */ - 
template - services::Status setArrays(const services::internal::Buffer & values, const services::internal::Buffer & colIndices, - const services::internal::Buffer & rowOffsets, CSRIndexing indexing = oneBased) - { - freeDataMemoryImpl(); - _values = values; - _colIndices = colIndices; - _rowOffsets = rowOffsets; - _indexing = indexing; - _dataSize = values.size(); - - if (values && colIndices && rowOffsets) - { - _memStatus = userAllocated; - } - return services::Status(); - } - - virtual services::Status getBlockOfRows(size_t vector_idx, size_t vector_num, ReadWriteMode rwflag, - BlockDescriptor & block) DAAL_C11_OVERRIDE - { - if (isCpuTable()) - { - return _cpuTable->getBlockOfRows(vector_idx, vector_num, rwflag, block); - } - return getTBlock(vector_idx, vector_num, rwflag, block); - } - virtual services::Status getBlockOfRows(size_t vector_idx, size_t vector_num, ReadWriteMode rwflag, - BlockDescriptor & block) DAAL_C11_OVERRIDE - { - if (isCpuTable()) - { - return _cpuTable->getBlockOfRows(vector_idx, vector_num, rwflag, block); - } - - return getTBlock(vector_idx, vector_num, rwflag, block); - } - virtual services::Status getBlockOfRows(size_t vector_idx, size_t vector_num, ReadWriteMode rwflag, - BlockDescriptor & block) DAAL_C11_OVERRIDE - { - if (isCpuTable()) - { - return _cpuTable->getBlockOfRows(vector_idx, vector_num, rwflag, block); - } - - return getTBlock(vector_idx, vector_num, rwflag, block); - } - - virtual services::Status releaseBlockOfRows(BlockDescriptor & block) DAAL_C11_OVERRIDE - { - if (isCpuTable()) - { - return _cpuTable->releaseBlockOfRows(block); - } - - return releaseTBlock(block); - } - virtual services::Status releaseBlockOfRows(BlockDescriptor & block) DAAL_C11_OVERRIDE - { - if (isCpuTable()) - { - return _cpuTable->releaseBlockOfRows(block); - } - - return releaseTBlock(block); - } - virtual services::Status releaseBlockOfRows(BlockDescriptor & block) DAAL_C11_OVERRIDE - { - if (isCpuTable()) - { - return 
_cpuTable->releaseBlockOfRows(block); - } - - return releaseTBlock(block); - } - - virtual services::Status getBlockOfColumnValues(size_t feature_idx, size_t vector_idx, size_t value_num, ReadWriteMode rwflag, - BlockDescriptor & block) DAAL_C11_OVERRIDE - { - if (isCpuTable()) - { - return _cpuTable->getBlockOfColumnValues(feature_idx, vector_idx, value_num, rwflag, block); - } - - return getTFeature(feature_idx, vector_idx, value_num, rwflag, block); - } - virtual services::Status getBlockOfColumnValues(size_t feature_idx, size_t vector_idx, size_t value_num, ReadWriteMode rwflag, - BlockDescriptor & block) DAAL_C11_OVERRIDE - { - if (isCpuTable()) - { - return _cpuTable->getBlockOfColumnValues(feature_idx, vector_idx, value_num, rwflag, block); - } - - return getTFeature(feature_idx, vector_idx, value_num, rwflag, block); - } - virtual services::Status getBlockOfColumnValues(size_t feature_idx, size_t vector_idx, size_t value_num, ReadWriteMode rwflag, - BlockDescriptor & block) DAAL_C11_OVERRIDE - { - if (isCpuTable()) - { - return _cpuTable->getBlockOfColumnValues(feature_idx, vector_idx, value_num, rwflag, block); - } - - return getTFeature(feature_idx, vector_idx, value_num, rwflag, block); - } - - virtual services::Status releaseBlockOfColumnValues(BlockDescriptor & block) DAAL_C11_OVERRIDE - { - if (isCpuTable()) - { - return _cpuTable->releaseBlockOfColumnValues(block); - } - - return releaseTFeature(block); - } - virtual services::Status releaseBlockOfColumnValues(BlockDescriptor & block) DAAL_C11_OVERRIDE - { - if (isCpuTable()) - { - return _cpuTable->releaseBlockOfColumnValues(block); - } - - return releaseTFeature(block); - } - virtual services::Status releaseBlockOfColumnValues(BlockDescriptor & block) DAAL_C11_OVERRIDE - { - if (isCpuTable()) - { - return _cpuTable->releaseBlockOfColumnValues(block); - } - - return releaseTFeature(block); - } - - virtual services::Status getSparseBlock(size_t vector_idx, size_t vector_num, ReadWriteMode rwflag, - 
CSRBlockDescriptor & block) DAAL_C11_OVERRIDE - { - if (isCpuTable()) - { - return _cpuTable->getSparseBlock(vector_idx, vector_num, rwflag, block); - } - - return getSparseTBlock(vector_idx, vector_num, rwflag, block); - } - virtual services::Status getSparseBlock(size_t vector_idx, size_t vector_num, ReadWriteMode rwflag, - CSRBlockDescriptor & block) DAAL_C11_OVERRIDE - { - if (isCpuTable()) - { - return _cpuTable->getSparseBlock(vector_idx, vector_num, rwflag, block); - } - - return getSparseTBlock(vector_idx, vector_num, rwflag, block); - } - virtual services::Status getSparseBlock(size_t vector_idx, size_t vector_num, ReadWriteMode rwflag, - CSRBlockDescriptor & block) DAAL_C11_OVERRIDE - { - if (isCpuTable()) - { - return _cpuTable->getSparseBlock(vector_idx, vector_num, rwflag, block); - } - - return getSparseTBlock(vector_idx, vector_num, rwflag, block); - } - - virtual services::Status releaseSparseBlock(CSRBlockDescriptor & block) DAAL_C11_OVERRIDE - { - if (isCpuTable()) - { - return _cpuTable->releaseSparseBlock(block); - } - - return releaseSparseTBlock(block); - } - virtual services::Status releaseSparseBlock(CSRBlockDescriptor & block) DAAL_C11_OVERRIDE - { - if (isCpuTable()) - { - return _cpuTable->releaseSparseBlock(block); - } - - return releaseSparseTBlock(block); - } - virtual services::Status releaseSparseBlock(CSRBlockDescriptor & block) DAAL_C11_OVERRIDE - { - if (isCpuTable()) - { - return _cpuTable->releaseSparseBlock(block); - } - - return releaseSparseTBlock(block); - } - - /** - * Allocates memory for a data set - * \param[in] dataSize Number of non-zero values - * \param[in] type Memory type - */ - using daal::data_management::interface1::NumericTableIface::allocateDataMemory; - - services::Status allocateDataMemory(size_t dataSize, daal::MemType /*type*/ = daal::dram) - { - if (isCpuTable()) - { - return _cpuTable->allocateDataMemory(dataSize); - } - - using namespace services::internal::sycl; - - services::Status status; - auto & 
context = services::internal::getDefaultContext(); - _dataSize = dataSize; - freeDataMemoryImpl(); - size_t nrow = getNumberOfRows(); - - if (nrow == 0) - { - return services::Status(services::ErrorIncorrectNumberOfObservations); - } - - const NumericTableFeature & f = (*_ddict)[0]; - _values = allocateByNumericTableFeature(f, dataSize, status); - DAAL_CHECK_STATUS_VAR(status); - _colIndicesU = context.allocate(services::internal::sycl::TypeIds::id(), dataSize, status); - DAAL_CHECK_STATUS_VAR(status); - _rowOffsetsU = context.allocate(services::internal::sycl::TypeIds::id(), (nrow + 1), status); - DAAL_CHECK_STATUS_VAR(status); - - services::throwIfPossible(status); - DAAL_CHECK_STATUS_VAR(status); - - _colIndices = _colIndicesU.template get(); - _rowOffsets = _rowOffsetsU.template get(); - DAAL_ASSERT(dataSize == _colIndices.size()); - - _memStatus = internallyAllocated; - services::throwIfPossible(status); - return status; - } - - /** - * Returns the indexing scheme for access to data in the CSR layout - * \return CSR layout indexing - */ - CSRIndexing getCSRIndexing() const { return _indexing; } - - /** - * \copydoc NumericTableIface::check - */ - virtual services::Status check(const char * description, bool checkDataAllocation = true) const DAAL_C11_OVERRIDE - { - services::Status s; - if (_indexing != oneBased) - { - return services::Status(services::Error::create(services::ErrorUnsupportedCSRIndexing, services::ArgumentName, description)); - } - - return services::Status(); - } - -protected: - inline bool isCpuTable() const { return (bool)_cpuTable; } - - static bool isCpuContext() { return services::internal::getDefaultContext().getInfoDevice().isCpu; } - -protected: - NumericTableFeature _defaultFeature; - CSRIndexing _indexing; - size_t _dataSize; - - services::internal::sycl::UniversalBuffer _values; - services::internal::sycl::UniversalBuffer _colIndicesU; - services::internal::sycl::UniversalBuffer _rowOffsetsU; - services::internal::Buffer 
_colIndices; - services::internal::Buffer _rowOffsets; - - CSRNumericTablePtr _cpuTable; - - services::Status allocateDataMemoryImpl(daal::MemType /*type*/ = daal::dram) DAAL_C11_OVERRIDE - { - return services::Status(services::ErrorMethodNotSupported); - } - - void freeDataMemoryImpl() DAAL_C11_OVERRIDE - { - _values = services::internal::sycl::UniversalBuffer(); - _colIndices.reset(); - _rowOffsets.reset(); - _memStatus = notAllocated; - } - - /** \private */ - template - services::Status serialImpl(Archive * archive) - { - using namespace services::internal::sycl; - services::Status status = SyclNumericTable::serialImpl(archive); - - size_t dataSize = 0; - if (!onDeserialize) - { - dataSize = getDataSize(); - } - archive->set(dataSize); - - if (onDeserialize) - { - if (isCpuTable()) - { - DAAL_CHECK_STATUS(status, _cpuTable->allocateDataMemory(dataSize)); - } - else - { - DAAL_CHECK_STATUS(status, allocateDataMemory(dataSize)); - } - } - - size_t nfeat = getNumberOfColumns(); - size_t nobs = getNumberOfRows(); - - if (nfeat > 0) - { - NumericTableFeature & f = (*_ddict)[0]; - if (isCpuTable()) - { - char * data = NULL; - size_t * colIndices = NULL; - size_t * rowOffsets = NULL; - - _cpuTable->getArrays(&data, &colIndices, &rowOffsets); - archive->set(data, dataSize * f.typeSize); - archive->set(colIndices, dataSize); - archive->set(rowOffsets, nobs + 1); - } - else - { - const auto accessMode = onDeserialize ? 
data_management::writeOnly : data_management::readOnly; - - services::SharedPtr hostColIndices = _colIndices.toHost(accessMode, status); - DAAL_CHECK_STATUS_VAR(status); - services::SharedPtr hostRowOffsets = _rowOffsets.toHost(accessMode, status); - DAAL_CHECK_STATUS_VAR(status); - - BufferHostReinterpreter reinterpreter(_values, accessMode, dataSize); - TypeDispatcher::dispatch(_values.type(), reinterpreter, status); - DAAL_CHECK_STATUS_VAR(status); - services::SharedPtr charPtr = reinterpreter.getResult(); - - archive->set(charPtr.get(), dataSize * f.typeSize); - archive->set(hostColIndices.get(), dataSize); - archive->set(hostRowOffsets.get(), nobs + 1); - } - } - return status; - } - -public: - virtual size_t getDataSize() DAAL_C11_OVERRIDE - { - if (isCpuTable()) - { - return _cpuTable->getDataSize(); - } - - return _dataSize; - } - -protected: - template - SyclCSRNumericTable(const services::internal::Buffer & bufferData, const services::internal::Buffer & bufferColIndices, - const services::internal::Buffer & bufferRowOffsets, size_t nColumns, size_t nRows, CSRIndexing indexing, - services::Status & st) - : SyclNumericTable(nColumns, nRows, DictionaryIface::equal, st), _indexing(indexing) - { - _layout = csrArray; - _dataSize = bufferData.size(); - _defaultFeature.setType(); - st |= _ddict->setAllFeatures(_defaultFeature); - - if (bufferData.size() != bufferColIndices.size()) - { - st |= services::Error::create(services::ErrorIncorrectSizeOfArray); - services::throwIfPossible(st); - return; - } - - if (bufferRowOffsets.size() != nRows + 1 && _dataSize) - { - st |= services::Error::create(services::ErrorIncorrectNumberOfRows); - services::throwIfPossible(st); - return; - } - - if (isCpuContext()) - { - if (!bufferData.size() && !bufferColIndices.size() && !bufferRowOffsets.size()) - { - _cpuTable = CSRNumericTable::create(NULL, NULL, NULL, nColumns, nRows, indexing, &st); - } - else - { - const services::SharedPtr hostData = 
bufferData.toHost(ReadWriteMode::readOnly, st); - const services::SharedPtr hostColIndices = bufferColIndices.toHost(ReadWriteMode::readOnly, st); - const services::SharedPtr hostRowOffsets = bufferRowOffsets.toHost(ReadWriteMode::readOnly, st); - if (!st) - { - services::throwIfPossible(st); - return; - } - - _cpuTable = CSRNumericTable::create(hostData, hostColIndices, hostRowOffsets, nColumns, nRows, indexing, &st); - } - - return; - } - if (_dataSize) - { - st |= setArrays(bufferData, bufferColIndices, bufferRowOffsets, indexing); - } - } - - template - services::Status getTBlock(size_t idx, size_t nrows, int rwFlag, BlockDescriptor & block) - { - return services::throwIfPossible(services::ErrorMethodNotImplemented); - } - - template - services::Status releaseTBlock(BlockDescriptor & block) - { - return services::throwIfPossible(services::ErrorMethodNotImplemented); - } - - template - services::Status getTFeature(size_t feat_idx, size_t idx, size_t nrows, int rwFlag, BlockDescriptor & block) - { - return services::throwIfPossible(services::ErrorMethodNotImplemented); - } - - template - services::Status releaseTFeature(BlockDescriptor & block) - { - return services::throwIfPossible(services::ErrorMethodNotImplemented); - } - - template - services::Status getSparseTBlock(size_t idx, size_t nrows, int rwFlag, CSRBlockDescriptor & block) - { - using namespace services::internal::sycl; - - size_t ncols = getNumberOfColumns(); - size_t nobs = getNumberOfRows(); - block.setDetails(ncols, idx, rwFlag); - - if (idx >= nobs) - { - block.resizeValuesBuffer(0); - return services::Status(); - } - - nrows = (idx + nrows < nobs) ? 
nrows : nobs - idx; - - services::Status st; - - block.setRowIndicesBuffer(_rowOffsets); - - size_t offset = 0; - size_t datasize = _dataSize; - if (idx == 0) - { - block.setRowIndicesBuffer(_rowOffsets); - } - else - { - services::internal::sycl::UniversalBuffer rowOffsetsNew = - services::internal::getDefaultContext().allocate(services::internal::sycl::TypeIds::id(), (nrows + 1), st); - DAAL_CHECK_STATUS_VAR(st); - services::SharedPtr hostRowOffsetsNew = rowOffsetsNew.get().toHost(ReadWriteMode::writeOnly, st); - DAAL_CHECK_STATUS_VAR(st); - const services::SharedPtr hostRowOffsets = _rowOffsets.toHost(ReadWriteMode::readOnly, st); - DAAL_CHECK_STATUS_VAR(st); - - size_t * rowOffsetsNewPtr = hostRowOffsetsNew.get(); - const size_t * rowOffsetsPtr = hostRowOffsets.get(); - - if (rowOffsetsNewPtr == NULL || rowOffsetsPtr == NULL) - { - return services::Status(services::ErrorNullPtr); - } - - rowOffsetsNewPtr[0] = 1; - for (size_t i = 0; i < nrows; ++i) - { - const size_t nNonZeroValuesInRow = rowOffsetsPtr[idx + i + 1] - rowOffsetsPtr[idx + i]; - rowOffsetsNewPtr[i + 1] = rowOffsetsNewPtr[i] + nNonZeroValuesInRow; - } - offset = rowOffsetsPtr[idx] - rowOffsetsPtr[0]; - datasize = rowOffsetsNewPtr[nrows + 1] - rowOffsetsNewPtr[1]; - block.setRowIndicesBuffer(rowOffsetsNew.get()); - } - - BufferConverterTo converter(_values, offset, datasize); - TypeDispatcher::dispatch(_values.type(), converter, st); - DAAL_CHECK_STATUS_VAR(st); - - services::internal::Buffer valuesBuffer = converter.getResult(); - block.setValuesBuffer(valuesBuffer); - block.setColumnIndicesBuffer(_colIndices.getSubBuffer(offset, datasize, st)); - - return st; - } - - template - services::Status releaseSparseTBlock(CSRBlockDescriptor & block) - { - using namespace services::internal::sycl; - - if (block.getRWFlag() & (int)writeOnly) - { - NumericTableFeature & f = (*_ddict)[0]; - const int indexType = f.indexType; - - if (data_management::features::DAAL_OTHER_T == indexType && 
features::internal::getIndexNumType() != indexType) - { - block.reset(); - return services::Status(services::ErrorDataTypeNotSupported); - } - - if (features::internal::getIndexNumType() != indexType) - { - services::Status st; - auto uniBuffer = _values; - BufferConverterFrom converter(block.getBlockValuesBuffer(), uniBuffer, block.getRowsOffset(), block.getNumberOfRows()); - TypeDispatcher::dispatch(uniBuffer.type(), converter, st); - DAAL_CHECK_STATUS_VAR(st); - - _values = converter.getResult(); - } - } - block.reset(); - return services::Status(); - } - - virtual services::Status setNumberOfColumnsImpl(size_t ncol) DAAL_C11_OVERRIDE - { - _ddict->setNumberOfFeatures(ncol); - _ddict->setAllFeatures(_defaultFeature); - return services::Status(); - } -}; -typedef services::SharedPtr SyclCSRNumericTablePtr; -/** @} */ -} // namespace interface1 -using interface1::SyclCSRNumericTable; -using interface1::SyclCSRNumericTablePtr; - -} // namespace internal -} // namespace data_management -} // namespace daal -#endif diff --git a/cpp/daal/include/data_management/data/internal/numeric_table_sycl_homogen.h b/cpp/daal/include/data_management/data/internal/numeric_table_sycl_homogen.h deleted file mode 100644 index 1d361c31915..00000000000 --- a/cpp/daal/include/data_management/data/internal/numeric_table_sycl_homogen.h +++ /dev/null @@ -1,657 +0,0 @@ -/* file: numeric_table_sycl_homogen.h */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#ifndef __SYCL_HOMOGEN_NUMERIC_TABLE_H__ -#define __SYCL_HOMOGEN_NUMERIC_TABLE_H__ - -#ifdef DAAL_SYCL_INTERFACE - #include -#endif - -#include "data_management/data/internal/numeric_table_sycl.h" -#include "data_management/data/internal/conversion.h" -#include "data_management/data/homogen_numeric_table.h" -#include "services/internal/execution_context.h" - -namespace daal -{ -namespace data_management -{ -namespace internal -{ -namespace interface1 -{ -/** - * @ingroup sycl - * @{ - */ - -/** - * - * \brief Class that provides methods to access data stored as a one-dimentional SYCL* buffer. - * Table rows contain feature vectors, and columns contain values of individual features. - * \tparam DataType Defines the underlying data type that describes a Numeric Table - */ -template -class DAAL_EXPORT SyclHomogenNumericTable : public SyclNumericTable -{ -public: - DECLARE_SERIALIZABLE_TAG() - DECLARE_SERIALIZABLE_IMPL() - - DAAL_CAST_OPERATOR(SyclHomogenNumericTable) - -public: - /** - * Constructs a Numeric Table with buffer object - * \param[in] buffer Buffer with a homogeneous data set - * \param[in] nColumns Number of columns in the table - * \param[in] nRows Number of rows in the table - * \param[out] stat Status of the numeric table construction - * \return Numeric table with user-allocated memory - */ - static services::SharedPtr > create(const services::internal::Buffer & buffer, size_t nColumns = 0, - size_t nRows = 0, services::Status * stat = NULL) - { - DAAL_DEFAULT_CREATE_TEMPLATE_IMPL_EX(SyclHomogenNumericTable, DataType, DictionaryIface::notEqual, buffer, nColumns, nRows); - } - -#ifdef DAAL_SYCL_INTERFACE_USM - static services::SharedPtr > create(const services::SharedPtr & usmData, size_t nColumns, - size_t nRows, const ::sycl::queue & queue, services::Status * 
stat = NULL) - { - const size_t bufferSize = nColumns * nRows; - - // multiplication overflow check is done in the constructor. - // its not a safety problem to postpone this check since services::internal::Buffer() constructor - // do not perform any data allocations in case of input usm data - we can create it even with wrong bufferSize - - services::Status localStatus; - services::internal::Buffer buffer(usmData, bufferSize, queue, localStatus); - services::internal::tryAssignStatusAndThrow(stat, localStatus); - DAAL_CHECK_STATUS_RETURN_IF_FAIL(localStatus, services::SharedPtr >()); - - return create(buffer, nColumns, nRows, stat); - } -#endif - -#ifdef DAAL_SYCL_INTERFACE_USM - static services::SharedPtr > create(DataType * usmData, size_t nColumns, size_t nRows, - const ::sycl::queue & queue, services::Status * stat = NULL) - { - const auto overflow_status = checkSizeOverflow(nRows, nColumns); - if (!overflow_status) - { - services::throwIfPossible(overflow_status); - DAAL_CHECK_COND_ERROR(stat, *stat, overflow_status); - return services::SharedPtr >(); - } - const size_t bufferSize = nColumns * nRows; - - services::Status localStatus; - services::internal::Buffer buffer(usmData, bufferSize, queue, localStatus); - services::internal::tryAssignStatusAndThrow(stat, localStatus); - DAAL_CHECK_STATUS_RETURN_IF_FAIL(localStatus, services::SharedPtr >()); - - return create(buffer, nColumns, nRows, stat); - } -#endif - - /** - * Constructs a Numeric Table - * \param[in] nColumns Number of columns in the table - * \param[in] nRows Number of rows in the table - * \param[in] memoryAllocationFlag Flag that controls internal memory allocation for data in the numeric table - * \param[out] stat Status of the numeric table construction - * \return Numeric table with user-allocated memory - */ - static services::SharedPtr > create(size_t nColumns, size_t nRows, AllocationFlag memoryAllocationFlag, - services::Status * stat = NULL) - { - 
DAAL_DEFAULT_CREATE_TEMPLATE_IMPL_EX(SyclHomogenNumericTable, DataType, DictionaryIface::notEqual, nColumns, nRows, memoryAllocationFlag); - } - - /** - * Constructs a Numeric Table with memory allocation controlled via a flag and fills the table with a constant - * \param[in] nColumns Number of columns in the table - * \param[in] nRows Number of rows in the table - * \param[in] memoryAllocationFlag Flag that controls internal memory allocation for data in the numeric table - * \param[in] constValue Constant to initialize entries of the homogeneous numeric table - * \param[out] stat Status of the numeric table construction - * \return Numeric table initialized with a constant - */ - static services::SharedPtr > create(size_t nColumns, size_t nRows, AllocationFlag memoryAllocationFlag, - const DataType & constValue, services::Status * stat = NULL) - { - DAAL_DEFAULT_CREATE_TEMPLATE_IMPL_EX(SyclHomogenNumericTable, DataType, DictionaryIface::notEqual, nColumns, nRows, memoryAllocationFlag, - constValue); - } - - SyclHomogenNumericTable() : SyclNumericTable(0, 0, DictionaryIface::notEqual) {} - - ~SyclHomogenNumericTable() DAAL_C11_OVERRIDE - { - freeDataMemoryImpl(); - } - - services::Status getBlockOfRows(size_t vector_idx, size_t vector_num, ReadWriteMode rwflag, BlockDescriptor & block) DAAL_C11_OVERRIDE - { - return getTBlock(vector_idx, vector_num, rwflag, block); - } - - services::Status getBlockOfRows(size_t vector_idx, size_t vector_num, ReadWriteMode rwflag, BlockDescriptor & block) DAAL_C11_OVERRIDE - { - return getTBlock(vector_idx, vector_num, rwflag, block); - } - - services::Status getBlockOfRows(size_t vector_idx, size_t vector_num, ReadWriteMode rwflag, BlockDescriptor & block) DAAL_C11_OVERRIDE - { - return getTBlock(vector_idx, vector_num, rwflag, block); - } - - services::Status releaseBlockOfRows(BlockDescriptor & block) DAAL_C11_OVERRIDE - { - return releaseTBlock(block); - } - - services::Status releaseBlockOfRows(BlockDescriptor & block) 
DAAL_C11_OVERRIDE - { - return releaseTBlock(block); - } - - services::Status releaseBlockOfRows(BlockDescriptor & block) DAAL_C11_OVERRIDE - { - return releaseTBlock(block); - } - - services::Status getBlockOfColumnValues(size_t feature_idx, size_t vector_idx, size_t value_num, ReadWriteMode rwflag, - BlockDescriptor & block) DAAL_C11_OVERRIDE - { - return getTFeature(feature_idx, vector_idx, value_num, rwflag, block); - } - - services::Status getBlockOfColumnValues(size_t feature_idx, size_t vector_idx, size_t value_num, ReadWriteMode rwflag, - BlockDescriptor & block) DAAL_C11_OVERRIDE - { - return getTFeature(feature_idx, vector_idx, value_num, rwflag, block); - } - - services::Status getBlockOfColumnValues(size_t feature_idx, size_t vector_idx, size_t value_num, ReadWriteMode rwflag, - BlockDescriptor & block) DAAL_C11_OVERRIDE - { - return getTFeature(feature_idx, vector_idx, value_num, rwflag, block); - } - - services::Status releaseBlockOfColumnValues(BlockDescriptor & block) DAAL_C11_OVERRIDE - { - return releaseTFeature(block); - } - - services::Status releaseBlockOfColumnValues(BlockDescriptor & block) DAAL_C11_OVERRIDE - { - return releaseTFeature(block); - } - - services::Status releaseBlockOfColumnValues(BlockDescriptor & block) DAAL_C11_OVERRIDE - { - return releaseTFeature(block); - } - - services::Status assign(float value) DAAL_C11_OVERRIDE - { - return assignImpl(value); - } - - services::Status assign(double value) DAAL_C11_OVERRIDE - { - return assignImpl(value); - } - - services::Status assign(int value) DAAL_C11_OVERRIDE - { - return assignImpl(value); - } - -protected: - SyclHomogenNumericTable(DictionaryIface::FeaturesEqual featuresEqual, size_t nColumns, size_t nRows, services::Status & st) - : SyclNumericTable(nColumns, nRows, featuresEqual, st) - { - _layout = NumericTableIface::aos; - - NumericTableFeature df; - df.setType(); - st |= _ddict->setAllFeatures(df); - services::throwIfPossible(st); - } - - 
SyclHomogenNumericTable(DictionaryIface::FeaturesEqual featuresEqual, const services::internal::Buffer & buffer, size_t nColumns, - size_t nRows, services::Status & st) - : SyclHomogenNumericTable(featuresEqual, nColumns, nRows, st) - { - st |= checkSizeOverflow(nRows, nColumns); - services::throwIfPossible(st); - - if (nColumns * nRows > buffer.size()) - { - st |= services::Error::create(services::ErrorIncorrectSizeOfArray, services::Row, "Buffer size is not enough to represent the table"); - services::throwIfPossible(st); - } - - if (st) - { - _buffer = buffer; - _memStatus = userAllocated; - } - } - - SyclHomogenNumericTable(DictionaryIface::FeaturesEqual featuresEqual, size_t nColumns, size_t nRows, - NumericTable::AllocationFlag memoryAllocationFlag, services::Status & st) - : SyclHomogenNumericTable(featuresEqual, nColumns, nRows, st) - { - if (memoryAllocationFlag == NumericTableIface::doAllocate) - { - st |= allocateDataMemoryImpl(); - } - } - - SyclHomogenNumericTable(DictionaryIface::FeaturesEqual featuresEqual, size_t nColumns, size_t nRows, - NumericTable::AllocationFlag memoryAllocationFlag, const DataType & constValue, services::Status & st) - : SyclHomogenNumericTable(featuresEqual, nColumns, nRows, memoryAllocationFlag, st) - { - st |= assignImpl(constValue); - } - - services::Status allocateDataMemoryImpl(daal::MemType type = daal::dram) DAAL_C11_OVERRIDE - { - if (type != daal::dram) - { - return services::throwIfPossible(services::ErrorIncorrectParameter); - } - - services::Status status; - - freeDataMemoryImpl(); - - if (!getNumberOfRows() || !getNumberOfColumns()) - { - return status; - } - - if (isCpuContext()) - { - status |= allocateDataMemoryOnCpu(); - DAAL_CHECK_STATUS_VAR(status); - } - else - { - status |= checkSizeOverflow(getNumberOfColumns(), getNumberOfRows()); - if (!status) return services::throwIfPossible(status); - - const size_t size = getNumberOfColumns() * getNumberOfRows(); - const auto universalBuffer = - 
services::internal::getDefaultContext().allocate(services::internal::sycl::TypeIds::id(), size, status); - - if (!status) return services::throwIfPossible(status); - - _buffer = universalBuffer.template get(); - } - - _memStatus = internallyAllocated; - return status; - } - - void freeDataMemoryImpl() DAAL_C11_OVERRIDE - { - _buffer.reset(); - _cpuTable.reset(); - _memStatus = notAllocated; - } - - services::Status setNumberOfColumnsImpl(size_t ncol) DAAL_C11_OVERRIDE - { - services::Status status; - - if (isCpuTable()) - { - status |= _cpuTable->setNumberOfColumns(ncol); - if (!status) return services::throwIfPossible(status); - } - - if (_ddict->getNumberOfFeatures() != ncol) - { - status |= _ddict->resetDictionary(); - if (!status) return services::throwIfPossible(status); - - status |= _ddict->setNumberOfFeatures(ncol); - if (!status) return services::throwIfPossible(status); - - NumericTableFeature df; - df.setType(); - status |= _ddict->setAllFeatures(df); - if (!status) return services::throwIfPossible(status); - } - - return status; - } - - template - services::Status serialImpl(Archive * archive) - { - auto st = NumericTable::serialImpl(archive); - DAAL_CHECK_STATUS_VAR(st); - - if (onDeserialize) - { - st |= allocateDataMemoryImpl(); - DAAL_CHECK_STATUS_VAR(st); - } - - const size_t size = getNumberOfColumns() * getNumberOfRows(); - // overflow checks done in constructors and allocateDataMemoryImpl() method - - if (isCpuTable()) - { - archive->set(_cpuTable->getArray(), size); - } - else - { - const auto hostData = _buffer.toHost(onDeserialize ? 
data_management::writeOnly : data_management::readOnly, st); - if (!st) return services::throwIfPossible(st); - - archive->set(hostData.get(), size); - } - - return st; - } - - template - services::Status assignImpl(T value) - { - services::Status status; - - if (_memStatus == notAllocated) - { - status |= services::Status(services::ErrorEmptyHomogenNumericTable); - return services::throwIfPossible(status); - } - - if (isCpuTable()) - { - return _cpuTable->assign(value); - } - - services::internal::getDefaultContext().fill(_buffer, (double)value, status); - return services::throwIfPossible(status); - } - -private: - static services::Status checkSizeOverflow(size_t nRows, size_t nCols) - { - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, nRows, nCols); - return services::Status(); - } - - static services::Status checkOffsetOverflow(size_t size, size_t offset) - { - DAAL_OVERFLOW_CHECK_BY_ADDING(size_t, size, offset); - return services::Status(); - } - - template - struct BufferIO - { - static services::Status read(const services::internal::Buffer & buffer, BlockDescriptor & block, size_t nRows, size_t nCols) - { - DAAL_ASSERT(buffer.size() == nRows * nCols); - services::Status status; - - if (!block.resizeBuffer(nCols, nRows)) - { - return services::throwIfPossible(services::ErrorMemoryAllocationFailed); - } - - auto hostPtr = buffer.toHost(data_management::readOnly, status); - if (!status) return services::throwIfPossible(status); - - internal::VectorUpCast()(nRows * nCols, hostPtr.get(), block.getBlockPtr()); - - return status; - } - - static services::Status write(services::internal::Buffer buffer, const BlockDescriptor & block, size_t nRows, size_t nCols) - { - services::Status status; - - DAAL_ASSERT(block.getNumberOfRows() == nRows); - DAAL_ASSERT(block.getNumberOfColumns() == nCols); - DAAL_ASSERT(buffer.size() == nRows * nCols); - - auto hostPtr = buffer.toHost(data_management::writeOnly, status); - if (!status) return services::throwIfPossible(status); - - 
if (!block.getBlockPtr()) - { - return services::throwIfPossible(services::ErrorNullPtr); - } - - internal::VectorDownCast()(nRows * nCols, block.getBlockPtr(), hostPtr.get()); - - return status; - } - }; - - template - struct BufferIO - { - static services::Status read(const services::internal::Buffer & buffer, BlockDescriptor & block, size_t nRows, size_t nCols) - { - DAAL_ASSERT(buffer.size() == nRows * nCols); - - block.setBuffer(buffer, nCols, nRows); - return services::Status(); - } - - static services::Status write(services::internal::Buffer buffer, const BlockDescriptor & block, size_t nRows, size_t nCols) - { - // The case when user calls block.setBuffer() on their side is not supported - // SYCL have no API to check that two buffers or subbuffers point to the same memory. - // Use of block.setBuffer() should be reviewed manually in the algorithms - return services::Status(); - } - }; - - services::internal::Buffer getSubBuffer(size_t rowOffset, size_t nRows, services::Status & st) - { - DAAL_ASSERT(rowOffset < getNumberOfRows()); - DAAL_ASSERT(nRows <= getNumberOfRows()); - - const size_t nCols = getNumberOfColumns(); - const size_t offset = rowOffset * nCols; - const size_t size = nRows * nCols; - - // Checks on offset+size correctness are done in getTBlock(), releaseTBlock() functions - - if (size == _buffer.size()) - { - return _buffer; - } - services::internal::Buffer subBuffer = _buffer.getSubBuffer(offset, size, st); - services::throwIfPossible(st); - - return subBuffer; - } - - template - services::Status getTBlock(size_t rowOffset, size_t nRowsBlockDesired, ReadWriteMode rwFlag, BlockDescriptor & block) - { - if (isCpuTable()) - { - return _cpuTable->getBlockOfRows(rowOffset, nRowsBlockDesired, rwFlag, block); - } - - services::Status status; - - const size_t nRows = getNumberOfRows(); - const size_t nCols = getNumberOfColumns(); - block.setDetails(0, rowOffset, rwFlag); - - if (rowOffset >= nRows) - { - block.reset(); - return services::Status(); 
- } - - auto st = checkOffsetOverflow(nRowsBlockDesired, rowOffset); - if (!st) return services::throwIfPossible(st); - - const size_t nRowsBlock = (rowOffset + nRowsBlockDesired < nRows) ? nRowsBlockDesired : nRows - rowOffset; - - auto subbuffer = getSubBuffer(rowOffset, nRowsBlock, st); - DAAL_CHECK_STATUS_VAR(st); - - st |= BufferIO::read(subbuffer, block, nRowsBlock, nCols); - return st; - } - - template - services::Status releaseTBlock(BlockDescriptor & block) - { - if (isCpuTable()) - { - return _cpuTable->releaseBlockOfRows(block); - } - - services::Status status; - - if (block.getRWFlag() & (int)writeOnly) - { - const size_t nCols = getNumberOfColumns(); - const size_t nRows = getNumberOfRows(); - const size_t nRowsBlock = block.getNumberOfRows(); - const size_t rowOffset = block.getRowsOffset(); - - status |= checkOffsetOverflow(nRowsBlock, rowOffset); - if (!status) return throwIfPossible(status); - - if ((nRowsBlock + rowOffset) > nRows || nCols != block.getNumberOfColumns()) - { - return services::throwIfPossible(services::ErrorIncorrectParameter); - } - auto subbuffer = getSubBuffer(rowOffset, nRowsBlock, status); - DAAL_CHECK_STATUS_VAR(status); - - status |= BufferIO::write(subbuffer, block, nRowsBlock, nCols); - } - - block.reset(); - return status; - } - - template - services::Status getTFeature(size_t columnIndex, size_t rowOffset, size_t nRowsBlockDesired, ReadWriteMode rwFlag, BlockDescriptor & block) - { - if (isCpuTable()) - { - return _cpuTable->getBlockOfColumnValues(columnIndex, rowOffset, nRowsBlockDesired, rwFlag, block); - } - - return services::throwIfPossible(services::ErrorMethodNotImplemented); - } - - template - services::Status releaseTFeature(BlockDescriptor & block) - { - if (isCpuTable()) - { - return _cpuTable->releaseBlockOfColumnValues(block); - } - - return services::throwIfPossible(services::ErrorMethodNotImplemented); - } - - services::Status allocateDataMemoryOnCpu() - { - services::Status status; - - _cpuTable = 
HomogenNumericTable::create(getNumberOfColumns(), getNumberOfRows(), NumericTableIface::doAllocate, &status); - - return status; - } - - inline bool isCpuTable() const - { - return (bool)_cpuTable; - } - - static bool isCpuContext() - { - return services::internal::getDefaultContext().getInfoDevice().isCpu; - } - - services::internal::Buffer _buffer; - services::SharedPtr > _cpuTable; -}; -/** @} */ - -/** - * Converts numeric table with arbitrary storage layout to SYCL homogen numeric table of the given type - * \param[in] src Numeric table to be converted - * \param[in] st Status of conversion - * \return Pointer to SYCL homogen numeric table - */ -template -inline daal::data_management::NumericTablePtr convertToSyclHomogen(NumericTable & src, services::Status & st) -{ - using namespace daal::services; - - size_t ncols = src.getNumberOfColumns(); - size_t nrows = src.getNumberOfRows(); - daal::data_management::NumericTablePtr emptyPtr; - - NumericTablePtr dst = SyclHomogenNumericTable::create(ncols, nrows, NumericTableIface::doAllocate, &st); - DAAL_CHECK_STATUS_RETURN_IF_FAIL(st, emptyPtr); - BlockDescriptor srcBlock; - st |= src.getBlockOfRows(0, nrows, readOnly, srcBlock); - DAAL_CHECK_STATUS_RETURN_IF_FAIL(st, emptyPtr); - BlockDescriptor dstBlock; - st |= dst->getBlockOfRows(0, nrows, readOnly, dstBlock); - DAAL_CHECK_STATUS_RETURN_IF_FAIL(st, emptyPtr); - T * srcData = srcBlock.getBlockPtr(); - auto hostDstData = dstBlock.getBuffer().toHost(writeOnly, st); - DAAL_CHECK_STATUS_RETURN_IF_FAIL(st, emptyPtr); - T * dstData = hostDstData.get(); - for (size_t i = 0; i < ncols * nrows; i++) - { - dstData[i] = srcData[i]; - } - st |= src.releaseBlockOfRows(srcBlock); - DAAL_CHECK_STATUS_RETURN_IF_FAIL(st, emptyPtr); - st |= dst->releaseBlockOfRows(dstBlock); - DAAL_CHECK_STATUS_RETURN_IF_FAIL(st, emptyPtr); - return dst; -} - -} // namespace interface1 - -using interface1::SyclHomogenNumericTable; -using interface1::convertToSyclHomogen; - -} // namespace internal 
-} // namespace data_management -} // namespace daal - -#endif diff --git a/cpp/daal/include/data_management/data/internal/numeric_table_sycl_soa.h b/cpp/daal/include/data_management/data/internal/numeric_table_sycl_soa.h deleted file mode 100644 index fc6de2fc7b0..00000000000 --- a/cpp/daal/include/data_management/data/internal/numeric_table_sycl_soa.h +++ /dev/null @@ -1,640 +0,0 @@ -/* file: numeric_table_sycl_soa.h */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#ifndef __SYCL_SOA_NUMERIC_TABLE_H__ -#define __SYCL_SOA_NUMERIC_TABLE_H__ - -#include "data_management/data/internal/numeric_table_sycl.h" -#include "data_management/data/soa_numeric_table.h" -#include "services/internal/sycl/buffer_utils.h" - -namespace daal -{ -namespace data_management -{ -namespace internal -{ -namespace interface1 -{ -/** - * @ingroup sycl - * @{ - */ - -/** - * - * \brief Class that provides methods to access data stored as a structure of arrays, - * where each (contiguous) array represents values corresponding to a specific feature. - * Each array is represented by SYCL* buffer. 
- */ -class DAAL_EXPORT SyclSOANumericTable : public SyclNumericTable -{ -public: - DECLARE_SERIALIZABLE_TAG() - DECLARE_SERIALIZABLE_IMPL() - - /** - * Constructs an empty Numeric Table - * \param[in] nColumns Number of columns in the table - * \param[in] nRows Number of rows in the table - * \param[in] featuresEqual Flag that makes all features in the NumericTableDictionary equal - * \param[out] stat Status of the numeric table construction - * \return Empty numeric table - */ - static services::SharedPtr create(size_t nColumns = 0, size_t nRows = 0, - DictionaryIface::FeaturesEqual featuresEqual = DictionaryIface::notEqual, - services::Status * stat = NULL) - { - DAAL_DEFAULT_CREATE_IMPL_EX(SyclSOANumericTable, nColumns, nRows, featuresEqual); - } - - static services::SharedPtr create(NumericTableDictionaryPtr ddict, size_t nRows, - AllocationFlag memoryAllocationFlag = notAllocate, services::Status * stat = NULL) - { - DAAL_DEFAULT_CREATE_IMPL_EX(SyclSOANumericTable, ddict, nRows, memoryAllocationFlag); - } - - virtual ~SyclSOANumericTable() { freeDataMemoryImpl(); } - - /** - * Sets an array of values for a given feature - * \tparam T Type of feature values - * \param[in] bf SYCL* buffer to the array of the T type that stores feature values - * \param[in] idx Feature index - */ - template - services::Status setArray(const services::internal::Buffer & bf, size_t idx) - { - if (_partialMemStatus != notAllocated && _partialMemStatus != userAllocated) - { - return services::throwIfPossible(services::ErrorIncorrectNumberOfFeatures); - } - - if (idx >= getNumberOfColumns() || idx >= _arrays.size()) - { - return services::throwIfPossible(services::ErrorIncorrectNumberOfFeatures); - } - - if (getNumberOfRows() != bf.size()) - { - return services::throwIfPossible(services::ErrorIncorrectParameter); - } - - _ddict->setFeature(idx); - - if (_arrays[idx].empty() && bf) - { - _arraysInitialized++; - } - else if (!_arrays[idx].empty() && !bf) - { - _arraysInitialized--; - } 
- - _arrays[idx] = services::internal::sycl::UniversalBuffer(bf); - - if (isCpuTable()) - { - services::Status status; - auto hostPtr = bf.toHost(readOnly, status); - DAAL_CHECK_STATUS_VAR(status); - return _cpuTable->setArray(hostPtr, idx); - } - _partialMemStatus = userAllocated; - - if (_arraysInitialized == getNumberOfColumns()) - { - _memStatus = userAllocated; - } - return services::Status(); - } - - services::Status getBlockOfRows(size_t vector_idx, size_t vector_num, ReadWriteMode rwflag, BlockDescriptor & block) DAAL_C11_OVERRIDE - { - return getTBlock(vector_idx, vector_num, rwflag, block); - } - services::Status getBlockOfRows(size_t vector_idx, size_t vector_num, ReadWriteMode rwflag, BlockDescriptor & block) DAAL_C11_OVERRIDE - { - return getTBlock(vector_idx, vector_num, rwflag, block); - } - services::Status getBlockOfRows(size_t vector_idx, size_t vector_num, ReadWriteMode rwflag, BlockDescriptor & block) DAAL_C11_OVERRIDE - { - return getTBlock(vector_idx, vector_num, rwflag, block); - } - - services::Status releaseBlockOfRows(BlockDescriptor & block) DAAL_C11_OVERRIDE { return releaseTBlock(block); } - services::Status releaseBlockOfRows(BlockDescriptor & block) DAAL_C11_OVERRIDE { return releaseTBlock(block); } - services::Status releaseBlockOfRows(BlockDescriptor & block) DAAL_C11_OVERRIDE { return releaseTBlock(block); } - - services::Status getBlockOfColumnValues(size_t feature_idx, size_t vector_idx, size_t value_num, ReadWriteMode rwflag, - BlockDescriptor & block) DAAL_C11_OVERRIDE - { - return getTFeature(feature_idx, vector_idx, value_num, rwflag, block); - } - services::Status getBlockOfColumnValues(size_t feature_idx, size_t vector_idx, size_t value_num, ReadWriteMode rwflag, - BlockDescriptor & block) DAAL_C11_OVERRIDE - { - return getTFeature(feature_idx, vector_idx, value_num, rwflag, block); - } - services::Status getBlockOfColumnValues(size_t feature_idx, size_t vector_idx, size_t value_num, ReadWriteMode rwflag, - BlockDescriptor 
& block) DAAL_C11_OVERRIDE - { - return getTFeature(feature_idx, vector_idx, value_num, rwflag, block); - } - - services::Status releaseBlockOfColumnValues(BlockDescriptor & block) DAAL_C11_OVERRIDE { return releaseTFeature(block); } - services::Status releaseBlockOfColumnValues(BlockDescriptor & block) DAAL_C11_OVERRIDE { return releaseTFeature(block); } - services::Status releaseBlockOfColumnValues(BlockDescriptor & block) DAAL_C11_OVERRIDE { return releaseTFeature(block); } - - virtual MemoryStatus getDataMemoryStatus() const DAAL_C11_OVERRIDE - { - if (isCpuTable()) - { - return _cpuTable->getDataMemoryStatus(); - } - return _memStatus; - } - -protected: - explicit SyclSOANumericTable(size_t nColumns, size_t nRows, DictionaryIface::FeaturesEqual featuresEqual, services::Status & st) - : SyclNumericTable(nColumns, nRows, featuresEqual), _arrays(nColumns), _arraysInitialized(0), _partialMemStatus(notAllocated) - { - _layout = soa; - - if (isCpuContext()) - { - _cpuTable = SOANumericTable::create(nColumns, nRows, featuresEqual, &st); - } - else - { - if (!resizePointersArray(nColumns)) - { - st.add(services::ErrorMemoryAllocationFailed); - services::throwIfPossible(st); - return; - } - } - } - - explicit SyclSOANumericTable(NumericTableDictionaryPtr ddict, size_t nRows, AllocationFlag memoryAllocationFlag, services::Status & st) - : SyclNumericTable(ddict, st), _arraysInitialized(0), _partialMemStatus(notAllocated) - { - _layout = soa; - st |= setNumberOfRowsImpl(nRows); - - if (!resizePointersArray(getNumberOfColumns())) - { - st.add(services::ErrorMemoryAllocationFailed); - services::throwIfPossible(st); - return; - } - if (memoryAllocationFlag == doAllocate) - { - st |= allocateDataMemoryImpl(); - return; - } - } - - services::Status allocateArray(size_t idx, const NumericTableFeature & feature) - { - using namespace services; - using namespace services::internal::sycl; - - Status st; - const size_t nrows = getNumberOfRows(); - - if (idx >= _arrays.size()) - { 
- return throwIfPossible(services::ErrorIncorrectNumberOfFeatures); - } - - _arrays[idx] = allocateByNumericTableFeature(feature, nrows, st); - services::throwIfPossible(st); - return st; - } - - services::Status allocateDataMemoryImpl(daal::MemType type = daal::dram) DAAL_C11_OVERRIDE - { - DAAL_ASSERT(type == daal::dram); - - freeDataMemoryImpl(); - - const size_t ncol = _ddict->getNumberOfFeatures(); - const size_t nrows = getNumberOfRows(); - - if (isCpuContext()) - { - services::Status st; - _cpuTable = SOANumericTable::create(_ddict, nrows, doAllocate, &st); - return st; - } - else - { - auto status = checkSizeOverflow(nrows, ncol); - if (!status) return services::throwIfPossible(status); - - if (ncol * nrows == 0) - { - if (nrows == 0) - { - return services::throwIfPossible(services::ErrorIncorrectNumberOfObservations); - } - else - { - return services::throwIfPossible(services::ErrorIncorrectNumberOfFeatures); - } - } - - for (size_t i = 0; i < ncol; i++) - { - NumericTableFeature f = (*_ddict)[i]; - if (f.typeSize != 0) - { - status |= allocateArray(i, f); - DAAL_CHECK_STATUS_VAR(status); - _arraysInitialized++; - } - if (_arrays[i].empty()) - { - freeDataMemoryImpl(); - status.add(services::ErrorMemoryAllocationFailed); - return services::throwIfPossible(status); - } - } - - if (_arraysInitialized > 0) - { - _partialMemStatus = internallyAllocated; - } - - if (_arraysInitialized == ncol) - { - _memStatus = internallyAllocated; - } - } - - return services::Status(); - } - - bool resizePointersArray(size_t nColumns) - { - if (_arrays.size() >= nColumns) - { - size_t counter = 0; - for (size_t i = 0; i < nColumns; i++) - { - counter += (_arrays[i].empty() != true); - } - _arraysInitialized = counter; - - if (_arraysInitialized == nColumns) - { - _memStatus = _partialMemStatus; - } - else - { - _memStatus = notAllocated; - } - - return true; - } - - bool is_resized = _arrays.resize(nColumns); - if (is_resized) - { - _memStatus = notAllocated; - } - - return 
is_resized; - } - - void freeDataMemoryImpl() DAAL_C11_OVERRIDE - { - _cpuTable.reset(); - _arrays = services::Collection(_ddict->getNumberOfFeatures()); - _arraysInitialized = 0; - - _partialMemStatus = notAllocated; - _memStatus = notAllocated; - } - - template - services::Status serialImpl(Archive * arch) - { - using namespace services::internal::sycl; - - auto status = NumericTable::serialImpl(arch); - DAAL_CHECK_STATUS_VAR(status); - - ReadWriteMode rwMode = readOnly; - - if (onDeserialize) - { - rwMode = readWrite; - status |= allocateDataMemoryImpl(); - DAAL_CHECK_STATUS_VAR(status); - } - - const size_t ncol = _ddict->getNumberOfFeatures(); - const size_t nrows = getNumberOfRows(); - - if (isCpuTable()) - { - for (size_t i = 0; i < ncol; i++) - { - NumericTableFeature f = (*_ddict)[i]; - arch->set((char *)_cpuTable->getArray(i), nrows * f.typeSize); - } - } - else - { - for (size_t i = 0; i < ncol; i++) - { - NumericTableFeature f = (*_ddict)[i]; - - BufferHostReinterpreter reinterpreter(_arrays[i], rwMode, nrows); - TypeDispatcher::dispatch(_arrays[i].type(), reinterpreter, status); - services::throwIfPossible(status); - DAAL_CHECK_STATUS_VAR(status); - - auto charPtr = reinterpreter.getResult(); - arch->set(charPtr.get(), nrows * f.typeSize); - } - } - - return services::Status(); - } - -private: - static services::Status checkSizeOverflow(size_t nRows, size_t nCols) - { - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, nRows, nCols); - return services::Status(); - } - - static services::Status checkOffsetOverflow(size_t size, size_t offset) - { - DAAL_OVERFLOW_CHECK_BY_ADDING(size_t, size, offset); - return services::Status(); - } - - template - services::Status getTBlock(size_t idx, size_t nrows, ReadWriteMode rwFlag, BlockDescriptor & block) - { - using namespace services::internal::sycl; - - if (isCpuTable()) - { - return _cpuTable->getBlockOfRows(idx, nrows, rwFlag, block); - } - - const size_t ncols = getNumberOfColumns(); - const size_t nobs = 
getNumberOfRows(); - block.setDetails(0, idx, rwFlag); - - if (idx >= nobs) - { - if (!block.resizeBuffer(ncols, 0)) - { - return services::throwIfPossible(services::ErrorMethodNotSupported); - } - return services::Status(); - } - - auto status = checkOffsetOverflow(nrows, idx); - if (!status) return services::throwIfPossible(status); - - nrows = (idx + nrows < nobs) ? nrows : nobs - idx; - - if (!block.resizeBuffer(ncols, nrows)) - { - return services::throwIfPossible(services::ErrorMemoryAllocationFailed); - } - - if (!(block.getRWFlag() & (int)readOnly)) - { - return services::Status(); - } - - auto blockSharedPtr = block.getBlockSharedPtr(); - T * blockPtr = blockSharedPtr.get(); - - DAAL_ASSERT(_arrays.size() == ncols); - - for (size_t j = 0; j < ncols; j++) - { - services::Status st; - auto featureUniBuffer = _arrays[j]; - BufferConverterTo converter(featureUniBuffer, idx, nrows); - TypeDispatcher::dispatch(featureUniBuffer.type(), converter, st); - services::throwIfPossible(st); - DAAL_CHECK_STATUS_VAR(st); - - auto buffer = converter.getResult(); - DAAL_ASSERT(buffer.size() == nrows); - - auto colSharedPtr = buffer.toHost(readOnly, st); - services::throwIfPossible(st); - DAAL_CHECK_STATUS_VAR(st); - T * colPtr = colSharedPtr.get(); - - for (size_t i = 0; i < nrows; i++) - { - blockPtr[i * ncols + j] = colPtr[i]; - } - } - - return services::Status(); - } - - template - services::Status releaseTBlock(BlockDescriptor & block) - { - using namespace services::internal::sycl; - - if (isCpuTable()) - { - return _cpuTable->releaseBlockOfRows(block); - } - - if (block.getRWFlag() & (int)writeOnly) - { - const size_t ncols = getNumberOfColumns(); - const size_t nrows = block.getNumberOfRows(); - services::Status st; - - if (block.getNumberOfColumns() != ncols) - { - st.add(services::ErrorIncorrectParameter); - return throwIfPossible(st); - } - - auto blockBuffer = block.getBuffer(); - auto blockSharedPtr = blockBuffer.toHost(readOnly, st); - if (!st) return 
services::throwIfPossible(st); - - T * blockPtr = blockSharedPtr.get(); - - auto & context = services::internal::getDefaultContext(); - auto tempColumn = context.allocate(TypeIds::id(), nrows, st); - if (!st) return services::throwIfPossible(st); - - for (size_t j = 0; j < ncols; j++) - { - { - auto tempColumnSharedPtr = tempColumn.template get().toHost(readWrite, st); - if (!st) return services::throwIfPossible(st); - - T * tempColumnPtr = tempColumnSharedPtr.get(); - - for (size_t i = 0; i < nrows; i++) - { - tempColumnPtr[i] = blockPtr[i * ncols + j]; - } - } - - auto uniBuffer = _arrays[j]; - BufferConverterFrom converter(tempColumn, uniBuffer, 0, nrows); - TypeDispatcher::dispatch(uniBuffer.type(), converter, st); - services::throwIfPossible(st); - DAAL_CHECK_STATUS_VAR(st); - - _arrays[j] = converter.getResult(); - } - } - block.reset(); - return services::Status(); - } - - template - services::Status getTFeature(size_t feat_idx, size_t idx, size_t nrows, ReadWriteMode rwFlag, BlockDescriptor & block) - { - using namespace services::internal::sycl; - - if (isCpuTable()) - { - return _cpuTable->getBlockOfColumnValues(feat_idx, idx, nrows, rwFlag, block); - } - - const size_t nobs = getNumberOfRows(); - const size_t ncols = getNumberOfColumns(); - - if (feat_idx >= ncols) - { - return services::throwIfPossible(services::ErrorIncorrectIndex); - } - - block.setDetails(feat_idx, idx, rwFlag); - - if (idx >= nobs) - { - if (!block.resizeBuffer(1, 0)) - { - return services::throwIfPossible(services::ErrorMethodNotSupported); - } - return services::Status(); - } - - auto st = checkOffsetOverflow(nrows, idx); - if (!st) return services::throwIfPossible(st); - - nrows = (idx + nrows < nobs) ? 
nrows : nobs - idx; - if (!(block.getRWFlag() & (int)readOnly)) - { - if (!block.resizeBuffer(1, nrows)) - { - return services::throwIfPossible(services::ErrorMemoryAllocationFailed); - } - return services::Status(); - } - - auto uniBuffer = _arrays[feat_idx]; - BufferConverterTo converter(uniBuffer, idx, nrows); - TypeDispatcher::dispatch(uniBuffer.type(), converter, st); - services::throwIfPossible(st); - DAAL_CHECK_STATUS_VAR(st); - - auto buffer = converter.getResult(); - block.setBuffer(buffer, 1, nrows); - - return st; - } - - template - services::Status releaseTFeature(BlockDescriptor & block) - { - using namespace services::internal::sycl; - - if (isCpuTable()) - { - return _cpuTable->releaseBlockOfColumnValues(block); - } - - if (block.getRWFlag() & (int)writeOnly) - { - const size_t feat_idx = block.getColumnsOffset(); - - if (feat_idx >= getNumberOfColumns()) - { - return services::throwIfPossible(services::ErrorIncorrectIndex); - } - - NumericTableFeature & f = (*_ddict)[feat_idx]; - - auto uniBuffer = _arrays[feat_idx]; - auto blockBuffer = block.getBuffer(); - if ((features::internal::getIndexNumType() != f.indexType) || (uniBuffer.get() != blockBuffer)) - { - services::Status st; - - auto uniBuffer = _arrays[feat_idx]; - BufferConverterFrom converter(block.getBuffer(), uniBuffer, block.getRowsOffset(), block.getNumberOfRows()); - TypeDispatcher::dispatch(uniBuffer.type(), converter, st); - services::throwIfPossible(st); - DAAL_CHECK_STATUS_VAR(st); - - _arrays[feat_idx] = converter.getResult(); - } - } - block.reset(); - return services::Status(); - } - - inline bool isCpuTable() const { return (bool)_cpuTable; } - - static bool isCpuContext() { return services::internal::getDefaultContext().getInfoDevice().isCpu; } - -private: - services::Collection _arrays; - size_t _arraysInitialized; - MemoryStatus _partialMemStatus; - - SOANumericTablePtr _cpuTable; -}; - -typedef services::SharedPtr SyclSOANumericTablePtr; -/** @} */ -} // namespace interface1 
- -using interface1::SyclSOANumericTable; -using interface1::SyclSOANumericTablePtr; - -} // namespace internal -} // namespace data_management -} // namespace daal - -#endif diff --git a/cpp/daal/include/services/daal_defines.h b/cpp/daal/include/services/daal_defines.h index f4d9731b7d3..3f2636da158 100644 --- a/cpp/daal/include/services/daal_defines.h +++ b/cpp/daal/include/services/daal_defines.h @@ -124,18 +124,6 @@ #define DAAL_THREAD_PINNING_DISABLED #endif -#ifdef DAAL_SYCL_INTERFACE - #include - #if (defined(__SYCL_COMPILER_VERSION) && (__SYCL_COMPILER_VERSION >= 20191001)) - #define DAAL_SYCL_INTERFACE_USM - #endif - #if (defined(__SYCL_COMPILER_VERSION) && (__SYCL_COMPILER_VERSION >= 20191024)) - #define DAAL_SYCL_INTERFACE_REVERSED_RANGE - #elif (defined(COMPUTECPP_VERSION_MAJOR) && (COMPUTECPP_VERSION_MAJOR >= 1) && (COMPUTECPP_VERSION_MINOR >= 1) && (COMPUTECPP_VERSION_PATCH >= 6)) - #define DAAL_SYCL_INTERFACE_REVERSED_RANGE - #endif -#endif - #if !(defined(__linux__) || defined(_WIN64)) #define DAAL_DISABLE_LEVEL_ZERO #endif diff --git a/cpp/daal/include/services/env_detect.h b/cpp/daal/include/services/env_detect.h index d132c55794d..8231bcd4404 100644 --- a/cpp/daal/include/services/env_detect.h +++ b/cpp/daal/include/services/env_detect.h @@ -27,7 +27,7 @@ #include "services/base.h" #include "services/daal_defines.h" -#include "services/internal/execution_context.h" +#include "services/error_handling.h" namespace daal { @@ -172,22 +172,6 @@ class DAAL_EXPORT Environment : public Base */ int setMemoryLimit(MemType type, size_t limit); - /** - * Sets execution context globally for all algorithms. - * After this method is called, all computations inside algorithms are performed - * using device information from execution context. 
- * \param[in] ctx Execution context with information on how to perform computations inside the library - */ - void setDefaultExecutionContext(const internal::ExecutionContext & ctx) - { - _executionContext = internal::ImplAccessor::getImplPtr(ctx); - } - - services::internal::sycl::ExecutionContextIface & getDefaultExecutionContext() - { - return *_executionContext; - } - private: Environment(); Environment(const Environment & e); @@ -203,7 +187,6 @@ class DAAL_EXPORT Environment : public Base // allow user to wait for completion of worker threads. void * _schedulerHandle; void * _globalControl; - SharedPtr _executionContext; }; } // namespace interface1 diff --git a/cpp/daal/include/services/internal/aarch64/aarch64_kernel_defines.h b/cpp/daal/include/services/internal/aarch64/aarch64_kernel_defines.h index 799525128ef..05b5a82531a 100644 --- a/cpp/daal/include/services/internal/aarch64/aarch64_kernel_defines.h +++ b/cpp/daal/include/services/internal/aarch64/aarch64_kernel_defines.h @@ -28,14 +28,12 @@ #define DAAL_KERNEL_SVE_CONTAINER(ContainerTemplate, ...) , DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, sve, __VA_ARGS__) #define DAAL_KERNEL_SVE_CONTAINER1(ContainerTemplate, ...) extern template class DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, sve, __VA_ARGS__); #define DAAL_KERNEL_SVE_CONTAINER_CASE(ContainerTemplate, ...) DAAL_KERNEL_CONTAINER_CASE(ContainerTemplate, sve, __VA_ARGS__) - #define DAAL_KERNEL_SVE_CONTAINER_CASE_SYCL(ContainerTemplate, ...) #else #define DAAL_KERNEL_SVE_ONLY(something) #define DAAL_KERNEL_SVE_ONLY_CODE(...) #define DAAL_KERNEL_SVE_CONTAINER(ContainerTemplate, ...) #define DAAL_KERNEL_SVE_CONTAINER1(ContainerTemplate, ...) #define DAAL_KERNEL_SVE_CONTAINER_CASE(ContainerTemplate, ...) - #define DAAL_KERNEL_SVE_CONTAINER_CASE_SYCL(ContainerTemplate, ...) 
#endif #endif diff --git a/cpp/daal/include/services/internal/buffer.h b/cpp/daal/include/services/internal/buffer.h index 9d7777499c9..a85a8ab45d0 100644 --- a/cpp/daal/include/services/internal/buffer.h +++ b/cpp/daal/include/services/internal/buffer.h @@ -26,10 +26,6 @@ #include "services/internal/buffer_impl.h" -#ifdef DAAL_SYCL_INTERFACE - #include "services/internal/buffer_impl_sycl.h" -#endif - namespace daal { namespace services @@ -58,89 +54,6 @@ class Buffer : public Base */ Buffer() {} -#ifdef DAAL_SYCL_INTERFACE - /** - * Creates a Buffer object referencing a SYCL* buffer - * Does not copy the data from the SYCL* buffer - * \param[in] buffer SYCL* buffer - * \param[out] status Status of operation - */ - Buffer(const ::sycl::buffer & buffer, Status & status) : _impl(internal::SyclBuffer::create(buffer, status)) {} - - #ifndef DAAL_NOTHROW_EXCEPTIONS - /** - * Creates a Buffer object referencing a SYCL* buffer - * Does not copy the data from the SYCL* buffer - */ - Buffer(const ::sycl::buffer & buffer) - { - Status status; - _impl.reset(internal::SyclBuffer::create(buffer, status)); - throwIfPossible(status); - } - #endif // DAAL_NOTHROW_EXCEPTIONS -#endif // DAAL_SYCL_INTERFACE_USM - -#ifdef DAAL_SYCL_INTERFACE_USM - /** - * Creates a Buffer object referencing a USM pointer - * Does not copy the data from the USM pointer - * \param[in] usmData Pointer to the USM-allocated data - * \param[in] size Number of elements of type T stored in USM memory block - * \param[in] queue The SYCL* queue object - * \param[out] status Status of operation - */ - Buffer(T * usmData, size_t size, const ::sycl::queue & queue, Status & status) - : _impl(internal::UsmBuffer::create(usmData, size, queue, status)) - {} - - #ifndef DAAL_NOTHROW_EXCEPTIONS - /** - * Creates a Buffer object referencing a USM pointer - * Does not copy the data from the USM pointer - * \param[in] usmData Pointer to the USM-allocated data - * \param[in] size Number of elements of type T stored in USM 
memory block - * \param[in] queue The SYCL* queue object - */ - Buffer(T * usmData, size_t size, const ::sycl::queue & queue) - { - Status status; - _impl.reset(internal::UsmBuffer::create(usmData, size, queue, status)); - throwIfPossible(status); - } - #endif // DAAL_NOTHROW_EXCEPTIONS -#endif // DAAL_SYCL_INTERFACE_USM - -#ifdef DAAL_SYCL_INTERFACE_USM - /** - * Creates a Buffer object referencing a USM pointer - * Does not copy the data from the USM pointer - * \param[in] usmData Shared pointer to the USM-allocated data - * \param[in] size Number of elements of type T stored in USM block - * \param[in] queue The SYCL* queue object - * \param[out] status Status of operation - */ - Buffer(const SharedPtr & usmData, size_t size, const ::sycl::queue & queue, Status & status) - : _impl(internal::UsmBuffer::create(usmData, size, queue, status)) - {} - - #ifndef DAAL_NOTHROW_EXCEPTIONS - /** - * Creates a Buffer object referencing a USM pointer - * Does not copy the data from the USM pointer - * \param[in] usmData Shared pointer to the USM-allocated data - * \param[in] size Number of elements of type T stored in USM block - * \param[in] queue The SYCL* queue object - */ - Buffer(const SharedPtr & usmData, size_t size, const ::sycl::queue & queue) - { - Status status; - _impl.reset(internal::UsmBuffer::create(usmData, size, queue, status)); - throwIfPossible(status); - } - #endif // DAAL_NOTHROW_EXCEPTIONS -#endif // DAAL_SYCL_INTERFACE_USM - /** * Creates a Buffer object from host-allocated raw pointer * Buffer does not own this pointer @@ -232,80 +145,6 @@ class Buffer : public Base } #endif // DAAL_NOTHROW_EXCEPTIONS -#ifdef DAAL_SYCL_INTERFACE - /** - * Converts buffer to the SYCL* buffer - * \param[out] status Status of operation - * \return one-dimensional SYCL* buffer - */ - ::sycl::buffer toSycl(Status & status) const - { - if (!_impl) - { - status |= ErrorEmptyBuffer; - return ::sycl::buffer(::sycl::range<1>(1)); - } - return 
internal::SyclBufferConverter().toSycl(*_impl, status); - } - - #ifndef DAAL_NOTHROW_EXCEPTIONS - /** - * Converts buffer to the SYCL* buffer, throws exception if conversion fails - * \return one-dimensional SYCL* buffer - */ - ::sycl::buffer toSycl() const - { - Status status; - const ::sycl::buffer buffer = toSycl(status); - throwIfPossible(status); - return buffer; - } - #endif // DAAL_NOTHROW_EXCEPTIONS -#endif // DAAL_SYCL_INTERFACE - -#ifdef DAAL_SYCL_INTERFACE_USM - /** - * Converts buffer to the USM shared pointer - * \param[in] queue The SYCL* queue object - * \param[in] rwFlag Flag specifying read/write access to the buffer - * \param[out] status Status of operation - * \return USM shared pointer - */ - SharedPtr toUSM(::sycl::queue & queue, const data_management::ReadWriteMode & rwFlag, Status & status) const - { - if (!_impl) - { - status |= ErrorEmptyBuffer; - return SharedPtr(); - } - return internal::SyclBufferConverter().toUSM(*_impl, queue, rwFlag, status); - } - - #ifndef DAAL_NOTHROW_EXCEPTIONS - /** - * Converts buffer to the USM shared pointer, throws exception if conversion fails - * \param[in] queue The SYCL* queue object - * \param[in] rwFlag Flag specifying read/write access to the buffer - * \return USM shared pointer - */ - SharedPtr toUSM(::sycl::queue & queue, const data_management::ReadWriteMode & rwFlag) const - { - Status status; - const SharedPtr ptr = toUSM(queue, rwFlag, status); - throwIfPossible(status); - return ptr; - } - #endif // DAAL_NOTHROW_EXCEPTIONS - -#endif // DAAL_SYCL_INTERFACE_USM - -#ifdef DAAL_SYCL_INTERFACE_USM - inline bool isUSMBacked() const - { - return dynamic_cast *>(_impl.get()) != nullptr; - } -#endif // DAAL_SYCL_INTERFACE_USM - /** * Returns the total number of elements in the buffer */ diff --git a/cpp/daal/include/services/internal/buffer_impl_sycl.h b/cpp/daal/include/services/internal/buffer_impl_sycl.h deleted file mode 100644 index 238c60dfda1..00000000000 --- 
a/cpp/daal/include/services/internal/buffer_impl_sycl.h +++ /dev/null @@ -1,429 +0,0 @@ -/* file: buffer_impl_sycl.h */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#ifndef __DAAL_SERVICES_INTERNAL_BUFFER_SYCL_H__ -#define __DAAL_SERVICES_INTERNAL_BUFFER_SYCL_H__ - -#ifndef DAAL_SYCL_INTERFACE - #error "DAAL_SYCL_INTERFACE must be defined to include this file" -#endif - -#include - -#include "services/internal/any.h" -#include "services/internal/buffer_impl.h" -#include "services/internal/sycl/error_handling_sycl.h" - -namespace daal -{ -namespace services -{ -namespace internal -{ -/** @ingroup services_internal - * @{ - */ - -template -inline ::sycl::buffer createEmptySyclBuffer() -{ - return ::sycl::buffer(nullptr, ::sycl::range<1> { 0 }); -} - -#ifdef DAAL_SYCL_INTERFACE_USM -/** - * - * \brief BufferIface implementation based on USM - */ -template -class UsmBuffer : public Base, public UsmBufferIface -{ -public: - static UsmBuffer * create(const SharedPtr & data, size_t size, const ::sycl::queue & queue, Status & status) - { - if (!data && size != size_t(0)) - { - status |= ErrorNullPtr; - return nullptr; - } - const auto newBuffer = new UsmBuffer(data, size, queue); - DAAL_CHECK_COND_ERROR(newBuffer, status, ErrorMemoryAllocationFailed); - return newBuffer; - } 
- - static UsmBuffer * create(T * data, size_t size, const ::sycl::queue & queue, Status & status) - { - return create(SharedPtr { data, EmptyDeleter() }, size, queue, status); - } - - size_t size() const DAAL_C11_OVERRIDE { return _size; } - - Status apply(BufferVisitor & visitor) const DAAL_C11_OVERRIDE { return visitor(*this); } - - UsmBuffer * getSubBuffer(size_t offset, size_t size, Status & status) const DAAL_C11_OVERRIDE - { - DAAL_ASSERT(offset + size <= _size); - return create(SharedPtr(_data, _data.get() + offset), size, _queue, status); - } - - SharedPtr getHostRead(Status & status) const DAAL_C11_OVERRIDE { return getHostPtr(true, false, status); } - - SharedPtr getHostWrite(Status & status) const DAAL_C11_OVERRIDE { return getHostPtr(false, true, status); } - - SharedPtr getHostReadWrite(Status & status) const DAAL_C11_OVERRIDE { return getHostPtr(true, true, status); } - - const SharedPtr & get() const DAAL_C11_OVERRIDE { return _data; } - -private: - UsmBuffer(const SharedPtr & data, size_t size, const ::sycl::queue & queue) : _data(data), _size(size), _queue(queue) - { - _allocType = ::sycl::get_pointer_type(data.get(), _queue.get_context()); - DAAL_ASSERT(_allocType != ::sycl::usm::alloc::unknown); - } - - SharedPtr getHostPtr(bool needCopyToHost, bool needSynchronize, Status & status) const - { - using namespace ::sycl::usm; - if (_allocType == alloc::host || _allocType == alloc::shared) - { - return _data; - } - else if (_allocType == alloc::device) - { - auto host_ptr = SharedPtr(::sycl::malloc_host(_size, _queue), // TODO: use daal_malloc - [q = this->_queue, data = this->_data, size = this->_size, needSynchronize](const void * hostData) mutable { - if (needSynchronize) - { - auto event = q.memcpy(data.get(), hostData, size * sizeof(T)); - event.wait_and_throw(); - } - ::sycl::free(const_cast(hostData), q); - }); - if (!host_ptr) - { - status |= services::ErrorMemoryAllocationFailed; - return host_ptr; - } - - if (needCopyToHost) - { - status 
|= internal::sycl::catchSyclExceptions([&, q = this->_queue]() mutable { - auto event = q.memcpy(host_ptr.get(), _data.get(), _size * sizeof(T)); - event.wait_and_throw(); - }); - } - return host_ptr; - } - - /* Note: `sycl::get_pointer_info` is not implemented right now. With - * the `get_pointer_info` logic shall be the following: If device is - * host or CPU, return `_data`, otherwise throw exception. */ - status |= Error::create(ErrorAccessUSMPointerOnOtherDevice, Sycl, "Cannot access unknown USM pointer on host"); - - return SharedPtr(); - } - - SharedPtr _data; - size_t _size; - ::sycl::queue _queue; - ::sycl::usm::alloc _allocType; -}; -#endif - -/** - * - * \brief Deleter for SharedPtr that owns host accessor for SYCL* buffer - */ -template -class SyclHostDeleter : public Base -{ -public: - typedef ::sycl::accessor HostAccessorType; - -public: - explicit SyclHostDeleter(const ::sycl::buffer & buffer, HostAccessorType * accessor) : _buffer(buffer), _hostAccessor(accessor) - { - DAAL_ASSERT(_hostAccessor); - } - - void operator()(const void * ptr) - { - if (!_hostAccessor) - { - DAAL_ASSERT(!"Potential attempt to delete host accessor twice"); - } - - DAAL_ASSERT(ptr == _hostAccessor->get_pointer()); - delete _hostAccessor; - _hostAccessor = nullptr; - } - -private: - ::sycl::buffer _buffer; - HostAccessorType * _hostAccessor; -}; - -/** - * - * \brief BufferIface implementation based on SYCL* buffer - */ -template -class SyclBuffer : public Base, public SyclBufferIface -{ -private: - typedef ::sycl::buffer BufferType; - -public: - static SyclBuffer * create(size_t size, Status & status) - { - const auto newBuffer = new SyclBuffer(size, status); - DAAL_CHECK_COND_ERROR(newBuffer, status, ErrorMemoryAllocationFailed); - return newBuffer; - } - - static SyclBuffer * create(const BufferType & syclBuffer, Status & status) - { - const auto newBuffer = new SyclBuffer(syclBuffer, status); - DAAL_CHECK_COND_ERROR(newBuffer, status, ErrorMemoryAllocationFailed); - 
return newBuffer; - } - - size_t size() const DAAL_C11_OVERRIDE { return _nativeBuffer.get_count(); } - - Status apply(BufferVisitor & visitor) const DAAL_C11_OVERRIDE { return visitor(*this); } - - SyclBuffer * getSubBuffer(size_t offset, size_t size, Status & status) const DAAL_C11_OVERRIDE - { - DAAL_ASSERT(offset + size <= this->size()); - - BufferType & nativeBuffer = const_cast(_nativeBuffer); - if (offset == 0 && size == this->size()) - { - return create(nativeBuffer, status); - } - - const auto nativeBufferWithOffset = createNativeBuffer(status, nativeBuffer, ::sycl::id<1>(offset), ::sycl::range<1>(size)); - DAAL_CHECK_STATUS_RETURN_IF_FAIL(status, nullptr); - - return create(nativeBufferWithOffset, status); - } - - SharedPtr getHostRead(Status & status) const DAAL_C11_OVERRIDE { return getHostPtr< ::sycl::access::mode::read>(status); } - - SharedPtr getHostWrite(Status & status) const DAAL_C11_OVERRIDE { return getHostPtr< ::sycl::access::mode::write>(status); } - - SharedPtr getHostReadWrite(Status & status) const DAAL_C11_OVERRIDE { return getHostPtr< ::sycl::access::mode::read_write>(status); } - - const BufferType & get() const { return _nativeBuffer; } - -private: - explicit SyclBuffer(size_t size, Status & status) : _nativeBuffer(createNativeBuffer(status, size)) {} - - explicit SyclBuffer(const BufferType & nativeBuffer, Status & status) : _nativeBuffer(createNativeBuffer(status, nativeBuffer)) {} - - template < ::sycl::access::mode mode> - SharedPtr getHostPtr(Status & status) const - { - using DeleterType = SyclHostDeleter; - using AccessorType = typename DeleterType::HostAccessorType; - return internal::sycl::catchSyclExceptions( - status, - [&]() { - auto * accessor = new AccessorType(const_cast(_nativeBuffer)); - return SharedPtr(accessor->get_pointer(), DeleterType(_nativeBuffer, accessor)); - }, - [&]() { return SharedPtr(); }); - } - - template - static BufferType createNativeBuffer(Status & status, Args &&... 
args) - { - return internal::sycl::catchSyclExceptions( - status, [&]() { return BufferType(std::forward(args)...); }, [&]() { return createEmptySyclBuffer(); }); - } - - BufferType _nativeBuffer; -}; - -/** - * - * \brief BufferVisitor that converters any buffer to SYCL* buffer - */ -template -class ConvertToSycl : public BufferVisitor -{ -private: - typedef ::sycl::buffer SyclBufferType; - -public: - Status operator()(const HostBuffer & buffer) DAAL_C11_OVERRIDE - { - Status status; - _nativeBuffer = wrap(status, buffer.get(), buffer.size()); - return status; - } - - Status operator()(const UsmBufferIface & buffer) DAAL_C11_OVERRIDE - { - Status status; - auto hostPtr = buffer.getHostReadWrite(status); - DAAL_CHECK_STATUS_VAR(status); - - _nativeBuffer = internal::sycl::catchSyclExceptions( - status, - [&]() { - const auto bufferProperties = ::sycl::property_list { ::sycl::property::buffer::use_host_ptr() }; - - return SyclBufferType(std::shared_ptr { hostPtr.get(), [owner = hostPtr](T * ptr) {} }, ::sycl::range<1>(buffer.size()), - bufferProperties); - }, - [&]() { return createEmptySyclBuffer(); }); - - return status; - } - - Status operator()(const SyclBufferIface & buffer) DAAL_C11_OVERRIDE - { - _nativeBuffer = static_cast &>(buffer).get(); - return Status(); - } - - const SyclBufferType & get() const { return _nativeBuffer.get(); } - -private: - static SyclBufferType wrap(Status & status, const SharedPtr & ptr, size_t size, bool useHostPtr = false) - { - return internal::sycl::catchSyclExceptions( - status, - [&]() { - const auto bufferProperties = - (useHostPtr) ? 
::sycl::property_list { ::sycl::property::buffer::use_host_ptr() } : ::sycl::property_list {}; - - return SyclBufferType(ptr.get(), ::sycl::range<1>(size), bufferProperties); - }, - [&]() { return createEmptySyclBuffer(); }); - } - - Any _nativeBuffer; -}; - -#ifdef DAAL_SYCL_INTERFACE_USM -/** - * - * \brief BufferVisitor that converters any buffer to USM pointer - */ -template -class ConvertToUsm : public BufferVisitor -{ -public: - ConvertToUsm(::sycl::queue & queue, const data_management::ReadWriteMode & rwFlag) : _q(queue), _rwFlag(rwFlag) {} - - Status makeCopyToUSM(const SharedPtr & hostData, size_t count) - { - Status st; - // TODO: use malloc_device and queue.memcpy() - auto usmData = ::sycl::malloc_shared(count, _q); - if (usmData == nullptr) - { - return services::ErrorMemoryAllocationFailed; - } - - const size_t size = sizeof(T) * count; - DAAL_ASSERT(size / sizeof(T) == count); - - if (_rwFlag & data_management::readOnly) - { - int result = daal_memcpy_s(usmData, size, hostData.get(), size); - if (result) - { - return services::ErrorMemoryCopyFailedInternal; - } - } - - _data = SharedPtr(usmData, [q = this->_q, rwFlag = this->_rwFlag, hostData, size](const void * data) mutable { - if (rwFlag & data_management::writeOnly) - { - daal_memcpy_s(hostData.get(), size, data, size); - } - ::sycl::free(const_cast(data), q); - }); - return st; - } - - Status operator()(const HostBuffer & buffer) DAAL_C11_OVERRIDE - { - auto hostData = buffer.get(); - return makeCopyToUSM(hostData, buffer.size()); - } - - Status operator()(const UsmBufferIface & buffer) DAAL_C11_OVERRIDE - { - _data = buffer.get(); - return Status(); - } - - Status operator()(const SyclBufferIface & buffer) DAAL_C11_OVERRIDE - { - Status st; - auto hostData = buffer.getHostReadWrite(st); - DAAL_CHECK_STATUS_VAR(st); - return makeCopyToUSM(hostData, buffer.size()); - } - - const SharedPtr & get() const { return _data; } - -private: - SharedPtr _data; - ::sycl::queue & _q; - 
data_management::ReadWriteMode _rwFlag; -}; -#endif - -/** - * - * \brief Groups high-level conversion methods for SYCL* buffer and USM - */ -template -class SyclBufferConverter -{ -public: - ::sycl::buffer toSycl(const internal::BufferIface & buffer, Status & status) - { - ConvertToSycl action; - status |= buffer.apply(action); - DAAL_CHECK_STATUS_RETURN_IF_FAIL(status, createEmptySyclBuffer()); - return action.get(); - } - -#ifdef DAAL_SYCL_INTERFACE_USM - SharedPtr toUSM(const internal::BufferIface & buffer, ::sycl::queue & q, const data_management::ReadWriteMode & rwFlag, Status & status) - { - ConvertToUsm action(q, rwFlag); - status |= buffer.apply(action); - DAAL_CHECK_STATUS_RETURN_IF_FAIL(status, SharedPtr()); - return action.get(); - } -#endif -}; - -/** @} */ - -} // namespace internal -} // namespace services -} // namespace daal - -#endif diff --git a/cpp/daal/include/services/internal/daal_kernel_defines.h b/cpp/daal/include/services/internal/daal_kernel_defines.h index f2a1f27a2fc..61f17902562 100644 --- a/cpp/daal/include/services/internal/daal_kernel_defines.h +++ b/cpp/daal/include/services/internal/daal_kernel_defines.h @@ -53,14 +53,6 @@ case cpuType: \ _cntr = (new DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, cpuType, __VA_ARGS__)(daalEnv)); \ break; -#define DAAL_KERNEL_CONTAINER_CASE_SYCL(ContainerTemplate, cpuType, ...) \ -case cpuType: \ -{ \ - using contTemplType = DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, cpuType, __VA_ARGS__); \ - static volatile daal::services::internal::GpuSupportRegistrar registrar; \ - _cntr = (new contTemplType(daalEnv)); \ - break; \ -} #define DAAL_EXPAND(...) 
__VA_ARGS__ /** @} */ diff --git a/cpp/daal/include/services/internal/execution_context.h b/cpp/daal/include/services/internal/execution_context.h deleted file mode 100644 index ee2bb3bb395..00000000000 --- a/cpp/daal/include/services/internal/execution_context.h +++ /dev/null @@ -1,188 +0,0 @@ -/* file: execution_context.h */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#ifndef __DAAL_SERVICES_EXECUTION_CONTEXT_H__ -#define __DAAL_SERVICES_EXECUTION_CONTEXT_H__ - -#include "services/internal/utilities.h" -#include "services/internal/sycl/execution_context.h" - -namespace daal -{ -namespace services -{ -namespace internal -{ -namespace interface1 -{ -/** - * @defgroup sycl SYCL* - * \brief Contains classes designed to work with SYCL* and call - * oneAPI implementations of algorithms - * @{ - */ - -/** - * - * \brief Base class for device information needed to perform - * computations - */ -class ExecutionContext : public Base -{ - friend class daal::services::internal::ImplAccessor; - -private: - typedef daal::services::internal::sycl::ExecutionContextIface ImplType; - -public: - ExecutionContext() {} - -protected: - explicit ExecutionContext(ImplType * impl) : _impl(impl) {} - explicit ExecutionContext(ImplType * impl, bool needEmptyDeleter) - { - // This branch is 
needed to avoid problems with deleting SYCL entities - // after SYCL RT static objects are already released. - // This is caused by "C++ static initialization order fiasco" problem between - // Intel(R) oneAPI Data Analytics Library (oneDAL) static Environment object and internal static contexts of SYCL RT. - // Here we solve this temporary with a small memory leak. - // TODO: remove this after complete transition to DPC++ kernels. - if (needEmptyDeleter) - { - _impl = SharedPtr(impl, EmptyDeleter()); - } - else - { - _impl = SharedPtr(impl); - } - } - - const SharedPtr & getImplPtr() const { return _impl; } - -private: - SharedPtr _impl; -}; - -/** - * - * \brief Implementation of a CPU-host context class - */ -class CpuExecutionContext : public ExecutionContext -{ -private: - typedef services::internal::sycl::CpuExecutionContextImpl ImplType; - -public: - CpuExecutionContext() : ExecutionContext(new ImplType()) {} -}; -/** @} */ -} // namespace interface1 - -using interface1::ExecutionContext; -using interface1::CpuExecutionContext; - -} // namespace internal -} // namespace services -} // namespace daal - -#ifdef DAAL_SYCL_INTERFACE - #include "services/internal/sycl/execution_context_sycl.h" - -namespace daal -{ -namespace services -{ -namespace internal -{ -namespace interface1 -{ -/** @ingroup sycl - * @{ - */ - -/** - * - * \brief Implementation of a device context class - * based on SYCL* queue object - */ -class SyclExecutionContext : public ExecutionContext -{ -public: - /** Constructor from SYCL* queue. 
- * When this execution context is selected, all computations - * are performed on the device associated with the queue - * \param[in] deviceQueue SYCL* queue object to the device that is selected to perform computations - */ - SyclExecutionContext(const ::sycl::queue & deviceQueue, const bool fromPython = false) - : ExecutionContext(createContext(deviceQueue, fromPython), !deviceQueue.get_device().is_cpu()) - {} - -private: - static daal::services::internal::sycl::ExecutionContextIface * createContext(const ::sycl::queue & queue, const bool fromPython = false) - { - /* XXX: Workaround to fix performance on CPU: SYCL* runtime loads one - thread with active spin-lock that waits for submissions in a queue. - In CPU mode DAAL does not submit kernels, and runs CPU code via TBB. - Spin-lock is active while the queue persists. We do not persist - the queue and avoid running spin-lock in a queue while any DAAL - algorithm is running. */ - if (queue.get_device().is_cpu()) - { - return new daal::services::internal::sycl::CpuExecutionContextImpl(); - } - else - { - try - { - return new daal::services::internal::sycl::SyclExecutionContextImpl(queue, fromPython); - } - catch (const std::runtime_error & e) - { - throw e; - } - } - } -}; -/** @} */ -} // namespace interface1 - -using interface1::SyclExecutionContext; - -} // namespace internal -} // namespace services -} // namespace daal -#endif // DAAL_SYCL_INTERFACE - -namespace daal -{ -namespace services -{ -namespace internal -{ -namespace interface1 -{ -DAAL_EXPORT sycl::ExecutionContextIface & getDefaultContext(); - -} // namespace interface1 - -using interface1::getDefaultContext; - -} // namespace internal -} // namespace services -} // namespace daal - -#endif diff --git a/cpp/daal/include/services/internal/gpu_support_checker.h b/cpp/daal/include/services/internal/gpu_support_checker.h deleted file mode 100644 index c153e3e011c..00000000000 --- a/cpp/daal/include/services/internal/gpu_support_checker.h +++ /dev/null 
@@ -1,150 +0,0 @@ -/* file: gpu_support_checker.h */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Interface for GPU support check -//-- -*/ - -#ifndef __GPU_SUPPORT_CHECKER_H__ -#define __GPU_SUPPORT_CHECKER_H__ - -#include "algorithms/algorithm_container_base.h" - -namespace daal -{ -namespace services -{ -namespace internal -{ -/** - * @defgroup services_internal ServicesInternal - * \brief Contains internal classes definitions - * @{ - */ - -DAAL_EXPORT bool isImplementedForDevice(const services::internal::sycl::InfoDevice & deviceInfo, algorithms::AlgorithmContainerIface *); - -/** - * - * \brief Interface for algorithm container registration - */ -class TypeRegistrationCheckerIface -{ -public: - virtual bool operator()(algorithms::AlgorithmContainerIface *) = 0; -}; - -/** - * - * \brief Checker of algorithm container registration in runtime - */ -template -class DynamicTypeRegistrationChecker : public TypeRegistrationCheckerIface -{ -public: - DynamicTypeRegistrationChecker() {} - - virtual bool operator()(algorithms::AlgorithmContainerIface * ptr_to_check) DAAL_C11_OVERRIDE { return dynamic_cast(ptr_to_check) != NULL; } -}; - -/** - * - * \brief Checker whether the algorithm has GPU support - */ -class GpuSupportChecker -{ -public: - template - 
void registerClass() - { - DynamicTypeRegistrationChecker * detector_ptr = new DynamicTypeRegistrationChecker(); - add(detector_ptr); - } - bool check(daal::algorithms::AlgorithmContainerIface * ptr_to_check) - { - for (Entry * it = _list.head(); it != NULL; it = it->next) - if ((*it->checker_ptr)(ptr_to_check)) return true; - return false; - } - static GpuSupportChecker & GetInstance(); - -private: - GpuSupportChecker() {} - GpuSupportChecker(const GpuSupportChecker &); - GpuSupportChecker & operator=(const GpuSupportChecker &); - - struct Entry : public daal::Base - { - Entry(TypeRegistrationCheckerIface * new_checker, Entry * cur_head) : checker_ptr(new_checker), next(cur_head) {} - - TypeRegistrationCheckerIface * checker_ptr; - Entry * next; - }; - - class List - { - public: - List() : _head(NULL) {} - ~List() - { - Entry * it = _head; - while (it != NULL) - { - Entry * next = it->next; - delete it; - it = next; - } - _head = NULL; - } - - void add(TypeRegistrationCheckerIface * checker_ptr) - { - Entry * entry = new Entry(checker_ptr, _head); - DAAL_ASSERT(entry != NULL); - if (entry) _head = entry; - } - Entry * head() { return _head; } - - private: - Entry * _head; - List(const List &); - List & operator=(const List &); - }; - - void add(TypeRegistrationCheckerIface * new_checker) { _list.add(new_checker); } - List _list; -}; - -/** - * - * \brief Registers Algorithm as the one has GPU support - */ -template -class GpuSupportRegistrar -{ -public: - GpuSupportRegistrar() { GpuSupportChecker::GetInstance().registerClass(); } -}; - -/** @} */ -} //namespace internal -} //namespace services -} //namespace daal - -#endif // __GPU_SUPPORT_CHECKER_H__ diff --git a/cpp/daal/include/services/internal/riscv64/riscv64_kernel_defines.h b/cpp/daal/include/services/internal/riscv64/riscv64_kernel_defines.h index 2bb5452242e..21e8d7262a9 100644 --- a/cpp/daal/include/services/internal/riscv64/riscv64_kernel_defines.h +++ 
b/cpp/daal/include/services/internal/riscv64/riscv64_kernel_defines.h @@ -29,14 +29,12 @@ #define DAAL_KERNEL_RV64_CONTAINER1(ContainerTemplate, ...) \ extern template class DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, rv64, __VA_ARGS__); #define DAAL_KERNEL_RV64_CONTAINER_CASE(ContainerTemplate, ...) DAAL_KERNEL_CONTAINER_CASE(ContainerTemplate, rv64, __VA_ARGS__) - #define DAAL_KERNEL_RV64_CONTAINER_CASE_SYCL(ContainerTemplate, ...) #else #define DAAL_KERNEL_RV64_ONLY(something) #define DAAL_KERNEL_RV64_ONLY_CODE(...) #define DAAL_KERNEL_RV64_CONTAINER(ContainerTemplate, ...) #define DAAL_KERNEL_RV64_CONTAINER1(ContainerTemplate, ...) #define DAAL_KERNEL_RV64_CONTAINER_CASE(ContainerTemplate, ...) - #define DAAL_KERNEL_RV64_CONTAINER_CASE_SYCL(ContainerTemplate, ...) #endif #endif diff --git a/cpp/daal/include/services/internal/sycl/buffer_utils.h b/cpp/daal/include/services/internal/sycl/buffer_utils.h deleted file mode 100644 index 5a442b86bd7..00000000000 --- a/cpp/daal/include/services/internal/sycl/buffer_utils.h +++ /dev/null @@ -1,258 +0,0 @@ -/* file: buffer_utils.h */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -#ifndef __DAAL_SERVICES_INTERNAL_SYCL_BUFFER_UTILS_H__ -#define __DAAL_SERVICES_INTERNAL_SYCL_BUFFER_UTILS_H__ - -#include "services/internal/execution_context.h" -#include "services/internal/sycl/types_utils.h" -#include "data_management/data/internal/conversion.h" - -/// \cond INTERNAL -namespace daal -{ -namespace services -{ -namespace internal -{ -namespace sycl -{ -namespace interface1 -{ -template -class BufferConverterFrom -{ -public: - BufferConverterFrom(const UniversalBuffer & src, UniversalBuffer & dest, size_t offset, size_t size) - : _src(src), _dest(dest), _offset(offset), _size(size) - {} - - UniversalBuffer getResult() { return _dest; } - - template - void operator()(Typelist, Status & st) - { - using namespace daal::data_management; - using namespace daal::data_management::internal; - - DAAL_ASSERT(!_src.empty()); - DAAL_ASSERT(!_dest.empty()); - DAAL_ASSERT_UNIVERSAL_BUFFER(_src, DataType, _size); - DAAL_ASSERT_UNIVERSAL_BUFFER_TYPE(_dest, T); - - auto srcBuffer = _src.template get(); - auto srcHostPtr = srcBuffer.toHost(readOnly, st); - DAAL_CHECK_STATUS_RETURN_VOID_IF_FAIL(st); - - auto destBuffer = _dest.template get(); - auto destSubBuffer = destBuffer.getSubBuffer(_offset, _size, st); - DAAL_CHECK_STATUS_RETURN_VOID_IF_FAIL(st); - - auto destHostPtr = destSubBuffer.toHost(readWrite, st); - DAAL_CHECK_STATUS_RETURN_VOID_IF_FAIL(st); - - VectorDownCast()(_size, srcHostPtr.get(), destHostPtr.get()); - } - -private: - UniversalBuffer _src; - UniversalBuffer _dest; - size_t _offset; - size_t _size; -}; - -/** - * - * \brief Converts UniversalBuffer to compile-time known type from - * runtime-known type - */ -template -class BufferConverterTo -{ -public: - BufferConverterTo(const UniversalBuffer & src, size_t offset, size_t size) : _src(src), _offset(offset), _size(size) {} - - Buffer getResult() { return _dest; } - - template - void operator()(Typelist, Status & 
st) - { - using namespace daal::data_management; - using namespace daal::data_management::internal; - - DAAL_ASSERT(!_src.empty()); - DAAL_ASSERT_UNIVERSAL_BUFFER_TYPE(_src, T); - - DAAL_ASSERT(_src.type() == TypeIds::id()); - - auto buffer = _src.template get(); - - auto subbuffer = buffer.getSubBuffer(_offset, _size, st); - DAAL_CHECK_STATUS_RETURN_VOID_IF_FAIL(st); - - auto memoryBlock = subbuffer.toHost(readOnly, st); - DAAL_CHECK_STATUS_RETURN_VOID_IF_FAIL(st); - - auto & context = getDefaultContext(); - auto uniBufferBlock = context.allocate(TypeIds::id(), _size, st); - DAAL_CHECK_STATUS_RETURN_VOID_IF_FAIL(st); - - auto bufferBlock = uniBufferBlock.template get(); - { - auto bufferHostPtr = bufferBlock.toHost(readWrite, st); - DAAL_CHECK_STATUS_RETURN_VOID_IF_FAIL(st); - VectorUpCast()(_size, memoryBlock.get(), bufferHostPtr.get()); - } - _dest = bufferBlock; - } - - void operator()(Typelist, Status & st) - { - DAAL_ASSERT_UNIVERSAL_BUFFER_TYPE(_src, DataType); - - auto buffer = _src.template get(); - auto subbuffer = buffer.getSubBuffer(_offset, _size, st); - DAAL_CHECK_STATUS_RETURN_VOID_IF_FAIL(st); - _dest = subbuffer; - } - -private: - UniversalBuffer _src; - size_t _offset; - size_t _size; - - Buffer _dest; -}; - -/** - * - * \brief Reinterprets UniversalBuffer to host array of compile-time known type - */ -template -class BufferHostReinterpreter -{ -public: - BufferHostReinterpreter(const UniversalBuffer & src, const data_management::ReadWriteMode & mode, size_t size) - : _src(src), _mode(mode), _size(size) - {} - - SharedPtr getResult() { return _reinterpretedPtr; } - - template - void operator()(Typelist, Status & st) - { - DAAL_ASSERT_UNIVERSAL_BUFFER(_src, T, _size); - - auto buffer = _src.template get(); - auto ptr = buffer.toHost(_mode, st); - DAAL_CHECK_STATUS_RETURN_VOID_IF_FAIL(st); - - _reinterpretedPtr = reinterpretPointerCast(ptr); - } - -private: - UniversalBuffer _src; - data_management::ReadWriteMode _mode; - size_t _size; - SharedPtr 
_reinterpretedPtr; -}; - -/** - * - * \brief Allocate data by NumericTableFeature - */ -inline UniversalBuffer allocateByNumericTableFeature(const data_management::NumericTableFeature & feature, const size_t size, - services::Status & status) -{ - using namespace data_management; - auto & context = services::internal::getDefaultContext(); - UniversalBuffer buffer; - switch (feature.indexType) - { - case features::DAAL_INT8_U: - { - buffer = context.allocate(TypeId::uint8, size, status); - break; - } - case features::DAAL_INT16_U: - { - buffer = context.allocate(TypeId::uint16, size, status); - break; - } - case features::DAAL_INT32_U: - { - buffer = context.allocate(TypeId::uint32, size, status); - break; - } - case features::DAAL_INT64_U: - { - buffer = context.allocate(TypeId::uint64, size, status); - break; - } - - case features::DAAL_INT8_S: - { - buffer = context.allocate(TypeId::int8, size, status); - break; - } - case features::DAAL_INT16_S: - { - buffer = context.allocate(TypeId::int16, size, status); - break; - } - case features::DAAL_INT32_S: - { - buffer = context.allocate(TypeId::int32, size, status); - break; - } - case features::DAAL_INT64_S: - { - buffer = context.allocate(TypeId::int64, size, status); - break; - } - - case features::DAAL_FLOAT32: - { - buffer = context.allocate(TypeId::float32, size, status); - break; - } - case features::DAAL_FLOAT64: - { - buffer = context.allocate(TypeId::float64, size, status); - break; - } - - default: status = services::Status(services::ErrorIncorrectParameter); - } - return buffer; -} - -/** @} */ -} // namespace interface1 - -using interface1::BufferConverterFrom; -using interface1::BufferConverterTo; -using interface1::BufferHostReinterpreter; -using interface1::allocateByNumericTableFeature; - -} // namespace sycl -} // namespace internal -} // namespace services -} // namespace daal -/// \endcond - -#endif diff --git a/cpp/daal/include/services/internal/sycl/buffer_utils_sycl.h 
b/cpp/daal/include/services/internal/sycl/buffer_utils_sycl.h deleted file mode 100644 index fa7c68974db..00000000000 --- a/cpp/daal/include/services/internal/sycl/buffer_utils_sycl.h +++ /dev/null @@ -1,308 +0,0 @@ -/* file: buffer_utils_sycl.h */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#ifndef __DAAL_SERVICES_INTERNAL_SYCL_BUFFER_UTILS_SYCL_H__ -#define __DAAL_SERVICES_INTERNAL_SYCL_BUFFER_UTILS_SYCL_H__ - -#ifndef DAAL_SYCL_INTERFACE - #error "DAAL_SYCL_INTERFACE must be defined to include this file" -#endif - -#include "services/internal/sycl/types_utils.h" - -/// \cond INTERNAL -namespace daal -{ -namespace services -{ -namespace internal -{ -namespace sycl -{ -namespace interface1 -{ -class BufferAllocator -{ -private: -#ifdef DAAL_SYCL_INTERFACE_USM - struct UsmDeleter - { - ::sycl::queue queue; - - explicit UsmDeleter(const ::sycl::queue & q) : queue(q) {} - - void operator()(const void * ptr) const { ::sycl::free(const_cast(ptr), queue); } - }; - - struct AllocateUSMBacked - { - const ::sycl::queue & queue; - size_t bufferSize; - UniversalBuffer buffer; - - explicit AllocateUSMBacked(const ::sycl::queue & q, size_t size) : queue(q), bufferSize(size) {} - - template - void operator()(Typelist, Status & status) - { - T * usmPtr = 
::sycl::malloc_device(bufferSize, queue); - if (usmPtr == nullptr) - { - status |= services::ErrorMemoryAllocationFailed; - return; - } - services::SharedPtr usmSharedPtr(usmPtr, UsmDeleter { queue }); - buffer = services::internal::Buffer(usmSharedPtr, bufferSize, queue, status); - } - }; - - static UniversalBuffer allocateUSMBacked(const ::sycl::queue & q, TypeId type, size_t bufferSize, Status & status) - { - AllocateUSMBacked allocateOp(q, bufferSize); - TypeDispatcher::dispatch(type, allocateOp, status); - return allocateOp.buffer; - } -#endif - -public: - static UniversalBuffer allocate(const ::sycl::queue & q, TypeId type, size_t bufferSize, Status & status) - { -#ifdef DAAL_SYCL_INTERFACE_USM - return BufferAllocator::allocateUSMBacked(q, type, bufferSize, status); -#else - static_assert(false, "Allocations of sycl buffers are no longer supported"); -#endif // DAAL_SYCL_INTERFACE_USM - } -}; - -class BufferCopier -{ -private: - struct Execute - { - ::sycl::queue & queue; - UniversalBuffer & dstUnivers; - size_t dstOffset; - UniversalBuffer & srcUnivers; - size_t srcOffset; - size_t count; - - explicit Execute(::sycl::queue & queue, UniversalBuffer & dst, size_t desOffset, UniversalBuffer & src, size_t srcOffset, size_t count) - : queue(queue), dstUnivers(dst), dstOffset(desOffset), srcUnivers(src), srcOffset(srcOffset), count(count) - {} - -#ifdef DAAL_SYCL_INTERFACE_USM - template - Status copyOp(const Buffer & srcBuffer, const Buffer & dstBuffer) - { - using namespace ::sycl; - - Status status; - auto src = srcBuffer.toUSM(queue, data_management::readOnly, status); - DAAL_CHECK_STATUS_VAR(status); - - auto dst = dstBuffer.toUSM(queue, data_management::writeOnly, status); - DAAL_CHECK_STATUS_VAR(status); - - auto * src_raw = src.get() + srcOffset; - auto * dst_raw = dst.get() + dstOffset; - - const size_t bytes_count = sizeof(T) * count; - DAAL_ASSERT(bytes_count >= count); - - return catchSyclExceptions([&]() mutable { - auto event = queue.memcpy(dst_raw, 
src_raw, bytes_count); - event.wait_and_throw(); - }); - } -#endif - - template - void operator()(Typelist, Status & status) - { - DAAL_ASSERT_UNIVERSAL_BUFFER_TYPE(srcUnivers, T); - DAAL_ASSERT_UNIVERSAL_BUFFER_TYPE(dstUnivers, T); - - const auto & srcBuffer = srcUnivers.get(); - const auto & dstBuffer = dstUnivers.get(); - - DAAL_ASSERT(srcBuffer.size() >= srcOffset + count); - DAAL_ASSERT(dstBuffer.size() >= dstOffset + count); - -#ifdef DAAL_SYCL_INTERFACE_USM - status |= copyOp(srcBuffer, dstBuffer); -#else - static_assert(false, "Support of USM memory is required to copy data in service::Buffer"); -#endif - } - }; - -public: - static void copy(::sycl::queue & queue, UniversalBuffer & dest, size_t dstOffset, UniversalBuffer & src, size_t srcOffset, size_t count, - Status & status) - { - DAAL_ASSERT(!src.empty()); - DAAL_ASSERT(!dest.empty()); - DAAL_ASSERT(src.type() == dest.type()); - - Execute op(queue, dest, dstOffset, src, srcOffset, count); - TypeDispatcher::dispatch(dest.type(), op, status); - } -}; - -class ArrayCopier -{ -private: - struct Execute - { - ::sycl::queue & queue; - UniversalBuffer & dstUnivers; - size_t dstOffset; - void * srcArray; - size_t srcCount; - size_t srcOffset; - size_t count; - - explicit Execute(::sycl::queue & queue, UniversalBuffer & dst, size_t desOffset, void * src, size_t srcCount, size_t srcOffset, size_t count) - : queue(queue), dstUnivers(dst), dstOffset(desOffset), srcArray(src), srcCount(srcCount), srcOffset(srcOffset), count(count) - {} - -#ifdef DAAL_SYCL_INTERFACE_USM - template - Status copyOp(const T * src, const Buffer & dstBuffer) - { - using namespace ::sycl; - - Status status; - - auto sub = dstBuffer.getSubBuffer(dstOffset, count, status); - DAAL_CHECK_STATUS_VAR(status); - - { - // TODO: change to use toUSM() and queue.memcpy() - auto dst = sub.toHost(data_management::writeOnly, status); - DAAL_CHECK_STATUS_VAR(status); - - auto dst_raw = dst.get(); - - const size_t size = sizeof(T) * count; - 
DAAL_ASSERT(size >= count); - - int result = daal_memcpy_s(dst_raw, size, src, size); - if (result) - { - return services::ErrorMemoryCopyFailedInternal; - } - } - return status; - } -#endif - - template - void operator()(Typelist, Status & status) - { - DAAL_ASSERT_UNIVERSAL_BUFFER_TYPE(dstUnivers, T); - - auto src = (T *)srcArray; - const auto & dstBuffer = dstUnivers.get(); - - DAAL_ASSERT(srcArray); - DAAL_ASSERT(srcCount >= srcOffset + count); - DAAL_ASSERT(dstBuffer.size() >= dstOffset + count); - -#ifdef DAAL_SYCL_INTERFACE_USM - status |= copyOp(src, dstBuffer); -#else - static_assert(false, "Support of USM memory is required to copy data in service::Buffer"); -#endif - } - }; - -public: - static void copy(::sycl::queue & queue, UniversalBuffer & dest, size_t dstOffset, void * src, size_t srcCount, size_t srcOffset, size_t count, - Status & status) - { - DAAL_ASSERT(!dest.empty()); - - Execute op(queue, dest, dstOffset, src, srcCount, srcOffset, count); - TypeDispatcher::dispatch(dest.type(), op, status); - } -}; - -class BufferFiller -{ -private: - struct Execute - { - ::sycl::queue & queue; - UniversalBuffer & dstUnivers; - double value; - - explicit Execute(::sycl::queue & queue, UniversalBuffer & dest, double value) : queue(queue), dstUnivers(dest), value(value) {} - -#ifdef DAAL_SYCL_INTERFACE_USM - template - Status fillOp(const Buffer & dstBuffer) - { - Status status; - auto dstPtr = dstBuffer.toUSM(queue, data_management::writeOnly, status); - DAAL_CHECK_STATUS_VAR(status); - - return catchSyclExceptions([&]() mutable { - auto event = queue.fill(dstPtr.get(), static_cast(value), dstBuffer.size()); - event.wait_and_throw(); - }); - } -#endif - - template - void operator()(Typelist, Status & status) - { - DAAL_ASSERT_UNIVERSAL_BUFFER_TYPE(dstUnivers, T); - - const auto & dstBuffer = dstUnivers.get(); - -#ifdef DAAL_SYCL_INTERFACE_USM - status |= fillOp(dstBuffer); -#else - static_assert(false, "Support of USM memory is required to fill data in 
service::Buffer"); -#endif - } - }; - -public: - static void fill(::sycl::queue & queue, UniversalBuffer & dest, double value, Status & status) - { - DAAL_ASSERT(!dest.empty()); - - Execute op(queue, dest, value); - TypeDispatcher::dispatch(dest.type(), op, status); - } -}; -} // namespace interface1 - -using interface1::BufferAllocator; -using interface1::BufferCopier; -using interface1::BufferFiller; - -} // namespace sycl -} // namespace internal -} // namespace services -} // namespace daal -/// \endcond - -#endif diff --git a/cpp/daal/include/services/internal/sycl/error_handling_sycl.h b/cpp/daal/include/services/internal/sycl/error_handling_sycl.h deleted file mode 100644 index 290551107c8..00000000000 --- a/cpp/daal/include/services/internal/sycl/error_handling_sycl.h +++ /dev/null @@ -1,236 +0,0 @@ -/* file: error_handling_sycl.h */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -#ifndef __DAAL_SERVICES_INTERNAL_SYCL_ERROR_HANDLING_SYCL_H__ -#define __DAAL_SERVICES_INTERNAL_SYCL_ERROR_HANDLING_SYCL_H__ - -#ifndef DAAL_SYCL_INTERFACE - #error "DAAL_SYCL_INTERFACE must be defined to include this file" -#endif - -#include -#include - -#include "services/error_handling.h" -#ifndef DAAL_DISABLE_LEVEL_ZERO - #include "services/internal/sycl/level_zero_common.h" -#endif - -#define DAAL_CHECK_OPENCL(cl_error, status) \ - { \ - if (cl_error != CL_SUCCESS) \ - { \ - status |= convertOpenClErrorToErrorPtr(cl_error); \ - return; \ - } \ - } - -#ifndef DAAL_DISABLE_LEVEL_ZERO - #define DAAL_CHECK_LEVEL_ZERO(ze_error, status) \ - { \ - if (ze_error != ZE_RESULT_SUCCESS) \ - { \ - status |= convertLevelZeroErrorToErrorPtr(ze_error); \ - return; \ - } \ - } -#endif // DAAL_DISABLE_LEVEL_ZERO - -/// \cond INTERNAL -namespace daal -{ -namespace services -{ -namespace internal -{ -namespace sycl -{ -namespace interface1 -{ -inline String getOpenClErrorDescription(cl_int clError) -{ -#define OPENCL_ERROR_CASE(x) \ -case x: return String(#x); - switch (clError) - { - OPENCL_ERROR_CASE(CL_BUILD_PROGRAM_FAILURE); - OPENCL_ERROR_CASE(CL_COMPILER_NOT_AVAILABLE); - OPENCL_ERROR_CASE(CL_DEVICE_NOT_AVAILABLE); - OPENCL_ERROR_CASE(CL_DEVICE_NOT_FOUND); - OPENCL_ERROR_CASE(CL_IMAGE_FORMAT_MISMATCH); - OPENCL_ERROR_CASE(CL_IMAGE_FORMAT_NOT_SUPPORTED); - OPENCL_ERROR_CASE(CL_INVALID_ARG_INDEX); - OPENCL_ERROR_CASE(CL_INVALID_ARG_SIZE); - OPENCL_ERROR_CASE(CL_INVALID_ARG_VALUE); - OPENCL_ERROR_CASE(CL_INVALID_BINARY); - OPENCL_ERROR_CASE(CL_INVALID_BUFFER_SIZE); - OPENCL_ERROR_CASE(CL_INVALID_BUILD_OPTIONS); - OPENCL_ERROR_CASE(CL_INVALID_COMMAND_QUEUE); - OPENCL_ERROR_CASE(CL_INVALID_CONTEXT); - OPENCL_ERROR_CASE(CL_INVALID_DEVICE); - OPENCL_ERROR_CASE(CL_INVALID_DEVICE_TYPE); - OPENCL_ERROR_CASE(CL_INVALID_EVENT); - OPENCL_ERROR_CASE(CL_INVALID_EVENT_WAIT_LIST); - 
OPENCL_ERROR_CASE(CL_INVALID_GL_OBJECT); - OPENCL_ERROR_CASE(CL_INVALID_GLOBAL_OFFSET); - OPENCL_ERROR_CASE(CL_INVALID_HOST_PTR); - OPENCL_ERROR_CASE(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR); - OPENCL_ERROR_CASE(CL_INVALID_IMAGE_SIZE); - OPENCL_ERROR_CASE(CL_INVALID_KERNEL_NAME); - OPENCL_ERROR_CASE(CL_INVALID_KERNEL); - OPENCL_ERROR_CASE(CL_INVALID_KERNEL_ARGS); - OPENCL_ERROR_CASE(CL_INVALID_KERNEL_DEFINITION); - OPENCL_ERROR_CASE(CL_INVALID_MEM_OBJECT); - OPENCL_ERROR_CASE(CL_INVALID_OPERATION); - OPENCL_ERROR_CASE(CL_INVALID_PLATFORM); - OPENCL_ERROR_CASE(CL_INVALID_PROGRAM); - OPENCL_ERROR_CASE(CL_INVALID_PROGRAM_EXECUTABLE); - OPENCL_ERROR_CASE(CL_INVALID_QUEUE_PROPERTIES); - OPENCL_ERROR_CASE(CL_INVALID_SAMPLER); - OPENCL_ERROR_CASE(CL_INVALID_VALUE); - OPENCL_ERROR_CASE(CL_INVALID_WORK_DIMENSION); - OPENCL_ERROR_CASE(CL_INVALID_WORK_GROUP_SIZE); - OPENCL_ERROR_CASE(CL_INVALID_WORK_ITEM_SIZE); - OPENCL_ERROR_CASE(CL_MAP_FAILURE); - OPENCL_ERROR_CASE(CL_MEM_OBJECT_ALLOCATION_FAILURE); - OPENCL_ERROR_CASE(CL_MEM_COPY_OVERLAP); - OPENCL_ERROR_CASE(CL_OUT_OF_HOST_MEMORY); - OPENCL_ERROR_CASE(CL_OUT_OF_RESOURCES); - OPENCL_ERROR_CASE(CL_PROFILING_INFO_NOT_AVAILABLE); - } - return String("Unknown OpenCL error"); - -#undef OPENCL_ERROR_CASE -} - -inline ErrorPtr convertOpenClErrorToErrorPtr(cl_int clError) -{ - return Error::create(ErrorID::ErrorExecutionContext, ErrorDetailID::OpenCL, getOpenClErrorDescription(clError)); -} - -#ifndef DAAL_DISABLE_LEVEL_ZERO -inline String getLevelZeroErrorDescription(ze_result_t zeError) -{ - #define LEVEL_ZERO_ERROR_CASE(x) \ - case x: return String(#x); - switch (zeError) - { - LEVEL_ZERO_ERROR_CASE(ZE_RESULT_SUCCESS); - LEVEL_ZERO_ERROR_CASE(ZE_RESULT_NOT_READY); - LEVEL_ZERO_ERROR_CASE(ZE_RESULT_ERROR_DEVICE_LOST); - LEVEL_ZERO_ERROR_CASE(ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY); - LEVEL_ZERO_ERROR_CASE(ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY); - LEVEL_ZERO_ERROR_CASE(ZE_RESULT_ERROR_MODULE_BUILD_FAILURE); - 
LEVEL_ZERO_ERROR_CASE(ZE_RESULT_ERROR_MODULE_LINK_FAILURE); - LEVEL_ZERO_ERROR_CASE(ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS); - LEVEL_ZERO_ERROR_CASE(ZE_RESULT_ERROR_NOT_AVAILABLE); - LEVEL_ZERO_ERROR_CASE(ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE); - LEVEL_ZERO_ERROR_CASE(ZE_RESULT_ERROR_UNINITIALIZED); - LEVEL_ZERO_ERROR_CASE(ZE_RESULT_ERROR_UNSUPPORTED_VERSION); - LEVEL_ZERO_ERROR_CASE(ZE_RESULT_ERROR_UNSUPPORTED_FEATURE); - LEVEL_ZERO_ERROR_CASE(ZE_RESULT_ERROR_INVALID_ARGUMENT); - LEVEL_ZERO_ERROR_CASE(ZE_RESULT_ERROR_INVALID_NULL_HANDLE); - LEVEL_ZERO_ERROR_CASE(ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE); - LEVEL_ZERO_ERROR_CASE(ZE_RESULT_ERROR_INVALID_NULL_POINTER); - LEVEL_ZERO_ERROR_CASE(ZE_RESULT_ERROR_INVALID_SIZE); - LEVEL_ZERO_ERROR_CASE(ZE_RESULT_ERROR_UNSUPPORTED_SIZE); - LEVEL_ZERO_ERROR_CASE(ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT); - LEVEL_ZERO_ERROR_CASE(ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT); - LEVEL_ZERO_ERROR_CASE(ZE_RESULT_ERROR_INVALID_ENUMERATION); - LEVEL_ZERO_ERROR_CASE(ZE_RESULT_ERROR_UNSUPPORTED_ENUMERATION); - LEVEL_ZERO_ERROR_CASE(ZE_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT); - LEVEL_ZERO_ERROR_CASE(ZE_RESULT_ERROR_INVALID_NATIVE_BINARY); - LEVEL_ZERO_ERROR_CASE(ZE_RESULT_ERROR_INVALID_GLOBAL_NAME); - LEVEL_ZERO_ERROR_CASE(ZE_RESULT_ERROR_INVALID_KERNEL_NAME); - LEVEL_ZERO_ERROR_CASE(ZE_RESULT_ERROR_INVALID_FUNCTION_NAME); - LEVEL_ZERO_ERROR_CASE(ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION); - LEVEL_ZERO_ERROR_CASE(ZE_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION); - LEVEL_ZERO_ERROR_CASE(ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX); - LEVEL_ZERO_ERROR_CASE(ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE); - LEVEL_ZERO_ERROR_CASE(ZE_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE); - LEVEL_ZERO_ERROR_CASE(ZE_RESULT_ERROR_INVALID_MODULE_UNLINKED); - LEVEL_ZERO_ERROR_CASE(ZE_RESULT_ERROR_INVALID_COMMAND_LIST_TYPE); - LEVEL_ZERO_ERROR_CASE(ZE_RESULT_ERROR_OVERLAPPING_REGIONS); - LEVEL_ZERO_ERROR_CASE(ZE_RESULT_ERROR_UNKNOWN); - 
LEVEL_ZERO_ERROR_CASE(ZE_RESULT_FORCE_UINT32); - } - return String("Unknown LevelZero error"); - - #undef LEVEL_ZERO_ERROR_CASE -} - -inline ErrorPtr convertLevelZeroErrorToErrorPtr(ze_result_t zeError) -{ - return Error::create(ErrorID::ErrorExecutionContext, ErrorDetailID::LevelZero, getLevelZeroErrorDescription(zeError)); -} -#endif // DAAL_DISABLE_LEVEL_ZERO - -inline Status convertSyclExceptionToStatus(const std::exception & ex) -{ - return Error::create(ErrorID::ErrorExecutionContext, ErrorDetailID::Sycl, String(ex.what())); -} - -template -DAAL_FORCEINLINE auto catchSyclExceptions(Status & status, TryBody && tryBody, CatchBody && catchBody) -> decltype(tryBody()) -{ - try - { - return tryBody(); - } - catch (const std::bad_alloc &) - { - status |= ErrorMemoryAllocationFailed; - return catchBody(); - } - catch (const std::exception & ex) - { - status |= convertSyclExceptionToStatus(ex); - return catchBody(); - } - catch (...) - { - status |= UnknownError; - return catchBody(); - } -} - -template -DAAL_FORCEINLINE Status catchSyclExceptions(Body && body) -{ - Status status; - return catchSyclExceptions( - status, - [&]() { - body(); - return status; - }, - [&]() { return status; }); -} - -} // namespace interface1 - -using interface1::convertOpenClErrorToErrorPtr; -using interface1::convertSyclExceptionToStatus; -using interface1::catchSyclExceptions; - -} // namespace sycl -} // namespace internal -} // namespace services -} // namespace daal -/// \endcond - -#endif diff --git a/cpp/daal/include/services/internal/sycl/execution_context.h b/cpp/daal/include/services/internal/sycl/execution_context.h deleted file mode 100644 index d9a55b64d55..00000000000 --- a/cpp/daal/include/services/internal/sycl/execution_context.h +++ /dev/null @@ -1,458 +0,0 @@ -/* file: execution_context.h */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the 
"License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#ifndef __DAAL_SERVICES_INTERNAL_SYCL_EXECUTION_CONTEXT_H__ -#define __DAAL_SERVICES_INTERNAL_SYCL_EXECUTION_CONTEXT_H__ - -#include "services/error_handling.h" -#include "services/internal/any.h" -#include "services/internal/buffer.h" -#include "services/internal/sycl/types.h" -#include "services/internal/sycl/math/types.h" - -namespace daal -{ -namespace services -{ -namespace internal -{ -namespace sycl -{ -namespace interface1 -{ -/** @ingroup oneapi_internal - * @{ - */ - -/** - * - * \brief Local range of computations run in parallel - */ -class KernelRange : public Base -{ -public: - KernelRange() : _upper1(0), _upper2(0), _upper3(0), _dimensions(0) {} - - explicit KernelRange(size_t upper) : _upper1(upper), _upper2(1), _upper3(1), _dimensions(1) {} - - explicit KernelRange(size_t upper1, size_t upper2) : _upper1(upper1), _upper2(upper2), _upper3(1), _dimensions(2) {} - - explicit KernelRange(size_t upper1, size_t upper2, size_t upper3) : _upper1(upper1), _upper2(upper2), _upper3(upper3), _dimensions(3) {} - - size_t upper1() const { return _upper1; } - - size_t upper2() const { return _upper2; } - - size_t upper3() const { return _upper3; } - - size_t dimensions() const { return _dimensions; } - -private: - size_t _upper1; - size_t _upper2; - size_t _upper3; - - size_t _dimensions; -}; - -/** - * - * \brief Class containing local and global ranges - */ -class KernelNDRange : public Base 
-{ -public: - explicit KernelNDRange(size_t dimensions) : _dimensions(dimensions) {} - - void global(const KernelRange & range, Status & st) - { - if (_dimensions != range.dimensions()) - { - st |= ErrorIncorrectParameter; - return; - } - _globalRange = range; - } - - void local(const KernelRange & range, Status & st) - { - if (_dimensions != range.dimensions()) - { - st |= ErrorIncorrectParameter; - return; - } - _localRange = range; - } - - const KernelRange & local() { return _localRange; } - - const KernelRange & global() const { return _globalRange; } - - const KernelRange & local() const { return _localRange; } - - size_t dimensions() const { return _dimensions; } - -private: - KernelRange _globalRange; - KernelRange _localRange; - size_t _dimensions; -}; - -/** - * - * \brief Types of arguments can be passed into kernel - */ -class KernelArgumentTypes -{ -public: - enum Type - { - publicBuffer, - publicConstant, - privateBuffer - }; - -private: - KernelArgumentTypes(); -}; -typedef KernelArgumentTypes::Type KernelArgumentType; - -/** - * - * \brief Container for argument to be passed into kernel - */ -class KernelArgument : public Base -{ -public: - KernelArgument() : _dataType(TypeIds::custom), _argType(KernelArgumentTypes::publicConstant), _accessMode(AccessModeIds::read) {} - - template - void set(const T & value) - { - _dataType = TypeIds::id(); - _value = value; - _argType = KernelArgumentTypes::publicConstant; - _accessMode = AccessModeIds::read; - } - - template - void set(const Buffer & buffer, AccessModeId accessMode = AccessModeIds::read) - { - _dataType = TypeIds::id(); - _value = buffer; - _argType = KernelArgumentTypes::publicBuffer; - _accessMode = accessMode; - } - - void set(const UniversalBuffer & buffer, AccessModeId accessMode = AccessModeIds::read) - { - _dataType = buffer.type(); - _value = buffer.any(); - _argType = KernelArgumentTypes::publicBuffer; - _accessMode = accessMode; - } - - void set(const LocalBuffer & buffer) - { - 
_dataType = buffer.type(); - _value = buffer; - _argType = KernelArgumentTypes::privateBuffer; - _accessMode = AccessModeIds::readwrite; - } - - TypeId dataType() const { return _dataType; } - - AccessModeId accessMode() const { return _accessMode; } - - KernelArgumentType argType() const { return _argType; } - - template - const T & get() const - { - DAAL_ASSERT(_value.check()); - return _value.get(); - } - -private: - TypeId _dataType; - KernelArgumentType _argType; - AccessModeId _accessMode; - Any _value; -}; - -/** - * - * \brief Container for all kernel arguments - */ -class KernelArguments : public Base -{ -public: - KernelArguments() {} - - explicit KernelArguments(size_t argsNum, Status & status) : _args(argsNum) - { - DAAL_CHECK_COND_ERROR(_args.data(), status, ErrorMemoryAllocationFailed); - } - - template - void set(size_t index, const T & value) - { - _args[index].set(value); - } - - template - void set(size_t index, const Buffer & buffer, AccessModeId accessMode = AccessModeIds::read) - { - _args[index].set(buffer, accessMode); - } - - void set(size_t index, const UniversalBuffer & buffer, AccessModeId accessMode = AccessModeIds::read) { _args[index].set(buffer, accessMode); } - - void set(size_t index, const LocalBuffer & buffer) { _args[index].set(buffer); } - - const KernelArgument & get(size_t index) const { return _args[index]; } - - size_t size() const { return _args.size(); } - -private: - /* Disable copy & assignment */ - KernelArguments(const KernelArguments &); - KernelArguments & operator=(const KernelArguments &); - - Collection _args; -}; - -/* Forward declarations of possible kernel types */ -class OpenClKernel; - -/** - * - * \brief Interface for pushing kernel to run - */ -class KernelSchedulerIface -{ -public: - virtual ~KernelSchedulerIface() {} - - virtual void schedule(const OpenClKernel & kernel, const KernelRange & range, const KernelArguments & args, Status & st) = 0; - - virtual void schedule(const OpenClKernel & kernel, const 
KernelNDRange & range, const KernelArguments & args, Status & st) = 0; -}; - -/** - * - * \brief Interface for kernel - */ -class KernelIface -{ -public: - virtual ~KernelIface() {} - - virtual void schedule(KernelSchedulerIface & scheduler, const KernelRange & range, const KernelArguments & args, Status & st) const = 0; - - virtual void schedule(KernelSchedulerIface & scheduler, const KernelNDRange & range, const KernelArguments & args, Status & st) const = 0; -}; -typedef SharedPtr KernelPtr; - -/** - * - * \brief Interface for factory of kernels - */ -class ClKernelFactoryIface -{ -public: - virtual ~ClKernelFactoryIface() {} - virtual void build(ExecutionTargetId target, const char * key, const char * program, const char * options, Status & st) = 0; - virtual SharedPtr getKernel(const char * kernelName, Status & st) = 0; -}; - -/** - * - * \brief Struct containing device information - */ -struct InfoDevice -{ - size_t maxWorkGroupSize; - bool isCpu; - size_t maxMemAllocSize; - size_t globalMemSize; -}; - -/** - * - * \brief Interface of execution context - */ -class ExecutionContextIface -{ -public: - virtual ~ExecutionContextIface() {} - - virtual void run(const KernelRange & range, const KernelPtr & kernel, const KernelArguments & args, Status & st) = 0; - - virtual void run(const KernelNDRange & range, const KernelPtr & kernel, const KernelArguments & args, Status & st) = 0; - - virtual void gemm(math::Transpose transa, math::Transpose transb, size_t m, size_t n, size_t k, double alpha, const UniversalBuffer & a_buffer, - size_t lda, size_t offsetA, const UniversalBuffer & b_buffer, size_t ldb, size_t offsetB, double beta, - UniversalBuffer & c_buffer, size_t ldc, size_t offsetC, Status & st) = 0; - - virtual void syrk(math::UpLo upper_lower, math::Transpose trans, size_t n, size_t k, double alpha, const UniversalBuffer & a_buffer, size_t lda, - size_t offsetA, double beta, UniversalBuffer & c_buffer, size_t ldc, size_t offsetC, Status & st) = 0; - - virtual 
void axpy(const uint32_t n, const double a, const UniversalBuffer x_buffer, const int incx, const UniversalBuffer y_buffer, - const int incy, Status & st) = 0; - - virtual void potrf(math::UpLo uplo, size_t n, UniversalBuffer & a_buffer, size_t lda, Status & st) = 0; - - virtual void potrs(math::UpLo uplo, size_t n, size_t ny, UniversalBuffer & a_buffer, size_t lda, UniversalBuffer & b_buffer, size_t ldb, - Status & st) = 0; - - virtual void copy(UniversalBuffer dest, size_t desOffset, UniversalBuffer src, size_t srcOffset, size_t count, Status & st) = 0; - - virtual void fill(UniversalBuffer dest, double value, Status & st) = 0; - - virtual UniversalBuffer allocate(TypeId type, size_t bufferSize, Status & st) = 0; - - virtual ClKernelFactoryIface & getClKernelFactory() = 0; - - virtual InfoDevice & getInfoDevice() = 0; - - virtual void copy(UniversalBuffer dest, size_t desOffset, void * src, size_t srcCount, size_t srcOffset, size_t count, Status & st) = 0; -}; - -/** - * - * \brief Factory for host-only kernels - */ -class CpuKernelFactory : public Base, public ClKernelFactoryIface -{ -public: - virtual void build(ExecutionTargetId /*target*/, const char * /*key*/, const char * /*program*/, const char * /*options = ""*/, - Status & /*status*/) DAAL_C11_OVERRIDE - {} - virtual SharedPtr getKernel(const char * /*kernelName*/, Status & /*status*/) DAAL_C11_OVERRIDE { return SharedPtr(); } -}; - -/** - * - * \brief Host execution context - */ -class CpuExecutionContextImpl : public Base, public ExecutionContextIface -{ -public: - CpuExecutionContextImpl() - { - _infoDevice.isCpu = true; - _infoDevice.maxWorkGroupSize = 0; - _infoDevice.maxMemAllocSize = 0; - _infoDevice.globalMemSize = 0; - } - - void run(const KernelRange & /*range*/, const KernelPtr & /*kernel*/, const KernelArguments & /*args*/, Status & st) DAAL_C11_OVERRIDE - { - st |= ErrorMethodNotImplemented; - } - - void run(const KernelNDRange & /*range*/, const KernelPtr & /*kernel*/, const KernelArguments 
& /*args*/, Status & st) DAAL_C11_OVERRIDE - { - st |= ErrorMethodNotImplemented; - } - - void gemm(math::Transpose /*transa*/, math::Transpose /*transb*/, size_t /*m*/, size_t /*n*/, size_t /*k*/, double /*alpha*/, - const UniversalBuffer & /*a_buffer*/, size_t /*lda*/, size_t /*offsetA*/, const UniversalBuffer & /*b_buffer*/, size_t /*ldb*/, - size_t /*offsetB*/, double /*beta*/, UniversalBuffer & /*c_buffer*/, size_t /*ldc*/, size_t /*offsetC*/, Status & st) DAAL_C11_OVERRIDE - { - st |= ErrorMethodNotImplemented; - } - - void syrk(math::UpLo /*upper_lower*/, math::Transpose /*trans*/, size_t /*n*/, size_t /*k*/, double /*alpha*/, - const UniversalBuffer & /*a_buffer*/, size_t /*lda*/, size_t /*offsetA*/, double /*beta*/, UniversalBuffer & /*c_buffer*/, - size_t /*ldc*/, size_t /*offsetC*/, Status & st) DAAL_C11_OVERRIDE - { - st |= ErrorMethodNotImplemented; - } - - void axpy(const uint32_t /*n*/, const double /*a*/, const UniversalBuffer /*x_buffer*/, const int /*incx*/, const UniversalBuffer /*y_buffer*/, - const int /*incy*/, Status & st) DAAL_C11_OVERRIDE - { - st |= ErrorMethodNotImplemented; - } - - void potrf(math::UpLo /*uplo*/, size_t /*n*/, UniversalBuffer & /*a_buffer*/, size_t /*lda*/, Status & st) DAAL_C11_OVERRIDE - { - st |= ErrorMethodNotImplemented; - } - - void potrs(math::UpLo /*uplo*/, size_t /*n*/, size_t /*ny*/, UniversalBuffer & /*a_buffer*/, size_t /*lda*/, UniversalBuffer & /*b_buffer*/, - size_t /*ldb*/, Status & st) DAAL_C11_OVERRIDE - { - st |= ErrorMethodNotImplemented; - } - - void copy(UniversalBuffer /*dest*/, size_t /*desOffset*/, UniversalBuffer /*src*/, size_t /*srcOffset*/, size_t /*count*/, - Status & st) DAAL_C11_OVERRIDE - { - st |= ErrorMethodNotImplemented; - } - - void fill(UniversalBuffer /*dest*/, double /*value*/, Status & st) DAAL_C11_OVERRIDE { st |= ErrorMethodNotImplemented; } - - UniversalBuffer allocate(TypeId /*type*/, size_t /*bufferSize*/, Status & st) DAAL_C11_OVERRIDE - { - st |= ErrorMethodNotImplemented; 
- return UniversalBuffer(); - } - - ClKernelFactoryIface & getClKernelFactory() DAAL_C11_OVERRIDE { return _factory; } - - InfoDevice & getInfoDevice() DAAL_C11_OVERRIDE { return _infoDevice; } - - void copy(UniversalBuffer /*dest*/, size_t /*desOffset*/, void * /*src*/, size_t /*srcCount*/, size_t /*srcOffset*/, size_t /*count*/, - Status & st) DAAL_C11_OVERRIDE - { - st |= ErrorMethodNotImplemented; - } - -private: - CpuKernelFactory _factory; - InfoDevice _infoDevice; -}; - -/** @} */ -} // namespace interface1 - -using interface1::KernelRange; -using interface1::KernelNDRange; -using interface1::KernelArgumentTypes; -using interface1::KernelArgumentType; -using interface1::KernelArgument; -using interface1::KernelArguments; -using interface1::UniversalBuffer; -using interface1::KernelSchedulerIface; -using interface1::KernelIface; -using interface1::KernelPtr; -using interface1::ClKernelFactoryIface; -using interface1::InfoDevice; -using interface1::ExecutionContextIface; -using interface1::CpuKernelFactory; -using interface1::CpuExecutionContextImpl; - -} // namespace sycl -} // namespace internal -} // namespace services -} // namespace daal - -#endif diff --git a/cpp/daal/include/services/internal/sycl/execution_context_sycl.h b/cpp/daal/include/services/internal/sycl/execution_context_sycl.h deleted file mode 100644 index edff73f72bf..00000000000 --- a/cpp/daal/include/services/internal/sycl/execution_context_sycl.h +++ /dev/null @@ -1,308 +0,0 @@ -/* file: execution_context_sycl.h */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#ifndef __DAAL_SERVICES_INTERNAL_SYCL_EXECUTION_CONTEXT_SYCL_H__ -#define __DAAL_SERVICES_INTERNAL_SYCL_EXECUTION_CONTEXT_SYCL_H__ - -#ifndef DAAL_SYCL_INTERFACE - #error "DAAL_SYCL_INTERFACE must be defined to include this file" -#endif - -#include -#include -#include - -#include "services/daal_string.h" -#include "services/internal/hash_table.h" -#include "services/internal/sycl/execution_context.h" -#include "services/internal/sycl/kernel_scheduler_sycl.h" -#include "services/internal/sycl/error_handling_sycl.h" -#include "services/internal/sycl/math/blas_executor.h" -#include "services/internal/sycl/math/lapack_executor.h" - -/// \cond INTERNAL -namespace daal -{ -namespace services -{ -namespace internal -{ -namespace sycl -{ -namespace interface1 -{ -class OpenClKernelFactory : public Base, public ClKernelFactoryIface -{ -public: - explicit OpenClKernelFactory(::sycl::queue & deviceQueue) - : _currentProgramRef(nullptr), _executionTarget(ExecutionTargetIds::unspecified), _deviceQueue(deviceQueue) - {} - - void build(ExecutionTargetId target, const char * name, const char * program, const char * options, Status & status) DAAL_C11_OVERRIDE - { - DAAL_ASSERT(name); - DAAL_ASSERT(program); - - String key = name; - DAAL_CHECK_COND_ERROR(key.c_str(), status, ErrorMemoryAllocationFailed); - DAAL_CHECK_STATUS_RETURN_VOID_IF_FAIL(status); - - const bool res = programHashTable.contain(key, status); - DAAL_CHECK_STATUS_RETURN_VOID_IF_FAIL(status); - - if (!res) - { -#ifndef 
DAAL_DISABLE_LEVEL_ZERO - const bool isOpenCLBackendAvailable = !_deviceQueue.get_device().template get_info< ::sycl::info::device::opencl_c_version>().empty(); - if (isOpenCLBackendAvailable) - { -#endif // DAAL_DISABLE_LEVEL_ZERO - - // OpenCl branch - auto programPtr = - OpenClProgramRef::create(::sycl::get_native< ::sycl::backend::opencl>(_deviceQueue.get_context()), - ::sycl::get_native< ::sycl::backend::opencl>(_deviceQueue.get_device()), name, program, options, status); - DAAL_CHECK_STATUS_RETURN_VOID_IF_FAIL(status); - - programHashTable.add(key, programPtr, status); - DAAL_CHECK_STATUS_RETURN_VOID_IF_FAIL(status); - - _currentProgramRef = programPtr.get(); - -#ifndef DAAL_DISABLE_LEVEL_ZERO - } - else - { - // Level zero branch - if (nullptr == _levelZeroOpenClInteropContext.getOpenClDeviceRef().get()) - { - _levelZeroOpenClInteropContext.reset(_deviceQueue, status); - DAAL_CHECK_STATUS_RETURN_VOID_IF_FAIL(status); - } - - auto programPtr = - OpenClProgramRef::create(_levelZeroOpenClInteropContext.getOpenClContextRef().get(), - _levelZeroOpenClInteropContext.getOpenClDeviceRef().get(), _deviceQueue, name, program, options, status); - DAAL_CHECK_STATUS_RETURN_VOID_IF_FAIL(status); - - programHashTable.add(key, programPtr, status); - DAAL_CHECK_STATUS_RETURN_VOID_IF_FAIL(status); - - _currentProgramRef = programPtr.get(); - } -#endif // DAAL_DISABLE_LEVEL_ZERO - } - else - { - _currentProgramRef = programHashTable.get(key, status).get(); - DAAL_CHECK_STATUS_RETURN_VOID_IF_FAIL(status); - } - - _executionTarget = target; - } - - KernelPtr getKernel(const char * kernelName, Status & status) DAAL_C11_OVERRIDE - { - if (!_currentProgramRef) - { - status |= ErrorExecutionContext; - return KernelPtr(); - } - - String kernelNameStr = kernelName; - DAAL_CHECK_COND_ERROR(kernelNameStr.c_str(), status, ErrorMemoryAllocationFailed); - DAAL_CHECK_STATUS_RETURN_IF_FAIL(status, KernelPtr()); - - String key = _currentProgramRef->getName(); - 
DAAL_CHECK_COND_ERROR(key.c_str(), status, ErrorMemoryAllocationFailed); - DAAL_CHECK_STATUS_RETURN_IF_FAIL(status, KernelPtr()); - - key.add(kernelNameStr); - DAAL_CHECK_COND_ERROR(key.c_str(), status, ErrorMemoryAllocationFailed); - DAAL_CHECK_STATUS_RETURN_IF_FAIL(status, KernelPtr()); - - bool res = kernelHashTable.contain(key, status); - DAAL_CHECK_STATUS_RETURN_IF_FAIL(status, KernelPtr()); - - if (res) - { - auto kernel = kernelHashTable.get(key, status); - DAAL_CHECK_STATUS_RETURN_IF_FAIL(status, KernelPtr()); - - return kernel; - } - else - { - KernelPtr kernel; -#ifndef DAAL_DISABLE_LEVEL_ZERO - const bool isOpenCLBackendAvailable = !_deviceQueue.get_device().template get_info< ::sycl::info::device::opencl_c_version>().empty(); - if (isOpenCLBackendAvailable) - { -#endif // DAAL_DISABLE_LEVEL_ZERO - - // OpenCL branch - auto kernelRef = OpenClKernelRef(_currentProgramRef->get(), kernelNameStr, status); - DAAL_CHECK_STATUS_RETURN_IF_FAIL(status, KernelPtr()); - - kernel = OpenClKernelNative::create(_executionTarget, *_currentProgramRef, kernelRef, status); - DAAL_CHECK_STATUS_RETURN_IF_FAIL(status, KernelPtr()); - -#ifndef DAAL_DISABLE_LEVEL_ZERO - } - else - { - // Level zero branch - auto kernelRef = OpenClKernelLevelZeroRef(*_currentProgramRef, kernelNameStr, status); - DAAL_CHECK_STATUS_RETURN_IF_FAIL(status, KernelPtr()); - - kernel = OpenClKernelLevelZero::create(_executionTarget, *_currentProgramRef, kernelRef, status); - DAAL_CHECK_STATUS_RETURN_IF_FAIL(status, KernelPtr()); - } -#endif // DAAL_DISABLE_LEVEL_ZERO - kernelHashTable.add(key, kernel, status); - DAAL_CHECK_STATUS_RETURN_IF_FAIL(status, KernelPtr()); - - return kernel; - } - } - -private: - static const size_t SIZE_HASHTABLE_PROGRAM = 1024; - static const size_t SIZE_HASHTABLE_KERNEL = 4096; - HashTable programHashTable; - HashTable kernelHashTable; - - OpenClProgramRef * _currentProgramRef; -#ifndef DAAL_DISABLE_LEVEL_ZERO - LevelZeroOpenClInteropContext _levelZeroOpenClInteropContext; 
-#endif // DAAL_DISABLE_LEVEL_ZERO - - ExecutionTargetId _executionTarget; - ::sycl::queue & _deviceQueue; -}; - -class [[deprecated("CPP SYCL interfaces have been removed as of 2024.0 release.")]] SyclExecutionContextImpl : public Base, - public ExecutionContextIface -{ -public: - explicit SyclExecutionContextImpl(const ::sycl::queue & deviceQueue, const bool fromPython = false) - : _deviceQueue(deviceQueue), _kernelFactory(_deviceQueue), _kernelScheduler(_deviceQueue) - { - if (!fromPython) - { - throw std::runtime_error("CPP SYCL interfaces have been removed as of 2024.0 release."); - } - const auto & device = _deviceQueue.get_device(); - _infoDevice.isCpu = device.is_cpu(); - _infoDevice.maxWorkGroupSize = device.get_info< ::sycl::info::device::max_work_group_size>(); - _infoDevice.maxMemAllocSize = device.get_info< ::sycl::info::device::max_mem_alloc_size>(); - _infoDevice.globalMemSize = device.get_info< ::sycl::info::device::global_mem_size>(); - } - - void run(const KernelRange & range, const KernelPtr & kernel, const KernelArguments & args, Status & status) DAAL_C11_OVERRIDE - { - kernel->schedule(_kernelScheduler, range, args, status); - } - - void run(const KernelNDRange & range, const KernelPtr & kernel, const KernelArguments & args, Status & status) DAAL_C11_OVERRIDE - { - kernel->schedule(_kernelScheduler, range, args, status); - } - - void gemm(math::Transpose transa, math::Transpose transb, size_t m, size_t n, size_t k, double alpha, const UniversalBuffer & a_buffer, - size_t lda, size_t offsetA, const UniversalBuffer & b_buffer, size_t ldb, size_t offsetB, double beta, UniversalBuffer & c_buffer, - size_t ldc, size_t offsetC, Status & status) DAAL_C11_OVERRIDE - { - math::GemmExecutor::run(_deviceQueue, transa, transb, m, n, k, alpha, a_buffer, lda, offsetA, b_buffer, ldb, offsetB, beta, c_buffer, ldc, - offsetC, status); - } - - void syrk(math::UpLo upper_lower, math::Transpose trans, size_t n, size_t k, double alpha, const UniversalBuffer & 
a_buffer, size_t lda, - size_t offsetA, double beta, UniversalBuffer & c_buffer, size_t ldc, size_t offsetC, Status & status) DAAL_C11_OVERRIDE - { - math::SyrkExecutor::run(_deviceQueue, upper_lower, trans, n, k, alpha, a_buffer, lda, offsetA, beta, c_buffer, ldc, offsetC, status); - } - - void axpy(const uint32_t n, const double a, const UniversalBuffer x_buffer, const int incx, const UniversalBuffer y_buffer, const int incy, - Status & status) DAAL_C11_OVERRIDE - { - math::AxpyExecutor::run(_deviceQueue, n, a, x_buffer, incx, y_buffer, incy, status); - } - - void potrf(math::UpLo uplo, size_t n, UniversalBuffer & a_buffer, size_t lda, Status & status) DAAL_C11_OVERRIDE - { - math::PotrfExecutor::run(_deviceQueue, uplo, n, a_buffer, lda, status); - } - - void potrs(math::UpLo uplo, size_t n, size_t ny, UniversalBuffer & a_buffer, size_t lda, UniversalBuffer & b_buffer, size_t ldb, Status & status) - DAAL_C11_OVERRIDE - { - math::PotrsExecutor::run(_deviceQueue, uplo, n, ny, a_buffer, lda, b_buffer, ldb, status); - } - - UniversalBuffer allocate(TypeId type, size_t bufferSize, Status & status) DAAL_C11_OVERRIDE - { - return BufferAllocator::allocate(_deviceQueue, type, bufferSize, status); - } - - void copy(UniversalBuffer dest, size_t desOffset, UniversalBuffer src, size_t srcOffset, size_t count, Status & status) DAAL_C11_OVERRIDE - { - BufferCopier::copy(_deviceQueue, dest, desOffset, src, srcOffset, count, status); - } - - void copy(UniversalBuffer dest, size_t desOffset, void * src, size_t srcCount, size_t srcOffset, size_t count, Status & status) DAAL_C11_OVERRIDE - { - ArrayCopier::copy(_deviceQueue, dest, desOffset, src, srcCount, srcOffset, count, status); - } - - void fill(UniversalBuffer dest, double value, Status & status) DAAL_C11_OVERRIDE - { - BufferFiller::fill(_deviceQueue, dest, value, status); - } - - ClKernelFactoryIface & getClKernelFactory() DAAL_C11_OVERRIDE - { - return _kernelFactory; - } - - InfoDevice & getInfoDevice() DAAL_C11_OVERRIDE 
- { - return _infoDevice; - } - - const ::sycl::queue & getQueue() const - { - return _deviceQueue; - } - -private: - ::sycl::queue _deviceQueue; - OpenClKernelFactory _kernelFactory; - SyclKernelScheduler _kernelScheduler; - InfoDevice _infoDevice; -}; -} // namespace interface1 - -using interface1::SyclExecutionContextImpl; - -} // namespace sycl -} // namespace internal -} // namespace services -} // namespace daal -/// \endcond - -#endif diff --git a/cpp/daal/include/services/internal/sycl/kernel_scheduler_sycl.h b/cpp/daal/include/services/internal/sycl/kernel_scheduler_sycl.h deleted file mode 100644 index b0a8daabe97..00000000000 --- a/cpp/daal/include/services/internal/sycl/kernel_scheduler_sycl.h +++ /dev/null @@ -1,662 +0,0 @@ -/* file: kernel_scheduler_sycl.h */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -#ifndef __DAAL_SERVICES_INTERNAL_SYCL_KERNEL_SCHEDULER_SYCL_H__ -#define __DAAL_SERVICES_INTERNAL_SYCL_KERNEL_SCHEDULER_SYCL_H__ - -#ifndef DAAL_SYCL_INTERFACE - #error "DAAL_SYCL_INTERFACE must be defined to include this file" -#endif - -#include - -#include -#include - -#include "services/internal/sycl/error_handling_sycl.h" -#include "services/internal/sycl/execution_context.h" -#include "services/internal/sycl/buffer_utils_sycl.h" - -#ifndef DAAL_DISABLE_LEVEL_ZERO - #include "services/internal/sycl/level_zero_module_sycl.h" -#endif - -/// \cond INTERNAL -namespace daal -{ -namespace services -{ -namespace internal -{ -namespace sycl -{ -namespace interface1 -{ -template -class OpenClResourceRef : public Base -{ -public: - OpenClResourceRef() : _resource(nullptr) {} - - OpenClResourceRef(OpenClType & resource) : _resource(resource) {} - - OpenClResourceRef(const OpenClResourceRef & other) - { - _resource = other._resource; - OpenClRetain()(_resource); - } - - OpenClResourceRef(OpenClResourceRef && other) { _resource = other.release(); } - - ~OpenClResourceRef() { reset(); } - - operator bool() const { return get() == nullptr; } - - OpenClResourceRef & operator=(OpenClResourceRef other) { return swap(other); } - - OpenClResourceRef & operator=(OpenClResourceRef && other) - { - _resource = other.release(); - return *this; - } - - OpenClResourceRef & swap(OpenClResourceRef & other) - { - OpenClType tmp = _resource; - _resource = other._resource; - other._resource = tmp; - return *this; - } - - OpenClType release() - { - OpenClType tmp = _resource; - _resource = nullptr; - return tmp; - } - - void reset(OpenClType resource = nullptr) - { - OpenClRelease()(_resource); - _resource = resource; - } - - OpenClType get() const { return _resource; } - -private: - OpenClType _resource; -}; - -#define DAAL_DECLARE_OPENCL_OPERATOR(type_, name_) \ - struct OpenCl##name_ \ - { \ - void 
operator()(type_ p) { cl##name_(p); } \ - } - -DAAL_DECLARE_OPENCL_OPERATOR(cl_program, RetainProgram); -DAAL_DECLARE_OPENCL_OPERATOR(cl_program, ReleaseProgram); -DAAL_DECLARE_OPENCL_OPERATOR(cl_kernel, RetainKernel); -DAAL_DECLARE_OPENCL_OPERATOR(cl_kernel, ReleaseKernel); -DAAL_DECLARE_OPENCL_OPERATOR(cl_context, RetainContext); -DAAL_DECLARE_OPENCL_OPERATOR(cl_context, ReleaseContext); -DAAL_DECLARE_OPENCL_OPERATOR(cl_device_id, RetainDevice); -DAAL_DECLARE_OPENCL_OPERATOR(cl_device_id, ReleaseDevice); - -#undef DAAL_DECLARE_OPENCL_OPERATOR - -typedef OpenClResourceRef OpenClDeviceRef; - -class OpenClContextRef : public OpenClResourceRef -{ -public: - OpenClContextRef() = default; - - explicit OpenClContextRef(cl_device_id clDevice, Status & status) : _clDeviceRef(clDevice) - { - cl_int err = 0; - reset(clCreateContext(nullptr, 1, &clDevice, nullptr, nullptr, &err)); - DAAL_CHECK_OPENCL(err, status) - } - - OpenClDeviceRef getDeviceRef() { return _clDeviceRef; } - -private: - OpenClDeviceRef _clDeviceRef; -}; - -#ifndef DAAL_DISABLE_LEVEL_ZERO -class LevelZeroOpenClInteropContext : public Base -{ -public: - LevelZeroOpenClInteropContext() = default; - - LevelZeroOpenClInteropContext(const LevelZeroOpenClInteropContext &) = delete; - - explicit LevelZeroOpenClInteropContext(::sycl::queue & deviceQueue, Status & status) { reset(deviceQueue, status); } - - void reset(::sycl::queue & deviceQueue, Status & status) - { - cl_device_id clDevice; - findDevice(&clDevice, deviceQueue.get_device().get_info< ::sycl::info::device::vendor_id>(), - deviceQueue.get_device().get_info< ::sycl::info::device::max_clock_frequency>(), status); - DAAL_CHECK_STATUS_RETURN_VOID_IF_FAIL(status); - - _clDeviceRef.reset(clDevice); - - cl_int err = 0; - _clContextRef.reset(clCreateContext(nullptr, 1, &clDevice, nullptr, nullptr, &err)); - DAAL_CHECK_OPENCL(err, status) - } - - void findDevice(cl_device_id * pClDevice, unsigned int vendor_id, unsigned int frq, Status & status) - { - 
constexpr cl_uint maxPlatforms = 16; - cl_platform_id platIds[maxPlatforms]; - cl_uint nplat, ndev; - - DAAL_CHECK_OPENCL(clGetPlatformIDs(maxPlatforms, platIds, &nplat), status); - - for (cl_uint pidx = 0; pidx < nplat && pidx < maxPlatforms; pidx++) - { - if (clGetDeviceIDs(platIds[pidx], CL_DEVICE_TYPE_GPU, 1, pClDevice, &ndev) == CL_SUCCESS) - { - if (ndev > 0) - { - cl_uint dVid = 0; - clGetDeviceInfo(*pClDevice, CL_DEVICE_VENDOR_ID, sizeof(cl_uint), &dVid, nullptr); - - cl_uint dFrq = 0; - clGetDeviceInfo(*pClDevice, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(cl_uint), &dFrq, nullptr); - - if (dVid == vendor_id && dFrq == frq) return; - } - } - } - - status |= ErrorDeviceSupportNotImplemented; - } - - OpenClDeviceRef & getOpenClDeviceRef() { return _clDeviceRef; } - OpenClContextRef & getOpenClContextRef() { return _clContextRef; } - -private: - OpenClContextRef _clContextRef; - OpenClDeviceRef _clDeviceRef; -}; -#endif // DAAL_DISABLE_LEVEL_ZERO - -class OpenClProgramRef : public OpenClResourceRef -{ -public: - static SharedPtr create(cl_context clContext, cl_device_id clDevice, const char * programName, const char * programSrc, - const char * options, Status & status) - { - auto ptr = new OpenClProgramRef(); - if (!ptr) - { - status |= ErrorMemoryAllocationFailed; - return SharedPtr(); - } - ptr->initOpenClProgramRef(clContext, clDevice, programName, programSrc, options, status); - return SharedPtr(ptr); - } - -#ifndef DAAL_DISABLE_LEVEL_ZERO - static SharedPtr create(cl_context clContext, cl_device_id clDevice, ::sycl::queue & deviceQueue, const char * programName, - const char * programSrc, const char * options, Status & status) - { - auto ptr = new OpenClProgramRef(); - if (!ptr) - { - status |= ErrorMemoryAllocationFailed; - return SharedPtr(); - } - ptr->initOpenClProgramRef(clContext, clDevice, programName, programSrc, options, status); - DAAL_CHECK_STATUS_RETURN_IF_FAIL(status, SharedPtr()); - - ptr->initModuleLevelZero(deviceQueue, status); - return 
SharedPtr(ptr); - } -#endif // DAAL_DISABLE_LEVEL_ZERO - -#ifndef DAAL_DISABLE_LEVEL_ZERO - ZeModulePtr getModuleLevelZeroPtr() const - { - return _moduleLevelZeroPtr; - } -#endif // DAAL_DISABLE_LEVEL_ZERO - - const String & getName() const - { - return _programName; - } - -private: - OpenClProgramRef() = default; - - void initOpenClProgramRef(cl_context clContext, cl_device_id clDevice, const char * programName, const char * programSrc, const char * options, - Status & status) - { - _programName = programName; - DAAL_CHECK_COND_ERROR(_programName.c_str(), status, ErrorMemoryAllocationFailed); - DAAL_CHECK_STATUS_RETURN_VOID_IF_FAIL(status); - - cl_int err = 0; - const char * sources[] = { programSrc }; - const size_t lengths[] = { std::strlen(programSrc) }; - reset(clCreateProgramWithSource(clContext, 1, sources, lengths, &err)); - DAAL_CHECK_OPENCL(err, status) - - err = clBuildProgram(get(), 1, &clDevice, options, nullptr, nullptr); - - DAAL_ASSERT_DECL(if (err == CL_BUILD_PROGRAM_FAILURE) { - size_t logLen = 0; - clGetProgramBuildInfo(get(), clDevice, CL_PROGRAM_BUILD_LOG, 0, nullptr, &logLen); - String buildLog(logLen); - if (buildLog.c_str()) - { - clGetProgramBuildInfo(get(), clDevice, CL_PROGRAM_BUILD_LOG, logLen, (void *)(buildLog.c_str()), nullptr); - fprintf(stderr, "Failed to build OpenCL program (%d):\n%s", err, buildLog.c_str()); - } - }) - - DAAL_CHECK_OPENCL(err, status) - } - -#ifndef DAAL_DISABLE_LEVEL_ZERO - void initModuleLevelZero(::sycl::queue & deviceQueue, Status & status) - { - size_t binarySize = 0; - DAAL_CHECK_OPENCL(clGetProgramInfo(get(), CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binarySize, nullptr), status); - - Collection binaryCollection(binarySize); - DAAL_CHECK_COND_ERROR(binaryCollection.data(), status, ErrorMemoryAllocationFailed); - DAAL_CHECK_STATUS_RETURN_VOID_IF_FAIL(status); - - byte * binary = binaryCollection.data(); - DAAL_CHECK_OPENCL(clGetProgramInfo(get(), CL_PROGRAM_BINARIES, sizeof(binary), &binary, nullptr), 
status); - - _moduleLevelZeroPtr = ZeModule::create(deviceQueue, binarySize, binary, status); - DAAL_CHECK_STATUS_RETURN_VOID_IF_FAIL(status); - } -#endif // DAAL_DISABLE_LEVEL_ZERO - -private: - String _programName; - -#ifndef DAAL_DISABLE_LEVEL_ZERO - ZeModulePtr _moduleLevelZeroPtr; -#endif -}; - -class OpenClKernelRef : public OpenClResourceRef -{ -public: - OpenClKernelRef() = default; - - explicit OpenClKernelRef(cl_program clProgram, const String & kernelName, Status & status) - { - cl_int err = 0; - reset(clCreateKernel(clProgram, kernelName.c_str(), &err)); - DAAL_CHECK_OPENCL(err, status) - } -}; - -#ifndef DAAL_DISABLE_LEVEL_ZERO -class OpenClKernelLevelZeroRef : public Base -{ -public: - OpenClKernelLevelZeroRef() = default; - - explicit OpenClKernelLevelZeroRef(const OpenClProgramRef & programRef, const String & kernelName, Status & status) - { - _kernelLevelZeroPtr = programRef.getModuleLevelZeroPtr()->createKernel(kernelName.c_str(), status); - DAAL_CHECK_STATUS_RETURN_VOID_IF_FAIL(status); - } - - ZeKernelPtr getKernelLevelZeroPtr() const { return _kernelLevelZeroPtr; } - -private: - ZeKernelPtr _kernelLevelZeroPtr; -}; -#endif // DAAL_DISABLE_LEVEL_ZERO - -class OpenClKernel : public Base, public KernelIface -{ -public: - explicit OpenClKernel(ExecutionTargetId executionTarget, const OpenClProgramRef & programRef) - : _executionTarget(executionTarget), _clProgramRef(programRef) - {} - - void schedule(KernelSchedulerIface & scheduler, const KernelRange & range, const KernelArguments & args, Status & status) const DAAL_C11_OVERRIDE - { - scheduler.schedule(*this, range, args, status); - } - - void schedule(KernelSchedulerIface & scheduler, const KernelNDRange & range, const KernelArguments & args, - Status & status) const DAAL_C11_OVERRIDE - { - scheduler.schedule(*this, range, args, status); - } - - ExecutionTargetId getTarget() const { return _executionTarget; } - - virtual ::sycl::kernel toSycl(const ::sycl::context & ctx) const = 0; - - const 
OpenClProgramRef & getProgramRef() const { return _clProgramRef; } - -private: - ExecutionTargetId _executionTarget; - OpenClProgramRef _clProgramRef; -}; - -class OpenClKernelNative : public OpenClKernel -{ -public: - static SharedPtr create(ExecutionTargetId executionTarget, const OpenClProgramRef & programRef, - const OpenClKernelRef & kernelRef, Status & status) - { - auto * ptr = new OpenClKernelNative(executionTarget, programRef, kernelRef); - if (!ptr) status |= ErrorMemoryAllocationFailed; - return SharedPtr(ptr); - } - - ::sycl::kernel toSycl(const ::sycl::context & ctx) const DAAL_C11_OVERRIDE - { - return ::sycl::make_kernel< ::sycl::backend::opencl>(_clKernelRef.get(), ctx); - } - -private: - explicit OpenClKernelNative(ExecutionTargetId executionTarget, const OpenClProgramRef & programRef, const OpenClKernelRef & kernelRef) - : OpenClKernel(executionTarget, programRef), _clKernelRef(kernelRef) - {} - - OpenClKernelRef _clKernelRef; -}; - -#ifndef DAAL_DISABLE_LEVEL_ZERO -class OpenClKernelLevelZero : public OpenClKernel -{ -public: - static SharedPtr create(ExecutionTargetId executionTarget, const OpenClProgramRef & programRef, - const OpenClKernelLevelZeroRef & kernelRef, Status & status) - { - auto * ptr = new OpenClKernelLevelZero(executionTarget, programRef, kernelRef); - if (!ptr) status |= ErrorMemoryAllocationFailed; - return SharedPtr(ptr); - } - - ::sycl::kernel toSycl(const ::sycl::context & ctx) const DAAL_C11_OVERRIDE - { - using namespace ::sycl; - kernel_bundle _kernelBundle = make_kernel_bundle( - { getProgramRef().getModuleLevelZeroPtr()->get(), ext::oneapi::level_zero::ownership::keep }, ctx); - return make_kernel( - { _kernelBundle, _zeKernelRef.getKernelLevelZeroPtr()->get(), ext::oneapi::level_zero::ownership::keep }, ctx); - } - -private: - OpenClKernelLevelZero(ExecutionTargetId executionTarget, const OpenClProgramRef & programRef, const OpenClKernelLevelZeroRef & kernelRef) - : OpenClKernel(executionTarget, programRef), 
_zeKernelRef(kernelRef) - {} - - OpenClKernelLevelZeroRef _zeKernelRef; -}; -#endif // DAAL_DISABLE_LEVEL_ZERO - -class UsmPointerStorage -{ -public: - UsmPointerStorage() = default; - UsmPointerStorage(const UsmPointerStorage &) = delete; - UsmPointerStorage & operator=(const UsmPointerStorage &) = delete; - - template - bool add(const SharedPtr & usmPointer) - { - return _pointers.safe_push_back(Any(usmPointer)); - } - -private: - Collection _pointers; -}; - -class SyclKernelSchedulerArgHandler -{ -public: - SyclKernelSchedulerArgHandler(::sycl::queue & queue, ::sycl::handler & handler, UsmPointerStorage & storage, size_t argumentIndex, - const KernelArgument & arg) - : _queue(queue), _handler(handler), _storage(storage), _argumentIndex(argumentIndex), _argument(arg) - {} - - template - void operator()(Typelist, Status & status) - { - switch (_argument.argType()) - { - case KernelArgumentTypes::publicBuffer: return handlePublicBuffer(status); - case KernelArgumentTypes::privateBuffer: return handlePrivateBuffer(status); - case KernelArgumentTypes::publicConstant: return handlePublicConstant(status); - default: DAAL_ASSERT(!"Unexpected kernel argument type"); - } - } - -private: - template - void handlePublicBuffer(Status & status) - { - auto service_buffer = _argument.get >(); -#ifdef DAAL_SYCL_INTERFACE_USM - switch (_argument.accessMode()) - { - case AccessModeIds::read: return handlePublicBuffer(service_buffer, status); - - case AccessModeIds::write: return handlePublicBuffer(service_buffer, status); - - case AccessModeIds::readwrite: return handlePublicBuffer(service_buffer, status); - - default: DAAL_ASSERT(!"Unexpected buffer access mode"); - } -#else - static_assert(false, "USM memory support is required for kernel execution"); -#endif - } - -#ifdef DAAL_SYCL_INTERFACE_USM - template - void handlePublicBuffer(Buffer & buffer, Status & status) - { - auto shared_pointer = buffer.toUSM(_queue, mode, status); - DAAL_CHECK_STATUS_RETURN_VOID_IF_FAIL(status); - 
- // Note: we need this storage to keep all usm shared pointers alive - // while the kernel is running - if (!_storage.add(shared_pointer)) - { - status |= ErrorMemoryAllocationFailed; - return; - } - - _handler.set_arg((int)_argumentIndex, shared_pointer.get()); - } -#endif - - template - void handlePrivateBuffer(Status & status) - { - DAAL_ASSERT(!"Local buffers are not supported"); - } - - template - void handlePublicConstant(Status & status) - { - T value = _argument.get(); - _handler.set_arg((int)_argumentIndex, value); - } - - ::sycl::queue & _queue; - ::sycl::handler & _handler; - UsmPointerStorage & _storage; - size_t _argumentIndex; - const KernelArgument & _argument; -}; - -template -inline ::sycl::range convertToSyclRange(const KernelRange &); - -template <> -inline ::sycl::range<1> convertToSyclRange<1>(const KernelRange & r) -{ - return ::sycl::range<1>(r.upper1()); -} - -template <> -inline ::sycl::range<2> convertToSyclRange<2>(const KernelRange & r) -{ -#ifdef DAAL_SYCL_INTERFACE_REVERSED_RANGE - return ::sycl::range<2>(r.upper2(), r.upper1()); -#else - return ::sycl::range<2>(r.upper1(), r.upper2()); -#endif -} - -template <> -inline ::sycl::range<3> convertToSyclRange<3>(const KernelRange & r) -{ -#ifdef DAAL_SYCL_INTERFACE_REVERSED_RANGE - return ::sycl::range<3>(r.upper3(), r.upper2(), r.upper1()); -#else - return ::sycl::range<3>(r.upper1(), r.upper2(), r.upper3()); -#endif -} - -template -inline ::sycl::nd_range convertToSyclRange(const KernelNDRange &); - -template <> -inline ::sycl::nd_range<1> convertToSyclRange<1>(const KernelNDRange & r) -{ - return ::sycl::nd_range<1>(::sycl::range<1>(r.global().upper1()), ::sycl::range<1>(r.local().upper1())); -} - -template <> -inline ::sycl::nd_range<2> convertToSyclRange<2>(const KernelNDRange & r) -{ - return ::sycl::nd_range<2>( -#ifdef DAAL_SYCL_INTERFACE_REVERSED_RANGE - ::sycl::range<2>(r.global().upper2(), r.global().upper1()), ::sycl::range<2>(r.local().upper2(), r.local().upper1()) -#else - 
::sycl::range<2>(r.global().upper1(), r.global().upper2()), ::sycl::range<2>(r.local().upper1(), r.local().upper2()) -#endif - ); -} - -template <> -inline ::sycl::nd_range<3> convertToSyclRange<3>(const KernelNDRange & r) -{ - return ::sycl::nd_range<3>( -#ifdef DAAL_SYCL_INTERFACE_REVERSED_RANGE - ::sycl::range<3>(r.global().upper3(), r.global().upper2(), r.global().upper1()), - ::sycl::range<3>(r.local().upper3(), r.local().upper2(), r.local().upper1()) -#else - ::sycl::range<3>(r.global().upper1(), r.global().upper2(), r.global().upper3()), - ::sycl::range<3>(r.local().upper1(), r.local().upper2(), r.local().upper3()) -#endif - ); -} - -class SyclKernelScheduler : public Base, public KernelSchedulerIface -{ -public: - explicit SyclKernelScheduler(::sycl::queue & deviceQueue) : _queue(deviceQueue) {} - - void schedule(const OpenClKernel & kernel, const KernelRange & range, const KernelArguments & args, Status & status) DAAL_C11_OVERRIDE - { - scheduleImpl(range, kernel, args, status); - } - - void schedule(const OpenClKernel & kernel, const KernelNDRange & range, const KernelArguments & args, Status & status) DAAL_C11_OVERRIDE - { - scheduleImpl(range, kernel, args, status); - } - -private: - template - void scheduleImpl(const Range & range, const OpenClKernel & kernel, const KernelArguments & args, Status & status) - { - switch (kernel.getTarget()) - { - case ExecutionTargetIds::device: return scheduleOnDevice(range, kernel, args, status); - - case ExecutionTargetIds::host: status |= ErrorMethodNotImplemented; return; - - default: DAAL_ASSERT(!"Unexpected execution target"); - } - } - - template - void scheduleOnDevice(const Range & range, const OpenClKernel & kernel, const KernelArguments & args, Status & status) - { - switch (range.dimensions()) - { - case 1: return scheduleSycl(convertToSyclRange<1>(range), kernel, args, status); - case 2: return scheduleSycl(convertToSyclRange<2>(range), kernel, args, status); - case 3: return 
scheduleSycl(convertToSyclRange<3>(range), kernel, args, status); - default: DAAL_ASSERT(!"Unexpected number of dimensions"); - } - } - - template - void scheduleSycl(const Range & range, const OpenClKernel & kernel, const KernelArguments & args, Status & status) - { - UsmPointerStorage bufferStorage; - - status |= catchSyclExceptions([&]() mutable { - ::sycl::kernel syclKernel = kernel.toSycl(_queue.get_context()); - - auto event = _queue.submit([&](::sycl::handler & cgh) { - passArguments(_queue, cgh, bufferStorage, args, status); - DAAL_CHECK_STATUS_RETURN_VOID_IF_FAIL(status); - cgh.parallel_for(range, syclKernel); - }); - event.wait_and_throw(); - }); - } - - void passArguments(::sycl::queue & queue, ::sycl::handler & cgh, UsmPointerStorage & storage, const KernelArguments & args, Status & status) const - { - for (size_t i = 0; i < args.size(); i++) - { - const auto & arg = args.get(i); - SyclKernelSchedulerArgHandler argHandler(queue, cgh, storage, i, arg); - TypeDispatcher::dispatch(arg.dataType(), argHandler, status); - DAAL_CHECK_STATUS_RETURN_VOID_IF_FAIL(status); - } - } - -private: - ::sycl::queue & _queue; -}; - -} // namespace interface1 -} // namespace sycl -} // namespace internal -} // namespace services -} // namespace daal -/// \endcond - -#endif diff --git a/cpp/daal/include/services/internal/sycl/level_zero_common.h b/cpp/daal/include/services/internal/sycl/level_zero_common.h deleted file mode 100644 index 048bc6c3ace..00000000000 --- a/cpp/daal/include/services/internal/sycl/level_zero_common.h +++ /dev/null @@ -1,38 +0,0 @@ -/* file: level_zero_common.h */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#ifndef __DAAL_SERVICES_INTERNAL_SYCL_LEVEL_ZERO_COMMON_H__ -#define __DAAL_SERVICES_INTERNAL_SYCL_LEVEL_ZERO_COMMON_H__ - -#ifdef DAAL_DISABLE_LEVEL_ZERO - #error "DAAL_DISABLE_LEVEL_ZERO must be undefined to include this file" -#endif - -#ifndef _ZE_API_H - #include "services/internal/sycl/level_zero_types.h" -#endif - -typedef ze_result_t (*zeModuleCreateFT)(ze_context_handle_t, ze_device_handle_t, const ze_module_desc_t *, ze_module_handle_t *, - ze_module_build_log_handle_t *); - -typedef ze_result_t (*zeModuleDestroyFT)(ze_module_handle_t hModule); - -typedef ze_result_t (*zeKernelCreateFT)(ze_module_handle_t hModule, const ze_kernel_desc_t * desc, ze_kernel_handle_t * phKernel); - -typedef ze_result_t (*zeKernelDestroyFT)(ze_kernel_handle_t hKernel); - -#endif diff --git a/cpp/daal/include/services/internal/sycl/level_zero_module_sycl.h b/cpp/daal/include/services/internal/sycl/level_zero_module_sycl.h deleted file mode 100644 index 93374655224..00000000000 --- a/cpp/daal/include/services/internal/sycl/level_zero_module_sycl.h +++ /dev/null @@ -1,185 +0,0 @@ -/* file: level_zero_module_sycl.h */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#ifndef __DAAL_SERVICES_INTERNAL_SYCL_DAAL_ZE_MODULE_SYCL_H__ -#define __DAAL_SERVICES_INTERNAL_SYCL_DAAL_ZE_MODULE_SYCL_H__ - -#ifndef DAAL_SYCL_INTERFACE - #error "DAAL_SYCL_INTERFACE must be defined to include this file" -#endif - -#ifdef DAAL_DISABLE_LEVEL_ZERO - #error "DAAL_DISABLE_LEVEL_ZERO must be undefined to include this file" -#endif - -#include - -#include "services/daal_shared_ptr.h" -#include "services/internal/dynamic_lib_helper.h" -#include "services/internal/sycl/error_handling_sycl.h" -#include "services/internal/sycl/level_zero_common.h" - -#if (defined(__SYCL_COMPILER_VERSION) && (__SYCL_COMPILER_VERSION >= 20211025)) - #include -#elif (defined(__SYCL_COMPILER_VERSION) && (__SYCL_COMPILER_VERSION >= 20200701)) - #include -#else - #include -#endif - -/// \cond INTERNAL -namespace daal -{ -namespace services -{ -namespace internal -{ -namespace sycl -{ -namespace interface1 -{ -#define DAAL_LEVEL_ZERO_VERSION_SUFF ".1" -#define DAAL_LEVEL_ZERO_LIB_VERSIONED_NAME(n) #n DAAL_LEVEL_ZERO_VERSION_SUFF - -#ifdef __linux__ -static const char * zeLoaderName = DAAL_LEVEL_ZERO_LIB_VERSIONED_NAME(libze_loader.so); -static const int libLoadFlags = RTLD_NOLOAD | RTLD_NOW | RTLD_LOCAL; -#elif defined(_WIN64) -static const char * zeLoaderName = "ze_loader.dll"; -static const int libLoadFlags = LOAD_LIBRARY_SEARCH_SYSTEM32; -#else - #error "Level Zero support is unavailable for this platform" -#endif - -static const char * zeModuleCreateFuncName = "zeModuleCreate"; 
-static const char * zeModuleDestroyFuncName = "zeModuleDestroy"; -static const char * zeKernelCreateFuncName = "zeKernelCreate"; -static const char * zeKernelDestroyFuncName = "zeKernelDestroy"; -class ZeModule; - -class ZeKernel : public Base -{ - friend ZeModule; - -public: - ZeKernel(const ZeModule &) = delete; - ZeKernel & operator=(const ZeModule &) = delete; - - ze_kernel_handle_t get() const { return _kernelLevelZero; } - -private: - explicit ZeKernel(ze_module_handle_t moduleLevelZero, const char * kernelName, Status & status) - { - static DynamicLibHelper zeLib(zeLoaderName, libLoadFlags, status); - DAAL_CHECK_STATUS_RETURN_VOID_IF_FAIL(status); - - static zeKernelCreateFT stZeKernelCreateF = zeLib.getSymbol(zeKernelCreateFuncName, status); - DAAL_CHECK_STATUS_RETURN_VOID_IF_FAIL(status); - - _zeKernelCreateF = stZeKernelCreateF; - - ze_kernel_desc_t desc; - desc.stype = ZE_STRUCTURE_TYPE_KERNEL_DESC; - desc.pNext = nullptr; - desc.flags = ze_kernel_flags_t(0); - desc.pKernelName = kernelName; - - DAAL_CHECK_LEVEL_ZERO(_zeKernelCreateF(moduleLevelZero, &desc, &_kernelLevelZero), status); - } - - zeKernelCreateFT _zeKernelCreateF; - - ze_kernel_handle_t _kernelLevelZero; -}; - -typedef SharedPtr ZeKernelPtr; - -class ZeModule : public Base -{ -public: - static SharedPtr create(::sycl::queue & deviceQueue, size_t binarySize, const uint8_t * pBinary, Status & status) - { - auto ptr = new ZeModule(deviceQueue, binarySize, pBinary, status); - if (!status) - { - if (ptr) delete ptr; - ptr = nullptr; - } - else if (!ptr) - status |= ErrorMemoryAllocationFailed; - return SharedPtr(ptr); - } - - ZeModule(const ZeModule &) = delete; - ZeModule & operator=(const ZeModule &) = delete; - - ZeKernelPtr createKernel(const char * kernelName, Status & status) - { - auto ptr = new ZeKernel(_moduleLevelZero, kernelName, status); - if (!status) - { - if (ptr) delete ptr; - ptr = nullptr; - } - else if (!ptr) - status |= ErrorMemoryAllocationFailed; - return ZeKernelPtr(ptr); 
- } - - ze_module_handle_t get() const { return _moduleLevelZero; } - -private: - explicit ZeModule(::sycl::queue & deviceQueue, size_t binarySize, const uint8_t * pBinary, Status & status) - { - static DynamicLibHelper zeLib(zeLoaderName, libLoadFlags, status); - DAAL_CHECK_STATUS_RETURN_VOID_IF_FAIL(status); - - static zeModuleCreateFT stZeModuleCreateF = zeLib.getSymbol(zeModuleCreateFuncName, status); - DAAL_CHECK_STATUS_RETURN_VOID_IF_FAIL(status); - - _zeModuleCreateF = stZeModuleCreateF; - - ze_module_desc_t desc; - desc.stype = ZE_STRUCTURE_TYPE_MODULE_DESC; - desc.format = ZE_MODULE_FORMAT_NATIVE; - desc.inputSize = binarySize; - desc.pInputModule = pBinary; - desc.pBuildFlags = ""; - desc.pConstants = nullptr; - desc.pNext = nullptr; - - DAAL_CHECK_LEVEL_ZERO(_zeModuleCreateF(::sycl::get_native< ::sycl::backend::ext_oneapi_level_zero>(deviceQueue.get_context()), - ::sycl::get_native< ::sycl::backend::ext_oneapi_level_zero>(deviceQueue.get_device()), &desc, - &_moduleLevelZero, nullptr), - status); - } - - zeModuleCreateFT _zeModuleCreateF; - - ze_module_handle_t _moduleLevelZero; -}; - -typedef SharedPtr ZeModulePtr; - -} // namespace interface1 -} // namespace sycl -} // namespace internal -} // namespace services -} // namespace daal -/// \endcond - -#endif diff --git a/cpp/daal/include/services/internal/sycl/level_zero_types.h b/cpp/daal/include/services/internal/sycl/level_zero_types.h deleted file mode 100644 index 947e75c0cd8..00000000000 --- a/cpp/daal/include/services/internal/sycl/level_zero_types.h +++ /dev/null @@ -1,8191 +0,0 @@ -/* file: level_zero_types.h */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#ifndef _ZE_API_H -#define _ZE_API_H -#if defined(__cplusplus) - #pragma once -#endif - -// standard headers -#include -#include - -#if defined(__cplusplus) -extern "C" -{ -#endif - -// Intel 'oneAPI' Level-Zero API common types -#if !defined(__GNUC__) - #pragma region common -#endif -/////////////////////////////////////////////////////////////////////////////// -#ifndef ZE_MAKE_VERSION - /// @brief Generates generic 'oneAPI' API versions - #define ZE_MAKE_VERSION(_major, _minor) ((_major << 16) | (_minor & 0x0000ffff)) -#endif // ZE_MAKE_VERSION - -/////////////////////////////////////////////////////////////////////////////// -#ifndef ZE_MAJOR_VERSION - /// @brief Extracts 'oneAPI' API major version - #define ZE_MAJOR_VERSION(_ver) (_ver >> 16) -#endif // ZE_MAJOR_VERSION - -/////////////////////////////////////////////////////////////////////////////// -#ifndef ZE_MINOR_VERSION - /// @brief Extracts 'oneAPI' API minor version - #define ZE_MINOR_VERSION(_ver) (_ver & 0x0000ffff) -#endif // ZE_MINOR_VERSION - -/////////////////////////////////////////////////////////////////////////////// -#ifndef ZE_APICALL - #if defined(_WIN32) - /// @brief Calling convention for all API functions - #define ZE_APICALL __cdecl - #else - #define ZE_APICALL - #endif // defined(_WIN32) -#endif // ZE_APICALL - -/////////////////////////////////////////////////////////////////////////////// -#ifndef ZE_APIEXPORT - #if defined(_WIN32) - /// @brief Microsoft-specific dllexport storage-class 
attribute - #define ZE_APIEXPORT __declspec(dllexport) - #else - #define ZE_APIEXPORT - #endif // defined(_WIN32) -#endif // ZE_APIEXPORT - -/////////////////////////////////////////////////////////////////////////////// -#ifndef ZE_DLLEXPORT - #if defined(_WIN32) - /// @brief Microsoft-specific dllexport storage-class attribute - #define ZE_DLLEXPORT __declspec(dllexport) - #endif // defined(_WIN32) -#endif // ZE_DLLEXPORT - -/////////////////////////////////////////////////////////////////////////////// -#ifndef ZE_DLLEXPORT - #if __GNUC__ >= 4 - /// @brief GCC-specific dllexport storage-class attribute - #define ZE_DLLEXPORT __attribute__((visibility("default"))) - #else - #define ZE_DLLEXPORT - #endif // __GNUC__ >= 4 -#endif // ZE_DLLEXPORT - - /////////////////////////////////////////////////////////////////////////////// - /// @brief compiler-independent type - typedef uint8_t ze_bool_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Handle of a driver instance - typedef struct _ze_driver_handle_t * ze_driver_handle_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Handle of driver's device object - typedef struct _ze_device_handle_t * ze_device_handle_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Handle of driver's context object - typedef struct _ze_context_handle_t * ze_context_handle_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Handle of driver's command queue object - typedef struct _ze_command_queue_handle_t * ze_command_queue_handle_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Handle of driver's command list object - typedef struct _ze_command_list_handle_t * ze_command_list_handle_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Handle of 
driver's fence object - typedef struct _ze_fence_handle_t * ze_fence_handle_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Handle of driver's event pool object - typedef struct _ze_event_pool_handle_t * ze_event_pool_handle_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Handle of driver's event object - typedef struct _ze_event_handle_t * ze_event_handle_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Handle of driver's image object - typedef struct _ze_image_handle_t * ze_image_handle_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Handle of driver's module object - typedef struct _ze_module_handle_t * ze_module_handle_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Handle of module's build log object - typedef struct _ze_module_build_log_handle_t * ze_module_build_log_handle_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Handle of driver's kernel object - typedef struct _ze_kernel_handle_t * ze_kernel_handle_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Handle of driver's sampler object - typedef struct _ze_sampler_handle_t * ze_sampler_handle_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Handle of physical memory object - typedef struct _ze_physical_mem_handle_t * ze_physical_mem_handle_t; - -/////////////////////////////////////////////////////////////////////////////// -#ifndef ZE_MAX_IPC_HANDLE_SIZE - /// @brief Maximum IPC handle size - #define ZE_MAX_IPC_HANDLE_SIZE 64 -#endif // ZE_MAX_IPC_HANDLE_SIZE - - /////////////////////////////////////////////////////////////////////////////// - /// @brief IPC handle to a memory allocation - typedef 
struct _ze_ipc_mem_handle_t - { - char data[ZE_MAX_IPC_HANDLE_SIZE]; ///< [out] Opaque data representing an IPC handle - } ze_ipc_mem_handle_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief IPC handle to a event pool allocation - typedef struct _ze_ipc_event_pool_handle_t - { - char data[ZE_MAX_IPC_HANDLE_SIZE]; ///< [out] Opaque data representing an IPC handle - } ze_ipc_event_pool_handle_t; - -/////////////////////////////////////////////////////////////////////////////// -#ifndef ZE_BIT - /// @brief Generic macro for enumerator bit masks - #define ZE_BIT(_i) (1 << _i) -#endif // ZE_BIT - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Defines Return/Error codes - typedef enum _ze_result_t - { - ZE_RESULT_SUCCESS = 0, ///< [Core] success - ZE_RESULT_NOT_READY = 1, ///< [Core] synchronization primitive not signaled - ZE_RESULT_ERROR_DEVICE_LOST = 0x70000001, ///< [Core] device hung, reset, was removed, or driver update occurred - ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY = 0x70000002, ///< [Core] insufficient host memory to satisfy call - ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY = 0x70000003, ///< [Core] insufficient device memory to satisfy call - ZE_RESULT_ERROR_MODULE_BUILD_FAILURE = 0x70000004, ///< [Core] error occurred when building module, see build log for details - ZE_RESULT_ERROR_MODULE_LINK_FAILURE = 0x70000005, ///< [Core] error occurred when linking modules, see build log for details - ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS = 0x70010000, ///< [Sysman] access denied due to permission level - ZE_RESULT_ERROR_NOT_AVAILABLE = 0x70010001, ///< [Sysman] resource already in use and simultaneous access not allowed - ///< or resource was removed - ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE = 0x70020000, ///< [Tools] external required dependency is unavailable or missing - ZE_RESULT_ERROR_UNINITIALIZED = 0x78000001, ///< [Validation] driver is not initialized - 
ZE_RESULT_ERROR_UNSUPPORTED_VERSION = 0x78000002, ///< [Validation] generic error code for unsupported versions - ZE_RESULT_ERROR_UNSUPPORTED_FEATURE = 0x78000003, ///< [Validation] generic error code for unsupported features - ZE_RESULT_ERROR_INVALID_ARGUMENT = 0x78000004, ///< [Validation] generic error code for invalid arguments - ZE_RESULT_ERROR_INVALID_NULL_HANDLE = 0x78000005, ///< [Validation] handle argument is not valid - ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE = 0x78000006, ///< [Validation] object pointed to by handle still in-use by device - ZE_RESULT_ERROR_INVALID_NULL_POINTER = 0x78000007, ///< [Validation] pointer argument may not be nullptr - ZE_RESULT_ERROR_INVALID_SIZE = 0x78000008, ///< [Validation] size argument is invalid (e.g., must not be zero) - ZE_RESULT_ERROR_UNSUPPORTED_SIZE = 0x78000009, ///< [Validation] size argument is not supported by the device (e.g., too - ///< large) - ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT = 0x7800000a, ///< [Validation] alignment argument is not supported by the device (e.g., - ///< too small) - ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT = 0x7800000b, ///< [Validation] synchronization object in invalid state - ZE_RESULT_ERROR_INVALID_ENUMERATION = 0x7800000c, ///< [Validation] enumerator argument is not valid - ZE_RESULT_ERROR_UNSUPPORTED_ENUMERATION = 0x7800000d, ///< [Validation] enumerator argument is not supported by the device - ZE_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT = 0x7800000e, ///< [Validation] image format is not supported by the device - ZE_RESULT_ERROR_INVALID_NATIVE_BINARY = 0x7800000f, ///< [Validation] native binary is not supported by the device - ZE_RESULT_ERROR_INVALID_GLOBAL_NAME = 0x78000010, ///< [Validation] global variable is not found in the module - ZE_RESULT_ERROR_INVALID_KERNEL_NAME = 0x78000011, ///< [Validation] kernel name is not found in the module - ZE_RESULT_ERROR_INVALID_FUNCTION_NAME = 0x78000012, ///< [Validation] function name is not found in the module - 
ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION = 0x78000013, ///< [Validation] group size dimension is not valid for the kernel or - ///< device - ZE_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION = 0x78000014, ///< [Validation] global width dimension is not valid for the kernel or - ///< device - ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX = 0x78000015, ///< [Validation] kernel argument index is not valid for kernel - ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE = 0x78000016, ///< [Validation] kernel argument size does not match kernel - ZE_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE = 0x78000017, ///< [Validation] value of kernel attribute is not valid for the kernel or - ///< device - ZE_RESULT_ERROR_INVALID_MODULE_UNLINKED = 0x78000018, ///< [Validation] module with imports needs to be linked before kernels can - ///< be created from it. - ZE_RESULT_ERROR_INVALID_COMMAND_LIST_TYPE = 0x78000019, ///< [Validation] command list type does not match command queue type - ZE_RESULT_ERROR_OVERLAPPING_REGIONS = 0x7800001a, ///< [Validation] copy operations do not support overlapping regions of - ///< memory - ZE_RESULT_ERROR_UNKNOWN = 0x7ffffffe, ///< [Core] unknown or internal error - ZE_RESULT_FORCE_UINT32 = 0x7fffffff - } ze_result_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Defines structure types - typedef enum _ze_structure_type_t - { - ZE_STRUCTURE_TYPE_DRIVER_PROPERTIES = 0x1, ///< ze_driver_properties_t - ZE_STRUCTURE_TYPE_DRIVER_IPC_PROPERTIES = 0x2, ///< ze_driver_ipc_properties_t - ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES = 0x3, ///< ze_device_properties_t - ZE_STRUCTURE_TYPE_DEVICE_COMPUTE_PROPERTIES = 0x4, ///< ze_device_compute_properties_t - ZE_STRUCTURE_TYPE_DEVICE_MODULE_PROPERTIES = 0x5, ///< ze_device_module_properties_t - ZE_STRUCTURE_TYPE_COMMAND_QUEUE_GROUP_PROPERTIES = 0x6, ///< ze_command_queue_group_properties_t - ZE_STRUCTURE_TYPE_DEVICE_MEMORY_PROPERTIES = 0x7, ///< ze_device_memory_properties_t 
- ZE_STRUCTURE_TYPE_DEVICE_MEMORY_ACCESS_PROPERTIES = 0x8, ///< ze_device_memory_access_properties_t - ZE_STRUCTURE_TYPE_DEVICE_CACHE_PROPERTIES = 0x9, ///< ze_device_cache_properties_t - ZE_STRUCTURE_TYPE_DEVICE_IMAGE_PROPERTIES = 0xa, ///< ze_device_image_properties_t - ZE_STRUCTURE_TYPE_DEVICE_P2P_PROPERTIES = 0xb, ///< ze_device_p2p_properties_t - ZE_STRUCTURE_TYPE_DEVICE_EXTERNAL_MEMORY_PROPERTIES = 0xc, ///< ze_device_external_memory_properties_t - ZE_STRUCTURE_TYPE_CONTEXT_DESC = 0xd, ///< ze_context_desc_t - ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC = 0xe, ///< ze_command_queue_desc_t - ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC = 0xf, ///< ze_command_list_desc_t - ZE_STRUCTURE_TYPE_EVENT_POOL_DESC = 0x10, ///< ze_event_pool_desc_t - ZE_STRUCTURE_TYPE_EVENT_DESC = 0x11, ///< ze_event_desc_t - ZE_STRUCTURE_TYPE_FENCE_DESC = 0x12, ///< ze_fence_desc_t - ZE_STRUCTURE_TYPE_IMAGE_DESC = 0x13, ///< ze_image_desc_t - ZE_STRUCTURE_TYPE_IMAGE_PROPERTIES = 0x14, ///< ze_image_properties_t - ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC = 0x15, ///< ze_device_mem_alloc_desc_t - ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC = 0x16, ///< ze_host_mem_alloc_desc_t - ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES = 0x17, ///< ze_memory_allocation_properties_t - ZE_STRUCTURE_TYPE_EXTERNAL_MEMORY_EXPORT_DESC = 0x18, ///< ze_external_memory_export_desc_t - ZE_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMPORT_FD = 0x19, ///< ze_external_memory_import_fd_t - ZE_STRUCTURE_TYPE_EXTERNAL_MEMORY_EXPORT_FD = 0x1a, ///< ze_external_memory_export_fd_t - ZE_STRUCTURE_TYPE_MODULE_DESC = 0x1b, ///< ze_module_desc_t - ZE_STRUCTURE_TYPE_MODULE_PROPERTIES = 0x1c, ///< ze_module_properties_t - ZE_STRUCTURE_TYPE_KERNEL_DESC = 0x1d, ///< ze_kernel_desc_t - ZE_STRUCTURE_TYPE_KERNEL_PROPERTIES = 0x1e, ///< ze_kernel_properties_t - ZE_STRUCTURE_TYPE_SAMPLER_DESC = 0x1f, ///< ze_sampler_desc_t - ZE_STRUCTURE_TYPE_PHYSICAL_MEM_DESC = 0x20, ///< ze_physical_mem_desc_t - ZE_STRUCTURE_TYPE_RAYTRACING_MEM_ALLOC_EXT_DESC = 0x00010001, ///< 
ze_raytracing_mem_alloc_ext_desc_t - ZE_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff - } ze_structure_type_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief External memory type flags - typedef uint32_t ze_external_memory_type_flags_t; - typedef enum _ze_external_memory_type_flag_t - { - ZE_EXTERNAL_MEMORY_TYPE_FLAG_OPAQUE_FD = ZE_BIT(0), ///< an opaque POSIX file descriptor handle - ZE_EXTERNAL_MEMORY_TYPE_FLAG_DMA_BUF = ZE_BIT(1), ///< a file descriptor handle for a Linux dma_buf - ZE_EXTERNAL_MEMORY_TYPE_FLAG_FORCE_UINT32 = 0x7fffffff - } ze_external_memory_type_flag_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Base for all properties types - typedef struct _ze_base_properties_t - { - ze_structure_type_t stype; ///< [in] type of this structure - void * pNext; ///< [in,out][optional] pointer to extension-specific structure - } ze_base_properties_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Base for all descriptor types - typedef struct _ze_base_desc_t - { - ze_structure_type_t stype; ///< [in] type of this structure - const void * pNext; ///< [in][optional] pointer to extension-specific structure - } ze_base_desc_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Forces driver to only report devices (and sub-devices) as specified by - /// values - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Forces driver to report devices from lowest to highest PCI bus ID - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Forces all shared allocations into device memory - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Forward-declare ze_ipc_mem_handle_t - typedef struct _ze_ipc_mem_handle_t ze_ipc_mem_handle_t; - - 
/////////////////////////////////////////////////////////////////////////////// - /// @brief Forward-declare ze_ipc_event_pool_handle_t - typedef struct _ze_ipc_event_pool_handle_t ze_ipc_event_pool_handle_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Forward-declare ze_base_properties_t - typedef struct _ze_base_properties_t ze_base_properties_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Forward-declare ze_base_desc_t - typedef struct _ze_base_desc_t ze_base_desc_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Forward-declare ze_driver_uuid_t - typedef struct _ze_driver_uuid_t ze_driver_uuid_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Forward-declare ze_driver_properties_t - typedef struct _ze_driver_properties_t ze_driver_properties_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Forward-declare ze_driver_ipc_properties_t - typedef struct _ze_driver_ipc_properties_t ze_driver_ipc_properties_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Forward-declare ze_driver_extension_properties_t - typedef struct _ze_driver_extension_properties_t ze_driver_extension_properties_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Forward-declare ze_device_uuid_t - typedef struct _ze_device_uuid_t ze_device_uuid_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Forward-declare ze_device_properties_t - typedef struct _ze_device_properties_t ze_device_properties_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Forward-declare ze_device_thread_t - typedef struct _ze_device_thread_t ze_device_thread_t; - - 
/////////////////////////////////////////////////////////////////////////////// - /// @brief Forward-declare ze_device_compute_properties_t - typedef struct _ze_device_compute_properties_t ze_device_compute_properties_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Forward-declare ze_native_kernel_uuid_t - typedef struct _ze_native_kernel_uuid_t ze_native_kernel_uuid_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Forward-declare ze_device_module_properties_t - typedef struct _ze_device_module_properties_t ze_device_module_properties_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Forward-declare ze_command_queue_group_properties_t - typedef struct _ze_command_queue_group_properties_t ze_command_queue_group_properties_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Forward-declare ze_device_memory_properties_t - typedef struct _ze_device_memory_properties_t ze_device_memory_properties_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Forward-declare ze_device_memory_access_properties_t - typedef struct _ze_device_memory_access_properties_t ze_device_memory_access_properties_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Forward-declare ze_device_cache_properties_t - typedef struct _ze_device_cache_properties_t ze_device_cache_properties_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Forward-declare ze_device_image_properties_t - typedef struct _ze_device_image_properties_t ze_device_image_properties_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Forward-declare ze_device_external_memory_properties_t - typedef struct _ze_device_external_memory_properties_t 
ze_device_external_memory_properties_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Forward-declare ze_device_p2p_properties_t - typedef struct _ze_device_p2p_properties_t ze_device_p2p_properties_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Forward-declare ze_context_desc_t - typedef struct _ze_context_desc_t ze_context_desc_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Forward-declare ze_command_queue_desc_t - typedef struct _ze_command_queue_desc_t ze_command_queue_desc_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Forward-declare ze_command_list_desc_t - typedef struct _ze_command_list_desc_t ze_command_list_desc_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Forward-declare ze_copy_region_t - typedef struct _ze_copy_region_t ze_copy_region_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Forward-declare ze_image_region_t - typedef struct _ze_image_region_t ze_image_region_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Forward-declare ze_event_pool_desc_t - typedef struct _ze_event_pool_desc_t ze_event_pool_desc_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Forward-declare ze_event_desc_t - typedef struct _ze_event_desc_t ze_event_desc_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Forward-declare ze_kernel_timestamp_data_t - typedef struct _ze_kernel_timestamp_data_t ze_kernel_timestamp_data_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Forward-declare ze_kernel_timestamp_result_t - typedef struct _ze_kernel_timestamp_result_t 
ze_kernel_timestamp_result_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Forward-declare ze_fence_desc_t - typedef struct _ze_fence_desc_t ze_fence_desc_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Forward-declare ze_image_format_t - typedef struct _ze_image_format_t ze_image_format_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Forward-declare ze_image_desc_t - typedef struct _ze_image_desc_t ze_image_desc_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Forward-declare ze_image_properties_t - typedef struct _ze_image_properties_t ze_image_properties_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Forward-declare ze_device_mem_alloc_desc_t - typedef struct _ze_device_mem_alloc_desc_t ze_device_mem_alloc_desc_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Forward-declare ze_host_mem_alloc_desc_t - typedef struct _ze_host_mem_alloc_desc_t ze_host_mem_alloc_desc_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Forward-declare ze_memory_allocation_properties_t - typedef struct _ze_memory_allocation_properties_t ze_memory_allocation_properties_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Forward-declare ze_external_memory_export_desc_t - typedef struct _ze_external_memory_export_desc_t ze_external_memory_export_desc_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Forward-declare ze_external_memory_import_fd_t - typedef struct _ze_external_memory_import_fd_t ze_external_memory_import_fd_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Forward-declare 
ze_external_memory_export_fd_t - typedef struct _ze_external_memory_export_fd_t ze_external_memory_export_fd_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Forward-declare ze_module_constants_t - typedef struct _ze_module_constants_t ze_module_constants_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Forward-declare ze_module_desc_t - typedef struct _ze_module_desc_t ze_module_desc_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Forward-declare ze_module_properties_t - typedef struct _ze_module_properties_t ze_module_properties_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Forward-declare ze_kernel_desc_t - typedef struct _ze_kernel_desc_t ze_kernel_desc_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Forward-declare ze_kernel_uuid_t - typedef struct _ze_kernel_uuid_t ze_kernel_uuid_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Forward-declare ze_kernel_properties_t - typedef struct _ze_kernel_properties_t ze_kernel_properties_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Forward-declare ze_group_count_t - typedef struct _ze_group_count_t ze_group_count_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Forward-declare ze_raytracing_mem_alloc_ext_desc_t - typedef struct _ze_raytracing_mem_alloc_ext_desc_t ze_raytracing_mem_alloc_ext_desc_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Forward-declare ze_sampler_desc_t - typedef struct _ze_sampler_desc_t ze_sampler_desc_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Forward-declare 
ze_physical_mem_desc_t - typedef struct _ze_physical_mem_desc_t ze_physical_mem_desc_t; - -#if !defined(__GNUC__) - #pragma endregion -#endif -// Intel 'oneAPI' Level-Zero APIs -#if !defined(__GNUC__) - #pragma region driver -#endif - /////////////////////////////////////////////////////////////////////////////// - /// @brief Supported initialization flags - typedef uint32_t ze_init_flags_t; - typedef enum _ze_init_flag_t - { - ZE_INIT_FLAG_GPU_ONLY = ZE_BIT(0), ///< only initialize GPU drivers - ZE_INIT_FLAG_FORCE_UINT32 = 0x7fffffff - } ze_init_flag_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Initialize the 'oneAPI' driver(s) - /// - /// @details - /// - This function must be called before any other API function. - /// - If this function is not called then all other functions will return - /// ZE_RESULT_ERROR_UNINITIALIZED. - /// - Only one instance of each driver will be initialized per process. - /// - This function is thread-safe for scenarios where multiple libraries - /// may initialize the driver(s) simultaneously. - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_ENUMERATION - /// + `0x1 < flags` - /// - ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY - ZE_APIEXPORT ze_result_t ZE_APICALL zeInit(ze_init_flags_t flags ///< [in] initialization flags. - ///< must be 0 (default) or a combination of ze_init_flag_t. - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Retrieves driver instances - /// - /// @details - /// - A driver represents a collection of physical devices. - /// - Multiple calls to this function will return identical driver handles, - /// in the same order. - /// - The application may pass nullptr for pDrivers when only querying the - /// number of drivers. - /// - The application may call this function from simultaneous threads. 
- /// - The implementation of this function should be lock-free. - /// - /// @remarks - /// _Analogues_ - /// - clGetPlatformIDs - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == pCount` - ZE_APIEXPORT ze_result_t ZE_APICALL zeDriverGet( - uint32_t * pCount, ///< [in,out] pointer to the number of driver instances. - ///< if count is zero, then the loader will update the value with the total - ///< number of drivers available. - ///< if count is non-zero, then the loader will only retrieve that number - ///< of drivers. - ///< if count is larger than the number of drivers available, then the - ///< loader will update the value with the correct number of drivers available. - ze_driver_handle_t * phDrivers ///< [in,out][optional][range(0, *pCount)] array of driver instance handles - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Supported API versions - /// - /// @details - /// - API versions contain major and minor attributes, use - /// ZE_MAJOR_VERSION and ZE_MINOR_VERSION - typedef enum _ze_api_version_t - { - ZE_API_VERSION_1_0 = ZE_MAKE_VERSION(1, 0), ///< version 1.0 - ZE_API_VERSION_CURRENT = ZE_MAKE_VERSION(1, 0), ///< latest known version - ZE_API_VERSION_FORCE_UINT32 = 0x7fffffff - } ze_api_version_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Returns the API version supported by the specified driver - /// - /// @details - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function should be lock-free. 
- /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hDriver` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == version` - ZE_APIEXPORT ze_result_t ZE_APICALL zeDriverGetApiVersion(ze_driver_handle_t hDriver, ///< [in] handle of the driver instance - ze_api_version_t * version ///< [out] api version - ); - -/////////////////////////////////////////////////////////////////////////////// -#ifndef ZE_MAX_DRIVER_UUID_SIZE - /// @brief Maximum driver universal unique id (UUID) size in bytes - #define ZE_MAX_DRIVER_UUID_SIZE 16 -#endif // ZE_MAX_DRIVER_UUID_SIZE - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Driver universal unique id (UUID) - typedef struct _ze_driver_uuid_t - { - uint8_t id[ZE_MAX_DRIVER_UUID_SIZE]; ///< [out] opaque data representing a driver UUID - } ze_driver_uuid_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Driver properties queried using zeDriverGetProperties - typedef struct _ze_driver_properties_t - { - ze_structure_type_t stype; ///< [in] type of this structure - void * pNext; ///< [in,out][optional] pointer to extension-specific structure - ze_driver_uuid_t uuid; ///< [out] universal unique identifier. - uint32_t driverVersion; ///< [out] driver version - ///< The driver version is a non-zero, monotonically increasing value where - ///< higher values always indicate a more recent version. - } ze_driver_properties_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Retrieves properties of the driver. - /// - /// @details - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function should be lock-free. 
- /// - /// @remarks - /// _Analogues_ - /// - **clGetPlatformInfo** - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hDriver` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == pDriverProperties` - ZE_APIEXPORT ze_result_t ZE_APICALL zeDriverGetProperties( - ze_driver_handle_t hDriver, ///< [in] handle of the driver instance - ze_driver_properties_t * pDriverProperties ///< [in,out] query result for driver properties - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Supported IPC property flags - typedef uint32_t ze_ipc_property_flags_t; - typedef enum _ze_ipc_property_flag_t - { - ZE_IPC_PROPERTY_FLAG_MEMORY = ZE_BIT(0), ///< Supports passing memory allocations between processes. See - ///< zeMemGetIpcHandle. - ZE_IPC_PROPERTY_FLAG_EVENT_POOL = ZE_BIT(1), ///< Supports passing event pools between processes. See - ///< zeEventPoolGetIpcHandle. - ZE_IPC_PROPERTY_FLAG_FORCE_UINT32 = 0x7fffffff - } ze_ipc_property_flag_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief IPC properties queried using zeDriverGetIpcProperties - typedef struct _ze_driver_ipc_properties_t - { - ze_structure_type_t stype; ///< [in] type of this structure - void * pNext; ///< [in,out][optional] pointer to extension-specific structure - ze_ipc_property_flags_t flags; ///< [out] 0 (none) or a valid combination of ze_ipc_property_flag_t - } ze_driver_ipc_properties_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Retrieves IPC attributes of the driver - /// - /// @details - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function should be lock-free. 
- /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hDriver` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == pIpcProperties` - ZE_APIEXPORT ze_result_t ZE_APICALL zeDriverGetIpcProperties( - ze_driver_handle_t hDriver, ///< [in] handle of the driver instance - ze_driver_ipc_properties_t * pIpcProperties ///< [out] query result for IPC properties - ); - -/////////////////////////////////////////////////////////////////////////////// -#ifndef ZE_MAX_EXTENSION_NAME - /// @brief Maximum extension name string size - #define ZE_MAX_EXTENSION_NAME 256 -#endif // ZE_MAX_EXTENSION_NAME - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Extension properties queried using zeDriverGetExtensionProperties - typedef struct _ze_driver_extension_properties_t - { - char name[ZE_MAX_EXTENSION_NAME]; ///< [out] extension name - uint32_t version; ///< [out] extension version using ZE_MAKE_VERSION - } ze_driver_extension_properties_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Retrieves extension properties - /// - /// @details - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function should be lock-free. - /// - /// @remarks - /// _Analogues_ - /// - **vkEnumerateInstanceExtensionProperties** - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hDriver` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == pCount` - ZE_APIEXPORT ze_result_t ZE_APICALL zeDriverGetExtensionProperties( - ze_driver_handle_t hDriver, ///< [in] handle of the driver instance - uint32_t * pCount, ///< [in,out] pointer to the number of extension properties. 
- ///< if count is zero, then the driver will update the value with the total - ///< number of extension properties available. - ///< if count is non-zero, then driver will only retrieve that number of - ///< extension properties. - ///< if count is larger than the number of extension properties available, - ///< then the driver will update the value with the correct number of - ///< extension properties available. - ze_driver_extension_properties_t * pExtensionProperties ///< [in,out][optional][range(0, *pCount)] array of query results for - ///< extension properties - ); - -#if !defined(__GNUC__) - #pragma endregion -#endif -// Intel 'oneAPI' Level-Zero APIs for Device -#if !defined(__GNUC__) - #pragma region device -#endif - /////////////////////////////////////////////////////////////////////////////// - /// @brief Retrieves devices within a driver - /// - /// @details - /// - Multiple calls to this function will return identical device handles, - /// in the same order. - /// - The number and order of handles returned from this function is - /// affected by the ZE_AFFINITY_MASK and ZE_ENABLE_PCI_ID_DEVICE_ORDER - /// environment variables. - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function should be lock-free. - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hDriver` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == pCount` - ZE_APIEXPORT ze_result_t ZE_APICALL zeDeviceGet( - ze_driver_handle_t hDriver, ///< [in] handle of the driver instance - uint32_t * pCount, ///< [in,out] pointer to the number of devices. - ///< if count is zero, then the driver will update the value with the total - ///< number of devices available. - ///< if count is non-zero, then driver will only retrieve that number of devices. 
- ///< if count is larger than the number of devices available, then the - ///< driver will update the value with the correct number of devices available. - ze_device_handle_t * phDevices ///< [in,out][optional][range(0, *pCount)] array of handle of devices - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Retrieves a sub-device from a device - /// - /// @details - /// - Multiple calls to this function will return identical device handles, - /// in the same order. - /// - The number of handles returned from this function is affected by the - /// ZE_AFFINITY_MASK environment variable. - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function should be lock-free. - /// - /// @remarks - /// _Analogues_ - /// - clCreateSubDevices - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hDevice` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == pCount` - ZE_APIEXPORT ze_result_t ZE_APICALL zeDeviceGetSubDevices( - ze_device_handle_t hDevice, ///< [in] handle of the device object - uint32_t * pCount, ///< [in,out] pointer to the number of sub-devices. - ///< if count is zero, then the driver will update the value with the total - ///< number of sub-devices available. - ///< if count is non-zero, then driver will only retrieve that number of sub-devices. - ///< if count is larger than the number of sub-devices available, then the - ///< driver will update the value with the correct number of sub-devices available. 
- ze_device_handle_t * phSubdevices ///< [in,out][optional][range(0, *pCount)] array of handle of sub-devices - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Supported device types - typedef enum _ze_device_type_t - { - ZE_DEVICE_TYPE_GPU = 1, ///< Graphics Processing Unit - ZE_DEVICE_TYPE_CPU = 2, ///< Central Processing Unit - ZE_DEVICE_TYPE_FPGA = 3, ///< Field Programmable Gate Array - ZE_DEVICE_TYPE_MCA = 4, ///< Memory Copy Accelerator - ZE_DEVICE_TYPE_FORCE_UINT32 = 0x7fffffff - } ze_device_type_t; - -/////////////////////////////////////////////////////////////////////////////// -#ifndef ZE_MAX_DEVICE_UUID_SIZE - /// @brief Maximum device universal unique id (UUID) size in bytes - #define ZE_MAX_DEVICE_UUID_SIZE 16 -#endif // ZE_MAX_DEVICE_UUID_SIZE - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Device universal unique id (UUID) - typedef struct _ze_device_uuid_t - { - uint8_t id[ZE_MAX_DEVICE_UUID_SIZE]; ///< [out] opaque data representing a device UUID - } ze_device_uuid_t; - -/////////////////////////////////////////////////////////////////////////////// -#ifndef ZE_MAX_DEVICE_NAME - /// @brief Maximum device name string size - #define ZE_MAX_DEVICE_NAME 256 -#endif // ZE_MAX_DEVICE_NAME - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Supported device property flags - typedef uint32_t ze_device_property_flags_t; - typedef enum _ze_device_property_flag_t - { - ZE_DEVICE_PROPERTY_FLAG_INTEGRATED = ZE_BIT(0), ///< Device is integrated with the Host. - ZE_DEVICE_PROPERTY_FLAG_SUBDEVICE = ZE_BIT(1), ///< Device handle used for query represents a sub-device. - ZE_DEVICE_PROPERTY_FLAG_ECC = ZE_BIT(2), ///< Device supports error correction memory access. - ZE_DEVICE_PROPERTY_FLAG_ONDEMANDPAGING = ZE_BIT(3), ///< Device supports on-demand page-faulting. 
- ZE_DEVICE_PROPERTY_FLAG_FORCE_UINT32 = 0x7fffffff - } ze_device_property_flag_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Device properties queried using zeDeviceGetProperties - typedef struct _ze_device_properties_t - { - ze_structure_type_t stype; ///< [in] type of this structure - void * pNext; ///< [in,out][optional] pointer to extension-specific structure - ze_device_type_t type; ///< [out] generic device type - uint32_t vendorId; ///< [out] vendor id from PCI configuration - uint32_t deviceId; ///< [out] device id from PCI configuration - ze_device_property_flags_t flags; ///< [out] 0 (none) or a valid combination of ze_device_property_flag_t - uint32_t subdeviceId; ///< [out] sub-device id. Only valid if ZE_DEVICE_PROPERTY_FLAG_SUBDEVICE - ///< is set. - uint32_t coreClockRate; ///< [out] Clock rate for device core. - uint64_t maxMemAllocSize; ///< [out] Maximum memory allocation size. - uint32_t maxHardwareContexts; ///< [out] Maximum number of logical hardware contexts. - uint32_t maxCommandQueuePriority; ///< [out] Maximum priority for command queues. Higher value is higher - ///< priority. - uint32_t numThreadsPerEU; ///< [out] Number of threads per EU. - uint32_t physicalEUSimdWidth; ///< [out] The physical EU simd width. - uint32_t numEUsPerSubslice; ///< [out] Number of EUs per sub-slice. - uint32_t numSubslicesPerSlice; ///< [out] Number of sub-slices per slice. - uint32_t numSlices; ///< [out] Number of slices. - uint64_t timerResolution; ///< [out] Returns the resolution of device timer in nanoseconds used for - ///< profiling, timestamps, etc. - uint32_t timestampValidBits; ///< [out] Returns the number of valid bits in the timestamp value. - uint32_t kernelTimestampValidBits; ///< [out] Returns the number of valid bits in the kernel timestamp values - ze_device_uuid_t uuid; ///< [out] universal unique identifier. Note: Subdevices will have their - ///< own uuid. 
- char name[ZE_MAX_DEVICE_NAME]; ///< [out] Device name - } ze_device_properties_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Device thread identifier. - typedef struct _ze_device_thread_t - { - uint32_t slice; ///< [in,out] the slice number. - ///< Must be UINT32_MAX (all) or less than ze_device_properties_t.numSlices. - uint32_t subslice; ///< [in,out] the sub-slice number within its slice. - ///< Must be UINT32_MAX (all) or less than ze_device_properties_t.numSubslicesPerSlice. - uint32_t eu; ///< [in,out] the EU number within its sub-slice. - ///< Must be UINT32_MAX (all) or less than ze_device_properties_t.numEUsPerSubslice. - uint32_t thread; ///< [in,out] the thread number within its EU. - ///< Must be UINT32_MAX (all) or less than ze_device_properties_t.numThreadsPerEU. - } ze_device_thread_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Retrieves properties of the device. - /// - /// @details - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function should be lock-free. - /// - /// @remarks - /// _Analogues_ - /// - clGetDeviceInfo - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hDevice` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == pDeviceProperties` - ZE_APIEXPORT ze_result_t ZE_APICALL zeDeviceGetProperties( - ze_device_handle_t hDevice, ///< [in] handle of the device - ze_device_properties_t * pDeviceProperties ///< [in,out] query result for device properties - ); - -/////////////////////////////////////////////////////////////////////////////// -#ifndef ZE_SUBGROUPSIZE_COUNT - /// @brief Maximum number of subgroup sizes supported. 
- #define ZE_SUBGROUPSIZE_COUNT 8 -#endif // ZE_SUBGROUPSIZE_COUNT - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Device compute properties queried using zeDeviceGetComputeProperties - typedef struct _ze_device_compute_properties_t - { - ze_structure_type_t stype; ///< [in] type of this structure - void * pNext; ///< [in,out][optional] pointer to extension-specific structure - uint32_t maxTotalGroupSize; ///< [out] Maximum items per compute group. (groupSizeX * groupSizeY * - ///< groupSizeZ) <= maxTotalGroupSize - uint32_t maxGroupSizeX; ///< [out] Maximum items for X dimension in group - uint32_t maxGroupSizeY; ///< [out] Maximum items for Y dimension in group - uint32_t maxGroupSizeZ; ///< [out] Maximum items for Z dimension in group - uint32_t maxGroupCountX; ///< [out] Maximum groups that can be launched for x dimension - uint32_t maxGroupCountY; ///< [out] Maximum groups that can be launched for y dimension - uint32_t maxGroupCountZ; ///< [out] Maximum groups that can be launched for z dimension - uint32_t maxSharedLocalMemory; ///< [out] Maximum shared local memory per group. - uint32_t numSubGroupSizes; ///< [out] Number of subgroup sizes supported. This indicates number of - ///< entries in subGroupSizes. - uint32_t subGroupSizes[ZE_SUBGROUPSIZE_COUNT]; ///< [out] Size group sizes supported. - } ze_device_compute_properties_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Retrieves compute properties of the device. - /// - /// @details - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function should be lock-free. 
- /// - /// @remarks - /// _Analogues_ - /// - clGetDeviceInfo - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hDevice` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == pComputeProperties` - ZE_APIEXPORT ze_result_t ZE_APICALL zeDeviceGetComputeProperties( - ze_device_handle_t hDevice, ///< [in] handle of the device - ze_device_compute_properties_t * pComputeProperties ///< [in,out] query result for compute properties - ); - -/////////////////////////////////////////////////////////////////////////////// -#ifndef ZE_MAX_NATIVE_KERNEL_UUID_SIZE - /// @brief Maximum native kernel universal unique id (UUID) size in bytes - #define ZE_MAX_NATIVE_KERNEL_UUID_SIZE 16 -#endif // ZE_MAX_NATIVE_KERNEL_UUID_SIZE - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Native kernel universal unique id (UUID) - typedef struct _ze_native_kernel_uuid_t - { - uint8_t id[ZE_MAX_NATIVE_KERNEL_UUID_SIZE]; ///< [out] opaque data representing a native kernel UUID - } ze_native_kernel_uuid_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Supported device module flags - typedef uint32_t ze_device_module_flags_t; - typedef enum _ze_device_module_flag_t - { - ZE_DEVICE_MODULE_FLAG_FP16 = ZE_BIT(0), ///< Device supports 16-bit floating-point operations - ZE_DEVICE_MODULE_FLAG_FP64 = ZE_BIT(1), ///< Device supports 64-bit floating-point operations - ZE_DEVICE_MODULE_FLAG_INT64_ATOMICS = ZE_BIT(2), ///< Device supports 64-bit atomic operations - ZE_DEVICE_MODULE_FLAG_DP4A = ZE_BIT(3), ///< Device supports four component dot product and accumulate operations - ZE_DEVICE_MODULE_FLAG_FORCE_UINT32 = 0x7fffffff - } ze_device_module_flag_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Supported 
floating-Point capability flags - typedef uint32_t ze_device_fp_flags_t; - typedef enum _ze_device_fp_flag_t - { - ZE_DEVICE_FP_FLAG_DENORM = ZE_BIT(0), ///< Supports denorms - ZE_DEVICE_FP_FLAG_INF_NAN = ZE_BIT(1), ///< Supports INF and quiet NaNs - ZE_DEVICE_FP_FLAG_ROUND_TO_NEAREST = ZE_BIT(2), ///< Supports rounding to nearest even rounding mode - ZE_DEVICE_FP_FLAG_ROUND_TO_ZERO = ZE_BIT(3), ///< Supports rounding to zero. - ZE_DEVICE_FP_FLAG_ROUND_TO_INF = ZE_BIT(4), ///< Supports rounding to both positive and negative INF. - ZE_DEVICE_FP_FLAG_FMA = ZE_BIT(5), ///< Supports IEEE754-2008 fused multiply-add. - ZE_DEVICE_FP_FLAG_ROUNDED_DIVIDE_SQRT = ZE_BIT(6), ///< Supports rounding as defined by IEEE754 for divide and sqrt - ///< operations. - ZE_DEVICE_FP_FLAG_SOFT_FLOAT = ZE_BIT(7), ///< Uses software implementation for basic floating-point operations. - ZE_DEVICE_FP_FLAG_FORCE_UINT32 = 0x7fffffff - } ze_device_fp_flag_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Device module properties queried using zeDeviceGetModuleProperties - typedef struct _ze_device_module_properties_t - { - ze_structure_type_t stype; ///< [in] type of this structure - void * pNext; ///< [in,out][optional] pointer to extension-specific structure - uint32_t spirvVersionSupported; ///< [out] Maximum supported SPIR-V version. - ///< Returns zero if SPIR-V is not supported. - ///< Contains major and minor attributes, use ZE_MAJOR_VERSION and ZE_MINOR_VERSION. - ze_device_module_flags_t flags; ///< [out] 0 or a valid combination of ze_device_module_flag_t - ze_device_fp_flags_t fp16flags; ///< [out] Capabilities for half-precision floating-point operations. - ///< returns 0 (if ZE_DEVICE_MODULE_FLAG_FP16 is not set) or a - ///< combination of ze_device_fp_flag_t. - ze_device_fp_flags_t fp32flags; ///< [out] Capabilities for single-precision floating-point operations. - ///< returns a combination of ze_device_fp_flag_t. 
- ze_device_fp_flags_t fp64flags; ///< [out] Capabilities for double-precision floating-point operations. - ///< returns 0 (if ZE_DEVICE_MODULE_FLAG_FP64 is not set) or a - ///< combination of ze_device_fp_flag_t. - uint32_t maxArgumentsSize; ///< [out] Maximum kernel argument size that is supported. - uint32_t printfBufferSize; ///< [out] Maximum size of internal buffer that holds output of printf - ///< calls from kernel. - ze_native_kernel_uuid_t nativeKernelSupported; ///< [out] Compatibility UUID of supported native kernel. - ///< UUID may or may not be the same across driver release, devices, or - ///< operating systems. - ///< Application is responsible for ensuring UUID matches before creating - ///< module using - ///< previously created native kernel. - } ze_device_module_properties_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Retrieves module properties of the device - /// - /// @details - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function should be lock-free. - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hDevice` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == pModuleProperties` - ZE_APIEXPORT ze_result_t ZE_APICALL zeDeviceGetModuleProperties( - ze_device_handle_t hDevice, ///< [in] handle of the device - ze_device_module_properties_t * pModuleProperties ///< [in,out] query result for module properties - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Supported command queue group property flags - typedef uint32_t ze_command_queue_group_property_flags_t; - typedef enum _ze_command_queue_group_property_flag_t - { - ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE = ZE_BIT(0), ///< Command queue group supports enqueing compute commands. 
- ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY = ZE_BIT(1), ///< Command queue group supports enqueing copy commands. - ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COOPERATIVE_KERNELS = ZE_BIT(2), ///< Command queue group supports cooperative kernels. - ///< See zeCommandListAppendLaunchCooperativeKernel for more details. - ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_METRICS = ZE_BIT(3), ///< Command queue groups supports metric streamers and queries. - ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_FORCE_UINT32 = 0x7fffffff - } ze_command_queue_group_property_flag_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Command queue group properties queried using - /// zeDeviceGetCommandQueueGroupProperties - typedef struct _ze_command_queue_group_properties_t - { - ze_structure_type_t stype; ///< [in] type of this structure - void * pNext; ///< [in,out][optional] pointer to extension-specific structure - ze_command_queue_group_property_flags_t flags; ///< [out] 0 (none) or a valid combination of - ///< ze_command_queue_group_property_flag_t - size_t maxMemoryFillPatternSize; ///< [out] maximum `pattern_size` supported by command queue group. - ///< See zeCommandListAppendMemoryFill for more details. - uint32_t numQueues; ///< [out] the number of physical command queues within the group. - } ze_command_queue_group_properties_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Retrieves command queue group properties of the device. - /// - /// @details - /// - Properties are reported for each physical command queue type supported - /// by the device. - /// - Multiple calls to this function will return properties in the same - /// order. - /// - The order in which the properties are returned defines the command - /// queue group's ordinal. - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function should be lock-free. 
- /// - /// @remarks - /// _Analogues_ - /// - **vkGetPhysicalDeviceQueueFamilyProperties** - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hDevice` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == pCount` - ZE_APIEXPORT ze_result_t ZE_APICALL zeDeviceGetCommandQueueGroupProperties( - ze_device_handle_t hDevice, ///< [in] handle of the device - uint32_t * pCount, ///< [in,out] pointer to the number of command queue group properties. - ///< if count is zero, then the driver will update the value with the total - ///< number of command queue group properties available. - ///< if count is non-zero, then driver will only retrieve that number of - ///< command queue group properties. - ///< if count is larger than the number of command queue group properties - ///< available, then the driver will update the value with the correct - ///< number of command queue group properties available. 
- ze_command_queue_group_properties_t * pCommandQueueGroupProperties ///< [in,out][optional][range(0, *pCount)] array of query results for - ///< command queue group properties - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Supported device memory property flags - typedef uint32_t ze_device_memory_property_flags_t; - typedef enum _ze_device_memory_property_flag_t - { - ZE_DEVICE_MEMORY_PROPERTY_FLAG_TBD = ZE_BIT(0), ///< reserved for future use - ZE_DEVICE_MEMORY_PROPERTY_FLAG_FORCE_UINT32 = 0x7fffffff - } ze_device_memory_property_flag_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Device local memory properties queried using - /// zeDeviceGetMemoryProperties - typedef struct _ze_device_memory_properties_t - { - ze_structure_type_t stype; ///< [in] type of this structure - void * pNext; ///< [in,out][optional] pointer to extension-specific structure - ze_device_memory_property_flags_t flags; ///< [out] 0 (none) or a valid combination of - ///< ze_device_memory_property_flag_t - uint32_t maxClockRate; ///< [out] Maximum clock rate for device memory. - uint32_t maxBusWidth; ///< [out] Maximum bus width between device and memory. - uint64_t totalSize; ///< [out] Total memory size in bytes that is available to the device. - char name[ZE_MAX_DEVICE_NAME]; ///< [out] Memory name - } ze_device_memory_properties_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Retrieves local memory properties of the device. - /// - /// @details - /// - Properties are reported for each physical memory type supported by the - /// device. - /// - Multiple calls to this function will return properties in the same - /// order. - /// - The order in which the properties are returned defines the device's - /// local memory ordinal. - /// - The application may call this function from simultaneous threads. 
- /// - The implementation of this function should be lock-free. - /// - /// @remarks - /// _Analogues_ - /// - clGetDeviceInfo - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hDevice` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == pCount` - ZE_APIEXPORT ze_result_t ZE_APICALL zeDeviceGetMemoryProperties( - ze_device_handle_t hDevice, ///< [in] handle of the device - uint32_t * pCount, ///< [in,out] pointer to the number of memory properties. - ///< if count is zero, then the driver will update the value with the total - ///< number of memory properties available. - ///< if count is non-zero, then driver will only retrieve that number of - ///< memory properties. - ///< if count is larger than the number of memory properties available, - ///< then the driver will update the value with the correct number of - ///< memory properties available. 
- ze_device_memory_properties_t * pMemProperties ///< [in,out][optional][range(0, *pCount)] array of query results for - ///< memory properties - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Memory access capability flags - /// - /// @details - /// - Supported access capabilities for different types of memory - /// allocations - typedef uint32_t ze_memory_access_cap_flags_t; - typedef enum _ze_memory_access_cap_flag_t - { - ZE_MEMORY_ACCESS_CAP_FLAG_RW = ZE_BIT(0), ///< Supports load/store access - ZE_MEMORY_ACCESS_CAP_FLAG_ATOMIC = ZE_BIT(1), ///< Supports atomic access - ZE_MEMORY_ACCESS_CAP_FLAG_CONCURRENT = ZE_BIT(2), ///< Supports concurrent access - ZE_MEMORY_ACCESS_CAP_FLAG_CONCURRENT_ATOMIC = ZE_BIT(3), ///< Supports concurrent atomic access - ZE_MEMORY_ACCESS_CAP_FLAG_FORCE_UINT32 = 0x7fffffff - } ze_memory_access_cap_flag_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Device memory access properties queried using - /// zeDeviceGetMemoryAccessProperties - typedef struct _ze_device_memory_access_properties_t - { - ze_structure_type_t stype; ///< [in] type of this structure - void * pNext; ///< [in,out][optional] pointer to extension-specific structure - ze_memory_access_cap_flags_t hostAllocCapabilities; ///< [out] host memory capabilities. - ///< returns 0 (unsupported) or a combination of ze_memory_access_cap_flag_t. - ze_memory_access_cap_flags_t deviceAllocCapabilities; ///< [out] device memory capabilities. - ///< returns 0 (unsupported) or a combination of ze_memory_access_cap_flag_t. - ze_memory_access_cap_flags_t sharedSingleDeviceAllocCapabilities; ///< [out] shared, single-device memory capabilities. - ///< returns 0 (unsupported) or a combination of ze_memory_access_cap_flag_t. - ze_memory_access_cap_flags_t sharedCrossDeviceAllocCapabilities; ///< [out] shared, cross-device memory capabilities. 
- ///< returns 0 (unsupported) or a combination of ze_memory_access_cap_flag_t. - ze_memory_access_cap_flags_t sharedSystemAllocCapabilities; ///< [out] shared, system memory capabilities. - ///< returns 0 (unsupported) or a combination of ze_memory_access_cap_flag_t. - } ze_device_memory_access_properties_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Retrieves memory access properties of the device. - /// - /// @details - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function should be lock-free. - /// - /// @remarks - /// _Analogues_ - /// - clGetDeviceInfo - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hDevice` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == pMemAccessProperties` - ZE_APIEXPORT ze_result_t ZE_APICALL zeDeviceGetMemoryAccessProperties( - ze_device_handle_t hDevice, ///< [in] handle of the device - ze_device_memory_access_properties_t * pMemAccessProperties ///< [in,out] query result for memory access properties - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Supported cache control property flags - typedef uint32_t ze_device_cache_property_flags_t; - typedef enum _ze_device_cache_property_flag_t - { - ZE_DEVICE_CACHE_PROPERTY_FLAG_USER_CONTROL = ZE_BIT(0), ///< Device support User Cache Control (i.e. 
SLM section vs Generic Cache) - ZE_DEVICE_CACHE_PROPERTY_FLAG_FORCE_UINT32 = 0x7fffffff - } ze_device_cache_property_flag_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Device cache properties queried using zeDeviceGetCacheProperties - typedef struct _ze_device_cache_properties_t - { - ze_structure_type_t stype; ///< [in] type of this structure - void * pNext; ///< [in,out][optional] pointer to extension-specific structure - ze_device_cache_property_flags_t flags; ///< [out] 0 (none) or a valid combination of - ///< ze_device_cache_property_flag_t - size_t cacheSize; ///< [out] Per-cache size, in bytes - } ze_device_cache_properties_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Retrieves cache properties of the device - /// - /// @details - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function should be lock-free. - /// - /// @remarks - /// _Analogues_ - /// - clGetDeviceInfo - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hDevice` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == pCount` - ZE_APIEXPORT ze_result_t ZE_APICALL zeDeviceGetCacheProperties( - ze_device_handle_t hDevice, ///< [in] handle of the device - uint32_t * pCount, ///< [in,out] pointer to the number of cache properties. - ///< if count is zero, then the driver will update the value with the total - ///< number of cache properties available. - ///< if count is non-zero, then driver will only retrieve that number of - ///< cache properties. - ///< if count is larger than the number of cache properties available, then - ///< the driver will update the value with the correct number of cache - ///< properties available. 
- ze_device_cache_properties_t * pCacheProperties ///< [in,out][optional][range(0, *pCount)] array of query results for cache - ///< properties - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Device image properties queried using zeDeviceGetImageProperties - typedef struct _ze_device_image_properties_t - { - ze_structure_type_t stype; ///< [in] type of this structure - void * pNext; ///< [in,out][optional] pointer to extension-specific structure - uint32_t maxImageDims1D; ///< [out] Maximum image dimensions for 1D resources. if 0, then 1D images - ///< are unsupported. - uint32_t maxImageDims2D; ///< [out] Maximum image dimensions for 2D resources. if 0, then 2D images - ///< are unsupported. - uint32_t maxImageDims3D; ///< [out] Maximum image dimensions for 3D resources. if 0, then 3D images - ///< are unsupported. - uint64_t maxImageBufferSize; ///< [out] Maximum image buffer size in bytes. if 0, then buffer images are - ///< unsupported. - uint32_t maxImageArraySlices; ///< [out] Maximum image array slices. if 0, then image arrays are - ///< unsupported. - uint32_t maxSamplers; ///< [out] Max samplers that can be used in kernel. if 0, then sampling is - ///< unsupported. - uint32_t maxReadImageArgs; ///< [out] Returns the maximum number of simultaneous image objects that - ///< can be read from by a kernel. if 0, then reading images is - ///< unsupported. - uint32_t maxWriteImageArgs; ///< [out] Returns the maximum number of simultaneous image objects that - ///< can be written to by a kernel. if 0, then writing images is - ///< unsupported. - } ze_device_image_properties_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Retrieves image properties of the device - /// - /// @details - /// - See zeImageGetProperties for format-specific capabilities. - /// - The application may call this function from simultaneous threads. 
- /// - The implementation of this function should be lock-free. - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hDevice` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == pImageProperties` - ZE_APIEXPORT ze_result_t ZE_APICALL zeDeviceGetImageProperties( - ze_device_handle_t hDevice, ///< [in] handle of the device - ze_device_image_properties_t * pImageProperties ///< [in,out] query result for image properties - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Device external memory import and export properties - typedef struct _ze_device_external_memory_properties_t - { - ze_structure_type_t stype; ///< [in] type of this structure - void * pNext; ///< [in,out][optional] pointer to extension-specific structure - ze_external_memory_type_flags_t memoryAllocationImportTypes; ///< [out] Supported external memory import types for memory allocations. - ze_external_memory_type_flags_t memoryAllocationExportTypes; ///< [out] Supported external memory export types for memory allocations. - ze_external_memory_type_flags_t imageImportTypes; ///< [out] Supported external memory import types for images. - ze_external_memory_type_flags_t imageExportTypes; ///< [out] Supported external memory export types for images. - } ze_device_external_memory_properties_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Retrieves external memory import and export of the device - /// - /// @details - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function should be lock-free. 
- /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hDevice` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == pExternalMemoryProperties` - ZE_APIEXPORT ze_result_t ZE_APICALL zeDeviceGetExternalMemoryProperties( - ze_device_handle_t hDevice, ///< [in] handle of the device - ze_device_external_memory_properties_t * pExternalMemoryProperties ///< [in,out] query result for external memory properties - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Supported device peer-to-peer property flags - typedef uint32_t ze_device_p2p_property_flags_t; - typedef enum _ze_device_p2p_property_flag_t - { - ZE_DEVICE_P2P_PROPERTY_FLAG_ACCESS = ZE_BIT(0), ///< Device supports access between peer devices. - ZE_DEVICE_P2P_PROPERTY_FLAG_ATOMICS = ZE_BIT(1), ///< Device supports atomics between peer devices. - ZE_DEVICE_P2P_PROPERTY_FLAG_FORCE_UINT32 = 0x7fffffff - } ze_device_p2p_property_flag_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Device peer-to-peer properties queried using - /// zeDeviceGetP2PProperties - typedef struct _ze_device_p2p_properties_t - { - ze_structure_type_t stype; ///< [in] type of this structure - void * pNext; ///< [in,out][optional] pointer to extension-specific structure - ze_device_p2p_property_flags_t flags; ///< [out] 0 (none) or a valid combination of - ///< ze_device_p2p_property_flag_t - } ze_device_p2p_properties_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Retrieves peer-to-peer properties between one device and a peer - /// devices - /// - /// @details - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function should be lock-free. 
- /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hDevice` - /// + `nullptr == hPeerDevice` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == pP2PProperties` - ZE_APIEXPORT ze_result_t ZE_APICALL zeDeviceGetP2PProperties( - ze_device_handle_t hDevice, ///< [in] handle of the device performing the access - ze_device_handle_t hPeerDevice, ///< [in] handle of the peer device with the allocation - ze_device_p2p_properties_t * pP2PProperties ///< [in,out] Peer-to-Peer properties between source and peer device - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Queries if one device can directly access peer device allocations - /// - /// @details - /// - Any device can access any other device within a node through a - /// scale-up fabric. - /// - The following are conditions for CanAccessPeer query. - /// + If both device and peer device are the same then return true. - /// + If both sub-device and peer sub-device are the same then return - /// true. - /// + If both are sub-devices and share the same parent device then - /// return true. - /// + If both device and remote device are connected by a direct or - /// indirect scale-up fabric or over PCIe (same root complex or shared - /// PCIe switch) then true. - /// + If both sub-device and remote parent device (and vice-versa) are - /// connected by a direct or indirect scale-up fabric or over PCIe - /// (same root complex or shared PCIe switch) then true. - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function should be lock-free. 
- /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hDevice` - /// + `nullptr == hPeerDevice` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == value` - ZE_APIEXPORT ze_result_t ZE_APICALL zeDeviceCanAccessPeer(ze_device_handle_t hDevice, ///< [in] handle of the device performing the access - ze_device_handle_t hPeerDevice, ///< [in] handle of the peer device with the allocation - ze_bool_t * value ///< [out] returned access capability - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Returns current status of the device. - /// - /// @details - /// - Once a device is reset, this call will update the OS handle attached - /// to the device handle. - /// - The application may call this function from simultaneous threads with - /// the same device handle. - /// - The implementation of this function must be thread-safe. - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hDevice` - /// - ZE_RESULT_SUCCESS - /// + Device is available for use. - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// + Device is lost; must be reset for use. 
- ZE_APIEXPORT ze_result_t ZE_APICALL zeDeviceGetStatus(ze_device_handle_t hDevice ///< [in] handle of the device - ); - -#if !defined(__GNUC__) - #pragma endregion -#endif -// Intel 'oneAPI' Level-Zero APIs for Context -#if !defined(__GNUC__) - #pragma region context -#endif - /////////////////////////////////////////////////////////////////////////////// - /// @brief Supported context creation flags - typedef uint32_t ze_context_flags_t; - typedef enum _ze_context_flag_t - { - ZE_CONTEXT_FLAG_TBD = ZE_BIT(0), ///< reserved for future use - ZE_CONTEXT_FLAG_FORCE_UINT32 = 0x7fffffff - } ze_context_flag_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Context descriptor - typedef struct _ze_context_desc_t - { - ze_structure_type_t stype; ///< [in] type of this structure - const void * pNext; ///< [in][optional] pointer to extension-specific structure - ze_context_flags_t flags; ///< [in] creation flags. - ///< must be 0 (default) or a valid combination of ze_context_flag_t; - ///< default behavior may use implicit driver-based heuristics. - } ze_context_desc_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Creates a context for the driver. - /// - /// @details - /// - The application must only use the context for the driver which was - /// provided during creation. - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function must be thread-safe. 
- /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hDriver` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == desc` - /// + `nullptr == phContext` - /// - ZE_RESULT_ERROR_INVALID_ENUMERATION - /// + `0x1 < desc->flags` - /// - ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY - /// - ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY - ZE_APIEXPORT ze_result_t ZE_APICALL zeContextCreate(ze_driver_handle_t hDriver, ///< [in] handle of the driver object - const ze_context_desc_t * desc, ///< [in] pointer to context descriptor - ze_context_handle_t * phContext ///< [out] pointer to handle of context object created - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Destroys a context. - /// - /// @details - /// - The application must ensure the device is not currently referencing - /// the context before it is deleted. - /// - The implementation of this function may immediately free all Host and - /// Device allocations associated with this context. - /// - The application must **not** call this function from simultaneous - /// threads with the same context handle. - /// - The implementation of this function must be thread-safe. - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hContext` - /// - ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE - ZE_APIEXPORT ze_result_t ZE_APICALL zeContextDestroy(ze_context_handle_t hContext ///< [in][release] handle of context object to destroy - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Returns current status of the context. - /// - /// @details - /// - The application may call this function from simultaneous threads with - /// the same context handle. 
- /// - The implementation of this function should be lock-free. - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hContext` - /// - ZE_RESULT_SUCCESS - /// + Context is available for use. - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// + Context is invalid; due to device lost or reset. - ZE_APIEXPORT ze_result_t ZE_APICALL zeContextGetStatus(ze_context_handle_t hContext ///< [in] handle of context object - ); - -#if !defined(__GNUC__) - #pragma endregion -#endif -// Intel 'oneAPI' Level-Zero APIs for Command Queue -#if !defined(__GNUC__) - #pragma region cmdqueue -#endif - /////////////////////////////////////////////////////////////////////////////// - /// @brief Supported command queue flags - typedef uint32_t ze_command_queue_flags_t; - typedef enum _ze_command_queue_flag_t - { - ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY = ZE_BIT(0), ///< command queue should be optimized for submission to a single device engine. - ///< driver **must** disable any implicit optimizations for distributing - ///< work across multiple engines. - ///< this flag should be used when applications want full control over - ///< multi-engine submission and scheduling. 
- ZE_COMMAND_QUEUE_FLAG_FORCE_UINT32 = 0x7fffffff - } ze_command_queue_flag_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Supported command queue modes - typedef enum _ze_command_queue_mode_t - { - ZE_COMMAND_QUEUE_MODE_DEFAULT = 0, ///< implicit default behavior; uses driver-based heuristics - ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS = 1, ///< Device execution always completes immediately on execute; - ///< Host thread is blocked using wait on implicit synchronization object - ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS = 2, ///< Device execution is scheduled and will complete in future; - ///< explicit synchronization object must be used to determine completeness - ZE_COMMAND_QUEUE_MODE_FORCE_UINT32 = 0x7fffffff - } ze_command_queue_mode_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Supported command queue priorities - typedef enum _ze_command_queue_priority_t - { - ZE_COMMAND_QUEUE_PRIORITY_NORMAL = 0, ///< [default] normal priority - ZE_COMMAND_QUEUE_PRIORITY_PRIORITY_LOW = 1, ///< lower priority than normal - ZE_COMMAND_QUEUE_PRIORITY_PRIORITY_HIGH = 2, ///< higher priority than normal - ZE_COMMAND_QUEUE_PRIORITY_FORCE_UINT32 = 0x7fffffff - } ze_command_queue_priority_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Command Queue descriptor - typedef struct _ze_command_queue_desc_t - { - ze_structure_type_t stype; ///< [in] type of this structure - const void * pNext; ///< [in][optional] pointer to extension-specific structure - uint32_t ordinal; ///< [in] command queue group ordinal - uint32_t index; ///< [in] command queue index within the group; - ///< must be zero if ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY is not set - ze_command_queue_flags_t flags; ///< [in] usage flags. 
- ///< must be 0 (default) or a valid combination of ze_command_queue_flag_t; - ///< default behavior may use implicit driver-based heuristics to balance - ///< latency and throughput. - ze_command_queue_mode_t mode; ///< [in] operation mode - ze_command_queue_priority_t priority; ///< [in] priority - } ze_command_queue_desc_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Creates a command queue on the context. - /// - /// @details - /// - A command queue represents a logical input stream to the device, tied - /// to a physical input stream. - /// - The application must only use the command queue for the device, or its - /// sub-devices, which was provided during creation. - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function must be thread-safe. - /// - /// @remarks - /// _Analogues_ - /// - **clCreateCommandQueue** - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hContext` - /// + `nullptr == hDevice` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == desc` - /// + `nullptr == phCommandQueue` - /// - ZE_RESULT_ERROR_INVALID_ENUMERATION - /// + `0x1 < desc->flags` - /// + `ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS < desc->mode` - /// + `ZE_COMMAND_QUEUE_PRIORITY_PRIORITY_HIGH < desc->priority` - /// - ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY - /// - ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY - ZE_APIEXPORT ze_result_t ZE_APICALL zeCommandQueueCreate( - ze_context_handle_t hContext, ///< [in] handle of the context object - ze_device_handle_t hDevice, ///< [in] handle of the device object - const ze_command_queue_desc_t * desc, ///< [in] pointer to command queue descriptor - ze_command_queue_handle_t * phCommandQueue ///< [out] pointer to handle of command queue object created - ); - - 
/////////////////////////////////////////////////////////////////////////////// - /// @brief Destroys a command queue. - /// - /// @details - /// - The application must destroy all fence handles created from the - /// command queue before destroying the command queue itself - /// - The application must ensure the device is not currently referencing - /// the command queue before it is deleted - /// - The implementation of this function may immediately free all Host and - /// Device allocations associated with this command queue - /// - The application must **not** call this function from simultaneous - /// threads with the same command queue handle. - /// - The implementation of this function must be thread-safe. - /// - /// @remarks - /// _Analogues_ - /// - **clReleaseCommandQueue** - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hCommandQueue` - /// - ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE - ZE_APIEXPORT ze_result_t ZE_APICALL zeCommandQueueDestroy( - ze_command_queue_handle_t hCommandQueue ///< [in][release] handle of command queue object to destroy - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Executes a command list in a command queue. - /// - /// @details - /// - The application must ensure the command lists are accessible by the - /// device on which the command queue was created. - /// - The application must only execute command lists created with an - /// identical command queue group ordinal to the command queue. - /// - The application must use a fence created using the same command queue. - /// - The application must ensure the command queue, command list and fence - /// were created on the same context. - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function should be lock-free. 
- /// - /// @remarks - /// _Analogues_ - /// - vkQueueSubmit - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hCommandQueue` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == phCommandLists` - /// - ZE_RESULT_ERROR_INVALID_SIZE - /// + `0 == numCommandLists` - /// - ZE_RESULT_ERROR_INVALID_COMMAND_LIST_TYPE - /// - ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT - ZE_APIEXPORT ze_result_t ZE_APICALL zeCommandQueueExecuteCommandLists( - ze_command_queue_handle_t hCommandQueue, ///< [in] handle of the command queue - uint32_t numCommandLists, ///< [in] number of command lists to execute - ze_command_list_handle_t * phCommandLists, ///< [in][range(0, numCommandLists)] list of handles of the command lists - ///< to execute - ze_fence_handle_t hFence ///< [in][optional] handle of the fence to signal on completion - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Synchronizes a command queue by waiting on the host. - /// - /// @details - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function should be lock-free. 
- /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hCommandQueue` - /// - ZE_RESULT_NOT_READY - /// + timeout expired - ZE_APIEXPORT ze_result_t ZE_APICALL zeCommandQueueSynchronize( - ze_command_queue_handle_t hCommandQueue, ///< [in] handle of the command queue - uint64_t timeout ///< [in] if non-zero, then indicates the maximum time (in nanoseconds) to - ///< yield before returning ZE_RESULT_SUCCESS or ZE_RESULT_NOT_READY; - ///< if zero, then immediately returns the status of the command queue; - ///< if UINT64_MAX, then function will not return until complete or device - ///< is lost. - ///< Due to external dependencies, timeout may be rounded to the closest - ///< value allowed by the accuracy of those dependencies. - ); - -#if !defined(__GNUC__) - #pragma endregion -#endif -// Intel 'oneAPI' Level-Zero APIs for Command List -#if !defined(__GNUC__) - #pragma region cmdlist -#endif - /////////////////////////////////////////////////////////////////////////////// - /// @brief Supported command list creation flags - typedef uint32_t ze_command_list_flags_t; - typedef enum _ze_command_list_flag_t - { - ZE_COMMAND_LIST_FLAG_RELAXED_ORDERING = ZE_BIT(0), ///< driver may reorder commands (e.g., kernels, copies) between barriers - ///< and synchronization primitives. - ///< using this flag may increase Host overhead of zeCommandListClose. - ///< therefore, this flag should **not** be set for low-latency usage-models. - ZE_COMMAND_LIST_FLAG_MAXIMIZE_THROUGHPUT = ZE_BIT(1), ///< driver may perform additional optimizations that increase execution - ///< throughput. - ///< using this flag may increase Host overhead of zeCommandListClose and zeCommandQueueExecuteCommandLists. - ///< therefore, this flag should **not** be set for low-latency usage-models. 
- ZE_COMMAND_LIST_FLAG_EXPLICIT_ONLY = ZE_BIT(2), ///< command list should be optimized for submission to a single command - ///< queue and device engine. - ///< driver **must** disable any implicit optimizations for distributing - ///< work across multiple engines. - ///< this flag should be used when applications want full control over - ///< multi-engine submission and scheduling. - ZE_COMMAND_LIST_FLAG_FORCE_UINT32 = 0x7fffffff - } ze_command_list_flag_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Command List descriptor - typedef struct _ze_command_list_desc_t - { - ze_structure_type_t stype; ///< [in] type of this structure - const void * pNext; ///< [in][optional] pointer to extension-specific structure - uint32_t commandQueueGroupOrdinal; ///< [in] command queue group ordinal to which this command list will be - ///< submitted - ze_command_list_flags_t flags; ///< [in] usage flags. - ///< must be 0 (default) or a valid combination of ze_command_list_flag_t; - ///< default behavior may use implicit driver-based heuristics to balance - ///< latency and throughput. - } ze_command_list_desc_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Creates a command list on the context. - /// - /// @details - /// - A command list represents a sequence of commands for execution on a - /// command queue. - /// - The command list is created in the 'open' state. - /// - The application must only use the command list for the device, or its - /// sub-devices, which was provided during creation. - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function must be thread-safe. 
- /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hContext` - /// + `nullptr == hDevice` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == desc` - /// + `nullptr == phCommandList` - /// - ZE_RESULT_ERROR_INVALID_ENUMERATION - /// + `0x7 < desc->flags` - /// - ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY - /// - ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY - ZE_APIEXPORT ze_result_t ZE_APICALL zeCommandListCreate( - ze_context_handle_t hContext, ///< [in] handle of the context object - ze_device_handle_t hDevice, ///< [in] handle of the device object - const ze_command_list_desc_t * desc, ///< [in] pointer to command list descriptor - ze_command_list_handle_t * phCommandList ///< [out] pointer to handle of command list object created - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Creates an immediate command list on the context. - /// - /// @details - /// - An immediate command list is used for low-latency submission of - /// commands. - /// - An immediate command list creates an implicit command queue. - /// - The command list is created in the 'open' state and never needs to be - /// closed. - /// - The application must only use the command list for the device, or its - /// sub-devices, which was provided during creation. - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function must be thread-safe. 
- /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hContext` - /// + `nullptr == hDevice` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == altdesc` - /// + `nullptr == phCommandList` - /// - ZE_RESULT_ERROR_INVALID_ENUMERATION - /// + `0x1 < altdesc->flags` - /// + `ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS < altdesc->mode` - /// + `ZE_COMMAND_QUEUE_PRIORITY_PRIORITY_HIGH < altdesc->priority` - /// - ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY - /// - ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY - ZE_APIEXPORT ze_result_t ZE_APICALL zeCommandListCreateImmediate( - ze_context_handle_t hContext, ///< [in] handle of the context object - ze_device_handle_t hDevice, ///< [in] handle of the device object - const ze_command_queue_desc_t * altdesc, ///< [in] pointer to command queue descriptor - ze_command_list_handle_t * phCommandList ///< [out] pointer to handle of command list object created - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Destroys a command list. - /// - /// @details - /// - The application must ensure the device is not currently referencing - /// the command list before it is deleted. - /// - The implementation of this function may immediately free all Host and - /// Device allocations associated with this command list. - /// - The application must **not** call this function from simultaneous - /// threads with the same command list handle. - /// - The implementation of this function must be thread-safe. 
- /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hCommandList` - /// - ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE - ZE_APIEXPORT ze_result_t ZE_APICALL zeCommandListDestroy( - ze_command_list_handle_t hCommandList ///< [in][release] handle of command list object to destroy - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Closes a command list; ready to be executed by a command queue. - /// - /// @details - /// - The application must **not** call this function from simultaneous - /// threads with the same command list handle. - /// - The implementation of this function should be lock-free. - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hCommandList` - ZE_APIEXPORT ze_result_t ZE_APICALL zeCommandListClose(ze_command_list_handle_t hCommandList ///< [in] handle of command list object to close - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Reset a command list to initial (empty) state; ready for appending - /// commands. - /// - /// @details - /// - The application must ensure the device is not currently referencing - /// the command list before it is reset - /// - The application must **not** call this function from simultaneous - /// threads with the same command list handle. - /// - The implementation of this function should be lock-free. 
- /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hCommandList` - ZE_APIEXPORT ze_result_t ZE_APICALL zeCommandListReset(ze_command_list_handle_t hCommandList ///< [in] handle of command list object to reset - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Appends a memory write of the device's global timestamp value into a - /// command list. - /// - /// @details - /// - The application must ensure the events are accessible by the device on - /// which the command list was created. - /// - The timestamp frequency can be queried from - /// ze_device_properties_t.timerResolution. - /// - The number of valid bits in the timestamp value can be queried from - /// ze_device_properties_t.timestampValidBits. - /// - The application must ensure the memory pointed to by dstptr is - /// accessible by the device on which the command list was created. - /// - The application must ensure the command list and events were created, - /// and the memory was allocated, on the same context. - /// - The application must **not** call this function from simultaneous - /// threads with the same command list handle. - /// - The implementation of this function should be lock-free. 
- /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hCommandList` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == dstptr` - /// - ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT - /// - ZE_RESULT_ERROR_INVALID_SIZE - /// + `(nullptr == phWaitEvents) && (0 < numWaitEvents)` - ZE_APIEXPORT ze_result_t ZE_APICALL zeCommandListAppendWriteGlobalTimestamp( - ze_command_list_handle_t hCommandList, ///< [in] handle of the command list - uint64_t * dstptr, ///< [in,out] pointer to memory where timestamp value will be written; must - ///< be 8byte-aligned. - ze_event_handle_t hSignalEvent, ///< [in][optional] handle of the event to signal on completion - uint32_t numWaitEvents, ///< [in][optional] number of events to wait on before executing query; - ///< must be 0 if `nullptr == phWaitEvents` - ze_event_handle_t * phWaitEvents ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait - ///< on before executing query - ); - -#if !defined(__GNUC__) - #pragma endregion -#endif -// Intel 'oneAPI' Level-Zero APIs for Barrier -#if !defined(__GNUC__) - #pragma region barrier -#endif - /////////////////////////////////////////////////////////////////////////////// - /// @brief Appends an execution and global memory barrier into a command list. - /// - /// @details - /// - The application must ensure the events are accessible by the device on - /// which the command list was created. - /// - If numWaitEvents is zero, then all previous commands are completed - /// prior to the execution of the barrier. - /// - If numWaitEvents is non-zero, then then all phWaitEvents must be - /// signaled prior to the execution of the barrier. - /// - This command blocks all following commands from beginning until the - /// execution of the barrier completes. 
- /// - The application must **not** call this function from simultaneous - /// threads with the same command list handle. - /// - The implementation of this function should be lock-free. - /// - /// @remarks - /// _Analogues_ - /// - **vkCmdPipelineBarrier** - /// - clEnqueueBarrierWithWaitList - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hCommandList` - /// - ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT - /// - ZE_RESULT_ERROR_INVALID_SIZE - /// + `(nullptr == phWaitEvents) && (0 < numWaitEvents)` - ZE_APIEXPORT ze_result_t ZE_APICALL zeCommandListAppendBarrier( - ze_command_list_handle_t hCommandList, ///< [in] handle of the command list - ze_event_handle_t hSignalEvent, ///< [in][optional] handle of the event to signal on completion - uint32_t numWaitEvents, ///< [in][optional] number of events to wait on before executing barrier; - ///< must be 0 if `nullptr == phWaitEvents` - ze_event_handle_t * phWaitEvents ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait - ///< on before executing barrier - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Appends a global memory ranges barrier into a command list. - /// - /// @details - /// - The application must ensure the events are accessible by the device on - /// which the command list was created. - /// - If numWaitEvents is zero, then all previous commands are completed - /// prior to the execution of the barrier. - /// - If numWaitEvents is non-zero, then then all phWaitEvents must be - /// signaled prior to the execution of the barrier. - /// - This command blocks all following commands from beginning until the - /// execution of the barrier completes. - /// - The application must **not** call this function from simultaneous - /// threads with the same command list handle. 
- /// - The implementation of this function should be lock-free. - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hCommandList` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == pRangeSizes` - /// + `nullptr == pRanges` - /// - ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT - /// - ZE_RESULT_ERROR_INVALID_SIZE - /// + `(nullptr == phWaitEvents) && (0 < numWaitEvents)` - ZE_APIEXPORT ze_result_t ZE_APICALL zeCommandListAppendMemoryRangesBarrier( - ze_command_list_handle_t hCommandList, ///< [in] handle of the command list - uint32_t numRanges, ///< [in] number of memory ranges - const size_t * pRangeSizes, ///< [in][range(0, numRanges)] array of sizes of memory range - const void ** pRanges, ///< [in][range(0, numRanges)] array of memory ranges - ze_event_handle_t hSignalEvent, ///< [in][optional] handle of the event to signal on completion - uint32_t numWaitEvents, ///< [in][optional] number of events to wait on before executing barrier; - ///< must be 0 if `nullptr == phWaitEvents` - ze_event_handle_t * phWaitEvents ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait - ///< on before executing barrier - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Ensures in-bound writes to the device are globally observable. - /// - /// @details - /// - This is a special-case system level barrier that can be used to ensure - /// global observability of writes; - /// typically needed after a producer (e.g., NIC) performs direct writes - /// to the device's memory (e.g., Direct RDMA writes). - /// This is typically required when the memory corresponding to the writes - /// is subsequently accessed from a remote device. - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function should be lock-free. 
- /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hContext` - /// + `nullptr == hDevice` - ZE_APIEXPORT ze_result_t ZE_APICALL zeContextSystemBarrier(ze_context_handle_t hContext, ///< [in] handle of context object - ze_device_handle_t hDevice ///< [in] handle of the device - ); - -#if !defined(__GNUC__) - #pragma endregion -#endif -// Intel 'oneAPI' Level-Zero APIs for Copies -#if !defined(__GNUC__) - #pragma region copy -#endif - /////////////////////////////////////////////////////////////////////////////// - /// @brief Copies host, device, or shared memory. - /// - /// @details - /// - The application must ensure the memory pointed to by dstptr and srcptr - /// is accessible by the device on which the command list was created. - /// - The implementation must not access the memory pointed to by dstptr and - /// srcptr as they are free to be modified by either the Host or device up - /// until execution. - /// - The application must ensure the events are accessible by the device on - /// which the command list was created. - /// - The application must ensure the command list and events were created, - /// and the memory was allocated, on the same context. - /// - The application must **not** call this function from simultaneous - /// threads with the same command list handle. - /// - The implementation of this function should be lock-free. 
- /// - /// @remarks - /// _Analogues_ - /// - **clEnqueueCopyBuffer** - /// - **clEnqueueReadBuffer** - /// - **clEnqueueWriteBuffer** - /// - **clEnqueueSVMMemcpy** - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hCommandList` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == dstptr` - /// + `nullptr == srcptr` - /// - ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT - /// - ZE_RESULT_ERROR_INVALID_SIZE - /// + `(nullptr == phWaitEvents) && (0 < numWaitEvents)` - ZE_APIEXPORT ze_result_t ZE_APICALL zeCommandListAppendMemoryCopy( - ze_command_list_handle_t hCommandList, ///< [in] handle of command list - void * dstptr, ///< [in] pointer to destination memory to copy to - const void * srcptr, ///< [in] pointer to source memory to copy from - size_t size, ///< [in] size in bytes to copy - ze_event_handle_t hSignalEvent, ///< [in][optional] handle of the event to signal on completion - uint32_t numWaitEvents, ///< [in][optional] number of events to wait on before launching; must be 0 - ///< if `nullptr == phWaitEvents` - ze_event_handle_t * phWaitEvents ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait - ///< on before launching - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Initializes host, device, or shared memory. - /// - /// @details - /// - The application must ensure the memory pointed to by dstptr is - /// accessible by the device on which the command list was created. - /// - The implementation must not access the memory pointed to by dstptr as - /// it is free to be modified by either the Host or device up until - /// execution. - /// - The value to initialize memory to is described by the pattern and the - /// pattern size. 
- /// - The pattern size must be a power-of-two and less than - /// ze_command_queue_group_properties_t.maxMemoryFillPatternSize. - /// - The application must ensure the events are accessible by the device on - /// which the command list was created. - /// - The application must enusre the command list and events were created, - /// and the memory was allocated, on the same context. - /// - The application must **not** call this function from simultaneous - /// threads with the same command list handle. - /// - The implementation of this function should be lock-free. - /// - /// @remarks - /// _Analogues_ - /// - **clEnqueueFillBuffer** - /// - **clEnqueueSVMMemFill** - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hCommandList` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == ptr` - /// + `nullptr == pattern` - /// - ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT - /// - ZE_RESULT_ERROR_INVALID_SIZE - /// + `(nullptr == phWaitEvents) && (0 < numWaitEvents)` - ZE_APIEXPORT ze_result_t ZE_APICALL zeCommandListAppendMemoryFill( - ze_command_list_handle_t hCommandList, ///< [in] handle of command list - void * ptr, ///< [in] pointer to memory to initialize - const void * pattern, ///< [in] pointer to value to initialize memory to - size_t pattern_size, ///< [in] size in bytes of the value to initialize memory to - size_t size, ///< [in] size in bytes to initialize - ze_event_handle_t hSignalEvent, ///< [in][optional] handle of the event to signal on completion - uint32_t numWaitEvents, ///< [in][optional] number of events to wait on before launching; must be 0 - ///< if `nullptr == phWaitEvents` - ze_event_handle_t * phWaitEvents ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait - ///< on before launching - ); - - /////////////////////////////////////////////////////////////////////////////// - 
/// @brief Copy region descriptor - typedef struct _ze_copy_region_t - { - uint32_t originX; ///< [in] The origin x offset for region in bytes - uint32_t originY; ///< [in] The origin y offset for region in rows - uint32_t originZ; ///< [in] The origin z offset for region in slices - uint32_t width; ///< [in] The region width relative to origin in bytes - uint32_t height; ///< [in] The region height relative to origin in rows - uint32_t depth; ///< [in] The region depth relative to origin in slices. Set this to 0 for - ///< 2D copy. - } ze_copy_region_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Copies a region from a 2D or 3D array of host, device, or shared - /// memory. - /// - /// @details - /// - The application must ensure the memory pointed to by dstptr and srcptr - /// is accessible by the device on which the command list was created. - /// - The implementation must not access the memory pointed to by dstptr and - /// srcptr as they are free to be modified by either the Host or device up - /// until execution. - /// - The region width, height, and depth for both src and dst must be same. - /// The origins can be different. - /// - The src and dst regions cannot be overlapping. - /// - The application must ensure the events are accessible by the device on - /// which the command list was created. - /// - The application must ensure the command list and events were created, - /// and the memory was allocated, on the same context. - /// - The application must **not** call this function from simultaneous - /// threads with the same command list handle. - /// - The implementation of this function should be lock-free. 
- /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hCommandList` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == dstptr` - /// + `nullptr == dstRegion` - /// + `nullptr == srcptr` - /// + `nullptr == srcRegion` - /// - ZE_RESULT_ERROR_OVERLAPPING_REGIONS - /// - ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT - /// - ZE_RESULT_ERROR_INVALID_SIZE - /// + `(nullptr == phWaitEvents) && (0 < numWaitEvents)` - ZE_APIEXPORT ze_result_t ZE_APICALL zeCommandListAppendMemoryCopyRegion( - ze_command_list_handle_t hCommandList, ///< [in] handle of command list - void * dstptr, ///< [in] pointer to destination memory to copy to - const ze_copy_region_t * dstRegion, ///< [in] pointer to destination region to copy to - uint32_t dstPitch, ///< [in] destination pitch in bytes - uint32_t dstSlicePitch, ///< [in] destination slice pitch in bytes. This is required for 3D region - ///< copies where ze_copy_region_t.depth is not 0, otherwise it's - ///< ignored. - const void * srcptr, ///< [in] pointer to source memory to copy from - const ze_copy_region_t * srcRegion, ///< [in] pointer to source region to copy from - uint32_t srcPitch, ///< [in] source pitch in bytes - uint32_t srcSlicePitch, ///< [in] source slice pitch in bytes. This is required for 3D region - ///< copies where ze_copy_region_t.depth is not 0, otherwise it's - ///< ignored. 
- ze_event_handle_t hSignalEvent, ///< [in][optional] handle of the event to signal on completion - uint32_t numWaitEvents, ///< [in][optional] number of events to wait on before launching; must be 0 - ///< if `nullptr == phWaitEvents` - ze_event_handle_t * phWaitEvents ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait - ///< on before launching - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Copies host, device, or shared memory from another context. - /// - /// @details - /// - The current active and source context must be from the same driver. - /// - The application must ensure the memory pointed to by dstptr and srcptr - /// is accessible by the device on which the command list was created. - /// - The implementation must not access the memory pointed to by dstptr and - /// srcptr as they are free to be modified by either the Host or device up - /// until execution. - /// - The application must ensure the events are accessible by the device on - /// which the command list was created. - /// - The application must ensure the command list and events were created, - /// and the memory was allocated, on the same context. - /// - The application must **not** call this function from simultaneous - /// threads with the same command list handle. - /// - The implementation of this function should be lock-free. 
- /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hCommandList` - /// + `nullptr == hContextSrc` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == dstptr` - /// + `nullptr == srcptr` - /// - ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT - /// - ZE_RESULT_ERROR_INVALID_SIZE - /// + `(nullptr == phWaitEvents) && (0 < numWaitEvents)` - ZE_APIEXPORT ze_result_t ZE_APICALL zeCommandListAppendMemoryCopyFromContext( - ze_command_list_handle_t hCommandList, ///< [in] handle of command list - void * dstptr, ///< [in] pointer to destination memory to copy to - ze_context_handle_t hContextSrc, ///< [in] handle of source context object - const void * srcptr, ///< [in] pointer to source memory to copy from - size_t size, ///< [in] size in bytes to copy - ze_event_handle_t hSignalEvent, ///< [in][optional] handle of the event to signal on completion - uint32_t numWaitEvents, ///< [in][optional] number of events to wait on before launching; must be 0 - ///< if `nullptr == phWaitEvents` - ze_event_handle_t * phWaitEvents ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait - ///< on before launching - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Copies an image. - /// - /// @details - /// - The application must ensure the image and events are accessible by the - /// device on which the command list was created. - /// - The application must ensure the image format descriptors for both - /// source and destination images are the same. - /// - The application must ensure the command list, images and events were - /// created on the same context. - /// - The application must **not** call this function from simultaneous - /// threads with the same command list handle. - /// - The implementation of this function should be lock-free. 
- /// - /// @remarks - /// _Analogues_ - /// - **clEnqueueCopyImage** - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hCommandList` - /// + `nullptr == hDstImage` - /// + `nullptr == hSrcImage` - /// - ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT - /// - ZE_RESULT_ERROR_INVALID_SIZE - /// + `(nullptr == phWaitEvents) && (0 < numWaitEvents)` - ZE_APIEXPORT ze_result_t ZE_APICALL zeCommandListAppendImageCopy( - ze_command_list_handle_t hCommandList, ///< [in] handle of command list - ze_image_handle_t hDstImage, ///< [in] handle of destination image to copy to - ze_image_handle_t hSrcImage, ///< [in] handle of source image to copy from - ze_event_handle_t hSignalEvent, ///< [in][optional] handle of the event to signal on completion - uint32_t numWaitEvents, ///< [in][optional] number of events to wait on before launching; must be 0 - ///< if `nullptr == phWaitEvents` - ze_event_handle_t * phWaitEvents ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait - ///< on before launching - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Region descriptor - typedef struct _ze_image_region_t - { - uint32_t originX; ///< [in] The origin x offset for region in pixels - uint32_t originY; ///< [in] The origin y offset for region in pixels - uint32_t originZ; ///< [in] The origin z offset for region in pixels - uint32_t width; ///< [in] The region width relative to origin in pixels - uint32_t height; ///< [in] The region height relative to origin in pixels - uint32_t depth; ///< [in] The region depth relative to origin. For 1D or 2D images, set - ///< this to 1. - } ze_image_region_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Copies a region of an image to another image. 
- /// - /// @details - /// - The application must ensure the image and events are accessible by the - /// device on which the command list was created. - /// - The region width and height for both src and dst must be same. The - /// origins can be different. - /// - The src and dst regions cannot be overlapping. - /// - The application must ensure the image format descriptors for both - /// source and destination images are the same. - /// - The application must ensure the command list, images and events were - /// created, and the memory was allocated, on the same context. - /// - The application must **not** call this function from simultaneous - /// threads with the same command list handle. - /// - The implementation of this function should be lock-free. - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hCommandList` - /// + `nullptr == hDstImage` - /// + `nullptr == hSrcImage` - /// - ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT - /// - ZE_RESULT_ERROR_OVERLAPPING_REGIONS - /// - ZE_RESULT_ERROR_INVALID_SIZE - /// + `(nullptr == phWaitEvents) && (0 < numWaitEvents)` - ZE_APIEXPORT ze_result_t ZE_APICALL zeCommandListAppendImageCopyRegion( - ze_command_list_handle_t hCommandList, ///< [in] handle of command list - ze_image_handle_t hDstImage, ///< [in] handle of destination image to copy to - ze_image_handle_t hSrcImage, ///< [in] handle of source image to copy from - const ze_image_region_t * pDstRegion, ///< [in][optional] destination region descriptor - const ze_image_region_t * pSrcRegion, ///< [in][optional] source region descriptor - ze_event_handle_t hSignalEvent, ///< [in][optional] handle of the event to signal on completion - uint32_t numWaitEvents, ///< [in][optional] number of events to wait on before launching; must be 0 - ///< if `nullptr == phWaitEvents` - ze_event_handle_t * phWaitEvents ///< 
[in][optional][range(0, numWaitEvents)] handle of the events to wait - ///< on before launching - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Copies from an image to device or shared memory. - /// - /// @details - /// - The application must ensure the memory pointed to by dstptr is - /// accessible by the device on which the command list was created. - /// - The implementation must not access the memory pointed to by dstptr as - /// it is free to be modified by either the Host or device up until - /// execution. - /// - The application must ensure the image and events are accessible by the - /// device on which the command list was created. - /// - The application must ensure the image format descriptor for the source - /// image is not a media format. - /// - The application must ensure the command list, image and events were - /// created, and the memory was allocated, on the same context. - /// - The application must **not** call this function from simultaneous - /// threads with the same command list handle. - /// - The implementation of this function should be lock-free. 
- /// - /// @remarks - /// _Analogues_ - /// - clEnqueueReadImage - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hCommandList` - /// + `nullptr == hSrcImage` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == dstptr` - /// - ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT - /// - ZE_RESULT_ERROR_INVALID_SIZE - /// + `(nullptr == phWaitEvents) && (0 < numWaitEvents)` - ZE_APIEXPORT ze_result_t ZE_APICALL zeCommandListAppendImageCopyToMemory( - ze_command_list_handle_t hCommandList, ///< [in] handle of command list - void * dstptr, ///< [in] pointer to destination memory to copy to - ze_image_handle_t hSrcImage, ///< [in] handle of source image to copy from - const ze_image_region_t * pSrcRegion, ///< [in][optional] source region descriptor - ze_event_handle_t hSignalEvent, ///< [in][optional] handle of the event to signal on completion - uint32_t numWaitEvents, ///< [in][optional] number of events to wait on before launching; must be 0 - ///< if `nullptr == phWaitEvents` - ze_event_handle_t * phWaitEvents ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait - ///< on before launching - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Copies to an image from device or shared memory. - /// - /// @details - /// - The application must ensure the memory pointed to by srcptr is - /// accessible by the device on which the command list was created. - /// - The implementation must not access the memory pointed to by srcptr as - /// it is free to be modified by either the Host or device up until - /// execution. - /// - The application must ensure the image and events are accessible by the - /// device on which the command list was created. - /// - The application must ensure the image format descriptor for the - /// destination image is not a media format. 
- /// - The application must ensure the command list, image and events were - /// created, and the memory was allocated, on the same context. - /// - The application must **not** call this function from simultaneous - /// threads with the same command list handle. - /// - The implementation of this function should be lock-free. - /// - /// @remarks - /// _Analogues_ - /// - clEnqueueWriteImage - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hCommandList` - /// + `nullptr == hDstImage` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == srcptr` - /// - ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT - /// - ZE_RESULT_ERROR_INVALID_SIZE - /// + `(nullptr == phWaitEvents) && (0 < numWaitEvents)` - ZE_APIEXPORT ze_result_t ZE_APICALL zeCommandListAppendImageCopyFromMemory( - ze_command_list_handle_t hCommandList, ///< [in] handle of command list - ze_image_handle_t hDstImage, ///< [in] handle of destination image to copy to - const void * srcptr, ///< [in] pointer to source memory to copy from - const ze_image_region_t * pDstRegion, ///< [in][optional] destination region descriptor - ze_event_handle_t hSignalEvent, ///< [in][optional] handle of the event to signal on completion - uint32_t numWaitEvents, ///< [in][optional] number of events to wait on before launching; must be 0 - ///< if `nullptr == phWaitEvents` - ze_event_handle_t * phWaitEvents ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait - ///< on before launching - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Asynchronously prefetches shared memory to the device associated with - /// the specified command list - /// - /// @details - /// - This is a hint to improve performance only and is not required for - /// correctness. 
- /// - Only prefetching to the device associated with the specified command - /// list is supported. - /// Prefetching to the host or to a peer device is not supported. - /// - Prefetching may not be supported for all allocation types for all devices. - /// If memory prefetching is not supported for the specified memory range - /// the prefetch hint may be ignored. - /// - Prefetching may only be supported at a device-specific granularity, - /// such as at a page boundary. - /// In this case, the memory range may be expanded such that the start and - /// end of the range satisfy granularity requirements. - /// - The application must ensure the memory pointed to by ptr is accessible - /// by the device on which the command list was created. - /// - The application must ensure the command list was created, and the - /// memory was allocated, on the same context. - /// - The application must **not** call this function from simultaneous - /// threads with the same command list handle. - /// - The implementation of this function should be lock-free. 
- /// - /// @remarks - /// _Analogues_ - /// - clEnqueueSVMMigrateMem - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hCommandList` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == ptr` - ZE_APIEXPORT ze_result_t ZE_APICALL zeCommandListAppendMemoryPrefetch(ze_command_list_handle_t hCommandList, ///< [in] handle of command list - const void * ptr, ///< [in] pointer to start of the memory range to prefetch - size_t size ///< [in] size in bytes of the memory range to prefetch - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Supported memory advice hints - typedef enum _ze_memory_advice_t - { - ZE_MEMORY_ADVICE_SET_READ_MOSTLY = 0, ///< hint that memory will be read from frequently and written to rarely - ZE_MEMORY_ADVICE_CLEAR_READ_MOSTLY = 1, ///< removes the affect of ZE_MEMORY_ADVICE_SET_READ_MOSTLY - ZE_MEMORY_ADVICE_SET_PREFERRED_LOCATION = 2, ///< hint that the preferred memory location is the specified device - ZE_MEMORY_ADVICE_CLEAR_PREFERRED_LOCATION = 3, ///< removes the affect of ZE_MEMORY_ADVICE_SET_PREFERRED_LOCATION - ZE_MEMORY_ADVICE_SET_NON_ATOMIC_MOSTLY = 4, ///< hints that memory will mostly be accessed non-atomically - ZE_MEMORY_ADVICE_CLEAR_NON_ATOMIC_MOSTLY = 5, ///< removes the affect of ZE_MEMORY_ADVICE_SET_NON_ATOMIC_MOSTLY - ZE_MEMORY_ADVICE_BIAS_CACHED = 6, ///< hints that memory should be cached - ZE_MEMORY_ADVICE_BIAS_UNCACHED = 7, ///< hints that memory should be not be cached - ZE_MEMORY_ADVICE_FORCE_UINT32 = 0x7fffffff - } ze_memory_advice_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Provides advice about the use of a shared memory range - /// - /// @details - /// - Memory advice is a performance hint only and is not required for - /// functional correctness. 
- /// - Memory advice can be used to override driver heuristics to explicitly - /// control shared memory behavior. - /// - Not all memory advice hints may be supported for all allocation types - /// for all devices. - /// If a memory advice hint is not supported by the device it will be ignored. - /// - Memory advice may only be supported at a device-specific granularity, - /// such as at a page boundary. - /// In this case, the memory range may be expanded such that the start and - /// end of the range satisfy granularity requirements. - /// - The application must ensure the memory pointed to by ptr is accessible - /// by the device on which the command list was created. - /// - The application must ensure the command list was created, and memory - /// was allocated, on the same context. - /// - The application must **not** call this function from simultaneous - /// threads with the same command list handle, and the memory was - /// allocated. - /// - The implementation of this function should be lock-free. 
- /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hCommandList` - /// + `nullptr == hDevice` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == ptr` - /// - ZE_RESULT_ERROR_INVALID_ENUMERATION - /// + `ZE_MEMORY_ADVICE_BIAS_UNCACHED < advice` - ZE_APIEXPORT ze_result_t ZE_APICALL zeCommandListAppendMemAdvise(ze_command_list_handle_t hCommandList, ///< [in] handle of command list - ze_device_handle_t hDevice, ///< [in] device associated with the memory advice - const void * ptr, ///< [in] Pointer to the start of the memory range - size_t size, ///< [in] Size in bytes of the memory range - ze_memory_advice_t advice ///< [in] Memory advice for the memory range - ); - -#if !defined(__GNUC__) - #pragma endregion -#endif -// Intel 'oneAPI' Level-Zero APIs for Event -#if !defined(__GNUC__) - #pragma region event -#endif - /////////////////////////////////////////////////////////////////////////////// - /// @brief Supported event pool creation flags - typedef uint32_t ze_event_pool_flags_t; - typedef enum _ze_event_pool_flag_t - { - ZE_EVENT_POOL_FLAG_HOST_VISIBLE = ZE_BIT(0), ///< signals and waits are also visible to host - ZE_EVENT_POOL_FLAG_IPC = ZE_BIT(1), ///< signals and waits may be shared across processes - ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP = ZE_BIT(2), ///< Indicates all events in pool will contain kernel timestamps; cannot be - ///< combined with ZE_EVENT_POOL_FLAG_IPC - ZE_EVENT_POOL_FLAG_FORCE_UINT32 = 0x7fffffff - } ze_event_pool_flag_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Event pool descriptor - typedef struct _ze_event_pool_desc_t - { - ze_structure_type_t stype; ///< [in] type of this structure - const void * pNext; ///< [in][optional] pointer to extension-specific structure - ze_event_pool_flags_t flags; ///< [in] creation flags. 
- ///< must be 0 (default) or a valid combination of ze_event_pool_flag_t; - ///< default behavior is signals and waits are visible to the entire device - ///< and peer devices. - uint32_t count; ///< [in] number of events within the pool; must be greater than 0 - } ze_event_pool_desc_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Creates a pool of events on the context. - /// - /// @details - /// - The application must only use events within the pool for the - /// device(s), or their sub-devices, which were provided during creation. - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function must be thread-safe. - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hContext` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == desc` - /// + `nullptr == phEventPool` - /// - ZE_RESULT_ERROR_INVALID_ENUMERATION - /// + `0x7 < desc->flags` - /// - ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY - /// - ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY - /// - ZE_RESULT_ERROR_INVALID_SIZE - /// + `0 < desc->count` - /// + `(nullptr == phDevices) && (0 < numDevices)` - ZE_APIEXPORT ze_result_t ZE_APICALL zeEventPoolCreate( - ze_context_handle_t hContext, ///< [in] handle of the context object - const ze_event_pool_desc_t * desc, ///< [in] pointer to event pool descriptor - uint32_t numDevices, ///< [in][optional] number of device handles; must be 0 if `nullptr == - ///< phDevices` - ze_device_handle_t * phDevices, ///< [in][optional][range(0, numDevices)] array of device handles which - ///< have visibility to the event pool. - ///< if nullptr, then event pool is visible to all devices supported by the - ///< driver instance. 
- ze_event_pool_handle_t * phEventPool ///< [out] pointer handle of event pool object created - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Deletes an event pool object. - /// - /// @details - /// - The application must destroy all event handles created from the pool - /// before destroying the pool itself. - /// - The application must ensure the device is not currently referencing - /// the any event within the pool before it is deleted. - /// - The implementation of this function may immediately free all Host and - /// Device allocations associated with this event pool. - /// - The application must **not** call this function from simultaneous - /// threads with the same event pool handle. - /// - The implementation of this function must be thread-safe. - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hEventPool` - /// - ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE - ZE_APIEXPORT ze_result_t ZE_APICALL zeEventPoolDestroy(ze_event_pool_handle_t hEventPool ///< [in][release] handle of event pool object to destroy - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Supported event scope flags - typedef uint32_t ze_event_scope_flags_t; - typedef enum _ze_event_scope_flag_t - { - ZE_EVENT_SCOPE_FLAG_SUBDEVICE = ZE_BIT(0), ///< cache hierarchies are flushed or invalidated sufficient for local - ///< sub-device access - ZE_EVENT_SCOPE_FLAG_DEVICE = ZE_BIT(1), ///< cache hierarchies are flushed or invalidated sufficient for global - ///< device access and peer device access - ZE_EVENT_SCOPE_FLAG_HOST = ZE_BIT(2), ///< cache hierarchies are flushed or invalidated sufficient for device and - ///< host access - ZE_EVENT_SCOPE_FLAG_FORCE_UINT32 = 0x7fffffff - } ze_event_scope_flag_t; - - 
/////////////////////////////////////////////////////////////////////////////// - /// @brief Event descriptor - typedef struct _ze_event_desc_t - { - ze_structure_type_t stype; ///< [in] type of this structure - const void * pNext; ///< [in][optional] pointer to extension-specific structure - uint32_t index; ///< [in] index of the event within the pool; must be less-than the count - ///< specified during pool creation - ze_event_scope_flags_t signal; ///< [in] defines the scope of relevant cache hierarchies to flush on a - ///< signal action before the event is triggered. - ///< must be 0 (default) or a valid combination of ze_event_scope_flag_t; - ///< default behavior is execution synchronization only, no cache - ///< hierarchies are flushed. - ze_event_scope_flags_t wait; ///< [in] defines the scope of relevant cache hierarchies to invalidate on - ///< a wait action after the event is complete. - ///< must be 0 (default) or a valid combination of ze_event_scope_flag_t; - ///< default behavior is execution synchronization only, no cache - ///< hierarchies are invalidated. - } ze_event_desc_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Creates an event from the pool. - /// - /// @details - /// - An event is used to communicate fine-grain host-to-device, - /// device-to-host or device-to-device dependencies have completed. - /// - The application must ensure the location in the pool is not being used - /// by another event. - /// - The application must **not** call this function from simultaneous - /// threads with the same event pool handle. - /// - The implementation of this function should be lock-free. 
- /// - /// @remarks - /// _Analogues_ - /// - **clCreateUserEvent** - /// - vkCreateEvent - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hEventPool` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == desc` - /// + `nullptr == phEvent` - /// - ZE_RESULT_ERROR_INVALID_ENUMERATION - /// + `0x7 < desc->signal` - /// + `0x7 < desc->wait` - /// - ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY - ZE_APIEXPORT ze_result_t ZE_APICALL zeEventCreate(ze_event_pool_handle_t hEventPool, ///< [in] handle of the event pool - const ze_event_desc_t * desc, ///< [in] pointer to event descriptor - ze_event_handle_t * phEvent ///< [out] pointer to handle of event object created - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Deletes an event object. - /// - /// @details - /// - The application must ensure the device is not currently referencing - /// the event before it is deleted. - /// - The implementation of this function may immediately free all Host and - /// Device allocations associated with this event. - /// - The application must **not** call this function from simultaneous - /// threads with the same event handle. - /// - The implementation of this function should be lock-free. 
- /// - /// @remarks - /// _Analogues_ - /// - **clReleaseEvent** - /// - vkDestroyEvent - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hEvent` - /// - ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE - ZE_APIEXPORT ze_result_t ZE_APICALL zeEventDestroy(ze_event_handle_t hEvent ///< [in][release] handle of event object to destroy - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Gets an IPC event pool handle for the specified event handle that can - /// be shared with another process. - /// - /// @details - /// - Event pool must have been created with ZE_EVENT_POOL_FLAG_IPC. - /// - The application may call this function from simultaneous threads. - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hEventPool` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == phIpc` - /// - ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT - ZE_APIEXPORT ze_result_t ZE_APICALL zeEventPoolGetIpcHandle(ze_event_pool_handle_t hEventPool, ///< [in] handle of event pool object - ze_ipc_event_pool_handle_t * phIpc ///< [out] Returned IPC event handle - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Opens an IPC event pool handle to retrieve an event pool handle from - /// another process. - /// - /// @details - /// - Multiple calls to this function with the same IPC handle will return - /// unique event pool handles. - /// - The event handle in this process should not be freed with - /// zeEventPoolDestroy, but rather with zeEventPoolCloseIpcHandle. - /// - The application may call this function from simultaneous threads. 
- /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hContext` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == phEventPool` - ZE_APIEXPORT ze_result_t ZE_APICALL zeEventPoolOpenIpcHandle( - ze_context_handle_t hContext, ///< [in] handle of the context object to associate with the IPC event pool - ///< handle - ze_ipc_event_pool_handle_t hIpc, ///< [in] IPC event pool handle - ze_event_pool_handle_t * phEventPool ///< [out] pointer handle of event pool object created - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Closes an IPC event handle in the current process. - /// - /// @details - /// - Closes an IPC event handle by destroying events that were opened in - /// this process using zeEventPoolOpenIpcHandle. - /// - The application must **not** call this function from simultaneous - /// threads with the same event pool handle. - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hEventPool` - ZE_APIEXPORT ze_result_t ZE_APICALL zeEventPoolCloseIpcHandle(ze_event_pool_handle_t hEventPool ///< [in][release] handle of event pool object - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Appends a signal of the event from the device into a command list. - /// - /// @details - /// - The application must ensure the events are accessible by the device on - /// which the command list was created. - /// - The duration of an event created from an event pool that was created - /// using ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP flag is undefined. - /// However, for consistency and orthogonality the event will report - /// correctly as signaled when used by other event API functionality. 
- /// - The application must ensure the command list and events were created - /// on the same context. - /// - The application must **not** call this function from simultaneous - /// threads with the same command list handle. - /// - The implementation of this function should be lock-free. - /// - /// @remarks - /// _Analogues_ - /// - **clSetUserEventStatus** - /// - vkCmdSetEvent - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hCommandList` - /// + `nullptr == hEvent` - /// - ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT - ZE_APIEXPORT ze_result_t ZE_APICALL zeCommandListAppendSignalEvent(ze_command_list_handle_t hCommandList, ///< [in] handle of the command list - ze_event_handle_t hEvent ///< [in] handle of the event - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Appends wait on event(s) on the device into a command list. - /// - /// @details - /// - The application must ensure the events are accessible by the device on - /// which the command list was created. - /// - The application must ensure the command list and events were created - /// on the same context. - /// - The application must **not** call this function from simultaneous - /// threads with the same command list handle. - /// - The implementation of this function should be lock-free. 
- /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hCommandList` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == phEvents` - /// - ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT - ZE_APIEXPORT ze_result_t ZE_APICALL zeCommandListAppendWaitOnEvents( - ze_command_list_handle_t hCommandList, ///< [in] handle of the command list - uint32_t numEvents, ///< [in] number of events to wait on before continuing - ze_event_handle_t * phEvents ///< [in][range(0, numEvents)] handles of the events to wait on before - ///< continuing - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Signals a event from host. - /// - /// @details - /// - The duration of an event created from an event pool that was created - /// using ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP flag is undefined. - /// However, for consistency and orthogonality the event will report - /// correctly as signaled when used by other event API functionality. - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function should be lock-free. - /// - /// @remarks - /// _Analogues_ - /// - clSetUserEventStatus - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hEvent` - /// - ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT - ZE_APIEXPORT ze_result_t ZE_APICALL zeEventHostSignal(ze_event_handle_t hEvent ///< [in] handle of the event - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief The current host thread waits on an event to be signaled. - /// - /// @details - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function should be lock-free. 
- /// - /// @remarks - /// _Analogues_ - /// - clWaitForEvents - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hEvent` - /// - ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT - /// - ZE_RESULT_NOT_READY - /// + timeout expired - ZE_APIEXPORT ze_result_t ZE_APICALL zeEventHostSynchronize( - ze_event_handle_t hEvent, ///< [in] handle of the event - uint64_t timeout ///< [in] if non-zero, then indicates the maximum time (in nanoseconds) to - ///< yield before returning ZE_RESULT_SUCCESS or ZE_RESULT_NOT_READY; - ///< if zero, then operates exactly like zeEventQueryStatus; - ///< if UINT64_MAX, then function will not return until complete or device - ///< is lost. - ///< Due to external dependencies, timeout may be rounded to the closest - ///< value allowed by the accuracy of those dependencies. - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Queries an event object's status on the host. - /// - /// @details - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function should be lock-free. - /// - /// @remarks - /// _Analogues_ - /// - **clGetEventInfo** - /// - vkGetEventStatus - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hEvent` - /// - ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT - /// - ZE_RESULT_NOT_READY - /// + not signaled - ZE_APIEXPORT ze_result_t ZE_APICALL zeEventQueryStatus(ze_event_handle_t hEvent ///< [in] handle of the event - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Appends a reset of an event back to not signaled state into a command - /// list. 
- /// - /// @details - /// - The application must ensure the events are accessible by the device on - /// which the command list was created. - /// - The application must ensure the command list and events were created - /// on the same context. - /// - The application must **not** call this function from simultaneous - /// threads with the same command list handle. - /// - The implementation of this function should be lock-free. - /// - /// @remarks - /// _Analogues_ - /// - vkResetEvent - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hCommandList` - /// + `nullptr == hEvent` - /// - ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT - ZE_APIEXPORT ze_result_t ZE_APICALL zeCommandListAppendEventReset(ze_command_list_handle_t hCommandList, ///< [in] handle of the command list - ze_event_handle_t hEvent ///< [in] handle of the event - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief The current host thread resets an event back to not signaled state. - /// - /// @details - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function should be lock-free. - /// - /// @remarks - /// _Analogues_ - /// - vkResetEvent - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hEvent` - /// - ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT - ZE_APIEXPORT ze_result_t ZE_APICALL zeEventHostReset(ze_event_handle_t hEvent ///< [in] handle of the event - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Kernel timestamp clock data - /// - /// @details - /// - The timestamp frequency can be queried from - /// ze_device_properties_t.timerResolution. 
- /// - The number of valid bits in the timestamp value can be queried from - /// ze_device_properties_t.kernelTimestampValidBits. - typedef struct _ze_kernel_timestamp_data_t - { - uint64_t kernelStart; ///< [out] device clock at start of kernel execution - uint64_t kernelEnd; ///< [out] device clock at end of kernel execution - } ze_kernel_timestamp_data_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Kernel timestamp result - typedef struct _ze_kernel_timestamp_result_t - { - ze_kernel_timestamp_data_t global; ///< [out] wall-clock data - ze_kernel_timestamp_data_t context; ///< [out] context-active data; only includes clocks while device context - ///< was actively executing. - } ze_kernel_timestamp_result_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Queries an event's timestamp value on the host. - /// - /// @details - /// - The application must ensure the event was created from an event pool - /// that was created using ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP flag. - /// - The destination memory will be unmodified if the event has not been - /// signaled. - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function should be lock-free. - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hEvent` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == dstptr` - /// - ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT - /// - ZE_RESULT_NOT_READY - /// + not signaled - ZE_APIEXPORT ze_result_t ZE_APICALL zeEventQueryKernelTimestamp( - ze_event_handle_t hEvent, ///< [in] handle of the event - ze_kernel_timestamp_result_t * dstptr ///< [in,out] pointer to memory for where timestamp result will be written. 
- ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Appends a query of an events' timestamp value(s) into a command list. - /// - /// @details - /// - The application must ensure the events are accessible by the device on - /// which the command list was created. - /// - The application must ensure the events were created from an event pool - /// that was created using ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP flag. - /// - The application must ensure the memory pointed to by both dstptr and - /// pOffsets is accessible by the device on which the command list was - /// created. - /// - The value(s) written to the destination buffer are undefined if any - /// timestamp event has not been signaled. - /// - If pOffsets is nullptr, then multiple results will be appended - /// sequentially into memory in the same order as phEvents. - /// - The application must ensure the command list and events were created, - /// and the memory was allocated, on the same context. - /// - The application must **not** call this function from simultaneous - /// threads with the same command list handle. - /// - The implementation of this function should be lock-free. 
- /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hCommandList` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == phEvents` - /// + `nullptr == dstptr` - /// - ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT - /// - ZE_RESULT_ERROR_INVALID_SIZE - /// + `(nullptr == phWaitEvents) && (0 < numWaitEvents)` - ZE_APIEXPORT ze_result_t ZE_APICALL zeCommandListAppendQueryKernelTimestamps( - ze_command_list_handle_t hCommandList, ///< [in] handle of the command list - uint32_t numEvents, ///< [in] the number of timestamp events to query - ze_event_handle_t * phEvents, ///< [in][range(0, numEvents)] handles of timestamp events to query - void * dstptr, ///< [in,out] pointer to memory where ze_kernel_timestamp_result_t will - ///< be written; must be size-aligned. - const size_t * pOffsets, ///< [in][optional][range(0, numEvents)] offset, in bytes, to write - ///< results; address must be 4byte-aligned and offsets must be - ///< size-aligned. - ze_event_handle_t hSignalEvent, ///< [in][optional] handle of the event to signal on completion - uint32_t numWaitEvents, ///< [in][optional] number of events to wait on before executing query; - ///< must be 0 if `nullptr == phWaitEvents` - ze_event_handle_t * phWaitEvents ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait - ///< on before executing query - ); - -#if !defined(__GNUC__) - #pragma endregion -#endif -// Intel 'oneAPI' Level-Zero APIs for Fence -#if !defined(__GNUC__) - #pragma region fence -#endif - /////////////////////////////////////////////////////////////////////////////// - /// @brief Supported fence creation flags - typedef uint32_t ze_fence_flags_t; - typedef enum _ze_fence_flag_t - { - ZE_FENCE_FLAG_SIGNALED = ZE_BIT(0), ///< fence is created in the signaled state, otherwise not signaled. 
- ZE_FENCE_FLAG_FORCE_UINT32 = 0x7fffffff - } ze_fence_flag_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Fence descriptor - typedef struct _ze_fence_desc_t - { - ze_structure_type_t stype; ///< [in] type of this structure - const void * pNext; ///< [in][optional] pointer to extension-specific structure - ze_fence_flags_t flags; ///< [in] creation flags. - ///< must be 0 (default) or a valid combination of ze_fence_flag_t. - } ze_fence_desc_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Creates a fence for the command queue. - /// - /// @details - /// - A fence is a heavyweight synchronization primitive used to communicate - /// to the host that command list execution has completed. - /// - The application must only use the fence for the command queue which - /// was provided during creation. - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function must be thread-safe. - /// - /// @remarks - /// _Analogues_ - /// - **vkCreateFence** - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hCommandQueue` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == desc` - /// + `nullptr == phFence` - /// - ZE_RESULT_ERROR_INVALID_ENUMERATION - /// + `0x1 < desc->flags` - /// - ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY - /// - ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY - ZE_APIEXPORT ze_result_t ZE_APICALL zeFenceCreate(ze_command_queue_handle_t hCommandQueue, ///< [in] handle of command queue - const ze_fence_desc_t * desc, ///< [in] pointer to fence descriptor - ze_fence_handle_t * phFence ///< [out] pointer to handle of fence object created - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Deletes a fence object. 
- /// - /// @details - /// - The application must ensure the device is not currently referencing - /// the fence before it is deleted. - /// - The implementation of this function may immediately free all Host and - /// Device allocations associated with this fence. - /// - The application must **not** call this function from simultaneous - /// threads with the same fence handle. - /// - The implementation of this function must be thread-safe. - /// - /// @remarks - /// _Analogues_ - /// - **vkDestroyFence** - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hFence` - /// - ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE - ZE_APIEXPORT ze_result_t ZE_APICALL zeFenceDestroy(ze_fence_handle_t hFence ///< [in][release] handle of fence object to destroy - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief The current host thread waits on a fence to be signaled. - /// - /// @details - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function should be lock-free. 
- /// - /// @remarks - /// _Analogues_ - /// - **vkWaitForFences** - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hFence` - /// - ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT - /// - ZE_RESULT_NOT_READY - /// + timeout expired - ZE_APIEXPORT ze_result_t ZE_APICALL zeFenceHostSynchronize( - ze_fence_handle_t hFence, ///< [in] handle of the fence - uint64_t timeout ///< [in] if non-zero, then indicates the maximum time (in nanoseconds) to - ///< yield before returning ZE_RESULT_SUCCESS or ZE_RESULT_NOT_READY; - ///< if zero, then operates exactly like zeFenceQueryStatus; - ///< if UINT64_MAX, then function will not return until complete or device - ///< is lost. - ///< Due to external dependencies, timeout may be rounded to the closest - ///< value allowed by the accuracy of those dependencies. - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Queries a fence object's status. - /// - /// @details - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function should be lock-free. - /// - /// @remarks - /// _Analogues_ - /// - **vkGetFenceStatus** - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hFence` - /// - ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT - /// - ZE_RESULT_NOT_READY - /// + not signaled - ZE_APIEXPORT ze_result_t ZE_APICALL zeFenceQueryStatus(ze_fence_handle_t hFence ///< [in] handle of the fence - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Reset a fence back to the not signaled state. - /// - /// @details - /// - The application may call this function from simultaneous threads. 
- /// - The implementation of this function should be lock-free. - /// - /// @remarks - /// _Analogues_ - /// - **vkResetFences** - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hFence` - ZE_APIEXPORT ze_result_t ZE_APICALL zeFenceReset(ze_fence_handle_t hFence ///< [in] handle of the fence - ); - -#if !defined(__GNUC__) - #pragma endregion -#endif -// Intel 'oneAPI' Level-Zero APIs for Images -#if !defined(__GNUC__) - #pragma region image -#endif - /////////////////////////////////////////////////////////////////////////////// - /// @brief Supported image creation flags - typedef uint32_t ze_image_flags_t; - typedef enum _ze_image_flag_t - { - ZE_IMAGE_FLAG_KERNEL_WRITE = ZE_BIT(0), ///< kernels will write contents - ZE_IMAGE_FLAG_BIAS_UNCACHED = ZE_BIT(1), ///< device should not cache contents - ZE_IMAGE_FLAG_FORCE_UINT32 = 0x7fffffff - } ze_image_flag_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Supported image types - typedef enum _ze_image_type_t - { - ZE_IMAGE_TYPE_1D = 0, ///< 1D - ZE_IMAGE_TYPE_1DARRAY = 1, ///< 1D array - ZE_IMAGE_TYPE_2D = 2, ///< 2D - ZE_IMAGE_TYPE_2DARRAY = 3, ///< 2D array - ZE_IMAGE_TYPE_3D = 4, ///< 3D - ZE_IMAGE_TYPE_BUFFER = 5, ///< Buffer - ZE_IMAGE_TYPE_FORCE_UINT32 = 0x7fffffff - } ze_image_type_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Supported image format layouts - typedef enum _ze_image_format_layout_t - { - ZE_IMAGE_FORMAT_LAYOUT_8 = 0, ///< 8-bit single component layout - ZE_IMAGE_FORMAT_LAYOUT_16 = 1, ///< 16-bit single component layout - ZE_IMAGE_FORMAT_LAYOUT_32 = 2, ///< 32-bit single component layout - ZE_IMAGE_FORMAT_LAYOUT_8_8 = 3, ///< 2-component 8-bit layout - ZE_IMAGE_FORMAT_LAYOUT_8_8_8_8 = 4, ///< 4-component 8-bit layout - ZE_IMAGE_FORMAT_LAYOUT_16_16 = 5, ///< 
2-component 16-bit layout - ZE_IMAGE_FORMAT_LAYOUT_16_16_16_16 = 6, ///< 4-component 16-bit layout - ZE_IMAGE_FORMAT_LAYOUT_32_32 = 7, ///< 2-component 32-bit layout - ZE_IMAGE_FORMAT_LAYOUT_32_32_32_32 = 8, ///< 4-component 32-bit layout - ZE_IMAGE_FORMAT_LAYOUT_10_10_10_2 = 9, ///< 4-component 10_10_10_2 layout - ZE_IMAGE_FORMAT_LAYOUT_11_11_10 = 10, ///< 3-component 11_11_10 layout - ZE_IMAGE_FORMAT_LAYOUT_5_6_5 = 11, ///< 3-component 5_6_5 layout - ZE_IMAGE_FORMAT_LAYOUT_5_5_5_1 = 12, ///< 4-component 5_5_5_1 layout - ZE_IMAGE_FORMAT_LAYOUT_4_4_4_4 = 13, ///< 4-component 4_4_4_4 layout - ZE_IMAGE_FORMAT_LAYOUT_Y8 = 14, ///< Media Format: Y8. Format type and swizzle is ignored for this. - ZE_IMAGE_FORMAT_LAYOUT_NV12 = 15, ///< Media Format: NV12. Format type and swizzle is ignored for this. - ZE_IMAGE_FORMAT_LAYOUT_YUYV = 16, ///< Media Format: YUYV. Format type and swizzle is ignored for this. - ZE_IMAGE_FORMAT_LAYOUT_VYUY = 17, ///< Media Format: VYUY. Format type and swizzle is ignored for this. - ZE_IMAGE_FORMAT_LAYOUT_YVYU = 18, ///< Media Format: YVYU. Format type and swizzle is ignored for this. - ZE_IMAGE_FORMAT_LAYOUT_UYVY = 19, ///< Media Format: UYVY. Format type and swizzle is ignored for this. - ZE_IMAGE_FORMAT_LAYOUT_AYUV = 20, ///< Media Format: AYUV. Format type and swizzle is ignored for this. - ZE_IMAGE_FORMAT_LAYOUT_P010 = 21, ///< Media Format: P010. Format type and swizzle is ignored for this. - ZE_IMAGE_FORMAT_LAYOUT_Y410 = 22, ///< Media Format: Y410. Format type and swizzle is ignored for this. - ZE_IMAGE_FORMAT_LAYOUT_P012 = 23, ///< Media Format: P012. Format type and swizzle is ignored for this. - ZE_IMAGE_FORMAT_LAYOUT_Y16 = 24, ///< Media Format: Y16. Format type and swizzle is ignored for this. - ZE_IMAGE_FORMAT_LAYOUT_P016 = 25, ///< Media Format: P016. Format type and swizzle is ignored for this. - ZE_IMAGE_FORMAT_LAYOUT_Y216 = 26, ///< Media Format: Y216. Format type and swizzle is ignored for this. 
- ZE_IMAGE_FORMAT_LAYOUT_P216 = 27, ///< Media Format: P216. Format type and swizzle is ignored for this. - ZE_IMAGE_FORMAT_LAYOUT_FORCE_UINT32 = 0x7fffffff - } ze_image_format_layout_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Supported image format types - typedef enum _ze_image_format_type_t - { - ZE_IMAGE_FORMAT_TYPE_UINT = 0, ///< Unsigned integer - ZE_IMAGE_FORMAT_TYPE_SINT = 1, ///< Signed integer - ZE_IMAGE_FORMAT_TYPE_UNORM = 2, ///< Unsigned normalized integer - ZE_IMAGE_FORMAT_TYPE_SNORM = 3, ///< Signed normalized integer - ZE_IMAGE_FORMAT_TYPE_FLOAT = 4, ///< Float - ZE_IMAGE_FORMAT_TYPE_FORCE_UINT32 = 0x7fffffff - } ze_image_format_type_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Supported image format component swizzle into channel - typedef enum _ze_image_format_swizzle_t - { - ZE_IMAGE_FORMAT_SWIZZLE_R = 0, ///< Red component - ZE_IMAGE_FORMAT_SWIZZLE_G = 1, ///< Green component - ZE_IMAGE_FORMAT_SWIZZLE_B = 2, ///< Blue component - ZE_IMAGE_FORMAT_SWIZZLE_A = 3, ///< Alpha component - ZE_IMAGE_FORMAT_SWIZZLE_0 = 4, ///< Zero - ZE_IMAGE_FORMAT_SWIZZLE_1 = 5, ///< One - ZE_IMAGE_FORMAT_SWIZZLE_X = 6, ///< Don't care - ZE_IMAGE_FORMAT_SWIZZLE_FORCE_UINT32 = 0x7fffffff - } ze_image_format_swizzle_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Image format - typedef struct _ze_image_format_t - { - ze_image_format_layout_t layout; ///< [in] image format component layout - ze_image_format_type_t type; ///< [in] image format type. Media formats can't be used for - ///< ZE_IMAGE_TYPE_BUFFER. 
- ze_image_format_swizzle_t x; ///< [in] image component swizzle into channel x - ze_image_format_swizzle_t y; ///< [in] image component swizzle into channel y - ze_image_format_swizzle_t z; ///< [in] image component swizzle into channel z - ze_image_format_swizzle_t w; ///< [in] image component swizzle into channel w - } ze_image_format_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Image descriptor - typedef struct _ze_image_desc_t - { - ze_structure_type_t stype; ///< [in] type of this structure - const void * pNext; ///< [in][optional] pointer to extension-specific structure - ze_image_flags_t flags; ///< [in] creation flags. - ///< must be 0 (default) or a valid combination of ze_image_flag_t; - ///< default is read-only, cached access. - ze_image_type_t type; ///< [in] image type - ze_image_format_t format; ///< [in] image format - uint64_t width; ///< [in] width dimension. - ///< ZE_IMAGE_TYPE_BUFFER: size in bytes; see - ///< ze_device_image_properties_t.maxImageBufferSize for limits. - ///< ZE_IMAGE_TYPE_1D, ZE_IMAGE_TYPE_1DARRAY: width in pixels; see - ///< ze_device_image_properties_t.maxImageDims1D for limits. - ///< ZE_IMAGE_TYPE_2D, ZE_IMAGE_TYPE_2DARRAY: width in pixels; see - ///< ze_device_image_properties_t.maxImageDims2D for limits. - ///< ZE_IMAGE_TYPE_3D: width in pixels; see - ///< ze_device_image_properties_t.maxImageDims3D for limits. - uint32_t height; ///< [in] height dimension. - ///< ZE_IMAGE_TYPE_2D, ZE_IMAGE_TYPE_2DARRAY: height in pixels; see - ///< ze_device_image_properties_t.maxImageDims2D for limits. - ///< ZE_IMAGE_TYPE_3D: height in pixels; see - ///< ze_device_image_properties_t.maxImageDims3D for limits. - ///< other: ignored. - uint32_t depth; ///< [in] depth dimension. - ///< ZE_IMAGE_TYPE_3D: depth in pixels; see - ///< ze_device_image_properties_t.maxImageDims3D for limits. - ///< other: ignored. - uint32_t arraylevels; ///< [in] array levels. 
- ///< ZE_IMAGE_TYPE_1DARRAY, ZE_IMAGE_TYPE_2DARRAY: see - ///< ze_device_image_properties_t.maxImageArraySlices for limits. - ///< other: ignored. - uint32_t miplevels; ///< [in] mipmap levels (must be 0) - } ze_image_desc_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Supported sampler filtering flags - typedef uint32_t ze_image_sampler_filter_flags_t; - typedef enum _ze_image_sampler_filter_flag_t - { - ZE_IMAGE_SAMPLER_FILTER_FLAG_POINT = ZE_BIT(0), ///< device supports point filtering - ZE_IMAGE_SAMPLER_FILTER_FLAG_LINEAR = ZE_BIT(1), ///< device supports linear filtering - ZE_IMAGE_SAMPLER_FILTER_FLAG_FORCE_UINT32 = 0x7fffffff - } ze_image_sampler_filter_flag_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Image properties - typedef struct _ze_image_properties_t - { - ze_structure_type_t stype; ///< [in] type of this structure - void * pNext; ///< [in,out][optional] pointer to extension-specific structure - ze_image_sampler_filter_flags_t samplerFilterFlags; ///< [out] supported sampler filtering. - ///< returns 0 (unsupported) or a combination of ze_image_sampler_filter_flag_t. - } ze_image_properties_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Retrieves supported properties of an image. - /// - /// @details - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function should be lock-free. 
- /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hDevice` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == desc` - /// + `nullptr == pImageProperties` - /// - ZE_RESULT_ERROR_INVALID_ENUMERATION - /// + `0x3 < desc->flags` - /// + `ZE_IMAGE_TYPE_BUFFER < desc->type` - ZE_APIEXPORT ze_result_t ZE_APICALL zeImageGetProperties(ze_device_handle_t hDevice, ///< [in] handle of the device - const ze_image_desc_t * desc, ///< [in] pointer to image descriptor - ze_image_properties_t * pImageProperties ///< [out] pointer to image properties - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Creates an image on the context. - /// - /// @details - /// - The application must only use the image for the device, or its - /// sub-devices, which was provided during creation. - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function must be thread-safe. 
- /// - /// @remarks - /// _Analogues_ - /// - clCreateImage - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hContext` - /// + `nullptr == hDevice` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == desc` - /// + `nullptr == phImage` - /// - ZE_RESULT_ERROR_INVALID_ENUMERATION - /// + `0x3 < desc->flags` - /// + `ZE_IMAGE_TYPE_BUFFER < desc->type` - /// - ZE_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT - /// - ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY - /// - ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY - ZE_APIEXPORT ze_result_t ZE_APICALL zeImageCreate(ze_context_handle_t hContext, ///< [in] handle of the context object - ze_device_handle_t hDevice, ///< [in] handle of the device - const ze_image_desc_t * desc, ///< [in] pointer to image descriptor - ze_image_handle_t * phImage ///< [out] pointer to handle of image object created - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Deletes an image object. - /// - /// @details - /// - The application must ensure the device is not currently referencing - /// the image before it is deleted. - /// - The implementation of this function may immediately free all Host and - /// Device allocations associated with this image. - /// - The application must **not** call this function from simultaneous - /// threads with the same image handle. - /// - The implementation of this function must be thread-safe. 
- /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hImage` - /// - ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE - ZE_APIEXPORT ze_result_t ZE_APICALL zeImageDestroy(ze_image_handle_t hImage ///< [in][release] handle of image object to destroy - ); - -#if !defined(__GNUC__) - #pragma endregion -#endif -// Intel 'oneAPI' Level-Zero APIs for Memory -#if !defined(__GNUC__) - #pragma region memory -#endif - /////////////////////////////////////////////////////////////////////////////// - /// @brief Supported memory allocation flags - typedef uint32_t ze_device_mem_alloc_flags_t; - typedef enum _ze_device_mem_alloc_flag_t - { - ZE_DEVICE_MEM_ALLOC_FLAG_BIAS_CACHED = ZE_BIT(0), ///< device should cache allocation - ZE_DEVICE_MEM_ALLOC_FLAG_BIAS_UNCACHED = ZE_BIT(1), ///< device should not cache allocation (UC) - ZE_DEVICE_MEM_ALLOC_FLAG_FORCE_UINT32 = 0x7fffffff - } ze_device_mem_alloc_flag_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Device memory allocation descriptor - typedef struct _ze_device_mem_alloc_desc_t - { - ze_structure_type_t stype; ///< [in] type of this structure - const void * pNext; ///< [in][optional] pointer to extension-specific structure - ze_device_mem_alloc_flags_t flags; ///< [in] flags specifying additional allocation controls. - ///< must be 0 (default) or a valid combination of ze_device_mem_alloc_flag_t; - ///< default behavior may use implicit driver-based heuristics. - uint32_t ordinal; ///< [in] ordinal of the device's local memory to allocate from. - ///< must be less than the count returned from zeDeviceGetMemoryProperties. 
- } ze_device_mem_alloc_desc_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Supported host memory allocation flags - typedef uint32_t ze_host_mem_alloc_flags_t; - typedef enum _ze_host_mem_alloc_flag_t - { - ZE_HOST_MEM_ALLOC_FLAG_BIAS_CACHED = ZE_BIT(0), ///< host should cache allocation - ZE_HOST_MEM_ALLOC_FLAG_BIAS_UNCACHED = ZE_BIT(1), ///< host should not cache allocation (UC) - ZE_HOST_MEM_ALLOC_FLAG_BIAS_WRITE_COMBINED = ZE_BIT(2), ///< host memory should be allocated write-combined (WC) - ZE_HOST_MEM_ALLOC_FLAG_FORCE_UINT32 = 0x7fffffff - } ze_host_mem_alloc_flag_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Host memory allocation descriptor - typedef struct _ze_host_mem_alloc_desc_t - { - ze_structure_type_t stype; ///< [in] type of this structure - const void * pNext; ///< [in][optional] pointer to extension-specific structure - ze_host_mem_alloc_flags_t flags; ///< [in] flags specifying additional allocation controls. - ///< must be 0 (default) or a valid combination of ze_host_mem_alloc_flag_t; - ///< default behavior may use implicit driver-based heuristics. - } ze_host_mem_alloc_desc_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Allocates shared memory on the context. - /// - /// @details - /// - Shared allocations share ownership between the host and one or more - /// devices. - /// - Shared allocations may optionally be associated with a device by - /// passing a handle to the device. - /// - Devices supporting only single-device shared access capabilities may - /// access shared memory associated with the device. - /// For these devices, ownership of the allocation is shared between the - /// host and the associated device only. - /// - Passing nullptr as the device handle does not associate the shared - /// allocation with any device. 
- /// For allocations with no associated device, ownership of the allocation - /// is shared between the host and all devices supporting cross-device - /// shared access capabilities. - /// - The application must only use the memory allocation for the context - /// and device, or its sub-devices, which was provided during allocation. - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function must be thread-safe. - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hContext` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == device_desc` - /// + `nullptr == host_desc` - /// + `nullptr == pptr` - /// - ZE_RESULT_ERROR_INVALID_ENUMERATION - /// + `0x3 < device_desc->flags` - /// + `0x7 < host_desc->flags` - /// - ZE_RESULT_ERROR_UNSUPPORTED_SIZE - /// + `0 == size` - /// - ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT - /// + Must be zero or a power-of-two - /// + `0 != (alignment & (alignment - 1))` - /// - ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY - /// - ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY - ZE_APIEXPORT ze_result_t ZE_APICALL zeMemAllocShared( - ze_context_handle_t hContext, ///< [in] handle of the context object - const ze_device_mem_alloc_desc_t * device_desc, ///< [in] pointer to device memory allocation descriptor - const ze_host_mem_alloc_desc_t * host_desc, ///< [in] pointer to host memory allocation descriptor - size_t size, ///< [in] size in bytes to allocate; must be less-than - ///< ze_device_properties_t.maxMemAllocSize. - size_t alignment, ///< [in] minimum alignment in bytes for the allocation; must be a power of - ///< two. 
- ze_device_handle_t hDevice, ///< [in][optional] device handle to associate with - void ** pptr ///< [out] pointer to shared allocation - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Allocates device memory on the context. - /// - /// @details - /// - Device allocations are owned by a specific device. - /// - In general, a device allocation may only be accessed by the device - /// that owns it. - /// - The application must only use the memory allocation for the context - /// and device, or its sub-devices, which was provided during allocation. - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function must be thread-safe. - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hContext` - /// + `nullptr == hDevice` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == device_desc` - /// + `nullptr == pptr` - /// - ZE_RESULT_ERROR_INVALID_ENUMERATION - /// + `0x3 < device_desc->flags` - /// - ZE_RESULT_ERROR_UNSUPPORTED_SIZE - /// + `0 == size` - /// - ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT - /// + Must be zero or a power-of-two - /// + `0 != (alignment & (alignment - 1))` - /// - ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY - /// - ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY - ZE_APIEXPORT ze_result_t ZE_APICALL zeMemAllocDevice( - ze_context_handle_t hContext, ///< [in] handle of the context object - const ze_device_mem_alloc_desc_t * device_desc, ///< [in] pointer to device memory allocation descriptor - size_t size, ///< [in] size in bytes to allocate; must be less-than - ///< ze_device_properties_t.maxMemAllocSize. - size_t alignment, ///< [in] minimum alignment in bytes for the allocation; must be a power of - ///< two. 
- ze_device_handle_t hDevice, ///< [in] handle of the device - void ** pptr ///< [out] pointer to device allocation - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Allocates host memory on the context. - /// - /// @details - /// - Host allocations are owned by the host process. - /// - Host allocations are accessible by the host and all devices within the - /// driver's context. - /// - Host allocations are frequently used as staging areas to transfer data - /// to or from devices. - /// - The application must only use the memory allocation for the context - /// which was provided during allocation. - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function must be thread-safe. - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hContext` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == host_desc` - /// + `nullptr == pptr` - /// - ZE_RESULT_ERROR_INVALID_ENUMERATION - /// + `0x7 < host_desc->flags` - /// - ZE_RESULT_ERROR_UNSUPPORTED_SIZE - /// + `0 == size` - /// - ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT - /// + Must be zero or a power-of-two - /// + `0 != (alignment & (alignment - 1))` - /// - ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY - /// - ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY - ZE_APIEXPORT ze_result_t ZE_APICALL zeMemAllocHost( - ze_context_handle_t hContext, ///< [in] handle of the context object - const ze_host_mem_alloc_desc_t * host_desc, ///< [in] pointer to host memory allocation descriptor - size_t size, ///< [in] size in bytes to allocate; must be less-than - ///< ze_device_properties_t.maxMemAllocSize. - size_t alignment, ///< [in] minimum alignment in bytes for the allocation; must be a power of - ///< two. 
- void ** pptr ///< [out] pointer to host allocation - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Frees allocated host memory, device memory, or shared memory on the - /// context. - /// - /// @details - /// - The application must ensure the device is not currently referencing - /// the memory before it is freed - /// - The implementation of this function may immediately free all Host and - /// Device allocations associated with this memory - /// - The application must **not** call this function from simultaneous - /// threads with the same pointer. - /// - The implementation of this function must be thread-safe. - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hContext` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == ptr` - ZE_APIEXPORT ze_result_t ZE_APICALL zeMemFree(ze_context_handle_t hContext, ///< [in] handle of the context object - void * ptr ///< [in][release] pointer to memory to free - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Memory allocation type - typedef enum _ze_memory_type_t - { - ZE_MEMORY_TYPE_UNKNOWN = 0, ///< the memory pointed to is of unknown type - ZE_MEMORY_TYPE_HOST = 1, ///< the memory pointed to is a host allocation - ZE_MEMORY_TYPE_DEVICE = 2, ///< the memory pointed to is a device allocation - ZE_MEMORY_TYPE_SHARED = 3, ///< the memory pointed to is a shared ownership allocation - ZE_MEMORY_TYPE_FORCE_UINT32 = 0x7fffffff - } ze_memory_type_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Memory allocation properties queried using zeMemGetAllocProperties - typedef struct _ze_memory_allocation_properties_t - { - ze_structure_type_t stype; ///< [in] type of this structure - void * pNext; ///< [in,out][optional] pointer to 
extension-specific structure - ze_memory_type_t type; ///< [out] type of allocated memory - uint64_t id; ///< [out] identifier for this allocation - uint64_t pageSize; ///< [out] page size used for allocation - } ze_memory_allocation_properties_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Retrieves attributes of a memory allocation - /// - /// @details - /// - The application may call this function from simultaneous threads. - /// - The application may query attributes of a memory allocation unrelated - /// to the context. - /// When this occurs, the returned allocation type will be - /// ZE_MEMORY_TYPE_UNKNOWN, and the returned identifier and associated - /// device is unspecified. - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hContext` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == ptr` - /// + `nullptr == pMemAllocProperties` - ZE_APIEXPORT ze_result_t ZE_APICALL zeMemGetAllocProperties( - ze_context_handle_t hContext, ///< [in] handle of the context object - const void * ptr, ///< [in] memory pointer to query - ze_memory_allocation_properties_t * pMemAllocProperties, ///< [in,out] query result for memory allocation properties - ze_device_handle_t * phDevice ///< [out][optional] device associated with this allocation - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Retrieves the base address and/or size of an allocation - /// - /// @details - /// - The application may call this function from simultaneous threads. 
- /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hContext` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == ptr` - ZE_APIEXPORT ze_result_t ZE_APICALL zeMemGetAddressRange(ze_context_handle_t hContext, ///< [in] handle of the context object - const void * ptr, ///< [in] memory pointer to query - void ** pBase, ///< [in,out][optional] base address of the allocation - size_t * pSize ///< [in,out][optional] size of the allocation - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Creates an IPC memory handle for the specified allocation - /// - /// @details - /// - Takes a pointer to a device memory allocation and creates an IPC - /// memory handle for exporting it for use in another process. - /// - The pointer must be base pointer of the device memory allocation; i.e. - /// the value returned from zeMemAllocDevice. - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function must be thread-safe. 
- /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hContext` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == ptr` - /// + `nullptr == pIpcHandle` - ZE_APIEXPORT ze_result_t ZE_APICALL zeMemGetIpcHandle(ze_context_handle_t hContext, ///< [in] handle of the context object - const void * ptr, ///< [in] pointer to the device memory allocation - ze_ipc_mem_handle_t * pIpcHandle ///< [out] Returned IPC memory handle - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Supported IPC memory flags - typedef uint32_t ze_ipc_memory_flags_t; - typedef enum _ze_ipc_memory_flag_t - { - ZE_IPC_MEMORY_FLAG_TBD = ZE_BIT(0), ///< reserved for future use - ZE_IPC_MEMORY_FLAG_FORCE_UINT32 = 0x7fffffff - } ze_ipc_memory_flag_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Opens an IPC memory handle to retrieve a device pointer on the - /// context. - /// - /// @details - /// - Takes an IPC memory handle from a remote process and associates it - /// with a device pointer usable in this process. - /// - The device pointer in this process should not be freed with - /// zeMemFree, but rather with zeMemCloseIpcHandle. - /// - Multiple calls to this function with the same IPC handle will return - /// unique pointers. - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function must be thread-safe. 
- /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hContext` - /// + `nullptr == hDevice` - /// - ZE_RESULT_ERROR_INVALID_ENUMERATION - /// + `0x1 < flags` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == pptr` - ZE_APIEXPORT ze_result_t ZE_APICALL zeMemOpenIpcHandle( - ze_context_handle_t hContext, ///< [in] handle of the context object - ze_device_handle_t hDevice, ///< [in] handle of the device to associate with the IPC memory handle - ze_ipc_mem_handle_t handle, ///< [in] IPC memory handle - ze_ipc_memory_flags_t flags, ///< [in] flags controlling the operation. - ///< must be 0 (default) or a valid combination of ze_ipc_memory_flag_t. - void ** pptr ///< [out] pointer to device allocation in this process - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Closes an IPC memory handle - /// - /// @details - /// - Closes an IPC memory handle by unmapping memory that was opened in - /// this process using zeMemOpenIpcHandle. - /// - The application must **not** call this function from simultaneous - /// threads with the same pointer. - /// - The implementation of this function must be thread-safe. 
- /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hContext` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == ptr` - ZE_APIEXPORT ze_result_t ZE_APICALL zeMemCloseIpcHandle(ze_context_handle_t hContext, ///< [in] handle of the context object - const void * ptr ///< [in][release] pointer to device allocation in this process - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Additional allocation descriptor for exporting external memory - /// - /// @details - /// - This structure may be passed to zeMemAllocDevice, via the `pNext` - /// member of ze_device_mem_alloc_desc_t, to indicate an exportable - /// memory allocation. - /// - This structure may be passed to zeImageCreate, via the `pNext` - /// member of ze_image_desc_t, to indicate an exportable image. - typedef struct _ze_external_memory_export_desc_t - { - ze_structure_type_t stype; ///< [in] type of this structure - const void * pNext; ///< [in][optional] pointer to extension-specific structure - ze_external_memory_type_flags_t flags; ///< [in] flags specifying memory export types for this allocation. - ///< must be 0 (default) or a valid combination of ze_external_memory_type_flags_t - } ze_external_memory_export_desc_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Additional allocation descriptor for importing external memory as a - /// file descriptor - /// - /// @details - /// - This structure may be passed to zeMemAllocDevice, via the `pNext` - /// member of ze_device_mem_alloc_desc_t, to import memory from a file - /// descriptor. - /// - This structure may be passed to zeImageCreate, via the `pNext` - /// member of ze_image_desc_t, to import memory from a file descriptor. 
- typedef struct _ze_external_memory_import_fd_t - { - ze_structure_type_t stype; ///< [in] type of this structure - const void * pNext; ///< [in][optional] pointer to extension-specific structure - ze_external_memory_type_flags_t flags; ///< [in] flags specifying the memory import type for the file descriptor. - ///< must be 0 (default) or a valid combination of ze_external_memory_type_flags_t - int fd; ///< [in] the file descriptor handle to import - } ze_external_memory_import_fd_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Exports an allocation as a file descriptor - /// - /// @details - /// - This structure may be passed to zeMemGetAllocProperties, via the - /// `pNext` member of ze_memory_allocation_properties_t, to export a - /// memory allocation as a file descriptor. - /// - This structure may be passed to zeImageGetProperties, via the - /// `pNext` member of ze_image_properties_t, to export an image as a - /// file descriptor. - /// - The requested memory export type must have been specified when the - /// allocation was made. - typedef struct _ze_external_memory_export_fd_t - { - ze_structure_type_t stype; ///< [in] type of this structure - const void * pNext; ///< [in][optional] pointer to extension-specific structure - ze_external_memory_type_flags_t flags; ///< [in] flags specifying the memory export type for the file descriptor. - ///< must be 0 (default) or a valid combination of ze_external_memory_type_flags_t - int fd; ///< [out] the exported file descriptor handle representing the allocation. 
- } ze_external_memory_export_fd_t; - -#if !defined(__GNUC__) - #pragma endregion -#endif -// Intel 'oneAPI' Level-Zero APIs for Module -#if !defined(__GNUC__) - #pragma region module -#endif - /////////////////////////////////////////////////////////////////////////////// - /// @brief Supported module creation input formats - typedef enum _ze_module_format_t - { - ZE_MODULE_FORMAT_IL_SPIRV = 0, ///< Format is SPIRV IL format - ZE_MODULE_FORMAT_NATIVE = 1, ///< Format is device native format - ZE_MODULE_FORMAT_FORCE_UINT32 = 0x7fffffff - } ze_module_format_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Specialization constants - User defined constants - typedef struct _ze_module_constants_t - { - uint32_t numConstants; ///< [in] Number of specialization constants. - const uint32_t * pConstantIds; ///< [in][range(0, numConstants)] Array of IDs that is sized to - ///< numConstants. - const void ** pConstantValues; ///< [in][range(0, numConstants)] Array of pointers to values that is sized - ///< to numConstants. - } ze_module_constants_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Module descriptor - typedef struct _ze_module_desc_t - { - ze_structure_type_t stype; ///< [in] type of this structure - const void * pNext; ///< [in][optional] pointer to extension-specific structure - ze_module_format_t format; ///< [in] Module format passed in with pInputModule - size_t inputSize; ///< [in] size of input IL or ISA from pInputModule. - const uint8_t * pInputModule; ///< [in] pointer to IL or ISA - const char * pBuildFlags; ///< [in][optional] string containing compiler flags. Following options are supported. - ///< - "-ze-opt-disable" - ///< - Disable optimizations - ///< - "-ze-opt-greater-than-4GB-buffer-required" - ///< - Use 64-bit offset calculations for buffers. - ///< - "-ze-opt-large-register-file" - ///< - Increase number of registers available to threads. 
- ///< - "-ze-opt-has-buffer-offset-arg" - ///< - Extend stateless to stateful optimization to more - ///< cases with the use of additional offset (e.g. 64-bit - ///< pointer to binding table with 32-bit offset). - ///< - "-g" - ///< - Include debugging information. - const ze_module_constants_t * pConstants; ///< [in][optional] pointer to specialization constants. Valid only for - ///< SPIR-V input. This must be set to nullptr if no specialization - ///< constants are provided. - } ze_module_desc_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Creates a module on the context. - /// - /// @details - /// - Compiles the module for execution on the device. - /// - The application must only use the module for the device, or its - /// sub-devices, which was provided during creation. - /// - The module can be copied to other devices and contexts within the same - /// driver instance by using zeModuleGetNativeBinary. - /// - A build log can optionally be returned to the caller. The caller is - /// responsible for destroying build log using zeModuleBuildLogDestroy. - /// - The module descriptor constants are only supported for SPIR-V - /// specialization constants. - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function must be thread-safe. 
- /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hContext` - /// + `nullptr == hDevice` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == desc` - /// + `nullptr == desc->pInputModule` - /// + `nullptr == phModule` - /// - ZE_RESULT_ERROR_INVALID_ENUMERATION - /// + `ZE_MODULE_FORMAT_NATIVE < desc->format` - /// - ZE_RESULT_ERROR_INVALID_NATIVE_BINARY - /// - ZE_RESULT_ERROR_INVALID_SIZE - /// + `0 == desc->inputSize` - /// - ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY - /// - ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY - /// - ZE_RESULT_ERROR_MODULE_BUILD_FAILURE - ZE_APIEXPORT ze_result_t ZE_APICALL zeModuleCreate( - ze_context_handle_t hContext, ///< [in] handle of the context object - ze_device_handle_t hDevice, ///< [in] handle of the device - const ze_module_desc_t * desc, ///< [in] pointer to module descriptor - ze_module_handle_t * phModule, ///< [out] pointer to handle of module object created - ze_module_build_log_handle_t * phBuildLog ///< [out][optional] pointer to handle of module's build log. - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Destroys module - /// - /// @details - /// - The application must destroy all kernel and build log handles created - /// from the module before destroying the module itself. - /// - The application must ensure the device is not currently referencing - /// the module before it is deleted. - /// - The implementation of this function may immediately free all Host and - /// Device allocations associated with this module. - /// - The application must **not** call this function from simultaneous - /// threads with the same module handle. - /// - The implementation of this function must be thread-safe. 
- /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hModule` - /// - ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE - ZE_APIEXPORT ze_result_t ZE_APICALL zeModuleDestroy(ze_module_handle_t hModule ///< [in][release] handle of the module - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Dynamically link modules together that share import/export linkage - /// dependencies. - /// - /// @details - /// - Modules support import and export linkage for functions and global - /// variables. - /// - Modules that have imports can be dynamically linked to export modules - /// that satisfy those import requirements. - /// - Modules can have both import and export linkages. - /// - Modules that do not have any imports or exports do not need to be - /// linked. - /// - Modules cannot be partially linked. All modules needed to satisfy all - /// import dependencies for a module must be passed in or - /// ZE_RESULT_ERROR_MODULE_LINK_FAILURE will returned. - /// - Modules with imports need to be linked before kernel objects can be - /// created from them. - /// - Modules will only be linked once. A module can be used in multiple - /// link calls if it has exports but it's imports will not be re-linked. - /// - Ambiguous dependencies, where multiple modules satisfy the import - /// dependencies for another module, is not allowed. - /// - ModuleGetNativeBinary can be called on any module regardless of - /// whether it is linked or not. - /// - A link log can optionally be returned to the caller. The caller is - /// responsible for destroying build log using zeModuleBuildLogDestroy. - /// - See SPIR-V specification for linkage details. - /// - The application may call this function from simultaneous threads as - /// long as the import modules being linked are not the same. 
- /// - The implementation of this function should be lock-free. - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == phModules` - /// - ZE_RESULT_ERROR_MODULE_LINK_FAILURE - ZE_APIEXPORT ze_result_t ZE_APICALL zeModuleDynamicLink( - uint32_t numModules, ///< [in] number of modules to be linked pointed to by phModules. - ze_module_handle_t * phModules, ///< [in][range(0, numModules)] pointer to an array of modules to - ///< dynamically link together. - ze_module_build_log_handle_t * phLinkLog ///< [out][optional] pointer to handle of dynamic link log. - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Destroys module build log object - /// - /// @details - /// - The implementation of this function may immediately free all Host - /// allocations associated with this object. - /// - The application must **not** call this function from simultaneous - /// threads with the same build log handle. - /// - The implementation of this function should be lock-free. - /// - This function can be called before or after zeModuleDestroy for the - /// associated module. - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hModuleBuildLog` - /// - ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE - ZE_APIEXPORT ze_result_t ZE_APICALL zeModuleBuildLogDestroy( - ze_module_build_log_handle_t hModuleBuildLog ///< [in][release] handle of the module build log object. - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Retrieves text string for build log. - /// - /// @details - /// - The caller can pass nullptr for pBuildLog when querying only for size. - /// - The caller must provide memory for build log. 
- /// - The application may call this function from simultaneous threads. - /// - The implementation of this function should be lock-free. - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hModuleBuildLog` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == pSize` - ZE_APIEXPORT ze_result_t ZE_APICALL zeModuleBuildLogGetString( - ze_module_build_log_handle_t hModuleBuildLog, ///< [in] handle of the module build log object. - size_t * pSize, ///< [in,out] size of build log string. - char * pBuildLog ///< [in,out][optional] pointer to null-terminated string of the log. - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Retrieve native binary from Module. - /// - /// @details - /// - The native binary output can be cached to disk and new modules can be - /// later constructed from the cached copy. - /// - The native binary will retain debugging information that is associated - /// with a module. - /// - The caller can pass nullptr for pModuleNativeBinary when querying only - /// for size. - /// - The implementation will copy the native binary into a buffer supplied - /// by the caller. - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function should be lock-free. - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hModule` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == pSize` - ZE_APIEXPORT ze_result_t ZE_APICALL zeModuleGetNativeBinary(ze_module_handle_t hModule, ///< [in] handle of the module - size_t * pSize, ///< [in,out] size of native binary in bytes. 
- uint8_t * pModuleNativeBinary ///< [in,out][optional] byte pointer to native binary - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Retrieve global variable pointer from Module. - /// - /// @details - /// - The application may query global pointer from any module that either - /// exports or imports it. - /// - The application must dynamically link a module that imports a global - /// before the global pointer can be queried from it. - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function should be lock-free. - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hModule` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == pGlobalName` - /// - ZE_RESULT_ERROR_INVALID_GLOBAL_NAME - ZE_APIEXPORT ze_result_t ZE_APICALL zeModuleGetGlobalPointer(ze_module_handle_t hModule, ///< [in] handle of the module - const char * pGlobalName, ///< [in] name of global variable in module - size_t * pSize, ///< [in,out][optional] size of global variable - void ** pptr ///< [in,out][optional] device visible pointer - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Retrieve all kernel names in the module. - /// - /// @details - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function should be lock-free. 
- /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hModule` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == pCount` - ZE_APIEXPORT ze_result_t ZE_APICALL zeModuleGetKernelNames( - ze_module_handle_t hModule, ///< [in] handle of the module - uint32_t * pCount, ///< [in,out] pointer to the number of names. - ///< if count is zero, then the driver will update the value with the total - ///< number of names available. - ///< if count is non-zero, then driver will only retrieve that number of names. - ///< if count is larger than the number of names available, then the driver - ///< will update the value with the correct number of names available. - const char ** pNames ///< [in,out][optional][range(0, *pCount)] array of names of functions - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Supported module property flags - typedef uint32_t ze_module_property_flags_t; - typedef enum _ze_module_property_flag_t - { - ZE_MODULE_PROPERTY_FLAG_IMPORTS = ZE_BIT(0), ///< Module has imports (i.e. imported global variables and/or kernels). - ///< See zeModuleDynamicLink. - ZE_MODULE_PROPERTY_FLAG_FORCE_UINT32 = 0x7fffffff - } ze_module_property_flag_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Module properties - typedef struct _ze_module_properties_t - { - ze_structure_type_t stype; ///< [in] type of this structure - void * pNext; ///< [in,out][optional] pointer to extension-specific structure - ze_module_property_flags_t flags; ///< [out] 0 (none) or a valid combination of ze_module_property_flag_t - } ze_module_properties_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Retrieve module properties. 
- /// - /// @details - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function should be lock-free. - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hModule` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == pModuleProperties` - ZE_APIEXPORT ze_result_t ZE_APICALL zeModuleGetProperties( - ze_module_handle_t hModule, ///< [in] handle of the module - ze_module_properties_t * pModuleProperties ///< [in,out] query result for module properties. - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Supported kernel creation flags - typedef uint32_t ze_kernel_flags_t; - typedef enum _ze_kernel_flag_t - { - ZE_KERNEL_FLAG_FORCE_RESIDENCY = ZE_BIT(0), ///< force all device allocations to be resident during execution - ZE_KERNEL_FLAG_EXPLICIT_RESIDENCY = ZE_BIT(1), ///< application is responsible for all residency of device allocations. - ///< driver may disable implicit residency management. - ZE_KERNEL_FLAG_FORCE_UINT32 = 0x7fffffff - } ze_kernel_flag_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Kernel descriptor - typedef struct _ze_kernel_desc_t - { - ze_structure_type_t stype; ///< [in] type of this structure - const void * pNext; ///< [in][optional] pointer to extension-specific structure - ze_kernel_flags_t flags; ///< [in] creation flags. - ///< must be 0 (default) or a valid combination of ze_kernel_flag_t; - ///< default behavior may use driver-based residency. - const char * pKernelName; ///< [in] null-terminated name of kernel in module - } ze_kernel_desc_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Create a kernel from the module. 
- /// - /// @details - /// - Modules that have unresolved imports need to be dynamically linked - /// before a kernel can be created from them. (See zeModuleDynamicLink) - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function must be thread-safe. - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hModule` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == desc` - /// + `nullptr == desc->pKernelName` - /// + `nullptr == phKernel` - /// - ZE_RESULT_ERROR_INVALID_ENUMERATION - /// + `0x3 < desc->flags` - /// - ZE_RESULT_ERROR_INVALID_KERNEL_NAME - /// - ZE_RESULT_ERROR_INVALID_MODULE_UNLINKED - ZE_APIEXPORT ze_result_t ZE_APICALL zeKernelCreate(ze_module_handle_t hModule, ///< [in] handle of the module - const ze_kernel_desc_t * desc, ///< [in] pointer to kernel descriptor - ze_kernel_handle_t * phKernel ///< [out] handle of the Function object - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Destroys a kernel object - /// - /// @details - /// - The application must ensure the device is not currently referencing - /// the kernel before it is deleted. - /// - The implementation of this function may immediately free all Host and - /// Device allocations associated with this kernel. - /// - The application must **not** call this function from simultaneous - /// threads with the same kernel handle. - /// - The implementation of this function must be thread-safe. 
- /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hKernel` - /// - ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE - ZE_APIEXPORT ze_result_t ZE_APICALL zeKernelDestroy(ze_kernel_handle_t hKernel ///< [in][release] handle of the kernel object - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Retrieve a function pointer from a module by name - /// - /// @details - /// - The function pointer is unique for the device on which the module was - /// created. - /// - The function pointer is no longer valid if module is destroyed. - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function should be lock-free. - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hModule` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == pFunctionName` - /// + `nullptr == pfnFunction` - /// - ZE_RESULT_ERROR_INVALID_FUNCTION_NAME - ZE_APIEXPORT ze_result_t ZE_APICALL zeModuleGetFunctionPointer( - ze_module_handle_t hModule, ///< [in] handle of the module - const char * pFunctionName, ///< [in] Name of function to retrieve function pointer for. - void ** pfnFunction ///< [out] pointer to function. - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Set group size for a kernel on the current Host thread. - /// - /// @details - /// - The implementation will maintain the group size in thread-local - /// storage. - /// - The group size will be used when a zeCommandListAppendLaunchKernel - /// variant is called on the same Host thread. - /// - The application may call this function from simultaneous threads with - /// the same kernel handle. 
- /// - The implementation of this function should be lock-free. - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hKernel` - /// - ZE_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION - ZE_APIEXPORT ze_result_t ZE_APICALL zeKernelSetGroupSize(ze_kernel_handle_t hKernel, ///< [in] handle of the kernel object - uint32_t groupSizeX, ///< [in] group size for X dimension to use for this kernel - uint32_t groupSizeY, ///< [in] group size for Y dimension to use for this kernel - uint32_t groupSizeZ ///< [in] group size for Z dimension to use for this kernel - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Query a suggested group size for a kernel given a global size for each - /// dimension. - /// - /// @details - /// - This function ignores the group size that is set using - /// zeKernelSetGroupSize. - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function should be lock-free. 
- /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hKernel` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == groupSizeX` - /// + `nullptr == groupSizeY` - /// + `nullptr == groupSizeZ` - /// - ZE_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION - ZE_APIEXPORT ze_result_t ZE_APICALL zeKernelSuggestGroupSize(ze_kernel_handle_t hKernel, ///< [in] handle of the kernel object - uint32_t globalSizeX, ///< [in] global width for X dimension - uint32_t globalSizeY, ///< [in] global width for Y dimension - uint32_t globalSizeZ, ///< [in] global width for Z dimension - uint32_t * groupSizeX, ///< [out] recommended size of group for X dimension - uint32_t * groupSizeY, ///< [out] recommended size of group for Y dimension - uint32_t * groupSizeZ ///< [out] recommended size of group for Z dimension - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Query a suggested max group count for a cooperative kernel. - /// - /// @details - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function should be lock-free. - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hKernel` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == totalGroupCount` - ZE_APIEXPORT ze_result_t ZE_APICALL zeKernelSuggestMaxCooperativeGroupCount(ze_kernel_handle_t hKernel, ///< [in] handle of the kernel object - uint32_t * totalGroupCount ///< [out] recommended total group count. - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Set kernel argument for a kernel on the current Host thread. 
- /// - /// @details - /// - The implementation will maintain the argument values in thread-local - /// storage. - /// - The argument values will be used when a - /// zeCommandListAppendLaunchKernel variant is called on the same Host - /// thread. - /// - The application may call this function from simultaneous threads with - /// the same kernel handle. - /// - The implementation of this function should be lock-free. - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hKernel` - /// - ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX - /// - ZE_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE - ZE_APIEXPORT ze_result_t ZE_APICALL zeKernelSetArgumentValue( - ze_kernel_handle_t hKernel, ///< [in] handle of the kernel object - uint32_t argIndex, ///< [in] argument index in range [0, num args - 1] - size_t argSize, ///< [in] size of argument type - const void * pArgValue ///< [in][optional] argument value represented as matching arg type. If - ///< null then argument value is considered null. - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Kernel indirect access flags - typedef uint32_t ze_kernel_indirect_access_flags_t; - typedef enum _ze_kernel_indirect_access_flag_t - { - ZE_KERNEL_INDIRECT_ACCESS_FLAG_HOST = ZE_BIT(0), ///< Indicates that the kernel accesses host allocations indirectly. - ZE_KERNEL_INDIRECT_ACCESS_FLAG_DEVICE = ZE_BIT(1), ///< Indicates that the kernel accesses device allocations indirectly. - ZE_KERNEL_INDIRECT_ACCESS_FLAG_SHARED = ZE_BIT(2), ///< Indicates that the kernel accesses shared allocations indirectly. - ZE_KERNEL_INDIRECT_ACCESS_FLAG_FORCE_UINT32 = 0x7fffffff - } ze_kernel_indirect_access_flag_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Sets kernel indirect access flags. 
- /// - /// @details - /// - The application should specify which allocations will be indirectly - /// accessed by the kernel to allow driver to optimize which allocations - /// are made resident - /// - This function may **not** be called from simultaneous threads with the - /// same Kernel handle. - /// - The implementation of this function should be lock-free. - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hKernel` - /// - ZE_RESULT_ERROR_INVALID_ENUMERATION - /// + `0x7 < flags` - ZE_APIEXPORT ze_result_t ZE_APICALL zeKernelSetIndirectAccess(ze_kernel_handle_t hKernel, ///< [in] handle of the kernel object - ze_kernel_indirect_access_flags_t flags ///< [in] kernel indirect access flags - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Retrieve kernel indirect access flags. - /// - /// @details - /// - This function may be called from simultaneous threads with the same - /// Kernel handle. - /// - The implementation of this function should be lock-free. - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hKernel` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == pFlags` - ZE_APIEXPORT ze_result_t ZE_APICALL zeKernelGetIndirectAccess( - ze_kernel_handle_t hKernel, ///< [in] handle of the kernel object - ze_kernel_indirect_access_flags_t * pFlags ///< [out] query result for kernel indirect access flags. - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Retrieve all declared kernel attributes (i.e. can be specified with - /// __attribute__ in runtime language). - /// - /// @details - /// - This function may be called from simultaneous threads with the same - /// Kernel handle. 
- /// - The implementation of this function should be lock-free. - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hKernel` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == pSize` - ZE_APIEXPORT ze_result_t ZE_APICALL zeKernelGetSourceAttributes( - ze_kernel_handle_t hKernel, ///< [in] handle of the kernel object - uint32_t * pSize, ///< [in,out] pointer to size of string in bytes. - ///< if size is zero, then the driver will update string argument. - ///< if size is non-zero, then driver will only retrieve string size in bytes. - ///< if size is larger than source attributes string, then the driver will - ///< update the string. - char ** pString ///< [in,out][optional] pointer to null-terminated string where kernel - ///< source attributes are separated by space. - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Supported Cache Config flags - typedef uint32_t ze_cache_config_flags_t; - typedef enum _ze_cache_config_flag_t - { - ZE_CACHE_CONFIG_FLAG_LARGE_SLM = ZE_BIT(0), ///< Large SLM size - ZE_CACHE_CONFIG_FLAG_LARGE_DATA = ZE_BIT(1), ///< Large General Data size - ZE_CACHE_CONFIG_FLAG_FORCE_UINT32 = 0x7fffffff - } ze_cache_config_flag_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Sets the preferred cache configuration for a kernel on the current - /// Host thread. - /// - /// @details - /// - The implementation will maintain the cache configuration in - /// thread-local storage. - /// - The cache configuration will be used when a - /// zeCommandListAppendLaunchKernel variant is called on the same Host - /// thread. - /// - The application may call this function from simultaneous threads with - /// the same kernel handle. - /// - The implementation of this function should be lock-free. 
- /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hKernel` - /// - ZE_RESULT_ERROR_INVALID_ENUMERATION - /// + `0x3 < flags` - /// - ZE_RESULT_ERROR_UNSUPPORTED_FEATURE - ZE_APIEXPORT ze_result_t ZE_APICALL zeKernelSetCacheConfig( - ze_kernel_handle_t hKernel, ///< [in] handle of the kernel object - ze_cache_config_flags_t flags ///< [in] cache configuration. - ///< must be 0 (default configuration) or a valid combination of ze_cache_config_flag_t. - ); - -/////////////////////////////////////////////////////////////////////////////// -#ifndef ZE_MAX_KERNEL_UUID_SIZE - /// @brief Maximum kernel universal unique id (UUID) size in bytes - #define ZE_MAX_KERNEL_UUID_SIZE 16 -#endif // ZE_MAX_KERNEL_UUID_SIZE - -/////////////////////////////////////////////////////////////////////////////// -#ifndef ZE_MAX_MODULE_UUID_SIZE - /// @brief Maximum module universal unique id (UUID) size in bytes - #define ZE_MAX_MODULE_UUID_SIZE 16 -#endif // ZE_MAX_MODULE_UUID_SIZE - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Kernel universal unique id (UUID) - typedef struct _ze_kernel_uuid_t - { - uint8_t kid[ZE_MAX_KERNEL_UUID_SIZE]; ///< [out] opaque data representing a kernel UUID - uint8_t mid[ZE_MAX_MODULE_UUID_SIZE]; ///< [out] opaque data representing the kernel's module UUID - } ze_kernel_uuid_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Kernel properties - typedef struct _ze_kernel_properties_t - { - ze_structure_type_t stype; ///< [in] type of this structure - void * pNext; ///< [in,out][optional] pointer to extension-specific structure - uint32_t numKernelArgs; ///< [out] number of kernel arguments. 
- uint32_t requiredGroupSizeX; ///< [out] required group size in the X dimension, - ///< or zero if there is no required group size - uint32_t requiredGroupSizeY; ///< [out] required group size in the Y dimension, - ///< or zero if there is no required group size - uint32_t requiredGroupSizeZ; ///< [out] required group size in the Z dimension, - ///< or zero if there is no required group size - uint32_t requiredNumSubGroups; ///< [out] required number of subgroups per thread group, - ///< or zero if there is no required number of subgroups - uint32_t requiredSubgroupSize; ///< [out] required subgroup size, - ///< or zero if there is no required subgroup size - uint32_t maxSubgroupSize; ///< [out] maximum subgroup size - uint32_t maxNumSubgroups; ///< [out] maximum number of subgroups per thread group - uint32_t localMemSize; ///< [out] local memory size used by each thread group - uint32_t privateMemSize; ///< [out] private memory size allocated by compiler used by each thread - uint32_t spillMemSize; ///< [out] spill memory size allocated by compiler - ze_kernel_uuid_t uuid; ///< [out] universal unique identifier. - } ze_kernel_properties_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Retrieve kernel properties. - /// - /// @details - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function should be lock-free. - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hKernel` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == pKernelProperties` - ZE_APIEXPORT ze_result_t ZE_APICALL zeKernelGetProperties( - ze_kernel_handle_t hKernel, ///< [in] handle of the kernel object - ze_kernel_properties_t * pKernelProperties ///< [in,out] query result for kernel properties. 
- ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Retrieve kernel name from Kernel. - /// - /// @details - /// - The caller can pass nullptr for pName when querying only for size. - /// - The implementation will copy the kernel name into a buffer supplied by - /// the caller. - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function should be lock-free. - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hKernel` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == pSize` - ZE_APIEXPORT ze_result_t ZE_APICALL zeKernelGetName(ze_kernel_handle_t hKernel, ///< [in] handle of the kernel object - size_t * pSize, ///< [in,out] size of kernel name string, including null terminator, in - ///< bytes. - char * pName ///< [in,out][optional] char pointer to kernel name. - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Kernel dispatch group count. - typedef struct _ze_group_count_t - { - uint32_t groupCountX; ///< [in] number of thread groups in X dimension - uint32_t groupCountY; ///< [in] number of thread groups in Y dimension - uint32_t groupCountZ; ///< [in] number of thread groups in Z dimension - } ze_group_count_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Launch kernel over one or more work groups. - /// - /// @details - /// - The application must ensure the kernel and events are accessible by - /// the device on which the command list was created. - /// - This may **only** be called for a command list created with command - /// queue group ordinal that supports compute. - /// - The application must ensure the command list, kernel and events were - /// created on the same context. 
- /// - This function may **not** be called from simultaneous threads with the - /// same command list handle. - /// - The implementation of this function should be lock-free. - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hCommandList` - /// + `nullptr == hKernel` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == pLaunchFuncArgs` - /// - ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT - /// - ZE_RESULT_ERROR_INVALID_SIZE - /// + `(nullptr == phWaitEvents) && (0 < numWaitEvents)` - ZE_APIEXPORT ze_result_t ZE_APICALL zeCommandListAppendLaunchKernel( - ze_command_list_handle_t hCommandList, ///< [in] handle of the command list - ze_kernel_handle_t hKernel, ///< [in] handle of the kernel object - const ze_group_count_t * pLaunchFuncArgs, ///< [in] thread group launch arguments - ze_event_handle_t hSignalEvent, ///< [in][optional] handle of the event to signal on completion - uint32_t numWaitEvents, ///< [in][optional] number of events to wait on before launching; must be 0 - ///< if `nullptr == phWaitEvents` - ze_event_handle_t * phWaitEvents ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait - ///< on before launching - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Launch kernel cooperatively over one or more work groups. - /// - /// @details - /// - The application must ensure the kernel and events are accessible by - /// the device on which the command list was created. - /// - This may **only** be called for a command list created with command - /// queue group ordinal that supports compute. - /// - This may only be used for a command list that are submitted to command - /// queue with cooperative flag set. - /// - The application must ensure the command list, kernel and events were - /// created on the same context. 
- /// - This function may **not** be called from simultaneous threads with the - /// same command list handle. - /// - The implementation of this function should be lock-free. - /// - Use zeKernelSuggestMaxCooperativeGroupCount to recommend max group - /// count for device for cooperative functions that device supports. - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hCommandList` - /// + `nullptr == hKernel` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == pLaunchFuncArgs` - /// - ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT - /// - ZE_RESULT_ERROR_INVALID_SIZE - /// + `(nullptr == phWaitEvents) && (0 < numWaitEvents)` - ZE_APIEXPORT ze_result_t ZE_APICALL zeCommandListAppendLaunchCooperativeKernel( - ze_command_list_handle_t hCommandList, ///< [in] handle of the command list - ze_kernel_handle_t hKernel, ///< [in] handle of the kernel object - const ze_group_count_t * pLaunchFuncArgs, ///< [in] thread group launch arguments - ze_event_handle_t hSignalEvent, ///< [in][optional] handle of the event to signal on completion - uint32_t numWaitEvents, ///< [in][optional] number of events to wait on before launching; must be 0 - ///< if `nullptr == phWaitEvents` - ze_event_handle_t * phWaitEvents ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait - ///< on before launching - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Launch kernel over one or more work groups using indirect arguments. - /// - /// @details - /// - The application must ensure the kernel and events are accessible by - /// the device on which the command list was created. - /// - The application must ensure the launch arguments are visible to the - /// device on which the command list was created. 
- /// - The implementation must not access the contents of the launch - /// arguments as they are free to be modified by either the Host or device - /// up until execution. - /// - This may **only** be called for a command list created with command - /// queue group ordinal that supports compute. - /// - The application must ensure the command list, kernel and events were - /// created, and the memory was allocated, on the same context. - /// - This function may **not** be called from simultaneous threads with the - /// same command list handle. - /// - The implementation of this function should be lock-free. - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hCommandList` - /// + `nullptr == hKernel` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == pLaunchArgumentsBuffer` - /// - ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT - /// - ZE_RESULT_ERROR_INVALID_SIZE - /// + `(nullptr == phWaitEvents) && (0 < numWaitEvents)` - ZE_APIEXPORT ze_result_t ZE_APICALL zeCommandListAppendLaunchKernelIndirect( - ze_command_list_handle_t hCommandList, ///< [in] handle of the command list - ze_kernel_handle_t hKernel, ///< [in] handle of the kernel object - const ze_group_count_t * pLaunchArgumentsBuffer, ///< [in] pointer to device buffer that will contain thread group launch - ///< arguments - ze_event_handle_t hSignalEvent, ///< [in][optional] handle of the event to signal on completion - uint32_t numWaitEvents, ///< [in][optional] number of events to wait on before launching; must be 0 - ///< if `nullptr == phWaitEvents` - ze_event_handle_t * phWaitEvents ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait - ///< on before launching - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Launch multiple kernels over one or more work groups using an array of - 
/// indirect arguments. - /// - /// @details - /// - The application must ensure the kernel and events are accessible by - /// the device on which the command list was created. - /// - The application must ensure the array of launch arguments and count - /// buffer are visible to the device on which the command list was - /// created. - /// - The implementation must not access the contents of the array of launch - /// arguments or count buffer as they are free to be modified by either - /// the Host or device up until execution. - /// - This may **only** be called for a command list created with command - /// queue group ordinal that supports compute. - /// - The application must enusre the command list, kernel and events were - /// created, and the memory was allocated, on the same context. - /// - This function may **not** be called from simultaneous threads with the - /// same command list handle. - /// - The implementation of this function should be lock-free. - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hCommandList` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == phKernels` - /// + `nullptr == pCountBuffer` - /// + `nullptr == pLaunchArgumentsBuffer` - /// - ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT - /// - ZE_RESULT_ERROR_INVALID_SIZE - /// + `(nullptr == phWaitEvents) && (0 < numWaitEvents)` - ZE_APIEXPORT ze_result_t ZE_APICALL zeCommandListAppendLaunchMultipleKernelsIndirect( - ze_command_list_handle_t hCommandList, ///< [in] handle of the command list - uint32_t numKernels, ///< [in] maximum number of kernels to launch - ze_kernel_handle_t * phKernels, ///< [in][range(0, numKernels)] handles of the kernel objects - const uint32_t * pCountBuffer, ///< [in] pointer to device memory location that will contain the actual - ///< number of kernels to launch; value must be less-than or equal-to - ///< 
numKernels - const ze_group_count_t * pLaunchArgumentsBuffer, ///< [in][range(0, numKernels)] pointer to device buffer that will contain - ///< a contiguous array of thread group launch arguments - ze_event_handle_t hSignalEvent, ///< [in][optional] handle of the event to signal on completion - uint32_t numWaitEvents, ///< [in][optional] number of events to wait on before launching; must be 0 - ///< if `nullptr == phWaitEvents` - ze_event_handle_t * phWaitEvents ///< [in][optional][range(0, numWaitEvents)] handle of the events to wait - ///< on before launching - ); - -#if !defined(__GNUC__) - #pragma endregion -#endif -// Intel 'oneAPI' Level-Zero Extension APIs for Raytracing -#if !defined(__GNUC__) - #pragma region raytracing -#endif -/////////////////////////////////////////////////////////////////////////////// -#ifndef ZE_RAYTRACING_EXT_NAME - /// @brief Raytracing Extension Name - #define ZE_RAYTRACING_EXT_NAME "ZE_extension_raytracing" -#endif // ZE_RAYTRACING_EXT_NAME - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Raytracing Extension Version(s) - typedef enum _ze_raytracing_ext_version_t - { - ZE_RAYTRACING_EXT_VERSION_1_0 = ZE_MAKE_VERSION(1, 0), ///< version 1.0 - ZE_RAYTRACING_EXT_VERSION_CURRENT = ZE_MAKE_VERSION(1, 0), ///< latest known version - ZE_RAYTRACING_EXT_VERSION_FORCE_UINT32 = 0x7fffffff - } ze_raytracing_ext_version_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Supported raytracing memory allocation flags - typedef uint32_t ze_raytracing_mem_alloc_ext_flags_t; - typedef enum _ze_raytracing_mem_alloc_ext_flag_t - { - ZE_RAYTRACING_MEM_ALLOC_EXT_FLAG_TBD = ZE_BIT(0), ///< reserved for future use - ZE_RAYTRACING_MEM_ALLOC_EXT_FLAG_FORCE_UINT32 = 0x7fffffff - } ze_raytracing_mem_alloc_ext_flag_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Raytracing memory allocation descriptor - /// - 
/// @details - /// - This structure must be passed to zeMemAllocShared or - /// zeMemAllocDevice, via `pNext` member of - /// ze_device_mem_alloc_desc_t, for any memory allocation that is to be - /// accessed by raytracing fixed-function of the device. - typedef struct _ze_raytracing_mem_alloc_ext_desc_t - { - ze_structure_type_t stype; ///< [in] type of this structure - const void * pNext; ///< [in][optional] pointer to extension-specific structure - ze_raytracing_mem_alloc_ext_flags_t flags; ///< [in] flags specifying additional allocation controls. - ///< must be 0 (default) or a valid combination of ze_raytracing_mem_alloc_ext_flag_t; - ///< default behavior may use implicit driver-based heuristics. - } ze_raytracing_mem_alloc_ext_desc_t; - -#if !defined(__GNUC__) - #pragma endregion -#endif -// Intel 'oneAPI' Level-Zero APIs for Memory Residency -#if !defined(__GNUC__) - #pragma region residency -#endif - /////////////////////////////////////////////////////////////////////////////// - /// @brief Makes memory resident for the device. - /// - /// @details - /// - The application must ensure the memory is resident before being - /// referenced by the device - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function should be lock-free. 
- /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hContext` - /// + `nullptr == hDevice` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == ptr` - /// - ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY - /// - ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY - ZE_APIEXPORT ze_result_t ZE_APICALL zeContextMakeMemoryResident(ze_context_handle_t hContext, ///< [in] handle of context object - ze_device_handle_t hDevice, ///< [in] handle of the device - void * ptr, ///< [in] pointer to memory to make resident - size_t size ///< [in] size in bytes to make resident - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Allows memory to be evicted from the device. - /// - /// @details - /// - The application must ensure the device is not currently referencing - /// the memory before it is evicted - /// - The application may free the memory without evicting; the memory is - /// implicitly evicted when freed. - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function should be lock-free. 
- /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hContext` - /// + `nullptr == hDevice` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == ptr` - /// - ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY - /// - ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY - ZE_APIEXPORT ze_result_t ZE_APICALL zeContextEvictMemory(ze_context_handle_t hContext, ///< [in] handle of context object - ze_device_handle_t hDevice, ///< [in] handle of the device - void * ptr, ///< [in] pointer to memory to evict - size_t size ///< [in] size in bytes to evict - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Makes image resident for the device. - /// - /// @details - /// - The application must ensure the image is resident before being - /// referenced by the device - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function should be lock-free. - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hContext` - /// + `nullptr == hDevice` - /// + `nullptr == hImage` - /// - ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY - /// - ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY - ZE_APIEXPORT ze_result_t ZE_APICALL zeContextMakeImageResident(ze_context_handle_t hContext, ///< [in] handle of context object - ze_device_handle_t hDevice, ///< [in] handle of the device - ze_image_handle_t hImage ///< [in] handle of image to make resident - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Allows image to be evicted from the device. 
- /// - /// @details - /// - The application must ensure the device is not currently referencing - /// the image before it is evicted - /// - The application may destroy the image without evicting; the image is - /// implicitly evicted when destroyed. - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function should be lock-free. - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hContext` - /// + `nullptr == hDevice` - /// + `nullptr == hImage` - /// - ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY - /// - ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY - ZE_APIEXPORT ze_result_t ZE_APICALL zeContextEvictImage(ze_context_handle_t hContext, ///< [in] handle of context object - ze_device_handle_t hDevice, ///< [in] handle of the device - ze_image_handle_t hImage ///< [in] handle of image to make evict - ); - -#if !defined(__GNUC__) - #pragma endregion -#endif -// Intel 'oneAPI' Level-Zero APIs for Sampler -#if !defined(__GNUC__) - #pragma region sampler -#endif - /////////////////////////////////////////////////////////////////////////////// - /// @brief Sampler addressing modes - typedef enum _ze_sampler_address_mode_t - { - ZE_SAMPLER_ADDRESS_MODE_NONE = 0, ///< No coordinate modifications for out-of-bounds image access. - ZE_SAMPLER_ADDRESS_MODE_REPEAT = 1, ///< Out-of-bounds coordinates are wrapped back around. - ZE_SAMPLER_ADDRESS_MODE_CLAMP = 2, ///< Out-of-bounds coordinates are clamped to edge. - ZE_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER = 3, ///< Out-of-bounds coordinates are clamped to border color which is (0.0f, - ///< 0.0f, 0.0f, 0.0f) if image format swizzle contains alpha, otherwise - ///< (0.0f, 0.0f, 0.0f, 1.0f). - ZE_SAMPLER_ADDRESS_MODE_MIRROR = 4, ///< Out-of-bounds coordinates are mirrored starting from edge. 
- ZE_SAMPLER_ADDRESS_MODE_FORCE_UINT32 = 0x7fffffff - } ze_sampler_address_mode_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Sampler filtering modes - typedef enum _ze_sampler_filter_mode_t - { - ZE_SAMPLER_FILTER_MODE_NEAREST = 0, ///< No coordinate modifications for out of bounds image access. - ZE_SAMPLER_FILTER_MODE_LINEAR = 1, ///< Out-of-bounds coordinates are wrapped back around. - ZE_SAMPLER_FILTER_MODE_FORCE_UINT32 = 0x7fffffff - } ze_sampler_filter_mode_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Sampler descriptor - typedef struct _ze_sampler_desc_t - { - ze_structure_type_t stype; ///< [in] type of this structure - const void * pNext; ///< [in][optional] pointer to extension-specific structure - ze_sampler_address_mode_t addressMode; ///< [in] Sampler addressing mode to determine how out-of-bounds - ///< coordinates are handled. - ze_sampler_filter_mode_t filterMode; ///< [in] Sampler filter mode to determine how samples are filtered. - ze_bool_t isNormalized; ///< [in] Are coordinates normalized [0, 1] or not. - } ze_sampler_desc_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Creates sampler on the context. - /// - /// @details - /// - The application must only use the sampler for the device, or its - /// sub-devices, which was provided during creation. - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function must be thread-safe. 
- /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hContext` - /// + `nullptr == hDevice` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == desc` - /// + `nullptr == phSampler` - /// - ZE_RESULT_ERROR_INVALID_ENUMERATION - /// + `ZE_SAMPLER_ADDRESS_MODE_MIRROR < desc->addressMode` - /// + `ZE_SAMPLER_FILTER_MODE_LINEAR < desc->filterMode` - /// - ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY - ZE_APIEXPORT ze_result_t ZE_APICALL zeSamplerCreate(ze_context_handle_t hContext, ///< [in] handle of the context object - ze_device_handle_t hDevice, ///< [in] handle of the device - const ze_sampler_desc_t * desc, ///< [in] pointer to sampler descriptor - ze_sampler_handle_t * phSampler ///< [out] handle of the sampler - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Destroys sampler object - /// - /// @details - /// - The application must ensure the device is not currently referencing - /// the sampler before it is deleted. - /// - The implementation of this function may immediately free all Host and - /// Device allocations associated with this sampler. - /// - The application must **not** call this function from simultaneous - /// threads with the same sampler handle. - /// - The implementation of this function must be thread-safe. 
- /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hSampler` - /// - ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE - ZE_APIEXPORT ze_result_t ZE_APICALL zeSamplerDestroy(ze_sampler_handle_t hSampler ///< [in][release] handle of the sampler - ); - -#if !defined(__GNUC__) - #pragma endregion -#endif -// Intel 'oneAPI' Level-Zero APIs for Virtual Memory Management -#if !defined(__GNUC__) - #pragma region virtual -#endif - /////////////////////////////////////////////////////////////////////////////// - /// @brief Virtual memory page access attributes - typedef enum _ze_memory_access_attribute_t - { - ZE_MEMORY_ACCESS_ATTRIBUTE_NONE = 0, ///< Indicates the memory page is inaccessible. - ZE_MEMORY_ACCESS_ATTRIBUTE_READWRITE = 1, ///< Indicates the memory page supports read write access. - ZE_MEMORY_ACCESS_ATTRIBUTE_READONLY = 2, ///< Indicates the memory page supports read-only access. - ZE_MEMORY_ACCESS_ATTRIBUTE_FORCE_UINT32 = 0x7fffffff - } ze_memory_access_attribute_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Reserves pages in virtual address space. - /// - /// @details - /// - The application must only use the memory allocation on the context for - /// which it was created. - /// - The starting address and size must be page aligned. See - /// zeVirtualMemQueryPageSize. - /// - If pStart is not null then implementation will attempt to reserve - /// starting from that address. If not available then will find another - /// suitable starting address. - /// - The application may call this function from simultaneous threads. - /// - The access attributes will default to none to indicate reservation is - /// inaccessible. - /// - The implementation of this function must be thread-safe. 
- /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hContext` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == pStart` - /// + `nullptr == pptr` - /// - ZE_RESULT_ERROR_UNSUPPORTED_SIZE - /// + `0 == size` - /// - ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY - /// - ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY - ZE_APIEXPORT ze_result_t ZE_APICALL zeVirtualMemReserve(ze_context_handle_t hContext, ///< [in] handle of the context object - const void * pStart, ///< [in] pointer to start of region to reserve. If nullptr then - ///< implementation will choose a start address. - size_t size, ///< [in] size in bytes to reserve; must be page aligned. - void ** pptr ///< [out] pointer to virtual reservation. - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Free pages in a reserved virtual address range. - /// - /// @details - /// - Any existing virtual mappings for the range will be unmapped. - /// - Physical allocations objects that were mapped to this range will not - /// be destroyed. These need to be destroyed explicitly. - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function must be thread-safe. - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hContext` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == ptr` - /// - ZE_RESULT_ERROR_UNSUPPORTED_SIZE - /// + `0 == size` - /// - ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT - ZE_APIEXPORT ze_result_t ZE_APICALL zeVirtualMemFree(ze_context_handle_t hContext, ///< [in] handle of the context object - const void * ptr, ///< [in] pointer to start of region to free. - size_t size ///< [in] size in bytes to free; must be page aligned. 
- ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Queries page size to use for aligning virtual memory reservations and - /// physical memory allocations. - /// - /// @details - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function must be thread-safe. - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hContext` - /// + `nullptr == hDevice` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == pagesize` - /// - ZE_RESULT_ERROR_UNSUPPORTED_SIZE - /// + `0 == size` - ZE_APIEXPORT ze_result_t ZE_APICALL zeVirtualMemQueryPageSize(ze_context_handle_t hContext, ///< [in] handle of the context object - ze_device_handle_t hDevice, ///< [in] handle of the device object - size_t size, ///< [in] unaligned allocation size in bytes - size_t * pagesize ///< [out] pointer to page size to use for start address and size - ///< alignments. - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Supported physical memory creation flags - typedef uint32_t ze_physical_mem_flags_t; - typedef enum _ze_physical_mem_flag_t - { - ZE_PHYSICAL_MEM_FLAG_TBD = ZE_BIT(0), ///< reserved for future use. - ZE_PHYSICAL_MEM_FLAG_FORCE_UINT32 = 0x7fffffff - } ze_physical_mem_flag_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Physical memory descriptor - typedef struct _ze_physical_mem_desc_t - { - ze_structure_type_t stype; ///< [in] type of this structure - const void * pNext; ///< [in][optional] pointer to extension-specific structure - ze_physical_mem_flags_t flags; ///< [in] creation flags. - ///< must be 0 (default) or a valid combination of ze_physical_mem_flag_t. - size_t size; ///< [in] size in bytes to reserve; must be page aligned. 
- } ze_physical_mem_desc_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Creates a physical memory object for the context. - /// - /// @details - /// - The application must only use the physical memory object on the - /// context for which it was created. - /// - The size must be page aligned. See zeVirtualMemQueryPageSize. - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function must be thread-safe. - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hContext` - /// + `nullptr == hDevice` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == desc` - /// + `nullptr == phPhysicalMemory` - /// - ZE_RESULT_ERROR_INVALID_ENUMERATION - /// + `0x1 < desc->flags` - /// - ZE_RESULT_ERROR_UNSUPPORTED_SIZE - /// + `0 == desc->size` - /// - ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY - /// - ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT - ZE_APIEXPORT ze_result_t ZE_APICALL zePhysicalMemCreate( - ze_context_handle_t hContext, ///< [in] handle of the context object - ze_device_handle_t hDevice, ///< [in] handle of the device object - ze_physical_mem_desc_t * desc, ///< [in] pointer to physical memory descriptor. - ze_physical_mem_handle_t * phPhysicalMemory ///< [out] pointer to handle of physical memory object created - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Destroys a physical memory object. - /// - /// @details - /// - The application must ensure the device is not currently referencing - /// the physical memory object before it is deleted - /// - The application must **not** call this function from simultaneous - /// threads with the same physical memory handle. - /// - The implementation of this function must be thread-safe. 
- /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hContext` - /// + `nullptr == hPhysicalMemory` - /// - ZE_RESULT_ERROR_HANDLE_OBJECT_IN_USE - ZE_APIEXPORT ze_result_t ZE_APICALL zePhysicalMemDestroy( - ze_context_handle_t hContext, ///< [in] handle of the context object - ze_physical_mem_handle_t hPhysicalMemory ///< [in][release] handle of physical memory object to destroy - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Maps pages in virtual address space to pages from physical memory - /// object. - /// - /// @details - /// - The virtual address range must have been reserved using - /// zeVirtualMemReserve. - /// - The application must only use the mapped memory allocation on the - /// context for which it was created. - /// - The virtual start address and size must be page aligned. See - /// zeVirtualMemQueryPageSize. - /// - The application should use, for the starting address and size, the - /// same size alignment used for the physical allocation. - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function must be thread-safe. 
- /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hContext` - /// + `nullptr == hPhysicalMemory` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == ptr` - /// - ZE_RESULT_ERROR_INVALID_ENUMERATION - /// + `ZE_MEMORY_ACCESS_ATTRIBUTE_READONLY < access` - /// - ZE_RESULT_ERROR_UNSUPPORTED_SIZE - /// + `0 == size` - /// - ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY - /// - ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY - /// - ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT - ZE_APIEXPORT ze_result_t ZE_APICALL zeVirtualMemMap( - ze_context_handle_t hContext, ///< [in] handle of the context object - const void * ptr, ///< [in] pointer to start of virtual address range to map. - size_t size, ///< [in] size in bytes of virtual address range to map; must be page - ///< aligned. - ze_physical_mem_handle_t hPhysicalMemory, ///< [in] handle to physical memory object. - size_t offset, ///< [in] offset into physical memory allocation object; must be page - ///< aligned. - ze_memory_access_attribute_t access ///< [in] specifies page access attributes to apply to the virtual address - ///< range. - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Unmaps pages in virtual address space from pages from a physical - /// memory object. - /// - /// @details - /// - The page access attributes for virtual address range will revert back - /// to none. - /// - The application may call this function from simultaneous threads. - /// - The implementation of this function must be thread-safe. 
- /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hContext` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == ptr` - /// - ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY - /// - ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY - /// - ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT - "Address must be page aligned" - /// - ZE_RESULT_ERROR_UNSUPPORTED_SIZE - /// + `0 == size` - /// + Size must be page aligned - ZE_APIEXPORT ze_result_t ZE_APICALL zeVirtualMemUnmap(ze_context_handle_t hContext, ///< [in] handle of the context object - const void * ptr, ///< [in] pointer to start of region to unmap. - size_t size ///< [in] size in bytes to unmap; must be page aligned. - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Set memory access attributes for a virtual address range. - /// - /// @details - /// - This function may be called from simultaneous threads with the same - /// function handle. - /// - The implementation of this function should be lock-free. - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hContext` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == ptr` - /// - ZE_RESULT_ERROR_INVALID_ENUMERATION - /// + `ZE_MEMORY_ACCESS_ATTRIBUTE_READONLY < access` - /// - ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT - "Address must be page aligned" - /// - ZE_RESULT_ERROR_UNSUPPORTED_SIZE - /// + `0 == size` - /// + Size must be page aligned - ZE_APIEXPORT ze_result_t ZE_APICALL zeVirtualMemSetAccessAttribute( - ze_context_handle_t hContext, ///< [in] handle of the context object - const void * ptr, ///< [in] pointer to start of reserved virtual address region. - size_t size, ///< [in] size in bytes; must be page aligned. 
- ze_memory_access_attribute_t access ///< [in] specifies page access attributes to apply to the virtual address - ///< range. - ); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Get memory access attribute for a virtual address range. - /// - /// @details - /// - If size and outSize are equal then the pages in the specified virtual - /// address range have the same access attributes. - /// - This function may be called from simultaneous threads with the same - /// function handle. - /// - The implementation of this function should be lock-free. - /// - /// @returns - /// - ZE_RESULT_SUCCESS - /// - ZE_RESULT_ERROR_UNINITIALIZED - /// - ZE_RESULT_ERROR_DEVICE_LOST - /// - ZE_RESULT_ERROR_INVALID_NULL_HANDLE - /// + `nullptr == hContext` - /// - ZE_RESULT_ERROR_INVALID_NULL_POINTER - /// + `nullptr == ptr` - /// + `nullptr == access` - /// + `nullptr == outSize` - /// - ZE_RESULT_ERROR_UNSUPPORTED_ALIGNMENT - "Address must be page aligned" - /// - ZE_RESULT_ERROR_UNSUPPORTED_SIZE - /// + `0 == size` - /// + Size must be page aligned - ZE_APIEXPORT ze_result_t ZE_APICALL zeVirtualMemGetAccessAttribute( - ze_context_handle_t hContext, ///< [in] handle of the context object - const void * ptr, ///< [in] pointer to start of virtual address region for query. - size_t size, ///< [in] size in bytes; must be page aligned. - ze_memory_access_attribute_t * access, ///< [out] query result for page access attribute. - size_t * outSize ///< [out] query result for size of virtual address range, starting at ptr, - ///< that shares same access attribute. 
- ); - -#if !defined(__GNUC__) - #pragma endregion -#endif -// Intel 'oneAPI' Level-Zero API Callbacks -#if !defined(__GNUC__) - #pragma region callbacks -#endif - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeInit - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_init_params_t - { - ze_init_flags_t * pflags; - } ze_init_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeInit - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnInitCb_t)(ze_init_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Table of Global callback functions pointers - typedef struct _ze_global_callbacks_t - { - ze_pfnInitCb_t pfnInitCb; - } ze_global_callbacks_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeDriverGet - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_driver_get_params_t - { - uint32_t ** ppCount; - ze_driver_handle_t ** pphDrivers; - } ze_driver_get_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeDriverGet - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer 
user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnDriverGetCb_t)(ze_driver_get_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeDriverGetApiVersion - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_driver_get_api_version_params_t - { - ze_driver_handle_t * phDriver; - ze_api_version_t ** pversion; - } ze_driver_get_api_version_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeDriverGetApiVersion - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnDriverGetApiVersionCb_t)(ze_driver_get_api_version_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeDriverGetProperties - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_driver_get_properties_params_t - { - ze_driver_handle_t * phDriver; - ze_driver_properties_t ** ppDriverProperties; - } ze_driver_get_properties_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeDriverGetProperties - /// @param[in] params Parameters passed to this instance - /// @param[in] result 
Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnDriverGetPropertiesCb_t)(ze_driver_get_properties_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeDriverGetIpcProperties - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_driver_get_ipc_properties_params_t - { - ze_driver_handle_t * phDriver; - ze_driver_ipc_properties_t ** ppIpcProperties; - } ze_driver_get_ipc_properties_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeDriverGetIpcProperties - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnDriverGetIpcPropertiesCb_t)(ze_driver_get_ipc_properties_params_t * params, ze_result_t result, - void * pTracerUserData, void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeDriverGetExtensionProperties - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_driver_get_extension_properties_params_t - { - ze_driver_handle_t * phDriver; - uint32_t ** ppCount; - ze_driver_extension_properties_t ** ppExtensionProperties; - } ze_driver_get_extension_properties_params_t; - - 
/////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeDriverGetExtensionProperties - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnDriverGetExtensionPropertiesCb_t)(ze_driver_get_extension_properties_params_t * params, ze_result_t result, - void * pTracerUserData, void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Table of Driver callback functions pointers - typedef struct _ze_driver_callbacks_t - { - ze_pfnDriverGetCb_t pfnGetCb; - ze_pfnDriverGetApiVersionCb_t pfnGetApiVersionCb; - ze_pfnDriverGetPropertiesCb_t pfnGetPropertiesCb; - ze_pfnDriverGetIpcPropertiesCb_t pfnGetIpcPropertiesCb; - ze_pfnDriverGetExtensionPropertiesCb_t pfnGetExtensionPropertiesCb; - } ze_driver_callbacks_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeDeviceGet - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_device_get_params_t - { - ze_driver_handle_t * phDriver; - uint32_t ** ppCount; - ze_device_handle_t ** pphDevices; - } ze_device_get_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeDeviceGet - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnDeviceGetCb_t)(ze_device_get_params_t * params, ze_result_t result, void 
* pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeDeviceGetSubDevices - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_device_get_sub_devices_params_t - { - ze_device_handle_t * phDevice; - uint32_t ** ppCount; - ze_device_handle_t ** pphSubdevices; - } ze_device_get_sub_devices_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeDeviceGetSubDevices - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnDeviceGetSubDevicesCb_t)(ze_device_get_sub_devices_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeDeviceGetProperties - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_device_get_properties_params_t - { - ze_device_handle_t * phDevice; - ze_device_properties_t ** ppDeviceProperties; - } ze_device_get_properties_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeDeviceGetProperties - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef 
void(ZE_APICALL * ze_pfnDeviceGetPropertiesCb_t)(ze_device_get_properties_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeDeviceGetComputeProperties - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_device_get_compute_properties_params_t - { - ze_device_handle_t * phDevice; - ze_device_compute_properties_t ** ppComputeProperties; - } ze_device_get_compute_properties_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeDeviceGetComputeProperties - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnDeviceGetComputePropertiesCb_t)(ze_device_get_compute_properties_params_t * params, ze_result_t result, - void * pTracerUserData, void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeDeviceGetModuleProperties - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_device_get_module_properties_params_t - { - ze_device_handle_t * phDevice; - ze_device_module_properties_t ** ppModuleProperties; - } ze_device_get_module_properties_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeDeviceGetModuleProperties - /// @param[in] params Parameters passed to this instance - /// 
@param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnDeviceGetModulePropertiesCb_t)(ze_device_get_module_properties_params_t * params, ze_result_t result, - void * pTracerUserData, void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeDeviceGetCommandQueueGroupProperties - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_device_get_command_queue_group_properties_params_t - { - ze_device_handle_t * phDevice; - uint32_t ** ppCount; - ze_command_queue_group_properties_t ** ppCommandQueueGroupProperties; - } ze_device_get_command_queue_group_properties_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeDeviceGetCommandQueueGroupProperties - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnDeviceGetCommandQueueGroupPropertiesCb_t)(ze_device_get_command_queue_group_properties_params_t * params, - ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeDeviceGetMemoryProperties - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_device_get_memory_properties_params_t - { - ze_device_handle_t * phDevice; - uint32_t ** ppCount; - 
ze_device_memory_properties_t ** ppMemProperties; - } ze_device_get_memory_properties_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeDeviceGetMemoryProperties - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnDeviceGetMemoryPropertiesCb_t)(ze_device_get_memory_properties_params_t * params, ze_result_t result, - void * pTracerUserData, void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeDeviceGetMemoryAccessProperties - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_device_get_memory_access_properties_params_t - { - ze_device_handle_t * phDevice; - ze_device_memory_access_properties_t ** ppMemAccessProperties; - } ze_device_get_memory_access_properties_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeDeviceGetMemoryAccessProperties - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnDeviceGetMemoryAccessPropertiesCb_t)(ze_device_get_memory_access_properties_params_t * params, ze_result_t result, - void * pTracerUserData, void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeDeviceGetCacheProperties - /// @details Each 
entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_device_get_cache_properties_params_t - { - ze_device_handle_t * phDevice; - uint32_t ** ppCount; - ze_device_cache_properties_t ** ppCacheProperties; - } ze_device_get_cache_properties_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeDeviceGetCacheProperties - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnDeviceGetCachePropertiesCb_t)(ze_device_get_cache_properties_params_t * params, ze_result_t result, - void * pTracerUserData, void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeDeviceGetImageProperties - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_device_get_image_properties_params_t - { - ze_device_handle_t * phDevice; - ze_device_image_properties_t ** ppImageProperties; - } ze_device_get_image_properties_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeDeviceGetImageProperties - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnDeviceGetImagePropertiesCb_t)(ze_device_get_image_properties_params_t * params, ze_result_t result, - void * pTracerUserData, void ** 
ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeDeviceGetExternalMemoryProperties - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_device_get_external_memory_properties_params_t - { - ze_device_handle_t * phDevice; - ze_device_external_memory_properties_t ** ppExternalMemoryProperties; - } ze_device_get_external_memory_properties_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeDeviceGetExternalMemoryProperties - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnDeviceGetExternalMemoryPropertiesCb_t)(ze_device_get_external_memory_properties_params_t * params, - ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeDeviceGetP2PProperties - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_device_get_p2_p_properties_params_t - { - ze_device_handle_t * phDevice; - ze_device_handle_t * phPeerDevice; - ze_device_p2p_properties_t ** ppP2PProperties; - } ze_device_get_p2_p_properties_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeDeviceGetP2PProperties - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData 
Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnDeviceGetP2PPropertiesCb_t)(ze_device_get_p2_p_properties_params_t * params, ze_result_t result, - void * pTracerUserData, void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeDeviceCanAccessPeer - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_device_can_access_peer_params_t - { - ze_device_handle_t * phDevice; - ze_device_handle_t * phPeerDevice; - ze_bool_t ** pvalue; - } ze_device_can_access_peer_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeDeviceCanAccessPeer - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnDeviceCanAccessPeerCb_t)(ze_device_can_access_peer_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeDeviceGetStatus - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_device_get_status_params_t - { - ze_device_handle_t * phDevice; - } ze_device_get_status_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeDeviceGetStatus - /// @param[in] params Parameters passed to this instance - /// @param[in] 
result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnDeviceGetStatusCb_t)(ze_device_get_status_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Table of Device callback functions pointers - typedef struct _ze_device_callbacks_t - { - ze_pfnDeviceGetCb_t pfnGetCb; - ze_pfnDeviceGetSubDevicesCb_t pfnGetSubDevicesCb; - ze_pfnDeviceGetPropertiesCb_t pfnGetPropertiesCb; - ze_pfnDeviceGetComputePropertiesCb_t pfnGetComputePropertiesCb; - ze_pfnDeviceGetModulePropertiesCb_t pfnGetModulePropertiesCb; - ze_pfnDeviceGetCommandQueueGroupPropertiesCb_t pfnGetCommandQueueGroupPropertiesCb; - ze_pfnDeviceGetMemoryPropertiesCb_t pfnGetMemoryPropertiesCb; - ze_pfnDeviceGetMemoryAccessPropertiesCb_t pfnGetMemoryAccessPropertiesCb; - ze_pfnDeviceGetCachePropertiesCb_t pfnGetCachePropertiesCb; - ze_pfnDeviceGetImagePropertiesCb_t pfnGetImagePropertiesCb; - ze_pfnDeviceGetExternalMemoryPropertiesCb_t pfnGetExternalMemoryPropertiesCb; - ze_pfnDeviceGetP2PPropertiesCb_t pfnGetP2PPropertiesCb; - ze_pfnDeviceCanAccessPeerCb_t pfnCanAccessPeerCb; - ze_pfnDeviceGetStatusCb_t pfnGetStatusCb; - } ze_device_callbacks_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeContextCreate - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_context_create_params_t - { - ze_driver_handle_t * phDriver; - const ze_context_desc_t ** pdesc; - ze_context_handle_t ** pphContext; - } ze_context_create_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback 
function-pointer for zeContextCreate - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnContextCreateCb_t)(ze_context_create_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeContextDestroy - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_context_destroy_params_t - { - ze_context_handle_t * phContext; - } ze_context_destroy_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeContextDestroy - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnContextDestroyCb_t)(ze_context_destroy_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeContextGetStatus - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_context_get_status_params_t - { - ze_context_handle_t * phContext; - } ze_context_get_status_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeContextGetStatus - /// @param[in] params 
Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnContextGetStatusCb_t)(ze_context_get_status_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeContextSystemBarrier - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_context_system_barrier_params_t - { - ze_context_handle_t * phContext; - ze_device_handle_t * phDevice; - } ze_context_system_barrier_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeContextSystemBarrier - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnContextSystemBarrierCb_t)(ze_context_system_barrier_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeContextMakeMemoryResident - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_context_make_memory_resident_params_t - { - ze_context_handle_t * phContext; - ze_device_handle_t * phDevice; - void ** pptr; - size_t * psize; - } ze_context_make_memory_resident_params_t; - - 
/////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeContextMakeMemoryResident - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnContextMakeMemoryResidentCb_t)(ze_context_make_memory_resident_params_t * params, ze_result_t result, - void * pTracerUserData, void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeContextEvictMemory - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_context_evict_memory_params_t - { - ze_context_handle_t * phContext; - ze_device_handle_t * phDevice; - void ** pptr; - size_t * psize; - } ze_context_evict_memory_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeContextEvictMemory - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnContextEvictMemoryCb_t)(ze_context_evict_memory_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeContextMakeImageResident - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct 
_ze_context_make_image_resident_params_t - { - ze_context_handle_t * phContext; - ze_device_handle_t * phDevice; - ze_image_handle_t * phImage; - } ze_context_make_image_resident_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeContextMakeImageResident - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnContextMakeImageResidentCb_t)(ze_context_make_image_resident_params_t * params, ze_result_t result, - void * pTracerUserData, void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeContextEvictImage - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_context_evict_image_params_t - { - ze_context_handle_t * phContext; - ze_device_handle_t * phDevice; - ze_image_handle_t * phImage; - } ze_context_evict_image_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeContextEvictImage - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnContextEvictImageCb_t)(ze_context_evict_image_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Table of Context callback functions pointers - typedef struct 
_ze_context_callbacks_t - { - ze_pfnContextCreateCb_t pfnCreateCb; - ze_pfnContextDestroyCb_t pfnDestroyCb; - ze_pfnContextGetStatusCb_t pfnGetStatusCb; - ze_pfnContextSystemBarrierCb_t pfnSystemBarrierCb; - ze_pfnContextMakeMemoryResidentCb_t pfnMakeMemoryResidentCb; - ze_pfnContextEvictMemoryCb_t pfnEvictMemoryCb; - ze_pfnContextMakeImageResidentCb_t pfnMakeImageResidentCb; - ze_pfnContextEvictImageCb_t pfnEvictImageCb; - } ze_context_callbacks_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeCommandQueueCreate - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_command_queue_create_params_t - { - ze_context_handle_t * phContext; - ze_device_handle_t * phDevice; - const ze_command_queue_desc_t ** pdesc; - ze_command_queue_handle_t ** pphCommandQueue; - } ze_command_queue_create_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeCommandQueueCreate - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnCommandQueueCreateCb_t)(ze_command_queue_create_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeCommandQueueDestroy - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_command_queue_destroy_params_t - { - ze_command_queue_handle_t * phCommandQueue; - } 
ze_command_queue_destroy_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeCommandQueueDestroy - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnCommandQueueDestroyCb_t)(ze_command_queue_destroy_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeCommandQueueExecuteCommandLists - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_command_queue_execute_command_lists_params_t - { - ze_command_queue_handle_t * phCommandQueue; - uint32_t * pnumCommandLists; - ze_command_list_handle_t ** pphCommandLists; - ze_fence_handle_t * phFence; - } ze_command_queue_execute_command_lists_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeCommandQueueExecuteCommandLists - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnCommandQueueExecuteCommandListsCb_t)(ze_command_queue_execute_command_lists_params_t * params, ze_result_t result, - void * pTracerUserData, void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeCommandQueueSynchronize - /// @details Each entry is a pointer to the 
parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_command_queue_synchronize_params_t - { - ze_command_queue_handle_t * phCommandQueue; - uint64_t * ptimeout; - } ze_command_queue_synchronize_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeCommandQueueSynchronize - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnCommandQueueSynchronizeCb_t)(ze_command_queue_synchronize_params_t * params, ze_result_t result, - void * pTracerUserData, void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Table of CommandQueue callback functions pointers - typedef struct _ze_command_queue_callbacks_t - { - ze_pfnCommandQueueCreateCb_t pfnCreateCb; - ze_pfnCommandQueueDestroyCb_t pfnDestroyCb; - ze_pfnCommandQueueExecuteCommandListsCb_t pfnExecuteCommandListsCb; - ze_pfnCommandQueueSynchronizeCb_t pfnSynchronizeCb; - } ze_command_queue_callbacks_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeCommandListCreate - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_command_list_create_params_t - { - ze_context_handle_t * phContext; - ze_device_handle_t * phDevice; - const ze_command_list_desc_t ** pdesc; - ze_command_list_handle_t ** pphCommandList; - } ze_command_list_create_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeCommandListCreate - 
/// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnCommandListCreateCb_t)(ze_command_list_create_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeCommandListCreateImmediate - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_command_list_create_immediate_params_t - { - ze_context_handle_t * phContext; - ze_device_handle_t * phDevice; - const ze_command_queue_desc_t ** paltdesc; - ze_command_list_handle_t ** pphCommandList; - } ze_command_list_create_immediate_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeCommandListCreateImmediate - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnCommandListCreateImmediateCb_t)(ze_command_list_create_immediate_params_t * params, ze_result_t result, - void * pTracerUserData, void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeCommandListDestroy - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_command_list_destroy_params_t - { - ze_command_list_handle_t * phCommandList; - } 
ze_command_list_destroy_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeCommandListDestroy - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnCommandListDestroyCb_t)(ze_command_list_destroy_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeCommandListClose - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_command_list_close_params_t - { - ze_command_list_handle_t * phCommandList; - } ze_command_list_close_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeCommandListClose - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnCommandListCloseCb_t)(ze_command_list_close_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeCommandListReset - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_command_list_reset_params_t - { - ze_command_list_handle_t * phCommandList; - } 
ze_command_list_reset_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeCommandListReset - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnCommandListResetCb_t)(ze_command_list_reset_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeCommandListAppendWriteGlobalTimestamp - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_command_list_append_write_global_timestamp_params_t - { - ze_command_list_handle_t * phCommandList; - uint64_t ** pdstptr; - ze_event_handle_t * phSignalEvent; - uint32_t * pnumWaitEvents; - ze_event_handle_t ** pphWaitEvents; - } ze_command_list_append_write_global_timestamp_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeCommandListAppendWriteGlobalTimestamp - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnCommandListAppendWriteGlobalTimestampCb_t)(ze_command_list_append_write_global_timestamp_params_t * params, - ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeCommandListAppendBarrier - 
/// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_command_list_append_barrier_params_t - { - ze_command_list_handle_t * phCommandList; - ze_event_handle_t * phSignalEvent; - uint32_t * pnumWaitEvents; - ze_event_handle_t ** pphWaitEvents; - } ze_command_list_append_barrier_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeCommandListAppendBarrier - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnCommandListAppendBarrierCb_t)(ze_command_list_append_barrier_params_t * params, ze_result_t result, - void * pTracerUserData, void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeCommandListAppendMemoryRangesBarrier - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_command_list_append_memory_ranges_barrier_params_t - { - ze_command_list_handle_t * phCommandList; - uint32_t * pnumRanges; - const size_t ** ppRangeSizes; - const void *** ppRanges; - ze_event_handle_t * phSignalEvent; - uint32_t * pnumWaitEvents; - ze_event_handle_t ** pphWaitEvents; - } ze_command_list_append_memory_ranges_barrier_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeCommandListAppendMemoryRangesBarrier - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// 
@param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnCommandListAppendMemoryRangesBarrierCb_t)(ze_command_list_append_memory_ranges_barrier_params_t * params, - ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeCommandListAppendMemoryCopy - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_command_list_append_memory_copy_params_t - { - ze_command_list_handle_t * phCommandList; - void ** pdstptr; - const void ** psrcptr; - size_t * psize; - ze_event_handle_t * phSignalEvent; - uint32_t * pnumWaitEvents; - ze_event_handle_t ** pphWaitEvents; - } ze_command_list_append_memory_copy_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeCommandListAppendMemoryCopy - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnCommandListAppendMemoryCopyCb_t)(ze_command_list_append_memory_copy_params_t * params, ze_result_t result, - void * pTracerUserData, void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeCommandListAppendMemoryFill - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_command_list_append_memory_fill_params_t - { - ze_command_list_handle_t * phCommandList; - void ** pptr; - const void ** ppattern; - size_t * 
ppattern_size; - size_t * psize; - ze_event_handle_t * phSignalEvent; - uint32_t * pnumWaitEvents; - ze_event_handle_t ** pphWaitEvents; - } ze_command_list_append_memory_fill_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeCommandListAppendMemoryFill - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnCommandListAppendMemoryFillCb_t)(ze_command_list_append_memory_fill_params_t * params, ze_result_t result, - void * pTracerUserData, void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeCommandListAppendMemoryCopyRegion - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_command_list_append_memory_copy_region_params_t - { - ze_command_list_handle_t * phCommandList; - void ** pdstptr; - const ze_copy_region_t ** pdstRegion; - uint32_t * pdstPitch; - uint32_t * pdstSlicePitch; - const void ** psrcptr; - const ze_copy_region_t ** psrcRegion; - uint32_t * psrcPitch; - uint32_t * psrcSlicePitch; - ze_event_handle_t * phSignalEvent; - uint32_t * pnumWaitEvents; - ze_event_handle_t ** pphWaitEvents; - } ze_command_list_append_memory_copy_region_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeCommandListAppendMemoryCopyRegion - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - 
typedef void(ZE_APICALL * ze_pfnCommandListAppendMemoryCopyRegionCb_t)(ze_command_list_append_memory_copy_region_params_t * params, - ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeCommandListAppendMemoryCopyFromContext - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_command_list_append_memory_copy_from_context_params_t - { - ze_command_list_handle_t * phCommandList; - void ** pdstptr; - ze_context_handle_t * phContextSrc; - const void ** psrcptr; - size_t * psize; - ze_event_handle_t * phSignalEvent; - uint32_t * pnumWaitEvents; - ze_event_handle_t ** pphWaitEvents; - } ze_command_list_append_memory_copy_from_context_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeCommandListAppendMemoryCopyFromContext - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnCommandListAppendMemoryCopyFromContextCb_t)(ze_command_list_append_memory_copy_from_context_params_t * params, - ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeCommandListAppendImageCopy - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_command_list_append_image_copy_params_t - { - ze_command_list_handle_t * phCommandList; - ze_image_handle_t * 
phDstImage; - ze_image_handle_t * phSrcImage; - ze_event_handle_t * phSignalEvent; - uint32_t * pnumWaitEvents; - ze_event_handle_t ** pphWaitEvents; - } ze_command_list_append_image_copy_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeCommandListAppendImageCopy - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnCommandListAppendImageCopyCb_t)(ze_command_list_append_image_copy_params_t * params, ze_result_t result, - void * pTracerUserData, void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeCommandListAppendImageCopyRegion - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_command_list_append_image_copy_region_params_t - { - ze_command_list_handle_t * phCommandList; - ze_image_handle_t * phDstImage; - ze_image_handle_t * phSrcImage; - const ze_image_region_t ** ppDstRegion; - const ze_image_region_t ** ppSrcRegion; - ze_event_handle_t * phSignalEvent; - uint32_t * pnumWaitEvents; - ze_event_handle_t ** pphWaitEvents; - } ze_command_list_append_image_copy_region_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeCommandListAppendImageCopyRegion - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * 
ze_pfnCommandListAppendImageCopyRegionCb_t)(ze_command_list_append_image_copy_region_params_t * params, - ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeCommandListAppendImageCopyToMemory - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_command_list_append_image_copy_to_memory_params_t - { - ze_command_list_handle_t * phCommandList; - void ** pdstptr; - ze_image_handle_t * phSrcImage; - const ze_image_region_t ** ppSrcRegion; - ze_event_handle_t * phSignalEvent; - uint32_t * pnumWaitEvents; - ze_event_handle_t ** pphWaitEvents; - } ze_command_list_append_image_copy_to_memory_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeCommandListAppendImageCopyToMemory - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnCommandListAppendImageCopyToMemoryCb_t)(ze_command_list_append_image_copy_to_memory_params_t * params, - ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeCommandListAppendImageCopyFromMemory - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_command_list_append_image_copy_from_memory_params_t - { - ze_command_list_handle_t * phCommandList; - ze_image_handle_t * phDstImage; - const void ** psrcptr; - 
const ze_image_region_t ** ppDstRegion; - ze_event_handle_t * phSignalEvent; - uint32_t * pnumWaitEvents; - ze_event_handle_t ** pphWaitEvents; - } ze_command_list_append_image_copy_from_memory_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeCommandListAppendImageCopyFromMemory - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnCommandListAppendImageCopyFromMemoryCb_t)(ze_command_list_append_image_copy_from_memory_params_t * params, - ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeCommandListAppendMemoryPrefetch - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_command_list_append_memory_prefetch_params_t - { - ze_command_list_handle_t * phCommandList; - const void ** pptr; - size_t * psize; - } ze_command_list_append_memory_prefetch_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeCommandListAppendMemoryPrefetch - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnCommandListAppendMemoryPrefetchCb_t)(ze_command_list_append_memory_prefetch_params_t * params, ze_result_t result, - void * pTracerUserData, void ** ppTracerInstanceUserData); - - 
/////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeCommandListAppendMemAdvise - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_command_list_append_mem_advise_params_t - { - ze_command_list_handle_t * phCommandList; - ze_device_handle_t * phDevice; - const void ** pptr; - size_t * psize; - ze_memory_advice_t * padvice; - } ze_command_list_append_mem_advise_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeCommandListAppendMemAdvise - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnCommandListAppendMemAdviseCb_t)(ze_command_list_append_mem_advise_params_t * params, ze_result_t result, - void * pTracerUserData, void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeCommandListAppendSignalEvent - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_command_list_append_signal_event_params_t - { - ze_command_list_handle_t * phCommandList; - ze_event_handle_t * phEvent; - } ze_command_list_append_signal_event_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeCommandListAppendSignalEvent - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] 
ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnCommandListAppendSignalEventCb_t)(ze_command_list_append_signal_event_params_t * params, ze_result_t result, - void * pTracerUserData, void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeCommandListAppendWaitOnEvents - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_command_list_append_wait_on_events_params_t - { - ze_command_list_handle_t * phCommandList; - uint32_t * pnumEvents; - ze_event_handle_t ** pphEvents; - } ze_command_list_append_wait_on_events_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeCommandListAppendWaitOnEvents - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnCommandListAppendWaitOnEventsCb_t)(ze_command_list_append_wait_on_events_params_t * params, ze_result_t result, - void * pTracerUserData, void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeCommandListAppendEventReset - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_command_list_append_event_reset_params_t - { - ze_command_list_handle_t * phCommandList; - ze_event_handle_t * phEvent; - } ze_command_list_append_event_reset_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief 
Callback function-pointer for zeCommandListAppendEventReset - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnCommandListAppendEventResetCb_t)(ze_command_list_append_event_reset_params_t * params, ze_result_t result, - void * pTracerUserData, void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeCommandListAppendQueryKernelTimestamps - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_command_list_append_query_kernel_timestamps_params_t - { - ze_command_list_handle_t * phCommandList; - uint32_t * pnumEvents; - ze_event_handle_t ** pphEvents; - void ** pdstptr; - const size_t ** ppOffsets; - ze_event_handle_t * phSignalEvent; - uint32_t * pnumWaitEvents; - ze_event_handle_t ** pphWaitEvents; - } ze_command_list_append_query_kernel_timestamps_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeCommandListAppendQueryKernelTimestamps - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnCommandListAppendQueryKernelTimestampsCb_t)(ze_command_list_append_query_kernel_timestamps_params_t * params, - ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeCommandListAppendLaunchKernel - 
/// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_command_list_append_launch_kernel_params_t - { - ze_command_list_handle_t * phCommandList; - ze_kernel_handle_t * phKernel; - const ze_group_count_t ** ppLaunchFuncArgs; - ze_event_handle_t * phSignalEvent; - uint32_t * pnumWaitEvents; - ze_event_handle_t ** pphWaitEvents; - } ze_command_list_append_launch_kernel_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeCommandListAppendLaunchKernel - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnCommandListAppendLaunchKernelCb_t)(ze_command_list_append_launch_kernel_params_t * params, ze_result_t result, - void * pTracerUserData, void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeCommandListAppendLaunchCooperativeKernel - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_command_list_append_launch_cooperative_kernel_params_t - { - ze_command_list_handle_t * phCommandList; - ze_kernel_handle_t * phKernel; - const ze_group_count_t ** ppLaunchFuncArgs; - ze_event_handle_t * phSignalEvent; - uint32_t * pnumWaitEvents; - ze_event_handle_t ** pphWaitEvents; - } ze_command_list_append_launch_cooperative_kernel_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeCommandListAppendLaunchCooperativeKernel - /// @param[in] params Parameters passed 
to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnCommandListAppendLaunchCooperativeKernelCb_t)(ze_command_list_append_launch_cooperative_kernel_params_t * params, - ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeCommandListAppendLaunchKernelIndirect - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_command_list_append_launch_kernel_indirect_params_t - { - ze_command_list_handle_t * phCommandList; - ze_kernel_handle_t * phKernel; - const ze_group_count_t ** ppLaunchArgumentsBuffer; - ze_event_handle_t * phSignalEvent; - uint32_t * pnumWaitEvents; - ze_event_handle_t ** pphWaitEvents; - } ze_command_list_append_launch_kernel_indirect_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeCommandListAppendLaunchKernelIndirect - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnCommandListAppendLaunchKernelIndirectCb_t)(ze_command_list_append_launch_kernel_indirect_params_t * params, - ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeCommandListAppendLaunchMultipleKernelsIndirect - /// @details Each entry is a pointer to the parameter passed to the function; - 
/// allowing the callback the ability to modify the parameter's value - typedef struct _ze_command_list_append_launch_multiple_kernels_indirect_params_t - { - ze_command_list_handle_t * phCommandList; - uint32_t * pnumKernels; - ze_kernel_handle_t ** pphKernels; - const uint32_t ** ppCountBuffer; - const ze_group_count_t ** ppLaunchArgumentsBuffer; - ze_event_handle_t * phSignalEvent; - uint32_t * pnumWaitEvents; - ze_event_handle_t ** pphWaitEvents; - } ze_command_list_append_launch_multiple_kernels_indirect_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeCommandListAppendLaunchMultipleKernelsIndirect - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnCommandListAppendLaunchMultipleKernelsIndirectCb_t)( - ze_command_list_append_launch_multiple_kernels_indirect_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Table of CommandList callback functions pointers - typedef struct _ze_command_list_callbacks_t - { - ze_pfnCommandListCreateCb_t pfnCreateCb; - ze_pfnCommandListCreateImmediateCb_t pfnCreateImmediateCb; - ze_pfnCommandListDestroyCb_t pfnDestroyCb; - ze_pfnCommandListCloseCb_t pfnCloseCb; - ze_pfnCommandListResetCb_t pfnResetCb; - ze_pfnCommandListAppendWriteGlobalTimestampCb_t pfnAppendWriteGlobalTimestampCb; - ze_pfnCommandListAppendBarrierCb_t pfnAppendBarrierCb; - ze_pfnCommandListAppendMemoryRangesBarrierCb_t pfnAppendMemoryRangesBarrierCb; - ze_pfnCommandListAppendMemoryCopyCb_t pfnAppendMemoryCopyCb; - ze_pfnCommandListAppendMemoryFillCb_t pfnAppendMemoryFillCb; - ze_pfnCommandListAppendMemoryCopyRegionCb_t 
pfnAppendMemoryCopyRegionCb; - ze_pfnCommandListAppendMemoryCopyFromContextCb_t pfnAppendMemoryCopyFromContextCb; - ze_pfnCommandListAppendImageCopyCb_t pfnAppendImageCopyCb; - ze_pfnCommandListAppendImageCopyRegionCb_t pfnAppendImageCopyRegionCb; - ze_pfnCommandListAppendImageCopyToMemoryCb_t pfnAppendImageCopyToMemoryCb; - ze_pfnCommandListAppendImageCopyFromMemoryCb_t pfnAppendImageCopyFromMemoryCb; - ze_pfnCommandListAppendMemoryPrefetchCb_t pfnAppendMemoryPrefetchCb; - ze_pfnCommandListAppendMemAdviseCb_t pfnAppendMemAdviseCb; - ze_pfnCommandListAppendSignalEventCb_t pfnAppendSignalEventCb; - ze_pfnCommandListAppendWaitOnEventsCb_t pfnAppendWaitOnEventsCb; - ze_pfnCommandListAppendEventResetCb_t pfnAppendEventResetCb; - ze_pfnCommandListAppendQueryKernelTimestampsCb_t pfnAppendQueryKernelTimestampsCb; - ze_pfnCommandListAppendLaunchKernelCb_t pfnAppendLaunchKernelCb; - ze_pfnCommandListAppendLaunchCooperativeKernelCb_t pfnAppendLaunchCooperativeKernelCb; - ze_pfnCommandListAppendLaunchKernelIndirectCb_t pfnAppendLaunchKernelIndirectCb; - ze_pfnCommandListAppendLaunchMultipleKernelsIndirectCb_t pfnAppendLaunchMultipleKernelsIndirectCb; - } ze_command_list_callbacks_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeFenceCreate - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_fence_create_params_t - { - ze_command_queue_handle_t * phCommandQueue; - const ze_fence_desc_t ** pdesc; - ze_fence_handle_t ** pphFence; - } ze_fence_create_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeFenceCreate - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] 
ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnFenceCreateCb_t)(ze_fence_create_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeFenceDestroy - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_fence_destroy_params_t - { - ze_fence_handle_t * phFence; - } ze_fence_destroy_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeFenceDestroy - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnFenceDestroyCb_t)(ze_fence_destroy_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeFenceHostSynchronize - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_fence_host_synchronize_params_t - { - ze_fence_handle_t * phFence; - uint64_t * ptimeout; - } ze_fence_host_synchronize_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeFenceHostSynchronize - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, 
Per-Instance user data - typedef void(ZE_APICALL * ze_pfnFenceHostSynchronizeCb_t)(ze_fence_host_synchronize_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeFenceQueryStatus - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_fence_query_status_params_t - { - ze_fence_handle_t * phFence; - } ze_fence_query_status_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeFenceQueryStatus - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnFenceQueryStatusCb_t)(ze_fence_query_status_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeFenceReset - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_fence_reset_params_t - { - ze_fence_handle_t * phFence; - } ze_fence_reset_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeFenceReset - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * 
ze_pfnFenceResetCb_t)(ze_fence_reset_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Table of Fence callback functions pointers - typedef struct _ze_fence_callbacks_t - { - ze_pfnFenceCreateCb_t pfnCreateCb; - ze_pfnFenceDestroyCb_t pfnDestroyCb; - ze_pfnFenceHostSynchronizeCb_t pfnHostSynchronizeCb; - ze_pfnFenceQueryStatusCb_t pfnQueryStatusCb; - ze_pfnFenceResetCb_t pfnResetCb; - } ze_fence_callbacks_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeEventPoolCreate - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_event_pool_create_params_t - { - ze_context_handle_t * phContext; - const ze_event_pool_desc_t ** pdesc; - uint32_t * pnumDevices; - ze_device_handle_t ** pphDevices; - ze_event_pool_handle_t ** pphEventPool; - } ze_event_pool_create_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeEventPoolCreate - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnEventPoolCreateCb_t)(ze_event_pool_create_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeEventPoolDestroy - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct 
_ze_event_pool_destroy_params_t - { - ze_event_pool_handle_t * phEventPool; - } ze_event_pool_destroy_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeEventPoolDestroy - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnEventPoolDestroyCb_t)(ze_event_pool_destroy_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeEventPoolGetIpcHandle - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_event_pool_get_ipc_handle_params_t - { - ze_event_pool_handle_t * phEventPool; - ze_ipc_event_pool_handle_t ** pphIpc; - } ze_event_pool_get_ipc_handle_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeEventPoolGetIpcHandle - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnEventPoolGetIpcHandleCb_t)(ze_event_pool_get_ipc_handle_params_t * params, ze_result_t result, - void * pTracerUserData, void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeEventPoolOpenIpcHandle - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the 
ability to modify the parameter's value - typedef struct _ze_event_pool_open_ipc_handle_params_t - { - ze_context_handle_t * phContext; - ze_ipc_event_pool_handle_t * phIpc; - ze_event_pool_handle_t ** pphEventPool; - } ze_event_pool_open_ipc_handle_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeEventPoolOpenIpcHandle - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnEventPoolOpenIpcHandleCb_t)(ze_event_pool_open_ipc_handle_params_t * params, ze_result_t result, - void * pTracerUserData, void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeEventPoolCloseIpcHandle - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_event_pool_close_ipc_handle_params_t - { - ze_event_pool_handle_t * phEventPool; - } ze_event_pool_close_ipc_handle_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeEventPoolCloseIpcHandle - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnEventPoolCloseIpcHandleCb_t)(ze_event_pool_close_ipc_handle_params_t * params, ze_result_t result, - void * pTracerUserData, void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Table of EventPool callback 
functions pointers - typedef struct _ze_event_pool_callbacks_t - { - ze_pfnEventPoolCreateCb_t pfnCreateCb; - ze_pfnEventPoolDestroyCb_t pfnDestroyCb; - ze_pfnEventPoolGetIpcHandleCb_t pfnGetIpcHandleCb; - ze_pfnEventPoolOpenIpcHandleCb_t pfnOpenIpcHandleCb; - ze_pfnEventPoolCloseIpcHandleCb_t pfnCloseIpcHandleCb; - } ze_event_pool_callbacks_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeEventCreate - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_event_create_params_t - { - ze_event_pool_handle_t * phEventPool; - const ze_event_desc_t ** pdesc; - ze_event_handle_t ** pphEvent; - } ze_event_create_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeEventCreate - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnEventCreateCb_t)(ze_event_create_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeEventDestroy - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_event_destroy_params_t - { - ze_event_handle_t * phEvent; - } ze_event_destroy_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeEventDestroy - /// @param[in] params Parameters passed to this instance - /// @param[in] result 
Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnEventDestroyCb_t)(ze_event_destroy_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeEventHostSignal - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_event_host_signal_params_t - { - ze_event_handle_t * phEvent; - } ze_event_host_signal_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeEventHostSignal - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnEventHostSignalCb_t)(ze_event_host_signal_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeEventHostSynchronize - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_event_host_synchronize_params_t - { - ze_event_handle_t * phEvent; - uint64_t * ptimeout; - } ze_event_host_synchronize_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeEventHostSynchronize - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// 
@param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnEventHostSynchronizeCb_t)(ze_event_host_synchronize_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeEventQueryStatus - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_event_query_status_params_t - { - ze_event_handle_t * phEvent; - } ze_event_query_status_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeEventQueryStatus - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnEventQueryStatusCb_t)(ze_event_query_status_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeEventHostReset - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_event_host_reset_params_t - { - ze_event_handle_t * phEvent; - } ze_event_host_reset_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeEventHostReset - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data 
- /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnEventHostResetCb_t)(ze_event_host_reset_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeEventQueryKernelTimestamp - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_event_query_kernel_timestamp_params_t - { - ze_event_handle_t * phEvent; - ze_kernel_timestamp_result_t ** pdstptr; - } ze_event_query_kernel_timestamp_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeEventQueryKernelTimestamp - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnEventQueryKernelTimestampCb_t)(ze_event_query_kernel_timestamp_params_t * params, ze_result_t result, - void * pTracerUserData, void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Table of Event callback functions pointers - typedef struct _ze_event_callbacks_t - { - ze_pfnEventCreateCb_t pfnCreateCb; - ze_pfnEventDestroyCb_t pfnDestroyCb; - ze_pfnEventHostSignalCb_t pfnHostSignalCb; - ze_pfnEventHostSynchronizeCb_t pfnHostSynchronizeCb; - ze_pfnEventQueryStatusCb_t pfnQueryStatusCb; - ze_pfnEventHostResetCb_t pfnHostResetCb; - ze_pfnEventQueryKernelTimestampCb_t pfnQueryKernelTimestampCb; - } ze_event_callbacks_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters 
for zeImageGetProperties - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_image_get_properties_params_t - { - ze_device_handle_t * phDevice; - const ze_image_desc_t ** pdesc; - ze_image_properties_t ** ppImageProperties; - } ze_image_get_properties_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeImageGetProperties - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnImageGetPropertiesCb_t)(ze_image_get_properties_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeImageCreate - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_image_create_params_t - { - ze_context_handle_t * phContext; - ze_device_handle_t * phDevice; - const ze_image_desc_t ** pdesc; - ze_image_handle_t ** pphImage; - } ze_image_create_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeImageCreate - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnImageCreateCb_t)(ze_image_create_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - 
/////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeImageDestroy - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_image_destroy_params_t - { - ze_image_handle_t * phImage; - } ze_image_destroy_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeImageDestroy - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnImageDestroyCb_t)(ze_image_destroy_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Table of Image callback functions pointers - typedef struct _ze_image_callbacks_t - { - ze_pfnImageGetPropertiesCb_t pfnGetPropertiesCb; - ze_pfnImageCreateCb_t pfnCreateCb; - ze_pfnImageDestroyCb_t pfnDestroyCb; - } ze_image_callbacks_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeModuleCreate - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_module_create_params_t - { - ze_context_handle_t * phContext; - ze_device_handle_t * phDevice; - const ze_module_desc_t ** pdesc; - ze_module_handle_t ** pphModule; - ze_module_build_log_handle_t ** pphBuildLog; - } ze_module_create_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeModuleCreate - /// 
@param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnModuleCreateCb_t)(ze_module_create_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeModuleDestroy - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_module_destroy_params_t - { - ze_module_handle_t * phModule; - } ze_module_destroy_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeModuleDestroy - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnModuleDestroyCb_t)(ze_module_destroy_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeModuleDynamicLink - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_module_dynamic_link_params_t - { - uint32_t * pnumModules; - ze_module_handle_t ** pphModules; - ze_module_build_log_handle_t ** pphLinkLog; - } ze_module_dynamic_link_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeModuleDynamicLink 
- /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnModuleDynamicLinkCb_t)(ze_module_dynamic_link_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeModuleGetNativeBinary - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_module_get_native_binary_params_t - { - ze_module_handle_t * phModule; - size_t ** ppSize; - uint8_t ** ppModuleNativeBinary; - } ze_module_get_native_binary_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeModuleGetNativeBinary - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnModuleGetNativeBinaryCb_t)(ze_module_get_native_binary_params_t * params, ze_result_t result, - void * pTracerUserData, void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeModuleGetGlobalPointer - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_module_get_global_pointer_params_t - { - ze_module_handle_t * phModule; - const char ** ppGlobalName; - size_t ** ppSize; - void *** ppptr; - } ze_module_get_global_pointer_params_t; - - 
/////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeModuleGetGlobalPointer - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnModuleGetGlobalPointerCb_t)(ze_module_get_global_pointer_params_t * params, ze_result_t result, - void * pTracerUserData, void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeModuleGetKernelNames - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_module_get_kernel_names_params_t - { - ze_module_handle_t * phModule; - uint32_t ** ppCount; - const char *** ppNames; - } ze_module_get_kernel_names_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeModuleGetKernelNames - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnModuleGetKernelNamesCb_t)(ze_module_get_kernel_names_params_t * params, ze_result_t result, - void * pTracerUserData, void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeModuleGetProperties - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_module_get_properties_params_t - { - 
ze_module_handle_t * phModule; - ze_module_properties_t ** ppModuleProperties; - } ze_module_get_properties_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeModuleGetProperties - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnModuleGetPropertiesCb_t)(ze_module_get_properties_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeModuleGetFunctionPointer - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_module_get_function_pointer_params_t - { - ze_module_handle_t * phModule; - const char ** ppFunctionName; - void *** ppfnFunction; - } ze_module_get_function_pointer_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeModuleGetFunctionPointer - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnModuleGetFunctionPointerCb_t)(ze_module_get_function_pointer_params_t * params, ze_result_t result, - void * pTracerUserData, void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Table of Module callback functions pointers - typedef struct _ze_module_callbacks_t - { - ze_pfnModuleCreateCb_t pfnCreateCb; - 
ze_pfnModuleDestroyCb_t pfnDestroyCb; - ze_pfnModuleDynamicLinkCb_t pfnDynamicLinkCb; - ze_pfnModuleGetNativeBinaryCb_t pfnGetNativeBinaryCb; - ze_pfnModuleGetGlobalPointerCb_t pfnGetGlobalPointerCb; - ze_pfnModuleGetKernelNamesCb_t pfnGetKernelNamesCb; - ze_pfnModuleGetPropertiesCb_t pfnGetPropertiesCb; - ze_pfnModuleGetFunctionPointerCb_t pfnGetFunctionPointerCb; - } ze_module_callbacks_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeModuleBuildLogDestroy - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_module_build_log_destroy_params_t - { - ze_module_build_log_handle_t * phModuleBuildLog; - } ze_module_build_log_destroy_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeModuleBuildLogDestroy - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnModuleBuildLogDestroyCb_t)(ze_module_build_log_destroy_params_t * params, ze_result_t result, - void * pTracerUserData, void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeModuleBuildLogGetString - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_module_build_log_get_string_params_t - { - ze_module_build_log_handle_t * phModuleBuildLog; - size_t ** ppSize; - char ** ppBuildLog; - } ze_module_build_log_get_string_params_t; - - 
/////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeModuleBuildLogGetString - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnModuleBuildLogGetStringCb_t)(ze_module_build_log_get_string_params_t * params, ze_result_t result, - void * pTracerUserData, void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Table of ModuleBuildLog callback functions pointers - typedef struct _ze_module_build_log_callbacks_t - { - ze_pfnModuleBuildLogDestroyCb_t pfnDestroyCb; - ze_pfnModuleBuildLogGetStringCb_t pfnGetStringCb; - } ze_module_build_log_callbacks_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeKernelCreate - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_kernel_create_params_t - { - ze_module_handle_t * phModule; - const ze_kernel_desc_t ** pdesc; - ze_kernel_handle_t ** pphKernel; - } ze_kernel_create_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeKernelCreate - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnKernelCreateCb_t)(ze_kernel_create_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - 
/////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeKernelDestroy - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_kernel_destroy_params_t - { - ze_kernel_handle_t * phKernel; - } ze_kernel_destroy_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeKernelDestroy - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnKernelDestroyCb_t)(ze_kernel_destroy_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeKernelSetCacheConfig - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_kernel_set_cache_config_params_t - { - ze_kernel_handle_t * phKernel; - ze_cache_config_flags_t * pflags; - } ze_kernel_set_cache_config_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeKernelSetCacheConfig - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnKernelSetCacheConfigCb_t)(ze_kernel_set_cache_config_params_t * params, ze_result_t result, - void * pTracerUserData, void ** ppTracerInstanceUserData); - - 
/////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeKernelSetGroupSize - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_kernel_set_group_size_params_t - { - ze_kernel_handle_t * phKernel; - uint32_t * pgroupSizeX; - uint32_t * pgroupSizeY; - uint32_t * pgroupSizeZ; - } ze_kernel_set_group_size_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeKernelSetGroupSize - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnKernelSetGroupSizeCb_t)(ze_kernel_set_group_size_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeKernelSuggestGroupSize - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_kernel_suggest_group_size_params_t - { - ze_kernel_handle_t * phKernel; - uint32_t * pglobalSizeX; - uint32_t * pglobalSizeY; - uint32_t * pglobalSizeZ; - uint32_t ** pgroupSizeX; - uint32_t ** pgroupSizeY; - uint32_t ** pgroupSizeZ; - } ze_kernel_suggest_group_size_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeKernelSuggestGroupSize - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] 
ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnKernelSuggestGroupSizeCb_t)(ze_kernel_suggest_group_size_params_t * params, ze_result_t result, - void * pTracerUserData, void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeKernelSuggestMaxCooperativeGroupCount - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_kernel_suggest_max_cooperative_group_count_params_t - { - ze_kernel_handle_t * phKernel; - uint32_t ** ptotalGroupCount; - } ze_kernel_suggest_max_cooperative_group_count_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeKernelSuggestMaxCooperativeGroupCount - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnKernelSuggestMaxCooperativeGroupCountCb_t)(ze_kernel_suggest_max_cooperative_group_count_params_t * params, - ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeKernelSetArgumentValue - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_kernel_set_argument_value_params_t - { - ze_kernel_handle_t * phKernel; - uint32_t * pargIndex; - size_t * pargSize; - const void ** ppArgValue; - } ze_kernel_set_argument_value_params_t; - - /////////////////////////////////////////////////////////////////////////////// - 
/// @brief Callback function-pointer for zeKernelSetArgumentValue - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnKernelSetArgumentValueCb_t)(ze_kernel_set_argument_value_params_t * params, ze_result_t result, - void * pTracerUserData, void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeKernelSetIndirectAccess - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_kernel_set_indirect_access_params_t - { - ze_kernel_handle_t * phKernel; - ze_kernel_indirect_access_flags_t * pflags; - } ze_kernel_set_indirect_access_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeKernelSetIndirectAccess - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnKernelSetIndirectAccessCb_t)(ze_kernel_set_indirect_access_params_t * params, ze_result_t result, - void * pTracerUserData, void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeKernelGetIndirectAccess - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_kernel_get_indirect_access_params_t - { - ze_kernel_handle_t * phKernel; - ze_kernel_indirect_access_flags_t ** 
ppFlags; - } ze_kernel_get_indirect_access_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeKernelGetIndirectAccess - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnKernelGetIndirectAccessCb_t)(ze_kernel_get_indirect_access_params_t * params, ze_result_t result, - void * pTracerUserData, void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeKernelGetSourceAttributes - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_kernel_get_source_attributes_params_t - { - ze_kernel_handle_t * phKernel; - uint32_t ** ppSize; - char *** ppString; - } ze_kernel_get_source_attributes_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeKernelGetSourceAttributes - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnKernelGetSourceAttributesCb_t)(ze_kernel_get_source_attributes_params_t * params, ze_result_t result, - void * pTracerUserData, void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeKernelGetProperties - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the 
parameter's value - typedef struct _ze_kernel_get_properties_params_t - { - ze_kernel_handle_t * phKernel; - ze_kernel_properties_t ** ppKernelProperties; - } ze_kernel_get_properties_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeKernelGetProperties - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnKernelGetPropertiesCb_t)(ze_kernel_get_properties_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeKernelGetName - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_kernel_get_name_params_t - { - ze_kernel_handle_t * phKernel; - size_t ** ppSize; - char ** ppName; - } ze_kernel_get_name_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeKernelGetName - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnKernelGetNameCb_t)(ze_kernel_get_name_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Table of Kernel callback functions pointers - typedef struct _ze_kernel_callbacks_t - { - ze_pfnKernelCreateCb_t pfnCreateCb; - 
ze_pfnKernelDestroyCb_t pfnDestroyCb; - ze_pfnKernelSetCacheConfigCb_t pfnSetCacheConfigCb; - ze_pfnKernelSetGroupSizeCb_t pfnSetGroupSizeCb; - ze_pfnKernelSuggestGroupSizeCb_t pfnSuggestGroupSizeCb; - ze_pfnKernelSuggestMaxCooperativeGroupCountCb_t pfnSuggestMaxCooperativeGroupCountCb; - ze_pfnKernelSetArgumentValueCb_t pfnSetArgumentValueCb; - ze_pfnKernelSetIndirectAccessCb_t pfnSetIndirectAccessCb; - ze_pfnKernelGetIndirectAccessCb_t pfnGetIndirectAccessCb; - ze_pfnKernelGetSourceAttributesCb_t pfnGetSourceAttributesCb; - ze_pfnKernelGetPropertiesCb_t pfnGetPropertiesCb; - ze_pfnKernelGetNameCb_t pfnGetNameCb; - } ze_kernel_callbacks_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeSamplerCreate - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_sampler_create_params_t - { - ze_context_handle_t * phContext; - ze_device_handle_t * phDevice; - const ze_sampler_desc_t ** pdesc; - ze_sampler_handle_t ** pphSampler; - } ze_sampler_create_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeSamplerCreate - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnSamplerCreateCb_t)(ze_sampler_create_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeSamplerDestroy - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the 
parameter's value - typedef struct _ze_sampler_destroy_params_t - { - ze_sampler_handle_t * phSampler; - } ze_sampler_destroy_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeSamplerDestroy - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnSamplerDestroyCb_t)(ze_sampler_destroy_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Table of Sampler callback functions pointers - typedef struct _ze_sampler_callbacks_t - { - ze_pfnSamplerCreateCb_t pfnCreateCb; - ze_pfnSamplerDestroyCb_t pfnDestroyCb; - } ze_sampler_callbacks_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zePhysicalMemCreate - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_physical_mem_create_params_t - { - ze_context_handle_t * phContext; - ze_device_handle_t * phDevice; - ze_physical_mem_desc_t ** pdesc; - ze_physical_mem_handle_t ** pphPhysicalMemory; - } ze_physical_mem_create_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zePhysicalMemCreate - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnPhysicalMemCreateCb_t)(ze_physical_mem_create_params_t * 
params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zePhysicalMemDestroy - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_physical_mem_destroy_params_t - { - ze_context_handle_t * phContext; - ze_physical_mem_handle_t * phPhysicalMemory; - } ze_physical_mem_destroy_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zePhysicalMemDestroy - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnPhysicalMemDestroyCb_t)(ze_physical_mem_destroy_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Table of PhysicalMem callback functions pointers - typedef struct _ze_physical_mem_callbacks_t - { - ze_pfnPhysicalMemCreateCb_t pfnCreateCb; - ze_pfnPhysicalMemDestroyCb_t pfnDestroyCb; - } ze_physical_mem_callbacks_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeMemAllocShared - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_mem_alloc_shared_params_t - { - ze_context_handle_t * phContext; - const ze_device_mem_alloc_desc_t ** pdevice_desc; - const ze_host_mem_alloc_desc_t ** phost_desc; - size_t * psize; - size_t * palignment; - ze_device_handle_t * 
phDevice; - void *** ppptr; - } ze_mem_alloc_shared_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeMemAllocShared - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnMemAllocSharedCb_t)(ze_mem_alloc_shared_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeMemAllocDevice - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_mem_alloc_device_params_t - { - ze_context_handle_t * phContext; - const ze_device_mem_alloc_desc_t ** pdevice_desc; - size_t * psize; - size_t * palignment; - ze_device_handle_t * phDevice; - void *** ppptr; - } ze_mem_alloc_device_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeMemAllocDevice - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnMemAllocDeviceCb_t)(ze_mem_alloc_device_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeMemAllocHost - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify 
the parameter's value - typedef struct _ze_mem_alloc_host_params_t - { - ze_context_handle_t * phContext; - const ze_host_mem_alloc_desc_t ** phost_desc; - size_t * psize; - size_t * palignment; - void *** ppptr; - } ze_mem_alloc_host_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeMemAllocHost - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnMemAllocHostCb_t)(ze_mem_alloc_host_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeMemFree - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_mem_free_params_t - { - ze_context_handle_t * phContext; - void ** pptr; - } ze_mem_free_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeMemFree - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnMemFreeCb_t)(ze_mem_free_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeMemGetAllocProperties - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the 
ability to modify the parameter's value - typedef struct _ze_mem_get_alloc_properties_params_t - { - ze_context_handle_t * phContext; - const void ** pptr; - ze_memory_allocation_properties_t ** ppMemAllocProperties; - ze_device_handle_t ** pphDevice; - } ze_mem_get_alloc_properties_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeMemGetAllocProperties - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnMemGetAllocPropertiesCb_t)(ze_mem_get_alloc_properties_params_t * params, ze_result_t result, - void * pTracerUserData, void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeMemGetAddressRange - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_mem_get_address_range_params_t - { - ze_context_handle_t * phContext; - const void ** pptr; - void *** ppBase; - size_t ** ppSize; - } ze_mem_get_address_range_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeMemGetAddressRange - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnMemGetAddressRangeCb_t)(ze_mem_get_address_range_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - 
/////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeMemGetIpcHandle - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_mem_get_ipc_handle_params_t - { - ze_context_handle_t * phContext; - const void ** pptr; - ze_ipc_mem_handle_t ** ppIpcHandle; - } ze_mem_get_ipc_handle_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeMemGetIpcHandle - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnMemGetIpcHandleCb_t)(ze_mem_get_ipc_handle_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeMemOpenIpcHandle - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_mem_open_ipc_handle_params_t - { - ze_context_handle_t * phContext; - ze_device_handle_t * phDevice; - ze_ipc_mem_handle_t * phandle; - ze_ipc_memory_flags_t * pflags; - void *** ppptr; - } ze_mem_open_ipc_handle_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeMemOpenIpcHandle - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * 
ze_pfnMemOpenIpcHandleCb_t)(ze_mem_open_ipc_handle_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeMemCloseIpcHandle - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_mem_close_ipc_handle_params_t - { - ze_context_handle_t * phContext; - const void ** pptr; - } ze_mem_close_ipc_handle_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeMemCloseIpcHandle - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnMemCloseIpcHandleCb_t)(ze_mem_close_ipc_handle_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Table of Mem callback functions pointers - typedef struct _ze_mem_callbacks_t - { - ze_pfnMemAllocSharedCb_t pfnAllocSharedCb; - ze_pfnMemAllocDeviceCb_t pfnAllocDeviceCb; - ze_pfnMemAllocHostCb_t pfnAllocHostCb; - ze_pfnMemFreeCb_t pfnFreeCb; - ze_pfnMemGetAllocPropertiesCb_t pfnGetAllocPropertiesCb; - ze_pfnMemGetAddressRangeCb_t pfnGetAddressRangeCb; - ze_pfnMemGetIpcHandleCb_t pfnGetIpcHandleCb; - ze_pfnMemOpenIpcHandleCb_t pfnOpenIpcHandleCb; - ze_pfnMemCloseIpcHandleCb_t pfnCloseIpcHandleCb; - } ze_mem_callbacks_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeVirtualMemReserve - /// @details Each entry is a pointer to the parameter passed 
to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_virtual_mem_reserve_params_t - { - ze_context_handle_t * phContext; - const void ** ppStart; - size_t * psize; - void *** ppptr; - } ze_virtual_mem_reserve_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeVirtualMemReserve - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnVirtualMemReserveCb_t)(ze_virtual_mem_reserve_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeVirtualMemFree - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_virtual_mem_free_params_t - { - ze_context_handle_t * phContext; - const void ** pptr; - size_t * psize; - } ze_virtual_mem_free_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeVirtualMemFree - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnVirtualMemFreeCb_t)(ze_virtual_mem_free_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeVirtualMemQueryPageSize - 
/// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_virtual_mem_query_page_size_params_t - { - ze_context_handle_t * phContext; - ze_device_handle_t * phDevice; - size_t * psize; - size_t ** ppagesize; - } ze_virtual_mem_query_page_size_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeVirtualMemQueryPageSize - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnVirtualMemQueryPageSizeCb_t)(ze_virtual_mem_query_page_size_params_t * params, ze_result_t result, - void * pTracerUserData, void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeVirtualMemMap - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_virtual_mem_map_params_t - { - ze_context_handle_t * phContext; - const void ** pptr; - size_t * psize; - ze_physical_mem_handle_t * phPhysicalMemory; - size_t * poffset; - ze_memory_access_attribute_t * paccess; - } ze_virtual_mem_map_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeVirtualMemMap - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnVirtualMemMapCb_t)(ze_virtual_mem_map_params_t * params, ze_result_t result, void 
* pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeVirtualMemUnmap - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_virtual_mem_unmap_params_t - { - ze_context_handle_t * phContext; - const void ** pptr; - size_t * psize; - } ze_virtual_mem_unmap_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeVirtualMemUnmap - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnVirtualMemUnmapCb_t)(ze_virtual_mem_unmap_params_t * params, ze_result_t result, void * pTracerUserData, - void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeVirtualMemSetAccessAttribute - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_virtual_mem_set_access_attribute_params_t - { - ze_context_handle_t * phContext; - const void ** pptr; - size_t * psize; - ze_memory_access_attribute_t * paccess; - } ze_virtual_mem_set_access_attribute_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeVirtualMemSetAccessAttribute - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance 
user data - typedef void(ZE_APICALL * ze_pfnVirtualMemSetAccessAttributeCb_t)(ze_virtual_mem_set_access_attribute_params_t * params, ze_result_t result, - void * pTracerUserData, void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function parameters for zeVirtualMemGetAccessAttribute - /// @details Each entry is a pointer to the parameter passed to the function; - /// allowing the callback the ability to modify the parameter's value - typedef struct _ze_virtual_mem_get_access_attribute_params_t - { - ze_context_handle_t * phContext; - const void ** pptr; - size_t * psize; - ze_memory_access_attribute_t ** paccess; - size_t ** poutSize; - } ze_virtual_mem_get_access_attribute_params_t; - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Callback function-pointer for zeVirtualMemGetAccessAttribute - /// @param[in] params Parameters passed to this instance - /// @param[in] result Return value - /// @param[in] pTracerUserData Per-Tracer user data - /// @param[in,out] ppTracerInstanceUserData Per-Tracer, Per-Instance user data - typedef void(ZE_APICALL * ze_pfnVirtualMemGetAccessAttributeCb_t)(ze_virtual_mem_get_access_attribute_params_t * params, ze_result_t result, - void * pTracerUserData, void ** ppTracerInstanceUserData); - - /////////////////////////////////////////////////////////////////////////////// - /// @brief Table of VirtualMem callback functions pointers - typedef struct _ze_virtual_mem_callbacks_t - { - ze_pfnVirtualMemReserveCb_t pfnReserveCb; - ze_pfnVirtualMemFreeCb_t pfnFreeCb; - ze_pfnVirtualMemQueryPageSizeCb_t pfnQueryPageSizeCb; - ze_pfnVirtualMemMapCb_t pfnMapCb; - ze_pfnVirtualMemUnmapCb_t pfnUnmapCb; - ze_pfnVirtualMemSetAccessAttributeCb_t pfnSetAccessAttributeCb; - ze_pfnVirtualMemGetAccessAttributeCb_t pfnGetAccessAttributeCb; - } ze_virtual_mem_callbacks_t; - - 
/////////////////////////////////////////////////////////////////////////////// - /// @brief Container for all callbacks - typedef struct _ze_callbacks_t - { - ze_global_callbacks_t Global; - ze_driver_callbacks_t Driver; - ze_device_callbacks_t Device; - ze_context_callbacks_t Context; - ze_command_queue_callbacks_t CommandQueue; - ze_command_list_callbacks_t CommandList; - ze_fence_callbacks_t Fence; - ze_event_pool_callbacks_t EventPool; - ze_event_callbacks_t Event; - ze_image_callbacks_t Image; - ze_module_callbacks_t Module; - ze_module_build_log_callbacks_t ModuleBuildLog; - ze_kernel_callbacks_t Kernel; - ze_sampler_callbacks_t Sampler; - ze_physical_mem_callbacks_t PhysicalMem; - ze_mem_callbacks_t Mem; - ze_virtual_mem_callbacks_t VirtualMem; - } ze_callbacks_t; - -#if !defined(__GNUC__) - #pragma endregion -#endif - -#if defined(__cplusplus) -} // extern "C" -#endif - -#endif // _ZE_API_H diff --git a/cpp/daal/include/services/internal/sycl/math/blas_executor.h b/cpp/daal/include/services/internal/sycl/math/blas_executor.h deleted file mode 100644 index 2bb7a1f53ce..00000000000 --- a/cpp/daal/include/services/internal/sycl/math/blas_executor.h +++ /dev/null @@ -1,309 +0,0 @@ -/* file: blas_executor.h */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -#ifndef __ONEAPI_INTERNAL_MATH_BLAS_EXECUTOR_H__ -#define __ONEAPI_INTERNAL_MATH_BLAS_EXECUTOR_H__ - -/* -//++ -// Executors for BLAS functions -//-- -*/ - -#include - -#if (!defined(ONEAPI_DAAL_NO_MKL_GPU_FUNC) && defined(__SYCL_COMPILER_VERSION)) - #include "services/internal/sycl/math/mkl_blas.h" -#endif - -#include "services/internal/sycl/types_utils.h" -#include "services/internal/sycl/math/types.h" -#include "services/internal/sycl/math/reference_gemm.h" -#include "services/internal/sycl/math/reference_axpy.h" - -namespace daal -{ -namespace services -{ -namespace internal -{ -namespace sycl -{ -namespace math -{ -namespace interface1 -{ -/** - * @defgroup oneapi_internal oneAPIInternal - * \brief Contains classes of SYCL* abstraction layer - * @{ - */ - -/** - * - * \brief Adapter for GEMM routine - */ -class GemmExecutor -{ -private: - struct Execute - { - ::sycl::queue & queue; - const math::Transpose transa; - const math::Transpose transb; - const size_t m; - const size_t n; - const size_t k; - const double alpha; - const UniversalBuffer & a_buffer; - const size_t lda; - const size_t offsetA; - const UniversalBuffer & b_buffer; - const size_t ldb; - const size_t offsetB; - const double beta; - UniversalBuffer & c_buffer; - const size_t ldc; - const size_t offsetC; - - explicit Execute(::sycl::queue & queue, const math::Transpose transa, const math::Transpose transb, const size_t m, const size_t n, - const size_t k, const double alpha, const UniversalBuffer & a_buffer, const size_t lda, const size_t offsetA, - const UniversalBuffer & b_buffer, const size_t ldb, const size_t offsetB, const double beta, UniversalBuffer & c_buffer, - const size_t ldc, const size_t offsetC) - : queue(queue), - transa(transa), - transb(transb), - m(m), - n(n), - k(k), - alpha(alpha), - a_buffer(a_buffer), - lda(lda), - offsetA(offsetA), - b_buffer(b_buffer), - ldb(ldb), - offsetB(offsetB), - 
beta(beta), - c_buffer(c_buffer), - ldc(ldc), - offsetC(offsetC) - {} - - size_t getAExpectedSize() const { return (transa == math::Transpose::NoTrans) ? lda * k : lda * m; } - - size_t getBExpectedSize() const { return (transb == math::Transpose::NoTrans) ? ldb * n : ldb * k; } - - size_t getCExpectedSize() const { return ldc * n; } - - template - void operator()(Typelist, Status & status) - { - DAAL_ASSERT_UNIVERSAL_BUFFER(a_buffer, T, getAExpectedSize()); - DAAL_ASSERT_UNIVERSAL_BUFFER(b_buffer, T, getBExpectedSize()); - DAAL_ASSERT_UNIVERSAL_BUFFER(c_buffer, T, getCExpectedSize()); - - auto a_buffer_t = a_buffer.template get(); - auto b_buffer_t = b_buffer.template get(); - auto c_buffer_t = c_buffer.template get(); - -#ifdef ONEAPI_DAAL_NO_MKL_GPU_FUNC - ReferenceGemm functor; -#else - MKLGemm functor(queue); -#endif - - status |= - functor(transa, transb, m, n, k, (T)alpha, a_buffer_t, lda, offsetA, b_buffer_t, ldb, offsetB, (T)beta, c_buffer_t, ldc, offsetC); - } - }; - -public: - static void run(::sycl::queue & queue, const math::Transpose transa, const math::Transpose transb, const size_t m, const size_t n, const size_t k, - const double alpha, const UniversalBuffer & a_buffer, const size_t lda, const size_t offsetA, const UniversalBuffer & b_buffer, - const size_t ldb, const size_t offsetB, const double beta, UniversalBuffer & c_buffer, const size_t ldc, const size_t offsetC, - Status & status) - { - DAAL_ASSERT(!a_buffer.empty()); - DAAL_ASSERT(!b_buffer.empty()); - DAAL_ASSERT(!c_buffer.empty()); - DAAL_ASSERT(a_buffer.type() == b_buffer.type()); - DAAL_ASSERT(b_buffer.type() == c_buffer.type()); - - Execute op(queue, transa, transb, m, n, k, alpha, a_buffer, lda, offsetA, b_buffer, ldb, offsetB, beta, c_buffer, ldc, offsetC); - TypeDispatcher::floatDispatch(a_buffer.type(), op, status); - } -}; - -/** - * - * \brief Adapter for SYRK routine - */ -class SyrkExecutor -{ -private: - struct Execute - { - ::sycl::queue & queue; - const math::UpLo 
upper_lower; - const math::Transpose trans; - const size_t n; - const size_t k; - const double alpha; - const UniversalBuffer & a_buffer; - const size_t lda; - const size_t offsetA; - const double beta; - UniversalBuffer & c_buffer; - const size_t ldc; - const size_t offsetC; - - explicit Execute(::sycl::queue & queue, const math::UpLo upper_lower, const math::Transpose trans, const size_t n, const size_t k, - const double alpha, const UniversalBuffer & a_buffer, const size_t lda, const size_t offsetA, const double beta, - UniversalBuffer & c_buffer, const size_t ldc, const size_t offsetC) - : queue(queue), - upper_lower(upper_lower), - trans(trans), - n(n), - k(k), - alpha(alpha), - a_buffer(a_buffer), - lda(lda), - offsetA(offsetA), - beta(beta), - c_buffer(c_buffer), - ldc(ldc), - offsetC(offsetC) - {} - - size_t getAExpectedSize() const { return (trans == math::Transpose::NoTrans) ? lda * k : lda * n; } - - size_t getCExpectedSize() const { return (trans == math::Transpose::NoTrans) ? ldc * n : ldc * k; } - - template - void operator()(Typelist, Status & status) - { - DAAL_ASSERT_UNIVERSAL_BUFFER(a_buffer, T, getAExpectedSize() + offsetA); - DAAL_ASSERT_UNIVERSAL_BUFFER(c_buffer, T, getCExpectedSize() + offsetC); - - auto a_buffer_t = a_buffer.template get(); - auto c_buffer_t = c_buffer.template get(); - - const math::Transpose transInv = trans == math::Transpose::NoTrans ? 
math::Transpose::Trans : math::Transpose::NoTrans; - -#ifdef ONEAPI_DAAL_NO_MKL_GPU_FUNC - ReferenceGemm functor; - status |= - functor(transInv, trans, k, k, n, (T)alpha, a_buffer_t, lda, offsetA, a_buffer_t, lda, offsetA, (T)beta, c_buffer_t, ldc, offsetC); -#else - MKLSyrk functor(queue); - status |= functor(upper_lower, transInv, k, n, (T)alpha, a_buffer_t, lda, offsetA, (T)beta, c_buffer_t, ldc, offsetC); -#endif - } - }; - -public: - static void run(::sycl::queue & queue, const math::UpLo upper_lower, const math::Transpose trans, const size_t n, const size_t k, - const double alpha, const UniversalBuffer & a_buffer, const size_t lda, const size_t offsetA, const double beta, - UniversalBuffer & c_buffer, const size_t ldc, const size_t offsetC, Status & status) - { - DAAL_ASSERT(!a_buffer.empty()); - DAAL_ASSERT(!c_buffer.empty()); - DAAL_ASSERT(a_buffer.type() == c_buffer.type()); - - Execute op(queue, upper_lower, trans, n, k, alpha, a_buffer, lda, offsetA, beta, c_buffer, ldc, offsetC); - TypeDispatcher::floatDispatch(a_buffer.type(), op, status); - } -}; - -/** - * - * \brief Adapter for AXPY routine - */ -class AxpyExecutor -{ -private: - template - static Status checkSize(const int n, const Buffer & buffer, const int inc) - { - return Status(); - } - - struct Execute - { - ::sycl::queue & queue; - const uint32_t n; - const double a; - const UniversalBuffer & x_buffer; - const int incx; - UniversalBuffer & y_buffer; - const int incy; - - explicit Execute(::sycl::queue & queue, const uint32_t n, const double a, const UniversalBuffer & x_buffer, const int incx, - UniversalBuffer & y_buffer, const int incy) - : queue(queue), n(n), a(a), x_buffer(x_buffer), incx(incx), y_buffer(y_buffer), incy(incy) - {} - - template - void operator()(Typelist, Status & status) - { - DAAL_ASSERT_UNIVERSAL_BUFFER_TYPE(x_buffer, algorithmFPType); - DAAL_ASSERT_UNIVERSAL_BUFFER_TYPE(y_buffer, algorithmFPType); - - auto x_buffer_t = x_buffer.template get(); - auto y_buffer_t = 
y_buffer.template get(); - - DAAL_ASSERT(n > 0); - DAAL_ASSERT(size_t((n - 1) * incx) < x_buffer_t.size()); - DAAL_ASSERT(size_t((n - 1) * incy) < y_buffer_t.size()); - -#ifdef ONEAPI_DAAL_NO_MKL_GPU_FUNC - ReferenceAxpy functor; -#else - MKLAxpy functor(queue); -#endif - status |= functor(n, (algorithmFPType)a, x_buffer_t, incx, y_buffer_t, incy); - } - }; - -public: - static void run(::sycl::queue & queue, const uint32_t n, const double a, const UniversalBuffer x_buffer, const int incx, UniversalBuffer y_buffer, - const int incy, Status & status) - { - DAAL_ASSERT(!x_buffer.empty()); - DAAL_ASSERT(!y_buffer.empty()); - DAAL_ASSERT(x_buffer.type() == y_buffer.type()); - - Execute op(queue, n, a, x_buffer, incx, y_buffer, incy); - TypeDispatcher::floatDispatch(x_buffer.type(), op, status); - } -}; - -/** @} */ - -} // namespace interface1 - -using interface1::GemmExecutor; -using interface1::SyrkExecutor; -using interface1::AxpyExecutor; - -} // namespace math -} // namespace sycl -} // namespace internal -} // namespace services -} // namespace daal - -#endif diff --git a/cpp/daal/include/services/internal/sycl/math/lapack_executor.h b/cpp/daal/include/services/internal/sycl/math/lapack_executor.h deleted file mode 100644 index 109902bc268..00000000000 --- a/cpp/daal/include/services/internal/sycl/math/lapack_executor.h +++ /dev/null @@ -1,164 +0,0 @@ -/* file: lapack_executor.h */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#ifndef __ONEAPI_INTERNAL_MATH_LAPACK_EXECUTOR_H__ -#define __ONEAPI_INTERNAL_MATH_LAPACK_EXECUTOR_H__ - -/* -//++ -// Executors for LAPACK functions -//-- -*/ - -#include - -#if (!defined(ONEAPI_DAAL_NO_MKL_GPU_FUNC) && defined(__SYCL_COMPILER_VERSION)) - #include "services/internal/sycl/math/mkl_lapack.h" -#endif - -#include "services/internal/sycl/types_utils.h" -#include "services/internal/sycl/math/types.h" -#include "services/internal/sycl/math/reference_lapack.h" - -namespace daal -{ -namespace services -{ -namespace internal -{ -namespace sycl -{ -namespace math -{ -namespace interface1 -{ -/** @ingroup oneapi_internal - * @{ - */ - -/** - * - * \brief Adapter for POTRF routine - */ -class PotrfExecutor -{ -private: - struct Execute - { - ::sycl::queue & queue; - const math::UpLo uplo; - const size_t n; - UniversalBuffer & a_buffer; - const size_t lda; - explicit Execute(::sycl::queue & queue, const math::UpLo uplo, const size_t n, UniversalBuffer & a_buffer, const size_t lda) - : queue(queue), uplo(uplo), n(n), a_buffer(a_buffer), lda(lda) - {} - - template - void operator()(Typelist, Status & status) - { - DAAL_ASSERT_UNIVERSAL_BUFFER(a_buffer, T, n * lda); - - auto a_buffer_t = a_buffer.template get(); - -#ifdef ONEAPI_DAAL_NO_MKL_GPU_FUNC - ReferencePotrf functor; -#else - MKLPotrf functor(queue); -#endif - status |= functor(uplo, n, a_buffer_t, lda); - } - }; - -public: - static void run(::sycl::queue & queue, const math::UpLo uplo, const size_t n, UniversalBuffer & a_buffer, const size_t lda, Status & status) - { - DAAL_ASSERT(!a_buffer.empty()); - - Execute op(queue, uplo, n, a_buffer, lda); - TypeDispatcher::floatDispatch(a_buffer.type(), op, status); - } -}; - -/** - * - * \brief Adapter for POTRS routine - */ -class PotrsExecutor -{ -private: - struct Execute - 
{ - ::sycl::queue & queue; - const math::UpLo uplo; - const size_t n; - const size_t ny; - UniversalBuffer & a_buffer; - const size_t lda; - UniversalBuffer & b_buffer; - const size_t ldb; - - explicit Execute(::sycl::queue & queue, const math::UpLo uplo, const size_t n, const size_t ny, UniversalBuffer & a_buffer, const size_t lda, - UniversalBuffer & b_buffer, const size_t ldb) - : queue(queue), uplo(uplo), n(n), ny(ny), a_buffer(a_buffer), lda(lda), b_buffer(b_buffer), ldb(ldb) - {} - - template - void operator()(Typelist, Status & status) - { - DAAL_ASSERT_UNIVERSAL_BUFFER(a_buffer, T, n * lda); - DAAL_ASSERT_UNIVERSAL_BUFFER(b_buffer, T, ny * ldb); - - auto a_buffer_t = a_buffer.template get(); - auto b_buffer_t = b_buffer.template get(); - -#ifdef ONEAPI_DAAL_NO_MKL_GPU_FUNC - ReferencePotrs functor; -#else - MKLPotrs functor(queue); -#endif - - status |= functor(uplo, n, ny, a_buffer_t, lda, b_buffer_t, ldb); - } - }; - -public: - static void run(::sycl::queue & queue, const math::UpLo uplo, const size_t n, const size_t ny, UniversalBuffer & a_buffer, const size_t lda, - UniversalBuffer & b_buffer, const size_t ldb, Status & status) - { - DAAL_ASSERT(!a_buffer.empty()); - DAAL_ASSERT(!b_buffer.empty()); - DAAL_ASSERT(a_buffer.type() == b_buffer.type()); - - Execute op(queue, uplo, n, ny, a_buffer, lda, b_buffer, ldb); - TypeDispatcher::floatDispatch(a_buffer.type(), op, status); - } -}; - -/** @} */ -} // namespace interface1 - -using interface1::PotrfExecutor; -using interface1::PotrsExecutor; - -} // namespace math -} // namespace sycl -} // namespace internal -} // namespace services -} // namespace daal - -#endif diff --git a/cpp/daal/include/services/internal/sycl/math/mkl_blas.h b/cpp/daal/include/services/internal/sycl/math/mkl_blas.h deleted file mode 100644 index 73b2797b143..00000000000 --- a/cpp/daal/include/services/internal/sycl/math/mkl_blas.h +++ /dev/null @@ -1,219 +0,0 @@ -/* file: mkl_blas.h */ 
-/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Wrappers for BLAS functions. -//-- -*/ - -#ifndef __ONEAPI_INTERNAL_MKL_BLAS_H__ -#define __ONEAPI_INTERNAL_MKL_BLAS_H__ - -#include "services/internal/buffer.h" -#include "services/internal/sycl/error_handling_sycl.h" -#include "services/internal/sycl/math/mkl_dal_utils.h" - -namespace daal -{ -namespace services -{ -namespace internal -{ -namespace sycl -{ -namespace math -{ -namespace interface1 -{ -/** @ingroup oneapi_internal - * @{ - */ - -/** - * - * \brief Adapter for Intel(R) MKL GEMM routine - */ -template -struct MKLGemm -{ - MKLGemm(::sycl::queue & queue) : _queue(queue) {} - - Status operator()(const math::Transpose transa, const math::Transpose transb, const size_t m, const size_t n, const size_t k, - const algorithmFPType alpha, const Buffer & a_buffer, const size_t lda, const size_t offsetA, - const Buffer & b_buffer, const size_t ldb, const size_t offsetB, const algorithmFPType beta, - Buffer & c_buffer, const size_t ldc, const size_t offsetC) - { - Status status; - -#ifdef DAAL_SYCL_INTERFACE_USM - const auto transamkl = to_fpk_transpose(transa); - const auto transbmkl = to_fpk_transpose(transb); - - auto a_usm = a_buffer.toUSM(_queue, data_management::readOnly, status); - 
DAAL_CHECK_STATUS_VAR(status); - auto b_usm = b_buffer.toUSM(_queue, data_management::readOnly, status); - DAAL_CHECK_STATUS_VAR(status); - auto c_usm = c_buffer.toUSM(_queue, data_management::readWrite, status); - DAAL_CHECK_STATUS_VAR(status); - - auto a_ptr = a_usm.get() + offsetA; - auto b_ptr = b_usm.get() + offsetB; - auto c_ptr = c_usm.get() + offsetC; - - status |= catchSyclExceptions([&]() mutable { - ::oneapi::fpk::blas::gemm(_queue, transamkl, transbmkl, m, n, k, alpha, a_ptr, lda, b_ptr, ldb, beta, c_ptr, ldc); - _queue.wait_and_throw(); - }); -#else - static_assert(false, "USM support required"); -#endif - return status; - } - -private: - template - void innerGemm(MKL_TRANSPOSE transa, MKL_TRANSPOSE transb, int64_t m, int64_t n, int64_t k, T alpha, ::sycl::buffer a, int64_t lda, - ::sycl::buffer b, int64_t ldb, T beta, ::sycl::buffer c, int64_t ldc, int64_t offset_a, int64_t offset_b, - int64_t offset_c); - - template <> - void innerGemm(MKL_TRANSPOSE transa, MKL_TRANSPOSE transb, int64_t m, int64_t n, int64_t k, double alpha, ::sycl::buffer a, - int64_t lda, ::sycl::buffer b, int64_t ldb, double beta, ::sycl::buffer c, int64_t ldc, - int64_t offset_a, int64_t offset_b, int64_t offset_c) - { - ::oneapi::fpk::gpu::dgemm_sycl(&_queue, transa, transb, m, n, k, alpha, &a, lda, &b, ldb, beta, &c, ldc, offset_a, offset_b, offset_c); - } - - template <> - void innerGemm(MKL_TRANSPOSE transa, MKL_TRANSPOSE transb, int64_t m, int64_t n, int64_t k, float alpha, ::sycl::buffer a, - int64_t lda, ::sycl::buffer b, int64_t ldb, float beta, ::sycl::buffer c, int64_t ldc, int64_t offset_a, - int64_t offset_b, int64_t offset_c) - { - ::oneapi::fpk::gpu::sgemm_sycl(&_queue, transa, transb, m, n, k, alpha, &a, lda, &b, ldb, beta, &c, ldc, offset_a, offset_b, offset_c); - } - - ::sycl::queue & _queue; -}; - -/** - * - * \brief Adapter for Intel(R) MKL SYRK routine - */ -template -struct MKLSyrk -{ - MKLSyrk(::sycl::queue & queue) : _queue(queue) {} - - Status 
operator()(const math::UpLo upper_lower, const math::Transpose trans, const size_t n, const size_t k, const algorithmFPType alpha, - const Buffer & a_buffer, const size_t lda, const size_t offsetA, const algorithmFPType beta, - Buffer & c_buffer, const size_t ldc, const size_t offsetC) - { - Status status; - -#ifdef DAAL_SYCL_INTERFACE_USM - const auto transmkl = to_fpk_transpose(trans); - const auto uplomkl = to_fpk_uplo(upper_lower); - - auto a_usm = a_buffer.toUSM(_queue, data_management::readOnly, status); - DAAL_CHECK_STATUS_VAR(status); - auto c_usm = c_buffer.toUSM(_queue, data_management::readWrite, status); - DAAL_CHECK_STATUS_VAR(status); - - auto a_ptr = a_usm.get() + offsetA; - auto c_ptr = c_usm.get() + offsetC; - - status |= catchSyclExceptions([&]() mutable { - ::oneapi::fpk::blas::syrk(_queue, uplomkl, transmkl, n, k, alpha, a_ptr, lda, beta, c_ptr, ldc); - _queue.wait_and_throw(); - }); -#else - static_assert(false, "USM support required"); -#endif - return status; - } - -private: - template - void innerSyrk(MKL_UPLO uplo, MKL_TRANSPOSE trans, int64_t n, int64_t k, T alpha, ::sycl::buffer a, int64_t lda, T beta, - ::sycl::buffer c, int64_t ldc, int64_t offset_a, int64_t offset_c); - - template <> - void innerSyrk(MKL_UPLO uplo, MKL_TRANSPOSE trans, int64_t n, int64_t k, double alpha, ::sycl::buffer a, int64_t lda, double beta, - ::sycl::buffer c, int64_t ldc, int64_t offset_a, int64_t offset_c) - { - ::oneapi::fpk::gpu::dsyrk_sycl(&_queue, uplo, trans, n, k, alpha, &a, lda, beta, &c, ldc, offset_a, offset_c); - } - - template <> - void innerSyrk(MKL_UPLO uplo, MKL_TRANSPOSE trans, int64_t n, int64_t k, float alpha, ::sycl::buffer a, int64_t lda, float beta, - ::sycl::buffer c, int64_t ldc, int64_t offset_a, int64_t offset_c) - { - ::oneapi::fpk::gpu::ssyrk_sycl(&_queue, uplo, trans, n, k, alpha, &a, lda, beta, &c, ldc, offset_a, offset_c); - } - - ::sycl::queue & _queue; -}; - -/** - * - * \brief Adapter for Intel(R) MKL AXPY routine - */ -template 
-struct MKLAxpy -{ - MKLAxpy(::sycl::queue & queue) : _queue(queue) {} - - Status operator()(const int n, const algorithmFPType a, const Buffer & x_buffer, const int incx, - Buffer & y_buffer, const int incy) - { - Status status; - -#ifdef DAAL_SYCL_INTERFACE_USM - auto x_usm = x_buffer.toUSM(_queue, data_management::readOnly, status); - DAAL_CHECK_STATUS_VAR(status); - - auto y_usm = y_buffer.toUSM(_queue, data_management::readWrite, status); - DAAL_CHECK_STATUS_VAR(status); - - status |= catchSyclExceptions([&]() mutable { - ::oneapi::fpk::blas::axpy(_queue, n, a, x_usm.get(), incx, y_usm.get(), incy); - _queue.wait_and_throw(); - }); -#else - static_assert(false, "USM support required"); -#endif - return status; - } - -private: - ::sycl::queue & _queue; -}; - -/** @} */ -} // namespace interface1 - -using interface1::MKLGemm; -using interface1::MKLSyrk; - -} // namespace math -} // namespace sycl -} // namespace internal -} // namespace services -} // namespace daal - -#endif diff --git a/cpp/daal/include/services/internal/sycl/math/mkl_dal.h b/cpp/daal/include/services/internal/sycl/math/mkl_dal.h deleted file mode 100644 index 091311057d1..00000000000 --- a/cpp/daal/include/services/internal/sycl/math/mkl_dal.h +++ /dev/null @@ -1,36 +0,0 @@ -/* file: mkl_dal.h */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -#ifndef __DAAL_SERVICES_INTERNAL_SYCL_MATH_MKL_DAL_H__ -#define __DAAL_SERVICES_INTERNAL_SYCL_MATH_MKL_DAL_H__ - -#ifdef __clang__ - #define DISABLE_MKL_DAL_SYCL_WARNINGS_BEGIN() _Pragma("clang diagnostic push") _Pragma("clang diagnostic ignored \"-Wreorder-ctor\"") - #define DISABLE_MKL_DAL_SYCL_WARNINGS_END() _Pragma("clang diagnostic pop") -#else - #define DISABLE_MKL_DAL_SYCL_WARNINGS_BEGIN() - #define DISABLE_MKL_DAL_SYCL_WARNINGS_END() -#endif - -DISABLE_MKL_DAL_SYCL_WARNINGS_BEGIN() -#include "mkl_dal_sycl.hpp" -DISABLE_MKL_DAL_SYCL_WARNINGS_END() - -#undef DISABLE_MKL_DAL_SYCL_WARNINGS_BEGIN -#undef DISABLE_MKL_DAL_SYCL_WARNINGS_END - -#endif diff --git a/cpp/daal/include/services/internal/sycl/math/mkl_dal_utils.h b/cpp/daal/include/services/internal/sycl/math/mkl_dal_utils.h deleted file mode 100644 index 0c39f4a6ab2..00000000000 --- a/cpp/daal/include/services/internal/sycl/math/mkl_dal_utils.h +++ /dev/null @@ -1,65 +0,0 @@ -/* file: mkl_dal_utils.h */ -/******************************************************************************* -* Copyright 2021 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Utility functions for DAL wrappers over Intel(R) MKL routines. 
-//-- -*/ - -#ifndef __ONEAPI_INTERNAL_MKL_DAL_UTILS_H__ -#define __ONEAPI_INTERNAL_MKL_DAL_UTILS_H__ - -#include "services/internal/sycl/math/types.h" -#include "services/internal/sycl/math/mkl_dal.h" - -namespace daal -{ -namespace services -{ -namespace internal -{ -namespace sycl -{ -namespace math -{ -namespace interface1 -{ -inline ::oneapi::fpk::transpose to_fpk_transpose(const math::Transpose & trans) -{ - using fpk_transpose = ::oneapi::fpk::transpose; - return trans == math::Transpose::Trans ? fpk_transpose::trans : fpk_transpose::nontrans; -} - -inline ::oneapi::fpk::uplo to_fpk_uplo(const math::UpLo & uplo) -{ - using fpk_uplo = ::oneapi::fpk::uplo; - return uplo == math::UpLo::Upper ? fpk_uplo::upper : fpk_uplo::lower; -} - -} // namespace interface1 - -using interface1::to_fpk_transpose; -using interface1::to_fpk_uplo; - -} // namespace math -} // namespace sycl -} // namespace internal -} // namespace services -} // namespace daal - -#endif diff --git a/cpp/daal/include/services/internal/sycl/math/mkl_lapack.h b/cpp/daal/include/services/internal/sycl/math/mkl_lapack.h deleted file mode 100644 index 32a2f65bf61..00000000000 --- a/cpp/daal/include/services/internal/sycl/math/mkl_lapack.h +++ /dev/null @@ -1,170 +0,0 @@ -/* file: mkl_lapack.h */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -/* -//++ -// Wrappers for LAPACK functions. -//-- -*/ - -#ifndef __ONEAPI_INTERNAL_MKL_LAPACK_H__ -#define __ONEAPI_INTERNAL_MKL_LAPACK_H__ - -#include "services/internal/buffer.h" -#include "services/internal/sycl/math/mkl_dal_utils.h" - -namespace daal -{ -namespace services -{ -namespace internal -{ -namespace sycl -{ -namespace math -{ -namespace interface1 -{ -/** @ingroup oneapi_internal - * @{ - */ - -/** - * - * \brief Adapter for Intel(R) MKL POTRF routine - */ -template -struct MKLPotrf -{ - MKLPotrf(::sycl::queue & queue) : _queue(queue) {} - - Status operator()(const math::UpLo uplo, const size_t n, Buffer & a, const size_t lda) - { - const auto uplomkl = to_fpk_uplo(uplo); - const std::int64_t minimalScratchpadSize = ::oneapi::fpk::lapack::potrf_scratchpad_size(_queue, uplomkl, n, lda); - return this->operator()(uplo, n, a, lda, minimalScratchpadSize); - } - -private: - Status operator()(const math::UpLo uplo, const size_t n, Buffer & a, const size_t lda, const std::int64_t scratchpadSize) - { - using namespace daal::services; - - Status status; - const auto uplomkl = to_fpk_uplo(uplo); - -#ifdef DAAL_SYCL_INTERFACE_USM - auto a_usm = a.toUSM(_queue, data_management::readWrite, status); - DAAL_CHECK_STATUS_VAR(status); - - algorithmFPType * scratchpad = nullptr; - if (scratchpadSize > 0) - { - scratchpad = ::sycl::malloc_device(scratchpadSize, _queue); - if (scratchpad == nullptr) return ErrorMemoryAllocationFailed; - } - - status |= catchSyclExceptions([&]() mutable { - ::oneapi::fpk::lapack::potrf(_queue, uplomkl, n, a_usm.get(), lda, scratchpad, scratchpadSize); - _queue.wait_and_throw(); - }); - - if (scratchpadSize > 0) ::sycl::free(scratchpad, _queue); - - scratchpad = nullptr; -#else - static_assert(false, "USM support required"); -#endif - return status; - } - -private: - ::sycl::queue & _queue; -}; - -/** - * - * \brief Adapter for reference AXPY routine - */ 
-template -struct DAAL_EXPORT ReferenceAxpy -{ - ReferenceAxpy() {} - - Status operator()(const int n, const algorithmFPType a, const Buffer & x_buffer, const int incx, - Buffer & y_buffer, const int incy); -}; - -/** @} */ -} // namespace interface1 - -using interface1::ReferenceAxpy; - -} // namespace math -} // namespace sycl -} // namespace internal -} // namespace services -} // namespace daal - -#endif diff --git a/cpp/daal/include/services/internal/sycl/math/reference_gemm.h b/cpp/daal/include/services/internal/sycl/math/reference_gemm.h deleted file mode 100644 index 2859c2eeec0..00000000000 --- a/cpp/daal/include/services/internal/sycl/math/reference_gemm.h +++ /dev/null @@ -1,74 +0,0 @@ -/* file: reference_gemm.h */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Wrappers for BLAS functions. 
-//-- -*/ - -#ifndef __ONEAPI_INTERNAL_MATH_REFERENCE_GEMM_H__ -#define __ONEAPI_INTERNAL_MATH_REFERENCE_GEMM_H__ - -#include "services/internal/sycl/math/types.h" -#include "services/internal/sycl/types_utils.h" -#include "services/internal/buffer.h" -#include "services/env_detect.h" - -namespace daal -{ -namespace services -{ -namespace internal -{ -namespace sycl -{ -namespace math -{ -namespace interface1 -{ -/** @ingroup oneapi_internal - * @{ - */ - -/** - * - * \brief Adapter for reference GEMM routine - */ -template -struct DAAL_EXPORT ReferenceGemm -{ - ReferenceGemm() {} - - Status operator()(const math::Transpose transa, const math::Transpose transb, const size_t m, const size_t n, const size_t k, - const algorithmFPType alpha, const Buffer & a_buffer, const size_t lda, const size_t offsetA, - const Buffer & b_buffer, const size_t ldb, const size_t offsetB, const algorithmFPType beta, - Buffer & c_buffer, const size_t ldc, const size_t offsetC); -}; - -/** @} */ -} // namespace interface1 - -using interface1::ReferenceGemm; - -} // namespace math -} // namespace sycl -} // namespace internal -} // namespace services -} // namespace daal - -#endif diff --git a/cpp/daal/include/services/internal/sycl/math/reference_lapack.h b/cpp/daal/include/services/internal/sycl/math/reference_lapack.h deleted file mode 100644 index bdf610cb6e5..00000000000 --- a/cpp/daal/include/services/internal/sycl/math/reference_lapack.h +++ /dev/null @@ -1,85 +0,0 @@ -/* file: reference_lapack.h */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Wrappers for Lapack functions. -//-- -*/ - -#ifndef __ONEAPI_INTERNAL_MATH_REFERENCE_LAPACK_H__ -#define __ONEAPI_INTERNAL_MATH_REFERENCE_LAPACK_H__ - -#include "services/internal/sycl/math/types.h" -#include "services/internal/sycl/types_utils.h" -#include "services/internal/buffer.h" -#include "services/env_detect.h" - -namespace daal -{ -namespace services -{ -namespace internal -{ -namespace sycl -{ -namespace math -{ -namespace interface1 -{ -/** @ingroup oneapi_internal - * @{ - */ - -/** - * - * \brief Adapter for reference POTRF routine - */ -template -struct DAAL_EXPORT ReferencePotrf -{ - ReferencePotrf() {} - - Status operator()(const math::UpLo uplo, const size_t n, Buffer & a_buffer, const size_t lda); -}; - -/** - * - * \brief Adapter for reference POTRS routine - */ -template -struct DAAL_EXPORT ReferencePotrs -{ - ReferencePotrs() {} - - Status operator()(const math::UpLo uplo, const size_t n, const size_t ny, Buffer & a_buffer, const size_t lda, - Buffer & b_buffer, const size_t ldb); -}; - -/** @} */ -} // namespace interface1 - -using interface1::ReferencePotrf; -using interface1::ReferencePotrs; - -} // namespace math -} // namespace sycl -} // namespace internal -} // namespace services -} // namespace daal - -#endif diff --git a/cpp/daal/include/services/internal/sycl/math/types.h b/cpp/daal/include/services/internal/sycl/math/types.h deleted file mode 100644 index 1633884d61c..00000000000 --- 
a/cpp/daal/include/services/internal/sycl/math/types.h +++ /dev/null @@ -1,63 +0,0 @@ -/* file: types.h */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#ifndef __ONEAPI_INTERNAL_MATH_TYPES_H__ -#define __ONEAPI_INTERNAL_MATH_TYPES_H__ - -namespace daal -{ -namespace services -{ -namespace internal -{ -namespace sycl -{ -namespace math -{ -namespace interface1 -{ -enum Layout -{ - ColMajor, - RowMajor -}; - -enum Transpose -{ - NoTrans, - Trans -}; - -enum UpLo -{ - Upper, - Lower -}; - -} // namespace interface1 - -using interface1::Layout; -using interface1::Transpose; -using interface1::UpLo; - -} // namespace math -} // namespace sycl -} // namespace internal -} // namespace services -} // namespace daal - -#endif diff --git a/cpp/daal/include/services/internal/sycl/types.h b/cpp/daal/include/services/internal/sycl/types.h deleted file mode 100644 index 2f2310c93ef..00000000000 --- a/cpp/daal/include/services/internal/sycl/types.h +++ /dev/null @@ -1,272 +0,0 @@ -/* file: types.h */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#ifndef __DAAL_SERVICES_INTERNAL_SYCL_TYPES_H__ -#define __DAAL_SERVICES_INTERNAL_SYCL_TYPES_H__ - -#include -#include - -#include "services/daal_string.h" -#include "services/internal/any.h" -#include "services/internal/buffer.h" - -#define DAAL_ASSERT_UNIVERSAL_BUFFER_TYPE(buffer, BufferType) DAAL_ASSERT((buffer).type() == TypeIds::id()); - -#define DAAL_ASSERT_UNIVERSAL_BUFFER(buffer, BufferType, bufferSize) \ - { \ - DAAL_ASSERT_UNIVERSAL_BUFFER_TYPE(buffer, BufferType) \ - DAAL_ASSERT((buffer).template get().size() >= (bufferSize)); \ - } - -#define DAAL_ASSERT_UNIVERSAL_BUFFER2(buffer, bufferType1, bufferType2, bufferSize) \ - { \ - DAAL_ASSERT(((buffer).type() == TypeIds::id() && (buffer).template get().size() >= (bufferSize)) \ - || ((buffer).type() == TypeIds::id() && (buffer).template get().size() >= (bufferSize))); \ - } - -namespace daal -{ -namespace services -{ -namespace internal -{ -namespace sycl -{ -/** @ingroup oneapi_internal - * @{ - */ - -typedef ::int8_t int8_t; -typedef ::int16_t int16_t; -typedef ::int32_t int32_t; -typedef ::int64_t int64_t; -typedef ::uint8_t uint8_t; -typedef ::uint16_t uint16_t; -typedef ::uint32_t uint32_t; -typedef ::uint64_t uint64_t; -typedef float float32_t; -typedef double float64_t; - -template -inline String getKeyFPType() -{ - if (IsSameType::value) - { - return String(" -D algorithmFPType=float -D algorithmFPType2=float2 -D algorithmFPType4=float4 -D FPTYPE_MAXVALUE=FLT_MAX"); - } - if (IsSameType::value) - { - return 
String(" -D algorithmFPType=double -D algorithmFPType2=double2 -D algorithmFPType4=double4 -D FPTYPE_MAXVALUE=DBL_MAX"); - } - if (IsSameType::value) - { - return String(" -D algorithmFPType=int -D algorithmFPType2=int2 -D algorithmFPType4=int4 "); - } - if (IsSameType::value) - { - return String(" -D algorithmFPType=uint -D algorithmFPType2=uint2 -D algorithmFPType4=uint4 "); - } - return String(); -} - -namespace interface1 -{ -/** - * - * \brief Mapping from standart types to enum values - */ -class TypeIds -{ -public: - enum Id - { - /* Signed integers */ - int8, - int16, - int32, - int64, - - /* Unsigned integers */ - uint8, - uint16, - uint32, - uint64, - - /* Floatin point */ - float32, - float64, - - /* Other types */ - custom - }; - - template - static inline Id id(); - -private: - TypeIds(); -}; -typedef TypeIds::Id TypeId; - -namespace internal -{ -template -inline TypeId getTypeId() -{ - return TypeIds::custom; -} - -#define DAAL_DECLARE_TYPE_ID_MAP(id_) \ - template <> \ - inline TypeId getTypeId() \ - { \ - return TypeIds::id_; \ - } - -DAAL_DECLARE_TYPE_ID_MAP(int8) -DAAL_DECLARE_TYPE_ID_MAP(int16) -DAAL_DECLARE_TYPE_ID_MAP(int32) -DAAL_DECLARE_TYPE_ID_MAP(int64) -DAAL_DECLARE_TYPE_ID_MAP(uint8) -DAAL_DECLARE_TYPE_ID_MAP(uint16) -DAAL_DECLARE_TYPE_ID_MAP(uint32) -DAAL_DECLARE_TYPE_ID_MAP(uint64) -DAAL_DECLARE_TYPE_ID_MAP(float32) -DAAL_DECLARE_TYPE_ID_MAP(float64) - -#undef DAAL_DECLARE_TYPE_ID_MAP - -} // namespace internal - -template -inline TypeId TypeIds::id() -{ - return internal::getTypeId(); -} - -/** - * - * \brief Enumeration of device types avaliable - */ -class ExecutionTargetIds -{ -public: - enum Id - { - host, - device, - unspecified - }; - -private: - ExecutionTargetIds(); -}; -typedef ExecutionTargetIds::Id ExecutionTargetId; - -/** - * - * \brief Access modes to kernel arguments - */ -class AccessModeIds -{ -public: - enum Id - { - read, - write, - readwrite - }; - -private: - AccessModeIds(); -}; -typedef AccessModeIds::Id 
AccessModeId; - -/** - * - * \brief Non-templated wrapper for Buffer object - */ -class UniversalBuffer : public Base -{ -public: - UniversalBuffer() : _type(TypeIds::id()) {} - - template - UniversalBuffer(const Buffer & buffer) : _type(TypeIds::id()), _anyBuffer(buffer) - {} - - template - const Buffer & get() const - { - return _anyBuffer.get >(); - } - - template - UniversalBuffer & operator=(const Buffer & buffer) - { - _type = TypeIds::id(); - _anyBuffer = buffer; - return *this; - } - - TypeId type() const { return _type; } - - const Any & any() const { return _anyBuffer; } - - bool empty() const { return _anyBuffer.empty(); } - -private: - TypeId _type; - Any _anyBuffer; -}; - -/** - * - * \brief Class representing local (device-only) buffer - */ -class LocalBuffer : public Base -{ -public: - LocalBuffer(TypeId id, size_t size) : _id(id), _size(size) {} - - TypeId type() const { return _id; } - size_t size() const { return _size; } - -private: - TypeId _id; - size_t _size; -}; - -} // namespace interface1 - -using interface1::TypeId; -using interface1::TypeIds; -using interface1::ExecutionTargetId; -using interface1::ExecutionTargetIds; -using interface1::AccessModeId; -using interface1::AccessModeIds; -using interface1::UniversalBuffer; -using interface1::LocalBuffer; - -/** @} */ -} // namespace sycl -} // namespace internal -} // namespace services -} // namespace daal - -#endif diff --git a/cpp/daal/include/services/internal/sycl/types_utils.h b/cpp/daal/include/services/internal/sycl/types_utils.h deleted file mode 100644 index f2fcb280b86..00000000000 --- a/cpp/daal/include/services/internal/sycl/types_utils.h +++ /dev/null @@ -1,96 +0,0 @@ -/* file: types_utils.h */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#ifndef __DAAL_SERVICES_INTERNAL_SYCL_TYPES_UTILS_H__ -#define __DAAL_SERVICES_INTERNAL_SYCL_TYPES_UTILS_H__ - -#include "services/internal/sycl/types.h" - -/// \cond INTERNAL -namespace daal -{ -namespace services -{ -namespace internal -{ -namespace sycl -{ -namespace interface1 -{ -template -struct Typelist -{}; - -typedef Typelist - PrimitiveTypes; - -typedef Typelist FloatTypes; - -class TypeDispatcher -{ -public: - template - static void dispatch(TypeId type, Operation && op, Status & status) - { - dispatchInternal(status, type, op, PrimitiveTypes()); - } - - template - static void floatDispatch(TypeId type, Operation && op, Status & status) - { - dispatchInternal(status, type, op, FloatTypes()); - } - -private: - template - static void dispatchInternal(Status & status, TypeId type, Operation && op, Typelist) - { - if (type == TypeIds::id()) - { - op(Typelist(), status); - } - else - { - dispatchInternal(status, type, op, Typelist()); - } - } - - template - static void dispatchInternal(Status & status, TypeId type, Operation && op, Typelist<>) - { - DAAL_ASSERT(!"Unknown type"); - } -}; - -String getKeyFPType(TypeId typeId); - -} // namespace interface1 - -using interface1::Typelist; -using interface1::TypeDispatcher; -using interface1::getKeyFPType; - -} // namespace sycl -} // namespace internal -} // namespace services -} // namespace daal -/// \endcond - -#endif diff --git a/cpp/daal/include/services/internal/x86_64/x86_64_kernel_defines.h 
b/cpp/daal/include/services/internal/x86_64/x86_64_kernel_defines.h index f9570309739..90bebfaffe9 100644 --- a/cpp/daal/include/services/internal/x86_64/x86_64_kernel_defines.h +++ b/cpp/daal/include/services/internal/x86_64/x86_64_kernel_defines.h @@ -33,7 +33,6 @@ #define DAAL_KERNEL_SSE2_CONTAINER(ContainerTemplate, ...) #define DAAL_KERNEL_SSE2_CONTAINER1(ContainerTemplate, ...) #define DAAL_KERNEL_SSE2_CONTAINER_CASE(ContainerTemplate, ...) - #define DAAL_KERNEL_SSE2_CONTAINER_CASE_SYCL(ContainerTemplate, ...) #endif #if defined(DAAL_KERNEL_SSE42) @@ -44,15 +43,13 @@ #define DAAL_KERNEL_SSE42_CONTAINER(ContainerTemplate, ...) , DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, sse42, __VA_ARGS__) #define DAAL_KERNEL_SSE42_CONTAINER1(ContainerTemplate, ...) \ extern template class DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, sse42, __VA_ARGS__); - #define DAAL_KERNEL_SSE42_CONTAINER_CASE(ContainerTemplate, ...) DAAL_KERNEL_CONTAINER_CASE(ContainerTemplate, sse42, __VA_ARGS__) - #define DAAL_KERNEL_SSE42_CONTAINER_CASE_SYCL(ContainerTemplate, ...) DAAL_KERNEL_CONTAINER_CASE_SYCL(ContainerTemplate, sse42, __VA_ARGS__) + #define DAAL_KERNEL_SSE42_CONTAINER_CASE(ContainerTemplate, ...) DAAL_KERNEL_CONTAINER_CASE(ContainerTemplate, sse42, __VA_ARGS__) #else #define DAAL_KERNEL_SSE42_ONLY(something) #define DAAL_KERNEL_SSE42_ONLY_CODE(...) #define DAAL_KERNEL_SSE42_CONTAINER(ContainerTemplate, ...) #define DAAL_KERNEL_SSE42_CONTAINER1(ContainerTemplate, ...) #define DAAL_KERNEL_SSE42_CONTAINER_CASE(ContainerTemplate, ...) - #define DAAL_KERNEL_SSE42_CONTAINER_CASE_SYCL(ContainerTemplate, ...) #endif #if defined(DAAL_KERNEL_AVX2) @@ -63,15 +60,13 @@ #define DAAL_KERNEL_AVX2_CONTAINER(ContainerTemplate, ...) , DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, avx2, __VA_ARGS__) #define DAAL_KERNEL_AVX2_CONTAINER1(ContainerTemplate, ...) 
\ extern template class DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, avx2, __VA_ARGS__); - #define DAAL_KERNEL_AVX2_CONTAINER_CASE(ContainerTemplate, ...) DAAL_KERNEL_CONTAINER_CASE(ContainerTemplate, avx2, __VA_ARGS__) - #define DAAL_KERNEL_AVX2_CONTAINER_CASE_SYCL(ContainerTemplate, ...) DAAL_KERNEL_CONTAINER_CASE_SYCL(ContainerTemplate, avx2, __VA_ARGS__) + #define DAAL_KERNEL_AVX2_CONTAINER_CASE(ContainerTemplate, ...) DAAL_KERNEL_CONTAINER_CASE(ContainerTemplate, avx2, __VA_ARGS__) #else #define DAAL_KERNEL_AVX2_ONLY(something) #define DAAL_KERNEL_AVX2_ONLY_CODE(...) #define DAAL_KERNEL_AVX2_CONTAINER(ContainerTemplate, ...) #define DAAL_KERNEL_AVX2_CONTAINER1(ContainerTemplate, ...) #define DAAL_KERNEL_AVX2_CONTAINER_CASE(ContainerTemplate, ...) - #define DAAL_KERNEL_AVX2_CONTAINER_CASE_SYCL(ContainerTemplate, ...) #endif #if defined(DAAL_KERNEL_AVX512) @@ -82,15 +77,13 @@ #define DAAL_KERNEL_AVX512_CONTAINER(ContainerTemplate, ...) , DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, avx512, __VA_ARGS__) #define DAAL_KERNEL_AVX512_CONTAINER1(ContainerTemplate, ...) \ extern template class DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, avx512, __VA_ARGS__); - #define DAAL_KERNEL_AVX512_CONTAINER_CASE(ContainerTemplate, ...) DAAL_KERNEL_CONTAINER_CASE(ContainerTemplate, avx512, __VA_ARGS__) - #define DAAL_KERNEL_AVX512_CONTAINER_CASE_SYCL(ContainerTemplate, ...) DAAL_KERNEL_CONTAINER_CASE_SYCL(ContainerTemplate, avx512, __VA_ARGS__) + #define DAAL_KERNEL_AVX512_CONTAINER_CASE(ContainerTemplate, ...) DAAL_KERNEL_CONTAINER_CASE(ContainerTemplate, avx512, __VA_ARGS__) #else #define DAAL_KERNEL_AVX512_ONLY(something) #define DAAL_KERNEL_AVX512_ONLY_CODE(...) #define DAAL_KERNEL_AVX512_CONTAINER(ContainerTemplate, ...) #define DAAL_KERNEL_AVX512_CONTAINER1(ContainerTemplate, ...) #define DAAL_KERNEL_AVX512_CONTAINER_CASE(ContainerTemplate, ...) - #define DAAL_KERNEL_AVX512_CONTAINER_CASE_SYCL(ContainerTemplate, ...) 
#endif #endif diff --git a/cpp/daal/src/algorithms/classifier/BUILD b/cpp/daal/src/algorithms/classifier/BUILD index 9f64eecd83b..9810bc00230 100644 --- a/cpp/daal/src/algorithms/classifier/BUILD +++ b/cpp/daal/src/algorithms/classifier/BUILD @@ -4,9 +4,8 @@ load("@onedal//dev/bazel:daal.bzl", "daal_module") daal_module( name = "kernel", auto = True, - opencl = True, deps = [ "@onedal//cpp/daal:core", - "@onedal//cpp/daal:sycl", + "@onedal//cpp/daal:engines", ], ) diff --git a/cpp/daal/src/algorithms/classifier/classifier_predict_fpt.cpp b/cpp/daal/src/algorithms/classifier/classifier_predict_fpt.cpp index c903deb62f8..20d8dc14b81 100644 --- a/cpp/daal/src/algorithms/classifier/classifier_predict_fpt.cpp +++ b/cpp/daal/src/algorithms/classifier/classifier_predict_fpt.cpp @@ -22,7 +22,6 @@ */ #include "algorithms/classifier/classifier_predict_types.h" -#include "data_management/data/internal/numeric_table_sycl_homogen.h" namespace daal { @@ -51,39 +50,27 @@ DAAL_EXPORT services::Status Result::allocate(const daal::algorithms::Input * in const Parameter * par = static_cast(parameter); DAAL_CHECK(par, services::ErrorNullParameterNotSupported); - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - const size_t nRows = (static_cast(input))->getNumberOfRows(); const size_t nClasses = par->nClasses; if (par->resultsToEvaluate & computeClassLabels) { dm::NumericTablePtr nt; - if (deviceInfo.isCpu) - nt = dm::HomogenNumericTable::create(1, nRows, dm::NumericTableIface::doAllocate, &st); - else - nt = dmi::SyclHomogenNumericTable::create(1, nRows, dm::NumericTableIface::doAllocate, &st); + nt = dm::HomogenNumericTable::create(1, nRows, dm::NumericTableIface::doAllocate, &st); set(prediction, nt); } if (par->resultsToEvaluate & computeClassProbabilities) { dm::NumericTablePtr nt; - if (deviceInfo.isCpu) - nt = dm::HomogenNumericTable::create(nClasses, nRows, dm::NumericTableIface::doAllocate, &st); - else - nt = 
dmi::SyclHomogenNumericTable::create(nClasses, nRows, dm::NumericTableIface::doAllocate, &st); + nt = dm::HomogenNumericTable::create(nClasses, nRows, dm::NumericTableIface::doAllocate, &st); set(probabilities, nt); } if (par->resultsToEvaluate & computeClassLogProbabilities) { dm::NumericTablePtr nt; - if (deviceInfo.isCpu) - nt = dm::HomogenNumericTable::create(nClasses, nRows, dm::NumericTableIface::doAllocate, &st); - else - nt = dmi::SyclHomogenNumericTable::create(nClasses, nRows, dm::NumericTableIface::doAllocate, &st); + nt = dm::HomogenNumericTable::create(nClasses, nRows, dm::NumericTableIface::doAllocate, &st); set(logProbabilities, nt); } diff --git a/cpp/daal/src/algorithms/covariance/BUILD b/cpp/daal/src/algorithms/covariance/BUILD index 9f64eecd83b..9810bc00230 100644 --- a/cpp/daal/src/algorithms/covariance/BUILD +++ b/cpp/daal/src/algorithms/covariance/BUILD @@ -4,9 +4,8 @@ load("@onedal//dev/bazel:daal.bzl", "daal_module") daal_module( name = "kernel", auto = True, - opencl = True, deps = [ "@onedal//cpp/daal:core", - "@onedal//cpp/daal:sycl", + "@onedal//cpp/daal:engines", ], ) diff --git a/cpp/daal/src/algorithms/covariance/covariance_container.h b/cpp/daal/src/algorithms/covariance/covariance_container.h index 362c237f3e8..c8955a716da 100644 --- a/cpp/daal/src/algorithms/covariance/covariance_container.h +++ b/cpp/daal/src/algorithms/covariance/covariance_container.h @@ -30,8 +30,6 @@ #include "algorithms/covariance/covariance_distributed.h" #include "src/algorithms/covariance/covariance_hyperparameter_impl.h" #include "src/algorithms/covariance/covariance_kernel.h" -#include "src/algorithms/covariance/oneapi/covariance_kernel_oneapi.h" -#include "src/algorithms/covariance/oneapi/covariance_dense_distr_step2_oneapi.h" #undef __DAAL_CONCAT #define __DAAL_CONCAT(x, y) x##y @@ -44,24 +42,6 @@ __DAAL_INITIALIZE_KERNELS(KernelClass, algorithmFPType, ComputeMethod); \ } -#undef __DAAL_COVARIANCE_BATCH_CONTAINER_CONSTRUCTOR_ONEAPI -#define 
__DAAL_COVARIANCE_BATCH_CONTAINER_CONSTRUCTOR_ONEAPI(ComputeMethod, KernelClass) \ - template \ - BatchContainer::BatchContainer(daal::services::Environment::env * daalEnv) \ - { \ - auto & context = services::internal::getDefaultContext(); \ - auto & deviceInfo = context.getInfoDevice(); \ - \ - if (deviceInfo.isCpu) \ - { \ - __DAAL_INITIALIZE_KERNELS(KernelClass, algorithmFPType, ComputeMethod); \ - } \ - else \ - { \ - _kernel = new oneapi::__DAAL_CONCAT(KernelClass, OneAPI)(); \ - } \ - } - #undef __DAAL_COVARIANCE_BATCH_CONTAINER_DESTRUCTOR #define __DAAL_COVARIANCE_BATCH_CONTAINER_DESTRUCTOR(ComputeMethod) \ template \ @@ -90,36 +70,6 @@ parameter, hyperparameter); \ } -#undef __DAAL_COVARIANCE_BATCH_CONTAINER_COMPUTE_ONEAPI -#define __DAAL_COVARIANCE_BATCH_CONTAINER_COMPUTE_ONEAPI(ComputeMethod, KernelClass) \ - template \ - services::Status BatchContainer::compute() \ - { \ - Result * result = static_cast(_res); \ - Input * input = static_cast(_in); \ - \ - NumericTable * dataTable = input->get(data).get(); \ - NumericTable * covTable = result->get(covariance).get(); \ - NumericTable * meanTable = result->get(mean).get(); \ - \ - Parameter * parameter = static_cast(_par); \ - daal::services::Environment::env & env = *_env; \ - \ - auto & context = services::internal::getDefaultContext(); \ - auto & deviceInfo = context.getInfoDevice(); \ - \ - if (deviceInfo.isCpu) \ - { \ - __DAAL_CALL_KERNEL(env, KernelClass, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, ComputeMethod), compute, dataTable, covTable, meanTable, \ - parameter); \ - } \ - else \ - { \ - return ((oneapi::__DAAL_CONCAT(KernelClass, OneAPI) < algorithmFPType, ComputeMethod > *)(_kernel)) \ - ->compute(dataTable, covTable, meanTable, parameter); \ - } \ - } - #undef __DAAL_COVARIANCE_ONLINE_CONTAINER_CONSTRUCTOR #define __DAAL_COVARIANCE_ONLINE_CONTAINER_CONSTRUCTOR(ComputeMethod, KernelClass) \ template \ @@ -128,24 +78,6 @@ __DAAL_INITIALIZE_KERNELS(KernelClass, algorithmFPType, ComputeMethod); \ 
} -#undef __DAAL_COVARIANCE_ONLINE_CONTAINER_CONSTRUCTOR_ONEAPI -#define __DAAL_COVARIANCE_ONLINE_CONTAINER_CONSTRUCTOR_ONEAPI(ComputeMethod, KernelClass) \ - template \ - OnlineContainer::OnlineContainer(daal::services::Environment::env * daalEnv) \ - { \ - auto & context = services::internal::getDefaultContext(); \ - auto & deviceInfo = context.getInfoDevice(); \ - \ - if (deviceInfo.isCpu) \ - { \ - __DAAL_INITIALIZE_KERNELS(KernelClass, algorithmFPType, ComputeMethod); \ - } \ - else \ - { \ - __DAAL_INITIALIZE_KERNELS_SYCL(oneapi::__DAAL_CONCAT(KernelClass, OneAPI), algorithmFPType, ComputeMethod); \ - } \ - } - #undef __DAAL_COVARIANCE_ONLINE_CONTAINER_DESTRUCTOR #define __DAAL_COVARIANCE_ONLINE_CONTAINER_DESTRUCTOR(ComputeMethod) \ template \ @@ -176,38 +108,6 @@ crossProductTable, sumTable, parameter, hyperparameter); \ } -#undef __DAAL_COVARIANCE_ONLINE_CONTAINER_COMPUTE_ONEAPI -#define __DAAL_COVARIANCE_ONLINE_CONTAINER_COMPUTE_ONEAPI(ComputeMethod, KernelClass) \ - template \ - services::Status OnlineContainer::compute() \ - { \ - PartialResult * partialResult = static_cast(_pres); \ - Input * input = static_cast(_in); \ - \ - NumericTable * dataTable = input->get(data).get(); \ - \ - NumericTable * nObsTable = partialResult->get(nObservations).get(); \ - NumericTable * crossProductTable = partialResult->get(crossProduct).get(); \ - NumericTable * sumTable = partialResult->get(sum).get(); \ - \ - Parameter * parameter = static_cast(_par); \ - daal::services::Environment::env & env = *_env; \ - \ - auto & context = services::internal::getDefaultContext(); \ - auto & deviceInfo = context.getInfoDevice(); \ - \ - if (deviceInfo.isCpu) \ - { \ - __DAAL_CALL_KERNEL(env, KernelClass, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, ComputeMethod), compute, dataTable, nObsTable, \ - crossProductTable, sumTable, parameter); \ - } \ - else \ - { \ - __DAAL_CALL_KERNEL_SYCL(env, oneapi::__DAAL_CONCAT(KernelClass, OneAPI), __DAAL_KERNEL_ARGUMENTS(algorithmFPType, 
ComputeMethod), \ - compute, dataTable, nObsTable, crossProductTable, sumTable, parameter); \ - } \ - } - #undef __DAAL_COVARIANCE_ONLINE_CONTAINER_FINALIZECOMPUTE #define __DAAL_COVARIANCE_ONLINE_CONTAINER_FINALIZECOMPUTE(ComputeMethod, KernelClass) \ template \ @@ -231,39 +131,6 @@ sumTable, covTable, meanTable, parameter, hyperparameter); \ } -#undef __DAAL_COVARIANCE_ONLINE_CONTAINER_FINALIZECOMPUTE_ONEAPI -#define __DAAL_COVARIANCE_ONLINE_CONTAINER_FINALIZECOMPUTE_ONEAPI(ComputeMethod, KernelClass) \ - template \ - services::Status OnlineContainer::finalizeCompute() \ - { \ - PartialResult * partialResult = static_cast(_pres); \ - Result * result = static_cast(_res); \ - \ - NumericTable * nObsTable = partialResult->get(nObservations).get(); \ - NumericTable * crossProductTable = partialResult->get(crossProduct).get(); \ - NumericTable * sumTable = partialResult->get(sum).get(); \ - \ - NumericTable * covTable = result->get(covariance).get(); \ - NumericTable * meanTable = result->get(mean).get(); \ - \ - Parameter * parameter = static_cast(_par); \ - daal::services::Environment::env & env = *_env; \ - \ - auto & context = services::internal::getDefaultContext(); \ - auto & deviceInfo = context.getInfoDevice(); \ - \ - if (deviceInfo.isCpu) \ - { \ - __DAAL_CALL_KERNEL(env, KernelClass, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, ComputeMethod), finalizeCompute, nObsTable, \ - crossProductTable, sumTable, covTable, meanTable, parameter); \ - } \ - else \ - { \ - __DAAL_CALL_KERNEL_SYCL(env, oneapi::__DAAL_CONCAT(KernelClass, OneAPI), __DAAL_KERNEL_ARGUMENTS(algorithmFPType, ComputeMethod), \ - finalizeCompute, nObsTable, crossProductTable, sumTable, covTable, meanTable, parameter); \ - } \ - } - #undef __DAAL_COVARIANCE_DISTR_CONTAINER_CONSTRUCTOR #define __DAAL_COVARIANCE_DISTR_CONTAINER_CONSTRUCTOR(ComputeMethod) \ template \ @@ -272,24 +139,6 @@ __DAAL_INITIALIZE_KERNELS(internal::CovarianceDistributedKernel, algorithmFPType, ComputeMethod); \ } -#undef 
__DAAL_COVARIANCE_DISTR_CONTAINER_CONSTRUCTOR_ONEAPI -#define __DAAL_COVARIANCE_DISTR_CONTAINER_CONSTRUCTOR_ONEAPI(ComputeMethod) \ - template \ - DistributedContainer::DistributedContainer(daal::services::Environment::env * daalEnv) \ - { \ - auto & context = services::Environment::getInstance()->getDefaultExecutionContext(); \ - auto & deviceInfo = context.getInfoDevice(); \ - \ - if (deviceInfo.isCpu) \ - { \ - __DAAL_INITIALIZE_KERNELS(internal::CovarianceDistributedKernel, algorithmFPType, ComputeMethod); \ - } \ - else \ - { \ - __DAAL_INITIALIZE_KERNELS_SYCL(oneapi::internal::CovarianceDenseDistrStep2KernelOneAPI, algorithmFPType, ComputeMethod) \ - } \ - } - #undef __DAAL_COVARIANCE_DISTR_CONTAINER_DESTRUCTOR #define __DAAL_COVARIANCE_DISTR_CONTAINER_DESTRUCTOR(ComputeMethod) \ template \ @@ -319,41 +168,6 @@ collection->clear(); \ } -#undef __DAAL_COVARIANCE_DISTR_CONTAINER_COMPUTE_ONEAPI -#define __DAAL_COVARIANCE_DISTR_CONTAINER_COMPUTE_ONEAPI(ComputeMethod) \ - template \ - services::Status DistributedContainer::compute() \ - { \ - PartialResult * partialResult = static_cast(_pres); \ - \ - DistributedInput * input = static_cast *>(_in); \ - DataCollection * collection = input->get(partialResults).get(); \ - \ - NumericTable * nObsTable = partialResult->get(nObservations).get(); \ - NumericTable * crossProductTable = partialResult->get(crossProduct).get(); \ - NumericTable * sumTable = partialResult->get(sum).get(); \ - \ - Parameter * parameter = static_cast(_par); \ - daal::services::Environment::env & env = *_env; \ - \ - auto & context = services::Environment::getInstance()->getDefaultExecutionContext(); \ - auto & deviceInfo = context.getInfoDevice(); \ - \ - if (deviceInfo.isCpu) \ - { \ - __DAAL_CALL_KERNEL(env, internal::CovarianceDistributedKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, ComputeMethod), compute, \ - collection, nObsTable, crossProductTable, sumTable, parameter); \ - } \ - else \ - { \ - __DAAL_CALL_KERNEL_SYCL(env, 
oneapi::internal::CovarianceDenseDistrStep2KernelOneAPI, \ - __DAAL_KERNEL_ARGUMENTS(algorithmFPType, ComputeMethod), compute, collection, nObsTable, crossProductTable, \ - sumTable, parameter); \ - } \ - \ - collection->clear(); \ - } - #undef __DAAL_COVARIANCE_DISTR_CONTAINER_FINALIZECOMPUTE #define __DAAL_COVARIANCE_DISTR_CONTAINER_FINALIZECOMPUTE(ComputeMethod) \ template \ @@ -374,47 +188,13 @@ nObsTable, crossProductTable, sumTable, covTable, meanTable, parameter, hyperparameter); \ } -#undef __DAAL_COVARIANCE_DISTR_CONTAINER_FINALIZECOMPUTE_ONEAPI -#define __DAAL_COVARIANCE_DISTR_CONTAINER_FINALIZECOMPUTE_ONEAPI(ComputeMethod) \ - template \ - services::Status DistributedContainer::finalizeCompute() \ - { \ - Result * result = static_cast(_res); \ - PartialResult * partialResult = static_cast(_pres); \ - \ - NumericTable * nObsTable = partialResult->get(nObservations).get(); \ - NumericTable * crossProductTable = partialResult->get(crossProduct).get(); \ - NumericTable * sumTable = partialResult->get(sum).get(); \ - \ - NumericTable * covTable = result->get(covariance).get(); \ - NumericTable * meanTable = result->get(mean).get(); \ - \ - Parameter * parameter = static_cast(_par); \ - daal::services::Environment::env & env = *_env; \ - \ - auto & context = services::Environment::getInstance()->getDefaultExecutionContext(); \ - auto & deviceInfo = context.getInfoDevice(); \ - \ - if (deviceInfo.isCpu) \ - { \ - __DAAL_CALL_KERNEL(env, internal::CovarianceDistributedKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, ComputeMethod), finalizeCompute, \ - nObsTable, crossProductTable, sumTable, covTable, meanTable, parameter); \ - } \ - else \ - { \ - __DAAL_CALL_KERNEL_SYCL(env, oneapi::internal::CovarianceDenseDistrStep2KernelOneAPI, \ - __DAAL_KERNEL_ARGUMENTS(algorithmFPType, ComputeMethod), finalizeCompute, nObsTable, crossProductTable, \ - sumTable, covTable, meanTable, parameter); \ - } \ - } - namespace daal { namespace algorithms { namespace covariance { 
-__DAAL_COVARIANCE_BATCH_CONTAINER_CONSTRUCTOR_ONEAPI(defaultDense, internal::CovarianceDenseBatchKernel) +__DAAL_COVARIANCE_BATCH_CONTAINER_CONSTRUCTOR(defaultDense, internal::CovarianceDenseBatchKernel) __DAAL_COVARIANCE_BATCH_CONTAINER_CONSTRUCTOR(singlePassDense, internal::CovarianceDenseBatchKernel) __DAAL_COVARIANCE_BATCH_CONTAINER_CONSTRUCTOR(sumDense, internal::CovarianceDenseBatchKernel) __DAAL_COVARIANCE_BATCH_CONTAINER_CONSTRUCTOR(fastCSR, internal::CovarianceCSRBatchKernel) @@ -428,14 +208,14 @@ __DAAL_COVARIANCE_BATCH_CONTAINER_DESTRUCTOR(fastCSR) __DAAL_COVARIANCE_BATCH_CONTAINER_DESTRUCTOR(singlePassCSR) __DAAL_COVARIANCE_BATCH_CONTAINER_DESTRUCTOR(sumCSR) -__DAAL_COVARIANCE_BATCH_CONTAINER_COMPUTE_ONEAPI(defaultDense, internal::CovarianceDenseBatchKernel) +__DAAL_COVARIANCE_BATCH_CONTAINER_COMPUTE(defaultDense, internal::CovarianceDenseBatchKernel) __DAAL_COVARIANCE_BATCH_CONTAINER_COMPUTE(singlePassDense, internal::CovarianceDenseBatchKernel) __DAAL_COVARIANCE_BATCH_CONTAINER_COMPUTE(sumDense, internal::CovarianceDenseBatchKernel) __DAAL_COVARIANCE_BATCH_CONTAINER_COMPUTE(fastCSR, internal::CovarianceCSRBatchKernel) __DAAL_COVARIANCE_BATCH_CONTAINER_COMPUTE(singlePassCSR, internal::CovarianceCSRBatchKernel) __DAAL_COVARIANCE_BATCH_CONTAINER_COMPUTE(sumCSR, internal::CovarianceCSRBatchKernel) -__DAAL_COVARIANCE_ONLINE_CONTAINER_CONSTRUCTOR_ONEAPI(defaultDense, internal::CovarianceDenseOnlineKernel) +__DAAL_COVARIANCE_ONLINE_CONTAINER_CONSTRUCTOR(defaultDense, internal::CovarianceDenseOnlineKernel) __DAAL_COVARIANCE_ONLINE_CONTAINER_CONSTRUCTOR(singlePassDense, internal::CovarianceDenseOnlineKernel) __DAAL_COVARIANCE_ONLINE_CONTAINER_CONSTRUCTOR(sumDense, internal::CovarianceDenseOnlineKernel) __DAAL_COVARIANCE_ONLINE_CONTAINER_CONSTRUCTOR(fastCSR, internal::CovarianceCSROnlineKernel) @@ -449,21 +229,21 @@ __DAAL_COVARIANCE_ONLINE_CONTAINER_DESTRUCTOR(fastCSR) __DAAL_COVARIANCE_ONLINE_CONTAINER_DESTRUCTOR(singlePassCSR) 
__DAAL_COVARIANCE_ONLINE_CONTAINER_DESTRUCTOR(sumCSR) -__DAAL_COVARIANCE_ONLINE_CONTAINER_COMPUTE_ONEAPI(defaultDense, internal::CovarianceDenseOnlineKernel) +__DAAL_COVARIANCE_ONLINE_CONTAINER_COMPUTE(defaultDense, internal::CovarianceDenseOnlineKernel) __DAAL_COVARIANCE_ONLINE_CONTAINER_COMPUTE(singlePassDense, internal::CovarianceDenseOnlineKernel) __DAAL_COVARIANCE_ONLINE_CONTAINER_COMPUTE(sumDense, internal::CovarianceDenseOnlineKernel) __DAAL_COVARIANCE_ONLINE_CONTAINER_COMPUTE(fastCSR, internal::CovarianceCSROnlineKernel) __DAAL_COVARIANCE_ONLINE_CONTAINER_COMPUTE(singlePassCSR, internal::CovarianceCSROnlineKernel) __DAAL_COVARIANCE_ONLINE_CONTAINER_COMPUTE(sumCSR, internal::CovarianceCSROnlineKernel) -__DAAL_COVARIANCE_ONLINE_CONTAINER_FINALIZECOMPUTE_ONEAPI(defaultDense, internal::CovarianceDenseOnlineKernel) +__DAAL_COVARIANCE_ONLINE_CONTAINER_FINALIZECOMPUTE(defaultDense, internal::CovarianceDenseOnlineKernel) __DAAL_COVARIANCE_ONLINE_CONTAINER_FINALIZECOMPUTE(singlePassDense, internal::CovarianceDenseOnlineKernel) __DAAL_COVARIANCE_ONLINE_CONTAINER_FINALIZECOMPUTE(sumDense, internal::CovarianceDenseOnlineKernel) __DAAL_COVARIANCE_ONLINE_CONTAINER_FINALIZECOMPUTE(fastCSR, internal::CovarianceCSROnlineKernel) __DAAL_COVARIANCE_ONLINE_CONTAINER_FINALIZECOMPUTE(singlePassCSR, internal::CovarianceCSROnlineKernel) __DAAL_COVARIANCE_ONLINE_CONTAINER_FINALIZECOMPUTE(sumCSR, internal::CovarianceCSROnlineKernel) -__DAAL_COVARIANCE_DISTR_CONTAINER_CONSTRUCTOR_ONEAPI(defaultDense) +__DAAL_COVARIANCE_DISTR_CONTAINER_CONSTRUCTOR(defaultDense) __DAAL_COVARIANCE_DISTR_CONTAINER_CONSTRUCTOR(singlePassDense) __DAAL_COVARIANCE_DISTR_CONTAINER_CONSTRUCTOR(sumDense) __DAAL_COVARIANCE_DISTR_CONTAINER_CONSTRUCTOR(fastCSR) @@ -477,14 +257,14 @@ __DAAL_COVARIANCE_DISTR_CONTAINER_DESTRUCTOR(fastCSR) __DAAL_COVARIANCE_DISTR_CONTAINER_DESTRUCTOR(singlePassCSR) __DAAL_COVARIANCE_DISTR_CONTAINER_DESTRUCTOR(sumCSR) -__DAAL_COVARIANCE_DISTR_CONTAINER_COMPUTE_ONEAPI(defaultDense) 
+__DAAL_COVARIANCE_DISTR_CONTAINER_COMPUTE(defaultDense) __DAAL_COVARIANCE_DISTR_CONTAINER_COMPUTE(singlePassDense) __DAAL_COVARIANCE_DISTR_CONTAINER_COMPUTE(sumDense) __DAAL_COVARIANCE_DISTR_CONTAINER_COMPUTE(fastCSR) __DAAL_COVARIANCE_DISTR_CONTAINER_COMPUTE(singlePassCSR) __DAAL_COVARIANCE_DISTR_CONTAINER_COMPUTE(sumCSR) -__DAAL_COVARIANCE_DISTR_CONTAINER_FINALIZECOMPUTE_ONEAPI(defaultDense) +__DAAL_COVARIANCE_DISTR_CONTAINER_FINALIZECOMPUTE(defaultDense) __DAAL_COVARIANCE_DISTR_CONTAINER_FINALIZECOMPUTE(singlePassDense) __DAAL_COVARIANCE_DISTR_CONTAINER_FINALIZECOMPUTE(sumDense) __DAAL_COVARIANCE_DISTR_CONTAINER_FINALIZECOMPUTE(fastCSR) diff --git a/cpp/daal/src/algorithms/covariance/covariance_dense_default_batch_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/covariance/covariance_dense_default_batch_fpt_dispatcher.cpp index 05fea365ff3..37fc7632532 100644 --- a/cpp/daal/src/algorithms/covariance/covariance_dense_default_batch_fpt_dispatcher.cpp +++ b/cpp/daal/src/algorithms/covariance/covariance_dense_default_batch_fpt_dispatcher.cpp @@ -27,6 +27,6 @@ namespace daal { namespace algorithms { -__DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL(covariance::BatchContainer, batch, DAAL_FPTYPE, covariance::defaultDense) +__DAAL_INSTANTIATE_DISPATCH_CONTAINER(covariance::BatchContainer, batch, DAAL_FPTYPE, covariance::defaultDense) } } // namespace daal diff --git a/cpp/daal/src/algorithms/covariance/covariance_dense_default_batch_oneapi_fpt.cpp b/cpp/daal/src/algorithms/covariance/covariance_dense_default_batch_oneapi_fpt.cpp deleted file mode 100644 index c92b05dcbf5..00000000000 --- a/cpp/daal/src/algorithms/covariance/covariance_dense_default_batch_oneapi_fpt.cpp +++ /dev/null @@ -1,44 +0,0 @@ -/* file: covariance_dense_default_batch_oneapi_fpt.cpp */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except 
in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of Covariance kernel. -//-- -*/ - -#include "src/externals/service_profiler.h" -#include "src/algorithms/covariance/covariance_container.h" -#include "src/algorithms/covariance/oneapi/covariance_dense_batch_oneapi_impl.i" - -namespace daal -{ -namespace algorithms -{ -namespace covariance -{ -namespace oneapi -{ -namespace internal -{ -template class DAAL_EXPORT CovarianceDenseBatchKernelOneAPI; -} -} // namespace oneapi - -} // namespace covariance -} // namespace algorithms -} // namespace daal diff --git a/cpp/daal/src/algorithms/covariance/covariance_dense_default_distr_step2_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/covariance/covariance_dense_default_distr_step2_fpt_dispatcher.cpp index b52b606e94c..2bf7979d4d6 100644 --- a/cpp/daal/src/algorithms/covariance/covariance_dense_default_distr_step2_fpt_dispatcher.cpp +++ b/cpp/daal/src/algorithms/covariance/covariance_dense_default_distr_step2_fpt_dispatcher.cpp @@ -27,6 +27,6 @@ namespace daal { namespace algorithms { -__DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL(covariance::DistributedContainer, distributed, step2Master, DAAL_FPTYPE, covariance::defaultDense) +__DAAL_INSTANTIATE_DISPATCH_CONTAINER(covariance::DistributedContainer, distributed, step2Master, DAAL_FPTYPE, covariance::defaultDense) } } // namespace daal diff --git a/cpp/daal/src/algorithms/covariance/covariance_dense_default_distr_step2_oneapi_fpt.cpp 
b/cpp/daal/src/algorithms/covariance/covariance_dense_default_distr_step2_oneapi_fpt.cpp deleted file mode 100644 index 87db47dd51d..00000000000 --- a/cpp/daal/src/algorithms/covariance/covariance_dense_default_distr_step2_oneapi_fpt.cpp +++ /dev/null @@ -1,44 +0,0 @@ -/* file: covariance_dense_default_distr_step2_oneapi_fpt.cpp */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of Covariance kernel. 
-//-- -*/ - -#include "src/externals/service_profiler.h" -#include "src/algorithms/covariance/covariance_container.h" -#include "src/algorithms/covariance/oneapi/covariance_dense_distr_step2_oneapi.h" -#include "src/algorithms/covariance/oneapi/covariance_dense_distr_step2_oneapi_impl.i" - -namespace daal -{ -namespace algorithms -{ -namespace covariance -{ -namespace oneapi -{ -namespace internal -{ -template class CovarianceDenseDistrStep2KernelOneAPI; -} -} // namespace oneapi -} // namespace covariance -} // namespace algorithms -} // namespace daal diff --git a/cpp/daal/src/algorithms/covariance/covariance_dense_default_online_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/covariance/covariance_dense_default_online_fpt_dispatcher.cpp index 17b8541c473..06752e02b2d 100644 --- a/cpp/daal/src/algorithms/covariance/covariance_dense_default_online_fpt_dispatcher.cpp +++ b/cpp/daal/src/algorithms/covariance/covariance_dense_default_online_fpt_dispatcher.cpp @@ -27,6 +27,6 @@ namespace daal { namespace algorithms { -__DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL(covariance::OnlineContainer, online, DAAL_FPTYPE, covariance::defaultDense) +__DAAL_INSTANTIATE_DISPATCH_CONTAINER(covariance::OnlineContainer, online, DAAL_FPTYPE, covariance::defaultDense) } } // namespace daal diff --git a/cpp/daal/src/algorithms/covariance/covariance_dense_default_online_oneapi_fpt.cpp b/cpp/daal/src/algorithms/covariance/covariance_dense_default_online_oneapi_fpt.cpp deleted file mode 100644 index 31a6b79fe8d..00000000000 --- a/cpp/daal/src/algorithms/covariance/covariance_dense_default_online_oneapi_fpt.cpp +++ /dev/null @@ -1,44 +0,0 @@ -/* file: covariance_dense_default_online_oneapi_fpt.cpp */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of Covariance kernel. -//-- -*/ - -#include "src/externals/service_profiler.h" -#include "src/algorithms/covariance/covariance_container.h" -#include "src/algorithms/covariance/oneapi/covariance_dense_online_oneapi_impl.i" - -namespace daal -{ -namespace algorithms -{ -namespace covariance -{ -namespace oneapi -{ -namespace internal -{ -template class DAAL_EXPORT CovarianceDenseOnlineKernelOneAPI; -} -} // namespace oneapi - -} // namespace covariance -} // namespace algorithms -} // namespace daal diff --git a/cpp/daal/src/algorithms/covariance/covariance_partialresult.h b/cpp/daal/src/algorithms/covariance/covariance_partialresult.h index 7e01c829d71..4876a920d72 100644 --- a/cpp/daal/src/algorithms/covariance/covariance_partialresult.h +++ b/cpp/daal/src/algorithms/covariance/covariance_partialresult.h @@ -25,7 +25,7 @@ #define __COVARIANCE_PARTIALRESULT_ #include "algorithms/covariance/covariance_types.h" -#include "data_management/data/internal/numeric_table_sycl_homogen.h" +#include "data_management/data/homogen_numeric_table.h" using namespace daal::data_management; namespace daal @@ -48,21 +48,9 @@ DAAL_EXPORT services::Status PartialResult::allocate(const daal::algorithms::Inp size_t nColumns = algInput->getNumberOfFeatures(); services::Status status; - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - - if (deviceInfo.isCpu) - { - set(nObservations, HomogenNumericTable::create(1, 
1, NumericTable::doAllocate, &status)); - set(crossProduct, HomogenNumericTable::create(nColumns, nColumns, NumericTable::doAllocate, &status)); - set(sum, HomogenNumericTable::create(nColumns, 1, NumericTable::doAllocate, &status)); - } - else - { - set(nObservations, internal::SyclHomogenNumericTable::create(1, 1, NumericTable::doAllocate, &status)); - set(crossProduct, internal::SyclHomogenNumericTable::create(nColumns, nColumns, NumericTable::doAllocate, &status)); - set(sum, internal::SyclHomogenNumericTable::create(nColumns, 1, NumericTable::doAllocate, &status)); - } + set(nObservations, HomogenNumericTable::create(1, 1, NumericTable::doAllocate, &status)); + set(crossProduct, HomogenNumericTable::create(nColumns, nColumns, NumericTable::doAllocate, &status)); + set(sum, HomogenNumericTable::create(nColumns, 1, NumericTable::doAllocate, &status)); return status; } diff --git a/cpp/daal/src/algorithms/covariance/covariance_result.h b/cpp/daal/src/algorithms/covariance/covariance_result.h index cd6d757359e..939cf9be2f3 100644 --- a/cpp/daal/src/algorithms/covariance/covariance_result.h +++ b/cpp/daal/src/algorithms/covariance/covariance_result.h @@ -25,7 +25,7 @@ #define __COVARIANCE_RESULT_ #include "algorithms/covariance/covariance_types.h" -#include "data_management/data/internal/numeric_table_sycl_homogen.h" +#include "data_management/data/homogen_numeric_table.h" using namespace daal::data_management; namespace daal @@ -47,25 +47,11 @@ DAAL_EXPORT services::Status Result::allocate(const daal::algorithms::Input * in size_t nColumns = algInput->getNumberOfFeatures(); services::Status status; - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); + set(covariance, HomogenNumericTable::create(nColumns, nColumns, NumericTable::doAllocate, &status)); + DAAL_CHECK_STATUS_VAR(status); - if (deviceInfo.isCpu) - { - set(covariance, HomogenNumericTable::create(nColumns, nColumns, NumericTable::doAllocate, 
&status)); - DAAL_CHECK_STATUS_VAR(status); - - set(mean, HomogenNumericTable::create(nColumns, 1, NumericTable::doAllocate, &status)); - DAAL_CHECK_STATUS_VAR(status); - } - else - { - set(covariance, internal::SyclHomogenNumericTable::create(nColumns, nColumns, NumericTable::doAllocate, &status)); - DAAL_CHECK_STATUS_VAR(status); - - set(mean, internal::SyclHomogenNumericTable::create(nColumns, 1, NumericTable::doAllocate, &status)); - DAAL_CHECK_STATUS_VAR(status); - } + set(mean, HomogenNumericTable::create(nColumns, 1, NumericTable::doAllocate, &status)); + DAAL_CHECK_STATUS_VAR(status); return status; } @@ -84,19 +70,8 @@ DAAL_EXPORT services::Status Result::allocate(const daal::algorithms::PartialRes size_t nColumns = pres->getNumberOfFeatures(); services::Status status; - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - - if (deviceInfo.isCpu) - { - set(covariance, HomogenNumericTable::create(nColumns, nColumns, NumericTable::doAllocate, &status)); - set(mean, HomogenNumericTable::create(nColumns, 1, NumericTable::doAllocate, &status)); - } - else - { - set(covariance, internal::SyclHomogenNumericTable::create(nColumns, nColumns, NumericTable::doAllocate, &status)); - set(mean, internal::SyclHomogenNumericTable::create(nColumns, 1, NumericTable::doAllocate, &status)); - } + set(covariance, HomogenNumericTable::create(nColumns, nColumns, NumericTable::doAllocate, &status)); + set(mean, HomogenNumericTable::create(nColumns, 1, NumericTable::doAllocate, &status)); return status; } diff --git a/cpp/daal/src/algorithms/covariance/oneapi/cl_kernels/covariance_kernels.cl b/cpp/daal/src/algorithms/covariance/oneapi/cl_kernels/covariance_kernels.cl deleted file mode 100644 index 61b4d6fbb24..00000000000 --- a/cpp/daal/src/algorithms/covariance/oneapi/cl_kernels/covariance_kernels.cl +++ /dev/null @@ -1,101 +0,0 @@ -/* file: covariance_kernels.cl */ 
-/******************************************************************************* -* Copyright 2019 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of Covariance OpenCL kernels. -//-- -*/ - -#ifndef __COVARIANCE_KERNELS_CL__ -#define __COVARIANCE_KERNELS_CL__ - -#include - -#define DECLARE_SOURCE(name, src) static const char * name = #src; - -DECLARE_SOURCE( - covariance_kernels, - - bool isFirstDataBlock(const algorithmFPType nObservations) { return (nObservations < (algorithmFPType)(0.5)); } - - __kernel void mergeCrossProduct(unsigned int nFeatures, __global const algorithmFPType * partialCrossProduct, - __global const algorithmFPType * partialSums, __global algorithmFPType * crossProduct, - __global const algorithmFPType * sums, const algorithmFPType invPartialNObs, const algorithmFPType invNObs, - const algorithmFPType invNewNObs) { - const unsigned int i = get_global_id(0); - const unsigned int j = get_global_id(1); - - if ((i < nFeatures) && (j < nFeatures)) - { - crossProduct[i * nFeatures + j] += partialCrossProduct[i * nFeatures + j]; - crossProduct[i * nFeatures + j] += partialSums[i] * partialSums[j] * invPartialNObs; - crossProduct[i * nFeatures + j] += sums[i] * sums[j] * invNObs; - crossProduct[i * nFeatures + j] -= (partialSums[i] + sums[i]) * (partialSums[j] + sums[j]) * invNewNObs; - } - } - - __kernel void 
prepareMeansAndCrossProductDiag(unsigned int nFeatures, algorithmFPType nObservations, __global algorithmFPType * crossProduct, - __global algorithmFPType * diagCrossProduct, __global algorithmFPType * sums, - __global algorithmFPType * mean) { - const unsigned int tid = get_global_id(0); - const algorithmFPType invNObservations = (algorithmFPType)(1.0) / nObservations; - - diagCrossProduct[tid] = crossProduct[tid * nFeatures + tid]; - mean[tid] = sums[tid] * invNObservations; - } - - __kernel void finalize(unsigned int nFeatures, algorithmFPType nObservations, __global algorithmFPType * crossProduct, - __global algorithmFPType * cov, __global algorithmFPType * diagCrossProduct, unsigned int isOutputCorrelationMatrix) { - algorithmFPType invNObservationsM1 = (algorithmFPType)(1.0); - - if (nObservations > (algorithmFPType)(1.0)) - { - invNObservationsM1 = (algorithmFPType)(1.0) / (nObservations - (algorithmFPType)(1.0)); - } - - const unsigned int global_row_id = get_global_id(0); - const unsigned int global_col_id = get_global_id(1); - - if ((global_row_id < nFeatures) && (global_col_id < nFeatures)) - { - algorithmFPType covElement = (algorithmFPType)(1.0); - - algorithmFPType crossProductRowElement = diagCrossProduct[global_row_id]; - algorithmFPType crossProductColElement = diagCrossProduct[global_col_id]; - - algorithmFPType crossProductElement = crossProduct[global_row_id * nFeatures + global_col_id]; - - if (!isOutputCorrelationMatrix) - { - covElement = crossProductElement * invNObservationsM1; - } - else if (global_row_id != global_col_id) - { - algorithmFPType sqrtRowElement = (algorithmFPType)(1.0) / sqrt(crossProductRowElement); - algorithmFPType sqrtColElement = (algorithmFPType)(1.0) / sqrt(crossProductColElement); - - covElement = crossProductElement * sqrtRowElement * sqrtColElement; - } - - cov[global_row_id * nFeatures + global_col_id] = covElement; - } - } - -); - -#endif diff --git 
a/cpp/daal/src/algorithms/covariance/oneapi/covariance_dense_batch_oneapi_impl.i b/cpp/daal/src/algorithms/covariance/oneapi/covariance_dense_batch_oneapi_impl.i deleted file mode 100644 index 7462bf758a0..00000000000 --- a/cpp/daal/src/algorithms/covariance/oneapi/covariance_dense_batch_oneapi_impl.i +++ /dev/null @@ -1,105 +0,0 @@ -/* file: covariance_dense_batch_oneapi_impl.i */ -/******************************************************************************* -* Copyright 2019 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -/* -//++ -// Covariance matrix computation algorithm implementation in batch mode -//-- -*/ - -#ifndef __COVARIANCE_DENSE_BATCH_ONEAPI_IMPL_I__ -#define __COVARIANCE_DENSE_BATCH_ONEAPI_IMPL_I__ - -#include "src/algorithms/covariance/oneapi/covariance_kernel_oneapi.h" -#include "src/algorithms/covariance/oneapi/covariance_oneapi_impl.i" - -using namespace daal::services::internal::sycl; - -namespace daal -{ -namespace algorithms -{ -namespace covariance -{ -namespace oneapi -{ -namespace internal -{ -template -services::Status CovarianceDenseBatchKernelOneAPI::compute(NumericTable * dataTable, NumericTable * covTable, - NumericTable * meanTable, const Parameter * parameter) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(computeDenseBatch); - - services::Status status; - - if (dataTable->getNumberOfColumns() > static_cast(services::internal::MaxVal::get())) - { - return services::Status(daal::services::ErrorCovarianceInternal); - } - const uint32_t nFeatures = static_cast(dataTable->getNumberOfColumns()); - - if (dataTable->getNumberOfRows() > static_cast(services::internal::MaxVal::get())) - { - return services::Status(daal::services::ErrorCovarianceInternal); - } - const uint32_t nVectors = static_cast(dataTable->getNumberOfRows()); - - const algorithmFPType nObservations = static_cast(nVectors); - - BlockDescriptor dataBlock; - BlockDescriptor sumBlock; - BlockDescriptor crossProductBlock; - - { - status |= dataTable->getBlockOfRows(0, nVectors, readOnly, dataBlock); - DAAL_CHECK_STATUS_VAR(status); - - status |= meanTable->getBlockOfRows(0, meanTable->getNumberOfRows(), writeOnly, sumBlock); - DAAL_CHECK_STATUS_VAR(status); - - status |= covTable->getBlockOfRows(0, covTable->getNumberOfRows(), writeOnly, crossProductBlock); - DAAL_CHECK_STATUS_VAR(status); - } - - status |= calculateCrossProductAndSums(dataTable, crossProductBlock.getBuffer(), sumBlock.getBuffer()); - 
DAAL_CHECK_STATUS_VAR(status); - - status |= finalizeCovariance(nFeatures, nObservations, crossProductBlock.getBuffer(), sumBlock.getBuffer(), - crossProductBlock.getBuffer(), sumBlock.getBuffer(), parameter); - - { - status |= dataTable->releaseBlockOfRows(dataBlock); - DAAL_CHECK_STATUS_VAR(status); - - status |= meanTable->releaseBlockOfRows(sumBlock); - DAAL_CHECK_STATUS_VAR(status); - - status |= covTable->releaseBlockOfRows(crossProductBlock); - DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -} // namespace internal -} // namespace oneapi -} // namespace covariance -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/covariance/oneapi/covariance_dense_distr_step2_oneapi.h b/cpp/daal/src/algorithms/covariance/oneapi/covariance_dense_distr_step2_oneapi.h deleted file mode 100644 index 7335b6fb270..00000000000 --- a/cpp/daal/src/algorithms/covariance/oneapi/covariance_dense_distr_step2_oneapi.h +++ /dev/null @@ -1,61 +0,0 @@ -/* file: covariance_dense_distr_step2_oneapi.h */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Declaration of template structs that calculate Covariance matrix. 
-//-- -*/ - -#ifndef __COVARIANCE_DENSE_DISTR_STEP2_ONEAPI_H__ -#define __COVARIANCE_DENSE_DISTR_STEP2_ONEAPI_H__ - -#include "data_management/data/numeric_table.h" -#include "algorithms/algorithm_base_common.h" -#include "algorithms/covariance/covariance_types.h" - -using namespace daal::services; -using namespace daal::data_management; - -namespace daal -{ -namespace algorithms -{ -namespace covariance -{ -namespace oneapi -{ -namespace internal -{ -template -class CovarianceDenseDistrStep2KernelOneAPI : public Kernel -{ -public: - services::Status compute(DataCollection * partialResultsCollection, NumericTable * nObsTable, NumericTable * crossProductTable, - NumericTable * sumTable, const Parameter * parameter); - - services::Status finalizeCompute(NumericTable * nObsTable, NumericTable * crossProductTable, NumericTable * sumTable, NumericTable * covTable, - NumericTable * meanTable, const Parameter * parameter); -}; - -} // namespace internal -} // namespace oneapi -} // namespace covariance -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/covariance/oneapi/covariance_dense_distr_step2_oneapi_impl.i b/cpp/daal/src/algorithms/covariance/oneapi/covariance_dense_distr_step2_oneapi_impl.i deleted file mode 100644 index 0eaca70c231..00000000000 --- a/cpp/daal/src/algorithms/covariance/oneapi/covariance_dense_distr_step2_oneapi_impl.i +++ /dev/null @@ -1,215 +0,0 @@ -/* file: covariance_dense_distr_step2_oneapi_impl.i */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Covariance matrix computation algorithm implementation in distributed mode -//-- -*/ - -#ifndef __COVARIANCE_DENSE_DISTR_STEP2_ONEAPI_IMPL_I__ -#define __COVARIANCE_DENSE_DISTR_STEP2_ONEAPI_IMPL_I__ - -#include "src/algorithms/covariance/oneapi/covariance_dense_distr_step2_oneapi.h" -#include "src/algorithms/covariance/oneapi/covariance_oneapi_impl.i" - -using namespace daal::services::internal::sycl; - -namespace daal -{ -namespace algorithms -{ -namespace covariance -{ -namespace oneapi -{ -namespace internal -{ -template -services::Status CovarianceDenseDistrStep2KernelOneAPI::compute(DataCollection * partialResultsCollection, - NumericTable * nObsTable, NumericTable * crossProductTable, - NumericTable * sumTable, const Parameter * parameter) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(computeDistr); - - auto & context = services::internal::getDefaultContext(); - - services::Status status; - - if (crossProductTable->getNumberOfColumns() > static_cast(services::internal::MaxVal::get())) - { - return services::Status(daal::services::ErrorCovarianceInternal); - } - - const uint32_t nFeatures = static_cast(crossProductTable->getNumberOfColumns()); - const size_t collectionSize = partialResultsCollection->size(); - - BlockDescriptor sumBlock; - BlockDescriptor crossProductBlock; - BlockDescriptor nObservationsBlock; - - status |= sumTable->getBlockOfRows(0, sumTable->getNumberOfRows(), readWrite, sumBlock); - DAAL_CHECK_STATUS_VAR(status); - status |= 
crossProductTable->getBlockOfRows(0, crossProductTable->getNumberOfRows(), readWrite, crossProductBlock); - DAAL_CHECK_STATUS_VAR(status); - status |= nObsTable->getBlockOfRows(0, nObsTable->getNumberOfRows(), readWrite, nObservationsBlock); - DAAL_CHECK_STATUS_VAR(status); - - const algorithmFPType zero = 0.0; - context.fill(sumBlock.getBuffer(), zero, status); - DAAL_CHECK_STATUS_VAR(status); - context.fill(crossProductBlock.getBuffer(), zero, status); - DAAL_CHECK_STATUS_VAR(status); - context.fill(nObservationsBlock.getBuffer(), zero, status); - DAAL_CHECK_STATUS_VAR(status); - - for (size_t i = 0; i < collectionSize; i++) - { - PartialResult * patrialResult = static_cast((*partialResultsCollection)[i].get()); - NumericTable * partialSumsTable = patrialResult->get(covariance::sum).get(); - NumericTable * partialCrossProductTable = patrialResult->get(covariance::crossProduct).get(); - NumericTable * partialNObservationsTable = patrialResult->get(covariance::nObservations).get(); - - BlockDescriptor partialSumsBlock; - BlockDescriptor partialCrossProductBlock; - BlockDescriptor partialNObservationsBlock; - - status |= partialSumsTable->getBlockOfRows(0, partialSumsTable->getNumberOfRows(), readWrite, partialSumsBlock); - DAAL_CHECK_STATUS_VAR(status); - status |= partialCrossProductTable->getBlockOfRows(0, partialCrossProductTable->getNumberOfRows(), readWrite, partialCrossProductBlock); - DAAL_CHECK_STATUS_VAR(status); - status |= partialNObservationsTable->getBlockOfRows(0, partialNObservationsTable->getNumberOfRows(), readWrite, partialNObservationsBlock); - DAAL_CHECK_STATUS_VAR(status); - - if (i == 0) - { - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(uint32_t, nFeatures, nFeatures); - DAAL_ASSERT_UNIVERSAL_BUFFER(UniversalBuffer(crossProductBlock.getBuffer()), algorithmFPType, nFeatures * nFeatures); - DAAL_ASSERT_UNIVERSAL_BUFFER(UniversalBuffer(partialCrossProductBlock.getBuffer()), algorithmFPType, nFeatures * nFeatures); - 
context.copy(crossProductBlock.getBuffer(), 0, partialCrossProductBlock.getBuffer(), 0, nFeatures * nFeatures, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_ASSERT_UNIVERSAL_BUFFER(UniversalBuffer(partialSumsBlock.getBuffer()), algorithmFPType, nFeatures); - DAAL_ASSERT_UNIVERSAL_BUFFER(UniversalBuffer(sumBlock.getBuffer()), algorithmFPType, nFeatures); - context.copy(sumBlock.getBuffer(), 0, partialSumsBlock.getBuffer(), 0, nFeatures, status); - DAAL_CHECK_STATUS_VAR(status); - } - else - { - DAAL_ASSERT_UNIVERSAL_BUFFER(UniversalBuffer(partialNObservationsBlock.getBuffer()), algorithmFPType, 1); - const auto partialNObservationsBlockHost = partialNObservationsBlock.getBuffer().toHost(data_management::readOnly, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_ASSERT_UNIVERSAL_BUFFER(UniversalBuffer(nObservationsBlock.getBuffer()), algorithmFPType, 1); - const auto nObservationsBlockHost = nObservationsBlock.getBuffer().toHost(data_management::readOnly, status); - DAAL_CHECK_STATUS_VAR(status); - - status |= mergeCrossProduct(nFeatures, partialCrossProductBlock.getBuffer(), partialSumsBlock.getBuffer(), - *partialNObservationsBlockHost, crossProductBlock.getBuffer(), sumBlock.getBuffer(), - *nObservationsBlockHost); - DAAL_CHECK_STATUS_VAR(status); - - status |= mergeSums(nFeatures, partialSumsBlock.getBuffer(), sumBlock.getBuffer()); - DAAL_CHECK_STATUS_VAR(status); - } - - DAAL_ASSERT_UNIVERSAL_BUFFER(UniversalBuffer(partialNObservationsBlock.getBuffer()), algorithmFPType, 1); - DAAL_ASSERT_UNIVERSAL_BUFFER(UniversalBuffer(nObservationsBlock.getBuffer()), algorithmFPType, 1); - status |= BlasGpu::xaxpy(1, 1, partialNObservationsBlock.getBuffer(), 1, nObservationsBlock.getBuffer(), 1); - DAAL_CHECK_STATUS_VAR(status); - status |= partialSumsTable->releaseBlockOfRows(partialSumsBlock); - DAAL_CHECK_STATUS_VAR(status); - status |= partialCrossProductTable->releaseBlockOfRows(partialCrossProductBlock); - DAAL_CHECK_STATUS_VAR(status); - status |= 
partialNObservationsTable->releaseBlockOfRows(partialNObservationsBlock); - DAAL_CHECK_STATUS_VAR(status); - } - - status |= sumTable->releaseBlockOfRows(sumBlock); - DAAL_CHECK_STATUS_VAR(status); - status |= crossProductTable->releaseBlockOfRows(crossProductBlock); - DAAL_CHECK_STATUS_VAR(status); - status |= nObsTable->releaseBlockOfRows(nObservationsBlock); - DAAL_CHECK_STATUS_VAR(status); - - return status; -} - -template -services::Status CovarianceDenseDistrStep2KernelOneAPI::finalizeCompute(NumericTable * nObservationsTable, - NumericTable * crossProductTable, - NumericTable * sumTable, NumericTable * covTable, - NumericTable * meanTable, - const Parameter * parameter) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(finalizeComputeDistr); - - services::Status status; - - if (crossProductTable->getNumberOfColumns() > static_cast(services::internal::MaxVal::get())) - { - return services::Status(daal::services::ErrorCovarianceInternal); - } - - const uint32_t nFeatures = static_cast(crossProductTable->getNumberOfColumns()); - - BlockDescriptor sumBlock; - BlockDescriptor covBlock; - BlockDescriptor meanBlock; - BlockDescriptor crossProductBlock; - BlockDescriptor nObservationsBlock; - - status |= sumTable->getBlockOfRows(0, sumTable->getNumberOfRows(), readWrite, sumBlock); - DAAL_CHECK_STATUS_VAR(status); - status |= covTable->getBlockOfRows(0, covTable->getNumberOfRows(), readWrite, covBlock); - DAAL_CHECK_STATUS_VAR(status); - status |= meanTable->getBlockOfRows(0, meanTable->getNumberOfRows(), readWrite, meanBlock); - DAAL_CHECK_STATUS_VAR(status); - status |= crossProductTable->getBlockOfRows(0, crossProductTable->getNumberOfRows(), readWrite, crossProductBlock); - DAAL_CHECK_STATUS_VAR(status); - status |= nObservationsTable->getBlockOfRows(0, nObservationsTable->getNumberOfRows(), readWrite, nObservationsBlock); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_ASSERT_UNIVERSAL_BUFFER(UniversalBuffer(nObservationsBlock.getBuffer()), algorithmFPType, 1); - const auto 
nObservationsBlockHost = nObservationsBlock.getBuffer().toHost(data_management::readOnly, status); - DAAL_CHECK_STATUS_VAR(status); - - status |= finalizeCovariance(nFeatures, *nObservationsBlockHost, crossProductBlock.getBuffer(), sumBlock.getBuffer(), - covBlock.getBuffer(), meanBlock.getBuffer(), parameter); - DAAL_CHECK_STATUS_VAR(status); - - status |= sumTable->releaseBlockOfRows(sumBlock); - DAAL_CHECK_STATUS_VAR(status); - status |= crossProductTable->releaseBlockOfRows(crossProductBlock); - DAAL_CHECK_STATUS_VAR(status); - status |= nObservationsTable->releaseBlockOfRows(nObservationsBlock); - DAAL_CHECK_STATUS_VAR(status); - status |= meanTable->releaseBlockOfRows(meanBlock); - DAAL_CHECK_STATUS_VAR(status); - status |= covTable->releaseBlockOfRows(covBlock); - DAAL_CHECK_STATUS_VAR(status); - - return status; -} - -} // namespace internal -} // namespace oneapi -} // namespace covariance -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/covariance/oneapi/covariance_dense_online_oneapi_impl.i b/cpp/daal/src/algorithms/covariance/oneapi/covariance_dense_online_oneapi_impl.i deleted file mode 100644 index 8356f3b0b11..00000000000 --- a/cpp/daal/src/algorithms/covariance/oneapi/covariance_dense_online_oneapi_impl.i +++ /dev/null @@ -1,195 +0,0 @@ -/* file: covariance_dense_online_oneapi_impl.i */ -/******************************************************************************* -* Copyright 2019 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Covariance matrix computation algorithm implementation in batch mode -//-- -*/ - -#ifndef __COVARIANCE_DENSE_ONLINE_ONEAPI_IMPL_I__ -#define __COVARIANCE_DENSE_ONLINE_ONEAPI_IMPL_I__ - -#include "src/algorithms/covariance/oneapi/covariance_kernel_oneapi.h" -#include "src/algorithms/covariance/oneapi/covariance_oneapi_impl.i" - -using namespace daal::services::internal::sycl; - -namespace daal -{ -namespace algorithms -{ -namespace covariance -{ -namespace oneapi -{ -namespace internal -{ -template -inline bool isFirstDataBlock(algorithmFPType nObservations) -{ - return (nObservations < static_cast(0.5)); -} - -template -services::Status CovarianceDenseOnlineKernelOneAPI::compute(NumericTable * dataTable, NumericTable * nObsTable, - NumericTable * crossProductTable, NumericTable * sumTable, - const Parameter * parameter) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(computeDenseOnline); - services::Status status; - - auto & context = Environment::getInstance()->getDefaultExecutionContext(); - algorithmFPType * nObservations = nullptr; - algorithmFPType partialNObservations = 0.0; - - if (dataTable->getNumberOfColumns() > static_cast(services::internal::MaxVal::get())) - { - return services::Status(daal::services::ErrorCovarianceInternal); - } - const uint32_t nFeatures = static_cast(dataTable->getNumberOfColumns()); - - if (dataTable->getNumberOfRows() > static_cast(services::internal::MaxVal::get())) - { - return services::Status(daal::services::ErrorCovarianceInternal); - } - const uint32_t nVectors = static_cast(dataTable->getNumberOfRows()); - - partialNObservations = static_cast(nVectors); - - BlockDescriptor dataBlock; - BlockDescriptor sumBlock; - BlockDescriptor crossProductBlock; - BlockDescriptor nObsBlock; - - { - status |= dataTable->getBlockOfRows(0, nVectors, 
readWrite, dataBlock); - DAAL_CHECK_STATUS_VAR(status); - - status |= sumTable->getBlockOfRows(0, sumTable->getNumberOfRows(), readWrite, sumBlock); - DAAL_CHECK_STATUS_VAR(status); - - status |= crossProductTable->getBlockOfRows(0, crossProductTable->getNumberOfRows(), readWrite, crossProductBlock); - DAAL_CHECK_STATUS_VAR(status); - - status |= nObsTable->getBlockOfRows(0, crossProductTable->getNumberOfRows(), readWrite, nObsBlock); - DAAL_CHECK_STATUS_VAR(status); - nObservations = nObsBlock.getBlockPtr(); - DAAL_ASSERT(nObservations != nullptr); - } - - if (isFirstDataBlock(*nObservations)) - { - DAAL_ASSERT_UNIVERSAL_BUFFER(UniversalBuffer(crossProductBlock.getBuffer()), algorithmFPType, nFeatures * nFeatures); - DAAL_ASSERT_UNIVERSAL_BUFFER(UniversalBuffer(sumBlock.getBuffer()), algorithmFPType, nFeatures); - status |= calculateCrossProductAndSums(dataTable, crossProductBlock.getBuffer(), sumBlock.getBuffer()); - DAAL_CHECK_STATUS_VAR(status); - } - else - { - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(uint32_t, nFeatures, nFeatures); - auto partialCrossProductBlock = context.allocate(TypeIds::id(), nFeatures * nFeatures, status); - DAAL_CHECK_STATUS_VAR(status); - - auto partialSumBlock = context.allocate(TypeIds::id(), nFeatures, status); - DAAL_CHECK_STATUS_VAR(status); - - status |= calculateCrossProductAndSums(dataTable, partialCrossProductBlock.template get(), - partialSumBlock.template get()); - DAAL_CHECK_STATUS_VAR(status); - - status |= mergeCrossProduct(nFeatures, partialCrossProductBlock.template get(), - partialSumBlock.template get(), partialNObservations, - crossProductBlock.getBuffer(), sumBlock.getBuffer(), *nObservations); - DAAL_CHECK_STATUS_VAR(status); - - status |= mergeSums(nFeatures, partialSumBlock.template get(), sumBlock.getBuffer()); - DAAL_CHECK_STATUS_VAR(status); - } - - *nObservations += partialNObservations; - - DAAL_CHECK_STATUS_VAR(dataTable->releaseBlockOfRows(dataBlock)); - 
DAAL_CHECK_STATUS_VAR(sumTable->releaseBlockOfRows(sumBlock)); - DAAL_CHECK_STATUS_VAR(crossProductTable->releaseBlockOfRows(crossProductBlock)); - DAAL_CHECK_STATUS_VAR(nObsTable->releaseBlockOfRows(nObsBlock)); - - return status; -} - -template -services::Status CovarianceDenseOnlineKernelOneAPI::finalizeCompute(NumericTable * nObservationsTable, - NumericTable * crossProductTable, - NumericTable * sumTable, NumericTable * covTable, - NumericTable * meanTable, const Parameter * parameter) -{ - services::Status status; - - if (crossProductTable->getNumberOfColumns() > static_cast(services::internal::MaxVal::get())) - { - return services::Status(daal::services::ErrorCovarianceInternal); - } - - const uint32_t nFeatures = static_cast(crossProductTable->getNumberOfColumns()); - algorithmFPType * nObservations = nullptr; - - BlockDescriptor dataBlock; - BlockDescriptor sumBlock; - BlockDescriptor meanBlock; - BlockDescriptor covBlock; - BlockDescriptor crossProductBlock; - BlockDescriptor nObservationsBlock; - - { - status |= sumTable->getBlockOfRows(0, sumTable->getNumberOfRows(), readWrite, sumBlock); - DAAL_CHECK_STATUS_VAR(status); - - status |= crossProductTable->getBlockOfRows(0, crossProductTable->getNumberOfRows(), readWrite, crossProductBlock); - DAAL_CHECK_STATUS_VAR(status); - - status |= covTable->getBlockOfRows(0, covTable->getNumberOfRows(), readWrite, covBlock); - DAAL_CHECK_STATUS_VAR(status); - - status |= meanTable->getBlockOfRows(0, meanTable->getNumberOfRows(), readWrite, meanBlock); - DAAL_CHECK_STATUS_VAR(status); - - status |= nObservationsTable->getBlockOfRows(0, crossProductTable->getNumberOfRows(), readWrite, nObservationsBlock); - DAAL_CHECK_STATUS_VAR(status); - nObservations = nObservationsBlock.getBlockPtr(); - DAAL_ASSERT(nObservations != nullptr); - } - - status |= finalizeCovariance(nFeatures, *nObservations, crossProductBlock.getBuffer(), sumBlock.getBuffer(), - covBlock.getBuffer(), meanBlock.getBuffer(), parameter); - 
DAAL_CHECK_STATUS_VAR(status); - - DAAL_CHECK_STATUS_VAR(sumTable->releaseBlockOfRows(sumBlock)); - DAAL_CHECK_STATUS_VAR(crossProductTable->releaseBlockOfRows(crossProductBlock)); - DAAL_CHECK_STATUS_VAR(nObservationsTable->releaseBlockOfRows(nObservationsBlock)); - DAAL_CHECK_STATUS_VAR(meanTable->releaseBlockOfRows(meanBlock)); - DAAL_CHECK_STATUS_VAR(covTable->releaseBlockOfRows(covBlock)); - - return status; -} - -} // namespace internal -} // namespace oneapi -} // namespace covariance -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/covariance/oneapi/covariance_kernel_oneapi.h b/cpp/daal/src/algorithms/covariance/oneapi/covariance_kernel_oneapi.h deleted file mode 100644 index 10ebfa2ced3..00000000000 --- a/cpp/daal/src/algorithms/covariance/oneapi/covariance_kernel_oneapi.h +++ /dev/null @@ -1,69 +0,0 @@ -/* file: covariance_kernel_oneapi.h */ -/******************************************************************************* -* Copyright 2019 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Declaration of template structs that calculate Covariance matrix. 
-//-- -*/ - -#ifndef __COVARIANCE_KERNEL_ONEAPI_H__ -#define __COVARIANCE_KERNEL_ONEAPI_H__ - -#include "src/algorithms/kernel.h" -#include "data_management/data/numeric_table.h" -#include "algorithms/algorithm_base_common.h" -#include "algorithms/covariance/covariance_types.h" - -using namespace daal::services; -using namespace daal::data_management; - -namespace daal -{ -namespace algorithms -{ -namespace covariance -{ -namespace oneapi -{ -namespace internal -{ -template -class CovarianceDenseBatchKernelOneAPI : public Kernel -{ -public: - services::Status compute(NumericTable * dataTable, NumericTable * covTable, NumericTable * meanTable, const Parameter * parameter); -}; - -template -class CovarianceDenseOnlineKernelOneAPI : public Kernel -{ -public: - services::Status compute(NumericTable * dataTable, NumericTable * nObsTable, NumericTable * crossProductTable, NumericTable * sumTable, - const Parameter * parameter); - - services::Status finalizeCompute(NumericTable * nObservationsTable, NumericTable * crossProductTable, NumericTable * sumTable, - NumericTable * covTable, NumericTable * meanTable, const Parameter * parameter); -}; - -} // namespace internal -} // namespace oneapi -} // namespace covariance -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/covariance/oneapi/covariance_oneapi_impl.i b/cpp/daal/src/algorithms/covariance/oneapi/covariance_oneapi_impl.i deleted file mode 100644 index bb25a08474a..00000000000 --- a/cpp/daal/src/algorithms/covariance/oneapi/covariance_oneapi_impl.i +++ /dev/null @@ -1,432 +0,0 @@ -/* file: covariance_oneapi_impl.i */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Covariance matrix computation algorithm implementation -//-- -*/ - -#ifndef __COVARIANCE_ONEAPI_IMPL_I__ -#define __COVARIANCE_ONEAPI_IMPL_I__ - -#include "services/internal/buffer.h" -#include "data_management/data/numeric_table.h" -#include "services/env_detect.h" -#include "services/error_indexes.h" -#include "src/sycl/blas_gpu.h" -#include "src/sycl/reducer.h" -#include "src/algorithms/covariance/oneapi/cl_kernels/covariance_kernels.cl" -#include "src/externals/service_profiler.h" -#include "src/services/service_data_utils.h" - -using namespace daal::services::internal; -using namespace daal::services::internal::sycl; - -namespace daal -{ -namespace algorithms -{ -namespace covariance -{ -namespace oneapi -{ -namespace internal -{ -template -static services::Status buildProgram(ClKernelFactoryIface & factory) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.buildProgram); - - services::Status status; - - auto fptype_name = getKeyFPType(); - auto build_options = fptype_name; - build_options.add("-cl-std=CL1.2"); - - services::String cachekey("__daal_algorithms_covariance_dense_batch_finalizeCovariance_"); - cachekey.add(fptype_name); - factory.build(ExecutionTargetIds::device, cachekey.c_str(), covariance_kernels, build_options.c_str(), status); - DAAL_CHECK_STATUS_VAR(status); - - return status; -} - -static uint32_t getGlobalRangeSize(uint32_t localRangeSize, uint32_t N) -{ - DAAL_ASSERT(localRangeSize != 0); - uint32_t factor = N / localRangeSize; - - if (factor * 
localRangeSize != N) - { - factor++; - } - return factor * localRangeSize; -} - -static KernelNDRange getKernelNDRange(uint32_t localRangeSize, uint32_t globalRangeSize, services::Status & status) -{ - KernelNDRange ndrange(2); - KernelRange local_range(localRangeSize, localRangeSize); - KernelRange global_range(globalRangeSize, globalRangeSize); - - ndrange.global(global_range, status); - DAAL_CHECK_STATUS_RETURN_IF_FAIL(status, ndrange); - - ndrange.local(local_range, status); - DAAL_CHECK_STATUS_RETURN_IF_FAIL(status, ndrange); - - return ndrange; -} - -template -services::Status prepareSums(NumericTable * dataTable, const services::internal::Buffer & sumsBuffer) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.prepareSums); - - const uint32_t nFeatures = dataTable->getNumberOfColumns(); - auto & context = services::internal::getDefaultContext(); - services::Status status; - - if (method == sumDense || method == sumCSR) - { - NumericTable * dataSumsTable = dataTable->basicStatistics.get(NumericTable::sum).get(); - - BlockDescriptor userSums; - DAAL_CHECK_STATUS_VAR(dataSumsTable->getBlockOfRows(0, dataSumsTable->getNumberOfRows(), readOnly, userSums)); - - DAAL_ASSERT_UNIVERSAL_BUFFER(UniversalBuffer(sumsBuffer), algorithmFPType, nFeatures); - DAAL_ASSERT_UNIVERSAL_BUFFER(UniversalBuffer(userSums.getBuffer()), algorithmFPType, nFeatures); - context.copy(sumsBuffer, 0, userSums.getBuffer(), 0, nFeatures, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_CHECK_STATUS_VAR(dataSumsTable->releaseBlockOfRows(userSums)); - } - else - { - const algorithmFPType zero = 0.0; - DAAL_ASSERT_UNIVERSAL_BUFFER(UniversalBuffer(sumsBuffer), algorithmFPType, nFeatures); - context.fill(sumsBuffer, zero, status); - DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -template -services::Status prepareCrossProduct(const services::internal::Buffer & crossProductBuffer, uint32_t nFeatures) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.prepareCrossProduct); - - const algorithmFPType zero = 
0.0; - - auto & context = services::internal::getDefaultContext(); - services::Status status; - - DAAL_ASSERT_UNIVERSAL_BUFFER(UniversalBuffer(crossProductBuffer), algorithmFPType, nFeatures * nFeatures); - context.fill(crossProductBuffer, zero, status); - DAAL_CHECK_STATUS_VAR(status); - - return status; -} - -template -services::Status updateDenseCrossProductAndSums(bool isNormalized, uint32_t nFeatures, uint32_t nVectors, - const services::internal::Buffer & dataBlock, - const services::internal::Buffer & crossProduct, - const services::internal::Buffer & sums) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.updateCrossProductAndSums); - auto & context = services::internal::getDefaultContext(); - bool nonNormalizedFastDense = ((!isNormalized) && (method == defaultDense || method == sumDense)); - - if (isNormalized || nonNormalizedFastDense) - { - services::Status status; - - DAAL_ASSERT(nVectors != 0); - algorithmFPType nVectorsInv = algorithmFPType(1.0) / algorithmFPType(nVectors); - algorithmFPType beta = (isNormalized == true) ? 
algorithmFPType(0.0) : -nVectorsInv; - - if (!isNormalized) - { - DAAL_ASSERT_UNIVERSAL_BUFFER(UniversalBuffer(dataBlock), algorithmFPType, nFeatures * nVectors); - auto sumResult = math::SumReducer::sum(math::Layout::ColMajor, dataBlock, nFeatures, nVectors, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_ASSERT_UNIVERSAL_BUFFER(UniversalBuffer(sums), algorithmFPType, sums.size()); - DAAL_ASSERT_UNIVERSAL_BUFFER(UniversalBuffer(sumResult.sum), algorithmFPType, sums.size()); - context.copy(sums, 0, sumResult.sum, 0, sums.size(), status); - DAAL_CHECK_STATUS_VAR(status); - - { - DAAL_ITTNOTIFY_SCOPED_TASK(compute.updateCrossProductAndSums.gemmSums); - - DAAL_ASSERT_UNIVERSAL_BUFFER(UniversalBuffer(sums), algorithmFPType, nFeatures); - DAAL_ASSERT_UNIVERSAL_BUFFER(UniversalBuffer(crossProduct), algorithmFPType, nFeatures * nFeatures); - status |= BlasGpu::xgemm(math::Layout::RowMajor, math::Transpose::Trans, math::Transpose::NoTrans, nFeatures, - nFeatures, 1, algorithmFPType(1.0), sums, nFeatures, 0, sums, nFeatures, 0, - algorithmFPType(0.0), crossProduct, nFeatures, 0); - } - DAAL_CHECK_STATUS_VAR(status); - } - - { - DAAL_ITTNOTIFY_SCOPED_TASK(compute.updateCrossProductAndSums.gemmData); - - DAAL_ASSERT_UNIVERSAL_BUFFER(UniversalBuffer(dataBlock), algorithmFPType, nFeatures * nVectors); - DAAL_ASSERT_UNIVERSAL_BUFFER(UniversalBuffer(crossProduct), algorithmFPType, nFeatures * nFeatures); - status |= BlasGpu::xgemm(math::Layout::RowMajor, math::Transpose::Trans, math::Transpose::NoTrans, nFeatures, nFeatures, - nVectors, algorithmFPType(1.0), dataBlock, nFeatures, 0, dataBlock, nFeatures, 0, beta, - crossProduct, nFeatures, 0); - } - DAAL_CHECK_STATUS_VAR(status); - } - else - { - return services::ErrorMethodNotImplemented; - } - - return services::Status(); -} - -template -services::Status mergeCrossProduct(uint32_t nFeatures, const services::internal::Buffer & partialCrossProduct, - const services::internal::Buffer & partialSums, algorithmFPType 
partialNObservations, - const services::internal::Buffer & crossProduct, - const services::internal::Buffer & sums, algorithmFPType nObservations) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.mergeCrossProduct); - services::Status status; - - auto & context = services::internal::getDefaultContext(); - auto & factory = context.getClKernelFactory(); - status |= buildProgram(factory); - DAAL_CHECK_STATUS_VAR(status); - - auto kernel = factory.getKernel("mergeCrossProduct", status); - DAAL_CHECK_STATUS_VAR(status); - - { - DAAL_ASSERT_UNIVERSAL_BUFFER(UniversalBuffer(crossProduct), algorithmFPType, nFeatures * nFeatures); - DAAL_ASSERT_UNIVERSAL_BUFFER(UniversalBuffer(partialCrossProduct), algorithmFPType, nFeatures * nFeatures); - DAAL_ASSERT_UNIVERSAL_BUFFER(UniversalBuffer(sums), algorithmFPType, nFeatures); - DAAL_ASSERT_UNIVERSAL_BUFFER(UniversalBuffer(partialSums), algorithmFPType, nFeatures); - - DAAL_ASSERT(partialNObservations != 0); - DAAL_ASSERT(nObservations != 0); - DAAL_ASSERT(nObservations + partialNObservations != 0); - - const algorithmFPType invPartialNObs = (algorithmFPType)(1.0) / partialNObservations; - const algorithmFPType invNObs = (algorithmFPType)(1.0) / nObservations; - const algorithmFPType invNewNObs = (algorithmFPType)(1.0) / (nObservations + partialNObservations); - - KernelArguments args(8, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, nFeatures); - args.set(1, partialCrossProduct, AccessModeIds::read); - args.set(2, partialSums, AccessModeIds::read); - args.set(3, crossProduct, AccessModeIds::readwrite); - args.set(4, sums, AccessModeIds::read); - args.set(5, invPartialNObs); - args.set(6, invNObs); - args.set(7, invNewNObs); - - const uint32_t localRangeSize = 16; - KernelNDRange ndrange = getKernelNDRange(localRangeSize, getGlobalRangeSize(localRangeSize, nFeatures), status); - DAAL_CHECK_STATUS_VAR(status); - - context.run(ndrange, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -template 
-services::Status mergeSums(uint32_t nFeatures, const services::internal::Buffer & partialSums, - const services::internal::Buffer & sums) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.mergeSums); - services::Status status; - - DAAL_ASSERT_UNIVERSAL_BUFFER(UniversalBuffer(partialSums), algorithmFPType, nFeatures); - DAAL_ASSERT_UNIVERSAL_BUFFER(UniversalBuffer(sums), algorithmFPType, nFeatures); - status |= BlasGpu::xaxpy(nFeatures, 1, partialSums, 1, sums, 1); - - DAAL_CHECK_STATUS_VAR(status); - - return status; -} - -template -services::Status prepareMeansAndCrossProductDiag(uint32_t nFeatures, algorithmFPType nObservations, - const services::internal::Buffer & crossProduct, - const services::internal::Buffer & diagCrossProduct, - const services::internal::Buffer & sums, - const services::internal::Buffer & means) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.prepareMeansAndCrossProductDiag); - - services::Status status; - - auto & context = services::internal::getDefaultContext(); - auto & factory = context.getClKernelFactory(); - status |= buildProgram(factory); - DAAL_CHECK_STATUS_VAR(status); - - auto kernel = factory.getKernel("prepareMeansAndCrossProductDiag", status); - DAAL_CHECK_STATUS_VAR(status); - { - DAAL_ASSERT_UNIVERSAL_BUFFER(UniversalBuffer(crossProduct), algorithmFPType, nFeatures * nFeatures); - DAAL_ASSERT_UNIVERSAL_BUFFER(UniversalBuffer(diagCrossProduct), algorithmFPType, nFeatures); - DAAL_ASSERT_UNIVERSAL_BUFFER(UniversalBuffer(sums), algorithmFPType, nFeatures); - DAAL_ASSERT_UNIVERSAL_BUFFER(UniversalBuffer(means), algorithmFPType, nFeatures); - - KernelArguments args(6, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, nFeatures); - args.set(1, nObservations); - args.set(2, crossProduct, AccessModeIds::read); - args.set(3, diagCrossProduct, AccessModeIds::write); - args.set(4, sums, AccessModeIds::readwrite); - args.set(5, means, AccessModeIds::readwrite); - - KernelRange range(nFeatures); - context.run(range, kernel, args, status); - 
DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -template -services::Status finalize(uint32_t nFeatures, algorithmFPType nObservations, const services::internal::Buffer & crossProduct, - const services::internal::Buffer & cov, - const services::internal::Buffer & diagCrossProduct, const Parameter * parameter) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.finalize); - - services::Status status; - - auto & context = services::internal::getDefaultContext(); - auto & factory = context.getClKernelFactory(); - status |= buildProgram(factory); - DAAL_CHECK_STATUS_VAR(status); - - auto kernel = factory.getKernel("finalize", status); - DAAL_CHECK_STATUS_VAR(status); - - { - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(uint32_t, nFeatures, nFeatures); - DAAL_ASSERT_UNIVERSAL_BUFFER(UniversalBuffer(crossProduct), algorithmFPType, nFeatures * nFeatures); - DAAL_ASSERT_UNIVERSAL_BUFFER(UniversalBuffer(cov), algorithmFPType, nFeatures * nFeatures); - DAAL_ASSERT_UNIVERSAL_BUFFER(UniversalBuffer(diagCrossProduct), algorithmFPType, nFeatures); - - uint32_t isOutputCorrelationMatrix = static_cast(parameter->outputMatrixType == covariance::correlationMatrix); - - KernelArguments args(6, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, nFeatures); - args.set(1, nObservations); - args.set(2, crossProduct, AccessModeIds::read); - args.set(3, cov, AccessModeIds::readwrite); - args.set(4, diagCrossProduct, AccessModeIds::read); - args.set(5, isOutputCorrelationMatrix); - - const uint32_t localRangeSize = 4; - KernelNDRange ndrange = getKernelNDRange(localRangeSize, getGlobalRangeSize(localRangeSize, nFeatures), status); - DAAL_CHECK_STATUS_VAR(status); - - context.run(ndrange, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -template -services::Status calculateCrossProductAndSums(NumericTable * dataTable, const services::internal::Buffer & crossProduct, - const services::internal::Buffer & sums) -{ - 
DAAL_ITTNOTIFY_SCOPED_TASK(compute.calculateCrossProductAndSums); - services::Status status; - - if (dataTable->getNumberOfColumns() > static_cast(services::internal::MaxVal::get())) - { - return services::Status(daal::services::ErrorCovarianceInternal); - } - const uint32_t nFeatures = static_cast(dataTable->getNumberOfColumns()); - - if (dataTable->getNumberOfRows() > static_cast(services::internal::MaxVal::get())) - { - return services::Status(daal::services::ErrorCovarianceInternal); - } - const uint32_t nVectors = static_cast(dataTable->getNumberOfRows()); - - const bool isNormalized = dataTable->isNormalized(NumericTableIface::standardScoreNormalized); - - BlockDescriptor dataBlock; - status |= dataTable->getBlockOfRows(0, nVectors, readOnly, dataBlock); - DAAL_CHECK_STATUS_VAR(status); - - status |= prepareSums(dataTable, sums); - DAAL_CHECK_STATUS_VAR(status); - - status |= prepareCrossProduct(crossProduct, nFeatures); - DAAL_CHECK_STATUS_VAR(status); - - status |= updateDenseCrossProductAndSums(isNormalized, nFeatures, nVectors, dataBlock.getBuffer(), crossProduct, sums); - DAAL_CHECK_STATUS_VAR(status); - - status |= dataTable->releaseBlockOfRows(dataBlock); - DAAL_CHECK_STATUS_VAR(status); - - return status; -} - -template -services::Status finalizeCovariance(uint32_t nFeatures, algorithmFPType nObservations, - const services::internal::Buffer & crossProduct, - const services::internal::Buffer & sums, const services::internal::Buffer & cov, - const services::internal::Buffer & mean, const Parameter * parameter) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.finalizeCovariance); - services::Status status; - - auto & context = services::internal::getDefaultContext(); - - auto diagCrossProduct = context.allocate(TypeIds::id(), nFeatures, status); - DAAL_CHECK_STATUS_VAR(status); - - status |= prepareMeansAndCrossProductDiag(nFeatures, nObservations, crossProduct, - diagCrossProduct.template get(), sums, mean); - DAAL_CHECK_STATUS_VAR(status); - - status |= 
finalize(nFeatures, nObservations, crossProduct, cov, diagCrossProduct.template get(), parameter); - DAAL_CHECK_STATUS_VAR(status); - - return status; -} - -} // namespace internal -} // namespace oneapi -} // namespace covariance -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/dbscan/BUILD b/cpp/daal/src/algorithms/dbscan/BUILD index 9f64eecd83b..9810bc00230 100644 --- a/cpp/daal/src/algorithms/dbscan/BUILD +++ b/cpp/daal/src/algorithms/dbscan/BUILD @@ -4,9 +4,8 @@ load("@onedal//dev/bazel:daal.bzl", "daal_module") daal_module( name = "kernel", auto = True, - opencl = True, deps = [ "@onedal//cpp/daal:core", - "@onedal//cpp/daal:sycl", + "@onedal//cpp/daal:engines", ], ) diff --git a/cpp/daal/src/algorithms/dbscan/dbscan_container.h b/cpp/daal/src/algorithms/dbscan/dbscan_container.h index 7a727556b71..5f79f8e24bb 100644 --- a/cpp/daal/src/algorithms/dbscan/dbscan_container.h +++ b/cpp/daal/src/algorithms/dbscan/dbscan_container.h @@ -29,7 +29,6 @@ #include "algorithms/dbscan/dbscan_batch.h" #include "algorithms/dbscan/dbscan_distributed.h" #include "src/algorithms/dbscan/dbscan_kernel.h" -#include "src/algorithms/dbscan/oneapi/dbscan_kernel_ucapi.h" #include "src/services/service_algo_utils.h" namespace daal @@ -41,17 +40,7 @@ namespace dbscan template BatchContainer::BatchContainer(daal::services::Environment::env * daalEnv) { - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - - if (deviceInfo.isCpu || method != defaultDense) - { - __DAAL_INITIALIZE_KERNELS(internal::DBSCANBatchKernel, algorithmFPType, method); - } - else - { - __DAAL_INITIALIZE_KERNELS_SYCL(internal::DBSCANBatchKernelUCAPI, algorithmFPType); - } + __DAAL_INITIALIZE_KERNELS(internal::DBSCANBatchKernel, algorithmFPType, method); } template @@ -77,27 +66,15 @@ services::Status BatchContainer::compute() Parameter * par = static_cast(_par); daal::services::Environment::env & env = *_env; - auto & 
context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - - if (deviceInfo.isCpu || method != defaultDense) + if (par->memorySavingMode == false) { - if (par->memorySavingMode == false) - { - __DAAL_CALL_KERNEL(env, internal::DBSCANBatchKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), computeNoMemSave, ntData.get(), - ntWeights.get(), ntAssignments.get(), ntNClusters.get(), ntCoreIndices.get(), ntCoreObservations.get(), par); - } - else - { - __DAAL_CALL_KERNEL(env, internal::DBSCANBatchKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), computeMemSave, ntData.get(), - ntWeights.get(), ntAssignments.get(), ntNClusters.get(), ntCoreIndices.get(), ntCoreObservations.get(), par); - } + __DAAL_CALL_KERNEL(env, internal::DBSCANBatchKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), computeNoMemSave, ntData.get(), + ntWeights.get(), ntAssignments.get(), ntNClusters.get(), ntCoreIndices.get(), ntCoreObservations.get(), par); } else { - // memorySavingMode flag is not applicable for DBSCAN on GPU - __DAAL_CALL_KERNEL_SYCL(env, internal::DBSCANBatchKernelUCAPI, __DAAL_KERNEL_ARGUMENTS(algorithmFPType), compute, ntData.get(), - ntWeights.get(), ntAssignments.get(), ntNClusters.get(), ntCoreIndices.get(), ntCoreObservations.get(), par); + __DAAL_CALL_KERNEL(env, internal::DBSCANBatchKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), computeMemSave, ntData.get(), + ntWeights.get(), ntAssignments.get(), ntNClusters.get(), ntCoreIndices.get(), ntCoreObservations.get(), par); } } diff --git a/cpp/daal/src/algorithms/dbscan/dbscan_dense_default_batch_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/dbscan/dbscan_dense_default_batch_fpt_dispatcher.cpp index 937e09cd6cf..76f4912dd34 100644 --- a/cpp/daal/src/algorithms/dbscan/dbscan_dense_default_batch_fpt_dispatcher.cpp +++ b/cpp/daal/src/algorithms/dbscan/dbscan_dense_default_batch_fpt_dispatcher.cpp @@ -27,7 +27,7 @@ namespace daal { namespace algorithms { 
-__DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL(dbscan::BatchContainer, batch, DAAL_FPTYPE, dbscan::defaultDense) +__DAAL_INSTANTIATE_DISPATCH_CONTAINER(dbscan::BatchContainer, batch, DAAL_FPTYPE, dbscan::defaultDense) namespace dbscan { diff --git a/cpp/daal/src/algorithms/dbscan/dbscan_dense_default_batch_kernel_ucapi_fpt.cpp b/cpp/daal/src/algorithms/dbscan/dbscan_dense_default_batch_kernel_ucapi_fpt.cpp deleted file mode 100755 index 1252d8a1f72..00000000000 --- a/cpp/daal/src/algorithms/dbscan/dbscan_dense_default_batch_kernel_ucapi_fpt.cpp +++ /dev/null @@ -1,38 +0,0 @@ -/* file: dbscan_dense_default_batch_kernel_ucapi_fpt.cpp */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of K-means Batch Kernel for GPU. 
-//-- -*/ - -#include "oneapi/dbscan_dense_default_batch_ucapi_impl.i" - -namespace daal -{ -namespace algorithms -{ -namespace dbscan -{ -namespace internal -{ -template class DAAL_EXPORT DBSCANBatchKernelUCAPI; -} // namespace internal -} // namespace dbscan -} // namespace algorithms -} // namespace daal diff --git a/cpp/daal/src/algorithms/dbscan/oneapi/cl_kernels/dbscan_cl_kernels.cl b/cpp/daal/src/algorithms/dbscan/oneapi/cl_kernels/dbscan_cl_kernels.cl deleted file mode 100644 index a269f5c842c..00000000000 --- a/cpp/daal/src/algorithms/dbscan/oneapi/cl_kernels/dbscan_cl_kernels.cl +++ /dev/null @@ -1,146 +0,0 @@ -/* file: dbscan_cl_kernels.cl */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of DBSCAN OpenCL kernels. 
-//-- -*/ - -#ifndef __DBSCAN_CL_KERNELS_CL__ -#define __DBSCAN_CL_KERNELS_CL__ - -#define DECLARE_SOURCE(name, src) static const char * name = #src; - -DECLARE_SOURCE( - dbscanClKernels, - - __kernel void computeCores(int numPoints, int numFeatures, int numNbrs, algorithmFPType eps, const __global algorithmFPType * points, - __global int * cores) { - const int globalId = get_global_id(0); - if (get_sub_group_id() > 0) return; - - const int subgroupSize = get_sub_group_size(); - const int localId = get_sub_group_local_id(); - int count = 0; - for (int j = 0; j < numPoints; j++) - { - algorithmFPType sum = 0.0; - for (int i = localId; i < numFeatures; i += subgroupSize) - { - algorithmFPType val = points[globalId * numFeatures + i] - points[j * numFeatures + i]; - sum += val * val; - } - algorithmFPType distance = sub_group_reduce_add(sum); - count += (int)(distance <= eps); - } - if (localId == 0) - { - cores[globalId] = (int)(count >= numNbrs); - } - } - - __kernel void computeCoresWithWeights(int numPoints, int numFeatures, algorithmFPType numNbrs, algorithmFPType eps, int useWeights, - const __global algorithmFPType * points, const __global algorithmFPType * weights, __global int * cores) { - const int globalId = get_global_id(0); - if (get_sub_group_id() > 0) return; - - const int subgroupSize = get_sub_group_size(); - const int localId = get_sub_group_local_id(); - algorithmFPType count = 0; - for (int j = 0; j < numPoints; j++) - { - algorithmFPType sum = 0.0; - for (int i = localId; i < numFeatures; i += subgroupSize) - { - algorithmFPType val = points[globalId * numFeatures + i] - points[j * numFeatures + i]; - sum += val * val; - } - algorithmFPType distance = sub_group_reduce_add(sum); - algorithmFPType incr = (distance <= eps) ? 1.0 : 0.0; - incr *= useWeights ? 
weights[globalId] : 1.0; - count += incr; - } - if (localId == 0) - { - cores[globalId] = (int)(count >= numNbrs); - } - } - - __kernel void startNextCluster(int clusterId, int numPoints, int queueEnd, const __global int * cores, __global int * clusters, - __global int * lastClusterStart, __global int * queue) { - // The kernel should be run on a single subgroup - if (get_sub_group_id() > 0 || get_global_id(0) > 0) return; - - const int subgroupSize = get_sub_group_size(); - const int localId = get_sub_group_local_id(); - const int start = lastClusterStart[0]; - for (int i = start + localId; i < numPoints; i++) - { - const bool found = cores[i] == 1 && clusters[i] < 0; - const int index = sub_group_reduce_min(found ? i : numPoints); - if (index < numPoints) - { - if (localId == 0) - { - clusters[index] = clusterId; - lastClusterStart[0] = index + 1; - queue[queueEnd] = index; - } - break; - } - } - } - - __kernel void updateQueue(int clusterId, int numPoints, int numFeatures, algorithmFPType eps, int queueStart, int queueEnd, - const __global algorithmFPType * points, __global int * cores, __global int * clusters, __global int * queue, - __global int * queueFront) { - if (get_sub_group_id() > 0) return; - const int subgroupIndex = get_global_id(0); - if (clusters[subgroupIndex] > -1) return; - const int localId = get_sub_group_local_id(); - const int subgroupSize = get_sub_group_size(); - volatile __global int * counterPtr = queueFront; - - for (int j = queueStart; j < queueEnd; j++) - { - const int index = queue[j]; - algorithmFPType sum = 0.0; - for (int i = localId; i < numFeatures; i += subgroupSize) - { - algorithmFPType val = points[subgroupIndex * numFeatures + i] - points[index * numFeatures + i]; - sum += val * val; - } - algorithmFPType distance = sub_group_reduce_add(sum); - if (distance > eps) continue; - if (localId == 0) - { - clusters[subgroupIndex] = clusterId; - } - if (cores[subgroupIndex] == 0) continue; - if (localId == 0) - { - const int 
newIndex = atomic_inc(counterPtr); - queue[newIndex] = subgroupIndex; - } - break; - } - } - -); - -#endif diff --git a/cpp/daal/src/algorithms/dbscan/oneapi/dbscan_dense_default_batch_ucapi_impl.i b/cpp/daal/src/algorithms/dbscan/oneapi/dbscan_dense_default_batch_ucapi_impl.i deleted file mode 100644 index afb8069e50a..00000000000 --- a/cpp/daal/src/algorithms/dbscan/oneapi/dbscan_dense_default_batch_ucapi_impl.i +++ /dev/null @@ -1,481 +0,0 @@ -/* file: dbscan_dense_default_batch_ucapi_impl.i */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of default method for DBSCAN algorithm on GPU. 
-//-- -*/ - -#include "src/algorithms/dbscan/oneapi/dbscan_kernel_ucapi.h" -#include "src/algorithms/dbscan/oneapi/cl_kernels/dbscan_cl_kernels.cl" -#include "src/services/service_data_utils.h" -#include "src/externals/service_profiler.h" - -using namespace daal::services; -using namespace daal::services::internal::sycl; -using namespace daal::data_management; - -constexpr size_t maxInt32AsSizeT = static_cast(daal::services::internal::MaxVal::get()); -constexpr size_t maxInt32AsUint32T = static_cast(daal::services::internal::MaxVal::get()); - -namespace daal -{ -namespace algorithms -{ -namespace dbscan -{ -namespace internal -{ -template -services::Status DBSCANBatchKernelUCAPI::initializeBuffers(uint32_t nRows, NumericTable * weights) -{ - Status s; - auto & context = Environment::getInstance()->getDefaultExecutionContext(); - _queue = context.allocate(TypeIds::id(), nRows, s); - DAAL_CHECK_STATUS_VAR(s); - _isCore = context.allocate(TypeIds::id(), nRows, s); - DAAL_CHECK_STATUS_VAR(s); - context.fill(_isCore, 0, s); - DAAL_CHECK_STATUS_VAR(s); - _lastPoint = context.allocate(TypeIds::id(), 1, s); - DAAL_CHECK_STATUS_VAR(s); - context.fill(_lastPoint, 0, s); - DAAL_CHECK_STATUS_VAR(s); - _queueFront = context.allocate(TypeIds::id(), 1, s); - DAAL_CHECK_STATUS_VAR(s); - _useWeights = weights != nullptr; - if (_useWeights) - { - BlockDescriptor weightRows; - DAAL_CHECK_STATUS_VAR(weights->getBlockOfRows(0, nRows, readOnly, weightRows)); - _weights = UniversalBuffer(weightRows.getBuffer()); - } - else - { - // OpenCL needs it - _weights = context.allocate(TypeIds::id(), 1, s); - DAAL_CHECK_STATUS_VAR(s); - } - return s; -} - -template -Status DBSCANBatchKernelUCAPI::processResultsToCompute(DAAL_UINT64 resultsToCompute, NumericTable * ntData, - NumericTable * ntCoreIndices, NumericTable * ntCoreObservations) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.processResultsToCompute); - Status st; - - const uint32_t nRows = ntData->getNumberOfRows(); - const uint32_t nFeatures = 
ntData->getNumberOfColumns(); - - DAAL_ASSERT_UNIVERSAL_BUFFER(_isCore, int, nRows); - auto isCoreHost = _isCore.template get().toHost(ReadWriteMode::readOnly, st); - DAAL_CHECK_STATUS_VAR(st); - auto isCore = isCoreHost.get(); - - uint32_t nCoreObservations = 0; - for (uint32_t i = 0; i < nRows; i++) - { - if (!isCore[i]) - { - continue; - } - nCoreObservations++; - } - - if (nCoreObservations == 0) - { - return Status(); - } - - if (resultsToCompute & computeCoreIndices) - { - DAAL_CHECK_STATUS_VAR(ntCoreIndices->resize(nCoreObservations)); - BlockDescriptor indexRows; - DAAL_CHECK_STATUS_VAR(ntCoreIndices->getBlockOfRows(0, nCoreObservations, writeOnly, indexRows)); - auto coreIndices = indexRows.getBlockPtr(); - if (!coreIndices) - { - return Status(ErrorNullPtr); - } - - uint32_t pos = 0; - for (uint32_t i = 0; i < nRows; i++) - { - if (!isCore[i]) - { - continue; - } - coreIndices[pos] = i; - pos++; - } - } - - if (resultsToCompute & computeCoreObservations) - { - DAAL_CHECK_STATUS_VAR(ntCoreObservations->resize(nCoreObservations)); - BlockDescriptor coreObservationsRows; - DAAL_CHECK_STATUS_VAR(ntCoreObservations->getBlockOfRows(0, nCoreObservations, writeOnly, coreObservationsRows)); - auto coreObservationsPtr = coreObservationsRows.getBuffer().toHost(ReadWriteMode::writeOnly, st); - DAAL_CHECK_STATUS_VAR(st); - BlockDescriptor dataRows; - DAAL_CHECK_STATUS_VAR(ntData->getBlockOfRows(0, nRows, readOnly, dataRows)); - auto dataPtr = dataRows.getBuffer().toHost(ReadWriteMode::readOnly, st); - DAAL_CHECK_STATUS_VAR(st); - - uint32_t pos = 0; - for (uint32_t i = 0; i < nRows; i++) - { - if (!isCore[i]) - { - continue; - } - for (uint32_t j = 0; j < nFeatures; j++) coreObservationsPtr.get()[pos * nFeatures + j] = dataPtr.get()[i * nFeatures + j]; - pos++; - } - DAAL_CHECK_STATUS_VAR(ntCoreObservations->releaseBlockOfRows(coreObservationsRows)); - DAAL_CHECK_STATUS_VAR(ntData->releaseBlockOfRows(dataRows)); - } - - return st; -} - -template -Status 
DBSCANBatchKernelUCAPI::compute(const NumericTable * x, const NumericTable * ntWeights, NumericTable * ntAssignments, - NumericTable * ntNClusters, NumericTable * ntCoreIndices, NumericTable * ntCoreObservations, - const Parameter * par) -{ - Status s; - auto & context = Environment::getInstance()->getDefaultExecutionContext(); - const uint32_t minkowskiPower = 2; - algorithmFPType epsP = 1.0; - for (uint32_t i = 0; i < minkowskiPower; i++) epsP *= par->epsilon; - DAAL_CHECK((par->minObservations > algorithmFPType(0)) && (par->minObservations < algorithmFPType(maxInt32AsSizeT)), - services::ErrorIncorrectParameter); - - NumericTable * const ntData = const_cast(x); - NumericTable * const ntW = const_cast(ntWeights); - - const size_t nDataRowsAsSizeT = ntData->getNumberOfRows(); - const size_t nDataColumnsAsSizeT = ntData->getNumberOfColumns(); - - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(uint32_t, nDataRowsAsSizeT, nDataColumnsAsSizeT); - - DAAL_CHECK(nDataRowsAsSizeT <= maxInt32AsSizeT, services::ErrorIncorrectNumberOfRowsInInputNumericTable); - DAAL_CHECK(nDataColumnsAsSizeT <= maxInt32AsSizeT, services::ErrorIncorrectNumberOfColumnsInInputNumericTable); - - if (ntW) - { - const size_t nWeightRowsAsSizeT = ntW->getNumberOfRows(); - const size_t nWeightColumnsAsSizeT = ntW->getNumberOfColumns(); - DAAL_CHECK(nWeightRowsAsSizeT == nDataRowsAsSizeT, services::ErrorIncorrectNumberOfRowsInInputNumericTable); - DAAL_CHECK(nWeightColumnsAsSizeT == 1, services::ErrorIncorrectNumberOfColumnsInInputNumericTable); - } - - const uint32_t nRows = static_cast(nDataRowsAsSizeT); - const uint32_t nFeatures = static_cast(nDataColumnsAsSizeT); - - BlockDescriptor dataRows; - DAAL_CHECK_STATUS_VAR(ntData->getBlockOfRows(0, nRows, readOnly, dataRows)); - auto data = dataRows.getBuffer(); - - BlockDescriptor assignRows; - DAAL_CHECK_STATUS_VAR(ntAssignments->getBlockOfRows(0, nRows, writeOnly, assignRows)); - auto assignBuffer = assignRows.getBuffer(); - - UniversalBuffer assignments = 
assignBuffer; - context.fill(assignments, noise, s); - DAAL_CHECK_STATUS_VAR(s); - - DAAL_CHECK_STATUS_VAR(initializeBuffers(nRows, ntW)); - - uint32_t nClusters = 0; - uint32_t queueBegin = 0; - uint32_t queueEnd = 0; - - if (_useWeights) - { - DAAL_CHECK_STATUS_VAR(getCoresWithWeights(data, nRows, nFeatures, par->minObservations, epsP)); - } - else - { - DAAL_CHECK_STATUS_VAR(getCores(data, nRows, nFeatures, par->minObservations, epsP)); - } - - bool foundCluster = false; - DAAL_CHECK_STATUS_VAR(startNextCluster(nClusters, nRows, queueEnd, assignments, foundCluster)); - while (foundCluster) - { - ++nClusters; - ++queueEnd; - DAAL_CHECK_STATUS_VAR(setQueueFront(queueEnd)); - while (queueBegin < queueEnd) - { - updateQueue(nClusters - 1, nRows, nFeatures, epsP, queueBegin, queueEnd, data, assignments); - queueBegin = queueEnd; - DAAL_CHECK_STATUS_VAR(getQueueFront(queueEnd)); - } - DAAL_CHECK_STATUS_VAR(startNextCluster(nClusters, nRows, queueEnd, assignments, foundCluster)); - } - DAAL_CHECK_STATUS_VAR(ntData->releaseBlockOfRows(dataRows)); - - BlockDescriptor nClustersRows; - DAAL_CHECK_STATUS_VAR(ntNClusters->getBlockOfRows(0, 1, writeOnly, nClustersRows)); - auto nClusterHostBuffer = nClustersRows.getBuffer().toHost(ReadWriteMode::writeOnly, s); - DAAL_CHECK_STATUS_VAR(s); - *nClusterHostBuffer.get() = nClusters; - - if (par->resultsToCompute & (computeCoreIndices | computeCoreObservations)) - { - DAAL_CHECK_STATUS_VAR(processResultsToCompute(par->resultsToCompute, ntData, ntCoreIndices, ntCoreObservations)); - } - return s; -} - -template -services::Status DBSCANBatchKernelUCAPI::startNextCluster(uint32_t clusterId, uint32_t nRows, uint32_t queueEnd, - UniversalBuffer & clusters, bool & found) -{ - services::Status st; - DAAL_ITTNOTIFY_SCOPED_TASK(compute.startNextCluster); - auto & context = Environment::getInstance()->getDefaultExecutionContext(); - auto & kernel_factory = context.getClKernelFactory(); - DAAL_CHECK_STATUS_VAR(buildProgram(kernel_factory)); - 
auto kernel = kernel_factory.getKernel("startNextCluster", st); - DAAL_CHECK_STATUS_VAR(st); - - int last; - { - DAAL_ASSERT_UNIVERSAL_BUFFER(_lastPoint, int, 1); - const auto lastPointHostBuffer = _lastPoint.template get().toHost(ReadWriteMode::readOnly, st); - DAAL_CHECK_STATUS_VAR(st); - last = *lastPointHostBuffer.get(); - } - - DAAL_ASSERT_UNIVERSAL_BUFFER(_isCore, int, nRows); - DAAL_ASSERT_UNIVERSAL_BUFFER(clusters, int, nRows); - DAAL_ASSERT_UNIVERSAL_BUFFER(_queue, int, nRows); - - KernelArguments args(7, st); - DAAL_CHECK_STATUS_VAR(st); - args.set(0, static_cast(clusterId)); - args.set(1, static_cast(nRows)); - args.set(2, static_cast(queueEnd)); - args.set(3, _isCore, AccessModeIds::read); - args.set(4, clusters, AccessModeIds::write); - args.set(5, _lastPoint, AccessModeIds::write); - args.set(6, _queue, AccessModeIds::write); - - KernelRange localRange(1, _maxSubgroupSize); - KernelRange globalRange(1, _maxSubgroupSize); - - KernelNDRange range(2); - range.global(globalRange, st); - DAAL_CHECK_STATUS_VAR(st); - range.local(localRange, st); - DAAL_CHECK_STATUS_VAR(st); - - context.run(range, kernel, args, st); - DAAL_CHECK_STATUS_VAR(st); - int newLast; - { - const auto lastPointHostBuffer = _lastPoint.template get().toHost(ReadWriteMode::readOnly, st); - DAAL_CHECK_STATUS_VAR(st); - newLast = *lastPointHostBuffer.get(); - found = newLast > last; - } - return st; -} - -template -services::Status DBSCANBatchKernelUCAPI::getCores(const UniversalBuffer & data, uint32_t nRows, uint32_t nFeatures, int nNbrs, - algorithmFPType eps) -{ - services::Status st; - DAAL_ITTNOTIFY_SCOPED_TASK(compute.getCores); - auto & context = Environment::getInstance()->getDefaultExecutionContext(); - auto & kernel_factory = context.getClKernelFactory(); - DAAL_CHECK_STATUS_VAR(buildProgram(kernel_factory)); - auto kernel = kernel_factory.getKernel("computeCores", st); - DAAL_CHECK_STATUS_VAR(st); - - DAAL_ASSERT_UNIVERSAL_BUFFER(data, algorithmFPType, nRows * nFeatures); - 
DAAL_ASSERT_UNIVERSAL_BUFFER(_weights, algorithmFPType, _useWeights ? nRows : 1); - DAAL_ASSERT_UNIVERSAL_BUFFER(_isCore, int, nRows); - - KernelArguments args(6, st); - DAAL_CHECK_STATUS_VAR(st); - args.set(0, static_cast(nRows)); - args.set(1, static_cast(nFeatures)); - args.set(2, nNbrs); - args.set(3, eps); - args.set(4, data, AccessModeIds::read); - args.set(5, _isCore, AccessModeIds::write); - - const uint32_t rangeWidth = (nFeatures < _maxSubgroupSize) ? nFeatures : _maxSubgroupSize; - KernelRange localRange(1, rangeWidth); - KernelRange globalRange(nRows, rangeWidth); - - KernelNDRange range(2); - range.global(globalRange, st); - DAAL_CHECK_STATUS_VAR(st); - range.local(localRange, st); - DAAL_CHECK_STATUS_VAR(st); - - context.run(range, kernel, args, st); - DAAL_CHECK_STATUS_VAR(st); - return st; -} - -template -services::Status DBSCANBatchKernelUCAPI::getCoresWithWeights(const UniversalBuffer & data, uint32_t nRows, uint32_t nFeatures, - algorithmFPType nNbrs, algorithmFPType eps) -{ - services::Status st; - DAAL_ITTNOTIFY_SCOPED_TASK(compute.getCores); - auto & context = Environment::getInstance()->getDefaultExecutionContext(); - auto & kernel_factory = context.getClKernelFactory(); - DAAL_CHECK_STATUS_VAR(buildProgram(kernel_factory)); - auto kernel = kernel_factory.getKernel("computeCoresWithWeights", st); - DAAL_CHECK_STATUS_VAR(st); - - DAAL_ASSERT_UNIVERSAL_BUFFER(data, algorithmFPType, nRows * nFeatures); - DAAL_ASSERT_UNIVERSAL_BUFFER(_weights, algorithmFPType, _useWeights ? nRows : 1); - DAAL_ASSERT_UNIVERSAL_BUFFER(_isCore, int, nRows); - - KernelArguments args(8, st); - DAAL_CHECK_STATUS_VAR(st); - args.set(0, static_cast(nRows)); - args.set(1, static_cast(nFeatures)); - args.set(2, nNbrs); - args.set(3, eps); - args.set(4, static_cast(_useWeights)); - args.set(5, data, AccessModeIds::read); - args.set(6, _weights, AccessModeIds::read); - args.set(7, _isCore, AccessModeIds::write); - - uint32_t rangeWidth = nFeatures < _maxSubgroupSize ? 
nFeatures : _maxSubgroupSize; - KernelRange localRange(1, rangeWidth); - KernelRange globalRange(nRows, rangeWidth); - - KernelNDRange range(2); - range.global(globalRange, st); - DAAL_CHECK_STATUS_VAR(st); - range.local(localRange, st); - DAAL_CHECK_STATUS_VAR(st); - - context.run(range, kernel, args, st); - DAAL_CHECK_STATUS_VAR(st); - return st; -} - -template -services::Status DBSCANBatchKernelUCAPI::updateQueue(uint32_t clusterId, uint32_t nRows, uint32_t nFeatures, algorithmFPType eps, - uint32_t queueBegin, uint32_t queueEnd, const UniversalBuffer & data, - UniversalBuffer & clusters) -{ - services::Status st; - DAAL_ITTNOTIFY_SCOPED_TASK(compute.updateQueue); - auto & context = Environment::getInstance()->getDefaultExecutionContext(); - auto & kernel_factory = context.getClKernelFactory(); - DAAL_CHECK_STATUS_VAR(buildProgram(kernel_factory)); - auto kernel = kernel_factory.getKernel("updateQueue", st); - DAAL_CHECK_STATUS_VAR(st); - - DAAL_ASSERT_UNIVERSAL_BUFFER(data, algorithmFPType, nRows * nFeatures); - DAAL_ASSERT_UNIVERSAL_BUFFER(_isCore, int, nRows); - DAAL_ASSERT_UNIVERSAL_BUFFER(clusters, int, nRows); - DAAL_ASSERT_UNIVERSAL_BUFFER(_queue, int, nRows); - DAAL_ASSERT_UNIVERSAL_BUFFER(_queueFront, int, 1); - - KernelArguments args(11, st); - DAAL_CHECK_STATUS_VAR(st); - args.set(0, static_cast(clusterId)); - args.set(1, static_cast(nRows)); - args.set(2, static_cast(nFeatures)); - args.set(3, eps); - args.set(4, static_cast(queueBegin)); - args.set(5, static_cast(queueEnd)); - args.set(6, data, AccessModeIds::read); - args.set(7, _isCore, AccessModeIds::read); - args.set(8, clusters, AccessModeIds::write); - args.set(9, _queue, AccessModeIds::write); - args.set(10, _queueFront, AccessModeIds::write); - - uint32_t rangeWidth = nFeatures < _maxSubgroupSize ? 
nFeatures : _maxSubgroupSize; - KernelRange localRange(1, rangeWidth); - KernelRange globalRange(nRows, rangeWidth); - - KernelNDRange range(2); - range.global(globalRange, st); - DAAL_CHECK_STATUS_VAR(st); - range.local(localRange, st); - DAAL_CHECK_STATUS_VAR(st); - - context.run(range, kernel, args, st); - DAAL_CHECK_STATUS_VAR(st); - return st; -} - -template -Status DBSCANBatchKernelUCAPI::buildProgram(ClKernelFactoryIface & kernel_factory) -{ - Status st; - const auto fptypeName = services::internal::sycl::getKeyFPType(); - const auto buildOptions = fptypeName; - - services::String cachekey("__daal_algorithms_dbscan_block_"); - cachekey.add(fptypeName); - cachekey.add(buildOptions); - { - DAAL_ITTNOTIFY_SCOPED_TASK(compute.buildProgram); - kernel_factory.build(ExecutionTargetIds::device, cachekey.c_str(), dbscanClKernels, buildOptions.c_str(), st); - DAAL_CHECK_STATUS_VAR(st); - } - return st; -} - -template -Status DBSCANBatchKernelUCAPI::setQueueFront(uint32_t queueEnd) -{ - Status st; - DAAL_ASSERT_UNIVERSAL_BUFFER(_queueFront, int, 1); - const auto val = _queueFront.template get().toHost(ReadWriteMode::readWrite, st); - DAAL_CHECK_STATUS_VAR(st); - DAAL_ASSERT(queueEnd <= maxInt32AsUint32T); - *val.get() = static_cast(queueEnd); - return st; -} - -template -Status DBSCANBatchKernelUCAPI::getQueueFront(uint32_t & queueEnd) -{ - Status st; - DAAL_ASSERT_UNIVERSAL_BUFFER(_queueFront, int, 1); - const auto val = _queueFront.template get().toHost(ReadWriteMode::readOnly, st); - DAAL_CHECK_STATUS_VAR(st); - DAAL_ASSERT(*val.get() >= 0); - queueEnd = static_cast(*val.get()); - return st; -} - -} // namespace internal -} // namespace dbscan -} // namespace algorithms -} // namespace daal diff --git a/cpp/daal/src/algorithms/dbscan/oneapi/dbscan_kernel_ucapi.h b/cpp/daal/src/algorithms/dbscan/oneapi/dbscan_kernel_ucapi.h deleted file mode 100644 index cb116111c3d..00000000000 --- a/cpp/daal/src/algorithms/dbscan/oneapi/dbscan_kernel_ucapi.h +++ /dev/null @@ -1,82 
+0,0 @@ -/* file: dbscan_kernel_ucapi.h */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Declaration of template function that computes DBSCAN for GPU. -//-- -*/ - -#ifndef __DBSCAN_KERNEL_UCAPI_H -#define __DBSCAN_KERNEL_UCAPI_H - -#include "algorithms/dbscan/dbscan_types.h" -#include "src/algorithms/kernel.h" -#include "data_management/data/numeric_table.h" -#include "services/internal/sycl/execution_context.h" - -namespace daal -{ -namespace algorithms -{ -namespace dbscan -{ -namespace internal -{ -template -class DBSCANBatchKernelUCAPI : public Kernel -{ -public: - services::Status compute(const daal::data_management::NumericTable * ntData, const daal::data_management::NumericTable * ntWeights, - daal::data_management::NumericTable * ntAssignments, daal::data_management::NumericTable * ntNClusters, - daal::data_management::NumericTable * ntCoreIndices, daal::data_management::NumericTable * ntCoreObservations, - const Parameter * par); - -private: - services::Status getCores(const services::internal::sycl::UniversalBuffer & data, uint32_t nRows, uint32_t nFeatures, int nNbrs, - algorithmFPType eps); - services::Status getCoresWithWeights(const services::internal::sycl::UniversalBuffer & data, uint32_t nRows, uint32_t nFeatures, - algorithmFPType 
nNbrs, algorithmFPType eps); - services::Status updateQueue(uint32_t clusterId, uint32_t nRows, uint32_t nFeatures, algorithmFPType eps, uint32_t queueBegin, uint32_t queueEnd, - const services::internal::sycl::UniversalBuffer & data, services::internal::sycl::UniversalBuffer & clusters); - - services::Status startNextCluster(uint32_t clusterId, uint32_t nRows, uint32_t queueEnd, services::internal::sycl::UniversalBuffer & clusters, - bool & found); - services::Status processResultsToCompute(DAAL_UINT64 resultsToCompute, daal::data_management::NumericTable * ntData, - daal::data_management::NumericTable * ntCoreIndices, - daal::data_management::NumericTable * ntCoreObservations); - services::Status initializeBuffers(uint32_t nRows, daal::data_management::NumericTable * weights); - services::Status buildProgram(services::internal::sycl::ClKernelFactoryIface & kernel_factory); - services::Status setQueueFront(uint32_t queueEnd); - services::Status getQueueFront(uint32_t & queueEnd); - - static constexpr uint32_t _maxSubgroupSize = 32; - bool _useWeights; - - services::internal::sycl::UniversalBuffer _weights; - services::internal::sycl::UniversalBuffer _queue; - services::internal::sycl::UniversalBuffer _isCore; - services::internal::sycl::UniversalBuffer _lastPoint; - services::internal::sycl::UniversalBuffer _queueFront; -}; - -} // namespace internal -} // namespace dbscan -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/decision_tree/BUILD b/cpp/daal/src/algorithms/decision_tree/BUILD index 2cfc6df1b5e..67a470476f8 100644 --- a/cpp/daal/src/algorithms/decision_tree/BUILD +++ b/cpp/daal/src/algorithms/decision_tree/BUILD @@ -8,6 +8,6 @@ daal_module( "@onedal//cpp/daal:core", "@onedal//cpp/daal/src/algorithms/classifier:kernel", "@onedal//cpp/daal/src/algorithms/regression:kernel", - "@onedal//cpp/daal:sycl", + "@onedal//cpp/daal:engines", ], ) diff --git a/cpp/daal/src/algorithms/dtrees/forest/BUILD 
b/cpp/daal/src/algorithms/dtrees/forest/BUILD index c49b7737cc4..58bcaf67574 100644 --- a/cpp/daal/src/algorithms/dtrees/forest/BUILD +++ b/cpp/daal/src/algorithms/dtrees/forest/BUILD @@ -4,10 +4,9 @@ load("@onedal//dev/bazel:daal.bzl", "daal_module") daal_module( name = "kernel", auto = True, - opencl = True, deps = [ "@onedal//cpp/daal:core", - "@onedal//cpp/daal:sycl", + "@onedal//cpp/daal:engines", "@onedal//cpp/daal/src/algorithms/dtrees:kernel", "@onedal//cpp/daal/src/algorithms/distributions:kernel", ], diff --git a/cpp/daal/src/algorithms/dtrees/forest/classification/BUILD b/cpp/daal/src/algorithms/dtrees/forest/classification/BUILD index 0f14150b99e..4c5016a9feb 100644 --- a/cpp/daal/src/algorithms/dtrees/forest/classification/BUILD +++ b/cpp/daal/src/algorithms/dtrees/forest/classification/BUILD @@ -4,10 +4,9 @@ load("@onedal//dev/bazel:daal.bzl", "daal_module") daal_module( name = "kernel", auto = True, - opencl = True, deps = [ "@onedal//cpp/daal:core", - "@onedal//cpp/daal:sycl", + "@onedal//cpp/daal:engines", "@onedal//cpp/daal/src/algorithms/classifier:kernel", "@onedal//cpp/daal/src/algorithms/dtrees:kernel", "@onedal//cpp/daal/src/algorithms/dtrees/forest:kernel", diff --git a/cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_predict_dense_default_batch_container.h b/cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_predict_dense_default_batch_container.h index 62205352314..b8763aa6235 100644 --- a/cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_predict_dense_default_batch_container.h +++ b/cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_predict_dense_default_batch_container.h @@ -25,7 +25,6 @@ #include "algorithms/decision_forest/decision_forest_classification_predict.h" #include "src/algorithms/dtrees/forest/classification/df_classification_predict_dense_default_batch.h" -#include 
"src/algorithms/dtrees/forest/classification/oneapi/df_classification_predict_dense_kernel_oneapi.h" #include "src/services/service_algo_utils.h" namespace daal @@ -43,17 +42,7 @@ namespace interface3 template BatchContainer::BatchContainer(daal::services::Environment::env * daalEnv) : PredictionContainerIface() { - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - - if (!deviceInfo.isCpu) - { - __DAAL_INITIALIZE_KERNELS_SYCL(internal::PredictKernelOneAPI, algorithmFPType, method); - } - else - { - __DAAL_INITIALIZE_KERNELS(internal::PredictKernel, algorithmFPType, method); - } + __DAAL_INITIALIZE_KERNELS(internal::PredictKernel, algorithmFPType, method); } template @@ -65,9 +54,6 @@ BatchContainer::~BatchContainer() template services::Status BatchContainer::compute() { - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - const Input * const input = static_cast(_in); classifier::prediction::Result * const result = static_cast(_res); const decision_forest::classification::prediction::Parameter * const par = @@ -91,16 +77,8 @@ services::Status BatchContainer::compute() const VotingMethod votingMethod = par->votingMethod; - if (!deviceInfo.isCpu) - { - __DAAL_CALL_KERNEL_SYCL(env, internal::PredictKernelOneAPI, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, - daal::services::internal::hostApp(*const_cast(input)), a, m, r, prob, par->nClasses, votingMethod); - } - else - { - __DAAL_CALL_KERNEL(env, internal::PredictKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, - daal::services::internal::hostApp(*const_cast(input)), a, m, r, prob, par->nClasses, votingMethod); - } + __DAAL_CALL_KERNEL(env, internal::PredictKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, + daal::services::internal::hostApp(*const_cast(input)), a, m, r, prob, par->nClasses, votingMethod); } } // namespace interface3 } // namespace prediction 
diff --git a/cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_predict_dense_default_batch_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_predict_dense_default_batch_fpt_dispatcher.cpp index 99c527d7899..4842b1d6e8f 100644 --- a/cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_predict_dense_default_batch_fpt_dispatcher.cpp +++ b/cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_predict_dense_default_batch_fpt_dispatcher.cpp @@ -29,8 +29,8 @@ namespace daal { namespace algorithms { -__DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL(decision_forest::classification::prediction::BatchContainer, batch, DAAL_FPTYPE, - decision_forest::classification::prediction::defaultDense) +__DAAL_INSTANTIATE_DISPATCH_CONTAINER(decision_forest::classification::prediction::BatchContainer, batch, DAAL_FPTYPE, + decision_forest::classification::prediction::defaultDense) namespace decision_forest { namespace classification diff --git a/cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_predict_dense_default_batch_oneapi_fpt.cpp b/cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_predict_dense_default_batch_oneapi_fpt.cpp deleted file mode 100644 index b1d0a1fcae1..00000000000 --- a/cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_predict_dense_default_batch_oneapi_fpt.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/* file: df_classification_predict_dense_default_batch_oneapi_fpt.cpp */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of prediction stage of decision forest algorithm for GPU. -//-- -*/ - -#include "src/algorithms/dtrees/forest/classification/oneapi/df_classification_predict_dense_kernel_oneapi.h" -#include "src/algorithms/dtrees/forest/classification/oneapi/df_classification_predict_dense_oneapi_impl.i" - -namespace daal -{ -namespace algorithms -{ -namespace decision_forest -{ -namespace classification -{ -namespace prediction -{ -namespace internal -{ -template class DAAL_EXPORT PredictKernelOneAPI; -} -} // namespace prediction -} // namespace classification -} // namespace decision_forest -} // namespace algorithms -} // namespace daal diff --git a/cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_train_container.h b/cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_train_container.h index 333f5085029..8c4ceef8c06 100644 --- a/cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_train_container.h +++ b/cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_train_container.h @@ -28,7 +28,6 @@ #include "algorithms/decision_forest/decision_forest_classification_training_types.h" #include "algorithms/decision_forest/decision_forest_classification_training_batch.h" #include "src/algorithms/dtrees/forest/classification/df_classification_train_kernel.h" -#include "src/algorithms/dtrees/forest/classification/oneapi/df_classification_train_hist_kernel_oneapi.h" #include 
"src/algorithms/dtrees/forest/classification/df_classification_model_impl.h" #include "src/services/service_algo_utils.h" @@ -47,17 +46,7 @@ namespace interface3 template BatchContainer::BatchContainer(daal::services::Environment::env * daalEnv) { - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - - if (method == hist && !deviceInfo.isCpu) - { - __DAAL_INITIALIZE_KERNELS_SYCL(internal::ClassificationTrainBatchKernelOneAPI, algorithmFPType, method); - } - else - { - __DAAL_INITIALIZE_KERNELS(internal::ClassificationTrainBatchKernel, algorithmFPType, method); - } + __DAAL_INITIALIZE_KERNELS(internal::ClassificationTrainBatchKernel, algorithmFPType, method); } template @@ -69,9 +58,6 @@ BatchContainer::~BatchContainer() template services::Status BatchContainer::compute() { - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - classifier::training::Input * input = static_cast(_in); Result * result = static_cast(_res); @@ -86,16 +72,8 @@ services::Status BatchContainer::compute() const decision_forest::classification::training::Parameter * par = static_cast(_par); daal::services::Environment::env & env = *_env; - if (method == hist && !deviceInfo.isCpu) - { - __DAAL_CALL_KERNEL_SYCL(env, internal::ClassificationTrainBatchKernelOneAPI, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, - daal::services::internal::hostApp(*input), x, y, *m, *result, *par); - } - else - { - __DAAL_CALL_KERNEL(env, internal::ClassificationTrainBatchKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, - daal::services::internal::hostApp(*input), x, y, w, *m, *result, *par); - } + __DAAL_CALL_KERNEL(env, internal::ClassificationTrainBatchKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, + daal::services::internal::hostApp(*input), x, y, w, *m, *result, *par); } template diff --git 
a/cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_train_hist_batch_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_train_hist_batch_fpt_dispatcher.cpp old mode 100755 new mode 100644 index 24b3380e016..e6db0c8f14c --- a/cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_train_hist_batch_fpt_dispatcher.cpp +++ b/cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_train_hist_batch_fpt_dispatcher.cpp @@ -28,8 +28,8 @@ namespace daal { namespace algorithms { -__DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL_SAFE(decision_forest::classification::training::BatchContainer, batch, DAAL_FPTYPE, - decision_forest::classification::training::hist) +__DAAL_INSTANTIATE_DISPATCH_CONTAINER_SAFE(decision_forest::classification::training::BatchContainer, batch, DAAL_FPTYPE, + decision_forest::classification::training::hist) namespace decision_forest { namespace classification diff --git a/cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_train_hist_batch_oneapi_fpt.cpp b/cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_train_hist_batch_oneapi_fpt.cpp deleted file mode 100644 index 67795879395..00000000000 --- a/cpp/daal/src/algorithms/dtrees/forest/classification/df_classification_train_hist_batch_oneapi_fpt.cpp +++ /dev/null @@ -1,46 +0,0 @@ -/* file: df_classification_train_hist_batch_oneapi_fpt.cpp */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of decision forest classification training functions for the hist method -//-- -*/ - -#include "src/algorithms/dtrees/forest/classification/oneapi/df_classification_train_hist_kernel_oneapi.h" -#include "src/algorithms/dtrees/forest/classification/oneapi/df_classification_train_hist_oneapi_impl.i" - -namespace daal -{ -namespace algorithms -{ -namespace decision_forest -{ -namespace classification -{ -namespace training -{ -namespace internal -{ -template class DAAL_EXPORT ClassificationTrainBatchKernelOneAPI; -} - -} // namespace training -} // namespace classification -} // namespace decision_forest -} // namespace algorithms -} // namespace daal diff --git a/cpp/daal/src/algorithms/dtrees/forest/classification/oneapi/cl_kernels/df_batch_classification_kernels.cl b/cpp/daal/src/algorithms/dtrees/forest/classification/oneapi/cl_kernels/df_batch_classification_kernels.cl deleted file mode 100644 index 67288de540a..00000000000 --- a/cpp/daal/src/algorithms/dtrees/forest/classification/oneapi/cl_kernels/df_batch_classification_kernels.cl +++ /dev/null @@ -1,632 +0,0 @@ -/* file: df_batch_classification_kernels.cl */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of decision forest Batch classification OpenCL kernels. -//-- -*/ - -#ifndef __DF_BATCH_CLASSIFICATION_KERNELS_CL__ -#define __DF_BATCH_CLASSIFICATION_KERNELS_CL__ - -#include - -#define DECLARE_SOURCE(name, src) static const char * name = #src; - -DECLARE_SOURCE( - df_batch_classification_kernels_part1, - - inline int fpEq(algorithmFPType a, algorithmFPType b) { return (int)(fabs(a - b) <= algorithmFPTypeAccuracy); } - - inline int fpGt(algorithmFPType a, algorithmFPType b) { return (int)((a - b) > algorithmFPTypeAccuracy); } - - __kernel void computeBestSplitSinglePass(const __global int * data, const __global int * treeOrder, const __global int * selectedFeatures, - int nSelectedFeatures, const __global algorithmFPType * response, const __global int * binOffsets, - __global int * nodeList, const __global int * nodeIndices, int nodeIndicesOffset, - __global algorithmFPType * splitInfo, __global algorithmFPType * nodeImpDecreaseList, - int updateImpDecreaseRequired, int nFeatures, int minObservationsInLeafNode, - algorithmFPType impurityThreshold) { - // this kernel is targeted for processing nodes with small number of rows - // nodeList will be updated with split attributes - // spliInfo will contain node impurity and mean - const int nProp = HIST_PROPS; // num of classes (i.e. 
classes) - const int nNodeProp = NODE_PROPS; // num of node properties in nodeList - const int nImpProp = IMPURITY_PROPS + HIST_PROPS; // impurity + node classes histogram - const int leafMark = -1; - - const int local_id = get_local_id(0); - const int sub_group_local_id = get_sub_group_local_id(); - const int sub_group_size = get_sub_group_size(); - const int local_size = get_local_size(0); - const int n_sub_groups = local_size / sub_group_size; // num of subgroups for current node processing - const int sub_group_id = local_id / sub_group_size; - const int max_sub_groups_num = 16; //replace with define - - const int nodeIdx = get_global_id(1); - const int nodeId = nodeIndices[nodeIndicesOffset + nodeIdx]; - - const int rowsOffset = nodeList[nodeId * nNodeProp + 0]; - const int nRows = nodeList[nodeId * nNodeProp + 1]; - - // each sub group will process sub_group_size bins and produce 1 best split for it - const int maxBinsBlocks = max_sub_groups_num; - __local algorithmFPType bufI[maxBinsBlocks]; // storage for impurity decrease - __local algorithmFPType bufHist[maxBinsBlocks * nProp]; // storage for classes info - __local int bufS[maxBinsBlocks * nNodeProp]; // storage for split info - - const algorithmFPType minImpDec = (algorithmFPType)-1e30; - const int valNotFound = 1 << 30; - - algorithmFPType curImpDec = minImpDec; - int curFeatureValue = leafMark; - int curFeatureId = leafMark; - - nodeList[nodeId * nNodeProp + 2] = curFeatureId; - nodeList[nodeId * nNodeProp + 3] = curFeatureValue; - nodeList[nodeId * nNodeProp + 4] = nRows; - - algorithmFPType mrgN = (algorithmFPType)nRows; - - algorithmFPType bestLN = (algorithmFPType)0; - - algorithmFPType mrgCls[nProp] = { (algorithmFPType)0 }; - - algorithmFPType imp = (algorithmFPType)1; - - int totalBins = 0; - - // totalBins is calculated by each subgroup - for (int featIdx = sub_group_local_id; featIdx < nSelectedFeatures; featIdx += sub_group_size) - { - int featId = selectedFeatures[nodeId * nSelectedFeatures 
+ featIdx]; - int nBins = binOffsets[featId + 1] - binOffsets[featId]; - totalBins += sub_group_reduce_add(nBins); - } - totalBins = sub_group_broadcast(totalBins, 0); - - int currFtrIdx = 0; - int featId = selectedFeatures[nodeId * nSelectedFeatures + currFtrIdx]; - int binId = 0; - int currFtrBins = binOffsets[featId + 1] - binOffsets[featId]; - int passedBins = 0; - - for (int i = local_id; i < totalBins; i += local_size) - { - while (i >= passedBins + currFtrBins) - { - passedBins += currFtrBins; - currFtrIdx++; - featId = selectedFeatures[nodeId * nSelectedFeatures + currFtrIdx]; - currFtrBins = binOffsets[featId + 1] - binOffsets[featId]; - } - binId = i - passedBins; - - algorithmFPType mrgLRN[2] = { (algorithmFPType)0 }; - algorithmFPType mrgLRCls[nProp * 2] = { (algorithmFPType)0 }; - - // calculating classes histogram - for (int row = 0; row < nRows; row++) - { - int id = treeOrder[rowsOffset + row]; - int bin = data[id * nFeatures + featId]; - int classId = (int)response[id]; - mrgLRN[(int)(bin > binId)] += (algorithmFPType)1; - mrgLRCls[nProp * (int)(bin > binId) + classId] += (algorithmFPType)1; - } - - imp = (algorithmFPType)1; - algorithmFPType impL = (algorithmFPType)1; - algorithmFPType impR = (algorithmFPType)1; - algorithmFPType div = (algorithmFPType)1 / (mrgN * mrgN); - algorithmFPType divL = ((algorithmFPType)0 < mrgLRN[0]) ? (algorithmFPType)1 / (mrgLRN[0] * mrgLRN[0]) : (algorithmFPType)0; - algorithmFPType divR = ((algorithmFPType)0 < mrgLRN[1]) ? (algorithmFPType)1 / (mrgLRN[1] * mrgLRN[1]) : (algorithmFPType)0; - - for (int prop = 0; prop < nProp; prop++) - { - impL -= mrgLRCls[prop] * mrgLRCls[prop] * divL; - impR -= mrgLRCls[nProp + prop] * mrgLRCls[nProp + prop] * divR; - mrgCls[prop] = mrgLRCls[prop] + mrgLRCls[nProp + prop]; - imp -= mrgCls[prop] * mrgCls[prop] * div; - } - impL = (algorithmFPType)0 < impL ? impL : (algorithmFPType)0; - impR = (algorithmFPType)0 < impR ? impR : (algorithmFPType)0; - imp = (algorithmFPType)0 < imp ? 
imp : (algorithmFPType)0; - - algorithmFPType impDec = imp - (mrgLRN[0] * impL + mrgLRN[1] * impR) / mrgN; - - if ((algorithmFPType)0 < impDec && (!fpEq(imp, (algorithmFPType)0)) && imp >= impurityThreshold - && (curFeatureValue == leafMark || fpGt(impDec, curImpDec) || (fpEq(impDec, curImpDec) && featId < curFeatureId)) - && mrgLRN[0] >= minObservationsInLeafNode && mrgLRN[1] >= minObservationsInLeafNode) - { - curFeatureId = featId; - curFeatureValue = binId; - curImpDec = impDec; - - bestLN = mrgLRN[0]; - } - } // for i - - algorithmFPType bestImpDec = sub_group_reduce_max(curImpDec); - - int impDecIsBest = fpEq(bestImpDec, curImpDec); - int bestFeatureId = sub_group_reduce_min(impDecIsBest ? curFeatureId : valNotFound); - int bestFeatureValue = sub_group_reduce_min((bestFeatureId == curFeatureId && impDecIsBest) ? curFeatureValue : valNotFound); - - bool noneSplitFoundBySubGroup = ((leafMark == bestFeatureId) && (0 == sub_group_local_id)); - bool mySplitIsBest = (leafMark != bestFeatureId && curFeatureId == bestFeatureId && curFeatureValue == bestFeatureValue); - if (noneSplitFoundBySubGroup || mySplitIsBest) - { - if (1 == n_sub_groups) - { - __global algorithmFPType * splitNodeInfo = splitInfo + nodeId * nImpProp; - __global algorithmFPType * nodeHistInfo = splitInfo + nodeId * nImpProp + IMPURITY_PROPS; - splitNodeInfo[0] = imp; - algorithmFPType maxVal = (algorithmFPType)0; - int maxInd = 0; - for (int i = 0; i < nProp; i++) - { - nodeHistInfo[i] = mrgCls[i]; - if (mrgCls[i] > maxVal) - { - maxVal = mrgCls[i]; - maxInd = i; - } - } - - nodeList[nodeId * nNodeProp + 2] = curFeatureId == valNotFound ? leafMark : curFeatureId; - nodeList[nodeId * nNodeProp + 3] = curFeatureValue == valNotFound ? 
leafMark : curFeatureValue; - nodeList[nodeId * nNodeProp + 4] = (int)bestLN; - nodeList[nodeId * nNodeProp + 5] = maxInd; - - if (updateImpDecreaseRequired) nodeImpDecreaseList[nodeId] = bestImpDec; - } - else - { - bufS[sub_group_id * nNodeProp + 0] = curFeatureId; - bufS[sub_group_id * nNodeProp + 1] = curFeatureValue; - bufS[sub_group_id * nNodeProp + 2] = (int)bestLN; - - for (int i = 0; i < nProp; i++) - { - bufHist[sub_group_id * nProp + i] = mrgCls[i]; - } - - bufI[sub_group_id] = curImpDec; - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - - if (1 < n_sub_groups && 0 == sub_group_id) - { - // first sub group for current node reduces over local buffer if required - algorithmFPType curImpDec = (sub_group_local_id < n_sub_groups) ? bufI[sub_group_local_id] : minImpDec; - - int curFeatureId = sub_group_local_id < n_sub_groups ? bufS[sub_group_local_id * nNodeProp + 0] : valNotFound; - int curFeatureValue = sub_group_local_id < n_sub_groups ? bufS[sub_group_local_id * nNodeProp + 1] : valNotFound; - int LN = sub_group_local_id < n_sub_groups ? 
bufS[sub_group_local_id * nNodeProp + 2] : 0; - int bestValBufIdx = sub_group_local_id; // index of best value in shared buffer between subgroups, need to escape classes info copying - - for (int i = sub_group_size + sub_group_local_id; i < n_sub_groups; i += sub_group_size) - { - algorithmFPType impDec = bufI[i]; - int featId = bufS[i * nNodeProp + 0]; - int featVal = bufS[i * nNodeProp + 1]; - int tLN = bufS[i * nNodeProp + 2]; - if ((algorithmFPType)0 < impDec - && (curFeatureValue == leafMark || fpGt(impDec, curImpDec) - || (fpEq(impDec, curImpDec) && (featId < curFeatureId || (featId == curFeatureId && featVal < curFeatureValue))))) - { - curFeatureId = featId; - curFeatureValue = featVal; - curImpDec = impDec; - - LN = tLN; - bestValBufIdx = i; - } - } - // now all info in the range of one subgroup - - const algorithmFPType bestImpDec = sub_group_reduce_max(curImpDec); - - const int impDecIsBest = fpEq(bestImpDec, curImpDec); - const int bestFeatureId = sub_group_reduce_min(impDecIsBest ? curFeatureId : valNotFound); - const int bestFeatureValue = sub_group_reduce_min((bestFeatureId == curFeatureId && impDecIsBest) ? curFeatureValue : valNotFound); - - const bool noneSplitFoundBySubGroup = ((leafMark == bestFeatureId) && (0 == sub_group_local_id)); - const bool mySplitIsBest = (leafMark != bestFeatureId && curFeatureId == bestFeatureId && curFeatureValue == bestFeatureValue); - if (noneSplitFoundBySubGroup || mySplitIsBest) - { - __global algorithmFPType * splitNodeInfo = splitInfo + nodeId * nImpProp; - __global algorithmFPType * nodeHistInfo = splitInfo + nodeId * nImpProp + IMPURITY_PROPS; - splitNodeInfo[0] = imp; - algorithmFPType maxVal = (algorithmFPType)0; - int maxInd = 0; - for (int i = 0; i < nProp; i++) - { - algorithmFPType curVal = bufHist[bestValBufIdx * nProp + i]; - nodeHistInfo[i] = curVal; - - if (curVal > maxVal) - { - maxVal = curVal; - maxInd = i; - } - } - - nodeList[nodeId * nNodeProp + 2] = curFeatureId == valNotFound ? 
leafMark : curFeatureId; - nodeList[nodeId * nNodeProp + 3] = curFeatureValue == valNotFound ? leafMark : curFeatureValue; - nodeList[nodeId * nNodeProp + 4] = (int)LN; - nodeList[nodeId * nNodeProp + 5] = maxInd; - - if (updateImpDecreaseRequired) nodeImpDecreaseList[nodeId] = bestImpDec; - } - } - }); - -DECLARE_SOURCE( - df_batch_classification_kernels_part2, - - inline int fpEq(algorithmFPType a, algorithmFPType b) { return (int)(fabs(a - b) <= algorithmFPTypeAccuracy); } - - inline int fpGt(algorithmFPType a, algorithmFPType b) { return (int)((a - b) > algorithmFPTypeAccuracy); } - - __kernel void computeBestSplitByHistogram(const __global algorithmFPType * histograms, const __global int * selectedFeatures, - int nSelectedFeatures, const __global int * binOffsets, __global int * nodeList, - const __global int * nodeIndices, int nodeIndicesOffset, __global algorithmFPType * splitInfo, - __global algorithmFPType * nodeImpDecreaseList, int updateImpDecreaseRequired, int nMaxBinsAmongFtrs, - int minObservationsInLeafNode, algorithmFPType impurityThreshold) { - // this kernel has almost the same code as computeBestSplitSinglePass - // the difference is that here for each potential split point we pass through bins hist instead of rows - // nodeList will be updated with split attributes in this kernel - // spliInfo will contain node impurity and mean - const int nProp = HIST_PROPS; // classes histogram - const int nNodeProp = NODE_PROPS; // num of node properties in nodeList - const int nImpProp = IMPURITY_PROPS + HIST_PROPS; // impurity + node classes histogram - const int local_id = get_local_id(0); - const int sub_group_local_id = get_sub_group_local_id(); - const int sub_group_size = get_sub_group_size(); - const int nodeIdx = get_global_id(1); - const int nodeId = nodeIndices[nodeIndicesOffset + nodeIdx]; - const int leafMark = -1; - - const int local_size = get_local_size(0); - const int n_sub_groups = local_size / sub_group_size; // num of subgroups for 
current node processing - const int sub_group_id = local_id / sub_group_size; - const int max_sub_groups_num = 16; //replace with define - - const int rowsOffset = nodeList[nodeId * nNodeProp + 0]; - const int nRows = nodeList[nodeId * nNodeProp + 1]; - - // each sub group will process sub_group_size bins and produce 1 best split for it - const int maxBinsBlocks = max_sub_groups_num; - __local algorithmFPType bufI[maxBinsBlocks]; // storage for impurity decrease - __local algorithmFPType bufHist[maxBinsBlocks * nProp]; // storage for classes info - __local int bufS[maxBinsBlocks * nNodeProp]; // storage for split info - - int valNotFound = 1 << 30; - int curFeatureValue = leafMark; - int curFeatureId = leafMark; - - const algorithmFPType minImpDec = (algorithmFPType)-1e30; - algorithmFPType curImpDec = minImpDec; - - nodeList[nodeId * nNodeProp + 2] = curFeatureId; - nodeList[nodeId * nNodeProp + 3] = curFeatureValue; - nodeList[nodeId * nNodeProp + 4] = nRows; - - algorithmFPType mrgN = (algorithmFPType)nRows; - - algorithmFPType mrgLN = (algorithmFPType)0; - - algorithmFPType bestLN = (algorithmFPType)0; - algorithmFPType mrgCls[nProp] = { (algorithmFPType)0 }; - - algorithmFPType imp = (algorithmFPType)1; - - int totalBins = 0; - - // totalBins is calculated by each subgroup - for (int featIdx = sub_group_local_id; featIdx < nSelectedFeatures; featIdx += sub_group_size) - { - int featId = selectedFeatures[nodeId * nSelectedFeatures + featIdx]; - int nBins = binOffsets[featId + 1] - binOffsets[featId]; - totalBins += sub_group_reduce_add(nBins); - } - - totalBins = sub_group_broadcast(totalBins, 0); - - int currFtrIdx = 0; - int featId = selectedFeatures[nodeId * nSelectedFeatures + currFtrIdx]; - int binId = 0; - int currFtrBins = binOffsets[featId + 1] - binOffsets[featId]; - int passedBins = 0; - - for (int i = local_id; i < totalBins; i += local_size) - { - while (i >= passedBins + currFtrBins) - { - passedBins += currFtrBins; - currFtrIdx++; - featId = 
selectedFeatures[nodeId * nSelectedFeatures + currFtrIdx]; - currFtrBins = binOffsets[featId + 1] - binOffsets[featId]; - } - binId = i - passedBins; - - const __global algorithmFPType * nodeHistogram = histograms + nodeIdx * nSelectedFeatures * nMaxBinsAmongFtrs * nProp; - const __global algorithmFPType * histogramForFeature = nodeHistogram + currFtrIdx * nMaxBinsAmongFtrs * nProp; - - // calculate merged statistics - - algorithmFPType mrgLRN[2] = { (algorithmFPType)0 }; - algorithmFPType mrgLRCls[nProp * 2] = { (algorithmFPType)0 }; - - for (int tbin = 0; tbin < currFtrBins; tbin++) - { - int binOffset = tbin * nProp; - for (int prop = 0; prop < nProp; prop++) - { - mrgCls[prop] += histogramForFeature[binOffset + prop]; - - mrgLRN[(int)(tbin > binId)] += histogramForFeature[binOffset + prop]; - mrgLRCls[nProp * (int)(tbin > binId) + prop] += histogramForFeature[binOffset + prop]; - } - } - - imp = (algorithmFPType)1; - algorithmFPType impL = (algorithmFPType)1; - algorithmFPType impR = (algorithmFPType)1; - algorithmFPType div = (algorithmFPType)1 / (mrgN * mrgN); - algorithmFPType divL = ((algorithmFPType)0 < mrgLRN[0]) ? (algorithmFPType)1 / (mrgLRN[0] * mrgLRN[0]) : (algorithmFPType)0; - algorithmFPType divR = ((algorithmFPType)0 < mrgLRN[1]) ? (algorithmFPType)1 / (mrgLRN[1] * mrgLRN[1]) : (algorithmFPType)0; - - for (int prop = 0; prop < nProp; prop++) - { - impL -= mrgLRCls[prop] * mrgLRCls[prop] * divL; - impR -= mrgLRCls[nProp + prop] * mrgLRCls[nProp + prop] * divR; - mrgCls[prop] = mrgLRCls[prop] + mrgLRCls[nProp + prop]; - imp -= mrgCls[prop] * mrgCls[prop] * div; - } - impL = (algorithmFPType)0 < impL ? impL : (algorithmFPType)0; - impR = (algorithmFPType)0 < impR ? impR : (algorithmFPType)0; - imp = (algorithmFPType)0 < imp ? 
imp : (algorithmFPType)0; - - algorithmFPType impDec = imp - (mrgLRN[0] * impL + mrgLRN[1] * impR) / mrgN; - - if ((algorithmFPType)0 < impDec && !fpEq(imp, (algorithmFPType)0) && imp >= impurityThreshold - && (curFeatureValue == leafMark || fpGt(impDec, curImpDec) || (fpEq(impDec, curImpDec) && featId < curFeatureId)) - && mrgLRN[0] >= minObservationsInLeafNode && mrgLRN[1] >= minObservationsInLeafNode) - { - curFeatureId = featId; - curFeatureValue = binId; - curImpDec = impDec; - - bestLN = mrgLRN[0]; - } - } // for i - - algorithmFPType bestImpDec = sub_group_reduce_max(curImpDec); - - int impDecIsBest = fpEq(bestImpDec, curImpDec); - int bestFeatureId = sub_group_reduce_min(impDecIsBest ? curFeatureId : valNotFound); - int bestFeatureValue = sub_group_reduce_min((bestFeatureId == curFeatureId && impDecIsBest) ? curFeatureValue : valNotFound); - - bool noneSplitFoundBySubGroup = ((leafMark == bestFeatureId) && (0 == sub_group_local_id)); - bool mySplitIsBest = (leafMark != bestFeatureId && curFeatureId == bestFeatureId && curFeatureValue == bestFeatureValue); - if (noneSplitFoundBySubGroup || mySplitIsBest) - { - if (1 == n_sub_groups) - { - __global algorithmFPType * splitNodeInfo = splitInfo + nodeId * nImpProp; - __global algorithmFPType * nodeHistInfo = splitInfo + nodeId * nImpProp + IMPURITY_PROPS; - - splitNodeInfo[0] = imp; - algorithmFPType maxVal = (algorithmFPType)0; - int maxInd = 0; - for (int i = 0; i < nProp; i++) - { - nodeHistInfo[i] = mrgCls[i]; - if (mrgCls[i] > maxVal) - { - maxVal = mrgCls[i]; - maxInd = i; - } - } - - nodeList[nodeId * nNodeProp + 2] = curFeatureId == valNotFound ? leafMark : curFeatureId; - nodeList[nodeId * nNodeProp + 3] = curFeatureValue == valNotFound ? 
leafMark : curFeatureValue; - nodeList[nodeId * nNodeProp + 4] = (int)bestLN; - nodeList[nodeId * nNodeProp + 5] = maxInd; - - if (updateImpDecreaseRequired) nodeImpDecreaseList[nodeId] = bestImpDec; - } - else - { - bufS[sub_group_id * nNodeProp + 0] = curFeatureId; - bufS[sub_group_id * nNodeProp + 1] = curFeatureValue; - bufS[sub_group_id * nNodeProp + 2] = (int)bestLN; - - for (int i = 0; i < nProp; i++) - { - bufHist[sub_group_id * nProp + i] = mrgCls[i]; - } - - bufI[sub_group_id] = curImpDec; - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - if (1 < n_sub_groups && 0 == sub_group_id) - { - // first sub group for current node reduces over local buffer if required - algorithmFPType curImpDec = (sub_group_local_id < n_sub_groups) ? bufI[sub_group_local_id] : minImpDec; - - int curFeatureId = sub_group_local_id < n_sub_groups ? bufS[sub_group_local_id * nNodeProp + 0] : valNotFound; - int curFeatureValue = sub_group_local_id < n_sub_groups ? bufS[sub_group_local_id * nNodeProp + 1] : valNotFound; - int LN = sub_group_local_id < n_sub_groups ? 
bufS[sub_group_local_id * nNodeProp + 2] : 0; - int bestValBufIdx = sub_group_local_id; // index of best value in shared buffer between subgroups, need to escape classes info copying - - for (int i = sub_group_size + sub_group_local_id; i < n_sub_groups; i += sub_group_size) - { - algorithmFPType impDec = bufI[i]; - int featId = bufS[i * nNodeProp + 0]; - int featVal = bufS[i * nNodeProp + 1]; - int tLN = bufS[i * nNodeProp + 2]; - if ((algorithmFPType)0 < impDec - && (curFeatureValue == leafMark || fpGt(impDec, curImpDec) - || (fpEq(impDec, curImpDec) && (featId < curFeatureId || (featId == curFeatureId && featVal < curFeatureValue))))) - { - curFeatureId = featId; - curFeatureValue = featVal; - curImpDec = impDec; - - LN = tLN; - bestValBufIdx = i; - } - } - // now all info in the range of one subgroup - - const algorithmFPType bestImpDec = sub_group_reduce_max(curImpDec); - - const int impDecIsBest = fpEq(bestImpDec, curImpDec); - const int bestFeatureId = sub_group_reduce_min(impDecIsBest ? curFeatureId : valNotFound); - const int bestFeatureValue = sub_group_reduce_min((bestFeatureId == curFeatureId && impDecIsBest) ? curFeatureValue : valNotFound); - - const bool noneSplitFoundBySubGroup = ((leafMark == bestFeatureId) && (0 == sub_group_local_id)); - const bool mySplitIsBest = (leafMark != bestFeatureId && curFeatureId == bestFeatureId && curFeatureValue == bestFeatureValue); - if (noneSplitFoundBySubGroup || mySplitIsBest) - { - __global algorithmFPType * splitNodeInfo = splitInfo + nodeId * nImpProp; - __global algorithmFPType * nodeHistInfo = splitInfo + nodeId * nImpProp + IMPURITY_PROPS; - splitNodeInfo[0] = imp; - algorithmFPType maxVal = (algorithmFPType)0; - int maxInd = 0; - for (int i = 0; i < nProp; i++) - { - algorithmFPType curVal = bufHist[bestValBufIdx * nProp + i]; - nodeHistInfo[i] = curVal; - - if (curVal > maxVal) - { - maxVal = curVal; - maxInd = i; - } - } - - nodeList[nodeId * nNodeProp + 2] = curFeatureId == valNotFound ? 
leafMark : curFeatureId; - nodeList[nodeId * nNodeProp + 3] = curFeatureValue == valNotFound ? leafMark : curFeatureValue; - nodeList[nodeId * nNodeProp + 4] = (int)LN; - nodeList[nodeId * nNodeProp + 5] = maxInd; - - if (updateImpDecreaseRequired) nodeImpDecreaseList[nodeId] = bestImpDec; - } - } - } - - __kernel void computePartialHistograms(const __global int * data, const __global int * treeOrder, const __global int * nodeList, - const __global int * nodeIndices, int nodeIndicesOffset, const __global int * selectedFeatures, - const __global algorithmFPType * response, const __global int * binOffsets, int nMaxBinsAmongFtrs, - int nFeatures, __global algorithmFPType * partialHistograms, int nSelectedFeatures) { - const int nProp = HIST_PROPS; // num of characteristics in histogram (i.e. classes) - const int nNodeProp = NODE_PROPS; // num of node properties in nodeOffsets - - const int nodeIdx = get_global_id(1); - const int nodeId = nodeIndices[nodeIndicesOffset + nodeIdx]; - const int ftrGrpIdx = get_local_id(0); - const int ftrGrpSize = get_local_size(0); - const int nPartHist = get_num_groups(0); - const int histIdx = get_group_id(0); - - const int rowsOffset = nodeList[nodeId * nNodeProp + 0]; - const int nRows = nodeList[nodeId * nNodeProp + 1]; - - const int nElementsForGroup = nRows / nPartHist + !!(nRows % nPartHist); - - int iStart = histIdx * nElementsForGroup; - int iEnd = (histIdx + 1) * nElementsForGroup; - - iEnd = (iEnd > nRows) ? 
nRows : iEnd; - - for (int i = iStart; i < iEnd; i++) - { - int id = treeOrder[rowsOffset + i]; - for (int featIdx = ftrGrpIdx; featIdx < nSelectedFeatures; featIdx += ftrGrpSize) - { - const int featId = selectedFeatures[nodeId * nSelectedFeatures + featIdx]; - - __global algorithmFPType * histogram = - partialHistograms + ((nodeIdx * nPartHist + histIdx) * nSelectedFeatures + featIdx) * nMaxBinsAmongFtrs * nProp; - - int bin = data[id * nFeatures + featId]; - int classId = (int)response[id]; - - histogram[bin * nProp + classId] += (algorithmFPType)1; - } - } - } - - __kernel void reducePartialHistograms(const __global algorithmFPType * partialHistograms, __global algorithmFPType * histograms, - int nPartialHistograms, int nSelectedFeatures, int nMaxBinsAmongFtrs) { - const int nProp = HIST_PROPS; // num of characteristics in histogram (i.e. classes) - __local algorithmFPType buf[LOCAL_BUFFER_SIZE * nProp]; - - const int nodeIdx = get_global_id(2); - const int binId = get_global_id(0); - const int local_id = get_local_id(1); - const int local_size = get_local_size(1); - - for (int prop = 0; prop < nProp; prop++) - { - buf[local_id * nProp + prop] = (algorithmFPType)0; - } - - const __global algorithmFPType * nodePartialHistograms = - partialHistograms + nodeIdx * nPartialHistograms * nSelectedFeatures * nMaxBinsAmongFtrs * nProp; - __global algorithmFPType * nodeHistogram = histograms + nodeIdx * nSelectedFeatures * nMaxBinsAmongFtrs * nProp; - - for (int i = local_id; i < nPartialHistograms; i += local_size) - { - int offset = i * nSelectedFeatures * nMaxBinsAmongFtrs * nProp + binId * nProp; - for (int prop = 0; prop < nProp; prop++) - { - buf[local_id * nProp + prop] += nodePartialHistograms[offset + prop]; - } - } - - for (int offset = local_size / 2; offset > 0; offset >>= 1) - { - barrier(CLK_LOCAL_MEM_FENCE); - if (local_id < offset) - { - for (int prop = 0; prop < nProp; prop++) - { - buf[local_id * nProp + prop] += buf[(local_id + offset) * nProp + prop]; 
- } - } - } - - if (local_id == 0) - { - for (int prop = 0; prop < nProp; prop++) - { - nodeHistogram[binId * nProp + prop] = buf[local_id + prop]; - } - } - }); - -#endif diff --git a/cpp/daal/src/algorithms/dtrees/forest/classification/oneapi/cl_kernels/df_batch_predict_classification_kernels.cl b/cpp/daal/src/algorithms/dtrees/forest/classification/oneapi/cl_kernels/df_batch_predict_classification_kernels.cl deleted file mode 100644 index 21cc8ac6b42..00000000000 --- a/cpp/daal/src/algorithms/dtrees/forest/classification/oneapi/cl_kernels/df_batch_predict_classification_kernels.cl +++ /dev/null @@ -1,199 +0,0 @@ -/* file: df_batch_classification_kernels.cl */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of decision forest Batch classification OpenCL kernels. 
-//-- -*/ - -#ifndef __DF_BATCH_PREDICT_CLASSIFICATION_KERNELS_CL__ -#define __DF_BATCH_PREDICT_CLASSIFICATION_KERNELS_CL__ - -#include - -#define DECLARE_SOURCE(name, src) static const char * name = #src; - -DECLARE_SOURCE( - df_batch_predict_classification_kernels, - __kernel void predictByTreesWeighted(const __global algorithmFPType * data, const __global int * ftrIdx, - const __global int * classLabelsOrNextNodeIdx, const __global algorithmFPType * ftrValue, - const __global algorithmFPType * classProba, __global algorithmFPType * obsClassHist, algorithmFPType scale, - int nRows, int nCols, int nTrees, int maxTreeSize, int treeOffset) { - const int nClasses = NUM_OF_CLASSES; - const int local_id = get_local_id(0); - const int local_size = get_local_size(0); - const int n_groups = get_num_groups(0); - const int group_id = get_group_id(0); - const int n_tree_groups = get_num_groups(1); - const int tree_group_id = get_group_id(1); - const int tree_id = treeOffset + tree_group_id; - const int leafMark = -1; - - const int nElementsForGroup = nRows / n_groups + !!(nRows % n_groups); - - const int iStart = group_id * nElementsForGroup; - int iEnd = (group_id + 1) * nElementsForGroup; - - iEnd = (iEnd > nRows) ? 
nRows : iEnd; - - if (tree_id < nTrees) - { - const __global int * ftrIdxForTree = ftrIdx + tree_id * maxTreeSize; - const __global int * classLabelsOrNextNodeIdxForTree = classLabelsOrNextNodeIdx + tree_id * maxTreeSize; - const __global algorithmFPType * ftrValueForTree = ftrValue + tree_id * maxTreeSize; - const __global algorithmFPType * classProbaForTree = classProba + tree_id * maxTreeSize * nClasses; - - uint treeRootIsSplit = (uint)(leafMark != ftrIdxForTree[0]); - - for (int i = iStart + local_id; i < iEnd; i += local_size) - { - uint obsCurrNodeForTree = 0; - uint obsSplitMarkForTree = treeRootIsSplit; - for (; obsSplitMarkForTree > 0;) - { - uint idx = obsSplitMarkForTree * ftrIdxForTree[obsCurrNodeForTree]; - uint sn = (uint)(data[i * nCols + idx] > ftrValueForTree[obsCurrNodeForTree]); - obsCurrNodeForTree -= obsSplitMarkForTree * (obsCurrNodeForTree - (uint)classLabelsOrNextNodeIdxForTree[obsCurrNodeForTree] - sn); - obsSplitMarkForTree = (uint)(ftrIdxForTree[obsCurrNodeForTree] != leafMark); - } - for (int clIdx = 0; clIdx < nClasses; clIdx++) - { - obsClassHist[i * n_tree_groups * nClasses + clIdx * n_tree_groups + tree_group_id] += - scale * (algorithmFPType)classProbaForTree[obsCurrNodeForTree * nClasses + clIdx]; - } - } - } - } - - __kernel void predictByTreesUnweighted(const __global algorithmFPType * data, const __global int * ftrIdx, - const __global int * classLabelsOrNextNodeIdx, const __global algorithmFPType * ftrValue, - __global algorithmFPType * obsClassHist, algorithmFPType scale, int nRows, int nCols, int nTrees, - int maxTreeSize, int treeOffset) { - const int nClasses = NUM_OF_CLASSES; - const int local_id = get_local_id(0); - const int local_size = get_local_size(0); - const int n_groups = get_num_groups(0); - const int group_id = get_group_id(0); - const int n_tree_groups = get_num_groups(1); - const int tree_group_id = get_group_id(1); - const int tree_id = treeOffset + tree_group_id; - const int leafMark = -1; - - const int 
nElementsForGroup = nRows / n_groups + !!(nRows % n_groups); - - const int iStart = group_id * nElementsForGroup; - int iEnd = (group_id + 1) * nElementsForGroup; - - iEnd = (iEnd > nRows) ? nRows : iEnd; - - if (tree_id < nTrees) - { - const __global int * ftrIdxForTree = ftrIdx + tree_id * maxTreeSize; - const __global int * classLabelsOrNextNodeIdxForTree = classLabelsOrNextNodeIdx + tree_id * maxTreeSize; - const __global algorithmFPType * ftrValueForTree = ftrValue + tree_id * maxTreeSize; - - uint treeRootIsSplit = (uint)(leafMark != ftrIdxForTree[0]); - - for (int i = iStart + local_id; i < iEnd; i += local_size) - { - uint obsCurrNodeForTree = 0; - uint obsSplitMarkForTree = treeRootIsSplit; - for (; obsSplitMarkForTree > 0;) - { - uint idx = obsSplitMarkForTree * ftrIdxForTree[obsCurrNodeForTree]; - uint sn = (uint)(data[i * nCols + idx] > ftrValueForTree[obsCurrNodeForTree]); - obsCurrNodeForTree -= obsSplitMarkForTree * (obsCurrNodeForTree - (uint)classLabelsOrNextNodeIdxForTree[obsCurrNodeForTree] - sn); - obsSplitMarkForTree = (uint)(ftrIdxForTree[obsCurrNodeForTree] != leafMark); - } - int clIdx = classLabelsOrNextNodeIdxForTree[obsCurrNodeForTree]; - obsClassHist[i * n_tree_groups * nClasses + clIdx * n_tree_groups + tree_group_id] += scale; - } - } - } - - __kernel void reduceClassHist(__global algorithmFPType * obsClassHist, __global algorithmFPType * resObsClassHist, int nRows, int nTrees) { - const int nClasses = NUM_OF_CLASSES; - const int group_id = get_group_id(0); - const int n_groups = get_num_groups(0); - const int sub_group_local_id = get_sub_group_local_id(); - const int sub_group_size = get_sub_group_size(); - - const int nElementsForGroup = nRows / n_groups + !!(nRows % n_groups); - - const int iStart = group_id * nElementsForGroup; - int iEnd = (group_id + 1) * nElementsForGroup; - - iEnd = (iEnd > nRows) ? 
nRows : iEnd; - - // obsClassHist each row contains certain class values from each tree for this observation - // obsClassHist[0] = obs0_cls0_val_from_tree0, obs0_cls0_val_from_tree1 ... - // obsClassHist[1] = obs0_cls1_val_from_tree0, obs0_cls1_val_from_tree1 ... - - for (int rowIdx = iStart; rowIdx < iEnd; rowIdx++) - { - for (int clIdx = 0; clIdx < nClasses; clIdx++) - { - int class_offset = rowIdx * nTrees * nClasses + clIdx * nTrees; - - algorithmFPType class_val = (algorithmFPType)0; - for (int i = sub_group_local_id; i < nTrees; i += sub_group_size) - { - class_val += obsClassHist[class_offset + i]; - } - - class_val = sub_group_reduce_add(class_val); - - if (0 == sub_group_local_id) - { - resObsClassHist[rowIdx * nClasses + clIdx] = class_val; - } - } - } - } - - __kernel void determineWinners(const __global algorithmFPType * classHist, __global algorithmFPType * res, int nRows) { - const int nClasses = NUM_OF_CLASSES; - const int local_id = get_local_id(0); - const int local_size = get_local_size(0); - const int n_groups = get_num_groups(0); - const int group_id = get_group_id(0); - - const int nElementsForGroup = nRows / n_groups + !!(nRows % n_groups); - - const int iStart = group_id * nElementsForGroup; - int iEnd = (group_id + 1) * nElementsForGroup; - - iEnd = (iEnd > nRows) ? 
nRows : iEnd; - - for (int i = iStart + local_id; i < iEnd; i += local_size) - { - algorithmFPType clsCount = (algorithmFPType)0; - algorithmFPType clsWinner = (algorithmFPType)0; - for (int clIdx = 0; clIdx < nClasses; clIdx++) - { - if (clsCount < classHist[i * nClasses + clIdx]) - { - clsCount = classHist[i * nClasses + clIdx]; - clsWinner = (algorithmFPType)clIdx; - } - } - res[i] = clsWinner; - } - }); - -#endif diff --git a/cpp/daal/src/algorithms/dtrees/forest/classification/oneapi/df_classification_predict_dense_kernel_oneapi.h b/cpp/daal/src/algorithms/dtrees/forest/classification/oneapi/df_classification_predict_dense_kernel_oneapi.h deleted file mode 100644 index 21c6236e8c7..00000000000 --- a/cpp/daal/src/algorithms/dtrees/forest/classification/oneapi/df_classification_predict_dense_kernel_oneapi.h +++ /dev/null @@ -1,126 +0,0 @@ -/* file: df_classification_predict_dense_kernel_oneapi.h */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Declaration of structure containing kernels for decision forest -// prediction for GPU for the dense method. 
-//-- -*/ - -#ifndef __DF_CLASSIFICATION_PREDICT_DENSE_KERNEL_ONEAPI_H__ -#define __DF_CLASSIFICATION_PREDICT_DENSE_KERNEL_ONEAPI_H__ - -#include "services/internal/sycl/types.h" -#include "services/internal/sycl/execution_context.h" -#include "data_management/data/numeric_table.h" -#include "algorithms/algorithm_base_common.h" -#include "algorithms/decision_forest/decision_forest_classification_predict.h" -#include "src/algorithms/dtrees/forest/classification/df_classification_model_impl.h" -#include "algorithms/decision_forest/decision_forest_classification_model.h" -#include "src/services/service_data_utils.h" - -namespace daal -{ -namespace algorithms -{ -namespace decision_forest -{ -namespace classification -{ -namespace prediction -{ -namespace internal -{ -template -class PredictKernelOneAPI : public algorithms::Kernel -{ -public: - PredictKernelOneAPI() : _nClasses(0), _nTreeGroups(0), _votingMethod(VotingMethod::unweighted) {}; - PredictKernelOneAPI(const PredictKernelOneAPI &) = delete; - PredictKernelOneAPI & operator=(const PredictKernelOneAPI &) = delete; - ~PredictKernelOneAPI() {}; - - services::Status buildProgram(services::internal::sycl::ClKernelFactoryIface & factory, const char * programName, const char * programSrc, - const char * buildOptions); - services::Status compute(services::HostAppIface * const pHostApp, const data_management::NumericTable * a, - const decision_forest::classification::Model * const m, data_management::NumericTable * const r, - data_management::NumericTable * const prob, const size_t nClasses, const VotingMethod votingMethod); - services::Status predictByAllTrees(const services::internal::Buffer & srcBuffer, - const decision_forest::classification::Model * const m, services::internal::sycl::UniversalBuffer & classHist, - size_t nRows, size_t nCols); - - services::Status predictByTreesWeighted(const services::internal::Buffer & srcBuffer, - const services::internal::sycl::UniversalBuffer & featureIndexList, - const 
services::internal::sycl::UniversalBuffer & leftOrClassTypeList, - const services::internal::sycl::UniversalBuffer & featureValueList, - const services::internal::sycl::UniversalBuffer & classProba, - services::internal::sycl::UniversalBuffer & obsClassHist, algorithmFPType scale, size_t nRows, - size_t nCols, size_t nTrees, size_t maxTreeSize); - services::Status predictByTreesUnweighted(const services::internal::Buffer & srcBuffer, - const services::internal::sycl::UniversalBuffer & featureIndexList, - const services::internal::sycl::UniversalBuffer & leftOrClassTypeList, - const services::internal::sycl::UniversalBuffer & featureValueList, - services::internal::sycl::UniversalBuffer & obsClassHist, algorithmFPType scale, size_t nRows, - size_t nCols, size_t nTrees, size_t maxTreeSize); - - services::Status reduceClassHist(const services::internal::sycl::UniversalBuffer & obsClassHist, - services::internal::sycl::UniversalBuffer & classHist, size_t nRows, size_t nTrees); - services::Status determineWinners(const services::internal::sycl::UniversalBuffer & classHist, - services::internal::Buffer & resBuffer, size_t nRows); - -private: - const size_t _preferableSubGroup = 16; // preferable maximal sub-group size - const size_t _maxLocalSize = 128; - const size_t _maxGroupsNum = 256; - - // following constants showed best performance on benchmark's datasets - const size_t _nRowsLarge = 500000; - const size_t _nRowsMedium = 100000; - - const size_t _nRowsBlocksForLarge = 16; - const size_t _nRowsBlocksForMedium = 8; - - const size_t _nTreesLarge = 192; - const size_t _nTreesMedium = 48; - const size_t _nTreesSmall = 12; - - const size_t _nTreeGroupsForLarge = 128; - const size_t _nTreeGroupsForMedium = 32; - const size_t _nTreeGroupsForSmall = 16; - const size_t _nTreeGroupsMin = 8; - - static constexpr size_t _int32max = static_cast(services::internal::MaxVal::get()); - - size_t _nClasses; - size_t _nTreeGroups; - VotingMethod _votingMethod; - - 
services::internal::sycl::KernelPtr kernelPredictByTreesWeighted; - services::internal::sycl::KernelPtr kernelPredictByTreesUnweighted; - services::internal::sycl::KernelPtr kernelReduceClassHist; - services::internal::sycl::KernelPtr kernelDetermineWinners; -}; - -} // namespace internal -} // namespace prediction -} // namespace classification -} // namespace decision_forest -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/dtrees/forest/classification/oneapi/df_classification_predict_dense_oneapi_impl.i b/cpp/daal/src/algorithms/dtrees/forest/classification/oneapi/df_classification_predict_dense_oneapi_impl.i deleted file mode 100644 index bc8e9cd882a..00000000000 --- a/cpp/daal/src/algorithms/dtrees/forest/classification/oneapi/df_classification_predict_dense_oneapi_impl.i +++ /dev/null @@ -1,580 +0,0 @@ -/* file: df_classification_predict_dense_oneapi_impl.i */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of auxiliary functions for decision forest classification -// hist method. 
-//-- -*/ - -#ifndef __DF_CLASSIFICATION_PREDICT_DENSE_ONEAPI_IMPL_I__ -#define __DF_CLASSIFICATION_PREDICT_DENSE_ONEAPI_IMPL_I__ - -#include "src/algorithms/dtrees/forest/classification/oneapi/df_classification_predict_dense_kernel_oneapi.h" -#include "src/algorithms/dtrees/forest/classification/oneapi/cl_kernels/df_batch_predict_classification_kernels.cl" - -#include "src/algorithms/dtrees/forest/classification/df_classification_model_impl.h" - -#include "src/externals/service_profiler.h" -#include "services/internal/buffer.h" -#include "data_management/data/numeric_table.h" -#include "src/data_management/service_numeric_table.h" -#include "services/env_detect.h" -#include "services/error_indexes.h" -#include "src/services/service_data_utils.h" -#include "src/services/service_algo_utils.h" -#include "src/services/service_arrays.h" -#include "src/services/service_utils.h" -#include "src/services/daal_strings.h" -#include "services/internal/sycl/types.h" - -using namespace daal::services; -using namespace daal::services::internal; -using namespace daal::internal; -using namespace daal::services::internal::sycl; -using namespace daal::algorithms::dtrees::internal; - -namespace daal -{ -namespace algorithms -{ -namespace decision_forest -{ -namespace classification -{ -namespace prediction -{ -namespace internal -{ -static services::String getBuildOptions(size_t nClasses) -{ - DAAL_ASSERT(nClasses <= static_cast(services::internal::MaxVal::get())); - char buffer[DAAL_MAX_STRING_SIZE] = { 0 }; - const auto written = daal::services::daal_int_to_string(buffer, DAAL_MAX_STRING_SIZE, static_cast(nClasses)); - services::String nClassesStr(buffer, written); - services::String buildOptions = " -D NUM_OF_CLASSES="; - buildOptions.add(nClassesStr); - - return buildOptions; -} - -template -services::Status PredictKernelOneAPI::buildProgram(ClKernelFactoryIface & factory, const char * programName, - const char * programSrc, const char * buildOptions) -{ - services::Status 
status; - - DAAL_ITTNOTIFY_SCOPED_TASK(compute.buildProgram); - { - auto fptype_name = getKeyFPType(); - auto build_options = fptype_name; - build_options.add(" -cl-std=CL1.2 "); - - if (buildOptions) - { - build_options.add(buildOptions); - } - - services::String cachekey("__daal_algorithms_df_batch_classification_"); - cachekey.add(build_options); - cachekey.add(programName); - - factory.build(ExecutionTargetIds::device, cachekey.c_str(), programSrc, build_options.c_str(), status); - DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -/////////////////////////////////////////////////////////////////////////////////////////// -/* compute method for PredictKernelOneAPI */ -/////////////////////////////////////////////////////////////////////////////////////////// -template -services::Status PredictKernelOneAPI::compute(services::HostAppIface * const pHostApp, const NumericTable * const x, - const decision_forest::classification::Model * const m, - NumericTable * const res, NumericTable * const prob, const size_t nClasses, - const VotingMethod votingMethod) -{ - services::Status status; - - _nClasses = nClasses; - _votingMethod = votingMethod; - - const size_t nRows = x->getNumberOfRows(); - const size_t nCols = x->getNumberOfColumns(); - - const daal::algorithms::decision_forest::classification::internal::ModelImpl * const pModel = - static_cast(m); - const auto nTrees = pModel->size(); - - DAAL_CHECK_EX((nClasses <= _int32max), ErrorIncorrectParameter, ParameterName, nClassesStr()); - - if (nRows > _int32max) - { - return services::Status(services::ErrorIncorrectNumberOfRowsInInputNumericTable); - } - if (nCols > _int32max) - { - return services::Status(services::ErrorIncorrectNumberOfColumnsInInputNumericTable); - } - if (nTrees > _int32max) - { - return services::Status(services::ErrorIncorrectSizeOfModel); - } - - services::String buildOptions = getBuildOptions(_nClasses); - - auto & context = Environment::getInstance()->getDefaultExecutionContext(); - 
auto & kernel_factory = context.getClKernelFactory(); - - DAAL_CHECK_STATUS_VAR(buildProgram(kernel_factory, "predict_cls_kernels", df_batch_predict_classification_kernels, buildOptions.c_str())); - - kernelPredictByTreesWeighted = kernel_factory.getKernel("predictByTreesWeighted", status); - kernelPredictByTreesUnweighted = kernel_factory.getKernel("predictByTreesUnweighted", status); - kernelReduceClassHist = kernel_factory.getKernel("reduceClassHist", status); - kernelDetermineWinners = kernel_factory.getKernel("determineWinners", status); - DAAL_CHECK_STATUS_VAR(status); - - BlockDescriptor dataBlock; - DAAL_CHECK_STATUS_VAR(const_cast(x)->getBlockOfRows(0, nRows, readOnly, dataBlock)); - - BlockDescriptor probBlock; - - auto dataBuffer = dataBlock.getBuffer(); - - UniversalBuffer classHist; - if (prob) - { - DAAL_CHECK_STATUS_VAR(const_cast(prob)->getBlockOfRows(0, nRows, readWrite, probBlock)); - classHist = probBlock.getBuffer(); - } - else - { - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, _nClasses, nRows); - classHist = context.allocate(TypeIds::id(), _nClasses * nRows, status); - DAAL_CHECK_STATUS_VAR(status); - } - - DAAL_CHECK_STATUS_VAR(predictByAllTrees(dataBuffer, m, classHist, nRows, nCols)); - - if (res) - { - BlockDescriptor resBlock; - DAAL_CHECK_STATUS_VAR(const_cast(res)->getBlockOfRows(0, nRows, writeOnly, resBlock)); - - auto resBuffer = resBlock.getBuffer(); - DAAL_CHECK_STATUS_VAR(determineWinners(classHist, resBuffer, nRows)); - DAAL_CHECK_STATUS_VAR(const_cast(res)->releaseBlockOfRows(resBlock)); - } - - DAAL_CHECK_STATUS_VAR(const_cast(x)->releaseBlockOfRows(dataBlock)); - if (prob) - { - DAAL_CHECK_STATUS_VAR(const_cast(prob)->releaseBlockOfRows(probBlock)); - } - - return status; -} - -template -services::Status PredictKernelOneAPI::predictByAllTrees(const services::internal::Buffer & srcBuffer, - const decision_forest::classification::Model * const m, - UniversalBuffer & classHist, size_t nRows, size_t nCols) -{ - services::Status 
status; - const daal::algorithms::decision_forest::classification::internal::ModelImpl * const pModel = - static_cast(m); - - auto & context = services::internal::getDefaultContext(); - - TArray _aTree; - - const auto nTrees = pModel->size(); - - _aTree.reset(nTrees); - DAAL_CHECK_MALLOC(_aTree.get()); - - _nTreeGroups = _nTreeGroupsMin; - - if (nTrees > _nTreesLarge) - { - _nTreeGroups = _nTreeGroupsForLarge; - } - else if (nTrees > _nTreesMedium) - { - _nTreeGroups = _nTreeGroupsForMedium; - } - else if (nTrees > _nTreesSmall) - { - _nTreeGroups = _nTreeGroupsForSmall; - } - - size_t maxTreeSize = 0; - for (size_t i = 0; i < nTrees; ++i) - { - _aTree[i] = pModel->at(i); - maxTreeSize = maxTreeSize < _aTree[i]->getNumberOfRows() ? _aTree[i]->getNumberOfRows() : maxTreeSize; - } - - if (maxTreeSize > _int32max) - { - return services::Status(services::ErrorIncorrectSizeOfModel); - } - - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, maxTreeSize, nTrees); - const size_t treeBlockSize = maxTreeSize * nTrees; - - TArray tFI(treeBlockSize); - TArray tLC(treeBlockSize); - TArray tFV(treeBlockSize); - - bool weighted = false; - auto ftrIdxArr = context.allocate(TypeIds::id(), treeBlockSize, status); - DAAL_CHECK_STATUS_VAR(status); - auto leftNodeIdxOrClassIdArr = context.allocate(TypeIds::id(), treeBlockSize, status); - DAAL_CHECK_STATUS_VAR(status); - auto ftrValueArr = context.allocate(TypeIds::id(), treeBlockSize, status); - DAAL_CHECK_STATUS_VAR(status); - - size_t mulClassesTreeGroups = _nClasses * _nTreeGroups; - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, nRows, mulClassesTreeGroups); - auto obsClassHist = context.allocate(TypeIds::id(), nRows * mulClassesTreeGroups, status); - DAAL_CHECK_STATUS_VAR(status); - context.fill(obsClassHist, (algorithmFPType)0, status); - DAAL_CHECK_STATUS_VAR(status); - context.fill(classHist, (algorithmFPType)0, status); - DAAL_CHECK_STATUS_VAR(status); - - UniversalBuffer probasArr; - TArray probasArrHost(treeBlockSize * 
_nClasses); - DAAL_CHECK_MALLOC(probasArrHost.get()); - - if (_votingMethod == VotingMethod::weighted && pModel->getProbas(0)) - { - probasArr = context.allocate(TypeIds::id(), treeBlockSize * _nClasses, status); - DAAL_CHECK_STATUS_VAR(status); - weighted = true; - } - - for (size_t iTree = 0; iTree < nTrees; iTree++) - { - const size_t treeSize = _aTree[iTree]->getNumberOfRows(); - const DecisionTreeNode * const aNode = (const DecisionTreeNode *)(*_aTree[iTree]).getArray(); - - int32_t * const fi = tFI.get() + iTree * maxTreeSize; - int32_t * const lc = tLC.get() + iTree * maxTreeSize; - algorithmFPType * const fv = tFV.get() + iTree * maxTreeSize; - - PRAGMA_IVDEP - PRAGMA_VECTOR_ALWAYS - for (size_t i = 0; i < treeSize; i++) - { - fi[i] = aNode[i].featureIndex; - lc[i] = aNode[i].leftIndexOrClass; - fv[i] = (algorithmFPType)aNode[i].featureValueOrResponse; - } - - if (weighted) - { - const double * probas = pModel->getProbas(iTree); - - algorithmFPType * dst_ptr = probasArrHost.get() + iTree * maxTreeSize * _nClasses; - - PRAGMA_IVDEP - PRAGMA_VECTOR_ALWAYS - for (size_t i = 0; i < treeSize * _nClasses; i++) - { - dst_ptr[i] = static_cast(probas[i]); - } - } - } - - if (weighted) - { - context.copy(probasArr, 0, (void *)probasArrHost.get(), treeBlockSize * _nClasses, 0, treeBlockSize * _nClasses, status); - DAAL_CHECK_STATUS_VAR(status); - } - - algorithmFPType probasScale = (algorithmFPType)1 / nTrees; - - context.copy(ftrIdxArr, 0, (void *)tFI.get(), treeBlockSize, 0, treeBlockSize, status); - DAAL_CHECK_STATUS_VAR(status); - context.copy(leftNodeIdxOrClassIdArr, 0, (void *)tLC.get(), treeBlockSize, 0, treeBlockSize, status); - DAAL_CHECK_STATUS_VAR(status); - context.copy(ftrValueArr, 0, (void *)tFV.get(), treeBlockSize, 0, treeBlockSize, status); - DAAL_CHECK_STATUS_VAR(status); - - if (weighted) - { - DAAL_CHECK_STATUS_VAR(predictByTreesWeighted(srcBuffer, ftrIdxArr, leftNodeIdxOrClassIdArr, ftrValueArr, probasArr, obsClassHist, probasScale, - nRows, 
nCols, nTrees, maxTreeSize)); - } - else - { - DAAL_CHECK_STATUS_VAR(predictByTreesUnweighted(srcBuffer, ftrIdxArr, leftNodeIdxOrClassIdArr, ftrValueArr, obsClassHist, probasScale, nRows, - nCols, nTrees, maxTreeSize)); - } - DAAL_CHECK_STATUS_VAR(reduceClassHist(obsClassHist, classHist, nRows, _nTreeGroups)); - - return status; -} - -template -services::Status PredictKernelOneAPI::predictByTreesWeighted( - const services::internal::Buffer & srcBuffer, const UniversalBuffer & featureIndexList, - const UniversalBuffer & leftOrClassTypeList, const UniversalBuffer & featureValueList, const UniversalBuffer & classProba, - UniversalBuffer & obsClassHist, algorithmFPType scale, size_t nRows, size_t nCols, size_t nTrees, size_t maxTreeSize) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.predictByTreesWeighted); - - services::Status status; - - auto & context = services::internal::getDefaultContext(); - - auto & kernel = kernelPredictByTreesWeighted; - - DAAL_CHECK_STATUS_VAR(status); - - size_t localSize = _maxLocalSize; - size_t nRowsBlocks = 1; - if (nRows > _nRowsLarge) - { - nRowsBlocks = _nRowsBlocksForLarge; - } - else if (nRows > _nRowsMedium) - { - nRowsBlocks = _nRowsBlocksForMedium; - } - { - KernelRange local_range(localSize, 1); - KernelRange global_range(nRowsBlocks * localSize, _nTreeGroups); - - KernelNDRange range(2); - range.local(local_range, status); - DAAL_CHECK_STATUS_VAR(status); - range.global(global_range, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_ASSERT(nRows <= _int32max); - DAAL_ASSERT(nCols <= _int32max); - DAAL_ASSERT(nTrees <= _int32max); - DAAL_ASSERT(maxTreeSize <= _int32max); - - DAAL_ASSERT(srcBuffer.size() == nRows * nCols); - - DAAL_ASSERT_UNIVERSAL_BUFFER(featureIndexList, int32_t, maxTreeSize * nTrees); - DAAL_ASSERT_UNIVERSAL_BUFFER(leftOrClassTypeList, int32_t, maxTreeSize * nTrees); - DAAL_ASSERT_UNIVERSAL_BUFFER(featureValueList, algorithmFPType, maxTreeSize * nTrees); - DAAL_ASSERT_UNIVERSAL_BUFFER(classProba, algorithmFPType, 
maxTreeSize * nTrees * _nClasses); - DAAL_ASSERT_UNIVERSAL_BUFFER(obsClassHist, algorithmFPType, nRows * _nClasses * _nTreeGroups); - - for (size_t procTrees = 0; procTrees < nTrees; procTrees += _nTreeGroups) - { - KernelArguments args(12, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, srcBuffer, AccessModeIds::read); - args.set(1, featureIndexList, AccessModeIds::read); - args.set(2, leftOrClassTypeList, AccessModeIds::read); - args.set(3, featureValueList, AccessModeIds::read); - args.set(4, classProba, AccessModeIds::read); - args.set(5, obsClassHist, AccessModeIds::readwrite); - args.set(6, scale); - args.set(7, static_cast(nRows)); - args.set(8, static_cast(nCols)); - args.set(9, static_cast(nTrees)); - args.set(10, static_cast(maxTreeSize)); - args.set(11, static_cast(procTrees)); - - context.run(range, kernel, args, status); - - DAAL_CHECK_STATUS_VAR(status); - } - } - - return status; -} - -template -services::Status PredictKernelOneAPI::predictByTreesUnweighted(const services::internal::Buffer & srcBuffer, - const UniversalBuffer & featureIndexList, - const UniversalBuffer & leftOrClassTypeList, - const UniversalBuffer & featureValueList, - UniversalBuffer & obsClassHist, algorithmFPType scale, - size_t nRows, size_t nCols, size_t nTrees, size_t maxTreeSize) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.predictByTreesUnweighted); - - services::Status status; - - auto & context = services::internal::getDefaultContext(); - - auto & kernel = kernelPredictByTreesUnweighted; - - DAAL_CHECK_STATUS_VAR(status); - - size_t localSize = _maxLocalSize; - size_t nRowsBlocks = 1; - if (nRows > _nRowsLarge) - { - nRowsBlocks = _nRowsBlocksForLarge; - } - else if (nRows > _nRowsMedium) - { - nRowsBlocks = _nRowsBlocksForMedium; - } - { - KernelRange local_range(localSize, 1); - KernelRange global_range(nRowsBlocks * localSize, _nTreeGroups); - - KernelNDRange range(2); - range.local(local_range, status); - DAAL_CHECK_STATUS_VAR(status); - range.global(global_range, 
status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_ASSERT(nRows <= _int32max); - DAAL_ASSERT(nCols <= _int32max); - DAAL_ASSERT(nTrees <= _int32max); - DAAL_ASSERT(maxTreeSize <= _int32max); - - DAAL_ASSERT(srcBuffer.size() == nRows * nCols); - - DAAL_ASSERT_UNIVERSAL_BUFFER(featureIndexList, int32_t, maxTreeSize * nTrees); - DAAL_ASSERT_UNIVERSAL_BUFFER(leftOrClassTypeList, int32_t, maxTreeSize * nTrees); - DAAL_ASSERT_UNIVERSAL_BUFFER(featureValueList, algorithmFPType, maxTreeSize * nTrees); - DAAL_ASSERT_UNIVERSAL_BUFFER(obsClassHist, algorithmFPType, nRows * _nClasses * _nTreeGroups); - - for (size_t procTrees = 0; procTrees < nTrees; procTrees += _nTreeGroups) - { - KernelArguments args(11, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, srcBuffer, AccessModeIds::read); - args.set(1, featureIndexList, AccessModeIds::read); - args.set(2, leftOrClassTypeList, AccessModeIds::read); - args.set(3, featureValueList, AccessModeIds::read); - args.set(4, obsClassHist, AccessModeIds::readwrite); - args.set(5, scale); - args.set(6, static_cast(nRows)); - args.set(7, static_cast(nCols)); - args.set(8, static_cast(nTrees)); - args.set(9, static_cast(maxTreeSize)); - args.set(10, static_cast(procTrees)); - - context.run(range, kernel, args, status); - - DAAL_CHECK_STATUS_VAR(status); - } - } - - return status; -} - -template -services::Status PredictKernelOneAPI::reduceClassHist(const UniversalBuffer & obsClassHist, UniversalBuffer & classHist, - size_t nRows, size_t nTrees) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.reduceClassHist); - - services::Status status; - - auto & context = services::internal::getDefaultContext(); - auto & kernel = kernelReduceClassHist; - - size_t localSize = _preferableSubGroup; - size_t nGroups = _maxGroupsNum; - { - DAAL_ASSERT(nRows <= _int32max); - DAAL_ASSERT(nTrees <= _int32max); - - DAAL_ASSERT_UNIVERSAL_BUFFER(obsClassHist, algorithmFPType, nRows * _nClasses * _nTreeGroups); - DAAL_ASSERT_UNIVERSAL_BUFFER(classHist, algorithmFPType, 
nRows * _nClasses); - - KernelArguments args(4, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, obsClassHist, AccessModeIds::read); - args.set(1, classHist, AccessModeIds::readwrite); - args.set(2, static_cast(nRows)); - args.set(3, static_cast(nTrees)); - - KernelRange local_range(localSize); - KernelRange global_range(nGroups * localSize); - - KernelNDRange range(1); - range.local(local_range, status); - DAAL_CHECK_STATUS_VAR(status); - range.global(global_range, status); - DAAL_CHECK_STATUS_VAR(status); - - context.run(range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -template -services::Status PredictKernelOneAPI::determineWinners(const UniversalBuffer & classHist, - services::internal::Buffer & resBuffer, size_t nRows) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.determineWinners); - - services::Status status; - - auto & context = services::internal::getDefaultContext(); - - auto & kernel = kernelDetermineWinners; - - size_t localSize = _maxLocalSize; - size_t nGroups = _maxGroupsNum; - - { - DAAL_ASSERT(nRows <= _int32max); - - DAAL_ASSERT(resBuffer.size() == nRows * 1); - DAAL_ASSERT_UNIVERSAL_BUFFER(classHist, algorithmFPType, nRows * _nClasses); - - KernelArguments args(3, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, classHist, AccessModeIds::read); - args.set(1, resBuffer, AccessModeIds::write); - args.set(2, static_cast(nRows)); - - KernelRange local_range(localSize); - KernelRange global_range(nGroups * localSize); - - KernelNDRange range(1); - range.local(local_range, status); - DAAL_CHECK_STATUS_VAR(status); - range.global(global_range, status); - DAAL_CHECK_STATUS_VAR(status); - - context.run(range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -} /* namespace internal */ -} /* namespace prediction */ -} /* namespace classification */ -} /* namespace decision_forest */ -} /* namespace algorithms */ -} /* namespace daal */ - -#endif diff --git 
a/cpp/daal/src/algorithms/dtrees/forest/classification/oneapi/df_classification_train_hist_kernel_oneapi.h b/cpp/daal/src/algorithms/dtrees/forest/classification/oneapi/df_classification_train_hist_kernel_oneapi.h deleted file mode 100644 index df0b0e02cea..00000000000 --- a/cpp/daal/src/algorithms/dtrees/forest/classification/oneapi/df_classification_train_hist_kernel_oneapi.h +++ /dev/null @@ -1,182 +0,0 @@ -/* file: df_classification_train_hist_kernel_oneapi.h */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Declaration of structure containing kernels for decision forest -// training for GPU for the hist method. 
-//-- -*/ - -#ifndef __DF_CLASSIFICATION_TRAIN_HIST_KERNEL_ONEAPI_H__ -#define __DF_CLASSIFICATION_TRAIN_HIST_KERNEL_ONEAPI_H__ - -#include "services/internal/sycl/types.h" -#include "services/internal/sycl/execution_context.h" -#include "data_management/data/numeric_table.h" -#include "algorithms/algorithm_base_common.h" -#include "src/algorithms/dtrees/forest/classification/df_classification_model_impl.h" -#include "algorithms/decision_forest/decision_forest_classification_training_types.h" -#include "algorithms/decision_forest/decision_forest_classification_model.h" -#include "src/algorithms/dtrees/forest/oneapi/df_feature_type_helper_oneapi.h" -#include "src/algorithms/dtrees/forest/oneapi/df_tree_level_build_helper_oneapi.h" - -namespace daal -{ -namespace algorithms -{ -namespace decision_forest -{ -namespace classification -{ -namespace training -{ -namespace internal -{ -template -class ClassificationTrainBatchKernelOneAPI : public daal::algorithms::Kernel -{ -public: - ClassificationTrainBatchKernelOneAPI() {} - services::Status compute(services::HostAppIface * pHostApp, const NumericTable * x, const NumericTable * y, - decision_forest::classification::Model & m, Result & res, const Parameter & par) - { - return services::ErrorMethodNotImplemented; - } -}; - -template -class ClassificationTrainBatchKernelOneAPI : public daal::algorithms::Kernel -{ -public: - ClassificationTrainBatchKernelOneAPI() : _nClasses(0), _nRows(0), _nFeatures(0), _nSelectedRows(0), _nMaxBinsAmongFtrs(0), _totalBins(0) {}; - - services::Status compute(services::HostAppIface * pHostApp, const NumericTable * x, const NumericTable * y, - decision_forest::classification::Model & m, Result & res, const Parameter & par); - -private: - services::Status buildProgram(services::internal::sycl::ClKernelFactoryIface & factory, const char * programName, const char * programSrc, - const char * buildOptions); - - size_t getPartHistRequiredMemSize(size_t nSelectedFeatures, size_t 
nMaxBinsAmongFtrs); - - services::Status computeBestSplit(const services::internal::sycl::UniversalBuffer & data, services::internal::sycl::UniversalBuffer & treeOrder, - services::internal::sycl::UniversalBuffer & selectedFeatures, size_t nSelectedFeatures, - const services::internal::Buffer & response, - services::internal::sycl::UniversalBuffer & nodeOffsets, services::internal::sycl::UniversalBuffer & binOffsets, - services::internal::sycl::UniversalBuffer & splitInfo, - services::internal::sycl::UniversalBuffer & nodeImpDecreaseList, bool updateImpDecreaseRequired, - size_t nFeatures, size_t nNodes, size_t minObservationsInLeafNode, algorithmFPType impurityThreshold); - - services::Status computeBestSplitSinglePass( - const services::internal::sycl::UniversalBuffer & data, services::internal::sycl::UniversalBuffer & treeOrder, - services::internal::sycl::UniversalBuffer & selectedFeatures, size_t nSelectedFeatures, - const services::internal::Buffer & response, services::internal::sycl::UniversalBuffer & binOffsets, - services::internal::sycl::UniversalBuffer & nodeList, services::internal::sycl::UniversalBuffer & nodeIndices, size_t nodeIndicesOffset, - services::internal::sycl::UniversalBuffer & impList, services::internal::sycl::UniversalBuffer & nodeImpDecreaseList, - bool updateImpDecreaseRequired, size_t nFeatures, size_t nNodes, size_t minObservationsInLeafNode, algorithmFPType impurityThreshold); - - services::Status computeBestSplitByHistogram( - const services::internal::sycl::UniversalBuffer & nodeHistogramList, services::internal::sycl::UniversalBuffer & selectedFeatures, - size_t nSelectedFeatures, services::internal::sycl::UniversalBuffer & nodeList, services::internal::sycl::UniversalBuffer & nodeIndices, - size_t nodeIndicesOffset, services::internal::sycl::UniversalBuffer & binOffsets, services::internal::sycl::UniversalBuffer & splitInfo, - services::internal::sycl::UniversalBuffer & nodeImpDecreaseList, bool updateImpDecreaseRequired, size_t 
nNodes, size_t nMaxBinsAmongFtrs, - size_t minObservationsInLeafNode, algorithmFPType impurityThreshold); - - services::Status computePartialHistograms(const services::internal::sycl::UniversalBuffer & data, - services::internal::sycl::UniversalBuffer & treeOrder, - services::internal::sycl::UniversalBuffer & selectedFeatures, size_t nSelectedFeatures, - const services::internal::Buffer & response, - services::internal::sycl::UniversalBuffer & nodeList, - services::internal::sycl::UniversalBuffer & nodeIndices, size_t nodeIndicesOffset, - services::internal::sycl::UniversalBuffer & binOffsets, size_t nMaxBinsAmongFtrs, size_t nFeatures, - size_t nNodes, services::internal::sycl::UniversalBuffer & partialHistograms, - size_t nPartialHistograms); - - services::Status reducePartialHistograms(services::internal::sycl::UniversalBuffer & partialHistograms, - services::internal::sycl::UniversalBuffer & histograms, size_t nPartialHistograms, size_t nNodes, - size_t nSelectedFeatures, size_t nMaxBinsAmongFtrs, size_t reduceLocalSize); - - services::Status computeResults(const dtrees::internal::Tree & t, const algorithmFPType * x, const algorithmFPType * y, const size_t nRows, - const size_t nFeatures, const services::internal::sycl::UniversalBuffer & oobIndices, - const services::internal::sycl::UniversalBuffer & oobRowsNumList, - services::internal::sycl::UniversalBuffer & oobBuf, algorithmFPType * varImp, algorithmFPType * varImpVariance, - size_t nBuiltTrees, const engines::EnginePtr & engine, size_t nTreesInBlock, size_t treeIndex, - const Parameter & par); - - algorithmFPType computeOOBError(const dtrees::internal::Tree & t, const algorithmFPType * x, const algorithmFPType * y, const size_t nRows, - const size_t nFeatures, const services::internal::sycl::UniversalBuffer & indices, size_t indicesOffset, size_t n, - services::internal::sycl::UniversalBuffer oobBuf, services::Status & status); - - algorithmFPType computeOOBErrorPerm(const dtrees::internal::Tree & t, const 
algorithmFPType * x, const algorithmFPType * y, const size_t nRows, - const size_t nFeatures, const services::internal::sycl::UniversalBuffer & indices, size_t indicesOffset, - const int * indicesPerm, const size_t testFtrInd, size_t n, services::Status & status); - - services::Status finalizeOOBError(const algorithmFPType * y, const services::internal::sycl::UniversalBuffer & oobBuf, const size_t nRows, - algorithmFPType * res, algorithmFPType * resPerObs, algorithmFPType * resAccuracy, - algorithmFPType * resDecisionFunction); - - services::Status finalizeVarImp(const Parameter & par, algorithmFPType * varImp, algorithmFPType * varImpVariance, size_t nFeatures); - - services::internal::sycl::KernelPtr kernelComputePartialHistograms; - services::internal::sycl::KernelPtr kernelReducePartialHistograms; - services::internal::sycl::KernelPtr kernelComputeBestSplitByHistogram; - services::internal::sycl::KernelPtr kernelComputeBestSplitSinglePass; - - decision_forest::internal::TreeLevelBuildHelperOneAPI _treeLevelBuildHelper; - - const size_t _maxWorkItemsPerGroup = 256; // should be a power of two for interal needs - const size_t _preferableSubGroup = 16; // preferable maximal sub-group size - const size_t _maxLocalSize = 128; - const size_t _maxLocalSums = 256; - const size_t _maxLocalHistograms = 256; - const size_t _preferableGroupSize = 256; - const size_t _minRowsBlock = 256; - const size_t _maxBins = 256; - const size_t _reduceLocalSizePartHist = 64; - - const size_t _minPreferableLocalSizeForPartHistKernel = 32; - - const double _globalMemFractionForTreeBlock = 0.6; // part of free global mem which can be used for processing block of tree - const double _globalMemFractionForPartHist = 0.2; // part of free global mem which can be used for partial histograms - const size_t _maxMemAllocSizeForAlgo = 1073741824; // 1 Gb it showed better efficiency than using just platform info.maxMemAllocSize - const size_t _minRowsBlocksForMaxPartHistNum = 16384; - const size_t 
_minRowsBlocksForOneHist = 128; - - const size_t _nNodesGroups = 3; // all nodes are split on groups (big, medium, small) - const size_t _nodeGroupProps = 2; // each nodes Group contains props: numOfNodes, maxNumOfBlocks - - static constexpr size_t _int32max = static_cast(services::internal::MaxVal::get()); - - size_t _nClasses; - size_t _nRows; - size_t _nFeatures; - size_t _nSelectedRows; - size_t _nMaxBinsAmongFtrs; - size_t _totalBins; - size_t _preferableLocalSizeForPartHistKernel; // local size for histogram collection depends on num of selected features - size_t _maxPartHistCumulativeSize; // is calculated at the beggining of compute using _globalMemFractionForPartHist -}; - -} // namespace internal -} // namespace training -} // namespace classification -} // namespace decision_forest -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/dtrees/forest/classification/oneapi/df_classification_train_hist_oneapi_impl.i b/cpp/daal/src/algorithms/dtrees/forest/classification/oneapi/df_classification_train_hist_oneapi_impl.i deleted file mode 100644 index 3e7ffad0083..00000000000 --- a/cpp/daal/src/algorithms/dtrees/forest/classification/oneapi/df_classification_train_hist_oneapi_impl.i +++ /dev/null @@ -1,1218 +0,0 @@ -/* file: df_classification_train_hist_oneapi_impl.i */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of auxiliary functions for decision forest classification -// hist method. -//-- -*/ - -#ifndef __DF_CLASSIFICATION_TRAIN_HIST_ONEAPI_IMPL_I__ -#define __DF_CLASSIFICATION_TRAIN_HIST_ONEAPI_IMPL_I__ - -#include "src/algorithms/dtrees/forest/classification/oneapi/df_classification_train_hist_kernel_oneapi.h" -#include "src/algorithms/dtrees/forest/classification/oneapi/cl_kernels/df_batch_classification_kernels.cl" - -#include "src/algorithms/dtrees/forest/oneapi/df_feature_type_helper_oneapi.i" -#include "src/algorithms/dtrees/forest/oneapi/df_tree_level_build_helper_oneapi.i" -#include "src/algorithms/dtrees/forest/classification/df_classification_model_impl.h" -#include "src/algorithms/dtrees/forest/classification/oneapi/df_classification_tree_helper_impl.i" - -#include "src/externals/service_profiler.h" -#include "src/externals/service_rng.h" -#include "src/externals/service_math.h" //will remove after migrating finalize MDA to GPU -#include "services/internal/buffer.h" -#include "data_management/data/numeric_table.h" -#include "services/env_detect.h" -#include "services/error_indexes.h" -#include "src/services/service_data_utils.h" -#include "src/services/service_algo_utils.h" -#include "src/services/service_arrays.h" -#include "src/services/service_utils.h" -#include "src/services/daal_strings.h" -#include "src/algorithms/engines/engine_types_internal.h" -#include "services/internal/sycl/types.h" - -using namespace daal::algorithms::decision_forest::internal; -using namespace daal::algorithms::decision_forest::classification::internal; -using namespace daal::internal; -using namespace daal::services::internal; - -namespace daal -{ -namespace algorithms -{ -namespace decision_forest -{ -namespace classification -{ -namespace training -{ -namespace 
internal -{ -template -static services::String getFPTypeAccuracy() -{ - if (IsSameType::value) - { - return services::String(" -D algorithmFPTypeAccuracy=(float)1e-5 "); - } - if (IsSameType::value) - { - return services::String(" -D algorithmFPTypeAccuracy=(double)1e-10 "); - } - return services::String(); -} - -static services::String getBuildOptions(size_t nClasses) -{ - DAAL_ASSERT(nClasses <= static_cast(services::internal::MaxVal::get())); - char buffer[DAAL_MAX_STRING_SIZE] = { 0 }; - const auto written = daal::services::daal_int_to_string(buffer, DAAL_MAX_STRING_SIZE, static_cast(nClasses)); - services::String nClassesStr(buffer, written); - - services::String buildOptions = " -D NODE_PROPS=6 -D IMPURITY_PROPS=1 -D HIST_PROPS="; - buildOptions.add(nClassesStr); - buildOptions.add(" -D NUM_OF_CLASSES="); - buildOptions.add(nClassesStr); - - return buildOptions; -} - -template -services::Status ClassificationTrainBatchKernelOneAPI::buildProgram(ClKernelFactoryIface & factory, const char * programName, - const char * programSrc, const char * buildOptions) -{ - services::Status status; - - DAAL_ITTNOTIFY_SCOPED_TASK(compute.buildProgram); - { - auto fptype_name = getKeyFPType(); - auto fptype_accuracy = getFPTypeAccuracy(); - auto build_options = fptype_name; - build_options.add(fptype_accuracy); - build_options.add(" -cl-std=CL1.2 "); - build_options.add(" -D LOCAL_BUFFER_SIZE=256 -D MAX_WORK_ITEMS_PER_GROUP=256 "); - - if (buildOptions) - { - build_options.add(buildOptions); - } - - services::String cachekey("__daal_algorithms_df_batch_classification_"); - cachekey.add(build_options); - cachekey.add(programName); - - factory.build(ExecutionTargetIds::device, cachekey.c_str(), programSrc, build_options.c_str(), status); - DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -template -services::Status ClassificationTrainBatchKernelOneAPI::computeBestSplitByHistogram( - const UniversalBuffer & nodeHistogramList, UniversalBuffer & selectedFeatures, size_t 
nSelectedFeatures, UniversalBuffer & nodeList, - UniversalBuffer & nodeIndices, size_t nodeIndicesOffset, UniversalBuffer & binOffsets, UniversalBuffer & impList, - UniversalBuffer & nodeImpDecreaseList, bool updateImpDecreaseRequired, size_t nNodes, size_t nMaxBinsAmongFtrs, size_t minObservationsInLeafNode, - algorithmFPType impurityThreshold) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.computeBestSpitByHistogramLevel); - - services::Status status; - - auto & context = services::internal::getDefaultContext(); - - auto & kernel = kernelComputeBestSplitByHistogram; - - { - DAAL_ASSERT(nSelectedFeatures <= _int32max); - DAAL_ASSERT(nodeIndicesOffset <= _int32max); - DAAL_ASSERT(updateImpDecreaseRequired <= _int32max); - DAAL_ASSERT(nMaxBinsAmongFtrs <= _int32max); - DAAL_ASSERT(minObservationsInLeafNode <= _int32max); - - DAAL_ASSERT_UNIVERSAL_BUFFER(nodeHistogramList, algorithmFPType, nNodes * nSelectedFeatures * _nMaxBinsAmongFtrs * _nClasses); - DAAL_ASSERT_UNIVERSAL_BUFFER(selectedFeatures, int32_t, nNodes * nSelectedFeatures); - DAAL_ASSERT_UNIVERSAL_BUFFER(nodeList, int32_t, nNodes * TreeLevelRecord::_nNodeSplitProps); - DAAL_ASSERT_UNIVERSAL_BUFFER(nodeIndices, int32_t, nNodes); - DAAL_ASSERT_UNIVERSAL_BUFFER(binOffsets, uint32_t, _nFeatures + 1); - DAAL_ASSERT_UNIVERSAL_BUFFER(impList, algorithmFPType, nNodes * (TreeLevelRecord::_nNodeImpProps + _nClasses)); - if (updateImpDecreaseRequired) DAAL_ASSERT_UNIVERSAL_BUFFER(nodeImpDecreaseList, algorithmFPType, nNodes); - - KernelArguments args(13, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, nodeHistogramList, AccessModeIds::read); - args.set(1, selectedFeatures, AccessModeIds::read); - args.set(2, static_cast(nSelectedFeatures)); - args.set(3, binOffsets, AccessModeIds::read); - args.set(4, nodeList, AccessModeIds::readwrite); // nodeList will be updated with split attributes - args.set(5, nodeIndices, AccessModeIds::read); - args.set(6, static_cast(nodeIndicesOffset)); - args.set(7, impList, 
AccessModeIds::write); - args.set(8, nodeImpDecreaseList, AccessModeIds::write); - args.set(9, static_cast(updateImpDecreaseRequired)); - args.set(10, static_cast(nMaxBinsAmongFtrs)); - args.set(11, static_cast(minObservationsInLeafNode)); - args.set(12, impurityThreshold); - - const size_t numOfSubGroupsPerNode = 8; //add logic for adjusting it in accordance with nNodes - size_t localSize = _preferableSubGroup * numOfSubGroupsPerNode; - - KernelRange local_range(localSize, 1); - KernelRange global_range(localSize, nNodes); - - KernelNDRange range(2); - range.local(local_range, status); - DAAL_CHECK_STATUS_VAR(status); - range.global(global_range, status); - DAAL_CHECK_STATUS_VAR(status); - - context.run(range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -template -services::Status ClassificationTrainBatchKernelOneAPI::computeBestSplitSinglePass( - const UniversalBuffer & data, UniversalBuffer & treeOrder, UniversalBuffer & selectedFeatures, size_t nSelectedFeatures, - const services::internal::Buffer & response, UniversalBuffer & binOffsets, UniversalBuffer & nodeList, - UniversalBuffer & nodeIndices, size_t nodeIndicesOffset, UniversalBuffer & impList, UniversalBuffer & nodeImpDecreaseList, - bool updateImpDecreaseRequired, size_t nFeatures, size_t nNodes, size_t minObservationsInLeafNode, algorithmFPType impurityThreshold) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.computeBestSplitSinglePass); - - services::Status status; - - auto & context = services::internal::getDefaultContext(); - - auto & kernel = kernelComputeBestSplitSinglePass; - - { - DAAL_ASSERT(nSelectedFeatures <= _int32max); - DAAL_ASSERT(nodeIndicesOffset <= _int32max); - DAAL_ASSERT(updateImpDecreaseRequired <= _int32max); - DAAL_ASSERT(nFeatures <= _int32max); - DAAL_ASSERT(minObservationsInLeafNode <= _int32max); - DAAL_ASSERT(response.size() == _nRows); - - DAAL_ASSERT_UNIVERSAL_BUFFER(data, uint32_t, _nRows * _nFeatures); - 
DAAL_ASSERT_UNIVERSAL_BUFFER(treeOrder, int32_t, _nSelectedRows); - DAAL_ASSERT_UNIVERSAL_BUFFER(selectedFeatures, int32_t, nNodes * nSelectedFeatures); - DAAL_ASSERT_UNIVERSAL_BUFFER(binOffsets, uint32_t, _nFeatures + 1); - DAAL_ASSERT_UNIVERSAL_BUFFER(nodeList, int32_t, nNodes * TreeLevelRecord::_nNodeSplitProps); - DAAL_ASSERT_UNIVERSAL_BUFFER(nodeIndices, int32_t, nNodes); - DAAL_ASSERT_UNIVERSAL_BUFFER(impList, algorithmFPType, nNodes * (TreeLevelRecord::_nNodeImpProps + _nClasses)); - if (updateImpDecreaseRequired) DAAL_ASSERT_UNIVERSAL_BUFFER(nodeImpDecreaseList, algorithmFPType, nNodes); - - KernelArguments args(15, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, data, AccessModeIds::read); - args.set(1, treeOrder, AccessModeIds::read); - args.set(2, selectedFeatures, AccessModeIds::read); - args.set(3, static_cast(nSelectedFeatures)); - args.set(4, response, AccessModeIds::read); - args.set(5, binOffsets, AccessModeIds::read); - args.set(6, nodeList, AccessModeIds::readwrite); // nodeList will be updated with split attributes - args.set(7, nodeIndices, AccessModeIds::read); - args.set(8, static_cast(nodeIndicesOffset)); - args.set(9, impList, AccessModeIds::write); - args.set(10, nodeImpDecreaseList, AccessModeIds::write); - args.set(11, static_cast(updateImpDecreaseRequired)); - args.set(12, static_cast(nFeatures)); - args.set(13, static_cast(minObservationsInLeafNode)); - args.set(14, impurityThreshold); - - const size_t numOfSubGroupsPerNode = 8; //add logic for adjusting it in accordance with nNodes - size_t localSize = _preferableSubGroup * numOfSubGroupsPerNode; - - KernelRange local_range(localSize, 1); - KernelRange global_range(localSize, nNodes); - - KernelNDRange range(2); - range.local(local_range, status); - DAAL_CHECK_STATUS_VAR(status); - range.global(global_range, status); - DAAL_CHECK_STATUS_VAR(status); - - context.run(range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -template -size_t 
ClassificationTrainBatchKernelOneAPI::getPartHistRequiredMemSize(size_t nSelectedFeatures, size_t nMaxBinsAmongFtrs) -{ - // mul overflow for nSelectedFeatures * _nMaxBinsAmongFtrs and for nHistBins * _nHistProps were checked before kernel call in compute - const size_t nHistBins = nSelectedFeatures * _nMaxBinsAmongFtrs; - return sizeof(algorithmFPType) * nHistBins * _nClasses; -} - -template -services::Status ClassificationTrainBatchKernelOneAPI::computeBestSplit( - const UniversalBuffer & data, UniversalBuffer & treeOrder, UniversalBuffer & selectedFeatures, size_t nSelectedFeatures, - const services::internal::Buffer & response, UniversalBuffer & nodeList, UniversalBuffer & binOffsets, UniversalBuffer & impList, - UniversalBuffer & nodeImpDecreaseList, bool updateImpDecreaseRequired, size_t nFeatures, size_t nNodes, size_t minObservationsInLeafNode, - algorithmFPType impurityThreshold) -{ - services::Status status; - - auto & context = services::internal::getDefaultContext(); - - // no overflow check is required because of _nNodesGroups and _nodeGroupProps are small constants - auto nodesGroups = context.allocate(TypeIds::id(), _nNodesGroups * _nodeGroupProps, status); - DAAL_CHECK_STATUS_VAR(status); - auto nodeIndices = context.allocate(TypeIds::id(), nNodes, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_CHECK_STATUS_VAR( - _treeLevelBuildHelper.splitNodeListOnGroupsBySize(nodeList, nNodes, nodesGroups, _nNodesGroups, _nodeGroupProps, nodeIndices)); - - auto nodesGroupsHost = nodesGroups.template get().toHost(ReadWriteMode::readOnly, status); - DAAL_CHECK_STATUS_VAR(status); - - size_t nGroupNodes = 0; - size_t processedNodes = 0; - - for (size_t i = 0; i < _nNodesGroups; i++, processedNodes += nGroupNodes) - { - nGroupNodes = nodesGroupsHost.get()[i * _nodeGroupProps + 0]; - if (0 == nGroupNodes) continue; - - size_t maxGroupBlocksNum = nodesGroupsHost.get()[i * _nodeGroupProps + 1]; - - size_t groupIndicesOffset = processedNodes; - - if 
(maxGroupBlocksNum > 1) - { - const size_t partHistSize = getPartHistRequiredMemSize(nSelectedFeatures, _nMaxBinsAmongFtrs); - - size_t nPartialHistograms = maxGroupBlocksNum <= _minRowsBlocksForOneHist ? 1 : _maxLocalHistograms; - - if (nPartialHistograms > 1 && maxGroupBlocksNum < _minRowsBlocksForMaxPartHistNum) - { - while (nPartialHistograms > 1 - && (nPartialHistograms * _minRowsBlocksForOneHist > maxGroupBlocksNum - || nPartialHistograms * partHistSize > _maxPartHistCumulativeSize)) - { - nPartialHistograms >>= 1; - } - } - - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, nGroupNodes, partHistSize); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, nGroupNodes * partHistSize, nPartialHistograms); - - const size_t maxPHBlockElems = _maxPartHistCumulativeSize / sizeof(algorithmFPType); - - const size_t nPHBlockElems = nGroupNodes * nPartialHistograms * partHistSize; - const size_t nPHBlocks = nPHBlockElems / maxPHBlockElems ? (nPHBlockElems / maxPHBlockElems + !!(nPHBlockElems % maxPHBlockElems)) : 1; - - size_t nBlockNodes = nGroupNodes / nPHBlocks + !!(nGroupNodes % nPHBlocks); - - for (size_t blockIndicesOffset = groupIndicesOffset; blockIndicesOffset < groupIndicesOffset + nGroupNodes; - blockIndicesOffset += nBlockNodes) - { - nBlockNodes = services::internal::min(nBlockNodes, groupIndicesOffset + nGroupNodes - blockIndicesOffset); - if (1 == nPartialHistograms) - { - auto nodesHistograms = context.allocate(TypeIds::id(), nBlockNodes * partHistSize, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_CHECK_STATUS_VAR(computePartialHistograms(data, treeOrder, selectedFeatures, nSelectedFeatures, response, nodeList, - nodeIndices, blockIndicesOffset, binOffsets, _nMaxBinsAmongFtrs, nFeatures, - nBlockNodes, nodesHistograms, nPartialHistograms)); - - DAAL_CHECK_STATUS_VAR(computeBestSplitByHistogram(nodesHistograms, selectedFeatures, nSelectedFeatures, nodeList, nodeIndices, - blockIndicesOffset, binOffsets, impList, nodeImpDecreaseList, - 
updateImpDecreaseRequired, nBlockNodes, _nMaxBinsAmongFtrs, - minObservationsInLeafNode, impurityThreshold)); - } - else - { - auto partialHistograms = - context.allocate(TypeIds::id(), nBlockNodes * nPartialHistograms * partHistSize, status); - DAAL_CHECK_STATUS_VAR(status); - auto nodesHistograms = context.allocate(TypeIds::id(), nBlockNodes * partHistSize, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_CHECK_STATUS_VAR(computePartialHistograms(data, treeOrder, selectedFeatures, nSelectedFeatures, response, nodeList, - nodeIndices, blockIndicesOffset, binOffsets, _nMaxBinsAmongFtrs, nFeatures, - nBlockNodes, partialHistograms, nPartialHistograms)); - DAAL_CHECK_STATUS_VAR(reducePartialHistograms(partialHistograms, nodesHistograms, nPartialHistograms, nBlockNodes, - nSelectedFeatures, _nMaxBinsAmongFtrs, _reduceLocalSizePartHist)); - - DAAL_CHECK_STATUS_VAR(computeBestSplitByHistogram(nodesHistograms, selectedFeatures, nSelectedFeatures, nodeList, nodeIndices, - blockIndicesOffset, binOffsets, impList, nodeImpDecreaseList, - updateImpDecreaseRequired, nBlockNodes, _nMaxBinsAmongFtrs, - minObservationsInLeafNode, impurityThreshold)); - } - } - } - else - { - DAAL_CHECK_STATUS_VAR(computeBestSplitSinglePass(data, treeOrder, selectedFeatures, nSelectedFeatures, response, binOffsets, nodeList, - nodeIndices, groupIndicesOffset, impList, nodeImpDecreaseList, updateImpDecreaseRequired, - nFeatures, nGroupNodes, minObservationsInLeafNode, impurityThreshold)); - } - } - - return status; -} - -template -services::Status ClassificationTrainBatchKernelOneAPI::computePartialHistograms( - const UniversalBuffer & data, UniversalBuffer & treeOrder, UniversalBuffer & selectedFeatures, size_t nSelectedFeatures, - const services::internal::Buffer & response, UniversalBuffer & nodeList, UniversalBuffer & nodeIndices, size_t nodeIndicesOffset, - UniversalBuffer & binOffsets, size_t nMaxBinsAmongFtrs, size_t nFeatures, size_t nNodes, UniversalBuffer & partialHistograms, - size_t 
nPartialHistograms) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.computePartialHistograms); - - services::Status status; - - auto & context = services::internal::getDefaultContext(); - - auto & kernel = kernelComputePartialHistograms; - - { - DAAL_ASSERT(nSelectedFeatures <= _int32max); - DAAL_ASSERT(nodeIndicesOffset <= _int32max); - DAAL_ASSERT(nMaxBinsAmongFtrs <= _int32max); - DAAL_ASSERT(nFeatures <= _int32max); - DAAL_ASSERT(response.size() == _nRows); - - DAAL_ASSERT_UNIVERSAL_BUFFER(data, uint32_t, _nRows * _nFeatures); - DAAL_ASSERT_UNIVERSAL_BUFFER(treeOrder, int32_t, _nSelectedRows); - DAAL_ASSERT_UNIVERSAL_BUFFER(selectedFeatures, int32_t, nNodes * nSelectedFeatures); - DAAL_ASSERT_UNIVERSAL_BUFFER(binOffsets, uint32_t, _nFeatures + 1); - DAAL_ASSERT_UNIVERSAL_BUFFER(nodeList, int32_t, nNodes * TreeLevelRecord::_nNodeSplitProps); - DAAL_ASSERT_UNIVERSAL_BUFFER(nodeIndices, int32_t, nNodes); - DAAL_ASSERT_UNIVERSAL_BUFFER(partialHistograms, algorithmFPType, - nNodes * nPartialHistograms * nSelectedFeatures * _nMaxBinsAmongFtrs * _nClasses); - - context.fill(partialHistograms, (algorithmFPType)0, status); - DAAL_CHECK_STATUS_VAR(status); - - KernelArguments args(12, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, data, AccessModeIds::read); - args.set(1, treeOrder, AccessModeIds::read); - args.set(2, nodeList, AccessModeIds::read); - args.set(3, nodeIndices, AccessModeIds::read); - args.set(4, static_cast(nodeIndicesOffset)); - args.set(5, selectedFeatures, AccessModeIds::read); - args.set(6, response, AccessModeIds::read); - args.set(7, binOffsets, AccessModeIds::read); - args.set(8, static_cast(nMaxBinsAmongFtrs)); // max num of bins among all ftrs - args.set(9, static_cast(nFeatures)); - args.set(10, partialHistograms, AccessModeIds::write); - args.set(11, static_cast(nSelectedFeatures)); - - size_t localSize = _preferableLocalSizeForPartHistKernel; - - KernelRange local_range(localSize, 1); - KernelRange global_range(nPartialHistograms * localSize, 
nNodes); - - KernelNDRange range(2); - range.local(local_range, status); - DAAL_CHECK_STATUS_VAR(status); - range.global(global_range, status); - DAAL_CHECK_STATUS_VAR(status); - - context.run(range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -template -services::Status ClassificationTrainBatchKernelOneAPI::reducePartialHistograms( - UniversalBuffer & partialHistograms, UniversalBuffer & histograms, size_t nPartialHistograms, size_t nNodes, size_t nSelectedFeatures, - size_t nMaxBinsAmongFtrs, size_t reduceLocalSize) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.reducePartialHistograms); - - services::Status status; - - auto & context = services::internal::getDefaultContext(); - - auto & kernel = kernelReducePartialHistograms; - - { - DAAL_ASSERT(nPartialHistograms <= _int32max); - DAAL_ASSERT(nSelectedFeatures <= _int32max); - DAAL_ASSERT(nMaxBinsAmongFtrs <= _int32max); - - DAAL_ASSERT_UNIVERSAL_BUFFER(partialHistograms, algorithmFPType, - nNodes * nPartialHistograms * nSelectedFeatures * _nMaxBinsAmongFtrs * _nClasses); - DAAL_ASSERT_UNIVERSAL_BUFFER(histograms, algorithmFPType, nNodes * nSelectedFeatures * _nMaxBinsAmongFtrs * _nClasses); - - KernelArguments args(5, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, partialHistograms, AccessModeIds::read); - args.set(1, histograms, AccessModeIds::write); - args.set(2, static_cast(nPartialHistograms)); - args.set(3, static_cast(nSelectedFeatures)); - args.set(4, static_cast(nMaxBinsAmongFtrs)); // max num of bins among all ftrs - - KernelRange local_range(1, reduceLocalSize, 1); - // overflow for nMaxBinsAmongFtrs * nSelectedFeatures should be checked in compute - KernelRange global_range(nMaxBinsAmongFtrs * nSelectedFeatures, reduceLocalSize, nNodes); - - KernelNDRange range(3); - range.global(global_range, status); - DAAL_CHECK_STATUS_VAR(status); - range.local(local_range, status); - DAAL_CHECK_STATUS_VAR(status); - - context.run(range, kernel, args, status); - 
DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -template -static void shuffle(void * state, size_t n, int * dst) -{ - RNGsInst rng; - int idx[2]; - - for (size_t i = 0; i < n; ++i) - { - rng.uniform(2, idx, state, 0, n); - daal::services::internal::swap(dst[idx[0]], dst[idx[1]]); - } -} - -template -services::Status selectParallelizationTechnique(const Parameter & par, engines::internal::ParallelizationTechnique & technique) -{ - auto engineImpl = dynamic_cast(par.engine.get()); - - engines::internal::ParallelizationTechnique techniques[] = { engines::internal::family, engines::internal::leapfrog, - engines::internal::skipahead }; - - for (auto & t : techniques) - { - if (engineImpl->hasSupport(t)) - { - technique = t; - return services::Status(); - } - } - return services::Status(ErrorEngineNotSupported); -} - -/* following methods are related to results computation (OBB err, varImportance MDA/MDA_Scaled)*/ -/* they will be migrated on GPU when prediction layer forGPU is ready*/ -template -services::Status ClassificationTrainBatchKernelOneAPI::computeResults( - const dtrees::internal::Tree & t, const algorithmFPType * x, const algorithmFPType * y, size_t nRows, size_t nFeatures, - const UniversalBuffer & oobIndices, const UniversalBuffer & oobRowsNumList, UniversalBuffer & oobBuf, algorithmFPType * varImp, - algorithmFPType * varImpVariance, size_t nBuiltTrees, const engines::EnginePtr & engine, size_t nTreesInBlock, size_t treeIndex, - const Parameter & par) -{ - DAAL_ASSERT_UNIVERSAL_BUFFER(oobRowsNumList, int32_t, nTreesInBlock + 1); - - services::Status status; - const bool mdaRequired(par.varImportance == decision_forest::training::MDA_Raw || par.varImportance == decision_forest::training::MDA_Scaled); - - size_t nOOB = 0; - size_t oobIndicesOffset = 0; - - { - auto nOOBRowsHost = oobRowsNumList.template get().toHost(ReadWriteMode::readOnly, status); - DAAL_CHECK_STATUS_VAR(status); - oobIndicesOffset = static_cast(nOOBRowsHost.get()[treeIndex]); 
- nOOB = static_cast(nOOBRowsHost.get()[treeIndex + 1] - nOOBRowsHost.get()[treeIndex]); - } - - if ((par.resultsToCompute & (decision_forest::training::computeOutOfBagError | decision_forest::training::computeOutOfBagErrorPerObservation) - || mdaRequired) - && nOOB) - { - const algorithmFPType oobError = computeOOBError(t, x, y, nRows, nFeatures, oobIndices, oobIndicesOffset, nOOB, oobBuf, status); - DAAL_CHECK_STATUS_VAR(status); - - if (mdaRequired) - { - DAAL_ASSERT(varImp); - TArray permutation(nOOB); - DAAL_CHECK_MALLOC(permutation.get()); - for (size_t i = 0; i < nOOB; ++i) - { - permutation[i] = i; - } - - const algorithmFPType div1 = algorithmFPType(1) / algorithmFPType(nBuiltTrees); - daal::internal::RNGsInst rng; - auto engineImpl = dynamic_cast(engine.get()); - - for (size_t ftr = 0; ftr < nFeatures; ftr++) - { - shuffle(engineImpl->getState(), nOOB, permutation.get()); - const algorithmFPType permOOBError = - computeOOBErrorPerm(t, x, y, nRows, nFeatures, oobIndices, oobIndicesOffset, permutation.get(), ftr, nOOB, status); - DAAL_CHECK_STATUS_VAR(status); - - const algorithmFPType diff = (permOOBError - oobError); - const algorithmFPType delta = diff - varImp[ftr]; - varImp[ftr] += div1 * delta; - if (varImpVariance) - { - varImpVariance[ftr] += delta * (diff - varImp[ftr]); - } - } - } - DAAL_CHECK_STATUS_VAR(status); - } - return status; -} - -template -algorithmFPType ClassificationTrainBatchKernelOneAPI::computeOOBError( - const dtrees::internal::Tree & t, const algorithmFPType * x, const algorithmFPType * y, const size_t nRows, const size_t nFeatures, - const UniversalBuffer & indices, size_t indicesOffset, size_t n, UniversalBuffer oobBuf, services::Status & status) -{ - typedef DFTreeConverter DFTreeConverterType; - - DAAL_ASSERT(x); - DAAL_ASSERT(y); - DAAL_ASSERT_UNIVERSAL_BUFFER(indices, int32_t, indicesOffset + n); - DAAL_ASSERT_UNIVERSAL_BUFFER(oobBuf, uint32_t, nRows * _nClasses); - - auto rowsIndHost = indices.template 
get().toHost(ReadWriteMode::readOnly, status); - auto oobBufHost = oobBuf.template get().toHost(ReadWriteMode::readWrite, status); - DAAL_CHECK_STATUS_RETURN_IF_FAIL(status, algorithmFPType(0)); - - //compute prediction error on each OOB row and get its mean online formulae (Welford) - //TODO: can be threader_for() block - - algorithmFPType mean = algorithmFPType(0); - for (size_t i = 0; i < n; i++) - { - int rowInd = rowsIndHost.get()[indicesOffset + i]; - DAAL_ASSERT(rowInd < nRows); - size_t prediction = DFTreeConverterType::TreeHelperType::predict(t, &x[rowInd * nFeatures]); - oobBufHost.get()[rowInd * _nClasses + prediction]++; - mean += algorithmFPType(prediction != size_t(y[rowInd])); - } - - return mean / n; -} - -template -algorithmFPType ClassificationTrainBatchKernelOneAPI::computeOOBErrorPerm( - const dtrees::internal::Tree & t, const algorithmFPType * x, const algorithmFPType * y, const size_t nRows, const size_t nFeatures, - const UniversalBuffer & indices, size_t indicesOffset, const int * indicesPerm, const size_t testFtrInd, size_t n, services::Status & status) -{ - typedef DFTreeConverter DFTreeConverterType; - - DAAL_ASSERT(x); - DAAL_ASSERT(y); - DAAL_ASSERT(indicesPerm); - DAAL_ASSERT(testFtrInd < nFeatures); - DAAL_ASSERT_UNIVERSAL_BUFFER(indices, int32_t, indicesOffset + n); - - auto rowsIndHost = indices.template get().toHost(ReadWriteMode::readOnly, status); - DAAL_CHECK_STATUS_RETURN_IF_FAIL(status, algorithmFPType(0)); - - TArray buf(nFeatures); - DAAL_CHECK_COND_ERROR(buf.get(), status, services::ErrorMemoryAllocationFailed); - DAAL_CHECK_STATUS_RETURN_IF_FAIL(status, algorithmFPType(0)); - - algorithmFPType mean = algorithmFPType(0); - for (size_t i = 0; i < n; i++) - { - int rowInd = rowsIndHost.get()[indicesOffset + i]; - int rowIndPerm = indicesPerm[i]; - DAAL_ASSERT(rowInd < nRows); - DAAL_ASSERT(rowIndPerm < nRows); - services::internal::tmemcpy(buf.get(), &x[rowInd * nFeatures], nFeatures); - buf[testFtrInd] = x[rowIndPerm * 
nFeatures + testFtrInd]; - size_t prediction = DFTreeConverterType::TreeHelperType::predict(t, buf.get()); - mean += algorithmFPType(prediction != size_t(y[rowInd])); - } - - return mean / n; -} - -template -services::Status ClassificationTrainBatchKernelOneAPI::finalizeOOBError(const algorithmFPType * y, - const UniversalBuffer & oobBuf, const size_t nRows, - algorithmFPType * res, algorithmFPType * resPerObs, - algorithmFPType * resAccuracy, - algorithmFPType * resDecisionFunction) -{ - services::Status status; - - DAAL_ASSERT(y); - DAAL_ASSERT_UNIVERSAL_BUFFER(oobBuf, uint32_t, nRows * _nClasses); - - auto oobBufHost = oobBuf.template get().toHost(ReadWriteMode::readOnly, status); - DAAL_CHECK_STATUS_VAR(status); - - size_t nPredicted = 0; - algorithmFPType _res = 0; - - for (size_t i = 0; i < nRows; i++) - { - size_t prediction = 0; - size_t expectation(y[i]); - size_t maxVal = 0; - algorithmFPType sum = 0; - for (size_t clsIdx = 0; clsIdx < _nClasses; clsIdx++) - { - size_t val = oobBufHost.get()[i * _nClasses + clsIdx]; - if (val > maxVal) - { - maxVal = val; - prediction = clsIdx; - } - sum += static_cast(val); - } - - sum = (sum > algorithmFPType(0)) ? sum : algorithmFPType(1); - if (resDecisionFunction) - { - for (size_t clsIdx = 0; clsIdx < _nClasses; clsIdx++) - { - const size_t val = oobBufHost.get()[i * _nClasses + clsIdx]; - resDecisionFunction[i * _nClasses + clsIdx] = static_cast(val) / sum; - } - } - - if (0 < maxVal) - { - algorithmFPType predictionRes = algorithmFPType(prediction != expectation); - if (resPerObs) resPerObs[i] = predictionRes; - _res += predictionRes; - nPredicted++; - } - else if (resPerObs) - resPerObs[i] = algorithmFPType(-1); //was not in OOB set of any tree and hence not predicted - } - - if (res) *res = (0 < nPredicted) ? _res / algorithmFPType(nPredicted) : 0; - if (resAccuracy) *resAccuracy = (0 < nPredicted) ? 
algorithmFPType(1) - _res / algorithmFPType(nPredicted) : algorithmFPType(1); - - return status; -} - -template -services::Status ClassificationTrainBatchKernelOneAPI::finalizeVarImp(const Parameter & par, algorithmFPType * varImp, - algorithmFPType * varImpVariance, size_t nFeatures) -{ - if (par.varImportance == decision_forest::training::MDA_Scaled) - { - if (par.nTrees > 1) - { - DAAL_ASSERT(varImpVariance); - const algorithmFPType div = algorithmFPType(1) / algorithmFPType(par.nTrees); - for (size_t i = 0; i < nFeatures; i++) - { - varImpVariance[i] *= div; - if (varImpVariance[i] > algorithmFPType(0)) - varImp[i] /= daal::internal::MathInst::sSqrt(varImpVariance[i] * div); - } - } - else - { - DAAL_ASSERT(varImp); - for (size_t i = 0; i < nFeatures; i++) - { - varImp[i] = algorithmFPType(0); - } - } - } - else if (par.varImportance == decision_forest::training::MDI) - { - DAAL_ASSERT(varImp); - const algorithmFPType div = algorithmFPType(1) / algorithmFPType(par.nTrees); - for (size_t i = 0; i < nFeatures; i++) varImp[i] *= div; - } - return services::Status(); -} - -/////////////////////////////////////////////////////////////////////////////////////////// -/* compute method for ClassificationTrainBatchKernelOneAPI */ -/////////////////////////////////////////////////////////////////////////////////////////// -template -services::Status ClassificationTrainBatchKernelOneAPI::compute(HostAppIface * pHostApp, const NumericTable * x, - const NumericTable * y, - decision_forest::classification::Model & m, Result & res, - const Parameter & par) -{ - services::Status status; - - typedef DFTreeConverter DFTreeConverterType; - typedef TreeLevelRecord TreeLevel; - - _nClasses = par.nClasses; - - _nRows = x->getNumberOfRows(); - _nFeatures = x->getNumberOfColumns(); - - DAAL_CHECK_EX((par.nClasses <= _int32max), ErrorIncorrectParameter, ParameterName, nClassesStr()); - DAAL_CHECK_EX((par.minObservationsInLeafNode <= _int32max), ErrorIncorrectParameter, ParameterName, 
minObservationsInLeafNodeStr()); - DAAL_CHECK_EX((par.featuresPerNode <= _int32max), ErrorIncorrectParameter, ParameterName, featuresPerNodeStr()); - DAAL_CHECK_EX((par.maxBins <= _int32max), ErrorIncorrectParameter, ParameterName, maxBinsStr()); - DAAL_CHECK_EX((par.minBinSize <= _int32max), ErrorIncorrectParameter, ParameterName, minBinSizeStr()); - DAAL_CHECK_EX((par.nTrees <= _int32max), ErrorIncorrectParameter, ParameterName, nTreesStr()); - - if (_nRows > _int32max) - { - return services::Status(services::ErrorIncorrectNumberOfRowsInInputNumericTable); - } - if (_nFeatures > _int32max) - { - return services::Status(services::ErrorIncorrectNumberOfColumnsInInputNumericTable); - } - - const size_t nSelectedFeatures = par.featuresPerNode ? par.featuresPerNode : daal::internal::MathInst::sSqrt(_nFeatures); - - _nSelectedRows = par.observationsPerTreeFraction * _nRows; - DAAL_CHECK_EX((_nSelectedRows > 0), ErrorIncorrectParameter, ParameterName, observationsPerTreeFractionStr()); - - _preferableLocalSizeForPartHistKernel = _preferableGroupSize; - - while (_preferableLocalSizeForPartHistKernel - > services::internal::max(nSelectedFeatures, _minPreferableLocalSizeForPartHistKernel)) - { - _preferableLocalSizeForPartHistKernel >>= 1; - } - - const bool mdaRequired(par.varImportance == decision_forest::training::MDA_Raw || par.varImportance == decision_forest::training::MDA_Scaled); - const bool oobRequired = - (par.resultsToCompute & (decision_forest::training::computeOutOfBagError | decision_forest::training::computeOutOfBagErrorPerObservation) - || mdaRequired); - - decision_forest::classification::internal::ModelImpl & mdImpl = - *static_cast(&m); - DAAL_CHECK_MALLOC(mdImpl.resize(par.nTrees)); - - services::String buildOptions = getBuildOptions(_nClasses); - DAAL_CHECK_STATUS_VAR(_treeLevelBuildHelper.init(buildOptions.c_str(), TreeLevel::_nNodeSplitProps)); - - auto & context = Environment::getInstance()->getDefaultExecutionContext(); - auto & kernel_factory = 
context.getClKernelFactory(); - - auto & info = context.getInfoDevice(); - - DAAL_CHECK_STATUS_VAR(buildProgram(kernel_factory, "part1", df_batch_classification_kernels_part1, buildOptions.c_str())); - kernelComputeBestSplitSinglePass = kernel_factory.getKernel("computeBestSplitSinglePass", status); - - DAAL_CHECK_STATUS_VAR(buildProgram(kernel_factory, "part2", df_batch_classification_kernels_part2, buildOptions.c_str())); - kernelComputeBestSplitByHistogram = kernel_factory.getKernel("computeBestSplitByHistogram", status); - kernelComputePartialHistograms = kernel_factory.getKernel("computePartialHistograms", status); - kernelReducePartialHistograms = kernel_factory.getKernel("reducePartialHistograms", status); - DAAL_CHECK_STATUS_VAR(status); - - dtrees::internal::BinParams prm(par.maxBins, par.minBinSize, par.binningStrategy); - decision_forest::internal::IndexedFeaturesOneAPI indexedFeatures; - dtrees::internal::FeatureTypes featTypes; - - // init indexed features. - DAAL_CHECK_MALLOC(featTypes.init(*x)); - DAAL_CHECK_STATUS(status, (indexedFeatures.init(*const_cast(x), &featTypes, &prm))); - - _totalBins = indexedFeatures.totalBins(); - /* calculating the maximal number of bins for feature among all features */ - { - DAAL_ASSERT_UNIVERSAL_BUFFER(indexedFeatures.binOffsets(), uint32_t, _nFeatures + 1); - auto binOffsetsHost = indexedFeatures.binOffsets().template get().toHost(ReadWriteMode::readOnly, status); - DAAL_CHECK_STATUS_VAR(status); - - _nMaxBinsAmongFtrs = 0; - for (size_t i = 0; i < _nFeatures; i++) - { - auto nFtrBins = static_cast(binOffsetsHost.get()[i + 1] - binOffsetsHost.get()[i]); - _nMaxBinsAmongFtrs = (_nMaxBinsAmongFtrs < nFtrBins) ? 
nFtrBins : _nMaxBinsAmongFtrs; - } - } - - // no need to check for _nMaxBinsAmongFtrs < INT32_MAX because it will not be bigger than _nRows and _nRows was already checked - // check mul overflow for _nMaxBinsAmongFtrs * nSelectedFeatures - // and _nMaxBinsAmongFtrs * nSelectedFeatures * _nClasses because they are used further in kernels - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, _nMaxBinsAmongFtrs, nSelectedFeatures); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, _nMaxBinsAmongFtrs * nSelectedFeatures, _nClasses); - - // define num of trees which can be built in parallel - const size_t partHistSize = getPartHistRequiredMemSize(nSelectedFeatures, _nMaxBinsAmongFtrs); // alloc space at least for one part hist - const size_t maxMemAllocSize = services::internal::min(info.maxMemAllocSize, size_t(_maxMemAllocSizeForAlgo)); - - size_t usedMemSize = sizeof(algorithmFPType) * _nRows * (_nFeatures + 1); // input table size + response - usedMemSize += indexedFeatures.getRequiredMemSize(_nFeatures, _nRows); - usedMemSize += oobRequired ? sizeof(algorithmFPType) * _nRows * _nClasses : 0; - usedMemSize += partHistSize; // alloc space at least for one part hist - - size_t availableGlobalMemSize = info.globalMemSize > usedMemSize ? info.globalMemSize - usedMemSize : 0; - - size_t availableMemSizeForTreeBlock = - services::internal::min(maxMemAllocSize, static_cast(availableGlobalMemSize * _globalMemFractionForTreeBlock)); - - size_t requiredMemSizeForOneTree = - oobRequired ? 
_treeLevelBuildHelper.getOOBRowsRequiredMemSize(_nRows, 1 /* for 1 tree */, par.observationsPerTreeFraction) : 0; - requiredMemSizeForOneTree += sizeof(int32_t) * _nSelectedRows * 2; // main tree order and auxilliary one used for partitioning - - size_t treeBlock = availableMemSizeForTreeBlock / requiredMemSizeForOneTree; - - if (treeBlock <= 0) - { - // not enough memory even for one tree - return services::Status(services::ErrorMemoryAllocationFailed); - } - - treeBlock = services::internal::min(par.nTrees, treeBlock); - - availableGlobalMemSize = - availableGlobalMemSize > (treeBlock * requiredMemSizeForOneTree) ? availableGlobalMemSize - (treeBlock * requiredMemSizeForOneTree) : 0; - // size for one part hist was already reserved, add some more if there is available mem - _maxPartHistCumulativeSize = services::internal::min( - maxMemAllocSize, static_cast(partHistSize + availableGlobalMemSize * _globalMemFractionForPartHist)); - - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, _nSelectedRows, treeBlock); - daal::services::internal::TArray selectedRowsHost(_nSelectedRows * treeBlock); - DAAL_CHECK_MALLOC(selectedRowsHost.get()); - - auto treeOrderLev = context.allocate(TypeIds::id(), _nSelectedRows * treeBlock, status); - DAAL_CHECK_STATUS_VAR(status); - auto treeOrderLevBuf = context.allocate(TypeIds::id(), _nSelectedRows * treeBlock, status); - DAAL_CHECK_STATUS_VAR(status); - - BlockDescriptor dataBlock; - DAAL_CHECK_STATUS_VAR(const_cast(x)->getBlockOfRows(0, _nRows, readOnly, dataBlock)); - - /* blocks for varImp MDI calculation */ - bool mdiRequired = (par.varImportance == decision_forest::training::MDI); - auto nodeImpDecreaseList = context.allocate(TypeIds::id(), 1, status); // holder will be reallocated in loop - DAAL_CHECK_STATUS_VAR(status); - BlockDescriptor varImpBlock; - NumericTablePtr varImpResPtr = res.get(variableImportance); - - if (mdiRequired || mdaRequired) - { - DAAL_CHECK_STATUS_VAR(varImpResPtr->getBlockOfRows(0, 1, writeOnly, 
varImpBlock)); - context.fill(varImpBlock.getBuffer(), (algorithmFPType)0, status); - DAAL_CHECK_STATUS_VAR(status); - } - - /* blocks for OutOfBag error calculation */ - UniversalBuffer oobBufferPerObs; - if (oobRequired) - { - // oobBufferPerObs contains nClassed counters for all out of bag observations for all trees - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, _nRows, _nClasses); - oobBufferPerObs = context.allocate(TypeIds::id(), _nRows * _nClasses, status); - DAAL_CHECK_STATUS_VAR(status); - context.fill(oobBufferPerObs, 0, status); - DAAL_CHECK_STATUS_VAR(status); - } - - /* blocks for MDA scaled error calculation */ - bool mdaScaledRequired = (par.varImportance == decision_forest::training::MDA_Scaled); - daal::services::internal::TArrayCalloc varImpVariance; // for now it is calculated on host - if (mdaScaledRequired) - { - varImpVariance.reset(_nFeatures); - } - - /*init engines*/ - engines::internal::ParallelizationTechnique technique = engines::internal::family; - selectParallelizationTechnique(par, technique); - engines::internal::Params params(par.nTrees); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, par.nTrees - 1, par.nTrees); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, (par.nTrees - 1) * par.nTrees, _nRows); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, (par.nTrees - 1) * par.nTrees * _nRows, (par.featuresPerNode + 1)); - for (size_t i = 0; i < par.nTrees; i++) - { - params.nSkip[i] = i * par.nTrees * _nRows * (par.featuresPerNode + 1); - } - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, par.nTrees, sizeof(engines::EnginePtr)); - daal::services::internal::TArray engines(par.nTrees); - engines::internal::EnginesCollection enginesCollection(par.engine, technique, params, engines, &status); - DAAL_CHECK_STATUS_VAR(status); - daal::services::internal::TArray enginesBaseImpl(par.nTrees); - for (size_t treeIndex = 0; treeIndex < par.nTrees; treeIndex++) - { - enginesBaseImpl[treeIndex] = dynamic_cast(engines[treeIndex].get()); - if 
(!enginesBaseImpl[treeIndex]) return Status(ErrorEngineNotSupported); - } - - for (size_t iter = 0; (iter < par.nTrees) && !algorithms::internal::isCancelled(status, pHostApp); iter += treeBlock) - { - size_t nTrees = services::internal::min(par.nTrees - iter, treeBlock); - - BlockDescriptor responseBlock; - DAAL_CHECK_STATUS_VAR(const_cast(y)->getBlockOfRows(0, _nRows, readOnly, responseBlock)); - - size_t nNodes = nTrees; // num of potential nodes to split on current tree level - auto oobRowsNumList = context.allocate(TypeIds::id(), nTrees + 1, status); - DAAL_CHECK_STATUS_VAR(status); - - Collection DFTreeRecords; - Collection levelNodeLists; // lists of nodes int props(rowsOffset, rows, ftrId, ftrVal ... ) - Collection levelNodeImpLists; // list of nodes fptype props (impurity, mean) - UniversalBuffer oobRows; - - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, nNodes, TreeLevel::_nNodeSplitProps); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, nNodes, TreeLevel::_nNodeImpProps); - auto nodeVsTreeMap = context.allocate(TypeIds::id(), nNodes, status); - DAAL_CHECK_STATUS_VAR(status); - levelNodeLists.push_back(context.allocate(TypeIds::id(), nNodes * TreeLevel::_nNodeSplitProps, status)); - DAAL_CHECK_STATUS_VAR(status); - levelNodeImpLists.push_back(context.allocate(TypeIds::id(), nNodes * (TreeLevel::_nNodeImpProps + _nClasses), status)); - DAAL_CHECK_STATUS_VAR(status); - - { - auto treeMap = nodeVsTreeMap.template get().toHost(ReadWriteMode::writeOnly, status); - DAAL_CHECK_STATUS_VAR(status); - - auto rootNode = levelNodeLists[0].template get().toHost(ReadWriteMode::writeOnly, status); - DAAL_CHECK_STATUS_VAR(status); - for (size_t node = 0; node < nNodes; node++) - { - treeMap.get()[node] = static_cast(iter + node); // check for par.nTrees less than int32 was done at the beggining - rootNode.get()[node * TreeLevel::_nNodeSplitProps + 0] = _nSelectedRows * node; // rows offset - rootNode.get()[node * TreeLevel::_nNodeSplitProps + 1] = _nSelectedRows; // num 
of rows - } - } - - if (par.bootstrap) - { - DAAL_ITTNOTIFY_SCOPED_TASK(compute.RNG); - - for (size_t node = 0; node < nNodes; node++) - { - daal::internal::RNGsInst rng; - rng.uniform(_nSelectedRows, selectedRowsHost.get() + _nSelectedRows * node, enginesBaseImpl[iter + node]->getState(), 0, _nRows); - } - - context.copy(treeOrderLev, 0, (void *)selectedRowsHost.get(), _nSelectedRows * nNodes, 0, _nSelectedRows * nNodes, status); - DAAL_CHECK_STATUS_VAR(status); - } - else - { - DAAL_CHECK_STATUS_VAR(_treeLevelBuildHelper.initializeTreeOrder(_nSelectedRows, nTrees, treeOrderLev)); - } - - if (oobRequired) - { - _treeLevelBuildHelper.getOOBRows(treeOrderLev, _nSelectedRows, nTrees, oobRowsNumList, - oobRows); // oobRowsNumList and oobRows are the output - } - - for (size_t level = 0; nNodes > 0; level++) - { - auto nodeList = levelNodeLists[level]; - auto impList = levelNodeImpLists[level]; - - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, (nNodes + 1), nSelectedFeatures); - daal::services::internal::TArray selectedFeaturesHost( - (nNodes + 1) * nSelectedFeatures); // first part is used features indices, +1 - part for generator - DAAL_CHECK_MALLOC(selectedFeaturesHost.get()); - - auto selectedFeaturesCom = context.allocate(TypeIds::id(), nNodes * nSelectedFeatures, status); - DAAL_CHECK_STATUS_VAR(status); - - if (nSelectedFeatures != _nFeatures) - { - daal::internal::RNGsInst rng; - auto treeMap = nodeVsTreeMap.template get().toHost(ReadWriteMode::readOnly, status); - DAAL_CHECK_STATUS_VAR(status); - - for (size_t node = 0; node < nNodes; node++) - { - rng.uniformWithoutReplacement(nSelectedFeatures, selectedFeaturesHost.get() + node * nSelectedFeatures, - selectedFeaturesHost.get() + (node + 1) * nSelectedFeatures, - enginesBaseImpl[treeMap.get()[node]]->getState(), 0, _nFeatures); - } - } - else - { - for (size_t node = 0; node < nNodes; node++) - { - for (size_t i = 0; i < nSelectedFeatures; i++) - { - selectedFeaturesHost.get()[node * nSelectedFeatures + i] = 
i; - } - } - } - - context.copy(selectedFeaturesCom, 0, (void *)selectedFeaturesHost.get(), nSelectedFeatures * nNodes, 0, nSelectedFeatures * nNodes, - status); - DAAL_CHECK_STATUS_VAR(status); - - if (mdiRequired) - { - nodeImpDecreaseList = context.allocate(TypeIds::id(), nNodes, status); - DAAL_CHECK_STATUS_VAR(status); - } - - DAAL_CHECK_STATUS_VAR(computeBestSplit(indexedFeatures.getFullData(), treeOrderLev, selectedFeaturesCom, nSelectedFeatures, - responseBlock.getBuffer(), nodeList, indexedFeatures.binOffsets(), impList, nodeImpDecreaseList, - mdiRequired, _nFeatures, nNodes, par.minObservationsInLeafNode, par.impurityThreshold)); - - if (par.maxTreeDepth > 0 && par.maxTreeDepth == level) - { - DAAL_CHECK_STATUS_VAR(_treeLevelBuildHelper.convertSplitToLeaf(nodeList, nNodes)); - TreeLevel levelRecord; - DAAL_CHECK_STATUS_VAR(levelRecord.init(nodeList, impList, nNodes, _nClasses)); - DFTreeRecords.push_back(levelRecord); - break; - } - - TreeLevel levelRecord; - DAAL_CHECK_STATUS_VAR(levelRecord.init(nodeList, impList, nNodes, _nClasses)); - DFTreeRecords.push_back(levelRecord); - - if (mdiRequired) - { - /*mdi is calculated only on split nodes and not calculated on last level*/ - auto varImpBuffer = varImpBlock.getBuffer(); - DAAL_CHECK_STATUS_VAR(_treeLevelBuildHelper.updateMDIVarImportance(nodeList, nodeImpDecreaseList, nNodes, varImpBuffer, _nFeatures)); - } - - size_t nNodesNewLevel; - DAAL_CHECK_STATUS_VAR(_treeLevelBuildHelper.getNumOfSplitNodes(nodeList, nNodes, nNodesNewLevel)); - - if (nNodesNewLevel) - { - /*there are split nodes -> next level is required*/ - nNodesNewLevel *= 2; - - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, nNodesNewLevel, TreeLevel::_nNodeSplitProps); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, nNodesNewLevel, (TreeLevel::_nNodeImpProps + _nClasses)); - auto nodeListNewLevel = context.allocate(TypeIds::id(), nNodesNewLevel * TreeLevel::_nNodeSplitProps, status); - DAAL_CHECK_STATUS_VAR(status); - auto nodeVsTreeMapNew = 
context.allocate(TypeIds::id(), nNodesNewLevel, status); - DAAL_CHECK_STATUS_VAR(status); - auto impListNewLevel = - context.allocate(TypeIds::id(), nNodesNewLevel * (TreeLevel::_nNodeImpProps + _nClasses), status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_CHECK_STATUS_VAR( - _treeLevelBuildHelper.doNodesSplit(nodeList, nNodes, nodeListNewLevel, nNodesNewLevel, nodeVsTreeMap, nodeVsTreeMapNew)); - - levelNodeLists.push_back(nodeListNewLevel); - levelNodeImpLists.push_back(impListNewLevel); - - nodeVsTreeMap = nodeVsTreeMapNew; - - DAAL_CHECK_STATUS_VAR(_treeLevelBuildHelper.doLevelPartition(indexedFeatures.getFullData(), nodeList, nNodes, treeOrderLev, - treeOrderLevBuf, _nSelectedRows, _nFeatures)); - } - - nNodes = nNodesNewLevel; - } // for level - - DFTreeConverterType converter; - typename DFTreeConverterType::TreeHelperType mTreeHelper(nTrees); - - services::Collection > binValuesHost(_nFeatures); - DAAL_CHECK_MALLOC(binValuesHost.data()); - services::Collection binValues(_nFeatures); - DAAL_CHECK_MALLOC(binValues.data()); - - for (size_t i = 0; i < _nFeatures; i++) - { - DAAL_ASSERT_UNIVERSAL_BUFFER(indexedFeatures.binBorders(i), algorithmFPType, indexedFeatures.numIndices(i)); - binValuesHost[i] = indexedFeatures.binBorders(i).template get().toHost(ReadWriteMode::readOnly, status); - DAAL_CHECK_STATUS_VAR(status); - binValues[i] = binValuesHost[i].get(); - } - - DAAL_CHECK_STATUS_VAR(converter.convertToDFDecisionTree(DFTreeRecords, binValues.data(), mTreeHelper, _nClasses)); - - for (size_t tree = 0; tree < nTrees; tree++) - { - mdImpl.add(mTreeHelper._tree_list[tree], _nClasses, iter + tree); - - DAAL_CHECK_STATUS_VAR(computeResults(mTreeHelper._tree_list[tree], dataBlock.getBlockPtr(), responseBlock.getBlockPtr(), _nSelectedRows, - _nFeatures, oobRows, oobRowsNumList, oobBufferPerObs, varImpBlock.getBlockPtr(), - varImpVariance.get(), iter + tree + 1, engines[iter + tree], nTrees, tree, par)); - } - - 
DAAL_CHECK_STATUS_VAR(const_cast(y)->releaseBlockOfRows(responseBlock)); - } - - /* Finalize results */ - if (par.resultsToCompute - & (decision_forest::training::computeOutOfBagError | decision_forest::training::computeOutOfBagErrorPerObservation - | decision_forest::training::computeOutOfBagErrorAccuracy | decision_forest::training::computeOutOfBagErrorDecisionFunction)) - { - BlockDescriptor responseBlock; - DAAL_CHECK_STATUS_VAR(const_cast(y)->getBlockOfRows(0, _nRows, readOnly, responseBlock)); - - NumericTablePtr oobErrPtr = res.get(outOfBagError); - BlockDescriptor oobErrBlock; - if (par.resultsToCompute & decision_forest::training::computeOutOfBagError) - DAAL_CHECK_STATUS_VAR(oobErrPtr->getBlockOfRows(0, 1, writeOnly, oobErrBlock)); - - NumericTablePtr oobErrAccuracyPtr = res.get(outOfBagErrorAccuracy); - BlockDescriptor oobErrAccuracyBlock; - if (par.resultsToCompute & decision_forest::training::computeOutOfBagErrorAccuracy) - DAAL_CHECK_STATUS_VAR(oobErrAccuracyPtr->getBlockOfRows(0, 1, writeOnly, oobErrAccuracyBlock)); - - NumericTablePtr oobErrPerObsPtr = res.get(outOfBagErrorPerObservation); - BlockDescriptor oobErrPerObsBlock; - if (par.resultsToCompute & decision_forest::training::computeOutOfBagErrorPerObservation) - DAAL_CHECK_STATUS_VAR(oobErrPerObsPtr->getBlockOfRows(0, _nRows, writeOnly, oobErrPerObsBlock)); - - NumericTablePtr oobErrDecisionFunctionPtr = res.get(outOfBagErrorDecisionFunction); - BlockDescriptor oobErrDecisionFunctionBlock; - if (par.resultsToCompute & decision_forest::training::computeOutOfBagErrorDecisionFunction) - DAAL_CHECK_STATUS_VAR(oobErrDecisionFunctionPtr->getBlockOfRows(0, _nRows, writeOnly, oobErrDecisionFunctionBlock)); - - DAAL_CHECK_STATUS_VAR(finalizeOOBError(responseBlock.getBlockPtr(), oobBufferPerObs, _nRows, oobErrBlock.getBlockPtr(), - oobErrPerObsBlock.getBlockPtr(), oobErrAccuracyBlock.getBlockPtr(), - oobErrDecisionFunctionBlock.getBlockPtr())); - - if (oobErrPtr) 
DAAL_CHECK_STATUS_VAR(oobErrPtr->releaseBlockOfRows(oobErrBlock)); - - if (oobErrPerObsPtr) DAAL_CHECK_STATUS_VAR(oobErrPerObsPtr->releaseBlockOfRows(oobErrPerObsBlock)); - - DAAL_CHECK_STATUS_VAR(const_cast(y)->releaseBlockOfRows(responseBlock)); - } - - if (par.varImportance != decision_forest::training::none && par.varImportance != decision_forest::training::MDA_Raw) - { - DAAL_CHECK_STATUS_VAR(finalizeVarImp(par, varImpBlock.getBlockPtr(), varImpVariance.get(), _nFeatures)); - } - - if (mdiRequired || mdaRequired) DAAL_CHECK_STATUS_VAR(varImpResPtr->releaseBlockOfRows(varImpBlock)); - - DAAL_CHECK_STATUS_VAR(const_cast(x)->releaseBlockOfRows(dataBlock)); - - return status; -} - -} /* namespace internal */ -} /* namespace training */ -} /* namespace classification */ -} /* namespace decision_forest */ -} /* namespace algorithms */ -} /* namespace daal */ - -#endif diff --git a/cpp/daal/src/algorithms/dtrees/forest/classification/oneapi/df_classification_tree_helper_impl.i b/cpp/daal/src/algorithms/dtrees/forest/classification/oneapi/df_classification_tree_helper_impl.i deleted file mode 100644 index e7bd413cbe9..00000000000 --- a/cpp/daal/src/algorithms/dtrees/forest/classification/oneapi/df_classification_tree_helper_impl.i +++ /dev/null @@ -1,212 +0,0 @@ -/* file: df_classification_tree_helper_impl.i */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of the class defining the decision forest classification tree -//-- -*/ - -#ifndef __DF_CLASSIFICATION_TREE_HELPER_IMPL__ -#define __DF_CLASSIFICATION_TREE_HELPER_IMPL__ - -#include "data_management/data/aos_numeric_table.h" -#include "src/services/service_arrays.h" -#include "src/algorithms/dtrees/dtrees_predict_dense_default_impl.i" - -namespace daal -{ -namespace algorithms -{ -namespace decision_forest -{ -namespace classification -{ -namespace internal -{ -using namespace daal::algorithms::dtrees::internal; -using namespace daal::services::internal; - -template -class ClassificationTreeHelperOneAPI -{ -public: - typedef dtrees::internal::TreeImpClassification<> TreeType; - typedef typename TreeType::NodeType NodeType; - - ClassificationTreeHelperOneAPI() = delete; - explicit ClassificationTreeHelperOneAPI(size_t nTrees) : _allocator(_cNumNodesHint) { _tree_list.reset(nTrees); } - ~ClassificationTreeHelperOneAPI() {} - - typename NodeType::Leaf * makeLeaf(size_t n, size_t response, algorithmFPType impurity, algorithmFPType * hist, size_t nClasses) - { - typename NodeType::Leaf * pNode = _allocator.allocLeaf(nClasses); - DAAL_ASSERT(n > 0); - pNode->response.value = response; - pNode->count = n; - pNode->impurity = impurity; - - for (size_t i = 0; i < nClasses; i++) - { - pNode->hist[i] = hist[i]; - } - - return pNode; - } - - typename NodeType::Split * makeSplit(size_t n, size_t iFeature, algorithmFPType featureValue, bool bUnordered, algorithmFPType impurity, - typename NodeType::Base * left, typename NodeType::Base * right) - { - typename NodeType::Split * pNode = _allocator.allocSplit(); - pNode->set(iFeature, featureValue, bUnordered); - pNode->kid[0] = left; - pNode->kid[1] = right; - pNode->impurity = impurity; - pNode->count = n; - - return 
pNode; - } - - static algorithmFPType predict(const dtrees::internal::Tree & t, const algorithmFPType * x) - { - const typename NodeType::Base * pNode = dtrees::prediction::internal::findNode(t, x); - DAAL_ASSERT(pNode); - return NodeType::castLeaf(pNode)->response.value; - } - - static const size_t _cNumNodesHint = 512; //number of nodes as a hint for allocator to grow by - TreeType::Allocator _allocator; - TArray _tree_list; -}; - -template -struct TreeLevelRecord -{ - TreeLevelRecord() : _isInitialized(false), _nNodes(0), _nClasses(0) {} - services::Status init(services::internal::sycl::UniversalBuffer & nodeList, services::internal::sycl::UniversalBuffer & impInfo, size_t nNodes, - size_t nClasses) - { - services::Status status; - - _nNodes = nNodes; - _nClasses = nClasses; - - DAAL_ASSERT_UNIVERSAL_BUFFER(nodeList, int32_t, nNodes * _nNodeSplitProps); - DAAL_ASSERT_UNIVERSAL_BUFFER(impInfo, algorithmFPType, nNodes * (_nNodeImpProps + _nClasses)); - - auto nodeListHost = nodeList.template get().toHost(ReadWriteMode::readOnly, status); - auto impInfoHost = impInfo.template get().toHost(ReadWriteMode::readOnly, status); - DAAL_CHECK_STATUS_VAR(status); - - _nodeList = nodeListHost; - _impInfo = impInfoHost; - - _isInitialized = true; - - return status; - } - - bool isInitialized() const { return _isInitialized; } - size_t getNodesNum() { return _nNodes; } - int getRowsNum(size_t nodeIdx) { return _nodeList.get()[nodeIdx * _nNodeSplitProps + 1]; } - int getFtrIdx(size_t nodeIdx) { return _nodeList.get()[nodeIdx * _nNodeSplitProps + 2]; } - int getFtrVal(size_t nodeIdx) { return _nodeList.get()[nodeIdx * _nNodeSplitProps + 3]; } - algorithmFPType getImpurity(size_t nodeIdx) { return _impInfo.get()[nodeIdx * (_nNodeImpProps + _nClasses) + 0]; } - size_t getResponse(size_t nodeIdx) { return static_cast(_nodeList.get()[nodeIdx * _nNodeSplitProps + 5]); } - algorithmFPType * getHist(size_t nodeIdx) { return &_impInfo.get()[nodeIdx * (_nNodeImpProps + _nClasses) + 
_nNodeImpProps]; } - bool hasUnorderedFtr(size_t nodeIdx) { return false; /* unordered features are not supported yet */ } - - constexpr static int _nNodeImpProps = 1; - constexpr static int _nNodeSplitProps = 6; - - SharedPtr _nodeList; - SharedPtr _impInfo; - size_t _nNodes; - size_t _nClasses; - - bool _isInitialized; -}; - -template -struct DFTreeConverter -{ - typedef ClassificationTreeHelperOneAPI TreeHelperType; - - services::Status convertToDFDecisionTree(Collection > & treeLevelsList, algorithmFPType ** binValues, - TreeHelperType & treeBuilder, size_t nClasses) - { - services::Status status; - typedef TArray DFTreeNodesArr; - typedef SharedPtr DFTreeNodesArrPtr; - - DFTreeNodesArrPtr dfTreeLevelNodesPrev; - bool unorderedFeaturesUsed = false; - const int notFoundVal = -1; - - size_t level = treeLevelsList.size(); - DAAL_ASSERT(level); - - do - { - level--; - TreeLevelRecord & record = treeLevelsList[level]; - DAAL_ASSERT(record.isInitialized()); - - DFTreeNodesArrPtr dfTreeLevelNodes(new DFTreeNodesArr(record.getNodesNum())); - DAAL_CHECK_MALLOC(dfTreeLevelNodes.get()); - DAAL_CHECK_MALLOC(dfTreeLevelNodes->get()); - - size_t nSplits = 0; - // nSplits is used to calculate index of child nodes on next level - for (size_t nodeIdx = 0; nodeIdx < record.getNodesNum(); nodeIdx++) - { - if (record.getFtrIdx(nodeIdx) == notFoundVal) - { - // leaf node - dfTreeLevelNodes->get()[nodeIdx] = treeBuilder.makeLeaf(record.getRowsNum(nodeIdx), record.getResponse(nodeIdx), - record.getImpurity(nodeIdx), record.getHist(nodeIdx), nClasses); - } - else - { - DAAL_ASSERT(dfTreeLevelNodesPrev->get()); - //split node - dfTreeLevelNodes->get()[nodeIdx] = treeBuilder.makeSplit( - record.getRowsNum(nodeIdx), record.getFtrIdx(nodeIdx), binValues[record.getFtrIdx(nodeIdx)][record.getFtrVal(nodeIdx)], - static_cast(record.hasUnorderedFtr(nodeIdx)), record.getImpurity(nodeIdx), dfTreeLevelNodesPrev->get()[nSplits * 2], - dfTreeLevelNodesPrev->get()[nSplits * 2 + 1]); - nSplits++; - } 
- } - - dfTreeLevelNodesPrev = dfTreeLevelNodes; - } while (level > 0); - - for (size_t tree = 0; tree < treeBuilder._tree_list.size(); tree++) - { - treeBuilder._tree_list[tree].reset(dfTreeLevelNodesPrev->get()[tree], unorderedFeaturesUsed); - } - return status; - } -}; - -} // namespace internal -} // namespace classification -} // namespace decision_forest -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/dtrees/forest/oneapi/cl_kernels/df_common_kernels.cl b/cpp/daal/src/algorithms/dtrees/forest/oneapi/cl_kernels/df_common_kernels.cl deleted file mode 100644 index 51915ebf785..00000000000 --- a/cpp/daal/src/algorithms/dtrees/forest/oneapi/cl_kernels/df_common_kernels.cl +++ /dev/null @@ -1,81 +0,0 @@ -/* file: df_kernels.cl */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of decision forest OpenCL kernels. 
-//-- -*/ - -#ifndef __DF_KERNELS_CL__ -#define __DF_KERNELS_CL__ - -#include - -#define DECLARE_SOURCE(name, src) static const char * name = #src; - -DECLARE_SOURCE( - df_common_kernels, - - __kernel void extractColumn(const __global algorithmFPType * data, __global algorithmFPType * values, __global int * indices, int featureId, - int nFeatures, int nRows) { - const int id = get_global_id(0); - values[id] = data[id * nFeatures + featureId]; - indices[id] = id; - } - - __kernel void collectBinBorders(const __global algorithmFPType * values, const __global int * binOffsets, __global algorithmFPType * binBorders) { - const int id = get_global_id(0); - binBorders[id] = values[binOffsets[id]]; - } - - __kernel void computeBins(const __global algorithmFPType * values, const __global int * indices, const __global algorithmFPType * binBorders, - __global int * bins, int nRows, int nBins) { - const int n_groups = get_num_groups(0); - const int n_sub_groups = get_num_sub_groups(); - const int n_total_sub_groups = n_sub_groups * n_groups; - const int nElementsForSubgroup = nRows / n_total_sub_groups + !!(nRows % n_total_sub_groups); - const int local_size = get_sub_group_size(); - - const int id = get_local_id(0); - const int local_id = get_sub_group_local_id(); - const int sub_group_id = get_sub_group_id(); - const int group_id = get_group_id(0) * n_sub_groups + sub_group_id; - - int iStart = group_id * nElementsForSubgroup; - int iEnd = (group_id + 1) * nElementsForSubgroup; - - iEnd = (iEnd > nRows) ? 
nRows : iEnd; - - int curBin = 0; - - for (int i = iStart + local_id; i < iEnd; i += local_size) - { - algorithmFPType value = values[i]; - while (binBorders[curBin] < value) curBin++; - bins[indices[i]] = curBin; - } - } - - __kernel void storeColumn(const __global int * data, __global int * fullData, int featureId, int nFeatures, int nRows) { - const int id = get_global_id(0); - fullData[id * nFeatures + featureId] = data[id]; - } - -); - -#endif diff --git a/cpp/daal/src/algorithms/dtrees/forest/oneapi/cl_kernels/df_tree_level_build_helper_kernels.cl b/cpp/daal/src/algorithms/dtrees/forest/oneapi/cl_kernels/df_tree_level_build_helper_kernels.cl deleted file mode 100644 index c6d8840f040..00000000000 --- a/cpp/daal/src/algorithms/dtrees/forest/oneapi/cl_kernels/df_tree_level_build_helper_kernels.cl +++ /dev/null @@ -1,441 +0,0 @@ -/* file: df_tree_level_build_helper_kernels.cl */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of tree level build helper OpenCL kernels. 
-//-- -*/ - -#ifndef __DF_TREE_LEVEL_BUILD_HELPER_KERNELS_CL__ -#define __DF_TREE_LEVEL_BUILD_HELPER_KERNELS_CL__ - -#include - -#define DECLARE_SOURCE(name, src) static const char * name = #src; - -DECLARE_SOURCE( - df_tree_level_build_helper_kernels, - - __kernel void initializeTreeOrder(__global int * treeOrder) { - const int id = get_global_id(0); - const int nRows = get_global_size(0); - const int tree = get_global_id(1); - treeOrder[nRows * tree + id] = id; - } - - __kernel void partitionCopy(const __global int * treeOrderBuf, __global int * treeOrder, int offset) { - const int id = get_global_id(0); - treeOrder[offset + id] = treeOrderBuf[offset + id]; - } - - __kernel void doLevelPartitionByGroups(const __global int * data, const __global int * nodeList, __global int * nodeAuxList, - const __global int * treeOrder, __global int * treeOrderBuf, int nNodes, int nFeatures) { - const int nNodeProp = NODE_PROPS; // num of split attributes for node - const int nAuxNodeProp = AUX_NODE_PROPS; // num of auxilliary attributes for node - const int maxBlocksNum = PARTITION_MAX_BLOCKS_NUM; - const int minBlockSize = PARTITION_MIN_BLOCK_SIZE; - const int leafMark = -1; - - const int sub_group_size = get_sub_group_size(); - const int work_group_size = get_local_size(0); - const int sub_groups_in_work_group_num = work_group_size / sub_group_size; // num of subgroups for current node processing - - const int sub_group_local_id = get_sub_group_local_id(); - const int work_group_local_id = get_local_id(0); - - const int sub_group_id = get_group_id(0) * sub_groups_in_work_group_num + work_group_local_id / sub_group_size; - const int sub_groups_num = get_num_groups(0) * sub_groups_in_work_group_num; // num of subgroups for current node processing - - const int totalBlocksNum = nNodes * maxBlocksNum; - - for (int blockIndGlob = sub_group_id; blockIndGlob < totalBlocksNum; blockIndGlob += sub_groups_num) - { - const int nodeId = blockIndGlob / maxBlocksNum; - const int blockInd = 
blockIndGlob % maxBlocksNum; - - __global const int * node = nodeList + nodeId * nNodeProp; - const int offset = node[0]; - const int nRows = node[1]; - const int featId = node[2]; - const int splitVal = node[3]; - const int nRowsLeft = node[4]; // num of items in Left part of node - - int nodeBlocks = nRows / minBlockSize ? min(nRows / minBlockSize, maxBlocksNum) : 1; - - // if node has blocks less than maxBlocksNum then subgroup will just go to the next node - if (featId != leafMark && blockInd < nodeBlocks) // split node - { - __global const int * nodeAux = nodeAuxList + nodeId * nAuxNodeProp; - - const int blockSize = nodeBlocks > 1 ? nRows / nodeBlocks + !!(nRows % nodeBlocks) : nRows; - - const int iEnd = min((blockInd + 1) * blockSize, nRows); - const int iStart = min(blockInd * blockSize, iEnd); - const int rowsForGroup = iEnd - iStart; - - int groupLeftBoundary = 0; - int groupRightBoundary = 0; - - if (nodeBlocks > 1 && rowsForGroup > 0) - { - int groupRowsToRight = 0; - for (int i = iStart + sub_group_local_id; i < iEnd; i += sub_group_size) - { - int id = treeOrder[offset + i]; - int toRight = (int)(data[id * nFeatures + featId] > splitVal); - groupRowsToRight += sub_group_reduce_add(toRight); - } - - if (0 == sub_group_local_id) - { - groupLeftBoundary = atomic_add(nodeAux + 0, rowsForGroup - groupRowsToRight); - groupRightBoundary = atomic_add(nodeAux + 1, groupRowsToRight); - } - groupLeftBoundary = sub_group_broadcast(groupLeftBoundary, 0); - groupRightBoundary = sub_group_broadcast(groupRightBoundary, 0); - } - - int groupRowsToRight = 0; - for (int i = iStart + sub_group_local_id; i < iEnd; i += sub_group_size) - { - const int id = treeOrder[offset + i]; - const int toRight = (int)(data[id * nFeatures + featId] > splitVal); - const int boundary = groupRowsToRight + sub_group_scan_exclusive_add(toRight); - const int posNew = (toRight ? 
nRowsLeft + groupRightBoundary + boundary : groupLeftBoundary + i - iStart - boundary); - treeOrderBuf[offset + posNew] = id; - groupRowsToRight += sub_group_reduce_add(toRight); - } - } - } - } - - __kernel void getNumOfSplitNodes(const __global int * nodeList, int nNodes, __global int * nSplitNodes) { - const int local_id = get_sub_group_local_id(); - const int local_size = get_sub_group_size(); - const int nNodeProp = NODE_PROPS; // num of node properties in node - const int badVal = -1; - - int sum = 0; - for (int i = local_id; i < nNodes; i += local_size) - { - sum += (int)(nodeList[i * nNodeProp + 2] != badVal); - } - - sum = sub_group_reduce_add(sum); - - if (local_id == 0) - { - nSplitNodes[0] = sum; - } - } - - __kernel void convertSplitToLeaf(__global int * nodeList) { - const int nNodeProp = NODE_PROPS; // num of split attributes for node - const int leafMark = -1; - const int id = get_global_id(0); - - nodeList[id * nNodeProp + 2] = leafMark; - nodeList[id * nNodeProp + 3] = leafMark; - } - - __kernel void doNodesSplit(const __global int * nodeList, int nNodes, __global int * nodeListNew, const __global int * nodeVsTreeMap, - __global int * nodeVsTreeMapNew) { - const int nNodeProp = NODE_PROPS; // num of split attributes for node - const int badVal = -1; - const int local_id = get_sub_group_local_id(); - const int local_size = get_sub_group_size(); - - int nCreatedNodes = 0; - for (int i = local_id; i < nNodes; i += local_size) - { - int splitNode = (int)(nodeList[i * nNodeProp + 2] != badVal); // featId != -1 - int newLeftNodePos = nCreatedNodes + sub_group_scan_exclusive_add(splitNode) * 2; - if (splitNode) - { - // split parent node on left and right nodes - __global const int * nodeP = nodeList + i * nNodeProp; - __global int * nodeL = nodeListNew + newLeftNodePos * nNodeProp; - __global int * nodeR = nodeListNew + (newLeftNodePos + 1) * nNodeProp; - - nodeL[0] = nodeP[0]; // rows offset - nodeL[1] = nodeP[4]; // nRows - nodeL[2] = badVal; // 
featureId - nodeL[3] = badVal; // featureVal - nodeL[4] = nodeP[4]; // num of items in Left part = nRows in new node - - nodeR[0] = nodeL[0] + nodeL[1]; - nodeR[1] = nodeP[1] - nodeL[1]; - nodeR[2] = badVal; - nodeR[3] = badVal; - nodeR[4] = nodeR[1]; // num of items in Left part = nRows in new node - - nodeVsTreeMapNew[newLeftNodePos] = nodeVsTreeMap[i]; - nodeVsTreeMapNew[newLeftNodePos + 1] = nodeVsTreeMap[i]; - } - nCreatedNodes += sub_group_reduce_add(splitNode) * 2; - } - } - - __kernel void splitNodeListOnGroupsBySize(const __global int * nodeList, int nNodes, __global int * bigNodesGroups, __global int * nodeIndices, - int minRowsBlock) { - /*for now only 3 groups are produced, may be more required*/ - const int bigNodeLowBorderBlocksNum = BIG_NODE_LOW_BORDER_BLOCKS_NUM; // fine, need to experiment and find better one - const int blockSize = minRowsBlock; - const int nNodeProp = NODE_PROPS; // num of split attributes for node - - const int local_id = get_sub_group_local_id(); - const int nodeId = get_global_id(1); - const int local_size = get_sub_group_size(); - - int nBigNodes = 0; - int maxBigBlocksNum = 1; - int nMidNodes = 0; - int maxMidBlocksNum = 1; - - /*calculate num of big and mid nodes*/ - for (int i = local_id; i < nNodes; i += local_size) - { - int nRows = nodeList[i * nNodeProp + 1]; - int nBlocks = nRows / blockSize + !!(nRows % blockSize); - - int bigNode = (int)(nBlocks > bigNodeLowBorderBlocksNum); - int midNode = (int)(nBlocks <= bigNodeLowBorderBlocksNum && nBlocks > 1); - - nBigNodes += sub_group_reduce_add(bigNode); - nMidNodes += sub_group_reduce_add(midNode); - maxBigBlocksNum = max(maxBigBlocksNum, bigNode ? nBlocks : 0); - maxBigBlocksNum = sub_group_reduce_max(maxBigBlocksNum); - maxMidBlocksNum = max(maxMidBlocksNum, midNode ? 
nBlocks : 0); - maxMidBlocksNum = sub_group_reduce_max(maxMidBlocksNum); - } - - nBigNodes = sub_group_broadcast(nBigNodes, 0); - nMidNodes = sub_group_broadcast(nMidNodes, 0); - - if (0 == local_id) - { - bigNodesGroups[0] = nBigNodes; - bigNodesGroups[1] = maxBigBlocksNum; - bigNodesGroups[2] = nMidNodes; - bigNodesGroups[3] = maxMidBlocksNum; - bigNodesGroups[4] = nNodes - nBigNodes - nMidNodes; - bigNodesGroups[5] = 1; - } - - int sumBig = 0; - int sumMid = 0; - - /*split nodes on groups*/ - for (int i = local_id; i < nNodes; i += local_size) - { - int nRows = nodeList[i * nNodeProp + 1]; - int nBlocks = nRows / blockSize + !!(nRows % blockSize); - int bigNode = (int)(nBlocks > bigNodeLowBorderBlocksNum); - int midNode = (int)(nBlocks <= bigNodeLowBorderBlocksNum && nBlocks > 1); - - int boundaryBig = sumBig + sub_group_scan_exclusive_add(bigNode); - int boundaryMid = sumMid + sub_group_scan_exclusive_add(midNode); - int posNew = (bigNode ? boundaryBig : (midNode ? nBigNodes + boundaryMid : nBigNodes + nMidNodes + i - boundaryBig - boundaryMid)); - nodeIndices[posNew] = i; - sumBig += sub_group_reduce_add(bigNode); - sumMid += sub_group_reduce_add(midNode); - } - } - - __kernel void updateMDIVarImportance(const __global int * nodeList, const __global algorithmFPType * nodeImpDecreaseList, int nNodes, - __global algorithmFPType * featureImportanceList) { - const int nNodeProp = NODE_PROPS; // num of node properties in nodeList - - const int local_id = get_local_id(0); - const int sub_group_local_id = get_sub_group_local_id(); - const int sub_group_size = get_sub_group_size(); - const int local_size = get_local_size(0); - const int n_sub_groups = local_size / sub_group_size; // num of subgroups for current node processing - const int sub_group_id = local_id / sub_group_size; - const int max_sub_groups_num = 16; //replace with define - - const int bufIdx = get_global_id(1) % (max_sub_groups_num / n_sub_groups); // local buffer is shared between 16 sub groups - 
const int ftrId = get_global_id(1); - - const int leafMark = -1; - const int nElementsForSubgroup = nNodes / n_sub_groups + !!(nNodes % n_sub_groups); - - __local algorithmFPType bufI[max_sub_groups_num]; // storage for impurity decrease - - int iStart = sub_group_id * nElementsForSubgroup; - int iEnd = (sub_group_id + 1) * nElementsForSubgroup; - - iEnd = (iEnd > nNodes) ? nNodes : iEnd; - - algorithmFPType ftrImp = (algorithmFPType)0; - - for (int nodeIdx = iStart + sub_group_local_id; nodeIdx < iEnd; nodeIdx += sub_group_size) - { - int splitFtrId = nodeList[nodeIdx * nNodeProp + 2]; - ftrImp += sub_group_reduce_add((splitFtrId != leafMark && ftrId == splitFtrId) ? nodeImpDecreaseList[nodeIdx] : (algorithmFPType)0); - } - - if (0 == sub_group_local_id) - { - if (1 == n_sub_groups) - { - featureImportanceList[ftrId] += ftrImp; - } - else - { - bufI[bufIdx + sub_group_id] = ftrImp; - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - if (1 < n_sub_groups && 0 == sub_group_id) - { - // first sub group for current node reduces over local buffer if required - algorithmFPType ftrImp = (sub_group_local_id < n_sub_groups) ? 
bufI[bufIdx + sub_group_local_id] : (algorithmFPType)0; - algorithmFPType totalFtrImp = sub_group_reduce_add(ftrImp); - - if (0 == local_id) - { - featureImportanceList[ftrId] += totalFtrImp; - } - } - } - - __kernel void markPresentRows(const __global int * rowsList, __global int * rowsBuffer, int nRows, int tree) { - const int n_groups = get_num_groups(0); - const int n_sub_groups = get_num_sub_groups(); - const int n_total_sub_groups = n_sub_groups * n_groups; - const int nElementsForSubgroup = nRows / n_total_sub_groups + !!(nRows % n_total_sub_groups); - const int local_size = get_sub_group_size(); - - const int id = get_local_id(0); - const int local_id = get_sub_group_local_id(); - const int sub_group_id = get_sub_group_id(); - const int group_id = get_group_id(0) * n_sub_groups + sub_group_id; - - const int itemPresentMark = 1; - - int iStart = group_id * nElementsForSubgroup; - int iEnd = (group_id + 1) * nElementsForSubgroup; - - iEnd = (iEnd > nRows) ? nRows : iEnd; - - for (int i = iStart + local_id; i < iEnd; i += local_size) - { - rowsBuffer[nRows * tree + rowsList[nRows * tree + i]] = itemPresentMark; - } - } - - __kernel void countAbsentRowsForBlocks(const __global int * rowsBuffer, __global int * partialSums, int nRows, int tree) { - const int n_groups = get_num_groups(0); - const int n_sub_groups = get_num_sub_groups(); - const int n_total_sub_groups = n_sub_groups * n_groups; - const int nElementsForSubgroup = nRows / n_total_sub_groups + !!(nRows % n_total_sub_groups); - const int local_size = get_sub_group_size(); - - const int id = get_local_id(0); - const int local_id = get_sub_group_local_id(); - const int sub_group_id = get_sub_group_id(); - const int group_id = get_group_id(0) * n_sub_groups + sub_group_id; - - const int itemAbsentMark = -1; - - int iStart = group_id * nElementsForSubgroup; - int iEnd = (group_id + 1) * nElementsForSubgroup; - - iEnd = (iEnd > nRows) ? 
nRows : iEnd; - - int subSum = 0; - - for (int i = iStart + local_id; i < iEnd; i += local_size) - { - subSum += (int)(itemAbsentMark == rowsBuffer[nRows * tree + i]); - } - - int sum = sub_group_reduce_add(subSum); - - if (local_id == 0) - { - partialSums[group_id] = sum; - } - } - - __kernel void countAbsentRowsTotal(const __global int * partialSums, __global int * partialPrefixSums, __global int * totalSum, int nSubgroupSums, - int tree) { - if (get_sub_group_id() > 0) return; - - const int local_size = get_sub_group_size(); - const int local_id = get_sub_group_local_id(); - - int sum = 0; - - for (int i = local_id; i < nSubgroupSums; i += local_size) - { - int value = partialSums[i]; - int boundary = sub_group_scan_exclusive_add(value); - partialPrefixSums[nSubgroupSums * tree + i] = sum + boundary; - sum += sub_group_reduce_add(value); - } - - if (local_id == 0) - { - totalSum[tree + 1] = totalSum[tree] + sum; - } - } - - __kernel void fillOOBRowsListByBlocks(const __global int * rowsBuffer, const __global int * partialPrefixSums, int nRows, int tree, - const __global int * oobRowsNumList, __global int * oobRowsList) { - const int n_groups = get_num_groups(0); - const int n_sub_groups = get_num_sub_groups(); - const int n_total_sub_groups = n_sub_groups * n_groups; - const int local_size = get_sub_group_size(); - - const int id = get_local_id(0); - const int local_id = get_sub_group_local_id(); - const int sub_group_id = get_sub_group_id(); - const int group_id = get_group_id(0) * n_sub_groups + sub_group_id; - - const int nElementsForSubgroup = nRows / n_total_sub_groups + !!(nRows % n_total_sub_groups); - - const int itemAbsentMark = -1; - - const int oobRowsListOffset = oobRowsNumList[tree]; - - int iStart = group_id * nElementsForSubgroup; - int iEnd = (group_id + 1) * nElementsForSubgroup; - - iEnd = (iEnd > nRows) ? 
nRows : iEnd; - - int groupOffset = partialPrefixSums[n_groups * tree + group_id]; - int sum = 0; - - for (int i = iStart + local_id; i < iEnd; i += local_size) - { - int oobRow = (int)(itemAbsentMark == rowsBuffer[nRows * tree + i]); - int pos = groupOffset + sum + sub_group_scan_exclusive_add(oobRow); - if (oobRow) - { - oobRowsList[oobRowsListOffset + pos] = i; - } - sum += sub_group_reduce_add(oobRow); - } - } - -); - -#endif diff --git a/cpp/daal/src/algorithms/dtrees/forest/oneapi/df_feature_type_helper_oneapi.h b/cpp/daal/src/algorithms/dtrees/forest/oneapi/df_feature_type_helper_oneapi.h deleted file mode 100644 index a8a1f8d8dfe..00000000000 --- a/cpp/daal/src/algorithms/dtrees/forest/oneapi/df_feature_type_helper_oneapi.h +++ /dev/null @@ -1,140 +0,0 @@ -/* file: df_feature_type_helper_oneapi.h */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -/* -//++ -// Implementation of a service class that provides optimal access to the feature types -//-- -*/ - -#ifndef __DF_FEATURE_TYPE_HELPER_ONEAPI_H__ -#define __DF_FEATURE_TYPE_HELPER_ONEAPI_H__ - -#include "src/algorithms/dtrees/dtrees_feature_type_helper.h" -#include "src/algorithms/dtrees/forest/oneapi/df_feature_type_helper_oneapi.h" -#include "src/threading/threading.h" -#include "src/algorithms/service_error_handling.h" -#include "src/algorithms/service_sort.h" -#include "src/algorithms/dtrees/service_array.h" -#include "src/services/service_arrays.h" -#include "src/externals/service_memory.h" -#include "src/services/service_data_utils.h" -#include "src/data_management/service_numeric_table.h" - -#include "src/algorithms/dtrees/forest/oneapi/cl_kernels/df_common_kernels.cl" - -#include "services/internal/sycl/execution_context.h" -#include "services/internal/sycl/types.h" - -namespace daal -{ -namespace algorithms -{ -namespace decision_forest -{ -namespace internal -{ -////////////////////////////////////////////////////////////////////////////////////////// -// IndexedFeatures. 
Creates and stores index of every feature -// Sorts every feature and creates the mapping: features value -> index of the value -// in the sorted array of unique values of the feature in increasing order -////////////////////////////////////////////////////////////////////////////////////////// -template -class IndexedFeaturesOneAPI -{ -public: - typedef size_t IndexType; - typedef uint32_t BinType; - - struct FeatureEntry - { - DAAL_NEW_DELETE(); - IndexType numIndices = 0; //number of indices or bins - IndexType offset = 0; - services::internal::sycl::UniversalBuffer binBorders; //right bin borders - - services::Status allocBorders(); - ~FeatureEntry(); - }; - -public: - IndexedFeaturesOneAPI() : _nCols(0), _nRows(0) {} - ~IndexedFeaturesOneAPI(); - - size_t getRequiredMemSize(size_t nCols, size_t nRows); - - services::Status init(NumericTable & nt, const dtrees::internal::FeatureTypes * featureTypes, const dtrees::internal::BinParams * pBinPrm); - - //get max number of indices for that feature - IndexType numIndices(size_t iCol) const { return _entries[iCol].numIndices; } - - IndexType totalBins() const { return _totalBins; } - - services::internal::sycl::UniversalBuffer & binBorders(size_t iCol) { return _entries[iCol].binBorders; } - - services::internal::sycl::UniversalBuffer & binOffsets() { return _binOffsets; } - - services::internal::sycl::UniversalBuffer & getFullData() { return _fullData; } - - size_t nRows() const { return _nRows; } - size_t nCols() const { return _nCols; } - -protected: - services::Status alloc(size_t nCols, size_t nRows); - - services::Status extractColumn(const services::internal::Buffer & data, services::internal::sycl::UniversalBuffer & values, - services::internal::sycl::UniversalBuffer & indices, int32_t featureId, int32_t nFeatures, int32_t nRows); - - services::Status collectBinBorders(services::internal::sycl::UniversalBuffer & values, services::internal::sycl::UniversalBuffer & binOffsets, - 
services::internal::sycl::UniversalBuffer & binBorders, int32_t nRows, int32_t maxBins); - - services::Status computeBins(services::internal::sycl::UniversalBuffer & values, services::internal::sycl::UniversalBuffer & indices, - services::internal::sycl::UniversalBuffer & binBorders, services::internal::sycl::UniversalBuffer & bins, - int32_t nRows, int32_t nBins, int32_t maxBins, int32_t localSize, int32_t nLocalBlocks); - - services::Status computeBins(services::internal::sycl::UniversalBuffer & values, services::internal::sycl::UniversalBuffer & indices, - services::internal::sycl::UniversalBuffer & bins, FeatureEntry & entry, int32_t nRows, - const dtrees::internal::BinParams * pBinPrm); - - services::Status makeIndex(const services::internal::Buffer & data, int32_t featureId, int32_t nFeatures, int32_t nRows, - const dtrees::internal::BinParams * pBinPrm, services::internal::sycl::UniversalBuffer & _values, - services::internal::sycl::UniversalBuffer & _values_buf, services::internal::sycl::UniversalBuffer & _indices, - services::internal::sycl::UniversalBuffer & _indices_buf, services::internal::sycl::UniversalBuffer & bins, - FeatureEntry & entry); - - services::Status storeColumn(const services::internal::sycl::UniversalBuffer & data, services::internal::sycl::UniversalBuffer & fullData, - int32_t featureId, int32_t nFeatures, int32_t nRows); - -protected: - services::internal::sycl::UniversalBuffer _fullData; - services::internal::sycl::UniversalBuffer _binOffsets; - daal::internal::TArray _entries; - size_t _nRows; - size_t _nCols; - IndexType _totalBins; - - static constexpr size_t _int32max = static_cast(services::internal::MaxVal::get()); - - const int32_t _preferableSubGroup = 16; // preferable maximal sub-group size -}; - -} /* namespace internal */ -} /* namespace decision_forest */ -} /* namespace algorithms */ -} /* namespace daal */ - -#endif diff --git a/cpp/daal/src/algorithms/dtrees/forest/oneapi/df_feature_type_helper_oneapi.i 
b/cpp/daal/src/algorithms/dtrees/forest/oneapi/df_feature_type_helper_oneapi.i deleted file mode 100644 index 5ea84785e25..00000000000 --- a/cpp/daal/src/algorithms/dtrees/forest/oneapi/df_feature_type_helper_oneapi.i +++ /dev/null @@ -1,467 +0,0 @@ -/* file: df_feature_type_helper_oneapi.i */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// GPU-dependent initialization of service data structure -//-- -*/ -#include "src/algorithms/dtrees/dtrees_feature_type_helper.h" - -#include "src/services/service_data_utils.h" -#include "src/sycl/sorter.h" -#include "src/externals/service_profiler.h" - -using namespace daal::services::internal::sycl; -using namespace daal::services::internal; - -namespace daal -{ -namespace algorithms -{ -namespace decision_forest -{ -namespace internal -{ -template -struct GetIntegerTypeForFPType; - -template <> -struct GetIntegerTypeForFPType -{ - using Type = uint32_t; -}; - -template <> -struct GetIntegerTypeForFPType -{ - using Type = uint64_t; -}; - -template -services::String getOpenCLKeyType(const services::String & typeName); - -template <> -inline services::String getOpenCLKeyType(const services::String & typeName) -{ - return services::String(" -D ") + typeName + services::String("=uint "); -} - -template <> -inline 
services::String getOpenCLKeyType(const services::String & typeName) -{ - return services::String(" -D ") + typeName + services::String("=ulong "); -} - -template -static services::Status buildProgram(ClKernelFactoryIface & factory) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.buildProgram); - - services::Status status; - - auto fptype_name = getKeyFPType(); - auto radixtype_name = getOpenCLKeyType::Type>("radixIntType"); - auto build_options = fptype_name + radixtype_name; - - services::String cachekey("__daal_algorithms_df_common_"); - cachekey.add(build_options); - build_options.add(" -cl-std=CL1.2 "); - - factory.build(ExecutionTargetIds::device, cachekey.c_str(), df_common_kernels, build_options.c_str(), status); - - return status; -} - -template -IndexedFeaturesOneAPI::~IndexedFeaturesOneAPI() -{} - -template -IndexedFeaturesOneAPI::FeatureEntry::~FeatureEntry() -{} - -template -services::Status IndexedFeaturesOneAPI::FeatureEntry::allocBorders() -{ - auto & context = services::internal::getDefaultContext(); - services::Status status; - - binBorders = context.allocate(TypeIds::id(), numIndices, status); - return status; -} - -template -size_t IndexedFeaturesOneAPI::getRequiredMemSize(size_t nCols, size_t nRows) -{ - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, nRows, nCols); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, sizeof(BinType), nRows * nCols); - - size_t requiredMem = sizeof(BinType) * (nCols + 1); - - requiredMem += sizeof(BinType) * nRows * nCols; // data vs ftrs bin map table (_fullData) - return requiredMem; -} - -template -services::Status IndexedFeaturesOneAPI::alloc(size_t nC, size_t nR) -{ - auto & context = services::internal::getDefaultContext(); - services::Status status; - - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, nR, nC); - _fullData = context.allocate(TypeIds::id(), nR * nC, status); - DAAL_CHECK_STATUS_VAR(status); - - _binOffsets = context.allocate(TypeIds::id(), nC + 1, status); - DAAL_CHECK_STATUS_VAR(status); - - 
_entries.reset(nC); - DAAL_CHECK_MALLOC(_entries.get()); - _nCols = nC; - _nRows = nR; - _totalBins = 0; - - return status; -} - -template -services::Status IndexedFeaturesOneAPI::extractColumn(const services::internal::Buffer & data, - UniversalBuffer & values, UniversalBuffer & indices, int32_t featureId, - int32_t nFeatures, int32_t nRows) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(indexedFeatures.extractColumn); - - services::Status status; - - auto & context = services::internal::getDefaultContext(); - auto & factory = context.getClKernelFactory(); - DAAL_CHECK_STATUS_VAR(buildProgram(factory)); - - auto kernel = factory.getKernel("extractColumn", status); - DAAL_CHECK_STATUS_VAR(status); - - { - DAAL_ASSERT_UNIVERSAL_BUFFER(UniversalBuffer(data), algorithmFPType, nRows * nFeatures); - DAAL_ASSERT_UNIVERSAL_BUFFER(values, algorithmFPType, nRows); - DAAL_ASSERT_UNIVERSAL_BUFFER(indices, int32_t, nRows); - - KernelArguments args(6, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, data, AccessModeIds::read); - args.set(1, values, AccessModeIds::write); - args.set(2, indices, AccessModeIds::write); - args.set(3, featureId); - args.set(4, nFeatures); - args.set(5, nRows); - - KernelRange global_range(nRows); - - context.run(global_range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - } - return status; -} - -template -services::Status IndexedFeaturesOneAPI::collectBinBorders(UniversalBuffer & values, UniversalBuffer & binOffsets, - UniversalBuffer & binBorders, int32_t nRows, int32_t maxBins) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(indexedFeatures.collectBinBorders); - - services::Status status; - - auto & context = services::internal::getDefaultContext(); - auto & factory = context.getClKernelFactory(); - status |= buildProgram(factory); - DAAL_CHECK_STATUS_VAR(status); - - auto kernel = factory.getKernel("collectBinBorders", status); - DAAL_CHECK_STATUS_VAR(status); - - { - DAAL_ASSERT_UNIVERSAL_BUFFER(values, algorithmFPType, nRows); - 
DAAL_ASSERT_UNIVERSAL_BUFFER(binOffsets, int32_t, maxBins); - DAAL_ASSERT_UNIVERSAL_BUFFER(binBorders, algorithmFPType, maxBins); - - KernelArguments args(3, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, values, AccessModeIds::read); - args.set(1, binOffsets, AccessModeIds::read); - args.set(2, binBorders, AccessModeIds::write); - - KernelRange global_range(maxBins); - - context.run(global_range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -template -services::Status IndexedFeaturesOneAPI::computeBins(UniversalBuffer & values, UniversalBuffer & indices, - UniversalBuffer & binBorders, UniversalBuffer & bins, int32_t nRows, - int32_t nBins, int32_t maxBins, int32_t localSize, int32_t nLocalBlocks) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(indexedFeatures.computeBins); - - services::Status status; - - auto & context = services::internal::getDefaultContext(); - auto & factory = context.getClKernelFactory(); - status |= buildProgram(factory); - DAAL_CHECK_STATUS_VAR(status); - - auto kernel = factory.getKernel("computeBins", status); - DAAL_CHECK_STATUS_VAR(status); - - { - DAAL_ASSERT_UNIVERSAL_BUFFER(values, algorithmFPType, nRows); - DAAL_ASSERT_UNIVERSAL_BUFFER(indices, int32_t, nRows); - DAAL_ASSERT_UNIVERSAL_BUFFER(binBorders, algorithmFPType, maxBins); - DAAL_ASSERT_UNIVERSAL_BUFFER(bins, BinType, nRows); - - KernelArguments args(6, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, values, AccessModeIds::read); - args.set(1, indices, AccessModeIds::read); - args.set(2, binBorders, AccessModeIds::read); - args.set(3, bins, AccessModeIds::write); - args.set(4, nRows); - args.set(5, nBins); - - KernelRange local_range(localSize); - KernelRange global_range(localSize * nLocalBlocks); - - KernelNDRange range(1); - range.global(global_range, status); - DAAL_CHECK_STATUS_VAR(status); - range.local(local_range, status); - DAAL_CHECK_STATUS_VAR(status); - - context.run(range, kernel, args, status); - 
DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -template -services::Status IndexedFeaturesOneAPI::computeBins(UniversalBuffer & values, UniversalBuffer & indices, UniversalBuffer & bins, - FeatureEntry & entry, int32_t nRows, const dtrees::internal::BinParams * pBinPrm) -{ - services::Status status; - - auto & context = services::internal::getDefaultContext(); - - const int32_t maxBins = pBinPrm->maxBins < nRows ? pBinPrm->maxBins : nRows; - const int32_t localSize = _preferableSubGroup; - const int32_t nLocalBlocks = 1024 * localSize < nRows ? 1024 : (nRows / localSize) + !!(nRows % localSize); - - auto binOffsets = context.allocate(TypeIds::id(), maxBins, status); - DAAL_CHECK_STATUS_VAR(status); - auto binBorders = context.allocate(TypeIds::id(), maxBins, status); - DAAL_CHECK_STATUS_VAR(status); - - { - auto binOffsetsHost = binOffsets.template get().toHost(ReadWriteMode::writeOnly, status); - DAAL_CHECK_STATUS_VAR(status); - DAAL_CHECK_MALLOC(binOffsetsHost.get()); - int32_t offset = 0; - for (int32_t i = 0; i < maxBins; i++) - { - offset += (nRows + i) / maxBins; - binOffsetsHost.get()[i] = offset - 1; - } - } - - DAAL_CHECK_STATUS_VAR(collectBinBorders(values, binOffsets, binBorders, nRows, maxBins)); - - int32_t nBins = 0; - { - DAAL_ASSERT_UNIVERSAL_BUFFER(binBorders, algorithmFPType, maxBins); - auto binBordersHost = binBorders.template get().toHost(ReadWriteMode::readWrite, status); - DAAL_CHECK_STATUS_VAR(status); - DAAL_CHECK_MALLOC(binBordersHost.get()); - for (int32_t i = 0; i < maxBins; i++) - { - if (nBins == 0 || (nBins > 0 && binBordersHost.get()[i] != binBordersHost.get()[nBins - 1])) - { - binBordersHost.get()[nBins] = binBordersHost.get()[i]; - nBins++; - } - } - } - - DAAL_CHECK_STATUS_VAR(computeBins(values, indices, binBorders, bins, nRows, nBins, maxBins, localSize, nLocalBlocks)); - - entry.numIndices = static_cast(nBins); - entry.binBorders = binBorders; - - return status; -} - -template -services::Status 
IndexedFeaturesOneAPI::makeIndex(const services::internal::Buffer & data, int32_t featureId, - int32_t nFeatures, int32_t nRows, const dtrees::internal::BinParams * pBinPrm, - UniversalBuffer & _values, UniversalBuffer & _values_buf, - UniversalBuffer & _indices, UniversalBuffer & _indices_buf, UniversalBuffer & bins, - FeatureEntry & entry) -{ - DAAL_CHECK_STATUS_VAR(extractColumn(data, _values, _indices, featureId, nFeatures, nRows)); - DAAL_CHECK_STATUS_VAR(sort::RadixSort::sortIndices(_values, _indices, _values_buf, _indices_buf, nRows)); - DAAL_CHECK_STATUS_VAR(computeBins(_values, _indices, bins, entry, nRows, pBinPrm)); - return services::Status(); -} - -template -services::Status IndexedFeaturesOneAPI::storeColumn(const UniversalBuffer & data, UniversalBuffer & fullData, int32_t featureId, - int32_t nFeatures, int32_t nRows) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(indexedFeatures.storeColumn); - - services::Status status; - - auto & context = services::internal::getDefaultContext(); - auto & factory = context.getClKernelFactory(); - status |= buildProgram(factory); - DAAL_CHECK_STATUS_VAR(status); - - auto kernel = factory.getKernel("storeColumn", status); - DAAL_CHECK_STATUS_VAR(status); - - { - DAAL_ASSERT_UNIVERSAL_BUFFER(data, BinType, nRows); - DAAL_ASSERT_UNIVERSAL_BUFFER(fullData, BinType, nRows * nFeatures); - - KernelArguments args(5, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, data, AccessModeIds::read); - args.set(1, fullData, AccessModeIds::write); - args.set(2, featureId); - args.set(3, nFeatures); - args.set(4, nRows); - - KernelRange global_range(nRows); - - context.run(global_range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -template -services::Status IndexedFeaturesOneAPI::init(NumericTable & nt, const dtrees::internal::FeatureTypes * featureTypes, - const dtrees::internal::BinParams * pBinPrm) -{ - dtrees::internal::FeatureTypes autoFT; - if (!featureTypes) - { - 
DAAL_CHECK_MALLOC(autoFT.init(nt)); - featureTypes = &autoFT; - } - - const size_t nRsz = nt.getNumberOfRows(); - const size_t nCsz = nt.getNumberOfColumns(); - - if (nRsz > _int32max) - { - return services::Status(services::ErrorIncorrectNumberOfRowsInInputNumericTable); - } - if (nCsz > _int32max) - { - return services::Status(services::ErrorIncorrectNumberOfColumnsInInputNumericTable); - } - - const int32_t nC = static_cast(nCsz); - const int32_t nR = static_cast(nRsz); - - services::Status status = alloc(nCsz, nRsz); - DAAL_CHECK_STATUS_VAR(status); - - auto & context = services::internal::getDefaultContext(); - - //allocating auxilliary buffers - services::Collection _data; - - DAAL_CHECK_MALLOC(_data.resize(nCsz)); - - for (size_t i = 0; i < nCsz; i++) - { - _data[i] = context.allocate(TypeId::uint32, nRsz, status); - DAAL_CHECK_STATUS_VAR(status); - } - - auto _values = context.allocate(TypeIds::id(), nRsz, status); - DAAL_CHECK_STATUS_VAR(status); - auto _values_buf = context.allocate(TypeIds::id(), nRsz, status); - DAAL_CHECK_STATUS_VAR(status); - - auto _indices = context.allocate(TypeIds::id(), nRsz, status); - DAAL_CHECK_STATUS_VAR(status); - auto _indices_buf = context.allocate(TypeIds::id(), nRsz, status); - DAAL_CHECK_STATUS_VAR(status); - - BlockDescriptor dataBlock; - - if (nt.getDataLayout() == NumericTableIface::soa) - { - for (int32_t i = 0; i < nC; i++) - { - DAAL_CHECK_STATUS_VAR(nt.getBlockOfColumnValues(i, 0, nR, readOnly, dataBlock)); - auto dataBuffer = dataBlock.getBuffer(); - DAAL_CHECK_STATUS_VAR(makeIndex(dataBuffer, 0, 1, nR, pBinPrm, _values, _values_buf, _indices, _indices_buf, _data[i], _entries[i])); - DAAL_CHECK_STATUS_VAR(nt.releaseBlockOfColumnValues(dataBlock)); - } - } - else - { - DAAL_CHECK_STATUS_VAR(nt.getBlockOfRows(0, nR, readOnly, dataBlock)); - auto dataBuffer = dataBlock.getBuffer(); - for (int32_t i = 0; i < nC; i++) - { - DAAL_CHECK_STATUS_VAR(makeIndex(dataBuffer, i, nC, nR, pBinPrm, _values, _values_buf, 
_indices, _indices_buf, _data[i], _entries[i])); - } - DAAL_CHECK_STATUS_VAR(nt.releaseBlockOfRows(dataBlock)); - } - - { - DAAL_ASSERT_UNIVERSAL_BUFFER(_binOffsets, BinType, nC + 1); - auto binOffsetsHost = _binOffsets.template get().toHost(ReadWriteMode::writeOnly, status); - DAAL_CHECK_STATUS_VAR(status); - DAAL_CHECK_MALLOC(binOffsetsHost.get()); - BinType total = 0; - for (int32_t i = 0; i < nC; i++) - { - DAAL_CHECK_STATUS_VAR(storeColumn(_data[i], _fullData, i, nC, nR)); - binOffsetsHost.get()[i] = total; - _entries[i].offset = total; - total += _entries[i].numIndices; - } - binOffsetsHost.get()[nC] = total; - _totalBins = total; - } - - return status; -} - -} /* namespace internal */ -} /* namespace decision_forest */ -} /* namespace algorithms */ -} /* namespace daal */ diff --git a/cpp/daal/src/algorithms/dtrees/forest/oneapi/df_tree_level_build_helper_oneapi.h b/cpp/daal/src/algorithms/dtrees/forest/oneapi/df_tree_level_build_helper_oneapi.h deleted file mode 100644 index 659f13a069b..00000000000 --- a/cpp/daal/src/algorithms/dtrees/forest/oneapi/df_tree_level_build_helper_oneapi.h +++ /dev/null @@ -1,141 +0,0 @@ -/* file: df_tree_level_build_helper_oneapi.h */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -/* -//++ -// Implementation of a service class that provides -// common kernels required for building tree levels -//-- -*/ - -#ifndef __DF_TREE_LEVEL_BUILD_HELPER_ONEAPI_H__ -#define __DF_TREE_LEVEL_BUILD_HELPER_ONEAPI_H__ - -#include "src/algorithms/service_error_handling.h" -#include "src/algorithms/dtrees/service_array.h" -#include "src/services/service_arrays.h" -#include "src/externals/service_memory.h" -#include "src/services/service_data_utils.h" - -#include "services/internal/sycl/execution_context.h" -#include "services/internal/sycl/types.h" - -namespace daal -{ -namespace algorithms -{ -namespace decision_forest -{ -namespace internal -{ -////////////////////////////////////////////////////////////////////////////////////////// -// TreeLevelBuildHelperOneAPI - contains common kernels (for classification and regression) -// required for building tree level -////////////////////////////////////////////////////////////////////////////////////////// -template -class TreeLevelBuildHelperOneAPI -{ -public: - TreeLevelBuildHelperOneAPI() : _nNodeProps(0) {} - ~TreeLevelBuildHelperOneAPI() {} - - services::Status init(const char * buildOptions, size_t nNodeProps); - - services::Status initializeTreeOrder(size_t nRows, size_t nTrees, services::internal::sycl::UniversalBuffer & treeOrder); - - services::Status convertSplitToLeaf(services::internal::sycl::UniversalBuffer & nodeList, size_t nNodes); - - services::Status markPresentRows(const services::internal::sycl::UniversalBuffer & rowsList, - services::internal::sycl::UniversalBuffer & rowsBuffer, size_t nRows, size_t localSize, size_t nSubgroupSums, - size_t nTrees, size_t tree); - services::Status countAbsentRowsForBlocks(const services::internal::sycl::UniversalBuffer & rowsBuffer, size_t nRows, - services::internal::sycl::UniversalBuffer & partialSums, size_t localSize, size_t nSubgroupSums, - size_t nTrees, size_t tree); - 
services::Status countAbsentRowsTotal(const services::internal::sycl::UniversalBuffer & partialSums, - services::internal::sycl::UniversalBuffer & partialPrefixSums, - services::internal::sycl::UniversalBuffer & oobRowsNumList, size_t localSize, size_t nSubgroupSums, - size_t nTrees, size_t tree); - services::Status fillOOBRowsListByBlocks(const services::internal::sycl::UniversalBuffer & rowsBuffer, size_t nRows, - const services::internal::sycl::UniversalBuffer & partialPrefixSums, size_t localSize, - size_t nSubgroupSums, services::internal::sycl::UniversalBuffer & oobRowsNumList, size_t totalOOBRowsNum, - size_t nTrees, size_t tree, services::internal::sycl::UniversalBuffer & oobRowsList); - - size_t getOOBRowsRequiredMemSize(size_t nRows, size_t nTrees, double observationsPerTreeFraction); - services::Status getOOBRows(const services::internal::sycl::UniversalBuffer & rowsList, size_t nRows, size_t nTrees, - services::internal::sycl::UniversalBuffer & oobRowsNumList, services::internal::sycl::UniversalBuffer & oobRowsList); - - services::Status getNumOfSplitNodes(const services::internal::sycl::UniversalBuffer & nodeList, size_t nNodes, size_t & nSplitNodes); - - services::Status doNodesSplit(const services::internal::sycl::UniversalBuffer & nodeList, size_t nNodes, - services::internal::sycl::UniversalBuffer & nodeListNew, size_t nNodesNew, - const services::internal::sycl::UniversalBuffer & nodeVsTreeMap, - services::internal::sycl::UniversalBuffer & nodeVsTreeMapNew); - - services::Status splitNodeListOnGroupsBySize(const services::internal::sycl::UniversalBuffer & nodeList, size_t nNodes, - services::internal::sycl::UniversalBuffer & bigNodesGroups, const size_t nGroups, - const size_t nGroupProps, services::internal::sycl::UniversalBuffer & nodeIndices); - - services::Status doLevelPartition(const services::internal::sycl::UniversalBuffer & data, services::internal::sycl::UniversalBuffer & nodeList, - size_t nNodes, 
services::internal::sycl::UniversalBuffer & treeOrder, - services::internal::sycl::UniversalBuffer & treeOrderBuf, size_t nRows, size_t nFeatures); - - services::Status partitionCopy(services::internal::sycl::UniversalBuffer & treeOrderBuf, services::internal::sycl::UniversalBuffer & treeOrder, - size_t iStart, size_t nRows); - - services::Status updateMDIVarImportance(const services::internal::sycl::UniversalBuffer & nodeList, - const services::internal::sycl::UniversalBuffer & nodeImpDecreaseList, size_t nNodes, - services::internal::Buffer & varImp, size_t nFeatures); - -private: - services::Status buildProgram(services::internal::sycl::ClKernelFactoryIface & factory, const char * buildOptions = nullptr); - - services::internal::sycl::KernelPtr kernelInitializeTreeOrder; - services::internal::sycl::KernelPtr kernelConvertSplitToLeaf; - services::internal::sycl::KernelPtr kernelGetNumOfSplitNodes; - services::internal::sycl::KernelPtr kernelDoNodesSplit; - services::internal::sycl::KernelPtr kernelDoLevelPartitionByGroups; - services::internal::sycl::KernelPtr kernelSplitNodeListOnGroupsBySize; - - services::internal::sycl::KernelPtr kernelMarkPresentRows; - services::internal::sycl::KernelPtr kernelCountAbsentRowsForBlocks; - services::internal::sycl::KernelPtr kernelCountAbsentRowsTotal; - services::internal::sycl::KernelPtr kernelFillOOBRowsListByBlocks; - - services::internal::sycl::KernelPtr kernelUpdateMDIVarImportance; - services::internal::sycl::KernelPtr kernelPartitionCopy; - - const size_t _maxLocalSums = 256; - const size_t _minRowsBlock = 256; - - const size_t _preferableGroupSize = 256; - const size_t _preferablePartitionGroupSize = 128; // it showed best perf - const size_t _preferablePartitionGroupsNum = 8192; - const size_t _maxWorkItemsPerGroup = 256; // should be a power of two for interal needs - const size_t _preferableSubGroup = 16; // preferable maximal sub-group size - const size_t _auxNodeBufferProps = 2; // auxilliary buffer for nodes 
partitioning - const size_t _partitionMaxBlocksNum = 256; // max blocks number for one node - const double _aproximateOOBRowsFraction = 0.6; - const size_t _int32max = static_cast(services::internal::MaxVal::get()); - size_t _nNodeProps; -}; - -} /* namespace internal */ -} /* namespace decision_forest */ -} /* namespace algorithms */ -} /* namespace daal */ - -#endif diff --git a/cpp/daal/src/algorithms/dtrees/forest/oneapi/df_tree_level_build_helper_oneapi.i b/cpp/daal/src/algorithms/dtrees/forest/oneapi/df_tree_level_build_helper_oneapi.i deleted file mode 100644 index 81ff275a10d..00000000000 --- a/cpp/daal/src/algorithms/dtrees/forest/oneapi/df_tree_level_build_helper_oneapi.i +++ /dev/null @@ -1,724 +0,0 @@ -/* file: df_tree_level_build_helper_oneapi.i */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -/* -//++ -// Implementation of common functions for building tree level -//-- -*/ -#include "src/algorithms/dtrees/forest/oneapi/df_tree_level_build_helper_oneapi.h" -#include "src/algorithms/dtrees/forest/oneapi/cl_kernels/df_tree_level_build_helper_kernels.cl" - -#include "src/services/service_data_utils.h" -#include "src/externals/service_profiler.h" - -using namespace daal::services::internal::sycl; -using namespace daal::services; -using namespace daal::services::internal; - -namespace daal -{ -namespace algorithms -{ -namespace decision_forest -{ -namespace internal -{ -template -services::Status TreeLevelBuildHelperOneAPI::buildProgram(ClKernelFactoryIface & factory, const char * buildOptions) -{ - services::Status status; - DAAL_ITTNOTIFY_SCOPED_TASK(treeLevelHelperOneAPI.buildProgram); - { - auto fptype_name = getKeyFPType(); - auto build_options = fptype_name; - - build_options.add(" -cl-std=CL1.2 "); - if (buildOptions) - { - build_options.add(buildOptions); - } - - char buffer[DAAL_MAX_STRING_SIZE] = { 0 }; - auto written = daal::services::daal_int_to_string(buffer, DAAL_MAX_STRING_SIZE, static_cast(_auxNodeBufferProps)); - services::String nAuxNodePropsStr(buffer, written); - - written = daal::services::daal_int_to_string(buffer, DAAL_MAX_STRING_SIZE, static_cast(_partitionMaxBlocksNum)); - services::String partitionMaxBlocksNumStr(buffer, written); - - build_options.add( - " -D BIG_NODE_LOW_BORDER_BLOCKS_NUM=32 -D LOCAL_BUFFER_SIZE=256 -D MAX_WORK_ITEMS_PER_GROUP=256 -D PARTITION_MIN_BLOCK_SIZE=128 "); - - build_options.add(" -D AUX_NODE_PROPS="); - build_options.add(nAuxNodePropsStr); - - build_options.add(" -D PARTITION_MAX_BLOCKS_NUM="); - build_options.add(partitionMaxBlocksNumStr); - - services::String cachekey("__daal_algorithms_df_tree_level_build_helper_"); - cachekey.add(build_options); - - factory.build(ExecutionTargetIds::device, cachekey.c_str(), 
df_tree_level_build_helper_kernels, build_options.c_str(), status); - DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -template -services::Status TreeLevelBuildHelperOneAPI::initializeTreeOrder(size_t nRows, size_t nTrees, UniversalBuffer & treeOrder) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.initializeTreeOrder); - - services::Status status; - - DAAL_ASSERT_UNIVERSAL_BUFFER(treeOrder, int32_t, nRows * nTrees); - - auto & context = services::internal::getDefaultContext(); - - auto & kernel = kernelInitializeTreeOrder; - - { - KernelArguments args(1, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, treeOrder, AccessModeIds::write); - - KernelRange global_range(nRows, nTrees); - - context.run(global_range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -template -services::Status TreeLevelBuildHelperOneAPI::markPresentRows(const UniversalBuffer & rowsList, UniversalBuffer & rowsBuffer, - size_t nRows, size_t localSize, size_t nSubgroupSums, size_t nTrees, - size_t tree) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.markPresentRows); - services::Status status; - - DAAL_ASSERT_UNIVERSAL_BUFFER(rowsList, int32_t, nRows * nTrees); - DAAL_ASSERT_UNIVERSAL_BUFFER(rowsBuffer, int32_t, nRows * nTrees); - - auto & context = services::internal::getDefaultContext(); - - { - DAAL_ASSERT(nRows <= _int32max); - DAAL_ASSERT(tree <= _int32max); - - auto & kernel = kernelMarkPresentRows; - KernelArguments args(4, status); - DAAL_CHECK_STATUS_VAR(status); - - args.set(0, rowsList, AccessModeIds::read); - args.set(1, rowsBuffer, AccessModeIds::write); - args.set(2, static_cast(nRows)); - args.set(3, static_cast(tree)); - - KernelRange local_range(localSize); - KernelRange global_range(localSize * nSubgroupSums); - - KernelNDRange range(1); - range.local(local_range, status); - DAAL_CHECK_STATUS_VAR(status); - range.global(global_range, status); - DAAL_CHECK_STATUS_VAR(status); - - context.run(range, kernel, args, status); - 
DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -template -services::Status TreeLevelBuildHelperOneAPI::countAbsentRowsForBlocks(const UniversalBuffer & rowsBuffer, size_t nRows, - UniversalBuffer & partialSums, size_t localSize, - size_t nSubgroupSums, size_t nTrees, size_t tree) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.countAbsentRowsForBlocks); - services::Status status; - - DAAL_ASSERT_UNIVERSAL_BUFFER(rowsBuffer, int32_t, nRows * nTrees); - DAAL_ASSERT_UNIVERSAL_BUFFER(partialSums, int32_t, nSubgroupSums); - - auto & context = services::internal::getDefaultContext(); - - { - DAAL_ASSERT(nRows <= _int32max); - DAAL_ASSERT(tree <= _int32max); - - auto & kernel = kernelCountAbsentRowsForBlocks; - KernelArguments args(4, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, rowsBuffer, AccessModeIds::read); - args.set(1, partialSums, AccessModeIds::write); - args.set(2, static_cast(nRows)); - args.set(3, static_cast(tree)); - - KernelRange local_range(localSize); - KernelRange global_range(localSize * nSubgroupSums); - - KernelNDRange range(1); - range.local(local_range, status); - DAAL_CHECK_STATUS_VAR(status); - range.global(global_range, status); - DAAL_CHECK_STATUS_VAR(status); - - context.run(range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -template -services::Status TreeLevelBuildHelperOneAPI::countAbsentRowsTotal(const UniversalBuffer & partialSums, - UniversalBuffer & partialPrefixSums, - UniversalBuffer & oobRowsNumList, size_t localSize, - size_t nSubgroupSums, size_t nTrees, size_t tree) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.countAbsentRowsTotal); - services::Status status; - - DAAL_ASSERT_UNIVERSAL_BUFFER(partialSums, int32_t, nSubgroupSums); - DAAL_ASSERT_UNIVERSAL_BUFFER(partialPrefixSums, int32_t, nSubgroupSums); - DAAL_ASSERT_UNIVERSAL_BUFFER(oobRowsNumList, int32_t, nTrees + 1); - - auto & context = services::internal::getDefaultContext(); - - { - DAAL_ASSERT(nSubgroupSums <= _int32max); - 
DAAL_ASSERT(tree <= _int32max); - - auto & kernel = kernelCountAbsentRowsTotal; - KernelArguments args(5, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, partialSums, AccessModeIds::read); - args.set(1, partialPrefixSums, AccessModeIds::write); - args.set(2, oobRowsNumList, AccessModeIds::write); - args.set(3, static_cast(nSubgroupSums)); - args.set(4, static_cast(tree)); - - KernelRange local_range(localSize); - KernelRange global_range(localSize); - - KernelNDRange range(1); - range.local(local_range, status); - DAAL_CHECK_STATUS_VAR(status); - range.global(global_range, status); - DAAL_CHECK_STATUS_VAR(status); - - context.run(range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - } - return status; -} - -template -services::Status TreeLevelBuildHelperOneAPI::fillOOBRowsListByBlocks(const UniversalBuffer & rowsBuffer, size_t nRows, - const UniversalBuffer & partialPrefixSums, size_t localSize, - size_t nSubgroupSums, UniversalBuffer & oobRowsNumList, - size_t totalOOBRowsNum, size_t nTrees, size_t tree, - UniversalBuffer & oobRowsList) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.fillOOBRowsListByBlocks); - services::Status status; - - DAAL_ASSERT_UNIVERSAL_BUFFER(rowsBuffer, int32_t, nRows); - DAAL_ASSERT_UNIVERSAL_BUFFER(partialPrefixSums, int32_t, nSubgroupSums); - DAAL_ASSERT_UNIVERSAL_BUFFER(oobRowsNumList, int32_t, nTrees + 1); - DAAL_ASSERT_UNIVERSAL_BUFFER(oobRowsList, int32_t, totalOOBRowsNum); - - auto & context = services::internal::getDefaultContext(); - - { - DAAL_ASSERT(nRows <= _int32max); - DAAL_ASSERT(tree <= _int32max); - - auto & kernel = kernelFillOOBRowsListByBlocks; - KernelArguments args(6, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, rowsBuffer, AccessModeIds::read); - args.set(1, partialPrefixSums, AccessModeIds::read); - args.set(2, static_cast(nRows)); - args.set(3, static_cast(tree)); - args.set(4, oobRowsNumList, AccessModeIds::read); - args.set(5, oobRowsList, AccessModeIds::write); - - KernelRange 
local_range(localSize); - KernelRange global_range(localSize * nSubgroupSums); - - KernelNDRange range(1); - range.local(local_range, status); - DAAL_CHECK_STATUS_VAR(status); - range.global(global_range, status); - DAAL_CHECK_STATUS_VAR(status); - - context.run(range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -template -size_t TreeLevelBuildHelperOneAPI::getOOBRowsRequiredMemSize(size_t nRows, size_t nTrees, double observationsPerTreeFraction) -{ - // mem size occupied on GPU for storing OOB rows indices - size_t oobRowsAproxNum = nRows * (1.0 - observationsPerTreeFraction) + nRows * observationsPerTreeFraction * _aproximateOOBRowsFraction; - return sizeof(int32_t) * oobRowsAproxNum * nTrees; -} - -template -services::Status TreeLevelBuildHelperOneAPI::getOOBRows(const UniversalBuffer & rowsList, size_t nRows, size_t nTrees, - UniversalBuffer & oobRowsNumList, UniversalBuffer & oobRowsList) -{ - services::Status status; - - auto & context = services::internal::getDefaultContext(); - - const int absentMark = -1; - const size_t localSize = _preferableSubGroup; - const size_t nSubgroupSums = _maxLocalSums * localSize < nRows ? 
_maxLocalSums : (nRows / localSize + !(nRows / localSize)); - - DAAL_ASSERT_UNIVERSAL_BUFFER(rowsList, int32_t, nRows); - DAAL_ASSERT_UNIVERSAL_BUFFER(oobRowsNumList, int32_t, nTrees + 1); - - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, nRows, nTrees); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, nSubgroupSums, nTrees); - - auto rowsBuffer = context.allocate(TypeIds::id(), nRows * nTrees, status); // it is filled with marks Present/Absent for each rows - DAAL_CHECK_STATUS_VAR(status); - auto partialSums = context.allocate(TypeIds::id(), nSubgroupSums, status); - DAAL_CHECK_STATUS_VAR(status); - auto partialPrefixSums = context.allocate(TypeIds::id(), nSubgroupSums * nTrees, status); - DAAL_CHECK_STATUS_VAR(status); - int32_t totalOOBRowsNum = 0; - - { - auto nOOBRowsHost = oobRowsNumList.template get().toHost(ReadWriteMode::readWrite, status); - DAAL_CHECK_STATUS_VAR(status); - nOOBRowsHost.get()[0] = 0; - } - - context.fill(rowsBuffer, absentMark, status); - DAAL_CHECK_STATUS_VAR(status); - - for (size_t tree = 0; tree < nTrees; tree++) - { - DAAL_CHECK_STATUS_VAR(markPresentRows(rowsList, rowsBuffer, nRows, localSize, nSubgroupSums, nTrees, tree)); - DAAL_CHECK_STATUS_VAR(countAbsentRowsForBlocks(rowsBuffer, nRows, partialSums, localSize, nSubgroupSums, nTrees, tree)); - DAAL_CHECK_STATUS_VAR(countAbsentRowsTotal(partialSums, partialPrefixSums, oobRowsNumList, localSize, nSubgroupSums, nTrees, tree)); - } - - { - auto nOOBRowsHost = oobRowsNumList.template get().toHost(ReadWriteMode::readOnly, status); - DAAL_CHECK_STATUS_VAR(status); - totalOOBRowsNum = nOOBRowsHost.get()[nTrees]; - } - - if (totalOOBRowsNum > 0) - { - // assign buffer of required size to the input oobRowsList buffer - oobRowsList = context.allocate(TypeIds::id(), totalOOBRowsNum, status); - DAAL_CHECK_STATUS_VAR(status); - - auto nOOBRowsHost = oobRowsNumList.template get().toHost(ReadWriteMode::readOnly, status); - DAAL_CHECK_STATUS_VAR(status); - - for (size_t tree = 0; tree < nTrees; 
tree++) - { - size_t nOOBRows = static_cast(nOOBRowsHost.get()[tree + 1] - nOOBRowsHost.get()[tree]); - - if (nOOBRows > 0) - { - DAAL_CHECK_STATUS_VAR(fillOOBRowsListByBlocks(rowsBuffer, nRows, partialPrefixSums, localSize, nSubgroupSums, oobRowsNumList, - totalOOBRowsNum, nTrees, tree, oobRowsList)); - } - } - } - - return status; -} - -template -services::Status TreeLevelBuildHelperOneAPI::getNumOfSplitNodes(const UniversalBuffer & nodeList, size_t nNodes, - size_t & nSplitNodes) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.getNumOfSplitNodes); - - services::Status status; - - DAAL_ASSERT_UNIVERSAL_BUFFER(nodeList, int32_t, nNodes * _nNodeProps); - - auto & context = services::internal::getDefaultContext(); - - auto & kernel = kernelGetNumOfSplitNodes; - - auto bufNSplitNodes = context.allocate(TypeIds::id(), 1, status); - DAAL_CHECK_STATUS_VAR(status); - - { - DAAL_ASSERT(nNodes <= _int32max); - - KernelArguments args(3, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, nodeList, AccessModeIds::read); - args.set(1, static_cast(nNodes)); - args.set(2, bufNSplitNodes, AccessModeIds::write); - - size_t localSize = _preferableSubGroup; - - // will add more range for it - KernelRange local_range(localSize); - KernelRange global_range(localSize); - - KernelNDRange range(1); - range.local(local_range, status); - DAAL_CHECK_STATUS_VAR(status); - range.global(global_range, status); - DAAL_CHECK_STATUS_VAR(status); - - context.run(range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - } - - auto bufNsplitNodesHost = bufNSplitNodes.template get().toHost(ReadWriteMode::readOnly, status); - DAAL_CHECK_STATUS_VAR(status); - nSplitNodes = bufNsplitNodesHost.get()[0]; - - return status; -} - -template -services::Status TreeLevelBuildHelperOneAPI::convertSplitToLeaf(UniversalBuffer & nodeList, size_t nNodes) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.convertSplitToLeaf); - - services::Status status; - - DAAL_ASSERT_UNIVERSAL_BUFFER(nodeList, int32_t, nNodes * 
_nNodeProps); - - auto & context = services::internal::getDefaultContext(); - - auto & kernel = kernelConvertSplitToLeaf; - - { - KernelArguments args(1, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, nodeList, AccessModeIds::readwrite); - - KernelRange global_range(nNodes); - - context.run(global_range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -template -services::Status TreeLevelBuildHelperOneAPI::doNodesSplit(const UniversalBuffer & nodeList, size_t nNodes, - UniversalBuffer & nodeListNew, size_t nNodesNew, - const UniversalBuffer & nodeVsTreeMap, UniversalBuffer & nodeVsTreeMapNew) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.doNodesSplit); - - /*split rows for each nodes in accordance with best split info*/ - - services::Status status; - - DAAL_ASSERT_UNIVERSAL_BUFFER(nodeList, int32_t, nNodes * _nNodeProps); - DAAL_ASSERT_UNIVERSAL_BUFFER(nodeListNew, int32_t, nNodesNew * _nNodeProps); - DAAL_ASSERT_UNIVERSAL_BUFFER(nodeVsTreeMap, int32_t, nNodes); - DAAL_ASSERT_UNIVERSAL_BUFFER(nodeVsTreeMapNew, int32_t, nNodesNew); - - auto & context = services::internal::getDefaultContext(); - - auto & kernel = kernelDoNodesSplit; - - { - DAAL_ASSERT(nNodes <= _int32max); - - KernelArguments args(5, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, nodeList, AccessModeIds::read); - args.set(1, static_cast(nNodes)); - args.set(2, nodeListNew, AccessModeIds::write); - args.set(3, nodeVsTreeMap, AccessModeIds::read); - args.set(4, nodeVsTreeMapNew, AccessModeIds::write); - - size_t localSize = _preferableSubGroup; - - // will add more range for it - KernelRange local_range(localSize); - KernelRange global_range(localSize); - - KernelNDRange range(1); - range.local(local_range, status); - DAAL_CHECK_STATUS_VAR(status); - range.global(global_range, status); - DAAL_CHECK_STATUS_VAR(status); - - context.run(range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -template -services::Status 
TreeLevelBuildHelperOneAPI::splitNodeListOnGroupsBySize(const UniversalBuffer & nodeList, size_t nNodes, - UniversalBuffer & nodesGroups, const size_t nGroups, - const size_t nGroupProps, UniversalBuffer & nodeIndices) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.splitNodeListOnGroupsBySize); - - services::Status status; - - DAAL_ASSERT_UNIVERSAL_BUFFER(nodeList, int32_t, nNodes * _nNodeProps); - DAAL_ASSERT_UNIVERSAL_BUFFER(nodeIndices, int32_t, nNodes); - DAAL_ASSERT_UNIVERSAL_BUFFER(nodesGroups, int32_t, nGroups * nGroupProps); - - auto & context = services::internal::getDefaultContext(); - - auto & kernel = kernelSplitNodeListOnGroupsBySize; - - { - DAAL_ASSERT(nNodes <= _int32max); - DAAL_ASSERT(_minRowsBlock <= _int32max); - - KernelArguments args(5, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, nodeList, AccessModeIds::read); - args.set(1, static_cast(nNodes)); - args.set(2, nodesGroups, AccessModeIds::write); - args.set(3, nodeIndices, AccessModeIds::write); - args.set(4, static_cast(_minRowsBlock)); - - size_t localSize = _preferableSubGroup; - - KernelRange local_range(localSize); - KernelRange global_range(localSize); - - KernelNDRange range(1); - range.local(local_range, status); - DAAL_CHECK_STATUS_VAR(status); - range.global(global_range, status); - DAAL_CHECK_STATUS_VAR(status); - - context.run(range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -template -services::Status TreeLevelBuildHelperOneAPI::doLevelPartition(const UniversalBuffer & data, UniversalBuffer & nodeList, - size_t nNodes, UniversalBuffer & treeOrder, - UniversalBuffer & treeOrderBuf, size_t nRows, size_t nFeatures) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.doLevelPartition); - - services::Status status; - - DAAL_ASSERT_UNIVERSAL_BUFFER(data, uint32_t, nRows * nFeatures); - DAAL_ASSERT_UNIVERSAL_BUFFER(nodeList, int32_t, nNodes * _nNodeProps); - DAAL_ASSERT_UNIVERSAL_BUFFER(treeOrder, int32_t, nRows); - 
DAAL_ASSERT_UNIVERSAL_BUFFER(treeOrderBuf, int32_t, nRows); - - auto & context = services::internal::getDefaultContext(); - - auto & kernel = kernelDoLevelPartitionByGroups; - - { - DAAL_ASSERT(nNodes <= _int32max); - DAAL_ASSERT(nFeatures <= _int32max); - - // nNodes * _partitionMaxBlocksNum is used inside kernel - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(int32_t, nNodes, _partitionMaxBlocksNum); - - // nodeAuxList is auxilliary buffer for synchronization of left and right boundaries of blocks (nElemsToLeft, nElemsToRight) - // processed by subgroups in the same node - // no mul overflow check is required due to there is already buffer of size nNodes * _nNodeProps - DAAL_ASSERT(_auxNodeBufferProps <= _nNodeProps); - - auto nodeAuxList = context.allocate(TypeIds::id(), nNodes * _auxNodeBufferProps, status); - DAAL_CHECK_STATUS_VAR(status); - context.fill(nodeAuxList, 0, status); - DAAL_CHECK_STATUS_VAR(status); - - KernelArguments args(7, status); - DAAL_CHECK_STATUS_VAR(status); - - args.set(0, data, AccessModeIds::read); - args.set(1, nodeList, AccessModeIds::read); - args.set(2, nodeAuxList, AccessModeIds::readwrite); - args.set(3, treeOrder, AccessModeIds::read); - args.set(4, treeOrderBuf, AccessModeIds::write); - args.set(5, static_cast(nNodes)); - args.set(6, static_cast(nFeatures)); - - const size_t localSize = _preferablePartitionGroupSize; - - KernelRange local_range(localSize); - KernelRange global_range(localSize * _preferablePartitionGroupsNum); - - KernelNDRange range(1); - range.global(global_range, status); - DAAL_CHECK_STATUS_VAR(status); - range.local(local_range, status); - DAAL_CHECK_STATUS_VAR(status); - - context.run(range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - } - - swap(treeOrder, treeOrderBuf); - - return status; -} - -template -services::Status TreeLevelBuildHelperOneAPI::partitionCopy(UniversalBuffer & treeOrderBuf, UniversalBuffer & treeOrder, - size_t iStart, size_t nRows) -{ - 
DAAL_ITTNOTIFY_SCOPED_TASK(compute.partitionCopy); - - services::Status status; - - DAAL_ASSERT_UNIVERSAL_BUFFER(treeOrder, int32_t, nRows); - DAAL_ASSERT_UNIVERSAL_BUFFER(treeOrderBuf, int32_t, nRows); - - auto & context = services::internal::getDefaultContext(); - - auto & kernel = kernelPartitionCopy; - - { - DAAL_ASSERT(iStart <= _int32max); - - KernelArguments args(3, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, treeOrderBuf, AccessModeIds::read); - args.set(1, treeOrder, AccessModeIds::write); - args.set(2, static_cast(iStart)); - - KernelRange global_range(nRows); - - context.run(global_range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -template -services::Status TreeLevelBuildHelperOneAPI::updateMDIVarImportance(const UniversalBuffer & nodeList, - const UniversalBuffer & nodeImpDecreaseList, size_t nNodes, - services::internal::Buffer & varImp, - size_t nFeatures) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.updateMDIVarImportance); - - services::Status status; - - DAAL_ASSERT(varImp.size() == nFeatures); - - DAAL_ASSERT_UNIVERSAL_BUFFER(nodeList, int32_t, nNodes * _nNodeProps); - DAAL_ASSERT_UNIVERSAL_BUFFER(nodeImpDecreaseList, algorithmFPType, nNodes); - - auto & context = services::internal::getDefaultContext(); - - auto & kernel = kernelUpdateMDIVarImportance; - - { - DAAL_ASSERT(nNodes <= _int32max); - - KernelArguments args(4, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, nodeList, AccessModeIds::read); - args.set(1, nodeImpDecreaseList, AccessModeIds::read); - args.set(2, static_cast(nNodes)); - args.set(3, varImp, AccessModeIds::write); - - int localSize = _preferableGroupSize; - //calculating local size in way to have all subgroups for node in one group to use local buffer - while (localSize > nNodes && localSize > _preferableSubGroup) - { - localSize >>= 1; - } - - KernelRange local_range(localSize, 1); - KernelRange global_range(localSize, nFeatures); - - KernelNDRange range(2); - 
range.local(local_range, status); - DAAL_CHECK_STATUS_VAR(status); - range.global(global_range, status); - DAAL_CHECK_STATUS_VAR(status); - - context.run(range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -/////////////////////////////////////////////////////////////////////////////////////////// -/* init method for TreeLevelBuildHelperOneAPI */ -/////////////////////////////////////////////////////////////////////////////////////////// -template -services::Status TreeLevelBuildHelperOneAPI::init(const char * buildOptions, size_t nNodeProps) -{ - services::Status status; - - _nNodeProps = nNodeProps; - - auto & context = Environment::getInstance()->getDefaultExecutionContext(); - auto & kernel_factory = context.getClKernelFactory(); - - DAAL_CHECK_STATUS_VAR(buildProgram(kernel_factory, buildOptions)); - - kernelInitializeTreeOrder = kernel_factory.getKernel("initializeTreeOrder", status); - kernelPartitionCopy = kernel_factory.getKernel("partitionCopy", status); - - kernelConvertSplitToLeaf = kernel_factory.getKernel("convertSplitToLeaf", status); - kernelGetNumOfSplitNodes = kernel_factory.getKernel("getNumOfSplitNodes", status); - kernelDoNodesSplit = kernel_factory.getKernel("doNodesSplit", status); - kernelDoLevelPartitionByGroups = kernel_factory.getKernel("doLevelPartitionByGroups", status); - kernelSplitNodeListOnGroupsBySize = kernel_factory.getKernel("splitNodeListOnGroupsBySize", status); - - kernelMarkPresentRows = kernel_factory.getKernel("markPresentRows", status); - kernelCountAbsentRowsForBlocks = kernel_factory.getKernel("countAbsentRowsForBlocks", status); - kernelCountAbsentRowsTotal = kernel_factory.getKernel("countAbsentRowsTotal", status); - kernelFillOOBRowsListByBlocks = kernel_factory.getKernel("fillOOBRowsListByBlocks", status); - kernelUpdateMDIVarImportance = kernel_factory.getKernel("updateMDIVarImportance", status); - - DAAL_CHECK_STATUS_VAR(status); - - return status; -} - -} /* namespace 
internal */ -} /* namespace decision_forest */ -} /* namespace algorithms */ -} /* namespace daal */ diff --git a/cpp/daal/src/algorithms/dtrees/forest/regression/BUILD b/cpp/daal/src/algorithms/dtrees/forest/regression/BUILD index fcea30453c9..f20959123c4 100644 --- a/cpp/daal/src/algorithms/dtrees/forest/regression/BUILD +++ b/cpp/daal/src/algorithms/dtrees/forest/regression/BUILD @@ -4,10 +4,9 @@ load("@onedal//dev/bazel:daal.bzl", "daal_module") daal_module( name = "kernel", auto = True, - opencl = True, deps = [ "@onedal//cpp/daal:core", - "@onedal//cpp/daal:sycl", + "@onedal//cpp/daal:engines", "@onedal//cpp/daal/src/algorithms/regression:kernel", "@onedal//cpp/daal/src/algorithms/dtrees:kernel", "@onedal//cpp/daal/src/algorithms/dtrees/forest:kernel", diff --git a/cpp/daal/src/algorithms/dtrees/forest/regression/df_regression_predict_dense_default_batch_container.h b/cpp/daal/src/algorithms/dtrees/forest/regression/df_regression_predict_dense_default_batch_container.h index b151e0c056b..8eb96a21845 100644 --- a/cpp/daal/src/algorithms/dtrees/forest/regression/df_regression_predict_dense_default_batch_container.h +++ b/cpp/daal/src/algorithms/dtrees/forest/regression/df_regression_predict_dense_default_batch_container.h @@ -25,7 +25,6 @@ #include "algorithms/decision_forest/decision_forest_regression_predict.h" #include "src/algorithms/dtrees/forest/regression/df_regression_predict_dense_default_batch.h" -#include "src/algorithms/dtrees/forest/regression/oneapi/df_regression_predict_dense_kernel_oneapi.h" #include "src/services/service_algo_utils.h" namespace daal @@ -41,17 +40,7 @@ namespace prediction template BatchContainer::BatchContainer(daal::services::Environment::env * daalEnv) : PredictionContainerIface() { - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - - if (!deviceInfo.isCpu) - { - __DAAL_INITIALIZE_KERNELS_SYCL(internal::PredictKernelOneAPI, algorithmFPType, method); - } - else - { - 
__DAAL_INITIALIZE_KERNELS(internal::PredictKernel, algorithmFPType, method); - } + __DAAL_INITIALIZE_KERNELS(internal::PredictKernel, algorithmFPType, method); } template @@ -63,9 +52,6 @@ BatchContainer::~BatchContainer() template services::Status BatchContainer::compute() { - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - Input * input = static_cast(_in); Result * result = static_cast(_res); @@ -76,16 +62,8 @@ services::Status BatchContainer::compute() daal::services::Environment::env & env = *_env; - if (!deviceInfo.isCpu) - { - __DAAL_CALL_KERNEL_SYCL(env, internal::PredictKernelOneAPI, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, - daal::services::internal::hostApp(*input), a, m, r); - } - else - { - __DAAL_CALL_KERNEL(env, internal::PredictKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, - daal::services::internal::hostApp(*input), a, m, r); - } + __DAAL_CALL_KERNEL(env, internal::PredictKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, + daal::services::internal::hostApp(*input), a, m, r); } } // namespace prediction diff --git a/cpp/daal/src/algorithms/dtrees/forest/regression/df_regression_predict_dense_default_batch_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/dtrees/forest/regression/df_regression_predict_dense_default_batch_fpt_dispatcher.cpp index ed0d2805885..3cdae850423 100644 --- a/cpp/daal/src/algorithms/dtrees/forest/regression/df_regression_predict_dense_default_batch_fpt_dispatcher.cpp +++ b/cpp/daal/src/algorithms/dtrees/forest/regression/df_regression_predict_dense_default_batch_fpt_dispatcher.cpp @@ -29,7 +29,7 @@ namespace daal { namespace algorithms { -__DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL(decision_forest::regression::prediction::BatchContainer, batch, DAAL_FPTYPE, - decision_forest::regression::prediction::defaultDense) +__DAAL_INSTANTIATE_DISPATCH_CONTAINER(decision_forest::regression::prediction::BatchContainer, batch, 
DAAL_FPTYPE, + decision_forest::regression::prediction::defaultDense) } } // namespace daal diff --git a/cpp/daal/src/algorithms/dtrees/forest/regression/df_regression_predict_dense_default_batch_oneapi_fpt.cpp b/cpp/daal/src/algorithms/dtrees/forest/regression/df_regression_predict_dense_default_batch_oneapi_fpt.cpp deleted file mode 100644 index 7ab64b2b09f..00000000000 --- a/cpp/daal/src/algorithms/dtrees/forest/regression/df_regression_predict_dense_default_batch_oneapi_fpt.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/* file: df_regression_predict_dense_default_batch_oneapi_fpt.cpp */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of prediction stage of decision forest algorithm for GPU. 
-//-- -*/ - -#include "src/algorithms/dtrees/forest/regression/oneapi/df_regression_predict_dense_kernel_oneapi.h" -#include "src/algorithms/dtrees/forest/regression/oneapi/df_regression_predict_dense_oneapi_impl.i" - -namespace daal -{ -namespace algorithms -{ -namespace decision_forest -{ -namespace regression -{ -namespace prediction -{ -namespace internal -{ -template class DAAL_EXPORT PredictKernelOneAPI; -} -} // namespace prediction -} // namespace regression -} // namespace decision_forest -} // namespace algorithms -} // namespace daal diff --git a/cpp/daal/src/algorithms/dtrees/forest/regression/df_regression_train_container.h b/cpp/daal/src/algorithms/dtrees/forest/regression/df_regression_train_container.h index 86474a7d83f..5a63c1f79a1 100644 --- a/cpp/daal/src/algorithms/dtrees/forest/regression/df_regression_train_container.h +++ b/cpp/daal/src/algorithms/dtrees/forest/regression/df_regression_train_container.h @@ -28,7 +28,6 @@ #include "algorithms/decision_forest/decision_forest_regression_training_types.h" #include "algorithms/decision_forest/decision_forest_regression_training_batch.h" #include "src/algorithms/dtrees/forest/regression/df_regression_train_kernel.h" -#include "src/algorithms/dtrees/forest/regression/oneapi/df_regression_train_hist_kernel_oneapi.h" #include "src/algorithms/dtrees/forest/regression/df_regression_model_impl.h" #include "src/services/service_algo_utils.h" @@ -47,17 +46,7 @@ namespace interface2 template BatchContainer::BatchContainer(daal::services::Environment::env * daalEnv) { - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - - if (method == hist && !deviceInfo.isCpu) - { - __DAAL_INITIALIZE_KERNELS_SYCL(internal::RegressionTrainBatchKernelOneAPI, algorithmFPType, method); - } - else - { - __DAAL_INITIALIZE_KERNELS(internal::RegressionTrainBatchKernel, algorithmFPType, method); - } + __DAAL_INITIALIZE_KERNELS(internal::RegressionTrainBatchKernel, 
algorithmFPType, method); } template @@ -69,9 +58,6 @@ BatchContainer::~BatchContainer() template services::Status BatchContainer::compute() { - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - Input * input = static_cast(_in); Result * result = static_cast(_res); @@ -84,16 +70,8 @@ services::Status BatchContainer::compute() const decision_forest::regression::training::Parameter * par = static_cast(_par); daal::services::Environment::env & env = *_env; - if (method == hist && !deviceInfo.isCpu) - { - __DAAL_CALL_KERNEL_SYCL(env, internal::RegressionTrainBatchKernelOneAPI, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, - daal::services::internal::hostApp(*input), x, y, *m, *result, *par); - } - else - { - __DAAL_CALL_KERNEL(env, internal::RegressionTrainBatchKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, - daal::services::internal::hostApp(*input), x, y, w, *m, *result, *par); - } + __DAAL_CALL_KERNEL(env, internal::RegressionTrainBatchKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, + daal::services::internal::hostApp(*input), x, y, w, *m, *result, *par); } template diff --git a/cpp/daal/src/algorithms/dtrees/forest/regression/df_regression_train_dense_default_batch_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/dtrees/forest/regression/df_regression_train_dense_default_batch_fpt_dispatcher.cpp old mode 100755 new mode 100644 index 6baef0e77f4..1c51315ff75 --- a/cpp/daal/src/algorithms/dtrees/forest/regression/df_regression_train_dense_default_batch_fpt_dispatcher.cpp +++ b/cpp/daal/src/algorithms/dtrees/forest/regression/df_regression_train_dense_default_batch_fpt_dispatcher.cpp @@ -27,8 +27,8 @@ namespace daal { namespace algorithms { -__DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL(decision_forest::regression::training::BatchContainer, batch, DAAL_FPTYPE, - decision_forest::regression::training::defaultDense) 
+__DAAL_INSTANTIATE_DISPATCH_CONTAINER(decision_forest::regression::training::BatchContainer, batch, DAAL_FPTYPE, + decision_forest::regression::training::defaultDense) namespace decision_forest { namespace regression diff --git a/cpp/daal/src/algorithms/dtrees/forest/regression/df_regression_train_hist_batch_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/dtrees/forest/regression/df_regression_train_hist_batch_fpt_dispatcher.cpp old mode 100755 new mode 100644 index 7da03238add..d2ea86f3cbf --- a/cpp/daal/src/algorithms/dtrees/forest/regression/df_regression_train_hist_batch_fpt_dispatcher.cpp +++ b/cpp/daal/src/algorithms/dtrees/forest/regression/df_regression_train_hist_batch_fpt_dispatcher.cpp @@ -27,8 +27,8 @@ namespace daal { namespace algorithms { -__DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL_SAFE(decision_forest::regression::training::BatchContainer, batch, DAAL_FPTYPE, - decision_forest::regression::training::hist) +__DAAL_INSTANTIATE_DISPATCH_CONTAINER_SAFE(decision_forest::regression::training::BatchContainer, batch, DAAL_FPTYPE, + decision_forest::regression::training::hist) namespace decision_forest { namespace regression diff --git a/cpp/daal/src/algorithms/dtrees/forest/regression/df_regression_train_hist_batch_oneapi_fpt.cpp b/cpp/daal/src/algorithms/dtrees/forest/regression/df_regression_train_hist_batch_oneapi_fpt.cpp deleted file mode 100644 index 2c5336b19c0..00000000000 --- a/cpp/daal/src/algorithms/dtrees/forest/regression/df_regression_train_hist_batch_oneapi_fpt.cpp +++ /dev/null @@ -1,46 +0,0 @@ -/* file: df_regression_train_hist_batch_oneapi_fpt.cpp */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of decision forest regression training functions for the hist method -//-- -*/ - -#include "src/algorithms/dtrees/forest/regression/oneapi/df_regression_train_hist_kernel_oneapi.h" -#include "src/algorithms/dtrees/forest/regression/oneapi/df_regression_train_hist_oneapi_impl.i" - -namespace daal -{ -namespace algorithms -{ -namespace decision_forest -{ -namespace regression -{ -namespace training -{ -namespace internal -{ -template class DAAL_EXPORT RegressionTrainBatchKernelOneAPI; -} - -} // namespace training -} // namespace regression -} // namespace decision_forest -} // namespace algorithms -} // namespace daal diff --git a/cpp/daal/src/algorithms/dtrees/forest/regression/oneapi/cl_kernels/df_batch_predict_regression_kernels.cl b/cpp/daal/src/algorithms/dtrees/forest/regression/oneapi/cl_kernels/df_batch_predict_regression_kernels.cl deleted file mode 100644 index fb5a6710235..00000000000 --- a/cpp/daal/src/algorithms/dtrees/forest/regression/oneapi/cl_kernels/df_batch_predict_regression_kernels.cl +++ /dev/null @@ -1,113 +0,0 @@ -/* file: df_batch_regression_kernels.cl */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of decision forest Batch regression OpenCL kernels. -//-- -*/ - -#ifndef __DF_BATCH_PREDICT_REGRESSION_KERNELS_CL__ -#define __DF_BATCH_PREDICT_REGRESSION_KERNELS_CL__ - -#include - -#define DECLARE_SOURCE(name, src) static const char * name = #src; - -DECLARE_SOURCE( - df_batch_predict_regression_kernels, - __kernel void predictByTreesGroup(const __global algorithmFPType * data, const __global int * ftrIdx, - const __global int * classLabelsOrNextNodeIdx, const __global algorithmFPType * ftrValueOrResponse, - __global algorithmFPType * obsResponses, int nRows, int nCols, int nTrees, int maxTreeSize, int treeOffset) { - const int local_id = get_local_id(0); - const int local_size = get_local_size(0); - const int n_groups = get_num_groups(0); - const int group_id = get_group_id(0); - const int n_tree_groups = get_num_groups(1); - const int tree_group_id = get_group_id(1); - const int tree_id = treeOffset + tree_group_id; - const int leafMark = -1; - - const int nElementsForGroup = nRows / n_groups + !!(nRows % n_groups); - - const int iStart = group_id * nElementsForGroup; - int iEnd = (group_id + 1) * nElementsForGroup; - - iEnd = (iEnd > nRows) ? 
nRows : iEnd; - - if (tree_id < nTrees) - { - const __global int * ftrIdxForTree = ftrIdx + tree_id * maxTreeSize; - const __global int * classLabelsOrNextNodeIdxForTree = classLabelsOrNextNodeIdx + tree_id * maxTreeSize; - const __global algorithmFPType * ftrValueOrResponseForTree = ftrValueOrResponse + tree_id * maxTreeSize; - - uint treeRootIsSplit = (uint)(leafMark != ftrIdxForTree[0]); - - for (int i = iStart + local_id; i < iEnd; i += local_size) - { - uint obsCurrNodeForTree = 0; - uint obsSplitMarkForTree = treeRootIsSplit; - for (; obsSplitMarkForTree > 0;) - { - uint idx = obsSplitMarkForTree * ftrIdxForTree[obsCurrNodeForTree]; - uint sn = (uint)(data[i * nCols + idx] > ftrValueOrResponseForTree[obsCurrNodeForTree]); - obsCurrNodeForTree -= obsSplitMarkForTree * (obsCurrNodeForTree - (uint)classLabelsOrNextNodeIdxForTree[obsCurrNodeForTree] - sn); - obsSplitMarkForTree = (uint)(ftrIdxForTree[obsCurrNodeForTree] != leafMark); - } - obsResponses[i * n_tree_groups + tree_group_id] += ftrValueOrResponseForTree[obsCurrNodeForTree]; - } - } - } - - __kernel void reduceResponse(__global algorithmFPType * obsResponses, __global algorithmFPType * resObsResponse, int nRows, int nTrees, - algorithmFPType scale) { - const int group_id = get_group_id(0); - const int n_groups = get_num_groups(0); - const int sub_group_local_id = get_sub_group_local_id(); - const int sub_group_size = get_sub_group_size(); - - const int nElementsForGroup = nRows / n_groups + !!(nRows % n_groups); - - const int iStart = group_id * nElementsForGroup; - int iEnd = (group_id + 1) * nElementsForGroup; - - iEnd = (iEnd > nRows) ? nRows : iEnd; - - // obsResponses each row contains responses from each tree for this observation - // obsResponses[0] = obs0_resp0_from_tree0, obs0_resp1_from_tree1 ... - // obsResponses[1] = obs1_resp0_from_tree0, obs1_resp1_from_tree1 ... 
- - for (int rowIdx = iStart; rowIdx < iEnd; rowIdx++) - { - int resp_offset = rowIdx * nTrees; - - algorithmFPType resp_val = (algorithmFPType)0; - for (int i = sub_group_local_id; i < nTrees; i += sub_group_size) - { - resp_val += obsResponses[resp_offset + i]; - } - - resp_val = sub_group_reduce_add(resp_val); - - if (0 == sub_group_local_id) - { - resObsResponse[rowIdx] = resp_val * scale; - } - } - }); - -#endif diff --git a/cpp/daal/src/algorithms/dtrees/forest/regression/oneapi/cl_kernels/df_batch_regression_kernels.cl b/cpp/daal/src/algorithms/dtrees/forest/regression/oneapi/cl_kernels/df_batch_regression_kernels.cl deleted file mode 100644 index f44a6eca5d7..00000000000 --- a/cpp/daal/src/algorithms/dtrees/forest/regression/oneapi/cl_kernels/df_batch_regression_kernels.cl +++ /dev/null @@ -1,607 +0,0 @@ -/* file: df_batch_regression_kernels.cl */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of decision forest Batch Regression OpenCL kernels. 
-//-- -*/ - -#ifndef __DF_BATCH_REGRESSION_KERNELS_CL__ -#define __DF_BATCH_REGRESSION_KERNELS_CL__ - -#include - -#define DECLARE_SOURCE(name, src) static const char * name = #src; - -DECLARE_SOURCE( - df_batch_regression_kernels_part1, - - inline int fpEq(algorithmFPType a, algorithmFPType b) { return (int)(fabs(a - b) <= algorithmFPTypeAccuracy); } - - inline int fpGt(algorithmFPType a, algorithmFPType b) { return (int)((a - b) > algorithmFPTypeAccuracy); } - - // merge input arrs with stat values - void mergeStatArr(algorithmFPType * n, algorithmFPType * mean, algorithmFPType * sum2Cent, algorithmFPType * mrgN, algorithmFPType * mrgMean, - algorithmFPType * mrgSum2Cent) { - algorithmFPType sumN1N2 = n[0] + n[1]; - algorithmFPType mulN1N2 = n[0] * n[1]; - algorithmFPType deltaScale = mulN1N2 / sumN1N2; - algorithmFPType meanScale = (algorithmFPType)1 / sumN1N2; - algorithmFPType delta = mean[1] - mean[0]; - - *mrgSum2Cent = sum2Cent[0] + sum2Cent[1] + delta * delta * deltaScale; - *mrgMean = (mean[0] * n[0] + mean[1] * n[1]) * meanScale; - *mrgN = sumN1N2; - } - - // merge single value to stat - void mergeValToStat(algorithmFPType val, algorithmFPType * mrgN, algorithmFPType * mrgMean, algorithmFPType * mrgSum2Cent) { - *mrgN += (algorithmFPType)1; - algorithmFPType invN = ((algorithmFPType)1) / *mrgN; - algorithmFPType delta = val - *mrgMean; - *mrgMean += delta * invN; - *mrgSum2Cent += delta * (val - *mrgMean); - } - - __kernel void computeBestSplitSinglePass(const __global int * data, const __global int * treeOrder, const __global int * selectedFeatures, - int nSelectedFeatures, const __global algorithmFPType * response, const __global int * binOffsets, - __global int * nodeList, const __global int * nodeIndices, int nodeIndicesOffset, - __global algorithmFPType * splitInfo, __global algorithmFPType * nodeImpDecreaseList, - int updateImpDecreaseRequired, int nFeatures, int minObservationsInLeafNode, - algorithmFPType impurityThreshold) { - // this kernel is 
targeted for processing nodes with small number of rows - // nodeList will be updated with split attributes - // spliInfo will contain node impurity and mean - const int nNodeProp = NODE_PROPS; // num of node properties in nodeList - const int nImpProp = IMPURITY_PROPS; - const int leafMark = -1; - - const int local_id = get_local_id(0); - const int sub_group_local_id = get_sub_group_local_id(); - const int sub_group_size = get_sub_group_size(); - const int local_size = get_local_size(0); - const int n_sub_groups = local_size / sub_group_size; // num of subgroups for current node processing - const int sub_group_id = local_id / sub_group_size; - const int max_sub_groups_num = 16; //replace with define - - const int nodeIdx = get_global_id(1); - const int nodeId = nodeIndices[nodeIndicesOffset + nodeIdx]; - - const int rowsOffset = nodeList[nodeId * nNodeProp + 0]; - const int nRows = nodeList[nodeId * nNodeProp + 1]; - - // each sub group will process sub_group_size bins and produce 1 best split for it - const int maxBinsBlocks = max_sub_groups_num; - __local algorithmFPType bufI[maxBinsBlocks]; // storage for impurity decrease - __local int bufS[maxBinsBlocks * nNodeProp]; // storage for split info - - const algorithmFPType minImpDec = (algorithmFPType)-1e30; - algorithmFPType curImpDec = minImpDec; - int valNotFound = 1 << 30; - int curFeatureValue = leafMark; - int curFeatureId = leafMark; - - nodeList[nodeId * nNodeProp + 2] = curFeatureId; - nodeList[nodeId * nNodeProp + 3] = curFeatureValue; - nodeList[nodeId * nNodeProp + 4] = nRows; - - algorithmFPType mrgN = (algorithmFPType)0; - algorithmFPType mrgMean = (algorithmFPType)0; - algorithmFPType mrgSum2Cent = (algorithmFPType)0; - - algorithmFPType imp = (algorithmFPType)0; // node impurity - - algorithmFPType bestLN = (algorithmFPType)0; - - int totalBins = 0; - - // totalBins is calculated by each subgroup - for (int featIdx = sub_group_local_id; featIdx < nSelectedFeatures; featIdx += sub_group_size) - { - 
int featId = selectedFeatures[nodeId * nSelectedFeatures + featIdx]; - int nBins = binOffsets[featId + 1] - binOffsets[featId]; - totalBins += sub_group_reduce_add(nBins); - } - totalBins = sub_group_broadcast(totalBins, 0); - - int currFtrIdx = 0; - int featId = selectedFeatures[nodeId * nSelectedFeatures + currFtrIdx]; - int binId = 0; - int currFtrBins = binOffsets[featId + 1] - binOffsets[featId]; - int passedBins = 0; - - for (int i = local_id; i < totalBins; i += local_size) - { - while (i >= passedBins + currFtrBins) - { - passedBins += currFtrBins; - currFtrIdx++; - featId = selectedFeatures[nodeId * nSelectedFeatures + currFtrIdx]; - currFtrBins = binOffsets[featId + 1] - binOffsets[featId]; - } - binId = i - passedBins; - - mrgN = (algorithmFPType)0; - mrgMean = (algorithmFPType)0; - mrgSum2Cent = (algorithmFPType)0; - - algorithmFPType mrgLRN[2] = { (algorithmFPType)0 }; - algorithmFPType mrgLRMean[2] = { (algorithmFPType)0 }; - algorithmFPType mrgLRSum2Cent[2] = { (algorithmFPType)0 }; - - for (int row = 0; row < nRows; row++) - { - int id = treeOrder[rowsOffset + row]; - int bin = data[id * nFeatures + featId]; - - mergeValToStat(response[id], &mrgLRN[(int)(bin > binId)], &mrgLRMean[(int)(bin > binId)], &mrgLRSum2Cent[(int)(bin > binId)]); - } - - mergeStatArr(mrgLRN, mrgLRMean, mrgLRSum2Cent, &mrgN, &mrgMean, &mrgSum2Cent); - - algorithmFPType impDec = mrgSum2Cent - (mrgLRSum2Cent[0] + mrgLRSum2Cent[1]); - imp = mrgSum2Cent / mrgN; // mrgN isn't 0 due to it is num of rows in node - - if ((algorithmFPType)0 < impDec && (!fpEq(imp, (algorithmFPType)0)) && imp >= impurityThreshold - && (curFeatureValue == leafMark || fpGt(impDec, curImpDec) || (fpEq(impDec, curImpDec) && featId < curFeatureId)) - && mrgLRN[0] >= minObservationsInLeafNode && mrgLRN[1] >= minObservationsInLeafNode) - { - curFeatureId = featId; - curFeatureValue = binId; - curImpDec = impDec; - - bestLN = mrgLRN[0]; - } - } // for i - - algorithmFPType bestImpDec = 
sub_group_reduce_max(curImpDec); - - int impDecIsBest = fpEq(bestImpDec, curImpDec); - int bestFeatureId = sub_group_reduce_min(impDecIsBest ? curFeatureId : valNotFound); - int bestFeatureValue = sub_group_reduce_min((bestFeatureId == curFeatureId && impDecIsBest) ? curFeatureValue : valNotFound); - - bool noneSplitFoundBySubGroup = ((leafMark == bestFeatureId) && (0 == sub_group_local_id)); - bool mySplitIsBest = (leafMark != bestFeatureId && curFeatureId == bestFeatureId && curFeatureValue == bestFeatureValue); - if (noneSplitFoundBySubGroup || mySplitIsBest) - { - if (1 == n_sub_groups) - { - __global algorithmFPType * splitNodeInfo = splitInfo + nodeId * nImpProp; - nodeList[nodeId * nNodeProp + 2] = curFeatureId == valNotFound ? leafMark : curFeatureId; - nodeList[nodeId * nNodeProp + 3] = curFeatureValue == valNotFound ? leafMark : curFeatureValue; - nodeList[nodeId * nNodeProp + 4] = (int)bestLN; - - splitNodeInfo[0] = imp; - splitNodeInfo[1] = mrgMean; - - if (updateImpDecreaseRequired) nodeImpDecreaseList[nodeId] = curImpDec / mrgN; - } - else - { - bufS[sub_group_id * nNodeProp + 0] = curFeatureId; - bufS[sub_group_id * nNodeProp + 1] = curFeatureValue; - bufS[sub_group_id * nNodeProp + 2] = (int)bestLN; - - bufI[sub_group_id] = curImpDec; - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - - if (1 < n_sub_groups && 0 == sub_group_id) - { - // first sub group for current node reduces over local buffer if required - algorithmFPType curImpDec = (sub_group_local_id < n_sub_groups) ? bufI[sub_group_local_id] : minImpDec; - - int curFeatureId = sub_group_local_id < n_sub_groups ? bufS[sub_group_local_id * nNodeProp + 0] : valNotFound; - int curFeatureValue = sub_group_local_id < n_sub_groups ? bufS[sub_group_local_id * nNodeProp + 1] : valNotFound; - int LN = sub_group_local_id < n_sub_groups ? 
bufS[sub_group_local_id * nNodeProp + 2] : 0; - - for (int i = sub_group_size + sub_group_local_id; i < n_sub_groups; i += sub_group_size) - { - algorithmFPType impDec = bufI[i]; - int featId = bufS[i * nNodeProp + 0]; - int featVal = bufS[i * nNodeProp + 1]; - int tLN = bufS[i * nNodeProp + 2]; - if ((algorithmFPType)0 < impDec - && (curFeatureValue == leafMark || fpGt(impDec, curImpDec) - || (fpEq(impDec, curImpDec) && (featId < curFeatureId || (featId == curFeatureId && featVal < curFeatureValue))))) - { - curFeatureId = featId; - curFeatureValue = featVal; - curImpDec = impDec; - - LN = tLN; - } - } - // now all info in the range of one subgroup - - const algorithmFPType bestImpDec = sub_group_reduce_max(curImpDec); - - const int impDecIsBest = fpEq(bestImpDec, curImpDec); - const int bestFeatureId = sub_group_reduce_min(impDecIsBest ? curFeatureId : valNotFound); - const int bestFeatureValue = sub_group_reduce_min((bestFeatureId == curFeatureId && impDecIsBest) ? curFeatureValue : valNotFound); - - const bool noneSplitFoundBySubGroup = ((leafMark == bestFeatureId) && (0 == sub_group_local_id)); - const bool mySplitIsBest = (leafMark != bestFeatureId && curFeatureId == bestFeatureId && curFeatureValue == bestFeatureValue); - if (noneSplitFoundBySubGroup || mySplitIsBest) - { - __global algorithmFPType * splitNodeInfo = splitInfo + nodeId * nImpProp; - nodeList[nodeId * nNodeProp + 2] = curFeatureId == valNotFound ? leafMark : curFeatureId; - nodeList[nodeId * nNodeProp + 3] = curFeatureValue == valNotFound ? 
leafMark : curFeatureValue; - nodeList[nodeId * nNodeProp + 4] = (int)LN; - - splitNodeInfo[0] = imp; - splitNodeInfo[1] = mrgMean; - - if (updateImpDecreaseRequired) nodeImpDecreaseList[nodeId] = curImpDec / mrgN; - } - } - }); - -DECLARE_SOURCE( - df_batch_regression_kernels_part2, - - inline int fpEq(algorithmFPType a, algorithmFPType b) { return (int)(fabs(a - b) <= algorithmFPTypeAccuracy); } - - inline int fpGt(algorithmFPType a, algorithmFPType b) { return (int)((a - b) > algorithmFPTypeAccuracy); } - - // merge input arrs with stat values - void mergeStatArr(algorithmFPType * n, algorithmFPType * mean, algorithmFPType * sum2Cent, algorithmFPType * mrgN, algorithmFPType * mrgMean, - algorithmFPType * mrgSum2Cent) { - algorithmFPType sumN1N2 = n[0] + n[1]; - algorithmFPType mulN1N2 = n[0] * n[1]; - algorithmFPType deltaScale = mulN1N2 / sumN1N2; - algorithmFPType meanScale = (algorithmFPType)1 / sumN1N2; - algorithmFPType delta = mean[1] - mean[0]; - - *mrgSum2Cent = sum2Cent[0] + sum2Cent[1] + delta * delta * deltaScale; - *mrgMean = (mean[0] * n[0] + mean[1] * n[1]) * meanScale; - *mrgN = sumN1N2; - } - - // merge to stats in one - void mergeStat(algorithmFPType n, algorithmFPType mean, algorithmFPType sum2Cent, algorithmFPType * mrgLN, algorithmFPType * mrgLMean, - algorithmFPType * mrgLSum2Cent) { - algorithmFPType sumN1N2 = *mrgLN + n; - algorithmFPType mulN1N2 = *mrgLN * n; - algorithmFPType deltaScale = mulN1N2 / sumN1N2; - algorithmFPType meanScale = (algorithmFPType)1 / sumN1N2; - algorithmFPType delta = mean - *mrgLMean; - - *mrgLSum2Cent = *mrgLSum2Cent + sum2Cent + delta * delta * deltaScale; - *mrgLMean = (*mrgLMean * *mrgLN + mean * n) * meanScale; - *mrgLN = sumN1N2; - } - - __kernel void computeBestSplitByHistogram(const __global algorithmFPType * histograms, const __global int * selectedFeatures, - int nSelectedFeatures, const __global int * binOffsets, __global int * nodeList, - const __global int * nodeIndices, int nodeIndicesOffset, 
__global algorithmFPType * splitInfo, - __global algorithmFPType * nodeImpDecreaseList, int updateImpDecreaseRequired, int nMaxBinsAmongFtrs, - int minObservationsInLeafNode, algorithmFPType impurityThreshold) { - // this kernel has almost the same code as computeBestSplitSinglePass - // the difference is that here for each potential split point we pass through bins hist instead of rows - // nodeList will be updated with split attributes in this kernel - // spliInfo will contain node impurity and mean - const int nProp = HIST_PROPS; // num of characteristics in histogram - const int nNodeProp = NODE_PROPS; // num of node properties in nodeList - const int nImpProp = IMPURITY_PROPS; - const int local_id = get_local_id(0); - const int sub_group_local_id = get_sub_group_local_id(); - const int sub_group_size = get_sub_group_size(); - const int nodeIdx = get_global_id(1); - const int nodeId = nodeIndices[nodeIndicesOffset + nodeIdx]; - const int leafMark = -1; - - const int local_size = get_local_size(0); - const int n_sub_groups = local_size / sub_group_size; // num of subgroups for current node processing - const int sub_group_id = local_id / sub_group_size; - const int max_sub_groups_num = 16; //replace with define - - const int rowsOffset = nodeList[nodeId * nNodeProp + 0]; - const int nRows = nodeList[nodeId * nNodeProp + 1]; - - // each sub group will process sub_group_size bins and produce 1 best split for it - const int maxBinsBlocks = max_sub_groups_num; - __local algorithmFPType bufI[maxBinsBlocks]; // storage for impurity decrease - __local int bufS[maxBinsBlocks * nNodeProp]; // storage for split info - - const algorithmFPType minImpDec = (algorithmFPType)-1e30; - int valNotFound = 1 << 30; - int curFeatureValue = leafMark; - int curFeatureId = leafMark; - algorithmFPType curImpDec = minImpDec; - - nodeList[nodeId * nNodeProp + 2] = curFeatureId; - nodeList[nodeId * nNodeProp + 3] = curFeatureValue; - nodeList[nodeId * nNodeProp + 4] = nRows; - - 
algorithmFPType mrgN = (algorithmFPType)0; - algorithmFPType mrgMean = (algorithmFPType)0; - algorithmFPType mrgSum2Cent = (algorithmFPType)0; - - algorithmFPType imp = (algorithmFPType)0; // node impurity - - algorithmFPType bestLN = (algorithmFPType)0; - - int totalBins = 0; - - // totalBins is calculated by each subgroup - for (int featIdx = sub_group_local_id; featIdx < nSelectedFeatures; featIdx += sub_group_size) - { - int featId = selectedFeatures[nodeId * nSelectedFeatures + featIdx]; - int nBins = binOffsets[featId + 1] - binOffsets[featId]; - totalBins += sub_group_reduce_add(nBins); - } - - totalBins = sub_group_broadcast(totalBins, 0); - - int currFtrIdx = 0; - int featId = selectedFeatures[nodeId * nSelectedFeatures + currFtrIdx]; - int binId = 0; - int currFtrBins = binOffsets[featId + 1] - binOffsets[featId]; - int passedBins = 0; - - for (int i = local_id; i < totalBins; i += local_size) - { - while (i >= passedBins + currFtrBins) - { - passedBins += currFtrBins; - currFtrIdx++; - featId = selectedFeatures[nodeId * nSelectedFeatures + currFtrIdx]; - currFtrBins = binOffsets[featId + 1] - binOffsets[featId]; - } - binId = i - passedBins; - - const __global algorithmFPType * nodeHistogram = histograms + nodeIdx * nSelectedFeatures * nMaxBinsAmongFtrs * nProp; - const __global algorithmFPType * histogramForFeature = nodeHistogram + currFtrIdx * nMaxBinsAmongFtrs * nProp; - - // calculate merged statistics - mrgN = (algorithmFPType)0; - mrgMean = (algorithmFPType)0; - mrgSum2Cent = (algorithmFPType)0; - - algorithmFPType mrgLRN[2] = { (algorithmFPType)0 }; - algorithmFPType mrgLRMean[2] = { (algorithmFPType)0 }; - algorithmFPType mrgLRSum2Cent[2] = { (algorithmFPType)0 }; - - for (int tbin = 0; tbin < currFtrBins; tbin++) - { - int binOffset = tbin * nProp; - algorithmFPType n = histogramForFeature[binOffset + 0]; - - algorithmFPType mean = histogramForFeature[binOffset + 1]; - algorithmFPType sum2Cent = histogramForFeature[binOffset + 2]; - if 
((algorithmFPType)0 == n) continue; - - mergeStat(n, mean, sum2Cent, &mrgLRN[(int)(tbin > binId)], &mrgLRMean[(int)(tbin > binId)], &mrgLRSum2Cent[(int)(tbin > binId)]); - } - - mergeStatArr(mrgLRN, mrgLRMean, mrgLRSum2Cent, &mrgN, &mrgMean, &mrgSum2Cent); - - algorithmFPType impDec = mrgSum2Cent - (mrgLRSum2Cent[0] + mrgLRSum2Cent[1]); - imp = mrgSum2Cent / mrgN; // mrgN isn't 0 due to it is num of rows in node - - if ((algorithmFPType)0 < impDec && (!fpEq(imp, (algorithmFPType)0)) && imp >= impurityThreshold - && (curFeatureValue == leafMark || fpGt(impDec, curImpDec) || (fpEq(impDec, curImpDec) && featId < curFeatureId)) - && mrgLRN[0] >= minObservationsInLeafNode && mrgLRN[1] >= minObservationsInLeafNode) - { - curFeatureId = featId; - curFeatureValue = binId; - curImpDec = impDec; - - bestLN = mrgLRN[0]; - } - } // for i - - algorithmFPType bestImpDec = sub_group_reduce_max(curImpDec); - - int impDecIsBest = fpEq(bestImpDec, curImpDec); - int bestFeatureId = sub_group_reduce_min(impDecIsBest ? curFeatureId : valNotFound); - int bestFeatureValue = sub_group_reduce_min((bestFeatureId == curFeatureId && impDecIsBest) ? curFeatureValue : valNotFound); - - bool noneSplitFoundBySubGroup = ((leafMark == bestFeatureId) && (0 == sub_group_local_id)); - bool mySplitIsBest = (leafMark != bestFeatureId && curFeatureId == bestFeatureId && curFeatureValue == bestFeatureValue); - if (noneSplitFoundBySubGroup || mySplitIsBest) - { - if (1 == n_sub_groups) - { - __global algorithmFPType * splitNodeInfo = splitInfo + nodeId * nImpProp; - nodeList[nodeId * nNodeProp + 2] = curFeatureId == valNotFound ? leafMark : curFeatureId; - nodeList[nodeId * nNodeProp + 3] = curFeatureValue == valNotFound ? 
leafMark : curFeatureValue; - nodeList[nodeId * nNodeProp + 4] = (int)bestLN; - - splitNodeInfo[0] = imp; - splitNodeInfo[1] = mrgMean; - - if (updateImpDecreaseRequired) nodeImpDecreaseList[nodeId] = curImpDec / mrgN; - } - else - { - bufS[sub_group_id * nNodeProp + 0] = curFeatureId; - bufS[sub_group_id * nNodeProp + 1] = curFeatureValue; - bufS[sub_group_id * nNodeProp + 2] = (int)bestLN; - - bufI[sub_group_id] = curImpDec; - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - if (1 < n_sub_groups && 0 == sub_group_id) - { - // first sub group for current node reduces over local buffer if required - algorithmFPType curImpDec = (sub_group_local_id < n_sub_groups) ? bufI[sub_group_local_id] : minImpDec; - - int curFeatureId = sub_group_local_id < n_sub_groups ? bufS[sub_group_local_id * nNodeProp + 0] : valNotFound; - int curFeatureValue = sub_group_local_id < n_sub_groups ? bufS[sub_group_local_id * nNodeProp + 1] : valNotFound; - int LN = sub_group_local_id < n_sub_groups ? bufS[sub_group_local_id * nNodeProp + 2] : 0; - - for (int i = sub_group_size + sub_group_local_id; i < n_sub_groups; i += sub_group_size) - { - algorithmFPType impDec = bufI[i]; - int featId = bufS[i * nNodeProp + 0]; - int featVal = bufS[i * nNodeProp + 1]; - int tLN = bufS[i * nNodeProp + 2]; - if ((algorithmFPType)0 < impDec - && (curFeatureValue == leafMark || fpGt(impDec, curImpDec) - || (fpEq(impDec, curImpDec) && (featId < curFeatureId || (featId == curFeatureId && featVal < curFeatureValue))))) - { - curFeatureId = featId; - curFeatureValue = featVal; - curImpDec = impDec; - - LN = tLN; - } - } - // now all info in the range of one subgroup - - const algorithmFPType bestImpDec = sub_group_reduce_max(curImpDec); - - const int impDecIsBest = fpEq(bestImpDec, curImpDec); - const int bestFeatureId = sub_group_reduce_min(impDecIsBest ? curFeatureId : valNotFound); - const int bestFeatureValue = sub_group_reduce_min((bestFeatureId == curFeatureId && impDecIsBest) ? 
curFeatureValue : valNotFound); - - const bool noneSplitFoundBySubGroup = ((leafMark == bestFeatureId) && (0 == sub_group_local_id)); - const bool mySplitIsBest = (leafMark != bestFeatureId && curFeatureId == bestFeatureId && curFeatureValue == bestFeatureValue); - if (noneSplitFoundBySubGroup || mySplitIsBest) - { - __global algorithmFPType * splitNodeInfo = splitInfo + nodeId * nImpProp; - nodeList[nodeId * nNodeProp + 2] = curFeatureId == valNotFound ? leafMark : curFeatureId; - nodeList[nodeId * nNodeProp + 3] = curFeatureValue == valNotFound ? leafMark : curFeatureValue; - nodeList[nodeId * nNodeProp + 4] = (int)LN; - - splitNodeInfo[0] = imp; - splitNodeInfo[1] = mrgMean; - - if (updateImpDecreaseRequired) nodeImpDecreaseList[nodeId] = curImpDec / mrgN; - } - } - } - - __kernel void computePartialHistograms(const __global int * data, const __global int * treeOrder, const __global int * nodeList, - const __global int * nodeIndices, int nodeIndicesOffset, const __global int * selectedFeatures, - const __global algorithmFPType * response, const __global int * binOffsets, int nMaxBinsAmongFtrs, - int nFeatures, __global algorithmFPType * partialHistograms, int nSelectedFeatures) { - const int nProp = HIST_PROPS; // num of characteristics in histogram - const int nNodeProp = NODE_PROPS; // num of node properties in nodeOffsets - - const int nodeIdx = get_global_id(1); - const int nodeId = nodeIndices[nodeIndicesOffset + nodeIdx]; - const int ftrGrpIdx = get_local_id(0); - const int ftrGrpSize = get_local_size(0); - const int nPartHist = get_num_groups(0); - const int histIdx = get_group_id(0); - - const int rowsOffset = nodeList[nodeId * nNodeProp + 0]; - const int nRows = nodeList[nodeId * nNodeProp + 1]; - - const int nElementsForGroup = nRows / nPartHist + !!(nRows % nPartHist); - - int iStart = histIdx * nElementsForGroup; - int iEnd = (histIdx + 1) * nElementsForGroup; - - iEnd = (iEnd > nRows) ? 
nRows : iEnd; - - for (int i = iStart; i < iEnd; i++) - { - int id = treeOrder[rowsOffset + i]; - for (int featIdx = ftrGrpIdx; featIdx < nSelectedFeatures; featIdx += ftrGrpSize) - { - const int featId = selectedFeatures[nodeId * nSelectedFeatures + featIdx]; - - __global algorithmFPType * histogram = - partialHistograms + ((nodeIdx * nPartHist + histIdx) * nSelectedFeatures + featIdx) * nMaxBinsAmongFtrs * nProp; - - int bin = data[id * nFeatures + featId]; - - histogram[bin * nProp + 0] += 1.0; // N + 1 - algorithmFPType invN = ((algorithmFPType)1) / histogram[bin * nProp + 0]; - algorithmFPType delta = response[id] - histogram[bin * nProp + 1]; // y[i] - mean - histogram[bin * nProp + 1] += delta * invN; // updated mean - histogram[bin * nProp + 2] += delta * (response[id] - histogram[bin * nProp + 1]); // updated sum2Cent - } - } - } - - __kernel void reducePartialHistograms(const __global algorithmFPType * partialHistograms, __global algorithmFPType * histograms, - int nPartialHistograms, int nSelectedFeatures, int nMaxBinsAmongFtrs) { - const int nProp = HIST_PROPS; // num of characteristics in histogram - __local algorithmFPType buf[LOCAL_BUFFER_SIZE * nProp]; - - const int nodeIdx = get_global_id(2); - const int binId = get_global_id(0); - const int local_id = get_local_id(1); - const int local_size = get_local_size(1); - - buf[local_id * nProp + 0] = (algorithmFPType)0; - buf[local_id * nProp + 1] = (algorithmFPType)0; - buf[local_id * nProp + 2] = (algorithmFPType)0; - - algorithmFPType mrgN = (algorithmFPType)0; - algorithmFPType mrgMean = (algorithmFPType)0; - algorithmFPType mrgSum2Cent = (algorithmFPType)0; - - const __global algorithmFPType * nodePartialHistograms = - partialHistograms + nodeIdx * nPartialHistograms * nSelectedFeatures * nMaxBinsAmongFtrs * nProp; - __global algorithmFPType * nodeHistogram = histograms + nodeIdx * nSelectedFeatures * nMaxBinsAmongFtrs * nProp; - - for (int i = local_id; i < nPartialHistograms; i += local_size) - { - 
int offset = i * nSelectedFeatures * nMaxBinsAmongFtrs * nProp + binId * nProp; - algorithmFPType n = nodePartialHistograms[offset + 0]; - - if ((algorithmFPType)0 == n) continue; - - algorithmFPType mean = nodePartialHistograms[offset + 1]; - algorithmFPType sum2Cent = nodePartialHistograms[offset + 2]; - - mergeStat(n, mean, sum2Cent, &mrgN, &mrgMean, &mrgSum2Cent); - - buf[local_id * nProp + 0] += n; - buf[local_id * nProp + 1] = mrgMean; - buf[local_id * nProp + 2] = mrgSum2Cent; - } - - for (int offset = local_size / 2; offset > 0; offset >>= 1) - { - barrier(CLK_LOCAL_MEM_FENCE); - if (local_id < offset) - { - algorithmFPType n = buf[(local_id + offset) * nProp + 0]; - if ((algorithmFPType)0 == n) continue; - algorithmFPType mean = buf[(local_id + offset) * nProp + 1]; - algorithmFPType sum2Cent = buf[(local_id + offset) * nProp + 2]; - - mergeStat(n, mean, sum2Cent, &mrgN, &mrgMean, &mrgSum2Cent); - - buf[local_id * nProp + 0] += n; - buf[local_id * nProp + 1] = mrgMean; - buf[local_id * nProp + 2] = mrgSum2Cent; - } - } - - if (local_id == 0) - { - // item 0 collects all results in private vars - nodeHistogram[binId * nProp + 0] = mrgN; - nodeHistogram[binId * nProp + 1] = mrgMean; - nodeHistogram[binId * nProp + 2] = mrgSum2Cent; - } - }); - -#endif diff --git a/cpp/daal/src/algorithms/dtrees/forest/regression/oneapi/df_regression_predict_dense_kernel_oneapi.h b/cpp/daal/src/algorithms/dtrees/forest/regression/oneapi/df_regression_predict_dense_kernel_oneapi.h deleted file mode 100644 index 7956e168bfc..00000000000 --- a/cpp/daal/src/algorithms/dtrees/forest/regression/oneapi/df_regression_predict_dense_kernel_oneapi.h +++ /dev/null @@ -1,109 +0,0 @@ -/* file: df_regression_predict_dense_kernel_oneapi.h */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the 
License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Declaration of structure containing kernels for decision forest -// prediction for GPU for the dense method. -//-- -*/ - -#ifndef __DF_REGRESSION_PREDICT_DENSE_KERNEL_ONEAPI_H__ -#define __DF_REGRESSION_PREDICT_DENSE_KERNEL_ONEAPI_H__ - -#include "services/internal/sycl/types.h" -#include "services/internal/sycl/execution_context.h" -#include "data_management/data/numeric_table.h" -#include "algorithms/algorithm_base_common.h" -#include "algorithms/decision_forest/decision_forest_regression_predict.h" -#include "src/algorithms/dtrees/forest/regression/df_regression_model_impl.h" -#include "algorithms/decision_forest/decision_forest_regression_model.h" -#include "src/services/service_data_utils.h" - -namespace daal -{ -namespace algorithms -{ -namespace decision_forest -{ -namespace regression -{ -namespace prediction -{ -namespace internal -{ -template -class PredictKernelOneAPI : public algorithms::Kernel -{ -public: - PredictKernelOneAPI() : _nTreeGroups(0) {}; - PredictKernelOneAPI(const PredictKernelOneAPI &) = delete; - PredictKernelOneAPI & operator=(const PredictKernelOneAPI &) = delete; - ~PredictKernelOneAPI() {}; - - services::Status buildProgram(services::internal::sycl::ClKernelFactoryIface & factory, const char * programName, const char * programSrc); - services::Status compute(services::HostAppIface * const pHostApp, const data_management::NumericTable * a, - const decision_forest::regression::Model * const m, 
data_management::NumericTable * const r); - services::Status predictByAllTrees(const services::internal::Buffer & srcBuffer, - const decision_forest::regression::Model * const m, - services::internal::Buffer & resObsResponse, size_t nRows, size_t nCols); - services::Status predictByTreesGroup(const services::internal::Buffer & srcBuffer, - const services::internal::sycl::UniversalBuffer & featureIndexList, - const services::internal::sycl::UniversalBuffer & leftOrClassTypeList, - const services::internal::sycl::UniversalBuffer & featureValueList, - services::internal::sycl::UniversalBuffer & obsResponses, size_t nRows, size_t nCols, size_t nTrees, - size_t maxTreeSize); - services::Status reduceResponse(const services::internal::sycl::UniversalBuffer & obsResponses, - services::internal::Buffer & resObsResponse, size_t nRows, size_t nTrees, algorithmFPType scale); - -private: - const uint32_t _preferableSubGroup = 16; // preferable maximal sub-group size - const uint32_t _maxLocalSize = 128; - const uint32_t _maxGroupsNum = 256; - - // following constants showed best performance on benchmark's datasets - const size_t _nRowsLarge = 500000; - const size_t _nRowsMedium = 100000; - - const size_t _nRowsBlocksForLarge = 16; - const size_t _nRowsBlocksForMedium = 8; - - const size_t _nTreesLarge = 192; - const size_t _nTreesMedium = 48; - const size_t _nTreesSmall = 12; - - const size_t _nTreeGroupsForLarge = 128; - const size_t _nTreeGroupsForMedium = 32; - const size_t _nTreeGroupsForSmall = 16; - const size_t _nTreeGroupsMin = 8; - - static constexpr size_t _int32max = static_cast(services::internal::MaxVal::get()); - - size_t _nTreeGroups; - - services::internal::sycl::KernelPtr kernelPredictByTreesGroup; - services::internal::sycl::KernelPtr kernelReduceResponse; -}; - -} // namespace internal -} // namespace prediction -} // namespace regression -} // namespace decision_forest -} // namespace algorithms -} // namespace daal - -#endif diff --git 
a/cpp/daal/src/algorithms/dtrees/forest/regression/oneapi/df_regression_predict_dense_oneapi_impl.i b/cpp/daal/src/algorithms/dtrees/forest/regression/oneapi/df_regression_predict_dense_oneapi_impl.i deleted file mode 100644 index 7375333259a..00000000000 --- a/cpp/daal/src/algorithms/dtrees/forest/regression/oneapi/df_regression_predict_dense_oneapi_impl.i +++ /dev/null @@ -1,366 +0,0 @@ -/* file: df_regression_predict_dense_oneapi_impl.i */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of auxiliary functions for decision forest regression -// hist method. 
-//-- -*/ - -#ifndef __DF_REGRESSION_PREDICT_DENSE_ONEAPI_IMPL_I__ -#define __DF_REGRESSION_PREDICT_DENSE_ONEAPI_IMPL_I__ - -#include "src/algorithms/dtrees/forest/regression/oneapi/df_regression_predict_dense_kernel_oneapi.h" -#include "src/algorithms/dtrees/forest/regression/oneapi/cl_kernels/df_batch_predict_regression_kernels.cl" - -#include "src/algorithms/dtrees/forest/regression/df_regression_model_impl.h" - -#include "src/externals/service_profiler.h" -#include "services/internal/buffer.h" -#include "data_management/data/numeric_table.h" -#include "src/data_management/service_numeric_table.h" -#include "services/env_detect.h" -#include "services/error_indexes.h" -#include "src/services/service_data_utils.h" -#include "src/services/service_algo_utils.h" -#include "src/services/service_arrays.h" -#include "src/services/service_utils.h" -#include "services/internal/sycl/types.h" - -using namespace daal::services; -using namespace daal::services::internal; -using namespace daal::internal; -using namespace daal::services::internal::sycl; -using namespace daal::algorithms::dtrees::internal; - -namespace daal -{ -namespace algorithms -{ -namespace decision_forest -{ -namespace regression -{ -namespace prediction -{ -namespace internal -{ -template -services::Status PredictKernelOneAPI::buildProgram(ClKernelFactoryIface & factory, const char * programName, - const char * programSrc) -{ - services::Status status; - - DAAL_ITTNOTIFY_SCOPED_TASK(compute.buildProgram); - { - auto fptype_name = getKeyFPType(); - auto build_options = fptype_name; - build_options.add(" -cl-std=CL1.2 "); - - services::String cachekey("__daal_algorithms_df_batch_regression_"); - cachekey.add(build_options); - cachekey.add(programName); - - factory.build(ExecutionTargetIds::device, cachekey.c_str(), programSrc, build_options.c_str(), status); - DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - 
-/////////////////////////////////////////////////////////////////////////////////////////// -/* compute method for PredictKernelOneAPI */ -/////////////////////////////////////////////////////////////////////////////////////////// -template -services::Status PredictKernelOneAPI::compute(services::HostAppIface * const pHostApp, const NumericTable * const x, - const decision_forest::regression::Model * const m, NumericTable * const res) -{ - services::Status status; - - const size_t nRows = x->getNumberOfRows(); - const size_t nCols = x->getNumberOfColumns(); - - const daal::algorithms::decision_forest::regression::internal::ModelImpl * const pModel = - static_cast(m); - const auto nTrees = pModel->size(); - - auto & context = Environment::getInstance()->getDefaultExecutionContext(); - auto & kernel_factory = context.getClKernelFactory(); - - if (nRows > _int32max) - { - return services::Status(services::ErrorIncorrectNumberOfRowsInInputNumericTable); - } - if (nCols > _int32max) - { - return services::Status(services::ErrorIncorrectNumberOfColumnsInInputNumericTable); - } - if (nTrees > _int32max) - { - return services::Status(services::ErrorIncorrectSizeOfModel); - } - - DAAL_CHECK_STATUS_VAR(buildProgram(kernel_factory, "predict_reg_kernels", df_batch_predict_regression_kernels)); - - kernelPredictByTreesGroup = kernel_factory.getKernel("predictByTreesGroup", status); - kernelReduceResponse = kernel_factory.getKernel("reduceResponse", status); - DAAL_CHECK_STATUS_VAR(status); - - BlockDescriptor dataBlock; - DAAL_CHECK_STATUS_VAR(const_cast(x)->getBlockOfRows(0, nRows, readOnly, dataBlock)); - - BlockDescriptor resBlock; - DAAL_CHECK_STATUS_VAR(const_cast(res)->getBlockOfRows(0, nRows, writeOnly, resBlock)); - - auto dataBuffer = dataBlock.getBuffer(); - auto resBuffer = resBlock.getBuffer(); - - DAAL_CHECK_STATUS_VAR(predictByAllTrees(dataBuffer, m, resBuffer, nRows, nCols)); - - DAAL_CHECK_STATUS_VAR(const_cast(x)->releaseBlockOfRows(dataBlock)); - 
DAAL_CHECK_STATUS_VAR(const_cast(res)->releaseBlockOfRows(resBlock)); - - return status; -} - -template -services::Status PredictKernelOneAPI::predictByAllTrees(const services::internal::Buffer & srcBuffer, - const decision_forest::regression::Model * const m, - services::internal::Buffer & resObsResponse, - size_t nRows, size_t nCols) -{ - services::Status status; - const daal::algorithms::decision_forest::regression::internal::ModelImpl * const pModel = - static_cast(m); - - auto & context = services::internal::getDefaultContext(); - - const auto nTrees = pModel->size(); - - TArray _aTree; - - _aTree.reset(nTrees); - DAAL_CHECK_MALLOC(_aTree.get()); - - _nTreeGroups = _nTreeGroupsMin; - - if (nTrees > _nTreesLarge) - { - _nTreeGroups = _nTreeGroupsForLarge; - } - else if (nTrees > _nTreesMedium) - { - _nTreeGroups = _nTreeGroupsForMedium; - } - else if (nTrees > _nTreesSmall) - { - _nTreeGroups = _nTreeGroupsForSmall; - } - - size_t maxTreeSize = 0; - for (size_t i = 0; i < nTrees; ++i) - { - _aTree[i] = pModel->at(i); - maxTreeSize = maxTreeSize < _aTree[i]->getNumberOfRows() ? 
_aTree[i]->getNumberOfRows() : maxTreeSize; - } - if (maxTreeSize > _int32max) - { - return services::Status(services::ErrorIncorrectSizeOfModel); - } - - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, maxTreeSize, nTrees); - const size_t treeBlockSize = maxTreeSize * nTrees; - - TArray tFI(treeBlockSize); - TArray tLC(treeBlockSize); - TArray tFV(treeBlockSize); - - auto ftrIdxArr = context.allocate(TypeIds::id(), treeBlockSize, status); - DAAL_CHECK_STATUS_VAR(status); - auto leftNodeIdxOrClassIdArr = context.allocate(TypeIds::id(), treeBlockSize, status); - DAAL_CHECK_STATUS_VAR(status); - auto ftrValueOrResponseArr = context.allocate(TypeIds::id(), treeBlockSize, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, nRows, _nTreeGroups); - auto obsResponses = context.allocate(TypeIds::id(), nRows * _nTreeGroups, status); - DAAL_CHECK_STATUS_VAR(status); - context.fill(obsResponses, (algorithmFPType)0, status); - DAAL_CHECK_STATUS_VAR(status); - - for (size_t iTree = 0; iTree < nTrees; iTree++) - { - const size_t treeSize = _aTree[iTree]->getNumberOfRows(); - const DecisionTreeNode * const aNode = (const DecisionTreeNode *)(*_aTree[iTree]).getArray(); - - int32_t * const fi = tFI.get() + iTree * maxTreeSize; - int32_t * const lc = tLC.get() + iTree * maxTreeSize; - algorithmFPType * const fv = tFV.get() + iTree * maxTreeSize; - - PRAGMA_IVDEP - PRAGMA_VECTOR_ALWAYS - for (size_t i = 0; i < treeSize; i++) - { - fi[i] = aNode[i].featureIndex; - lc[i] = aNode[i].leftIndexOrClass; - fv[i] = (algorithmFPType)aNode[i].featureValueOrResponse; - } - } - - algorithmFPType probasScale = (algorithmFPType)1 / nTrees; - - context.copy(ftrIdxArr, 0, (void *)tFI.get(), treeBlockSize, 0, treeBlockSize, status); - DAAL_CHECK_STATUS_VAR(status); - context.copy(leftNodeIdxOrClassIdArr, 0, (void *)tLC.get(), treeBlockSize, 0, treeBlockSize, status); - DAAL_CHECK_STATUS_VAR(status); - context.copy(ftrValueOrResponseArr, 0, (void *)tFV.get(), 
treeBlockSize, 0, treeBlockSize, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_CHECK_STATUS_VAR( - predictByTreesGroup(srcBuffer, ftrIdxArr, leftNodeIdxOrClassIdArr, ftrValueOrResponseArr, obsResponses, nRows, nCols, nTrees, maxTreeSize)); - DAAL_CHECK_STATUS_VAR(reduceResponse(obsResponses, resObsResponse, nRows, _nTreeGroups, probasScale)); - - return status; -} - -template -services::Status PredictKernelOneAPI::predictByTreesGroup(const services::internal::Buffer & srcBuffer, - const UniversalBuffer & featureIndexList, - const UniversalBuffer & leftOrClassTypeList, - const UniversalBuffer & featureValueList, - UniversalBuffer & obsResponses, size_t nRows, size_t nCols, - size_t nTrees, size_t maxTreeSize) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.predictByTreesGroup); - - services::Status status; - - auto & context = services::internal::getDefaultContext(); - - auto & kernel = kernelPredictByTreesGroup; - - DAAL_CHECK_STATUS_VAR(status); - - size_t localSize = _maxLocalSize; - size_t nRowsBlocks = 1; - if (nRows > _nRowsLarge) - { - nRowsBlocks = _nRowsBlocksForLarge; - } - else if (nRows > _nRowsMedium) - { - nRowsBlocks = _nRowsBlocksForMedium; - } - { - KernelRange local_range(localSize, 1); - KernelRange global_range(nRowsBlocks * localSize, _nTreeGroups); - - KernelNDRange range(2); - range.local(local_range, status); - DAAL_CHECK_STATUS_VAR(status); - range.global(global_range, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_ASSERT(nRows <= _int32max); - DAAL_ASSERT(nCols <= _int32max); - DAAL_ASSERT(nTrees <= _int32max); - DAAL_ASSERT(maxTreeSize <= _int32max); - - DAAL_ASSERT(srcBuffer.size() == nRows * nCols); - - DAAL_ASSERT_UNIVERSAL_BUFFER(featureIndexList, int32_t, maxTreeSize * nTrees); - DAAL_ASSERT_UNIVERSAL_BUFFER(leftOrClassTypeList, int32_t, maxTreeSize * nTrees); - DAAL_ASSERT_UNIVERSAL_BUFFER(featureValueList, algorithmFPType, maxTreeSize * nTrees); - DAAL_ASSERT_UNIVERSAL_BUFFER(obsResponses, algorithmFPType, nRows * _nTreeGroups); - 
- for (size_t procTrees = 0; procTrees < nTrees; procTrees += _nTreeGroups) - { - KernelArguments args(10, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, srcBuffer, AccessModeIds::read); - args.set(1, featureIndexList, AccessModeIds::read); - args.set(2, leftOrClassTypeList, AccessModeIds::read); - args.set(3, featureValueList, AccessModeIds::read); - args.set(4, obsResponses, AccessModeIds::readwrite); - args.set(5, static_cast(nRows)); - args.set(6, static_cast(nCols)); - args.set(7, static_cast(nTrees)); - args.set(8, static_cast(maxTreeSize)); - args.set(9, static_cast(procTrees)); - - context.run(range, kernel, args, status); - - DAAL_CHECK_STATUS_VAR(status); - } - } - - return status; -} - -template -services::Status PredictKernelOneAPI::reduceResponse(const UniversalBuffer & obsResponses, - services::internal::Buffer & resObsResponse, - size_t nRows, size_t nTrees, algorithmFPType scale) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.reduceResponse); - - services::Status status; - - auto & context = services::internal::getDefaultContext(); - auto & kernel = kernelReduceResponse; - - size_t localSize = _preferableSubGroup; - size_t nGroups = _maxGroupsNum; - { - DAAL_ASSERT(nRows <= _int32max); - DAAL_ASSERT(nTrees <= _int32max); - DAAL_ASSERT(resObsResponse.size() == nRows * 1); - DAAL_ASSERT_UNIVERSAL_BUFFER(obsResponses, algorithmFPType, nRows * _nTreeGroups); - - KernelArguments args(5, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, obsResponses, AccessModeIds::read); - args.set(1, resObsResponse, AccessModeIds::readwrite); - args.set(2, static_cast(nRows)); - args.set(3, static_cast(nTrees)); - args.set(4, scale); - - KernelRange local_range(localSize); - KernelRange global_range(nGroups * localSize); - - KernelNDRange range(1); - range.local(local_range, status); - DAAL_CHECK_STATUS_VAR(status); - range.global(global_range, status); - DAAL_CHECK_STATUS_VAR(status); - - context.run(range, kernel, args, status); - 
DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -} /* namespace internal */ -} /* namespace prediction */ -} /* namespace regression */ -} /* namespace decision_forest */ -} /* namespace algorithms */ -} /* namespace daal */ - -#endif diff --git a/cpp/daal/src/algorithms/dtrees/forest/regression/oneapi/df_regression_train_hist_kernel_oneapi.h b/cpp/daal/src/algorithms/dtrees/forest/regression/oneapi/df_regression_train_hist_kernel_oneapi.h deleted file mode 100644 index a3cd6391c3e..00000000000 --- a/cpp/daal/src/algorithms/dtrees/forest/regression/oneapi/df_regression_train_hist_kernel_oneapi.h +++ /dev/null @@ -1,181 +0,0 @@ -/* file: df_regression_train_hist_kernel_oneapi.h */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Declaration of structure containing kernels for decision forest -// training for GPU for the hist method. 
-//-- -*/ - -#ifndef __DF_REGRESSION_TRAIN_HIST_KERNEL_ONEAPI_H__ -#define __DF_REGRESSION_TRAIN_HIST_KERNEL_ONEAPI_H__ - -#include "services/internal/sycl/types.h" -#include "services/internal/sycl/execution_context.h" -#include "data_management/data/numeric_table.h" -#include "algorithms/algorithm_base_common.h" -#include "src/algorithms/dtrees/forest/regression/df_regression_model_impl.h" -#include "algorithms/decision_forest/decision_forest_regression_training_types.h" -#include "algorithms/decision_forest/decision_forest_regression_model.h" -#include "src/algorithms/dtrees/forest/oneapi/df_feature_type_helper_oneapi.h" -#include "src/algorithms/dtrees/forest/oneapi/df_tree_level_build_helper_oneapi.h" - -namespace daal -{ -namespace algorithms -{ -namespace decision_forest -{ -namespace regression -{ -namespace training -{ -namespace internal -{ -template -class RegressionTrainBatchKernelOneAPI : public daal::algorithms::Kernel -{ -public: - RegressionTrainBatchKernelOneAPI() {} - services::Status compute(services::HostAppIface * pHostApp, const NumericTable * x, const NumericTable * y, - decision_forest::regression::Model & m, Result & res, const Parameter & par) - { - return services::ErrorMethodNotImplemented; - } -}; - -template -class RegressionTrainBatchKernelOneAPI : public daal::algorithms::Kernel -{ -public: - RegressionTrainBatchKernelOneAPI() : _nRows(0), _nFeatures(0), _nSelectedRows(0), _nMaxBinsAmongFtrs(0), _totalBins(0) {}; - services::Status compute(services::HostAppIface * pHostApp, const NumericTable * x, const NumericTable * y, - decision_forest::regression::Model & m, Result & res, const Parameter & par); - -private: - services::Status buildProgram(services::internal::sycl::ClKernelFactoryIface & factory, const char * programName, const char * programSrc, - const char * buildOptions); - - size_t getPartHistRequiredMemSize(size_t nSelectedFeatures, size_t nMaxBinsAmongFtrs); - - services::Status computeBestSplit(const 
services::internal::sycl::UniversalBuffer & data, services::internal::sycl::UniversalBuffer & treeOrder, - services::internal::sycl::UniversalBuffer & selectedFeatures, size_t nSelectedFeatures, - const services::internal::Buffer & response, - services::internal::sycl::UniversalBuffer & nodeOffsets, services::internal::sycl::UniversalBuffer & binOffsets, - services::internal::sycl::UniversalBuffer & splitInfo, - services::internal::sycl::UniversalBuffer & nodeImpDecreaseList, bool updateImpDecreaseRequired, - size_t nFeatures, size_t nNodes, size_t minObservationsInLeafNode, algorithmFPType impurityThreshold); - - services::Status computeBestSplitSinglePass( - const services::internal::sycl::UniversalBuffer & data, services::internal::sycl::UniversalBuffer & treeOrder, - services::internal::sycl::UniversalBuffer & selectedFeatures, size_t nSelectedFeatures, - const services::internal::Buffer & response, services::internal::sycl::UniversalBuffer & binOffsets, - services::internal::sycl::UniversalBuffer & nodeList, services::internal::sycl::UniversalBuffer & nodeIndices, size_t nodeIndicesOffset, - services::internal::sycl::UniversalBuffer & impList, services::internal::sycl::UniversalBuffer & nodeImpDecreaseList, - bool updateImpDecreaseRequired, size_t nFeatures, size_t nNodes, size_t minObservationsInLeafNode, algorithmFPType impurityThreshold); - - services::Status computeBestSplitByHistogram( - const services::internal::sycl::UniversalBuffer & nodeHistogramList, services::internal::sycl::UniversalBuffer & selectedFeatures, - size_t nSelectedFeatures, services::internal::sycl::UniversalBuffer & nodeList, services::internal::sycl::UniversalBuffer & nodeIndices, - size_t nodeIndicesOffset, services::internal::sycl::UniversalBuffer & binOffsets, services::internal::sycl::UniversalBuffer & splitInfo, - services::internal::sycl::UniversalBuffer & nodeImpDecreaseList, bool updateImpDecreaseRequired, size_t nNodes, size_t nMaxBinsAmongFtrs, - size_t 
minObservationsInLeafNode, algorithmFPType impurityThreshold); - - services::Status computePartialHistograms(const services::internal::sycl::UniversalBuffer & data, - services::internal::sycl::UniversalBuffer & treeOrder, - services::internal::sycl::UniversalBuffer & selectedFeatures, size_t nSelectedFeatures, - const services::internal::Buffer & response, - services::internal::sycl::UniversalBuffer & nodeList, - services::internal::sycl::UniversalBuffer & nodeIndices, size_t nodeIndicesOffset, - services::internal::sycl::UniversalBuffer & binOffsets, size_t nMaxBinsAmongFtrs, size_t nFeatures, - size_t nNodes, services::internal::sycl::UniversalBuffer & partialHistograms, - size_t nPartialHistograms); - - services::Status reducePartialHistograms(services::internal::sycl::UniversalBuffer & partialHistograms, - services::internal::sycl::UniversalBuffer & histograms, size_t nPartialHistograms, size_t nNodes, - size_t nSelectedFeatures, size_t nMaxBinsAmongFtrs, size_t reduceLocalSize); - - services::Status computeResults(const dtrees::internal::Tree & t, const algorithmFPType * x, const algorithmFPType * y, const size_t nRows, - const size_t nFeatures, const services::internal::sycl::UniversalBuffer & oobIndices, - const services::internal::sycl::UniversalBuffer & oobRowsNumList, - services::internal::sycl::UniversalBuffer & oobBuf, algorithmFPType * varImp, algorithmFPType * varImpVariance, - size_t nBuiltTrees, const engines::EnginePtr & engine, size_t nTreesInBlock, size_t treeIndex, - const Parameter & par); - - algorithmFPType computeOOBError(const dtrees::internal::Tree & t, const algorithmFPType * x, const algorithmFPType * y, const size_t nRows, - const size_t nFeatures, const services::internal::sycl::UniversalBuffer & indices, size_t indicesOffset, size_t n, - services::internal::sycl::UniversalBuffer oobBuf, services::Status & status); - - algorithmFPType computeOOBErrorPerm(const dtrees::internal::Tree & t, const algorithmFPType * x, const algorithmFPType 
* y, const size_t nRows, - const size_t nFeatures, const services::internal::sycl::UniversalBuffer & indices, size_t indicesOffset, - const int * indicesPerm, const size_t testFtrInd, size_t n, services::Status & status); - - services::Status finalizeOOBError(const algorithmFPType * y, const services::internal::sycl::UniversalBuffer & oobBuf, const size_t nRows, - algorithmFPType * res, algorithmFPType * resPerObs, algorithmFPType * resR2, algorithmFPType * resPrediction); - - services::Status finalizeVarImp(const Parameter & par, algorithmFPType * varImp, algorithmFPType * varImpVariance, size_t nFeatures); - - services::internal::sycl::KernelPtr kernelComputePartialHistograms; - services::internal::sycl::KernelPtr kernelReducePartialHistograms; - services::internal::sycl::KernelPtr kernelComputeBestSplitByHistogram; - services::internal::sycl::KernelPtr kernelComputeBestSplitSinglePass; - - decision_forest::internal::TreeLevelBuildHelperOneAPI _treeLevelBuildHelper; - - const size_t _maxWorkItemsPerGroup = 256; // should be a power of two for interal needs - const size_t _preferableSubGroup = 16; // preferable maximal sub-group size - const size_t _maxLocalSize = 128; - const size_t _maxLocalSums = 256; - const size_t _maxLocalHistograms = 256; - const size_t _preferableGroupSize = 256; - const size_t _minRowsBlock = 256; - const size_t _maxBins = 256; - const size_t _reduceLocalSizePartHist = 64; - - const size_t _minPreferableLocalSizeForPartHistKernel = 32; - - const double _globalMemFractionForTreeBlock = 0.6; // part of free global mem which can be used for processing block of tree - const double _globalMemFractionForPartHist = 0.2; // part of free global mem which can be used for partial histograms - const size_t _maxMemAllocSizeForAlgo = 1073741824; // 1 Gb it showed better efficiency than using just platform info.maxMemAllocSize - const size_t _minRowsBlocksForMaxPartHistNum = 16384; - const size_t _minRowsBlocksForOneHist = 128; - - const size_t 
_nOOBProps = 2; // number of props for each OOB row to compute prediction (i.e. mean and num of predictions) - const size_t _nHistProps = 3; // number of properties in bins histogram (i.e. n, mean and var) - const size_t _nNodesGroups = 3; // all nodes are split on groups (big, medium, small) - const size_t _nodeGroupProps = 2; // each nodes Group contains props: numOfNodes, maxNumOfBlocks - - static constexpr size_t _int32max = static_cast(services::internal::MaxVal::get()); - - size_t _nRows; - size_t _nFeatures; - size_t _nSelectedRows; - size_t _nMaxBinsAmongFtrs; - size_t _totalBins; - size_t _preferableLocalSizeForPartHistKernel; // local size for histogram collecting kernel, depends on num of selected features - size_t _maxPartHistCumulativeSize; // is calculated at the beggining of compute using _globalMemFractionForPartHist -}; - -} // namespace internal -} // namespace training -} // namespace regression -} // namespace decision_forest -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/dtrees/forest/regression/oneapi/df_regression_train_hist_oneapi_impl.i b/cpp/daal/src/algorithms/dtrees/forest/regression/oneapi/df_regression_train_hist_oneapi_impl.i deleted file mode 100644 index db4e286628a..00000000000 --- a/cpp/daal/src/algorithms/dtrees/forest/regression/oneapi/df_regression_train_hist_oneapi_impl.i +++ /dev/null @@ -1,1196 +0,0 @@ -/* file: df_regression_train_hist_oneapi_impl.i */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of auxiliary functions for decision forest regression -// hist method. -//-- -*/ - -#ifndef __DF_REGRESSION_TRAIN_HIST_ONEAPI_IMPL_I__ -#define __DF_REGRESSION_TRAIN_HIST_ONEAPI_IMPL_I__ - -#include "src/algorithms/dtrees/forest/regression/oneapi/df_regression_train_hist_kernel_oneapi.h" -#include "src/algorithms/dtrees/forest/regression/oneapi/cl_kernels/df_batch_regression_kernels.cl" - -#include "src/algorithms/dtrees/forest/oneapi/df_feature_type_helper_oneapi.i" -#include "src/algorithms/dtrees/forest/oneapi/df_tree_level_build_helper_oneapi.i" -#include "src/algorithms/dtrees/forest/regression/df_regression_model_impl.h" -#include "src/algorithms/dtrees/forest/regression/oneapi/df_regression_tree_helper_impl.i" - -#include "src/externals/service_profiler.h" -#include "src/externals/service_rng.h" -#include "src/externals/service_math.h" //will remove after migrating finalize MDA to GPU -#include "services/internal/buffer.h" -#include "data_management/data/numeric_table.h" -#include "services/env_detect.h" -#include "services/error_indexes.h" -#include "src/services/service_data_utils.h" -#include "src/services/service_algo_utils.h" -#include "src/services/service_arrays.h" -#include "src/services/service_utils.h" -#include "src/services/daal_strings.h" -#include "src/algorithms/engines/engine_types_internal.h" -#include "services/internal/sycl/types.h" - -using namespace daal::algorithms::decision_forest::internal; -using namespace 
daal::algorithms::decision_forest::regression::internal; -using namespace daal::internal; -using namespace daal::services::internal; - -namespace daal -{ -namespace algorithms -{ -namespace decision_forest -{ -namespace regression -{ -namespace training -{ -namespace internal -{ -template -static services::String getFPTypeAccuracy() -{ - if (IsSameType::value) - { - return services::String(" -D algorithmFPTypeAccuracy=(float)1e-5 "); - } - if (IsSameType::value) - { - return services::String(" -D algorithmFPTypeAccuracy=(double)1e-10 "); - } - return services::String(); -} - -static services::String getBuildOptions() -{ - return " -D NODE_PROPS=5 -D IMPURITY_PROPS=2 -D HIST_PROPS=3 "; -} - -template -services::Status RegressionTrainBatchKernelOneAPI::buildProgram(ClKernelFactoryIface & factory, const char * programName, - const char * programSrc, const char * buildOptions) -{ - services::Status status; - - DAAL_ITTNOTIFY_SCOPED_TASK(compute.buildProgram); - { - auto fptype_name = getKeyFPType(); - auto fptype_accuracy = getFPTypeAccuracy(); - auto build_options = fptype_name; - build_options.add(fptype_accuracy); - build_options.add(" -cl-std=CL1.2 "); - build_options.add(" -D LOCAL_BUFFER_SIZE=256 -D MAX_WORK_ITEMS_PER_GROUP=256 "); - - if (buildOptions) - { - build_options.add(buildOptions); - } - - services::String cachekey("__daal_algorithms_df_batch_regression_"); - cachekey.add(build_options); - cachekey.add(programName); - - factory.build(ExecutionTargetIds::device, cachekey.c_str(), programSrc, build_options.c_str(), status); - DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -template -services::Status RegressionTrainBatchKernelOneAPI::computeBestSplitByHistogram( - const UniversalBuffer & nodeHistogramList, UniversalBuffer & selectedFeatures, size_t nSelectedFeatures, UniversalBuffer & nodeList, - UniversalBuffer & nodeIndices, size_t nodeIndicesOffset, UniversalBuffer & binOffsets, UniversalBuffer & impList, - UniversalBuffer & 
nodeImpDecreaseList, bool updateImpDecreaseRequired, size_t nNodes, size_t nMaxBinsAmongFtrs, size_t minObservationsInLeafNode, - algorithmFPType impurityThreshold) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.computeBestSpitByHistogramLevel); - - services::Status status; - - auto & context = services::internal::getDefaultContext(); - - auto & kernel = kernelComputeBestSplitByHistogram; - - { - DAAL_ASSERT(nSelectedFeatures <= _int32max); - DAAL_ASSERT(nodeIndicesOffset <= _int32max); - DAAL_ASSERT(updateImpDecreaseRequired <= _int32max); - DAAL_ASSERT(nMaxBinsAmongFtrs <= _int32max); - DAAL_ASSERT(minObservationsInLeafNode <= _int32max); - - DAAL_ASSERT_UNIVERSAL_BUFFER(nodeHistogramList, algorithmFPType, nNodes * nSelectedFeatures * _nMaxBinsAmongFtrs * _nHistProps); - DAAL_ASSERT_UNIVERSAL_BUFFER(selectedFeatures, int32_t, nNodes * nSelectedFeatures); - DAAL_ASSERT_UNIVERSAL_BUFFER(nodeList, int32_t, nNodes * TreeLevelRecord::_nNodeSplitProps); - DAAL_ASSERT_UNIVERSAL_BUFFER(nodeIndices, int32_t, nNodes); - DAAL_ASSERT_UNIVERSAL_BUFFER(binOffsets, uint32_t, _nFeatures + 1); - DAAL_ASSERT_UNIVERSAL_BUFFER(impList, algorithmFPType, nNodes * TreeLevelRecord::_nNodeImpProps); - if (updateImpDecreaseRequired) DAAL_ASSERT_UNIVERSAL_BUFFER(nodeImpDecreaseList, algorithmFPType, nNodes); - - KernelArguments args(13, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, nodeHistogramList, AccessModeIds::read); - args.set(1, selectedFeatures, AccessModeIds::read); - args.set(2, static_cast(nSelectedFeatures)); - args.set(3, binOffsets, AccessModeIds::read); - args.set(4, nodeList, AccessModeIds::readwrite); // nodeList will be updated with split attributes - args.set(5, nodeIndices, AccessModeIds::read); - args.set(6, static_cast(nodeIndicesOffset)); - args.set(7, impList, AccessModeIds::write); - args.set(8, nodeImpDecreaseList, AccessModeIds::write); - args.set(9, static_cast(updateImpDecreaseRequired)); - args.set(10, static_cast(nMaxBinsAmongFtrs)); - args.set(11, 
static_cast(minObservationsInLeafNode)); - args.set(12, impurityThreshold); - - const size_t numOfSubGroupsPerNode = 8; //add logic for adjusting it in accordance with nNodes - size_t localSize = _preferableSubGroup * numOfSubGroupsPerNode; - - KernelRange local_range(localSize, 1); - KernelRange global_range(localSize, nNodes); - - KernelNDRange range(2); - range.local(local_range, status); - DAAL_CHECK_STATUS_VAR(status); - range.global(global_range, status); - DAAL_CHECK_STATUS_VAR(status); - - context.run(range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -template -services::Status RegressionTrainBatchKernelOneAPI::computeBestSplitSinglePass( - const UniversalBuffer & data, UniversalBuffer & treeOrder, UniversalBuffer & selectedFeatures, size_t nSelectedFeatures, - const services::internal::Buffer & response, UniversalBuffer & binOffsets, UniversalBuffer & nodeList, - UniversalBuffer & nodeIndices, size_t nodeIndicesOffset, UniversalBuffer & impList, UniversalBuffer & nodeImpDecreaseList, - bool updateImpDecreaseRequired, size_t nFeatures, size_t nNodes, size_t minObservationsInLeafNode, algorithmFPType impurityThreshold) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.computeBestSplitSinglePass); - - services::Status status; - - auto & context = services::internal::getDefaultContext(); - - auto & kernel = kernelComputeBestSplitSinglePass; - - { - DAAL_ASSERT(nSelectedFeatures <= _int32max); - DAAL_ASSERT(nodeIndicesOffset <= _int32max); - DAAL_ASSERT(updateImpDecreaseRequired <= _int32max); - DAAL_ASSERT(nFeatures <= _int32max); - DAAL_ASSERT(minObservationsInLeafNode <= _int32max); - DAAL_ASSERT(response.size() == _nRows); - - DAAL_ASSERT_UNIVERSAL_BUFFER(data, uint32_t, _nRows * _nFeatures); - DAAL_ASSERT_UNIVERSAL_BUFFER(treeOrder, int32_t, _nSelectedRows); - DAAL_ASSERT_UNIVERSAL_BUFFER(selectedFeatures, int32_t, nNodes * nSelectedFeatures); - DAAL_ASSERT_UNIVERSAL_BUFFER(binOffsets, uint32_t, _nFeatures + 1); - 
DAAL_ASSERT_UNIVERSAL_BUFFER(nodeList, int32_t, nNodes * TreeLevelRecord::_nNodeSplitProps); - DAAL_ASSERT_UNIVERSAL_BUFFER(nodeIndices, int32_t, nNodes); - DAAL_ASSERT_UNIVERSAL_BUFFER(impList, algorithmFPType, nNodes * TreeLevelRecord::_nNodeImpProps); - if (updateImpDecreaseRequired) DAAL_ASSERT_UNIVERSAL_BUFFER(nodeImpDecreaseList, algorithmFPType, nNodes); - - KernelArguments args(15, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, data, AccessModeIds::read); - args.set(1, treeOrder, AccessModeIds::read); - args.set(2, selectedFeatures, AccessModeIds::read); - args.set(3, static_cast(nSelectedFeatures)); - args.set(4, response, AccessModeIds::read); - args.set(5, binOffsets, AccessModeIds::read); - args.set(6, nodeList, AccessModeIds::readwrite); // nodeList will be updated with split attributes - args.set(7, nodeIndices, AccessModeIds::read); - args.set(8, static_cast(nodeIndicesOffset)); - args.set(9, impList, AccessModeIds::write); - args.set(10, nodeImpDecreaseList, AccessModeIds::write); - args.set(11, static_cast(updateImpDecreaseRequired)); - args.set(12, static_cast(nFeatures)); - args.set(13, static_cast(minObservationsInLeafNode)); - args.set(14, impurityThreshold); - - const size_t numOfSubGroupsPerNode = 8; //add logic for adjusting it in accordance with nNodes - size_t localSize = _preferableSubGroup * numOfSubGroupsPerNode; - - KernelRange local_range(localSize, 1); - KernelRange global_range(localSize, nNodes); - - KernelNDRange range(2); - range.local(local_range, status); - DAAL_CHECK_STATUS_VAR(status); - range.global(global_range, status); - DAAL_CHECK_STATUS_VAR(status); - - context.run(range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -template -size_t RegressionTrainBatchKernelOneAPI::getPartHistRequiredMemSize(size_t nSelectedFeatures, size_t nMaxBinsAmongFtrs) -{ - // mul overflow for nSelectedFeatures * _nMaxBinsAmongFtrs and for nHistBins * _nHistProps were checked before kernel call in 
compute - const size_t nHistBins = nSelectedFeatures * _nMaxBinsAmongFtrs; - return sizeof(algorithmFPType) * nHistBins * _nHistProps; -} - -template -services::Status RegressionTrainBatchKernelOneAPI::computeBestSplit( - const UniversalBuffer & data, UniversalBuffer & treeOrder, UniversalBuffer & selectedFeatures, size_t nSelectedFeatures, - const services::internal::Buffer & response, UniversalBuffer & nodeList, UniversalBuffer & binOffsets, UniversalBuffer & impList, - UniversalBuffer & nodeImpDecreaseList, bool updateImpDecreaseRequired, size_t nFeatures, size_t nNodes, size_t minObservationsInLeafNode, - algorithmFPType impurityThreshold) -{ - services::Status status; - - auto & context = services::internal::getDefaultContext(); - - // no overflow check is required because of _nNodesGroups and _nodeGroupProps are small constants - auto nodesGroups = context.allocate(TypeIds::id(), _nNodesGroups * _nodeGroupProps, status); - DAAL_CHECK_STATUS_VAR(status); - auto nodeIndices = context.allocate(TypeIds::id(), nNodes, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_CHECK_STATUS_VAR( - _treeLevelBuildHelper.splitNodeListOnGroupsBySize(nodeList, nNodes, nodesGroups, _nNodesGroups, _nodeGroupProps, nodeIndices)); - - auto nodesGroupsHost = nodesGroups.template get().toHost(ReadWriteMode::readOnly, status); - DAAL_CHECK_STATUS_VAR(status); - - size_t nGroupNodes = 0; - size_t processedNodes = 0; - - for (size_t i = 0; i < _nNodesGroups; i++, processedNodes += nGroupNodes) - { - nGroupNodes = nodesGroupsHost.get()[i * _nodeGroupProps + 0]; - if (0 == nGroupNodes) continue; - - size_t maxGroupBlocksNum = nodesGroupsHost.get()[i * _nodeGroupProps + 1]; - - size_t groupIndicesOffset = processedNodes; - - if (maxGroupBlocksNum > 1) - { - const size_t partHistSize = getPartHistRequiredMemSize(nSelectedFeatures, _nMaxBinsAmongFtrs); - - size_t nPartialHistograms = maxGroupBlocksNum <= _minRowsBlocksForOneHist ? 
1 : _maxLocalHistograms; - - if (nPartialHistograms > 1 && maxGroupBlocksNum < _minRowsBlocksForMaxPartHistNum) - { - while (nPartialHistograms > 1 - && (nPartialHistograms * _minRowsBlocksForOneHist > maxGroupBlocksNum - || nPartialHistograms * partHistSize > _maxPartHistCumulativeSize)) - { - nPartialHistograms >>= 1; - } - } - - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, nGroupNodes, partHistSize); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, nGroupNodes * partHistSize, nPartialHistograms); - - const size_t maxPHBlockElems = _maxPartHistCumulativeSize / sizeof(algorithmFPType); - - const size_t nPHBlockElems = nGroupNodes * nPartialHistograms * partHistSize; - const size_t nPHBlocks = nPHBlockElems / maxPHBlockElems ? (nPHBlockElems / maxPHBlockElems + !!(nPHBlockElems % maxPHBlockElems)) : 1; - - size_t nBlockNodes = nGroupNodes / nPHBlocks + !!(nGroupNodes % nPHBlocks); - - for (size_t blockIndicesOffset = groupIndicesOffset; blockIndicesOffset < groupIndicesOffset + nGroupNodes; - blockIndicesOffset += nBlockNodes) - { - nBlockNodes = services::internal::min(nBlockNodes, groupIndicesOffset + nGroupNodes - blockIndicesOffset); - if (1 == nPartialHistograms) - { - auto nodesHistograms = context.allocate(TypeIds::id(), nBlockNodes * partHistSize, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_CHECK_STATUS_VAR(computePartialHistograms(data, treeOrder, selectedFeatures, nSelectedFeatures, response, nodeList, - nodeIndices, blockIndicesOffset, binOffsets, _nMaxBinsAmongFtrs, nFeatures, - nBlockNodes, nodesHistograms, nPartialHistograms)); - - DAAL_CHECK_STATUS_VAR(computeBestSplitByHistogram(nodesHistograms, selectedFeatures, nSelectedFeatures, nodeList, nodeIndices, - blockIndicesOffset, binOffsets, impList, nodeImpDecreaseList, - updateImpDecreaseRequired, nBlockNodes, _nMaxBinsAmongFtrs, - minObservationsInLeafNode, impurityThreshold)); - } - else - { - auto partialHistograms = - context.allocate(TypeIds::id(), nBlockNodes * nPartialHistograms * 
partHistSize, status); - DAAL_CHECK_STATUS_VAR(status); - auto nodesHistograms = context.allocate(TypeIds::id(), nBlockNodes * partHistSize, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_CHECK_STATUS_VAR(computePartialHistograms(data, treeOrder, selectedFeatures, nSelectedFeatures, response, nodeList, - nodeIndices, blockIndicesOffset, binOffsets, _nMaxBinsAmongFtrs, nFeatures, - nBlockNodes, partialHistograms, nPartialHistograms)); - DAAL_CHECK_STATUS_VAR(reducePartialHistograms(partialHistograms, nodesHistograms, nPartialHistograms, nBlockNodes, - nSelectedFeatures, _nMaxBinsAmongFtrs, _reduceLocalSizePartHist)); - - DAAL_CHECK_STATUS_VAR(computeBestSplitByHistogram(nodesHistograms, selectedFeatures, nSelectedFeatures, nodeList, nodeIndices, - blockIndicesOffset, binOffsets, impList, nodeImpDecreaseList, - updateImpDecreaseRequired, nBlockNodes, _nMaxBinsAmongFtrs, - minObservationsInLeafNode, impurityThreshold)); - } - } - } - else - { - DAAL_CHECK_STATUS_VAR(computeBestSplitSinglePass(data, treeOrder, selectedFeatures, nSelectedFeatures, response, binOffsets, nodeList, - nodeIndices, groupIndicesOffset, impList, nodeImpDecreaseList, updateImpDecreaseRequired, - nFeatures, nGroupNodes, minObservationsInLeafNode, impurityThreshold)); - } - } - - return status; -} - -template -services::Status RegressionTrainBatchKernelOneAPI::computePartialHistograms( - const UniversalBuffer & data, UniversalBuffer & treeOrder, UniversalBuffer & selectedFeatures, size_t nSelectedFeatures, - const services::internal::Buffer & response, UniversalBuffer & nodeList, UniversalBuffer & nodeIndices, size_t nodeIndicesOffset, - UniversalBuffer & binOffsets, size_t nMaxBinsAmongFtrs, size_t nFeatures, size_t nNodes, UniversalBuffer & partialHistograms, - size_t nPartialHistograms) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.computePartialHistograms); - - services::Status status; - - auto & context = services::internal::getDefaultContext(); - - auto & kernel = 
kernelComputePartialHistograms; - - { - DAAL_ASSERT(nSelectedFeatures <= _int32max); - DAAL_ASSERT(nodeIndicesOffset <= _int32max); - DAAL_ASSERT(nMaxBinsAmongFtrs <= _int32max); - DAAL_ASSERT(nFeatures <= _int32max); - DAAL_ASSERT(response.size() == _nRows); - - DAAL_ASSERT_UNIVERSAL_BUFFER(data, uint32_t, _nRows * _nFeatures); - DAAL_ASSERT_UNIVERSAL_BUFFER(treeOrder, int32_t, _nSelectedRows); - DAAL_ASSERT_UNIVERSAL_BUFFER(selectedFeatures, int32_t, nNodes * nSelectedFeatures); - DAAL_ASSERT_UNIVERSAL_BUFFER(binOffsets, uint32_t, _nFeatures + 1); - DAAL_ASSERT_UNIVERSAL_BUFFER(nodeList, int32_t, nNodes * TreeLevelRecord::_nNodeSplitProps); - DAAL_ASSERT_UNIVERSAL_BUFFER(nodeIndices, int32_t, nNodes); - DAAL_ASSERT_UNIVERSAL_BUFFER(partialHistograms, algorithmFPType, - nNodes * nPartialHistograms * nSelectedFeatures * _nMaxBinsAmongFtrs * _nHistProps); - - context.fill(partialHistograms, (algorithmFPType)0, status); - DAAL_CHECK_STATUS_VAR(status); - - KernelArguments args(12, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, data, AccessModeIds::read); - args.set(1, treeOrder, AccessModeIds::read); - args.set(2, nodeList, AccessModeIds::read); - args.set(3, nodeIndices, AccessModeIds::read); - args.set(4, static_cast(nodeIndicesOffset)); - args.set(5, selectedFeatures, AccessModeIds::read); - args.set(6, response, AccessModeIds::read); - args.set(7, binOffsets, AccessModeIds::read); - args.set(8, static_cast(nMaxBinsAmongFtrs)); // max num of bins among all ftrs - args.set(9, static_cast(nFeatures)); - args.set(10, partialHistograms, AccessModeIds::write); - args.set(11, static_cast(nSelectedFeatures)); - - size_t localSize = _preferableLocalSizeForPartHistKernel; - - KernelRange local_range(localSize, 1); - KernelRange global_range(nPartialHistograms * localSize, nNodes); - - KernelNDRange range(2); - range.local(local_range, status); - DAAL_CHECK_STATUS_VAR(status); - range.global(global_range, status); - DAAL_CHECK_STATUS_VAR(status); - - 
context.run(range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -template -services::Status RegressionTrainBatchKernelOneAPI::reducePartialHistograms(UniversalBuffer & partialHistograms, - UniversalBuffer & histograms, - size_t nPartialHistograms, size_t nNodes, - size_t nSelectedFeatures, size_t nMaxBinsAmongFtrs, - size_t reduceLocalSize) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.reducePartialHistograms); - - services::Status status; - - auto & context = services::internal::getDefaultContext(); - - auto & kernel = kernelReducePartialHistograms; - - { - DAAL_ASSERT(nPartialHistograms <= _int32max); - DAAL_ASSERT(nSelectedFeatures <= _int32max); - DAAL_ASSERT(nMaxBinsAmongFtrs <= _int32max); - - DAAL_ASSERT_UNIVERSAL_BUFFER(partialHistograms, algorithmFPType, - nNodes * nPartialHistograms * nSelectedFeatures * _nMaxBinsAmongFtrs * _nHistProps); - DAAL_ASSERT_UNIVERSAL_BUFFER(histograms, algorithmFPType, nNodes * nSelectedFeatures * _nMaxBinsAmongFtrs * _nHistProps); - - KernelArguments args(5, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, partialHistograms, AccessModeIds::read); - args.set(1, histograms, AccessModeIds::write); - args.set(2, static_cast(nPartialHistograms)); - args.set(3, static_cast(nSelectedFeatures)); - args.set(4, static_cast(nMaxBinsAmongFtrs)); // max num of bins among all ftrs - - KernelRange local_range(1, reduceLocalSize, 1); - // overflow for nMaxBinsAmongFtrs * nSelectedFeatures should be checked in compute - KernelRange global_range(nMaxBinsAmongFtrs * nSelectedFeatures, reduceLocalSize, nNodes); - - KernelNDRange range(3); - range.global(global_range, status); - DAAL_CHECK_STATUS_VAR(status); - range.local(local_range, status); - DAAL_CHECK_STATUS_VAR(status); - - context.run(range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -template -static void shuffle(void * state, size_t n, int * dst) -{ - RNGsInst rng; - int idx[2]; - - for (size_t i = 0; i < 
n; ++i) - { - rng.uniform(2, idx, state, 0, n); - daal::services::internal::swap(dst[idx[0]], dst[idx[1]]); - } -} - -template -services::Status selectParallelizationTechnique(const Parameter & par, engines::internal::ParallelizationTechnique & technique) -{ - auto engineImpl = dynamic_cast(par.engine.get()); - - engines::internal::ParallelizationTechnique techniques[] = { engines::internal::family, engines::internal::leapfrog, - engines::internal::skipahead }; - - for (auto & t : techniques) - { - if (engineImpl->hasSupport(t)) - { - technique = t; - return services::Status(); - } - } - return services::Status(ErrorEngineNotSupported); -} - -/* following methods are related to results computation (OBB err, varImportance MDA/MDA_Scaled)*/ -/* they will be migrated on GPU when prediction layer forGPU is ready*/ -template -services::Status RegressionTrainBatchKernelOneAPI::computeResults( - const dtrees::internal::Tree & t, const algorithmFPType * x, const algorithmFPType * y, size_t nRows, size_t nFeatures, - const UniversalBuffer & oobIndices, const UniversalBuffer & oobRowsNumList, UniversalBuffer & oobBuf, algorithmFPType * varImp, - algorithmFPType * varImpVariance, size_t nBuiltTrees, const engines::EnginePtr & engine, size_t nTreesInBlock, size_t treeIndex, - const Parameter & par) -{ - DAAL_ASSERT_UNIVERSAL_BUFFER(oobRowsNumList, int32_t, nTreesInBlock + 1); - - services::Status status; - const bool mdaRequired(par.varImportance == decision_forest::training::MDA_Raw || par.varImportance == decision_forest::training::MDA_Scaled); - - size_t nOOB = 0; - size_t oobIndicesOffset = 0; - - { - auto nOOBRowsHost = oobRowsNumList.template get().toHost(ReadWriteMode::readOnly, status); - DAAL_CHECK_STATUS_VAR(status); - oobIndicesOffset = static_cast(nOOBRowsHost.get()[treeIndex]); - nOOB = static_cast(nOOBRowsHost.get()[treeIndex + 1] - nOOBRowsHost.get()[treeIndex]); - } - - if ((par.resultsToCompute & (decision_forest::training::computeOutOfBagError | 
decision_forest::training::computeOutOfBagErrorPerObservation) - || mdaRequired) - && nOOB) - { - const algorithmFPType oobError = computeOOBError(t, x, y, nRows, nFeatures, oobIndices, oobIndicesOffset, nOOB, oobBuf, status); - DAAL_CHECK_STATUS_VAR(status); - - if (mdaRequired) - { - DAAL_ASSERT(varImp); - TArray permutation(nOOB); - DAAL_CHECK_MALLOC(permutation.get()); - for (size_t i = 0; i < nOOB; ++i) - { - permutation[i] = i; - } - - const algorithmFPType div1 = algorithmFPType(1) / algorithmFPType(nBuiltTrees); - daal::internal::RNGsInst rng; - auto engineImpl = dynamic_cast(engine.get()); - - for (size_t ftr = 0; ftr < nFeatures; ftr++) - { - shuffle(engineImpl->getState(), nOOB, permutation.get()); - const algorithmFPType permOOBError = - computeOOBErrorPerm(t, x, y, nRows, nFeatures, oobIndices, oobIndicesOffset, permutation.get(), ftr, nOOB, status); - DAAL_CHECK_STATUS_VAR(status); - - const algorithmFPType diff = (permOOBError - oobError); - const algorithmFPType delta = diff - varImp[ftr]; - varImp[ftr] += div1 * delta; - if (varImpVariance) - { - varImpVariance[ftr] += delta * (diff - varImp[ftr]); - } - } - } - DAAL_CHECK_STATUS_VAR(status); - } - return status; -} - -template -algorithmFPType RegressionTrainBatchKernelOneAPI::computeOOBError(const dtrees::internal::Tree & t, const algorithmFPType * x, - const algorithmFPType * y, const size_t nRows, - const size_t nFeatures, const UniversalBuffer & indices, - size_t indicesOffset, size_t n, UniversalBuffer oobBuf, - services::Status & status) -{ - typedef DFTreeConverter DFTreeConverterType; - - DAAL_ASSERT(x); - DAAL_ASSERT(y); - DAAL_ASSERT_UNIVERSAL_BUFFER(indices, int32_t, indicesOffset + n); - DAAL_ASSERT_UNIVERSAL_BUFFER(oobBuf, algorithmFPType, nRows * _nOOBProps); - - auto rowsIndHost = indices.template get().toHost(ReadWriteMode::readOnly, status); - auto oobBufHost = oobBuf.template get().toHost(ReadWriteMode::readWrite, status); - DAAL_CHECK_STATUS_RETURN_IF_FAIL(status, 
algorithmFPType(0)); - - //compute prediction error on each OOB row and get its mean online formulae (Welford) - //TODO: can be threader_for() block - - algorithmFPType mean = algorithmFPType(0); - for (size_t i = 0; i < n; i++) - { - int rowInd = rowsIndHost.get()[indicesOffset + i]; - DAAL_ASSERT(rowInd < nRows); - algorithmFPType prediction = DFTreeConverterType::TreeHelperType::predict(t, &x[rowInd * nFeatures]); - oobBufHost.get()[rowInd * 2 + 0] += prediction; - oobBufHost.get()[rowInd * 2 + 1] += algorithmFPType(1); - mean += (prediction - y[rowInd]) * (prediction - y[rowInd]); - } - - return mean / n; -} - -template -algorithmFPType RegressionTrainBatchKernelOneAPI::computeOOBErrorPerm( - const dtrees::internal::Tree & t, const algorithmFPType * x, const algorithmFPType * y, const size_t nRows, const size_t nFeatures, - const UniversalBuffer & indices, size_t indicesOffset, const int * indicesPerm, const size_t testFtrInd, size_t n, services::Status & status) -{ - typedef DFTreeConverter DFTreeConverterType; - - DAAL_ASSERT(x); - DAAL_ASSERT(y); - DAAL_ASSERT(indicesPerm); - DAAL_ASSERT(testFtrInd < nFeatures); - DAAL_ASSERT_UNIVERSAL_BUFFER(indices, int32_t, indicesOffset + n); - - auto rowsIndHost = indices.template get().toHost(ReadWriteMode::readOnly, status); - DAAL_CHECK_STATUS_RETURN_IF_FAIL(status, algorithmFPType(0)); - - TArray buf(nFeatures); - DAAL_CHECK_COND_ERROR(buf.get(), status, services::ErrorMemoryAllocationFailed); - DAAL_CHECK_STATUS_RETURN_IF_FAIL(status, algorithmFPType(0)); - - algorithmFPType mean = algorithmFPType(0); - for (size_t i = 0; i < n; i++) - { - int rowInd = rowsIndHost.get()[indicesOffset + i]; - int rowIndPerm = indicesPerm[i]; - DAAL_ASSERT(rowInd < nRows); - DAAL_ASSERT(rowIndPerm < nRows); - services::internal::tmemcpy(buf.get(), &x[rowInd * nFeatures], nFeatures); - buf[testFtrInd] = x[rowIndPerm * nFeatures + testFtrInd]; - algorithmFPType prediction = DFTreeConverterType::TreeHelperType::predict(t, buf.get()); - 
mean += (prediction - y[rowInd]) * (prediction - y[rowInd]); - } - - return mean / n; -} - -template -services::Status RegressionTrainBatchKernelOneAPI::finalizeOOBError(const algorithmFPType * y, const UniversalBuffer & oobBuf, - const size_t nRows, algorithmFPType * res, - algorithmFPType * resPerObs, algorithmFPType * resR2, - algorithmFPType * resPrediction) -{ - services::Status status; - - DAAL_ASSERT(y); - DAAL_ASSERT_UNIVERSAL_BUFFER(oobBuf, algorithmFPType, nRows * _nOOBProps); - - auto oobBufHost = oobBuf.template get().toHost(ReadWriteMode::readOnly, status); - DAAL_CHECK_STATUS_VAR(status); - - size_t nPredicted = 0; - algorithmFPType _res = algorithmFPType(0); - algorithmFPType yMean = algorithmFPType(0); - algorithmFPType sumMeanDiff = algorithmFPType(0); - - for (size_t i = 0; i < nRows; i++) - { - yMean += y[i]; - } - - for (size_t i = 0; i < nRows; i++) - { - algorithmFPType value = oobBufHost.get()[i * 2 + 0]; - algorithmFPType count = oobBufHost.get()[i * 2 + 1]; - - if (algorithmFPType(0) != count) - { - value /= count; - const algorithmFPType oobForObs = (value - y[i]) * (value - y[i]); - if (resPerObs) resPerObs[i] = oobForObs; - _res += oobForObs; - nPredicted++; - - if (resPrediction) resPrediction[i] = value; - sumMeanDiff += (y[i] - yMean) * (y[i] - yMean); - } - else - { - if (resPerObs) resPerObs[i] = algorithmFPType(-1); //was not in OOB set of any tree and hence not predicted - if (resPrediction) resPrediction[i] = algorithmFPType(0); - } - } - - if (res) *res = (0 < nPredicted) ? _res / algorithmFPType(nPredicted) : 0; - if (resR2) *resR2 = (0 < nPredicted) ? 
algorithmFPType(1) - _res / sumMeanDiff : 0; - - return status; -} - -template -services::Status RegressionTrainBatchKernelOneAPI::finalizeVarImp(const Parameter & par, algorithmFPType * varImp, - algorithmFPType * varImpVariance, size_t nFeatures) -{ - if (par.varImportance == decision_forest::training::MDA_Scaled) - { - if (par.nTrees > 1) - { - DAAL_ASSERT(varImpVariance); - const algorithmFPType div = algorithmFPType(1) / algorithmFPType(par.nTrees); - for (size_t i = 0; i < nFeatures; i++) - { - varImpVariance[i] *= div; - if (varImpVariance[i] > algorithmFPType(0)) - varImp[i] /= daal::internal::MathInst::sSqrt(varImpVariance[i] * div); - } - } - else - { - DAAL_ASSERT(varImp); - for (size_t i = 0; i < nFeatures; i++) - { - varImp[i] = algorithmFPType(0); - } - } - } - else if (par.varImportance == decision_forest::training::MDI) - { - DAAL_ASSERT(varImp); - const algorithmFPType div = algorithmFPType(1) / algorithmFPType(par.nTrees); - for (size_t i = 0; i < nFeatures; i++) varImp[i] *= div; - } - return services::Status(); -} - -/////////////////////////////////////////////////////////////////////////////////////////// -/* compute method for RegressionTrainBatchKernelOneAPI */ -/////////////////////////////////////////////////////////////////////////////////////////// -template -services::Status RegressionTrainBatchKernelOneAPI::compute(HostAppIface * pHostApp, const NumericTable * x, - const NumericTable * y, decision_forest::regression::Model & m, - Result & res, const Parameter & par) -{ - services::Status status; - - typedef DFTreeConverter DFTreeConverterType; - typedef TreeLevelRecord TreeLevel; - - _nRows = x->getNumberOfRows(); - _nFeatures = x->getNumberOfColumns(); - DAAL_CHECK_EX((par.minObservationsInLeafNode <= _int32max), ErrorIncorrectParameter, ParameterName, minObservationsInLeafNodeStr()); - DAAL_CHECK_EX((par.featuresPerNode <= _int32max), ErrorIncorrectParameter, ParameterName, featuresPerNodeStr()); - DAAL_CHECK_EX((par.maxBins <= 
_int32max), ErrorIncorrectParameter, ParameterName, maxBinsStr()); - DAAL_CHECK_EX((par.minBinSize <= _int32max), ErrorIncorrectParameter, ParameterName, minBinSizeStr()); - DAAL_CHECK_EX((par.nTrees <= _int32max), ErrorIncorrectParameter, ParameterName, nTreesStr()); - - if (_nRows > _int32max) - { - return services::Status(services::ErrorIncorrectNumberOfRowsInInputNumericTable); - } - if (_nFeatures > _int32max) - { - return services::Status(services::ErrorIncorrectNumberOfColumnsInInputNumericTable); - } - - const size_t nSelectedFeatures = par.featuresPerNode ? par.featuresPerNode : (_nFeatures > 3 ? _nFeatures / 3 : 1); - - _nSelectedRows = par.observationsPerTreeFraction * _nRows; - DAAL_CHECK_EX((_nSelectedRows > 0), ErrorIncorrectParameter, ParameterName, observationsPerTreeFractionStr()); - - _preferableLocalSizeForPartHistKernel = _preferableGroupSize; - - while (_preferableLocalSizeForPartHistKernel - > services::internal::max(nSelectedFeatures, _minPreferableLocalSizeForPartHistKernel)) - { - _preferableLocalSizeForPartHistKernel >>= 1; - } - - const bool mdaRequired(par.varImportance == decision_forest::training::MDA_Raw || par.varImportance == decision_forest::training::MDA_Scaled); - const bool oobRequired = - (par.resultsToCompute & (decision_forest::training::computeOutOfBagError | decision_forest::training::computeOutOfBagErrorPerObservation) - || mdaRequired); - - decision_forest::regression::internal::ModelImpl & mdImpl = - *static_cast(&m); - DAAL_CHECK_MALLOC(mdImpl.resize(par.nTrees)); - - services::String buildOptions = getBuildOptions(); - DAAL_CHECK_STATUS_VAR(_treeLevelBuildHelper.init(buildOptions.c_str(), TreeLevel::_nNodeSplitProps)); - - auto & context = Environment::getInstance()->getDefaultExecutionContext(); - auto & kernel_factory = context.getClKernelFactory(); - - auto & info = context.getInfoDevice(); - - DAAL_CHECK_STATUS_VAR(buildProgram(kernel_factory, "part1", df_batch_regression_kernels_part1, buildOptions.c_str())); - 
kernelComputeBestSplitSinglePass = kernel_factory.getKernel("computeBestSplitSinglePass", status); - - DAAL_CHECK_STATUS_VAR(buildProgram(kernel_factory, "part2", df_batch_regression_kernels_part2, buildOptions.c_str())); - kernelComputeBestSplitByHistogram = kernel_factory.getKernel("computeBestSplitByHistogram", status); - kernelComputePartialHistograms = kernel_factory.getKernel("computePartialHistograms", status); - kernelReducePartialHistograms = kernel_factory.getKernel("reducePartialHistograms", status); - DAAL_CHECK_STATUS_VAR(status); - - dtrees::internal::BinParams prm(par.maxBins, par.minBinSize, par.binningStrategy); - decision_forest::internal::IndexedFeaturesOneAPI indexedFeatures; - dtrees::internal::FeatureTypes featTypes; - - // init indexed features. - DAAL_CHECK_MALLOC(featTypes.init(*x)); - DAAL_CHECK_STATUS(status, (indexedFeatures.init(*const_cast(x), &featTypes, &prm))); - - _totalBins = indexedFeatures.totalBins(); - /* calculating the maximal number of bins for feature among all features */ - { - DAAL_ASSERT_UNIVERSAL_BUFFER(indexedFeatures.binOffsets(), uint32_t, _nFeatures + 1); - auto binOffsetsHost = indexedFeatures.binOffsets().template get().toHost(ReadWriteMode::readOnly, status); - DAAL_CHECK_STATUS_VAR(status); - - _nMaxBinsAmongFtrs = 0; - for (size_t i = 0; i < _nFeatures; i++) - { - auto nFtrBins = static_cast(binOffsetsHost.get()[i + 1] - binOffsetsHost.get()[i]); - _nMaxBinsAmongFtrs = (_nMaxBinsAmongFtrs < nFtrBins) ? 
nFtrBins : _nMaxBinsAmongFtrs; - } - } - - // no need to check for _nMaxBinsAmongFtrs < INT32_MAX because it will not be bigger than _nRows and _nRows was already checked - // check mul overflow for _nMaxBinsAmongFtrs * nSelectedFeatures - // and _nMaxBinsAmongFtrs * nSelectedFeatures * _nHistProps because they are used further in kernels - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, _nMaxBinsAmongFtrs, nSelectedFeatures); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, _nMaxBinsAmongFtrs * nSelectedFeatures, _nHistProps); - - // define num of trees which can be built in parallel - const size_t partHistSize = getPartHistRequiredMemSize(nSelectedFeatures, _nMaxBinsAmongFtrs); // alloc space at least for one part hist - const size_t maxMemAllocSize = services::internal::min(info.maxMemAllocSize, size_t(_maxMemAllocSizeForAlgo)); - - size_t usedMemSize = sizeof(algorithmFPType) * _nRows * (_nFeatures + 1); // input table size + response - usedMemSize += indexedFeatures.getRequiredMemSize(_nFeatures, _nRows); - usedMemSize += oobRequired ? sizeof(algorithmFPType) * _nRows * _nOOBProps : 0; - usedMemSize += partHistSize; // alloc space at least for one part hist - - size_t availableGlobalMemSize = info.globalMemSize > usedMemSize ? info.globalMemSize - usedMemSize : 0; - - size_t availableMemSizeForTreeBlock = - services::internal::min(maxMemAllocSize, static_cast(availableGlobalMemSize * _globalMemFractionForTreeBlock)); - - size_t requiredMemSizeForOneTree = - oobRequired ? 
_treeLevelBuildHelper.getOOBRowsRequiredMemSize(_nRows, 1 /* for 1 tree */, par.observationsPerTreeFraction) : 0; - requiredMemSizeForOneTree += sizeof(int32_t) * _nSelectedRows * 2; // main tree order and auxiliary one used for partitioning - - size_t treeBlock = availableMemSizeForTreeBlock / requiredMemSizeForOneTree; - - if (treeBlock <= 0) - { - // not enough memory even for one tree - return services::Status(services::ErrorMemoryAllocationFailed); - } - - treeBlock = services::internal::min(par.nTrees, treeBlock); - - availableGlobalMemSize = - availableGlobalMemSize > (treeBlock * requiredMemSizeForOneTree) ? availableGlobalMemSize - (treeBlock * requiredMemSizeForOneTree) : 0; - // size for one part hist was already reserved, add some more if there is available mem - _maxPartHistCumulativeSize = services::internal::min( - maxMemAllocSize, static_cast(partHistSize + availableGlobalMemSize * _globalMemFractionForPartHist)); - - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, _nSelectedRows, treeBlock); - daal::services::internal::TArray selectedRowsHost(_nSelectedRows * treeBlock); - DAAL_CHECK_MALLOC(selectedRowsHost.get()); - - auto treeOrderLev = context.allocate(TypeIds::id(), _nSelectedRows * treeBlock, status); - DAAL_CHECK_STATUS_VAR(status); - auto treeOrderLevBuf = context.allocate(TypeIds::id(), _nSelectedRows * treeBlock, status); - DAAL_CHECK_STATUS_VAR(status); - - BlockDescriptor dataBlock; - DAAL_CHECK_STATUS_VAR(const_cast(x)->getBlockOfRows(0, _nRows, readOnly, dataBlock)); - - /* blocks for varImp MDI calculation */ - bool mdiRequired = (par.varImportance == decision_forest::training::MDI); - auto nodeImpDecreaseList = context.allocate(TypeIds::id(), 1, status); // holder will be reallocated in loop - DAAL_CHECK_STATUS_VAR(status); - BlockDescriptor varImpBlock; - NumericTablePtr varImpResPtr = res.get(variableImportance); - - if (mdiRequired || mdaRequired) - { - DAAL_CHECK_STATUS_VAR(varImpResPtr->getBlockOfRows(0, 1, writeOnly, 
varImpBlock)); - context.fill(varImpBlock.getBuffer(), (algorithmFPType)0, status); - DAAL_CHECK_STATUS_VAR(status); - } - - /* blocks for OutOfBag error calculation */ - UniversalBuffer oobBufferPerObs; - if (oobRequired) - { - // oobBufferPerObs contains pair for all out of bag observations for all trees - oobBufferPerObs = context.allocate(TypeIds::id(), _nRows * _nOOBProps, status); - DAAL_CHECK_STATUS_VAR(status); - context.fill(oobBufferPerObs, algorithmFPType(0), status); - DAAL_CHECK_STATUS_VAR(status); - } - - /* blocks for MDA scaled error calculation */ - bool mdaScaledRequired = (par.varImportance == decision_forest::training::MDA_Scaled); - daal::services::internal::TArrayCalloc varImpVariance; // for now it is calculated on host - if (mdaScaledRequired) - { - varImpVariance.reset(_nFeatures); - } - - /*init engines*/ - engines::internal::ParallelizationTechnique technique = engines::internal::family; - selectParallelizationTechnique(par, technique); - engines::internal::Params params(par.nTrees); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, par.nTrees - 1, par.nTrees); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, (par.nTrees - 1) * par.nTrees, _nRows); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, (par.nTrees - 1) * par.nTrees * _nRows, (par.featuresPerNode + 1)); - for (size_t i = 0; i < par.nTrees; i++) - { - params.nSkip[i] = i * par.nTrees * _nRows * (par.featuresPerNode + 1); - } - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, par.nTrees, sizeof(engines::EnginePtr)); - daal::services::internal::TArray engines(par.nTrees); - engines::internal::EnginesCollection enginesCollection(par.engine, technique, params, engines, &status); - DAAL_CHECK_STATUS_VAR(status); - daal::services::internal::TArray enginesBaseImpl(par.nTrees); - for (size_t treeIndex = 0; treeIndex < par.nTrees; treeIndex++) - { - enginesBaseImpl[treeIndex] = dynamic_cast(engines[treeIndex].get()); - if (!enginesBaseImpl[treeIndex]) return Status(ErrorEngineNotSupported); - } 
- - for (size_t iter = 0; (iter < par.nTrees) && !algorithms::internal::isCancelled(status, pHostApp); iter += treeBlock) - { - size_t nTrees = services::internal::min(par.nTrees - iter, treeBlock); - - BlockDescriptor responseBlock; - DAAL_CHECK_STATUS_VAR(const_cast(y)->getBlockOfRows(0, _nRows, readOnly, responseBlock)); - - size_t nNodes = nTrees; // num of potential nodes to split on current tree level - auto oobRowsNumList = context.allocate(TypeIds::id(), nTrees + 1, status); - DAAL_CHECK_STATUS_VAR(status); - - Collection DFTreeRecords; - Collection levelNodeLists; // lists of nodes int props(rowsOffset, rows, ftrId, ftrVal ... ) - Collection levelNodeImpLists; // list of nodes fptype props (impurity, mean) - UniversalBuffer oobRows; - - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, nNodes, TreeLevel::_nNodeSplitProps); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, nNodes, TreeLevel::_nNodeImpProps); - auto nodeVsTreeMap = context.allocate(TypeIds::id(), nNodes, status); - DAAL_CHECK_STATUS_VAR(status); - levelNodeLists.push_back(context.allocate(TypeIds::id(), nNodes * TreeLevel::_nNodeSplitProps, status)); - DAAL_CHECK_STATUS_VAR(status); - levelNodeImpLists.push_back(context.allocate(TypeIds::id(), nNodes * TreeLevel::_nNodeImpProps, status)); - DAAL_CHECK_STATUS_VAR(status); - - { - auto treeMap = nodeVsTreeMap.template get().toHost(ReadWriteMode::writeOnly, status); - DAAL_CHECK_STATUS_VAR(status); - - auto rootNode = levelNodeLists[0].template get().toHost(ReadWriteMode::writeOnly, status); - DAAL_CHECK_STATUS_VAR(status); - for (size_t node = 0; node < nNodes; node++) - { - treeMap.get()[node] = static_cast(iter + node); // check for par.nTrees less than int32 was done at the beggining - rootNode.get()[node * TreeLevel::_nNodeSplitProps + 0] = _nSelectedRows * node; // rows offset - rootNode.get()[node * TreeLevel::_nNodeSplitProps + 1] = _nSelectedRows; // num of rows - } - } - - if (par.bootstrap) - { - DAAL_ITTNOTIFY_SCOPED_TASK(compute.RNG); - 
- for (size_t node = 0; node < nNodes; node++) - { - daal::internal::RNGsInst rng; - rng.uniform(_nSelectedRows, selectedRowsHost.get() + _nSelectedRows * node, enginesBaseImpl[iter + node]->getState(), 0, _nRows); - } - - context.copy(treeOrderLev, 0, (void *)selectedRowsHost.get(), _nSelectedRows * nNodes, 0, _nSelectedRows * nNodes, status); - DAAL_CHECK_STATUS_VAR(status); - } - else - { - DAAL_CHECK_STATUS_VAR(_treeLevelBuildHelper.initializeTreeOrder(_nSelectedRows, nTrees, treeOrderLev)); - } - - if (oobRequired) - { - _treeLevelBuildHelper.getOOBRows(treeOrderLev, _nSelectedRows, nTrees, oobRowsNumList, - oobRows); // oobRowsNumList and oobRows are the output - } - - for (size_t level = 0; nNodes > 0; level++) - { - auto nodeList = levelNodeLists[level]; - auto impList = levelNodeImpLists[level]; - - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, (nNodes + 1), nSelectedFeatures); - daal::services::internal::TArray selectedFeaturesHost( - (nNodes + 1) * nSelectedFeatures); // first part is used features indices, +1 - part for generator - DAAL_CHECK_MALLOC(selectedFeaturesHost.get()); - - auto selectedFeaturesCom = context.allocate(TypeIds::id(), nNodes * nSelectedFeatures, status); - DAAL_CHECK_STATUS_VAR(status); - - if (nSelectedFeatures != _nFeatures) - { - daal::internal::RNGsInst rng; - auto treeMap = nodeVsTreeMap.template get().toHost(ReadWriteMode::readOnly, status); - DAAL_CHECK_STATUS_VAR(status); - - for (size_t node = 0; node < nNodes; node++) - { - rng.uniformWithoutReplacement(nSelectedFeatures, selectedFeaturesHost.get() + node * nSelectedFeatures, - selectedFeaturesHost.get() + (node + 1) * nSelectedFeatures, - enginesBaseImpl[treeMap.get()[node]]->getState(), 0, _nFeatures); - } - } - else - { - for (size_t node = 0; node < nNodes; node++) - { - for (size_t i = 0; i < nSelectedFeatures; i++) - { - selectedFeaturesHost.get()[node * nSelectedFeatures + i] = i; - } - } - } - - context.copy(selectedFeaturesCom, 0, (void 
*)selectedFeaturesHost.get(), nSelectedFeatures * nNodes, 0, nSelectedFeatures * nNodes, - status); - DAAL_CHECK_STATUS_VAR(status); - - if (mdiRequired) - { - nodeImpDecreaseList = context.allocate(TypeIds::id(), nNodes, status); - DAAL_CHECK_STATUS_VAR(status); - } - - DAAL_CHECK_STATUS_VAR(computeBestSplit(indexedFeatures.getFullData(), treeOrderLev, selectedFeaturesCom, nSelectedFeatures, - responseBlock.getBuffer(), nodeList, indexedFeatures.binOffsets(), impList, nodeImpDecreaseList, - mdiRequired, _nFeatures, nNodes, par.minObservationsInLeafNode, par.impurityThreshold)); - - if (par.maxTreeDepth > 0 && par.maxTreeDepth == level) - { - DAAL_CHECK_STATUS_VAR(_treeLevelBuildHelper.convertSplitToLeaf(nodeList, nNodes)); - TreeLevel levelRecord; - DAAL_CHECK_STATUS_VAR(levelRecord.init(nodeList, impList, nNodes)); - DFTreeRecords.push_back(levelRecord); - break; - } - - TreeLevel levelRecord; - DAAL_CHECK_STATUS_VAR(levelRecord.init(nodeList, impList, nNodes)); - DFTreeRecords.push_back(levelRecord); - - if (mdiRequired) - { - /*mdi is calculated only on split nodes and not calculated on last level*/ - auto varImpBuffer = varImpBlock.getBuffer(); - DAAL_CHECK_STATUS_VAR(_treeLevelBuildHelper.updateMDIVarImportance(nodeList, nodeImpDecreaseList, nNodes, varImpBuffer, _nFeatures)); - } - - size_t nNodesNewLevel; - DAAL_CHECK_STATUS_VAR(_treeLevelBuildHelper.getNumOfSplitNodes(nodeList, nNodes, nNodesNewLevel)); - - if (nNodesNewLevel) - { - /*there are split nodes -> next level is required*/ - nNodesNewLevel *= 2; - - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, nNodesNewLevel, TreeLevel::_nNodeSplitProps); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, nNodesNewLevel, (TreeLevel::_nNodeImpProps)); - auto nodeListNewLevel = context.allocate(TypeIds::id(), nNodesNewLevel * TreeLevel::_nNodeSplitProps, status); - DAAL_CHECK_STATUS_VAR(status); - auto nodeVsTreeMapNew = context.allocate(TypeIds::id(), nNodesNewLevel, status); - DAAL_CHECK_STATUS_VAR(status); - auto 
impListNewLevel = context.allocate(TypeIds::id(), nNodesNewLevel * TreeLevel::_nNodeImpProps, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_CHECK_STATUS_VAR( - _treeLevelBuildHelper.doNodesSplit(nodeList, nNodes, nodeListNewLevel, nNodesNewLevel, nodeVsTreeMap, nodeVsTreeMapNew)); - - levelNodeLists.push_back(nodeListNewLevel); - levelNodeImpLists.push_back(impListNewLevel); - - nodeVsTreeMap = nodeVsTreeMapNew; - - DAAL_CHECK_STATUS_VAR(_treeLevelBuildHelper.doLevelPartition(indexedFeatures.getFullData(), nodeList, nNodes, treeOrderLev, - treeOrderLevBuf, _nSelectedRows, _nFeatures)); - } - - nNodes = nNodesNewLevel; - } // for level - - DFTreeConverterType converter; - typename DFTreeConverterType::TreeHelperType mTreeHelper(nTrees); - - services::Collection > binValuesHost(_nFeatures); - DAAL_CHECK_MALLOC(binValuesHost.data()); - services::Collection binValues(_nFeatures); - DAAL_CHECK_MALLOC(binValues.data()); - - for (size_t i = 0; i < _nFeatures; i++) - { - DAAL_ASSERT_UNIVERSAL_BUFFER(indexedFeatures.binBorders(i), algorithmFPType, indexedFeatures.numIndices(i)); - binValuesHost[i] = indexedFeatures.binBorders(i).template get().toHost(ReadWriteMode::readOnly, status); - DAAL_CHECK_STATUS_VAR(status); - binValues[i] = binValuesHost[i].get(); - } - - DAAL_CHECK_STATUS_VAR(converter.convertToDFDecisionTree(DFTreeRecords, binValues.data(), mTreeHelper)); - - for (size_t tree = 0; tree < nTrees; tree++) - { - mdImpl.add(mTreeHelper._tree_list[tree], 0 /*nClasses*/, iter + tree); - - DAAL_CHECK_STATUS_VAR(computeResults(mTreeHelper._tree_list[tree], dataBlock.getBlockPtr(), responseBlock.getBlockPtr(), _nSelectedRows, - _nFeatures, oobRows, oobRowsNumList, oobBufferPerObs, varImpBlock.getBlockPtr(), - varImpVariance.get(), iter + tree + 1, engines[iter + tree], nTrees, tree, par)); - } - - DAAL_CHECK_STATUS_VAR(const_cast(y)->releaseBlockOfRows(responseBlock)); - } - - /* Finalize results */ - if (par.resultsToCompute - & 
(decision_forest::training::computeOutOfBagError | decision_forest::training::computeOutOfBagErrorPerObservation - | decision_forest::training::computeOutOfBagErrorR2 | decision_forest::training::computeOutOfBagErrorPrediction)) - { - BlockDescriptor responseBlock; - DAAL_CHECK_STATUS_VAR(const_cast(y)->getBlockOfRows(0, _nRows, readOnly, responseBlock)); - - NumericTablePtr oobErrPtr = res.get(outOfBagError); - BlockDescriptor oobErrBlock; - if (par.resultsToCompute & decision_forest::training::computeOutOfBagError) - DAAL_CHECK_STATUS_VAR(oobErrPtr->getBlockOfRows(0, 1, writeOnly, oobErrBlock)); - - NumericTablePtr oobErrR2Ptr = res.get(outOfBagErrorR2); - BlockDescriptor oobErrR2Block; - if (par.resultsToCompute & decision_forest::training::computeOutOfBagErrorR2) - DAAL_CHECK_STATUS_VAR(oobErrR2Ptr->getBlockOfRows(0, 1, writeOnly, oobErrR2Block)); - - NumericTablePtr oobErrPerObsPtr = res.get(outOfBagErrorPerObservation); - BlockDescriptor oobErrPerObsBlock; - if (par.resultsToCompute & decision_forest::training::computeOutOfBagErrorPerObservation) - DAAL_CHECK_STATUS_VAR(oobErrPerObsPtr->getBlockOfRows(0, _nRows, writeOnly, oobErrPerObsBlock)); - - NumericTablePtr oobErrPredictionPtr = res.get(outOfBagErrorPrediction); - BlockDescriptor oobErrPredictionBlock; - if (par.resultsToCompute & decision_forest::training::computeOutOfBagErrorPrediction) - DAAL_CHECK_STATUS_VAR(oobErrPredictionPtr->getBlockOfRows(0, _nRows, writeOnly, oobErrPredictionBlock)); - - DAAL_CHECK_STATUS_VAR(finalizeOOBError(responseBlock.getBlockPtr(), oobBufferPerObs, _nRows, oobErrBlock.getBlockPtr(), - oobErrPerObsBlock.getBlockPtr(), oobErrR2Block.getBlockPtr(), oobErrPredictionBlock.getBlockPtr())); - - if (oobErrPtr) DAAL_CHECK_STATUS_VAR(oobErrPtr->releaseBlockOfRows(oobErrBlock)); - - if (oobErrPerObsPtr) DAAL_CHECK_STATUS_VAR(oobErrPerObsPtr->releaseBlockOfRows(oobErrPerObsBlock)); - - DAAL_CHECK_STATUS_VAR(const_cast(y)->releaseBlockOfRows(responseBlock)); - } - - if 
(par.varImportance != decision_forest::training::none && par.varImportance != decision_forest::training::MDA_Raw) - { - DAAL_CHECK_STATUS_VAR(finalizeVarImp(par, varImpBlock.getBlockPtr(), varImpVariance.get(), _nFeatures)); - } - - if (mdiRequired || mdaRequired) DAAL_CHECK_STATUS_VAR(varImpResPtr->releaseBlockOfRows(varImpBlock)); - - DAAL_CHECK_STATUS_VAR(const_cast(x)->releaseBlockOfRows(dataBlock)); - - return status; -} // namespace internal - -} // namespace internal -} // namespace training -} // namespace regression -} // namespace decision_forest -} // namespace algorithms -} /* namespace daal */ - -#endif diff --git a/cpp/daal/src/algorithms/dtrees/forest/regression/oneapi/df_regression_tree_helper_impl.i b/cpp/daal/src/algorithms/dtrees/forest/regression/oneapi/df_regression_tree_helper_impl.i deleted file mode 100644 index 24ddea9b68f..00000000000 --- a/cpp/daal/src/algorithms/dtrees/forest/regression/oneapi/df_regression_tree_helper_impl.i +++ /dev/null @@ -1,202 +0,0 @@ -/* file: df_regression_tree_helper_impl.i */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -/* -//++ -// Implementation of the class defining the decision forest regression tree -//-- -*/ - -#ifndef __DF_REGRESSION_TREE_HELPER_IMPL__ -#define __DF_REGRESSION_TREE_HELPER_IMPL__ - -//#include "data_management/data/aos_numeric_table.h" -#include "src/services/service_arrays.h" -#include "src/algorithms/dtrees/dtrees_predict_dense_default_impl.i" - -namespace daal -{ -namespace algorithms -{ -namespace decision_forest -{ -namespace regression -{ -namespace internal -{ -using namespace daal::algorithms::dtrees::internal; -using namespace daal::services::internal; - -template -class RegressionTreeHelperOneAPI -{ -public: - typedef dtrees::internal::TreeImpRegression<> TreeType; - typedef typename TreeType::NodeType NodeType; - - RegressionTreeHelperOneAPI() = delete; - explicit RegressionTreeHelperOneAPI(size_t nTrees) : _allocator(_cNumNodesHint) { _tree_list.reset(nTrees); } - ~RegressionTreeHelperOneAPI() {} - - typename NodeType::Leaf * makeLeaf(size_t n, algorithmFPType response, algorithmFPType impurity) - { - typename NodeType::Leaf * pNode = _allocator.allocLeaf(0); - DAAL_ASSERT(n > 0); - pNode->response = response; - pNode->count = n; - pNode->impurity = impurity; - - return pNode; - } - - typename NodeType::Split * makeSplit(size_t n, size_t iFeature, algorithmFPType featureValue, bool bUnordered, algorithmFPType impurity, - typename NodeType::Base * left, typename NodeType::Base * right) - { - typename NodeType::Split * pNode = _allocator.allocSplit(); - pNode->set(iFeature, featureValue, bUnordered); - pNode->kid[0] = left; - pNode->kid[1] = right; - pNode->impurity = impurity; - pNode->count = n; - - return pNode; - } - - static algorithmFPType predict(const dtrees::internal::Tree & t, const algorithmFPType * x) - { - const typename NodeType::Base * pNode = dtrees::prediction::internal::findNode(t, x); - DAAL_ASSERT(pNode); - return pNode ? 
NodeType::castLeaf(pNode)->response : 0.0; - } - - static const size_t _cNumNodesHint = 512; //number of nodes as a hint for allocator to grow by - TreeType::Allocator _allocator; - TArray _tree_list; -}; - -template -struct TreeLevelRecord -{ - TreeLevelRecord() : _isInitialized(false), _nNodes(0) {} - services::Status init(services::internal::sycl::UniversalBuffer & nodeList, services::internal::sycl::UniversalBuffer & impInfo, size_t nNodes) - { - services::Status status; - - _nNodes = nNodes; - DAAL_ASSERT_UNIVERSAL_BUFFER(nodeList, int32_t, nNodes * _nNodeSplitProps); - DAAL_ASSERT_UNIVERSAL_BUFFER(impInfo, algorithmFPType, nNodes * _nNodeImpProps); - - auto nodeListHost = nodeList.template get().toHost(ReadWriteMode::readOnly, status); - auto impInfoHost = impInfo.template get().toHost(ReadWriteMode::readOnly, status); - DAAL_CHECK_STATUS_VAR(status); - - _nodeList = nodeListHost; - _impInfo = impInfoHost; - - _isInitialized = true; - - return status; - } - - bool isInitialized() const { return _isInitialized; } - size_t getNodesNum() { return _nNodes; } - int getRowsNum(size_t nodeIdx) { return _nodeList.get()[nodeIdx * _nNodeSplitProps + 1]; } - int getFtrIdx(size_t nodeIdx) { return _nodeList.get()[nodeIdx * _nNodeSplitProps + 2]; } - int getFtrVal(size_t nodeIdx) { return _nodeList.get()[nodeIdx * _nNodeSplitProps + 3]; } - algorithmFPType getImpurity(size_t nodeIdx) { return _impInfo.get()[nodeIdx * _nNodeImpProps + 0]; } - algorithmFPType getResponse(size_t nodeIdx) { return _impInfo.get()[nodeIdx * _nNodeImpProps + 1]; } - bool hasUnorderedFtr(size_t nodeIdx) { return false; } - - constexpr static int _nNodeImpProps = 2; - constexpr static int _nNodeSplitProps = 5; - - SharedPtr _nodeList; - SharedPtr _impInfo; - size_t _nNodes; - - bool _isInitialized; -}; - -template -struct DFTreeConverter -{ - typedef RegressionTreeHelperOneAPI TreeHelperType; - - services::Status convertToDFDecisionTree(Collection > & treeLevelsList, algorithmFPType ** binValues, 
- TreeHelperType & treeBuilder) - { - services::Status status; - typedef TArray DFTreeNodesArr; - typedef SharedPtr DFTreeNodesArrPtr; - - DFTreeNodesArrPtr dfTreeLevelNodesPrev; - bool unorderedFeaturesUsed = false; - const int notFoundVal = -1; - - size_t level = treeLevelsList.size(); - DAAL_ASSERT(level); - - do - { - level--; - TreeLevelRecord & record = treeLevelsList[level]; - DAAL_ASSERT(record.isInitialized()); - - DFTreeNodesArrPtr dfTreeLevelNodes(new DFTreeNodesArr(record.getNodesNum())); - DAAL_CHECK_MALLOC(dfTreeLevelNodes.get()); - DAAL_CHECK_MALLOC(dfTreeLevelNodes->get()); - - size_t nSplits = 0; - // nSplits is used to calculate index of child nodes on next level - for (size_t nodeIdx = 0; nodeIdx < record.getNodesNum(); nodeIdx++) - { - if (record.getFtrIdx(nodeIdx) == notFoundVal) - { - // leaf node - dfTreeLevelNodes->get()[nodeIdx] = - treeBuilder.makeLeaf(record.getRowsNum(nodeIdx), record.getResponse(nodeIdx), record.getImpurity(nodeIdx)); - } - else - { - DAAL_ASSERT(dfTreeLevelNodesPrev->get()); - //split node - dfTreeLevelNodes->get()[nodeIdx] = treeBuilder.makeSplit( - record.getRowsNum(nodeIdx), record.getFtrIdx(nodeIdx), binValues[record.getFtrIdx(nodeIdx)][record.getFtrVal(nodeIdx)], - static_cast(record.hasUnorderedFtr(nodeIdx)), record.getImpurity(nodeIdx), dfTreeLevelNodesPrev->get()[nSplits * 2], - dfTreeLevelNodesPrev->get()[nSplits * 2 + 1]); - nSplits++; - } - } - - dfTreeLevelNodesPrev = dfTreeLevelNodes; - } while (level > 0); - - for (size_t tree = 0; tree < treeBuilder._tree_list.size(); tree++) - { - treeBuilder._tree_list[tree].reset(dfTreeLevelNodesPrev->get()[tree], unorderedFeaturesUsed); - } - return status; - } -}; - -} // namespace internal -} // namespace regression -} // namespace decision_forest -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/dtrees/gbt/BUILD b/cpp/daal/src/algorithms/dtrees/gbt/BUILD index 9d717dae310..79b1dbdf11c 100644 --- 
a/cpp/daal/src/algorithms/dtrees/gbt/BUILD +++ b/cpp/daal/src/algorithms/dtrees/gbt/BUILD @@ -7,10 +7,9 @@ load("@onedal//dev/bazel:daal.bzl", "daal_module") daal_module( name = "kernel", auto = True, - opencl = True, deps = [ "@onedal//cpp/daal:core", - "@onedal//cpp/daal:sycl", + "@onedal//cpp/daal:engines", "@onedal//cpp/daal/src/algorithms/dtrees:kernel", ], ) diff --git a/cpp/daal/src/algorithms/dtrees/gbt/classification/BUILD b/cpp/daal/src/algorithms/dtrees/gbt/classification/BUILD index 9321c6d4cb5..3534ab94a62 100644 --- a/cpp/daal/src/algorithms/dtrees/gbt/classification/BUILD +++ b/cpp/daal/src/algorithms/dtrees/gbt/classification/BUILD @@ -4,10 +4,9 @@ load("@onedal//dev/bazel:daal.bzl", "daal_module") daal_module( name = "kernel", auto = True, - opencl = True, deps = [ "@onedal//cpp/daal:core", - "@onedal//cpp/daal:sycl", + "@onedal//cpp/daal:engines", "@onedal//cpp/daal/src/algorithms/classifier:kernel", "@onedal//cpp/daal/src/algorithms/dtrees/gbt:kernel", "@onedal//cpp/daal/src/algorithms/dtrees/gbt/regression:kernel", diff --git a/cpp/daal/src/algorithms/dtrees/gbt/oneapi/cl_kernels/gbt_common_kernels.cl b/cpp/daal/src/algorithms/dtrees/gbt/oneapi/cl_kernels/gbt_common_kernels.cl deleted file mode 100644 index dd4839044eb..00000000000 --- a/cpp/daal/src/algorithms/dtrees/gbt/oneapi/cl_kernels/gbt_common_kernels.cl +++ /dev/null @@ -1,234 +0,0 @@ -/* file: gbt_kernels.cl */ -/******************************************************************************* -* Copyright 2019 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of GBT OpenCL kernels. -//-- -*/ - -#ifndef __GBT_KERNELS_CL__ -#define __GBT_KERNELS_CL__ - -#include - -#define DECLARE_SOURCE(name, src) static const char * name = #src; - -DECLARE_SOURCE( - gbt_common_kernels, - - __kernel void extractColumn(const __global algorithmFPType * data, __global algorithmFPType * values, __global int * indices, - unsigned int featureId, unsigned int nFeatures, unsigned int nRows) { - const unsigned int id = get_global_id(0); - values[id] = data[id * nFeatures + featureId]; - indices[id] = id; - } - - uint __attribute__((overloadable)) invBits(uint x) { - return x ^ (-(x >> 31) | 0x80000000u); - // return x ^ 0x80000000u; - } - - ulong __attribute__((overloadable)) invBits(ulong x) { - return x ^ (-(x >> 63) | 0x8000000000000000ul); - // return x ^ 0x8000000000000000u; - } - - __kernel void radixScan(const __global radixIntType * values, __global int * partialHists, unsigned int nRows, unsigned int bitOffset) { - const unsigned int RADIX_BITS = 4; - - const unsigned int n_groups = get_num_groups(0); - const unsigned int n_sub_groups = get_num_sub_groups(); - const unsigned int n_total_sub_groups = n_sub_groups * n_groups; - const unsigned int nElementsForSubgroup = nRows / n_total_sub_groups + !!(nRows % n_total_sub_groups); - const unsigned int local_size = get_sub_group_size(); - - const unsigned int id = get_local_id(0); - const unsigned int local_id = get_sub_group_local_id(); - const unsigned int sub_group_id = get_sub_group_id(); - const unsigned int group_id = get_group_id(0) * n_sub_groups + sub_group_id; - - unsigned int iStart = group_id * nElementsForSubgroup; - unsigned int iEnd = (group_id + 1) * nElementsForSubgroup; - - if (iEnd > nRows) - { - iEnd = nRows; - } - - unsigned int offset[1 << RADIX_BITS]; - 
const unsigned int radix_range = 1 << RADIX_BITS; - const unsigned int radix_range_1 = radix_range - 1; - for (unsigned int i = 0; i < radix_range; i++) - { - offset[i] = 0; - } - - for (unsigned int i = iStart + local_id; i < iEnd; i += local_size) - { - radixIntType data_bits = ((invBits(values[i]) >> bitOffset) & radix_range_1); - for (unsigned int j = 0; j < radix_range; j++) - { - unsigned int value = data_bits == j; - unsigned int partial_offset = sub_group_reduce_add(value); - offset[j] += partial_offset; - } - } - - if (local_id == 0) - { - for (unsigned int j = 0; j < radix_range; j++) - { - partialHists[group_id * radix_range + j] = offset[j]; - } - } - } - - __kernel void radixHistScan(const __global int * partialHists, __global int * partialPrefixHists, unsigned int nSubgroupSums) { - const unsigned int RADIX_BITS = 4; - - if (get_sub_group_id() > 0) return; - - const unsigned int local_size = get_sub_group_size(); - const unsigned int local_id = get_sub_group_local_id(); - - unsigned int offset[1 << RADIX_BITS]; - const unsigned int radix_range = 1 << RADIX_BITS; - for (unsigned int i = 0; i < radix_range; i++) - { - offset[i] = 0; - } - - for (unsigned int i = local_id; i < nSubgroupSums; i += local_size) - { - for (unsigned int j = 0; j < radix_range; j++) - { - unsigned int value = partialHists[i * radix_range + j]; - unsigned int boundary = sub_group_scan_exclusive_add(value); - partialPrefixHists[i * radix_range + j] = offset[j] + boundary; - unsigned int partial_offset = sub_group_reduce_add(value); - offset[j] += partial_offset; - } - } - - if (local_id == 0) - { - unsigned int totalSum = 0; - for (unsigned int j = 0; j < radix_range; j++) - { - partialPrefixHists[nSubgroupSums * radix_range + j] = totalSum; - totalSum += offset[j]; - } - } - } - - __kernel void radixReorder(const __global radixIntType * valuesSrc, const __global int * indicesSrc, const __global int * partialPrefixHists, - __global radixIntType * valuesDst, __global int * 
indicesDst, unsigned int nRows, unsigned int bitOffset) { - const unsigned int RADIX_BITS = 4; - - const unsigned int n_groups = get_num_groups(0); - const unsigned int n_sub_groups = get_num_sub_groups(); - const unsigned int n_total_sub_groups = n_sub_groups * n_groups; - const unsigned int nElementsForSubgroup = nRows / n_total_sub_groups + !!(nRows % n_total_sub_groups); - const unsigned int local_size = get_sub_group_size(); - - const unsigned int id = get_local_id(0); - const unsigned int local_id = get_sub_group_local_id(); - const unsigned int sub_group_id = get_sub_group_id(); - const unsigned int group_id = get_group_id(0) * n_sub_groups + sub_group_id; - - unsigned int iStart = group_id * nElementsForSubgroup; - unsigned int iEnd = (group_id + 1) * nElementsForSubgroup; - - if (iEnd > nRows) - { - iEnd = nRows; - } - - unsigned int offset[1 << RADIX_BITS]; - - const unsigned int radix_range = 1 << RADIX_BITS; - const unsigned int radix_range_1 = radix_range - 1; - - for (unsigned int i = 0; i < radix_range; i++) - { - offset[i] = partialPrefixHists[group_id * radix_range + i] + partialPrefixHists[n_total_sub_groups * radix_range + i]; - } - - for (unsigned int i = iStart + local_id; i < iEnd; i += local_size) - { - radixIntType data_value = valuesSrc[i]; - radixIntType data_bits = ((invBits(data_value) >> bitOffset) & radix_range_1); - unsigned int pos_new = 0; - for (unsigned int j = 0; j < radix_range; j++) - { - unsigned int value = data_bits == j; - unsigned int boundary = sub_group_scan_exclusive_add(value); - pos_new |= value * (offset[j] + boundary); - unsigned int partial_offset = sub_group_reduce_add(value); - offset[j] = offset[j] + partial_offset; - } - valuesDst[pos_new] = data_value; - indicesDst[pos_new] = indicesSrc[i]; - } - } - - __kernel void collectBinBorders(const __global algorithmFPType * values, const __global int * binOffsets, __global algorithmFPType * binBorders) { - const unsigned int id = get_global_id(0); - binBorders[id] = 
values[binOffsets[id]]; - } - - __kernel void computeBins(const __global algorithmFPType * values, const __global int * indices, const __global algorithmFPType * binBorders, - __global int * bins, unsigned int nRows, unsigned int nBins) { - const unsigned int n_groups = get_num_groups(0); - const unsigned int n_sub_groups = get_num_sub_groups(); - const unsigned int n_total_sub_groups = n_sub_groups * n_groups; - const unsigned int nElementsForSubgroup = nRows / n_total_sub_groups + !!(nRows % n_total_sub_groups); - const unsigned int local_size = get_sub_group_size(); - - const unsigned int id = get_local_id(0); - const unsigned int local_id = get_sub_group_local_id(); - const unsigned int sub_group_id = get_sub_group_id(); - const unsigned int group_id = get_group_id(0) * n_sub_groups + sub_group_id; - - unsigned int iStart = group_id * nElementsForSubgroup; - unsigned int iEnd = (group_id + 1) * nElementsForSubgroup; - - if (iEnd > nRows) - { - iEnd = nRows; - } - - unsigned int curBin = 0; - - for (unsigned int i = iStart + local_id; i < iEnd; i += local_size) - { - algorithmFPType value = values[i]; - while (binBorders[curBin] < value) curBin++; - bins[indices[i]] = curBin; - } - } - - __kernel void storeColumn(const __global int * data, __global int * fullData, unsigned int featureId, unsigned int nFeatures, - unsigned int nRows) { - const unsigned int id = get_global_id(0); - fullData[id * nFeatures + featureId] = data[id]; - } - -); - -#endif diff --git a/cpp/daal/src/algorithms/dtrees/gbt/oneapi/gbt_feature_type_helper_oneapi.h b/cpp/daal/src/algorithms/dtrees/gbt/oneapi/gbt_feature_type_helper_oneapi.h deleted file mode 100644 index 0d9c8866e38..00000000000 --- a/cpp/daal/src/algorithms/dtrees/gbt/oneapi/gbt_feature_type_helper_oneapi.h +++ /dev/null @@ -1,176 +0,0 @@ -/* file: gbt_feature_type_helper_oneapi.h */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the 
Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of a service class that provides optimal access to the feature types -//-- -*/ - -#ifndef __GBT_FEATURE_TYPE_HELPER_ONEAPI_H__ -#define __GBT_FEATURE_TYPE_HELPER_ONEAPI_H__ - -#include "src/algorithms/dtrees/dtrees_feature_type_helper.h" -#include "src/algorithms/dtrees/gbt/oneapi/gbt_feature_type_helper_oneapi.h" -#include "src/threading/threading.h" -#include "src/algorithms/service_error_handling.h" -#include "src/algorithms/service_sort.h" -#include "src/algorithms/dtrees/service_array.h" -#include "src/externals/service_memory.h" -#include "src/services/service_data_utils.h" -#include "src/data_management/service_numeric_table.h" - -#include "src/algorithms/dtrees/gbt/oneapi/cl_kernels/gbt_common_kernels.cl" - -#include "services/internal/sycl/execution_context.h" -#include "services/internal/sycl/types.h" - -namespace daal -{ -namespace algorithms -{ -namespace gbt -{ -namespace internal -{ -////////////////////////////////////////////////////////////////////////////////////////// -// IndexedFeatures. 
Creates and stores index of every feature -// Sorts every feature and creates the mapping: features value -> index of the value -// in the sorted array of unique values of the feature in increasing order -////////////////////////////////////////////////////////////////////////////////////////// -template -class IndexedFeaturesOneAPI -{ -public: - typedef int IndexType; // TODO: should be unsigned int - - struct FeatureEntry - { - DAAL_NEW_DELETE(); - IndexType numIndices = 0; //number of indices or bins - IndexType offset = 0; - services::internal::sycl::UniversalBuffer binBorders; //right bin borders - - services::Status allocBorders(); - ~FeatureEntry(); - }; - -public: - IndexedFeaturesOneAPI() : _data(), _entries(nullptr), _sizeOfIndex(sizeof(IndexType)), _nCols(0), _nRows(0), _capacity(0), _maxNumIndices(0) {} - ~IndexedFeaturesOneAPI(); - - services::Status init(NumericTable & nt, const dtrees::internal::FeatureTypes * featureTypes, const dtrees::internal::BinParams * pBinPrm); - - //get max number of indices for that feature - IndexType numIndices(size_t iCol) const { return _entries[iCol].numIndices; } - - IndexType totalBins() const { return _totalBins; } - - services::internal::sycl::UniversalBuffer & binBorders(size_t iCol) const { return _entries[iCol].binBorders; } - - services::internal::sycl::UniversalBuffer & binOffsets() { return _binOffsets; } - - services::internal::sycl::UniversalBuffer & getFullData() { return _fullData; } - - //for low-level optimization - const services::internal::sycl::UniversalBuffer & getFeature(size_t iFeature) const { return _data[iFeature]; } - - size_t nRows() const { return _nRows; } - size_t nCols() const { return _nCols; } - -protected: - services::Status alloc(uint32_t nCols, uint32_t nRows); - - services::Status extractColumn(const services::internal::Buffer & data, services::internal::sycl::UniversalBuffer & values, - services::internal::sycl::UniversalBuffer & indices, uint32_t featureId, uint32_t nFeatures, 
uint32_t nRows); - - services::Status collectBinBorders(services::internal::sycl::UniversalBuffer & values, services::internal::sycl::UniversalBuffer & binOffsets, - services::internal::sycl::UniversalBuffer & binBorders, uint32_t nRows, uint32_t maxBins); - - services::Status computeBins(services::internal::sycl::UniversalBuffer & values, services::internal::sycl::UniversalBuffer & indices, - services::internal::sycl::UniversalBuffer & binBorders, services::internal::sycl::UniversalBuffer & bins, - uint32_t nRows, uint32_t nBins, uint32_t maxBins, uint32_t localSize, uint32_t nLocalBlocks); - - services::Status computeBins(services::internal::sycl::UniversalBuffer & values, services::internal::sycl::UniversalBuffer & indices, - services::internal::sycl::UniversalBuffer & bins, FeatureEntry & entry, uint32_t nRows, - const dtrees::internal::BinParams * pBinPrm); - - services::Status makeIndex(const services::internal::Buffer & data, uint32_t featureId, uint32_t nFeatures, uint32_t nRows, - const dtrees::internal::BinParams * pBinPrm, services::internal::sycl::UniversalBuffer & bins, FeatureEntry & entry); - - services::Status storeColumn(const services::internal::sycl::UniversalBuffer & data, services::internal::sycl::UniversalBuffer & fullData, - uint32_t featureId, uint32_t nFeatures, uint32_t nRows); - -protected: - services::Collection _data; - services::internal::sycl::UniversalBuffer _fullData; - services::internal::sycl::UniversalBuffer _binOffsets; - FeatureEntry * _entries; - uint32_t _sizeOfIndex; - uint32_t _nRows; - uint32_t _nCols; - uint32_t _capacity; - uint32_t _maxNumIndices; - IndexType _totalBins; - - services::internal::sycl::UniversalBuffer _values; - services::internal::sycl::UniversalBuffer _values_buf; - services::internal::sycl::UniversalBuffer _indices; - services::internal::sycl::UniversalBuffer _indices_buf; - - const uint32_t _maxWorkItemsPerGroup = 128; // should be a power of two for interal needs - const uint32_t _maxLocalBuffer = 
30000; // should be less than a half of local memory (two buffers) - const uint32_t _preferableSubGroup = 16; // preferable maximal sub-group size - const uint32_t _radixBits = 4; -}; - -class TreeNodeStorage -{ -public: - TreeNodeStorage() {} - - services::internal::sycl::UniversalBuffer & getHistograms() { return _histogramsForFeatures; } - - void clear() { _histogramsForFeatures = services::internal::sycl::UniversalBuffer(); } - - template - services::Status allocate(const gbt::internal::IndexedFeaturesOneAPI & indexedFeatures); - -private: - services::internal::sycl::UniversalBuffer _histogramsForFeatures; -}; - -template -struct BestSplitOneAPI -{ - BestSplitOneAPI(); - - algorithmFPType _impurityDecrease; - int32_t _featureIndex; - int32_t _featureValue; - algorithmFPType _leftGTotal; - algorithmFPType _leftHTotal; - algorithmFPType _rightGTotal; - algorithmFPType _rightHTotal; -}; - -} /* namespace internal */ -} // namespace gbt -} /* namespace algorithms */ -} /* namespace daal */ - -#endif diff --git a/cpp/daal/src/algorithms/dtrees/gbt/oneapi/gbt_feature_type_helper_oneapi.i b/cpp/daal/src/algorithms/dtrees/gbt/oneapi/gbt_feature_type_helper_oneapi.i deleted file mode 100644 index 2e74fc4ee32..00000000000 --- a/cpp/daal/src/algorithms/dtrees/gbt/oneapi/gbt_feature_type_helper_oneapi.i +++ /dev/null @@ -1,471 +0,0 @@ -/* file: gbt_feature_type_helper_oneapi.i */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// GPU-dependent initialization of service data structure -//-- -*/ -#include "src/algorithms/dtrees/dtrees_feature_type_helper.h" - -#include "src/services/service_data_utils.h" -#include "src/sycl/sorter.h" -#include "src/externals/service_profiler.h" - -using namespace daal::services::internal::sycl; - -namespace daal -{ -namespace algorithms -{ -namespace gbt -{ -namespace internal -{ -template -struct GetIntegerTypeForFPType; - -template <> -struct GetIntegerTypeForFPType -{ - using Type = uint32_t; -}; - -template <> -struct GetIntegerTypeForFPType -{ - using Type = uint64_t; -}; - -template -services::String getOpenCLKeyType(const services::String & typeName); - -template <> -inline services::String getOpenCLKeyType(const services::String & typeName) -{ - return services::String(" -D ") + typeName + services::String("=uint "); -} - -template <> -inline services::String getOpenCLKeyType(const services::String & typeName) -{ - return services::String(" -D ") + typeName + services::String("=ulong "); -} - -template -static services::Status buildProgram(ClKernelFactoryIface & factory) -{ - services::Status status; - - DAAL_ITTNOTIFY_SCOPED_TASK(compute.buildProgram); - - auto fptype_name = getKeyFPType(); - auto radixtype_name = getOpenCLKeyType::Type>("radixIntType"); - auto build_options = fptype_name + radixtype_name; - build_options.add("-cl-std=CL1.2"); - - services::String cachekey("__daal_algorithms_gbt_common_"); - cachekey.add(fptype_name); - cachekey.add(radixtype_name); - factory.build(ExecutionTargetIds::device, cachekey.c_str(), gbt_common_kernels, build_options.c_str(), status); - - return status; -} - -template -IndexedFeaturesOneAPI::~IndexedFeaturesOneAPI() -{ - delete[] _entries; - _entries = nullptr; -} - -template 
-IndexedFeaturesOneAPI::FeatureEntry::~FeatureEntry() -{} - -template -services::Status IndexedFeaturesOneAPI::FeatureEntry::allocBorders() -{ - auto & context = services::internal::getDefaultContext(); - services::Status status; - - binBorders = context.allocate(TypeIds::id(), numIndices, status); - return status; -} - -template -services::Status IndexedFeaturesOneAPI::alloc(uint32_t nC, uint32_t nR) -{ - auto & context = services::internal::getDefaultContext(); - services::Status status; - - if (!_data.resize(nC)) - { - return services::throwIfPossible(services::ErrorMemoryAllocationFailed); - } - - for (uint32_t i = 0; i < nC; i++) - { - _data[i] = context.allocate(TypeId::uint32, nR, status); - DAAL_CHECK_STATUS_VAR(status); - } - - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(uint32_t, nR, nC); - _fullData = context.allocate(TypeId::uint32, nR * nC, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_OVERFLOW_CHECK_BY_ADDING(uint32_t, nC, 1); - _binOffsets = context.allocate(TypeId::uint32, nC + 1, status); - DAAL_CHECK_STATUS_VAR(status); - - _entries = new FeatureEntry[nC]; - DAAL_CHECK_MALLOC(_entries); - _nCols = nC; - _nRows = nR; - _totalBins = 0; - return services::Status(); -} - -template -services::Status IndexedFeaturesOneAPI::extractColumn(const services::internal::Buffer & data, - UniversalBuffer & values, UniversalBuffer & indices, uint32_t featureId, - uint32_t nFeatures, uint32_t nRows) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(indexedFeatures.extractColumn); - - services::Status status; - - auto & context = services::internal::getDefaultContext(); - auto & factory = context.getClKernelFactory(); - DAAL_CHECK_STATUS_VAR(buildProgram(factory)); - - auto kernel = factory.getKernel("extractColumn", status); - DAAL_CHECK_STATUS_VAR(status); - - { - DAAL_ASSERT_UNIVERSAL_BUFFER(UniversalBuffer(data), algorithmFPType, nRows * nFeatures); - DAAL_ASSERT_UNIVERSAL_BUFFER(values, algorithmFPType, nRows); - DAAL_ASSERT_UNIVERSAL_BUFFER(indices, int, nRows); - - 
KernelArguments args(6, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, data, AccessModeIds::read); - args.set(1, values, AccessModeIds::write); - args.set(2, indices, AccessModeIds::write); - args.set(3, featureId); - args.set(4, nFeatures); - args.set(5, nRows); - - KernelRange global_range(nRows); - - context.run(global_range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - } - return status; -} - -template -services::Status IndexedFeaturesOneAPI::collectBinBorders(UniversalBuffer & values, UniversalBuffer & binOffsets, - UniversalBuffer & binBorders, uint32_t nRows, uint32_t maxBins) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(indexedFeatures.collectBinBorders); - - services::Status status; - - auto & context = services::internal::getDefaultContext(); - auto & factory = context.getClKernelFactory(); - status |= buildProgram(factory); - DAAL_CHECK_STATUS_VAR(status); - - auto kernel = factory.getKernel("collectBinBorders", status); - DAAL_CHECK_STATUS_VAR(status); - - { - DAAL_ASSERT_UNIVERSAL_BUFFER(values, algorithmFPType, nRows); - DAAL_ASSERT_UNIVERSAL_BUFFER(binOffsets, int, maxBins); - DAAL_ASSERT_UNIVERSAL_BUFFER(binBorders, algorithmFPType, maxBins); - - KernelArguments args(3, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, values, AccessModeIds::read); - args.set(1, binOffsets, AccessModeIds::read); - args.set(2, binBorders, AccessModeIds::write); - - KernelRange global_range(maxBins); - - context.run(global_range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -template -services::Status IndexedFeaturesOneAPI::computeBins(UniversalBuffer & values, UniversalBuffer & indices, - UniversalBuffer & binBorders, UniversalBuffer & bins, uint32_t nRows, - uint32_t nBins, uint32_t maxBins, uint32_t localSize, uint32_t nLocalBlocks) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(indexedFeatures.computeBins); - - services::Status status; - - auto & context = services::internal::getDefaultContext(); - auto & factory = 
context.getClKernelFactory(); - status |= buildProgram(factory); - DAAL_CHECK_STATUS_VAR(status); - - auto kernel = factory.getKernel("computeBins", status); - DAAL_CHECK_STATUS_VAR(status); - - { - DAAL_ASSERT_UNIVERSAL_BUFFER(values, algorithmFPType, nRows); - DAAL_ASSERT_UNIVERSAL_BUFFER(indices, int, nRows); - DAAL_ASSERT_UNIVERSAL_BUFFER(binBorders, algorithmFPType, maxBins); - DAAL_ASSERT_UNIVERSAL_BUFFER(bins, uint32_t, nRows); - - KernelArguments args(6, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, values, AccessModeIds::read); - args.set(1, indices, AccessModeIds::read); - args.set(2, binBorders, AccessModeIds::read); - args.set(3, bins, AccessModeIds::write); - args.set(4, nRows); - args.set(5, nBins); - - KernelRange local_range(localSize); - KernelRange global_range(localSize * nLocalBlocks); - - KernelNDRange range(1); - range.global(global_range, status); - DAAL_CHECK_STATUS_VAR(status); - range.local(local_range, status); - DAAL_CHECK_STATUS_VAR(status); - - context.run(range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -template -services::Status IndexedFeaturesOneAPI::computeBins(UniversalBuffer & values, UniversalBuffer & indices, UniversalBuffer & bins, - FeatureEntry & entry, uint32_t nRows, - const dtrees::internal::BinParams * pBinPrm) -{ - services::Status status; - - auto & context = services::internal::getDefaultContext(); - - const uint32_t maxBins = pBinPrm->maxBins < nRows ? pBinPrm->maxBins : nRows; - const uint32_t localSize = _preferableSubGroup; - const uint32_t nLocalBlocks = 1024 * localSize < nRows ? 
1024 : (nRows / localSize) + !!(nRows % localSize); - - auto binOffsets = context.allocate(TypeIds::id(), maxBins, status); - DAAL_CHECK_STATUS_VAR(status); - auto binBorders = context.allocate(TypeIds::id(), maxBins, status); - DAAL_CHECK_STATUS_VAR(status); - - { - DAAL_ASSERT_UNIVERSAL_BUFFER(binOffsets, int, maxBins); - auto binOffsetsHost = binOffsets.template get().toHost(ReadWriteMode::writeOnly, status); - DAAL_CHECK_STATUS_VAR(status); - int offset = 0; - for (int i = 0; i < maxBins; i++) - { - offset += (nRows + i) / maxBins; - binOffsetsHost.get()[i] = offset - 1; - } - } - - DAAL_CHECK_STATUS_VAR(collectBinBorders(values, binOffsets, binBorders, nRows, maxBins)); - - uint32_t nBins = 0; - { - DAAL_ASSERT_UNIVERSAL_BUFFER(binBorders, algorithmFPType, maxBins); - auto binBordersHost = binBorders.template get().toHost(ReadWriteMode::readWrite, status); - DAAL_CHECK_STATUS_VAR(status); - for (uint32_t i = 0; i < maxBins; i++) - { - if (nBins == 0 || (nBins > 0 && binBordersHost.get()[i] != binBordersHost.get()[nBins - 1])) - { - binBordersHost.get()[nBins] = binBordersHost.get()[i]; - nBins++; - } - } - } - - DAAL_CHECK_STATUS_VAR(computeBins(values, indices, binBorders, bins, nRows, nBins, maxBins, localSize, nLocalBlocks)); - - entry.numIndices = nBins; - entry.binBorders = binBorders; - - return status; -} - -template -services::Status IndexedFeaturesOneAPI::makeIndex(const services::internal::Buffer & data, uint32_t featureId, - uint32_t nFeatures, uint32_t nRows, const dtrees::internal::BinParams * pBinPrm, - UniversalBuffer & bins, FeatureEntry & entry) -{ - DAAL_CHECK_STATUS_VAR(extractColumn(data, _values, _indices, featureId, nFeatures, nRows)); - DAAL_CHECK_STATUS_VAR(sort::RadixSort::sortIndices(_values, _indices, _values_buf, _indices_buf, nRows)); - DAAL_CHECK_STATUS_VAR(computeBins(_values, _indices, bins, entry, nRows, pBinPrm)); - return services::Status(); -} - -template -services::Status IndexedFeaturesOneAPI::storeColumn(const 
UniversalBuffer & data, UniversalBuffer & fullData, uint32_t featureId, - uint32_t nFeatures, uint32_t nRows) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(indexedFeatures.storeColumn); - - services::Status status; - - auto & context = services::internal::getDefaultContext(); - auto & factory = context.getClKernelFactory(); - status |= buildProgram(factory); - DAAL_CHECK_STATUS_VAR(status); - - auto kernel = factory.getKernel("storeColumn", status); - DAAL_CHECK_STATUS_VAR(status); - - { - DAAL_ASSERT_UNIVERSAL_BUFFER(data, uint32_t, nRows); - DAAL_ASSERT_UNIVERSAL_BUFFER(fullData, uint32_t, nRows * nFeatures); - - KernelArguments args(5, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, data, AccessModeIds::read); - args.set(1, fullData, AccessModeIds::write); - args.set(2, featureId); - args.set(3, nFeatures); - args.set(4, nRows); - - KernelRange global_range(nRows); - - context.run(global_range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -template -services::Status IndexedFeaturesOneAPI::init(NumericTable & nt, const dtrees::internal::FeatureTypes * featureTypes, - const dtrees::internal::BinParams * pBinPrm) -{ - dtrees::internal::FeatureTypes autoFT; - if (!featureTypes) - { - DAAL_CHECK_MALLOC(autoFT.init(nt)); - featureTypes = &autoFT; - } - - if (nt.getNumberOfRows() > static_cast(UINT_MAX) || nt.getNumberOfColumns() > static_cast(UINT_MAX)) - { - return Status(ErrorBufferSizeIntegerOverflow); - } - - const uint32_t nC = static_cast(nt.getNumberOfColumns()); - const uint32_t nR = static_cast(nt.getNumberOfRows()); - - _maxNumIndices = 0; - services::Status status = alloc(nC, nR); - DAAL_CHECK_STATUS_VAR(status); - - auto & context = services::internal::getDefaultContext(); - - _values = context.allocate(TypeIds::id(), nR, status); - DAAL_CHECK_STATUS_VAR(status); - _values_buf = context.allocate(TypeIds::id(), nR, status); - DAAL_CHECK_STATUS_VAR(status); - - _indices = context.allocate(TypeIds::id(), nR, status); - 
DAAL_CHECK_STATUS_VAR(status); - _indices_buf = context.allocate(TypeIds::id(), nR, status); - DAAL_CHECK_STATUS_VAR(status); - - BlockDescriptor dataBlock; - - if (nt.getDataLayout() == NumericTableIface::soa) - { - for (uint32_t i = 0; i < nC; i++) - { - DAAL_CHECK_STATUS_VAR(nt.getBlockOfColumnValues(i, 0, nR, readOnly, dataBlock)); - auto dataBuffer = dataBlock.getBuffer(); - DAAL_CHECK_STATUS_VAR(makeIndex(dataBuffer, 0, 1, nR, pBinPrm, _data[i], _entries[i])); - DAAL_CHECK_STATUS_VAR(nt.releaseBlockOfColumnValues(dataBlock)); - } - } - else - { - DAAL_CHECK_STATUS_VAR(nt.getBlockOfRows(0, nR, readOnly, dataBlock)); - auto dataBuffer = dataBlock.getBuffer(); - for (uint32_t i = 0; i < nC; i++) - { - DAAL_CHECK_STATUS_VAR(makeIndex(dataBuffer, i, nC, nR, pBinPrm, _data[i], _entries[i])); - } - DAAL_CHECK_STATUS_VAR(nt.releaseBlockOfRows(dataBlock)); - } - - { - DAAL_ASSERT_UNIVERSAL_BUFFER(_binOffsets, uint32_t, nC + 1); - auto binOffsetsHost = _binOffsets.template get().toHost(ReadWriteMode::writeOnly, status); - DAAL_CHECK_STATUS_VAR(status); - int total = 0; - for (uint32_t i = 0; i < nC; i++) - { - DAAL_CHECK_STATUS_VAR(storeColumn(_data[i], _fullData, i, nC, nR)); - binOffsetsHost.get()[i] = total; - _entries[i].offset = total; - total += _entries[i].numIndices; - } - binOffsetsHost.get()[nC] = total; - _totalBins = total; - } - - return status; -} - -template -services::Status TreeNodeStorage::allocate(const gbt::internal::IndexedFeaturesOneAPI & indexedFeatures) -{ - services::Status status; - auto & context = services::internal::getDefaultContext(); - - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(uint32_t, indexedFeatures.totalBins(), 2); - _histogramsForFeatures = context.allocate(TypeIds::id(), indexedFeatures.totalBins() * 2, status); - - return status; -} - -template -BestSplitOneAPI::BestSplitOneAPI() - : _impurityDecrease(-services::internal::MaxVal::get()), - _featureIndex(-1), - _featureValue(0), - _leftGTotal(0.0), - _leftHTotal(0.0), - 
_rightGTotal(0.0), - _rightHTotal(0.0) -{} - -} /* namespace internal */ -} // namespace gbt -} /* namespace algorithms */ -} /* namespace daal */ diff --git a/cpp/daal/src/algorithms/dtrees/gbt/regression/BUILD b/cpp/daal/src/algorithms/dtrees/gbt/regression/BUILD index af094a02cca..85e8695d1b8 100644 --- a/cpp/daal/src/algorithms/dtrees/gbt/regression/BUILD +++ b/cpp/daal/src/algorithms/dtrees/gbt/regression/BUILD @@ -5,12 +5,11 @@ load("@onedal//dev/bazel:dal.bzl", "dal_test_suite") daal_module( name = "kernel", auto = False, - hdrs = glob(["**/*.h", "**/*.i", "**/*.cl"]), + hdrs = glob(["**/*.h", "**/*.i"]), srcs = glob(["*.cpp"]), - opencl = True, deps = [ "@onedal//cpp/daal:core", - "@onedal//cpp/daal:sycl", + "@onedal//cpp/daal:engines", "@onedal//cpp/daal/src/algorithms/dtrees/gbt:kernel", "@onedal//cpp/daal/src/algorithms/regression:kernel", "@onedal//cpp/daal/src/algorithms/classifier:kernel", diff --git a/cpp/daal/src/algorithms/dtrees/gbt/regression/gbt_regression_predict_dense_default_batch_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/dtrees/gbt/regression/gbt_regression_predict_dense_default_batch_fpt_dispatcher.cpp index 60bb0a35018..e386d3fe553 100644 --- a/cpp/daal/src/algorithms/dtrees/gbt/regression/gbt_regression_predict_dense_default_batch_fpt_dispatcher.cpp +++ b/cpp/daal/src/algorithms/dtrees/gbt/regression/gbt_regression_predict_dense_default_batch_fpt_dispatcher.cpp @@ -29,7 +29,7 @@ namespace daal { namespace algorithms { -__DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL(gbt::regression::prediction::BatchContainer, batch, DAAL_FPTYPE, gbt::regression::prediction::defaultDense) +__DAAL_INSTANTIATE_DISPATCH_CONTAINER(gbt::regression::prediction::BatchContainer, batch, DAAL_FPTYPE, gbt::regression::prediction::defaultDense) namespace gbt { namespace regression diff --git a/cpp/daal/src/algorithms/dtrees/gbt/regression/gbt_regression_train_container.h b/cpp/daal/src/algorithms/dtrees/gbt/regression/gbt_regression_train_container.h index 
42117c9e769..63ae4f70c72 100644 --- a/cpp/daal/src/algorithms/dtrees/gbt/regression/gbt_regression_train_container.h +++ b/cpp/daal/src/algorithms/dtrees/gbt/regression/gbt_regression_train_container.h @@ -28,7 +28,6 @@ #include "algorithms/gradient_boosted_trees/gbt_regression_training_types.h" #include "algorithms/gradient_boosted_trees/gbt_regression_training_batch.h" #include "src/algorithms/dtrees/gbt/regression/gbt_regression_train_kernel.h" -#include "src/algorithms/dtrees/gbt/regression/oneapi/gbt_regression_train_kernel_oneapi.h" #include "src/algorithms/dtrees/gbt/regression/gbt_regression_model_impl.h" #include "src/services/service_algo_utils.h" @@ -49,17 +48,7 @@ namespace training template BatchContainer::BatchContainer(daal::services::Environment::env * daalEnv) { - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - - if (deviceInfo.isCpu) - { - __DAAL_INITIALIZE_KERNELS(internal::RegressionTrainBatchKernel, algorithmFPType, method); - } - else - { - __DAAL_INITIALIZE_KERNELS_SYCL(internal::RegressionTrainBatchKernelOneAPI, algorithmFPType, method); - } + __DAAL_INITIALIZE_KERNELS(internal::RegressionTrainBatchKernel, algorithmFPType, method); } template @@ -79,9 +68,6 @@ BatchContainer::~BatchContainer() template services::Status BatchContainer::compute() { - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - Input * input = static_cast(_in); Result * result = static_cast(_res); @@ -95,16 +81,8 @@ services::Status BatchContainer::compute() daal::algorithms::engines::internal::BatchBaseImpl * engine = dynamic_cast(par->engine.get()); - if (deviceInfo.isCpu) - { - __DAAL_CALL_KERNEL(env, internal::RegressionTrainBatchKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, - daal::services::internal::hostApp(*input), x, y, *m, *result, *par, *engine); - } - else - { - __DAAL_CALL_KERNEL_SYCL(env, 
internal::RegressionTrainBatchKernelOneAPI, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, - daal::services::internal::hostApp(*input), x, y, *m, *result, *par, *engine); - } + __DAAL_CALL_KERNEL(env, internal::RegressionTrainBatchKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, + daal::services::internal::hostApp(*input), x, y, *m, *result, *par, *engine); } template diff --git a/cpp/daal/src/algorithms/dtrees/gbt/regression/gbt_regression_train_dense_default_batch_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/dtrees/gbt/regression/gbt_regression_train_dense_default_batch_fpt_dispatcher.cpp index cdb0b3760df..625dafb93a3 100644 --- a/cpp/daal/src/algorithms/dtrees/gbt/regression/gbt_regression_train_dense_default_batch_fpt_dispatcher.cpp +++ b/cpp/daal/src/algorithms/dtrees/gbt/regression/gbt_regression_train_dense_default_batch_fpt_dispatcher.cpp @@ -27,7 +27,7 @@ namespace daal { namespace algorithms { -__DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL(gbt::regression::training::BatchContainer, batch, DAAL_FPTYPE, gbt::regression::training::defaultDense) +__DAAL_INSTANTIATE_DISPATCH_CONTAINER(gbt::regression::training::BatchContainer, batch, DAAL_FPTYPE, gbt::regression::training::defaultDense) namespace gbt { diff --git a/cpp/daal/src/algorithms/dtrees/gbt/regression/gbt_regression_train_dense_default_batch_oneapi_fpt.cpp b/cpp/daal/src/algorithms/dtrees/gbt/regression/gbt_regression_train_dense_default_batch_oneapi_fpt.cpp deleted file mode 100644 index 0b616782f10..00000000000 --- a/cpp/daal/src/algorithms/dtrees/gbt/regression/gbt_regression_train_dense_default_batch_oneapi_fpt.cpp +++ /dev/null @@ -1,46 +0,0 @@ -/* file: gbt_regression_train_dense_default_batch_oneapi_fpt.cpp */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of gradient boosted trees regression training functions for the default method -//-- -*/ - -#include "src/algorithms/dtrees/gbt/regression/oneapi/gbt_regression_train_kernel_oneapi.h" -#include "src/algorithms/dtrees/gbt/regression/oneapi/gbt_regression_train_dense_default_oneapi_impl.i" - -namespace daal -{ -namespace algorithms -{ -namespace gbt -{ -namespace regression -{ -namespace training -{ -namespace internal -{ -template class RegressionTrainBatchKernelOneAPI; -} - -} // namespace training -} // namespace regression -} // namespace gbt -} // namespace algorithms -} // namespace daal diff --git a/cpp/daal/src/algorithms/dtrees/gbt/regression/oneapi/cl_kernels/gbt_batch_regression_kernels.cl b/cpp/daal/src/algorithms/dtrees/gbt/regression/oneapi/cl_kernels/gbt_batch_regression_kernels.cl deleted file mode 100644 index aa0e014bad0..00000000000 --- a/cpp/daal/src/algorithms/dtrees/gbt/regression/oneapi/cl_kernels/gbt_batch_regression_kernels.cl +++ /dev/null @@ -1,371 +0,0 @@ -/* file: gbt_kernels.cl */ -/******************************************************************************* -* Copyright 2019 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of GBT Batch Regression OpenCL kernels. -//-- -*/ - -#ifndef __GBT_BATCH_REGRESSION_KERNELS_CL__ -#define __GBT_BATCH_REGRESSION_KERNELS_CL__ - -#include - -#define DECLARE_SOURCE(name, src) static const char * name = #src; - -DECLARE_SOURCE( - gbt_batch_regression_kernels, - - __kernel void scan(const __global algorithmFPType * values, __global algorithmFPType * partialSums, unsigned int nRows) { - const unsigned int n_groups = get_num_groups(0); - const unsigned int n_sub_groups = get_num_sub_groups(); - const unsigned int n_total_sub_groups = n_sub_groups * n_groups; - const unsigned int nElementsForSubgroup = nRows / n_total_sub_groups + !!(nRows % n_total_sub_groups); - const unsigned int local_size = get_sub_group_size(); - - const unsigned int id = get_local_id(0); - const unsigned int local_id = get_sub_group_local_id(); - const unsigned int sub_group_id = get_sub_group_id(); - const unsigned int group_id = get_group_id(0) * n_sub_groups + sub_group_id; - - unsigned int iStart = group_id * nElementsForSubgroup; - unsigned int iEnd = (group_id + 1) * nElementsForSubgroup; - - if (iEnd > nRows) - { - iEnd = nRows; - } - - algorithmFPType sum = 0.0; - - for (unsigned int i = iStart + local_id; i < iEnd; i += local_size) - { - algorithmFPType partial_sum = sub_group_reduce_add(values[i]); - sum += partial_sum; - } - - if (local_id == 0) - { - partialSums[group_id] = sum; - } - } - - __kernel void reduce(const __global algorithmFPType * 
partialSums, __global algorithmFPType * totalSum, unsigned int nSubgroupSums) { - if (get_sub_group_id() > 0) return; - - const unsigned int local_size = get_sub_group_size(); - const unsigned int local_id = get_sub_group_local_id(); - - algorithmFPType sum = 0.0; - - for (unsigned int i = local_id; i < nSubgroupSums; i += local_size) - { - algorithmFPType partial_sum = sub_group_reduce_add(partialSums[i]); - sum += partial_sum; - } - - if (local_id == 0) - { - totalSum[0] = sum; - } - } - - __kernel void computeOptCoeffs(const __global algorithmFPType * labels, const __global algorithmFPType * response, - __global algorithmFPType * optCoeffs) { - const unsigned int id = get_global_id(0); - optCoeffs[2 * id + 0] = response[id] - labels[id]; - optCoeffs[2 * id + 1] = 1; - } - - __kernel void initializeTreeOrder(__global int * treeOrder) { - const unsigned int id = get_global_id(0); - treeOrder[id] = id; - } - - __kernel void computePartialHistograms(const __global int * data, const __global int * treeOrder, const __global algorithmFPType * optCoeffs, - __global algorithmFPType * partialHistograms, unsigned int offset, unsigned int nRows, - const __global int * binOffsets, unsigned int nTotalBins, unsigned int nFeatures) { - const unsigned int feat_id = get_local_id(1); - const unsigned int global_id = get_global_id(0); - const unsigned int global_size = get_global_size(0); - const unsigned int nElementsForGroup = nRows / global_size + !!(nRows % global_size); - - unsigned int iStart = global_id * nElementsForGroup; - unsigned int iEnd = (global_id + 1) * nElementsForGroup; - - if (iEnd > nRows) - { - iEnd = nRows; - } - - __global algorithmFPType * histogram = partialHistograms + nTotalBins * global_id * 2 + binOffsets[feat_id] * 2; - - unsigned int nBins = binOffsets[feat_id + 1] - binOffsets[feat_id]; - - for (unsigned int i = 0; i < 2 * nBins; i++) - { - histogram[i] = 0.0; - } - - for (unsigned int i = iStart; i < iEnd; i++) - { - unsigned int id = 
treeOrder[offset + i]; - unsigned int bin = data[id * nFeatures + feat_id]; - histogram[bin * 2 + 0] += optCoeffs[id * 2 + 0]; - histogram[bin * 2 + 1] += optCoeffs[id * 2 + 1]; - } - } - - __kernel void reducePartialHistograms(const __global algorithmFPType * partialHistograms, __global algorithmFPType * histogram, - unsigned int nHistograms, unsigned int nTotalBins) { - __local algorithmFPType buf[256 * 2]; - - const unsigned int bin_id = get_global_id(0); - const unsigned int local_id = get_local_id(1); - const unsigned int local_size = get_local_size(1); - - buf[local_id * 2 + 0] = 0; - buf[local_id * 2 + 1] = 0; - - for (unsigned int i = local_id; i < nHistograms; i += local_size) - { - buf[local_id * 2 + 0] += partialHistograms[i * nTotalBins * 2 + bin_id * 2 + 0]; - buf[local_id * 2 + 1] += partialHistograms[i * nTotalBins * 2 + bin_id * 2 + 1]; - } - - for (unsigned int offset = local_size / 2; offset > 0; offset >>= 1) - { - barrier(CLK_LOCAL_MEM_FENCE); - if (local_id < offset) - { - buf[local_id * 2 + 0] += buf[(local_id + offset) * 2 + 0]; - buf[local_id * 2 + 1] += buf[(local_id + offset) * 2 + 1]; - } - } - - if (local_id == 0) - { - histogram[bin_id * 2 + 0] = buf[0]; - histogram[bin_id * 2 + 1] = buf[1]; - } - } - - __kernel void computeHistogramDiff(const __global algorithmFPType * histogramSrc, const __global algorithmFPType * histogramTotal, - __global algorithmFPType * histogramDst) { - const unsigned int id = get_global_id(0); - histogramDst[id * 2 + 0] = histogramTotal[id * 2 + 0] - histogramSrc[id * 2 + 0]; - histogramDst[id * 2 + 1] = histogramTotal[id * 2 + 1] - histogramSrc[id * 2 + 1]; - } - - __kernel void computeTotalOptCoeffs(const __global algorithmFPType * histogram, __global algorithmFPType * totalOptCoeffs, - const __global int * binOffsets, unsigned int nTotalBins) { - if (get_sub_group_id() > 0) return; - - const unsigned int feat_id = get_global_id(1); - const unsigned int local_id = get_sub_group_local_id(); - const unsigned 
int local_size = get_sub_group_size(); - - algorithmFPType g = 0.0; - algorithmFPType h = 0.0; - - unsigned int nBins = binOffsets[feat_id + 1] - binOffsets[feat_id]; - - const __global algorithmFPType * histogramForFeature = histogram + binOffsets[feat_id] * 2; - - for (unsigned int i = local_id; i < nBins; i += local_size) - { - g += sub_group_reduce_add(histogramForFeature[i * 2 + 0]); - h += sub_group_reduce_add(histogramForFeature[i * 2 + 1]); - } - - if (feat_id == 0 && local_id == 0) - { - totalOptCoeffs[0] = g; - totalOptCoeffs[1] = h; - } - } - - algorithmFPType impurityValue(algorithmFPType g, algorithmFPType h, algorithmFPType lambda) { return (g / (h + lambda)) * g; } - - __kernel void computeBestSplitForFeatures(const __global algorithmFPType * histogram, const __global algorithmFPType * totalOptCoeffs, - __global algorithmFPType * splitInfo, __global int * splitValue, const __global int * binOffsets, - unsigned int nTotalBins, algorithmFPType lambda) { - if (get_sub_group_id() > 0) return; - - const unsigned int feat_id = get_global_id(1); - - const unsigned int local_id = get_sub_group_local_id(); - const unsigned int local_size = get_sub_group_size(); - - int curFeatureValue = -1; - algorithmFPType curImpDec = -1e30; - algorithmFPType curGLeft = 0.0; - algorithmFPType curHLeft = 0.0; - - algorithmFPType g = 0.0; - algorithmFPType h = 0.0; - - const __global algorithmFPType * histogramForFeature = histogram + binOffsets[feat_id] * 2; - __global algorithmFPType * splitInfoForFeature = splitInfo + feat_id * 5; - __global int * splitValueForFeature = splitValue + feat_id; - unsigned int nBins = binOffsets[feat_id + 1] - binOffsets[feat_id]; - - for (unsigned int i = local_id; i < nBins; i += local_size) - { - algorithmFPType gLeft = g + sub_group_scan_inclusive_add(histogramForFeature[i * 2 + 0]); - algorithmFPType hLeft = h + sub_group_scan_inclusive_add(histogramForFeature[i * 2 + 1]); - algorithmFPType gRight = totalOptCoeffs[0] - gLeft; - 
algorithmFPType hRight = totalOptCoeffs[1] - hLeft; - - algorithmFPType impDec = impurityValue(gLeft, hLeft, lambda) + impurityValue(gRight, hRight, lambda); - - if (curFeatureValue == -1 || impDec > curImpDec) - { - curFeatureValue = i; - curImpDec = impDec; - curGLeft = gLeft; - curHLeft = hLeft; - } - - g += sub_group_reduce_add(histogramForFeature[i * 2 + 0]); - h += sub_group_reduce_add(histogramForFeature[i * 2 + 1]); - } - - algorithmFPType bestImpDec = sub_group_reduce_max(curImpDec); - int bestFeatureValue = sub_group_reduce_min(bestImpDec == curImpDec ? curFeatureValue : nBins); - - if (curFeatureValue == bestFeatureValue) - { - splitValueForFeature[0] = curFeatureValue == nBins ? -1 : curFeatureValue; - splitInfoForFeature[0] = curImpDec; - splitInfoForFeature[1] = curGLeft; - splitInfoForFeature[2] = curHLeft; - splitInfoForFeature[3] = totalOptCoeffs[0] - curGLeft; - splitInfoForFeature[4] = totalOptCoeffs[1] - curHLeft; - } - } - - __kernel void partitionScan(const __global int * data, const __global int * treeOrder, __global int * partialSums, int splitValue, - unsigned int offset, unsigned int nRows) { - const unsigned int n_groups = get_num_groups(0); - const unsigned int n_sub_groups = get_num_sub_groups(); - const unsigned int n_total_sub_groups = n_sub_groups * n_groups; - const unsigned int nElementsForSubgroup = nRows / n_total_sub_groups + !!(nRows % n_total_sub_groups); - const unsigned int local_size = get_sub_group_size(); - - const unsigned int id = get_local_id(0); - const unsigned int local_id = get_sub_group_local_id(); - const unsigned int sub_group_id = get_sub_group_id(); - const unsigned int group_id = get_group_id(0) * n_sub_groups + sub_group_id; - - unsigned int iStart = group_id * nElementsForSubgroup; - unsigned int iEnd = (group_id + 1) * nElementsForSubgroup; - - if (iEnd > nRows) - { - iEnd = nRows; - } - - unsigned int sum = 0; - - for (unsigned int i = iStart + local_id; i < iEnd; i += local_size) - { - unsigned int value 
= (unsigned int)(data[treeOrder[offset + i]] > splitValue); - sum += sub_group_reduce_add(value); - } - - if (local_id == 0) - { - partialSums[group_id] = sum; - } - } - - __kernel void partitionSumScan(const __global int * partialSums, __global int * partialPrefixSums, __global int * totalSum, - unsigned int nSubgroupSums) { - if (get_sub_group_id() > 0) return; - - const unsigned int local_size = get_sub_group_size(); - const unsigned int local_id = get_sub_group_local_id(); - - unsigned int sum = 0; - - for (unsigned int i = local_id; i < nSubgroupSums; i += local_size) - { - unsigned int value = partialSums[i]; - unsigned int boundary = sub_group_scan_exclusive_add(value); - partialPrefixSums[i] = sum + boundary; - sum += sub_group_reduce_add(value); - } - - if (local_id == 0) - { - totalSum[0] = sum; - partialPrefixSums[nSubgroupSums] = sum; - } - } - - __kernel void partitionReorder(const __global int * data, const __global int * treeOrder, __global int * treeOrderBuf, - const __global int * partialPrefixSums, int splitValue, unsigned int offset, unsigned int nRows) { - const unsigned int n_groups = get_num_groups(0); - const unsigned int n_sub_groups = get_num_sub_groups(); - const unsigned int n_total_sub_groups = n_sub_groups * n_groups; - const unsigned int nElementsForSubgroup = nRows / n_total_sub_groups + !!(nRows % n_total_sub_groups); - const unsigned int local_size = get_sub_group_size(); - - const unsigned int id = get_local_id(0); - const unsigned int local_id = get_sub_group_local_id(); - const unsigned int sub_group_id = get_sub_group_id(); - const unsigned int group_id = get_group_id(0) * n_sub_groups + sub_group_id; - - unsigned int iStart = group_id * nElementsForSubgroup; - unsigned int iEnd = (group_id + 1) * nElementsForSubgroup; - - if (iEnd > nRows) - { - iEnd = nRows; - } - - unsigned int groupOffset = partialPrefixSums[group_id]; - unsigned int totalOffset = nRows - partialPrefixSums[n_total_sub_groups]; - unsigned int sum = 0; - - for 
(unsigned int i = iStart + local_id; i < iEnd; i += local_size) - { - unsigned int id = treeOrder[offset + i]; - unsigned int part = (unsigned int)(data[id] > splitValue); - unsigned int boundary = groupOffset + sum + sub_group_scan_exclusive_add(part); - unsigned int pos_new = (part ? totalOffset + boundary : i - boundary); - treeOrderBuf[offset + pos_new] = id; - sum += sub_group_reduce_add(part); - } - } - - __kernel void partitionCopy(const __global unsigned int * treeOrderBuf, __global int * treeOrder, unsigned int offset) { - const unsigned int id = get_global_id(0); - treeOrder[offset + id] = treeOrderBuf[offset + id]; - } - - __kernel void updateResponse(const __global int * treeOrder, __global algorithmFPType * response, unsigned int iStart, unsigned int nRows, - algorithmFPType inc) { - const unsigned int id = get_global_id(0); - response[treeOrder[id + iStart]] += inc; - } - -); - -#endif diff --git a/cpp/daal/src/algorithms/dtrees/gbt/regression/oneapi/gbt_regression_train_dense_default_oneapi_impl.i b/cpp/daal/src/algorithms/dtrees/gbt/regression/oneapi/gbt_regression_train_dense_default_oneapi_impl.i deleted file mode 100644 index 35f68ee6f58..00000000000 --- a/cpp/daal/src/algorithms/dtrees/gbt/regression/oneapi/gbt_regression_train_dense_default_oneapi_impl.i +++ /dev/null @@ -1,1136 +0,0 @@ -/* file: gbt_regression_train_dense_default_oneapi_impl.i */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of auxiliary functions for gradient boosted trees regression -// (defaultDense) method. -//-- -*/ - -#ifndef __GBT_REGRESSION_TRAIN_DENSE_DEFAULT_ONEAPI_IMPL_I__ -#define __GBT_REGRESSION_TRAIN_DENSE_DEFAULT_ONEAPI_IMPL_I__ - -#include "src/algorithms/dtrees/gbt/regression/oneapi/cl_kernels/gbt_batch_regression_kernels.cl" - -#include "src/algorithms/dtrees/gbt/oneapi/gbt_feature_type_helper_oneapi.i" -#include "src/algorithms/dtrees/gbt/regression/gbt_regression_model_impl.h" -#include "src/algorithms/dtrees/gbt/regression/gbt_regression_tree_impl.h" -#include "src/algorithms/dtrees/gbt/gbt_model_impl.h" - -#include "src/externals/service_profiler.h" -#include "services/internal/buffer.h" -#include "data_management/data/numeric_table.h" -#include "services/env_detect.h" -#include "services/error_indexes.h" -#include "src/services/service_data_utils.h" -#include "src/services/service_algo_utils.h" -#include "services/internal/sycl/types.h" - -using namespace daal::algorithms::gbt::internal; -using namespace daal::algorithms::gbt::regression::internal; - -namespace daal -{ -namespace algorithms -{ -namespace gbt -{ -namespace regression -{ -namespace training -{ -namespace internal -{ -template -static services::Status buildProgram(ClKernelFactoryIface & factory) -{ - services::Status status; - - DAAL_ITTNOTIFY_SCOPED_TASK(compute.buildProgram); - - auto fptype_name = getKeyFPType(); - auto build_options = fptype_name; - build_options.add("-cl-std=CL1.2"); - - services::String cachekey("__daal_algorithms_gbt_batch_regression_"); - cachekey.add(fptype_name); - factory.build(ExecutionTargetIds::device, cachekey.c_str(), gbt_batch_regression_kernels, build_options.c_str(), status); - - return status; -} - -template -services::Status 
RegressionTrainBatchKernelOneAPI::scan(const services::internal::Buffer & values, - UniversalBuffer & partialSums, uint32_t nRows, uint32_t localSize, - uint32_t nLocalSums) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.scan); - - services::Status status; - - auto & context = services::internal::getDefaultContext(); - - auto & kernel = kernelScan; - - { - DAAL_ASSERT_UNIVERSAL_BUFFER(UniversalBuffer(values), algorithmFPType, nRows); - DAAL_ASSERT_UNIVERSAL_BUFFER(partialSums, algorithmFPType, nLocalSums); - - KernelArguments args(3, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, values, AccessModeIds::read); - args.set(1, partialSums, AccessModeIds::write); - args.set(2, nRows); - - KernelRange local_range(localSize); - KernelRange global_range(localSize * nLocalSums); - - KernelNDRange range(1); - range.global(global_range, status); - DAAL_CHECK_STATUS_VAR(status); - range.local(local_range, status); - DAAL_CHECK_STATUS_VAR(status); - - context.run(range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -template -services::Status RegressionTrainBatchKernelOneAPI::reduce(UniversalBuffer & partialSums, UniversalBuffer & totalSum, - uint32_t localSize, uint32_t nSubgroupSums) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.reduce); - - services::Status status; - - auto & context = services::internal::getDefaultContext(); - - auto & kernel = kernelReduce; - - { - DAAL_ASSERT_UNIVERSAL_BUFFER(partialSums, algorithmFPType, nSubgroupSums); - DAAL_ASSERT_UNIVERSAL_BUFFER(totalSum, algorithmFPType, 1); - - KernelArguments args(3, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, partialSums, AccessModeIds::read); - args.set(1, totalSum, AccessModeIds::write); - args.set(2, nSubgroupSums); - - KernelRange local_range(localSize); - KernelRange global_range(localSize); - - KernelNDRange range(1); - range.global(global_range, status); - DAAL_CHECK_STATUS_VAR(status); - range.local(local_range, status); - DAAL_CHECK_STATUS_VAR(status); - - 
context.run(range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -template -services::Status RegressionTrainBatchKernelOneAPI::getInitialResponse(NumericTable & y, algorithmFPType * response) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.getInitialResponse); - - services::Status status; - - const uint32_t nRows = static_cast(y.getNumberOfRows()); - - auto & context = services::internal::getDefaultContext(); - - const uint32_t subSize = _preferableSubGroup; - const uint32_t localSize = _preferableSubGroup; - const uint32_t nLocalSums = 1024 * localSize < nRows ? 1024 : (nRows / localSize) + !!(nRows % localSize); - const uint32_t nSubgroupSums = nLocalSums * (localSize / subSize); - - auto partialSums = context.allocate(TypeIds::id(), nSubgroupSums, status); - DAAL_CHECK_STATUS_VAR(status); - auto totalSum = context.allocate(TypeIds::id(), 1, status); - DAAL_CHECK_STATUS_VAR(status); - - BlockDescriptor yBlock; - DAAL_CHECK_STATUS_VAR(y.getBlockOfRows(0, nRows, readOnly, yBlock)); - auto yBuffer = yBlock.getBuffer(); - - DAAL_CHECK_STATUS_VAR(scan(yBuffer, partialSums, nRows, localSize, nLocalSums)); - DAAL_CHECK_STATUS_VAR(reduce(partialSums, totalSum, localSize, nLocalSums)); - - { - DAAL_ASSERT_UNIVERSAL_BUFFER(totalSum, algorithmFPType, 1); - auto totalSumHost = totalSum.template get().toHost(ReadWriteMode::readOnly, status); - DAAL_CHECK_STATUS_VAR(status); - DAAL_ASSERT(response); - *response = totalSumHost.get()[0] / nRows; - } - - DAAL_CHECK_STATUS_VAR(y.releaseBlockOfRows(yBlock)); - - return status; -} - -template -services::Status RegressionTrainBatchKernelOneAPI::computeOptCoeffs(NumericTable & y, UniversalBuffer & response, - UniversalBuffer & optCoeffs) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.computeOptCoeffs); - - services::Status status; - - auto & context = services::internal::getDefaultContext(); - - auto & kernel = kernelComputeOptCoeffs; - - const uint32_t nRows = static_cast(y.getNumberOfRows()); - - 
BlockDescriptor yBlock; - DAAL_CHECK_STATUS_VAR(y.getBlockOfRows(0, nRows, readOnly, yBlock)); - auto yBuffer = yBlock.getBuffer(); - - { - DAAL_ASSERT_UNIVERSAL_BUFFER(UniversalBuffer(yBuffer), algorithmFPType, nRows); - DAAL_ASSERT_UNIVERSAL_BUFFER(response, algorithmFPType, nRows); - DAAL_ASSERT_UNIVERSAL_BUFFER(optCoeffs, algorithmFPType, nRows * 2); - - KernelArguments args(3, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, yBuffer, AccessModeIds::read); - args.set(1, response, AccessModeIds::read); - args.set(2, optCoeffs, AccessModeIds::write); - - KernelRange global_range(nRows); - - context.run(global_range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - } - - DAAL_CHECK_STATUS_VAR(y.releaseBlockOfRows(yBlock)); - - return status; -} - -template -services::Status RegressionTrainBatchKernelOneAPI::initializeTreeOrder(uint32_t nRows, UniversalBuffer & treeOrder) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.initializeTreeOrder); - - services::Status status; - - auto & context = services::internal::getDefaultContext(); - - auto & kernel = kernelInitializeTreeOrder; - - { - DAAL_ASSERT_UNIVERSAL_BUFFER(treeOrder, int, nRows); - - KernelArguments args(1, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, treeOrder, AccessModeIds::write); - - KernelRange global_range(nRows); - - context.run(global_range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -template -services::Status RegressionTrainBatchKernelOneAPI::computePartialHistograms( - const UniversalBuffer & data, UniversalBuffer & treeOrder, UniversalBuffer & optCoeffs, UniversalBuffer & partialHistograms, uint32_t iStart, - uint32_t nRows, UniversalBuffer & binOffsets, uint32_t nTotalBins, uint32_t nFeatures, uint32_t localSize, uint32_t nPartialHistograms, - uint32_t totalRows) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.computePartialHistograms); - - services::Status status; - - auto & context = services::internal::getDefaultContext(); - - auto & kernel 
= kernelComputePartialHistograms; - - { - DAAL_ASSERT_UNIVERSAL_BUFFER(data, uint32_t, totalRows * nFeatures); - DAAL_ASSERT_UNIVERSAL_BUFFER(treeOrder, int, totalRows); - DAAL_ASSERT_UNIVERSAL_BUFFER(optCoeffs, algorithmFPType, totalRows * 2); - DAAL_ASSERT_UNIVERSAL_BUFFER(partialHistograms, algorithmFPType, _maxLocalHistograms * nTotalBins * 2); - DAAL_ASSERT_UNIVERSAL_BUFFER(binOffsets, uint32_t, nFeatures + 1); - - KernelArguments args(9, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, data, AccessModeIds::read); - args.set(1, treeOrder, AccessModeIds::read); - args.set(2, optCoeffs, AccessModeIds::read); - args.set(3, partialHistograms, AccessModeIds::write); - args.set(4, iStart); - args.set(5, nRows); - args.set(6, binOffsets, AccessModeIds::read); - args.set(7, nTotalBins); - args.set(8, nFeatures); - - uint32_t localSize = nFeatures < _maxLocalSize ? nFeatures : _maxLocalSize; - - KernelRange local_range(1, localSize); - KernelRange global_range(nPartialHistograms, localSize); - - KernelNDRange range(2); - range.local(local_range, status); - DAAL_CHECK_STATUS_VAR(status); - range.global(global_range, status); - DAAL_CHECK_STATUS_VAR(status); - - context.run(range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -template -services::Status RegressionTrainBatchKernelOneAPI::reducePartialHistograms(UniversalBuffer & partialHistograms, - UniversalBuffer & histograms, uint32_t nTotalBins, - uint32_t reduceLocalSize, - uint32_t nPartialHistograms) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.reducePartialHistograms); - - services::Status status; - - auto & context = services::internal::getDefaultContext(); - - auto & kernel = kernelReducePartialHistograms; - - { - DAAL_ASSERT_UNIVERSAL_BUFFER(partialHistograms, algorithmFPType, _maxLocalHistograms * nTotalBins * 2); - DAAL_ASSERT_UNIVERSAL_BUFFER(histograms, algorithmFPType, nTotalBins * 2); - - KernelArguments args(4, status); - DAAL_CHECK_STATUS_VAR(status); - 
args.set(0, partialHistograms, AccessModeIds::read); - args.set(1, histograms, AccessModeIds::write); - args.set(2, nPartialHistograms); - args.set(3, nTotalBins); - - KernelRange local_range(1, reduceLocalSize); - KernelRange global_range(nTotalBins, reduceLocalSize); - - KernelNDRange range(2); - range.global(global_range, status); - DAAL_CHECK_STATUS_VAR(status); - range.local(local_range, status); - DAAL_CHECK_STATUS_VAR(status); - - context.run(range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -template -services::Status RegressionTrainBatchKernelOneAPI::computeHistogram( - const UniversalBuffer & data, UniversalBuffer & treeOrder, UniversalBuffer & optCoeffs, UniversalBuffer & partialHistograms, - UniversalBuffer & histograms, uint32_t iStart, uint32_t nRows, UniversalBuffer & binOffsets, uint32_t nTotalBins, uint32_t totalRows, - uint32_t nFeatures) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.computeHistogram); - - services::Status status; - - const uint32_t localSize = _preferableSubGroup; - const uint32_t nPartialHistograms = - (nRows < _preferableGroupSize * _maxLocalHistograms) ? 
nRows / _preferableGroupSize + !!(nRows % _preferableGroupSize) : _maxLocalHistograms; - - uint32_t reduceLocalSize = 1; - while (reduceLocalSize * 2 <= nPartialHistograms) - { - reduceLocalSize *= 2; - } - - DAAL_CHECK_STATUS_VAR(computePartialHistograms(data, treeOrder, optCoeffs, partialHistograms, iStart, nRows, binOffsets, nTotalBins, nFeatures, - localSize, nPartialHistograms, totalRows)); - DAAL_CHECK_STATUS_VAR(reducePartialHistograms(partialHistograms, histograms, nTotalBins, reduceLocalSize, nPartialHistograms)); - - return status; -} - -template -services::Status RegressionTrainBatchKernelOneAPI::computeHistogramDiff(UniversalBuffer & histogramSrc, - UniversalBuffer & histogramTotal, - UniversalBuffer & histogramDst, uint32_t nTotalBins) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.computeHistogramDiff); - - services::Status status; - - auto & context = services::internal::getDefaultContext(); - - auto & kernel = kernelComputeHistogramDiff; - - { - DAAL_ASSERT_UNIVERSAL_BUFFER(histogramSrc, algorithmFPType, nTotalBins * 2); - DAAL_ASSERT_UNIVERSAL_BUFFER(histogramTotal, algorithmFPType, nTotalBins * 2); - DAAL_ASSERT_UNIVERSAL_BUFFER(histogramDst, algorithmFPType, nTotalBins * 2); - - KernelArguments args(3, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, histogramSrc, AccessModeIds::read); - args.set(1, histogramTotal, AccessModeIds::read); - args.set(2, histogramDst, AccessModeIds::write); - - KernelRange global_range(nTotalBins); - - context.run(global_range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -template -services::Status RegressionTrainBatchKernelOneAPI::computeTotalOptCoeffs(UniversalBuffer & histograms, - UniversalBuffer & totalOptCoeffs, - UniversalBuffer & binOffsets, uint32_t nTotalBins, - uint32_t nFeatures, uint32_t localSize) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.computeTotalOptCoeffs); - - services::Status status; - - auto & context = services::internal::getDefaultContext(); - - auto & 
kernel = kernelComputeTotalOptCoeffs; - - { - DAAL_ASSERT_UNIVERSAL_BUFFER(histograms, algorithmFPType, nTotalBins * 2); - DAAL_ASSERT_UNIVERSAL_BUFFER(totalOptCoeffs, algorithmFPType, 2); - DAAL_ASSERT_UNIVERSAL_BUFFER(binOffsets, uint32_t, nFeatures + 1); - - KernelArguments args(4, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, histograms, AccessModeIds::read); - args.set(1, totalOptCoeffs, AccessModeIds::write); - args.set(2, binOffsets, AccessModeIds::read); - args.set(3, nTotalBins); - - KernelRange global_range(localSize, nFeatures); - - context.run(global_range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -template -services::Status RegressionTrainBatchKernelOneAPI::computeBestSplitForFeatures( - UniversalBuffer & histograms, UniversalBuffer & totalOptCoeffs, UniversalBuffer & splitInfo, UniversalBuffer & splitValue, - UniversalBuffer & binOffsets, uint32_t nTotalBins, uint32_t nFeatures, algorithmFPType lambda, uint32_t localSize) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.computeBestSplitForFeatures); - - services::Status status; - - auto & context = services::internal::getDefaultContext(); - - auto & kernel = kernelComputeBestSplitForFeatures; - - { - DAAL_ASSERT_UNIVERSAL_BUFFER(histograms, algorithmFPType, nTotalBins * 2); - DAAL_ASSERT_UNIVERSAL_BUFFER(totalOptCoeffs, algorithmFPType, 2); - DAAL_ASSERT_UNIVERSAL_BUFFER(splitInfo, algorithmFPType, nFeatures * 5); - DAAL_ASSERT_UNIVERSAL_BUFFER(splitValue, int, nFeatures); - DAAL_ASSERT_UNIVERSAL_BUFFER(binOffsets, uint32_t, nFeatures + 1); - - KernelArguments args(7, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, histograms, AccessModeIds::read); - args.set(1, totalOptCoeffs, AccessModeIds::read); - args.set(2, splitInfo, AccessModeIds::write); - args.set(3, splitValue, AccessModeIds::write); - args.set(4, binOffsets, AccessModeIds::read); - args.set(5, nTotalBins); - args.set(6, lambda); - - KernelRange local_range(localSize, 1); - KernelRange 
global_range(localSize, nFeatures); - - KernelNDRange range(2); - range.local(local_range, status); - DAAL_CHECK_STATUS_VAR(status); - range.global(global_range, status); - DAAL_CHECK_STATUS_VAR(status); - - context.run(range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -template -services::Status RegressionTrainBatchKernelOneAPI::computeBestSplit(UniversalBuffer & histograms, - UniversalBuffer & binOffsets, uint32_t nTotalBins, - uint32_t nFeatures, algorithmFPType lambda, - BestSplitOneAPI & bestSplit, - algorithmFPType * gTotal, algorithmFPType * hTotal) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.computeBestSplit); - - services::Status status; - - auto & context = services::internal::getDefaultContext(); - auto totalOptCoeffs = context.allocate(TypeIds::id(), 2, status); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(uint32_t, nFeatures, 5); - auto splitInfo = context.allocate(TypeIds::id(), nFeatures * 5, status); - auto splitValue = context.allocate(TypeIds::id(), nFeatures, status); - - DAAL_CHECK_STATUS_VAR(status); - - const uint32_t localSize = _preferableSubGroup; - - DAAL_CHECK_STATUS_VAR(computeTotalOptCoeffs(histograms, totalOptCoeffs, binOffsets, nTotalBins, nFeatures, localSize)); - DAAL_CHECK_STATUS_VAR( - computeBestSplitForFeatures(histograms, totalOptCoeffs, splitInfo, splitValue, binOffsets, nTotalBins, nFeatures, lambda, localSize)); - - if (gTotal && hTotal) - { - DAAL_ASSERT_UNIVERSAL_BUFFER(totalOptCoeffs, algorithmFPType, 2); - auto totalOptCoeffsHost = totalOptCoeffs.template get().toHost(ReadWriteMode::readOnly, status); - DAAL_CHECK_STATUS_VAR(status); - *gTotal = totalOptCoeffsHost.get()[0]; - *hTotal = totalOptCoeffsHost.get()[1]; - } - { - DAAL_ASSERT_UNIVERSAL_BUFFER(splitInfo, algorithmFPType, nFeatures * 5); - DAAL_ASSERT_UNIVERSAL_BUFFER(splitValue, int, nFeatures); - auto splitInfoHost = splitInfo.template get().toHost(ReadWriteMode::readOnly, status); - DAAL_CHECK_STATUS_VAR(status); - auto 
splitValueHost = splitValue.template get().toHost(ReadWriteMode::readOnly, status); - DAAL_CHECK_STATUS_VAR(status); - for (uint32_t featId = 0; featId < nFeatures; featId++) - { - algorithmFPType impurityDecrease = splitInfoHost.get()[featId * 5 + 0]; - int32_t featureValue = splitValueHost.get()[featId]; - if (featureValue != -1) - { - if (impurityDecrease > bestSplit._impurityDecrease - || (impurityDecrease == bestSplit._impurityDecrease && static_cast(featId) < bestSplit._featureIndex)) - { - bestSplit._impurityDecrease = impurityDecrease; - bestSplit._featureIndex = static_cast(featId); - bestSplit._featureValue = featureValue; - bestSplit._leftGTotal = splitInfoHost.get()[featId * 5 + 1]; - bestSplit._leftHTotal = splitInfoHost.get()[featId * 5 + 2]; - bestSplit._rightGTotal = splitInfoHost.get()[featId * 5 + 3]; - bestSplit._rightHTotal = splitInfoHost.get()[featId * 5 + 4]; - } - } - } - } - - return status; -} - -template -services::Status RegressionTrainBatchKernelOneAPI::partitionScan(const UniversalBuffer & data, UniversalBuffer & treeOrder, - UniversalBuffer & partialSums, int splitValue, - uint32_t iStart, uint32_t nRows, uint32_t localSize, - uint32_t nLocalSums, uint32_t totalRows) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.partitionScan); - - services::Status status; - - auto & context = services::internal::getDefaultContext(); - - auto & kernel = kernelPartitionScan; - - { - DAAL_ASSERT_UNIVERSAL_BUFFER(data, uint32_t, totalRows); - DAAL_ASSERT_UNIVERSAL_BUFFER(treeOrder, int, totalRows); - DAAL_ASSERT_UNIVERSAL_BUFFER(partialSums, int, nLocalSums + 1); - - KernelArguments args(6, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, data, AccessModeIds::read); - args.set(1, treeOrder, AccessModeIds::read); - args.set(2, partialSums, AccessModeIds::write); - args.set(3, splitValue); - args.set(4, iStart); - args.set(5, nRows); - - KernelRange local_range(localSize); - KernelRange global_range(localSize * nLocalSums); - - KernelNDRange range(1); - 
range.global(global_range, status); - DAAL_CHECK_STATUS_VAR(status); - range.local(local_range, status); - DAAL_CHECK_STATUS_VAR(status); - - context.run(range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -template -services::Status RegressionTrainBatchKernelOneAPI::partitionSumScan(UniversalBuffer & partialSums, - UniversalBuffer & partialPrefixSums, - UniversalBuffer & totalSum, uint32_t localSize, - uint32_t nSubgroupSums) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.partitionSumScan); - - services::Status status; - - auto & context = services::internal::getDefaultContext(); - - auto & kernel = kernelPartitionSumScan; - - { - DAAL_ASSERT_UNIVERSAL_BUFFER(partialSums, int, nSubgroupSums + 1); - DAAL_ASSERT_UNIVERSAL_BUFFER(partialPrefixSums, int, nSubgroupSums + 1); - DAAL_ASSERT_UNIVERSAL_BUFFER(totalSum, int, 1); - - KernelArguments args(4, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, partialSums, AccessModeIds::read); - args.set(1, partialPrefixSums, AccessModeIds::write); - args.set(2, totalSum, AccessModeIds::write); - args.set(3, nSubgroupSums); - - KernelRange local_range(localSize); - KernelRange global_range(localSize); - - KernelNDRange range(1); - range.global(global_range, status); - DAAL_CHECK_STATUS_VAR(status); - range.local(local_range, status); - DAAL_CHECK_STATUS_VAR(status); - - context.run(range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -template -services::Status RegressionTrainBatchKernelOneAPI::partitionReorder( - const UniversalBuffer & data, UniversalBuffer & treeOrder, UniversalBuffer & treeOrderBuf, UniversalBuffer & partialPrefixSums, int splitValue, - uint32_t iStart, uint32_t nRows, uint32_t localSize, uint32_t nLocalSums, uint32_t totalRows) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.partitionReorder); - - services::Status status; - - auto & context = services::internal::getDefaultContext(); - - auto & kernel = kernelPartitionReorder; - - { - 
DAAL_ASSERT_UNIVERSAL_BUFFER(data, uint32_t, totalRows); - DAAL_ASSERT_UNIVERSAL_BUFFER(treeOrder, int, totalRows); - DAAL_ASSERT_UNIVERSAL_BUFFER(treeOrderBuf, int, totalRows); - DAAL_ASSERT_UNIVERSAL_BUFFER(partialPrefixSums, int, nLocalSums + 1); - - KernelArguments args(7, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, data, AccessModeIds::read); - args.set(1, treeOrder, AccessModeIds::read); - args.set(2, treeOrderBuf, AccessModeIds::write); - args.set(3, partialPrefixSums, AccessModeIds::read); - args.set(4, splitValue); - args.set(5, iStart); - args.set(6, nRows); - - KernelRange local_range(localSize); - KernelRange global_range(localSize * nLocalSums); - - KernelNDRange range(1); - range.global(global_range, status); - DAAL_CHECK_STATUS_VAR(status); - range.local(local_range, status); - DAAL_CHECK_STATUS_VAR(status); - - context.run(range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -template -services::Status RegressionTrainBatchKernelOneAPI::partitionCopy(UniversalBuffer & treeOrderBuf, UniversalBuffer & treeOrder, - uint32_t iStart, uint32_t nRows, uint32_t totalRows) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.partitionCopy); - - services::Status status; - - auto & context = services::internal::getDefaultContext(); - - auto & kernel = kernelPartitionCopy; - - { - DAAL_ASSERT_UNIVERSAL_BUFFER(treeOrder, int, totalRows); - DAAL_ASSERT_UNIVERSAL_BUFFER(treeOrderBuf, int, totalRows); - - KernelArguments args(3, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, treeOrderBuf, AccessModeIds::read); - args.set(1, treeOrder, AccessModeIds::write); - args.set(2, iStart); - - KernelRange global_range(nRows); - - context.run(global_range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -template -services::Status RegressionTrainBatchKernelOneAPI::doPartition(const UniversalBuffer & data, UniversalBuffer & treeOrder, - UniversalBuffer & treeOrderBuf, int splitValue, - uint32_t 
iStart, uint32_t nRows, uint32_t & nLeft, - uint32_t & nRight, uint32_t totalRows) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.doPartition); - - services::Status status; - - auto & context = services::internal::getDefaultContext(); - - const int subSize = _preferableSubGroup; - const int localSize = _preferableSubGroup; - const int nLocalSums = _maxLocalSums * localSize < nRows ? _maxLocalSums : (nRows / localSize) + !!(nRows % localSize); - const int nSubgroupSums = nLocalSums * (localSize / subSize); - - DAAL_OVERFLOW_CHECK_BY_ADDING(uint32_t, nSubgroupSums, 1); - auto partialSums = context.allocate(TypeIds::id(), nSubgroupSums + 1, status); - DAAL_CHECK_STATUS_VAR(status); - auto partialPrefixSums = context.allocate(TypeIds::id(), nSubgroupSums + 1, status); - DAAL_CHECK_STATUS_VAR(status); - auto totalSum = context.allocate(TypeIds::id(), 1, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_CHECK_STATUS_VAR(partitionScan(data, treeOrder, partialSums, splitValue, iStart, nRows, localSize, nLocalSums, totalRows)); - DAAL_CHECK_STATUS_VAR(partitionSumScan(partialSums, partialPrefixSums, totalSum, localSize, nSubgroupSums)); - DAAL_CHECK_STATUS_VAR( - partitionReorder(data, treeOrder, treeOrderBuf, partialPrefixSums, splitValue, iStart, nRows, localSize, nLocalSums, totalRows)); - DAAL_CHECK_STATUS_VAR(partitionCopy(treeOrderBuf, treeOrder, iStart, nRows, totalRows)); - - { - DAAL_ASSERT_UNIVERSAL_BUFFER(totalSum, int, 1); - auto totalSumHost = totalSum.template get().toHost(ReadWriteMode::readOnly, status); - DAAL_CHECK_STATUS_VAR(status); - nRight = totalSumHost.get()[0]; - nLeft = nRows - totalSumHost.get()[0]; - if (nLeft == 0 || nRight == 0) - { - return status; - } - } - - return status; -} - -template -services::Status RegressionTrainBatchKernelOneAPI::updateResponse(UniversalBuffer & treeOrder, UniversalBuffer & response, - uint32_t iStart, uint32_t nRows, algorithmFPType inc, - uint32_t totalRows) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.updateResponse); - - 
services::Status status; - - auto & context = services::internal::getDefaultContext(); - - auto & kernel = kernelUpdateResponse; - - { - DAAL_ASSERT_UNIVERSAL_BUFFER(treeOrder, int, totalRows); - DAAL_ASSERT_UNIVERSAL_BUFFER(response, algorithmFPType, totalRows); - - KernelArguments args(5, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, treeOrder, AccessModeIds::read); - args.set(1, response, AccessModeIds::write); - args.set(2, iStart); - args.set(3, nRows); - args.set(4, inc); - - KernelRange global_range(nRows); - - context.run(global_range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -////////////////////////////////////////////////////////////////////////////////////////// -// RegressionTrainBatchKernelOneAPI -////////////////////////////////////////////////////////////////////////////////////////// -template -services::Status RegressionTrainBatchKernelOneAPI::compute(HostAppIface * pHostApp, const NumericTable * x, - const NumericTable * y, gbt::regression::Model & m, Result & res, - const Parameter & par, engines::internal::BatchBaseImpl & engine) -{ - typedef TreeTableConnector ConnectorType; - - if (x->getNumberOfRows() > static_cast(UINT_MAX) || x->getNumberOfColumns() > static_cast(UINT_MAX)) - { - return Status(ErrorBufferSizeIntegerOverflow); - } - - const uint32_t nRows = static_cast(x->getNumberOfRows()); - const uint32_t nFeatures = static_cast(x->getNumberOfColumns()); - const uint32_t nFeaturesPerNode = static_cast(par.featuresPerNode ? 
par.featuresPerNode : nFeatures); - const bool inexactWithHistMethod = - !par.memorySavingMode && par.splitMethod == gbt::training::inexact && x->getNumberOfColumns() == nFeaturesPerNode; - - DAAL_ASSERT(inexactWithHistMethod); - - gbt::internal::ModelImpl & modelImpl = *static_cast(&m); - DAAL_CHECK_MALLOC(modelImpl.reserve(par.maxIterations)); - - services::Status status; - - auto & context = Environment::getInstance()->getDefaultExecutionContext(); - auto & kernel_factory = context.getClKernelFactory(); - - status |= buildProgram(kernel_factory); - DAAL_CHECK_STATUS_VAR(status); - - kernelScan = kernel_factory.getKernel("scan", status); - kernelReduce = kernel_factory.getKernel("reduce", status); - kernelInitializeTreeOrder = kernel_factory.getKernel("initializeTreeOrder", status); - kernelComputePartialHistograms = kernel_factory.getKernel("computePartialHistograms", status); - kernelReducePartialHistograms = kernel_factory.getKernel("reducePartialHistograms", status); - kernelComputeHistogramDiff = kernel_factory.getKernel("computeHistogramDiff", status); - kernelComputeOptCoeffs = kernel_factory.getKernel("computeOptCoeffs", status); - kernelComputeTotalOptCoeffs = kernel_factory.getKernel("computeTotalOptCoeffs", status); - kernelComputeBestSplitForFeatures = kernel_factory.getKernel("computeBestSplitForFeatures", status); - kernelPartitionScan = kernel_factory.getKernel("partitionScan", status); - kernelPartitionSumScan = kernel_factory.getKernel("partitionSumScan", status); - kernelPartitionReorder = kernel_factory.getKernel("partitionReorder", status); - kernelPartitionCopy = kernel_factory.getKernel("partitionCopy", status); - kernelUpdateResponse = kernel_factory.getKernel("updateResponse", status); - - DAAL_CHECK_STATUS_VAR(status); - - gbt::internal::IndexedFeaturesOneAPI indexedFeatures; - dtrees::internal::FeatureTypes featTypes; - DAAL_CHECK_MALLOC(featTypes.init(*x)); - - BinParams prm(par.maxBins, par.minBinSize); - DAAL_CHECK_STATUS(status, 
(indexedFeatures.init(*const_cast(x), &featTypes, &prm))); - - auto response = context.allocate(TypeIds::id(), nRows, status); - DAAL_CHECK_STATUS_VAR(status); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(uint32_t, nRows, 2); - auto optCoeffs = context.allocate(TypeIds::id(), nRows * 2, status); - DAAL_CHECK_STATUS_VAR(status); - auto treeOrder = context.allocate(TypeIds::id(), nRows, status); - DAAL_CHECK_STATUS_VAR(status); - auto treeOrderBuf = context.allocate(TypeIds::id(), nRows, status); - DAAL_CHECK_STATUS_VAR(status); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(uint32_t, _maxLocalHistograms, indexedFeatures.totalBins()); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(uint32_t, _maxLocalHistograms * indexedFeatures.totalBins(), 2); - auto partialHistograms = context.allocate(TypeIds::id(), _maxLocalHistograms * indexedFeatures.totalBins() * 2, status); - DAAL_CHECK_STATUS_VAR(status); - - algorithmFPType initResp = 0.0; - DAAL_CHECK_STATUS(status, getInitialResponse(*const_cast(y), &initResp)); - - context.fill(response, initResp, status); - DAAL_CHECK_STATUS_VAR(status); - - AOSNumericTablePtr treeStructure = ConnectorType::createGBTree(par.maxTreeDepth, status); - DAAL_CHECK_STATUS_VAR(status); - ConnectorType connector(treeStructure.get()); - - const uint32_t maxNodes = static_cast(treeStructure->getNumberOfRows()); - - Collection treeNodeStorages(maxNodes); - DAAL_CHECK_MALLOC(treeNodeStorages.data()); - - for (uint32_t iter = 0; (iter < par.maxIterations) && !algorithms::internal::isCancelled(status, pHostApp); ++iter) - { - DAAL_CHECK_STATUS_VAR(computeOptCoeffs(*const_cast(y), response, optCoeffs)); - DAAL_CHECK_STATUS_VAR(initializeTreeOrder(nRows, treeOrder)); - - TableRecord * record = connector.get(0); - - record->level = 0; - record->nid = 0; - record->iStart = 0; - record->n = nRows; - record->nodeState = ConnectorType::split; - record->isFinalized = false; - - Collection > splits; - Collection > leafs; - - SplitRecord splitRecord(record); - 
splits.push_back(splitRecord); - - for (size_t splitId = 0; splitId < splits.size(); splitId++) - { - SplitRecord & split = splits[splitId]; - if (split.first && split.second) - { - TableRecord * leftRecord = split.first; - TableRecord * rightRecord = split.second; - DAAL_ASSERT(leftRecord->nid > 0 && leftRecord->nid < static_cast(UINT_MAX)); - const uint32_t parentId = (static_cast(leftRecord->nid) - 1) / 2; - DAAL_CHECK_STATUS_VAR(treeNodeStorages[leftRecord->nid].allocate(indexedFeatures)); - DAAL_CHECK_STATUS_VAR(treeNodeStorages[rightRecord->nid].allocate(indexedFeatures)); - BestSplitOneAPI bestSplitLeft; - BestSplitOneAPI bestSplitRight; - if (leftRecord->n < rightRecord->n) - { - DAAL_CHECK_STATUS_VAR(computeHistogram(indexedFeatures.getFullData(), treeOrder, optCoeffs, partialHistograms, - treeNodeStorages[leftRecord->nid].getHistograms(), leftRecord->iStart, leftRecord->n, - indexedFeatures.binOffsets(), indexedFeatures.totalBins(), nRows, nFeatures)); - DAAL_CHECK_STATUS_VAR(computeHistogramDiff(treeNodeStorages[leftRecord->nid].getHistograms(), - treeNodeStorages[parentId].getHistograms(), - treeNodeStorages[rightRecord->nid].getHistograms(), indexedFeatures.totalBins())); - } - else - { - DAAL_CHECK_STATUS_VAR(computeHistogram(indexedFeatures.getFullData(), treeOrder, optCoeffs, partialHistograms, - treeNodeStorages[rightRecord->nid].getHistograms(), rightRecord->iStart, rightRecord->n, - indexedFeatures.binOffsets(), indexedFeatures.totalBins(), nRows, nFeatures)); - DAAL_CHECK_STATUS_VAR(computeHistogramDiff(treeNodeStorages[rightRecord->nid].getHistograms(), - treeNodeStorages[parentId].getHistograms(), - treeNodeStorages[leftRecord->nid].getHistograms(), indexedFeatures.totalBins())); - } - - DAAL_CHECK_STATUS_VAR(computeBestSplit(treeNodeStorages[leftRecord->nid].getHistograms(), indexedFeatures.binOffsets(), - indexedFeatures.totalBins(), nFeatures, par.lambda, bestSplitLeft)); - 
DAAL_CHECK_STATUS_VAR(computeBestSplit(treeNodeStorages[rightRecord->nid].getHistograms(), indexedFeatures.binOffsets(), - indexedFeatures.totalBins(), nFeatures, par.lambda, bestSplitRight)); - - bestSplitLeft._impurityDecrease -= (leftRecord->gTotal / (leftRecord->hTotal + par.lambda)) * leftRecord->gTotal; - if (bestSplitLeft._impurityDecrease < par.minSplitLoss || bestSplitLeft._featureIndex < 0 || bestSplitLeft._featureValue < 0) - { - leftRecord->isFinalized = true; - leftRecord->nodeState = ConnectorType::badSplit; - } - else - { - uint32_t nLeft = 0; - uint32_t nRight = 0; - DAAL_CHECK_STATUS_VAR(doPartition(indexedFeatures.getFeature(bestSplitLeft._featureIndex), treeOrder, treeOrderBuf, - bestSplitLeft._featureValue, leftRecord->iStart, leftRecord->n, nLeft, nRight, nRows)); - if (nLeft == 0 || nRight == 0) - { - leftRecord->isFinalized = true; - leftRecord->nodeState = ConnectorType::badSplit; - } - else - { - leftRecord->isFinalized = true; - leftRecord->featureValue = bestSplitLeft._featureValue; - leftRecord->featureIdx = bestSplitLeft._featureIndex; - connector.createNode(leftRecord->level + 1, leftRecord->nid * 2 + 1, nLeft, leftRecord->iStart, bestSplitLeft._leftGTotal, - bestSplitLeft._leftHTotal, nLeft, par); - connector.createNode(leftRecord->level + 1, leftRecord->nid * 2 + 2, leftRecord->n - nLeft, leftRecord->iStart + nLeft, - bestSplitLeft._rightGTotal, bestSplitLeft._rightHTotal, nRight, par); - connector.setSplitLevel(leftRecord->level + 1); - connector.getSplitNodesMerged(leftRecord->nid, splits, false); - } - } - - bestSplitRight._impurityDecrease -= (rightRecord->gTotal / (rightRecord->hTotal + par.lambda)) * rightRecord->gTotal; - if (bestSplitRight._impurityDecrease < par.minSplitLoss || bestSplitRight._featureIndex < 0 || bestSplitRight._featureValue < 0) - { - rightRecord->isFinalized = true; - rightRecord->nodeState = ConnectorType::badSplit; - } - else - { - uint32_t nLeft = 0; - uint32_t nRight = 0; - 
DAAL_CHECK_STATUS_VAR(doPartition(indexedFeatures.getFeature(bestSplitRight._featureIndex), treeOrder, treeOrderBuf, - bestSplitRight._featureValue, rightRecord->iStart, rightRecord->n, nLeft, nRight, nRows)); - if (nLeft == 0 || nRight == 0) - { - rightRecord->isFinalized = true; - rightRecord->nodeState = ConnectorType::badSplit; - } - else - { - rightRecord->isFinalized = true; - rightRecord->featureValue = bestSplitRight._featureValue; - rightRecord->featureIdx = bestSplitRight._featureIndex; - connector.createNode(rightRecord->level + 1, rightRecord->nid * 2 + 1, nLeft, rightRecord->iStart, bestSplitRight._leftGTotal, - bestSplitRight._leftHTotal, nLeft, par); - connector.createNode(rightRecord->level + 1, rightRecord->nid * 2 + 2, rightRecord->n - nLeft, rightRecord->iStart + nLeft, - bestSplitRight._rightGTotal, bestSplitRight._rightHTotal, nRight, par); - connector.setSplitLevel(rightRecord->level + 1); - connector.getSplitNodesMerged(rightRecord->nid, splits, false); - } - } - - treeNodeStorages[parentId].clear(); - } - else - { - TableRecord * record = (split.first ? 
split.first : split.second); - DAAL_CHECK_STATUS_VAR(treeNodeStorages[record->nid].allocate(indexedFeatures)); - BestSplitOneAPI bestSplit; - algorithmFPType gTotal = 0.0; - algorithmFPType hTotal = 0.0; - DAAL_CHECK_STATUS_VAR(computeHistogram(indexedFeatures.getFullData(), treeOrder, optCoeffs, partialHistograms, - treeNodeStorages[record->nid].getHistograms(), record->iStart, record->n, - indexedFeatures.binOffsets(), indexedFeatures.totalBins(), nRows, nFeatures)); - DAAL_CHECK_STATUS_VAR(computeBestSplit(treeNodeStorages[record->nid].getHistograms(), indexedFeatures.binOffsets(), - indexedFeatures.totalBins(), nFeatures, par.lambda, bestSplit, &gTotal, &hTotal)); - if (record->nid == 0) - { - record->gTotal = gTotal; - record->hTotal = hTotal; - record->nTotal = record->n; - } - bestSplit._impurityDecrease -= (record->gTotal / (record->hTotal + par.lambda)) * record->gTotal; - if (bestSplit._impurityDecrease < par.minSplitLoss || bestSplit._featureIndex < 0 || bestSplit._featureValue < 0) - { - record->isFinalized = true; - record->nodeState = ConnectorType::badSplit; - } - else - { - uint32_t nLeft = 0; - uint32_t nRight = 0; - DAAL_CHECK_STATUS_VAR(doPartition(indexedFeatures.getFeature(bestSplit._featureIndex), treeOrder, treeOrderBuf, - bestSplit._featureValue, record->iStart, record->n, nLeft, nRight, nRows)); - if (nLeft == 0 || nRight == 0) - { - record->isFinalized = true; - record->nodeState = ConnectorType::badSplit; - } - else - { - record->isFinalized = true; - record->featureValue = bestSplit._featureValue; - record->featureIdx = bestSplit._featureIndex; - connector.createNode(record->level + 1, record->nid * 2 + 1, nLeft, record->iStart, bestSplit._leftGTotal, - bestSplit._leftHTotal, nLeft, par); - connector.createNode(record->level + 1, record->nid * 2 + 2, record->n - nLeft, record->iStart + nLeft, - bestSplit._rightGTotal, bestSplit._rightHTotal, nRight, par); - connector.setSplitLevel(record->level + 1); - 
connector.getSplitNodesMerged(record->nid, splits, false); - } - } - if (record->nid > 0) - { - DAAL_ASSERT(record->nid > 0 && record->nid < static_cast(UINT_MAX)); - uint32_t parentId = (static_cast(record->nid) - 1) / 2; - treeNodeStorages[parentId].clear(); - } - } - } - - Collection *> leaves; - connector.getLeafNodes(0, leaves); - DAAL_ASSERT(leaves.size() < static_cast(UINT_MAX)); - uint32_t nLeaves = static_cast(leaves.size()); - - for (uint32_t leafId = 0; leafId < nLeaves; leafId++) - { - TableRecord * node = leaves[leafId]; - DAAL_ASSERT(node); - - algorithmFPType resp = 0; - - algorithmFPType val = node->hTotal + par.lambda; - if (val != 0.0) - { - val = -node->gTotal / val; - const algorithmFPType inc = val * par.shrinkage; - - resp = inc; - - DAAL_CHECK_STATUS_VAR(updateResponse(treeOrder, response, node->iStart, node->n, inc, nRows)); - } - - node->response = resp; - node->isFinalized = 1; - } - - services::Collection > binValuesHost(nFeatures); - DAAL_CHECK_MALLOC(binValuesHost.data()); - services::Collection binValues(nFeatures); - DAAL_CHECK_MALLOC(binValues.data()); - - for (uint32_t i = 0; i < nFeatures; i++) - { - DAAL_ASSERT_UNIVERSAL_BUFFER(indexedFeatures.binBorders(i), algorithmFPType, par.maxBins); - binValuesHost[i] = indexedFeatures.binBorders(i).template get().toHost(ReadWriteMode::readOnly, status); - DAAL_CHECK_STATUS_VAR(status); - binValues[i] = binValuesHost[i].get(); - } - - size_t maxLevel = 0; - connector.getMaxLevel(0, maxLevel); - DAAL_ASSERT(maxLevel + 1 <= 63); - DAAL_ASSERT(((size_t)1 << (maxLevel + 1)) > 0 && ((size_t)1 << (maxLevel + 1)) < static_cast(UINT_MAX)); - const uint32_t nNodes = ((size_t)1 << (maxLevel + 1)) - 1; - - gbt::internal::GbtDecisionTree * pTbl = new gbt::internal::GbtDecisionTree(nNodes, maxLevel); - DAAL_CHECK_MALLOC(pTbl); - - HomogenNumericTable * pTblImp = new HomogenNumericTable(1, nNodes, NumericTable::doAllocate); - DAAL_CHECK_MALLOC(pTblImp); - HomogenNumericTable * pTblSmplCnt = new 
HomogenNumericTable(1, nNodes, NumericTable::doAllocate); - DAAL_CHECK_MALLOC(pTblSmplCnt); - - DAAL_CHECK_STATUS_VAR(connector.template convertToGbtDecisionTree( - binValues.data(), nNodes, maxLevel, pTbl, pTblImp->getArray(), pTblSmplCnt->getArray(), initResp, par)); - modelImpl.add(pTbl, pTblImp, pTblSmplCnt); - initResp = 0.0; - } - - return services::Status(); -} - -} /* namespace internal */ -} /* namespace training */ -} /* namespace regression */ -} /* namespace gbt */ -} /* namespace algorithms */ -} /* namespace daal */ - -#endif diff --git a/cpp/daal/src/algorithms/dtrees/gbt/regression/oneapi/gbt_regression_train_kernel_oneapi.h b/cpp/daal/src/algorithms/dtrees/gbt/regression/oneapi/gbt_regression_train_kernel_oneapi.h deleted file mode 100644 index a3aa27d61ca..00000000000 --- a/cpp/daal/src/algorithms/dtrees/gbt/regression/oneapi/gbt_regression_train_kernel_oneapi.h +++ /dev/null @@ -1,165 +0,0 @@ -/* file: gbt_regression_train_kernel_oneapi.h */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Declaration of structure containing kernels for gradient boosted trees -// training for GPU. 
-//-- -*/ - -#ifndef __GBT_REGRESSION_TRAIN_KERNEL_ONEAPI_H__ -#define __GBT_REGRESSION_TRAIN_KERNEL_ONEAPI_H__ - -#include "services/internal/sycl/types.h" -#include "services/internal/sycl/execution_context.h" -#include "data_management/data/numeric_table.h" -#include "algorithms/algorithm_base_common.h" -#include "algorithms/gradient_boosted_trees/gbt_regression_training_types.h" -#include "src/algorithms/engines/engine_batch_impl.h" -#include "src/algorithms/dtrees/gbt/oneapi/gbt_feature_type_helper_oneapi.h" - -using namespace daal::data_management; -using namespace daal::services; - -namespace daal -{ -namespace algorithms -{ -namespace gbt -{ -namespace regression -{ -namespace training -{ -namespace internal -{ -template -class RegressionTrainBatchKernelOneAPI : public daal::algorithms::Kernel -{ -public: - services::Status compute(HostAppIface * pHostApp, const NumericTable * x, const NumericTable * y, gbt::regression::Model & m, Result & res, - const Parameter & par, engines::internal::BatchBaseImpl & engine); - -private: - services::Status scan(const services::internal::Buffer & values, services::internal::sycl::UniversalBuffer & partialSums, - uint32_t nRows, uint32_t localSize, uint32_t nLocalSums); - - services::Status reduce(services::internal::sycl::UniversalBuffer & partialSums, services::internal::sycl::UniversalBuffer & totalSum, - uint32_t localSize, uint32_t nSubgroupSums); - - services::Status getInitialResponse(NumericTable & y, algorithmFPType * response); - - services::Status computeOptCoeffs(NumericTable & y, services::internal::sycl::UniversalBuffer & response, - services::internal::sycl::UniversalBuffer & optCoeffs); - - services::Status initializeTreeOrder(uint32_t nRows, services::internal::sycl::UniversalBuffer & treeOrder); - - services::Status computePartialHistograms(const services::internal::sycl::UniversalBuffer & data, - services::internal::sycl::UniversalBuffer & treeOrder, - services::internal::sycl::UniversalBuffer & 
optCoeffs, - services::internal::sycl::UniversalBuffer & partialHistograms, uint32_t iStart, uint32_t nRows, - services::internal::sycl::UniversalBuffer & binOffsets, uint32_t nTotalBins, uint32_t nFeatures, - uint32_t localSize, uint32_t nPartialHistograms, uint32_t totalRows); - - services::Status reducePartialHistograms(services::internal::sycl::UniversalBuffer & partialHistograms, - services::internal::sycl::UniversalBuffer & histograms, uint32_t nTotalBins, uint32_t reduceLocalSize, - uint32_t nPartialHistograms); - - services::Status computeHistogram(const services::internal::sycl::UniversalBuffer & data, services::internal::sycl::UniversalBuffer & treeOrder, - services::internal::sycl::UniversalBuffer & optCoeffs, - services::internal::sycl::UniversalBuffer & partialHistograms, - services::internal::sycl::UniversalBuffer & histograms, uint32_t iStart, uint32_t nRows, - services::internal::sycl::UniversalBuffer & binOffsets, uint32_t nTotalBins, uint32_t totalRows, - uint32_t nFeatures); - - services::Status computeHistogramDiff(services::internal::sycl::UniversalBuffer & histogramSrc, - services::internal::sycl::UniversalBuffer & histogramTotal, - services::internal::sycl::UniversalBuffer & histogramDst, uint32_t nBins); - - services::Status computeTotalOptCoeffs(services::internal::sycl::UniversalBuffer & histograms, - services::internal::sycl::UniversalBuffer & totalOptCoeffs, - services::internal::sycl::UniversalBuffer & binOffsets, uint32_t nTotalBins, uint32_t nFeatures, - uint32_t localSize); - - services::Status computeBestSplitForFeatures(services::internal::sycl::UniversalBuffer & histograms, - services::internal::sycl::UniversalBuffer & totalOptCoeffs, - services::internal::sycl::UniversalBuffer & splitInfo, - services::internal::sycl::UniversalBuffer & splitValue, - services::internal::sycl::UniversalBuffer & binOffsets, uint32_t nTotalBins, uint32_t nFeatures, - algorithmFPType lambda, uint32_t localSize); - - services::Status 
computeBestSplit(services::internal::sycl::UniversalBuffer & histograms, services::internal::sycl::UniversalBuffer & binOffsets, - uint32_t nTotalBins, uint32_t nFeatures, algorithmFPType lambda, - gbt::internal::BestSplitOneAPI & bestSplit, algorithmFPType * gTotal = nullptr, - algorithmFPType * hTotal = nullptr); - - services::Status partitionScan(const services::internal::sycl::UniversalBuffer & data, services::internal::sycl::UniversalBuffer & treeOrder, - services::internal::sycl::UniversalBuffer & partialSums, int splitValue, uint32_t iStart, uint32_t nRows, - uint32_t localSize, uint32_t nLocalSums, uint32_t totalRows); - - services::Status partitionSumScan(services::internal::sycl::UniversalBuffer & partialSums, - services::internal::sycl::UniversalBuffer & partialPrefixSums, - services::internal::sycl::UniversalBuffer & totalSum, uint32_t localSize, uint32_t nSubgroupSums); - - services::Status partitionReorder(const services::internal::sycl::UniversalBuffer & data, services::internal::sycl::UniversalBuffer & treeOrder, - services::internal::sycl::UniversalBuffer & treeOrderBuf, - services::internal::sycl::UniversalBuffer & partialPrefixSums, int spliteValue, uint32_t iStart, uint32_t nRows, - uint32_t localSize, uint32_t nLocalSums, uint32_t totalRows); - - services::Status partitionCopy(services::internal::sycl::UniversalBuffer & treeOrderBuf, services::internal::sycl::UniversalBuffer & treeOrder, - uint32_t iStart, uint32_t nRows, uint32_t totalRows); - - services::Status doPartition(const services::internal::sycl::UniversalBuffer & data, services::internal::sycl::UniversalBuffer & treeOrder, - services::internal::sycl::UniversalBuffer & treeOrderBuf, int splitValue, uint32_t iStart, uint32_t nRows, - uint32_t & nLeft, uint32_t & nRight, uint32_t totalRows); - - services::Status updateResponse(services::internal::sycl::UniversalBuffer & treeOrder, services::internal::sycl::UniversalBuffer & response, - uint32_t iStart, uint32_t nRows, algorithmFPType 
inc, uint32_t totalRows); - - services::internal::sycl::KernelPtr kernelScan; - services::internal::sycl::KernelPtr kernelReduce; - services::internal::sycl::KernelPtr kernelInitializeTreeOrder; - services::internal::sycl::KernelPtr kernelComputePartialHistograms; - services::internal::sycl::KernelPtr kernelReducePartialHistograms; - services::internal::sycl::KernelPtr kernelComputeHistogramDiff; - services::internal::sycl::KernelPtr kernelComputeOptCoeffs; - services::internal::sycl::KernelPtr kernelComputeTotalOptCoeffs; - services::internal::sycl::KernelPtr kernelComputeBestSplitForFeatures; - services::internal::sycl::KernelPtr kernelPartitionScan; - services::internal::sycl::KernelPtr kernelPartitionSumScan; - services::internal::sycl::KernelPtr kernelPartitionReorder; - services::internal::sycl::KernelPtr kernelPartitionCopy; - services::internal::sycl::KernelPtr kernelUpdateResponse; - - const uint32_t _maxWorkItemsPerGroup = 128; // should be a power of two for interal needs - const uint32_t _maxLocalBuffer = 30000; // should be less than a half of local memory (two buffers) - const uint32_t _preferableSubGroup = 16; // preferable maximal sub-group size - const uint32_t _maxLocalSize = 128; - const uint32_t _maxLocalSums = 256; - const uint32_t _maxLocalHistograms = 256; - const uint32_t _preferableGroupSize = 256; -}; - -} // namespace internal -} // namespace training -} // namespace regression -} // namespace gbt -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/k_nearest_neighbors/BUILD b/cpp/daal/src/algorithms/k_nearest_neighbors/BUILD index 307a44c195a..31bf41bc8dd 100644 --- a/cpp/daal/src/algorithms/k_nearest_neighbors/BUILD +++ b/cpp/daal/src/algorithms/k_nearest_neighbors/BUILD @@ -4,10 +4,9 @@ load("@onedal//dev/bazel:daal.bzl", "daal_module") daal_module( name = "kernel", auto = True, - opencl = True, deps = [ "@onedal//cpp/daal:core", - "@onedal//cpp/daal:sycl", + "@onedal//cpp/daal:engines", 
"@onedal//cpp/daal/src/algorithms/classifier:kernel", ], ) diff --git a/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_model_impl.cpp b/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_model_impl.cpp index 0e401dae455..fefef5cd96c 100644 --- a/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_model_impl.cpp +++ b/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_model_impl.cpp @@ -15,7 +15,7 @@ * limitations under the License. *******************************************************************************/ -#include "src/algorithms/k_nearest_neighbors/oneapi/bf_knn_classification_model_ucapi_impl.h" +#include "src/algorithms/k_nearest_neighbors/bf_knn_classification_model_impl.h" #include "src/services/serialization_utils.h" #include "src/services/daal_strings.h" #include "src/services/service_data_utils.h" diff --git a/cpp/daal/src/algorithms/k_nearest_neighbors/oneapi/bf_knn_classification_model_ucapi_impl.h b/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_model_impl.h similarity index 50% rename from cpp/daal/src/algorithms/k_nearest_neighbors/oneapi/bf_knn_classification_model_ucapi_impl.h rename to cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_model_impl.h index 810824f762c..5a903d39a46 100644 --- a/cpp/daal/src/algorithms/k_nearest_neighbors/oneapi/bf_knn_classification_model_ucapi_impl.h +++ b/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_model_impl.h @@ -1,4 +1,4 @@ -/* file: bf_knn_classification_model_ucapi_impl.h */ +/* file: bf_knn_classification_model_impl.h */ /******************************************************************************* * Copyright 2014 Intel Corporation * @@ -15,13 +15,11 @@ * limitations under the License. 
*******************************************************************************/ -#ifndef __BF_KNN_CLASSIFICATION_MODEL_UCAPI_IMPL_H__ -#define __BF_KNN_CLASSIFICATION_MODEL_UCAPI_IMPL_H__ +#ifndef __BF_KNN_CLASSIFICATION_MODEL_IMPL_ +#define __BF_KNN_CLASSIFICATION_MODEL_IMPL_ #include "algorithms/k_nearest_neighbors/bf_knn_classification_model.h" -#include "data_management/data/internal/numeric_table_sycl_homogen.h" #include "data_management/data/homogen_numeric_table.h" -#include "services/internal/sycl/execution_context.h" #include "services/daal_defines.h" namespace daal @@ -79,43 +77,19 @@ class Model::ModelImpl } else { - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - - if (deviceInfo.isCpu) - { - services::Status status; - dest = data_management::HomogenNumericTable::create(value->getNumberOfColumns(), value->getNumberOfRows(), - data_management::NumericTable::doAllocate, &status); - DAAL_CHECK_STATUS_VAR(status); - data_management::BlockDescriptor destBD, srcBD; - DAAL_CHECK_STATUS_VAR(dest->getBlockOfRows(0, dest->getNumberOfRows(), data_management::writeOnly, destBD)); - DAAL_CHECK_STATUS_VAR(value->getBlockOfRows(0, value->getNumberOfRows(), data_management::readOnly, srcBD)); - auto source = srcBD.getBlockPtr(); - auto destination = destBD.getBlockPtr(); - services::internal::daal_memcpy_s( - destBD.getBlockPtr(), destBD.getNumberOfColumns() * destBD.getNumberOfRows() * sizeof(algorithmFPType), srcBD.getBlockPtr(), - srcBD.getNumberOfColumns() * srcBD.getNumberOfRows() * sizeof(algorithmFPType)); - DAAL_CHECK_STATUS_VAR(dest->releaseBlockOfRows(destBD)); - DAAL_CHECK_STATUS_VAR(value->releaseBlockOfRows(srcBD)); - } - else - { - services::Status status; - dest = data_management::internal::SyclHomogenNumericTable::create( - value->getNumberOfColumns(), value->getNumberOfRows(), data_management::NumericTable::doAllocate, &status); - DAAL_CHECK_STATUS_VAR(status); - 
data_management::BlockDescriptor destBD, srcBD; - DAAL_CHECK_STATUS_VAR(dest->getBlockOfRows(0, dest->getNumberOfRows(), data_management::writeOnly, destBD)); - DAAL_CHECK_STATUS_VAR(value->getBlockOfRows(0, value->getNumberOfRows(), data_management::readOnly, srcBD)); - auto source = srcBD.getBuffer(); - auto destination = destBD.getBuffer(); - auto & context = services::internal::getDefaultContext(); - context.copy(destination, 0, source, 0, source.size(), status); - DAAL_CHECK_STATUS_VAR(status); - DAAL_CHECK_STATUS_VAR(dest->releaseBlockOfRows(destBD)); - DAAL_CHECK_STATUS_VAR(value->releaseBlockOfRows(srcBD)); - } + services::Status status; + dest = data_management::HomogenNumericTable::create(value->getNumberOfColumns(), value->getNumberOfRows(), + data_management::NumericTable::doAllocate, &status); + DAAL_CHECK_STATUS_VAR(status); + data_management::BlockDescriptor destBD, srcBD; + DAAL_CHECK_STATUS_VAR(dest->getBlockOfRows(0, dest->getNumberOfRows(), data_management::writeOnly, destBD)); + DAAL_CHECK_STATUS_VAR(value->getBlockOfRows(0, value->getNumberOfRows(), data_management::readOnly, srcBD)); + auto source = srcBD.getBlockPtr(); + auto destination = destBD.getBlockPtr(); + services::internal::daal_memcpy_s(destBD.getBlockPtr(), destBD.getNumberOfColumns() * destBD.getNumberOfRows() * sizeof(algorithmFPType), + srcBD.getBlockPtr(), srcBD.getNumberOfColumns() * srcBD.getNumberOfRows() * sizeof(algorithmFPType)); + DAAL_CHECK_STATUS_VAR(dest->releaseBlockOfRows(destBD)); + DAAL_CHECK_STATUS_VAR(value->releaseBlockOfRows(srcBD)); } return services::Status(); } diff --git a/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_predict_batch.cpp b/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_predict_batch.cpp index 532c926c011..b9f4bc4d4b3 100755 --- a/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_predict_batch.cpp +++ b/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_predict_batch.cpp @@ 
-17,7 +17,7 @@ #include "algorithms/k_nearest_neighbors/bf_knn_classification_predict_types.h" #include "algorithms/classifier/classifier_model.h" -#include "src/algorithms/k_nearest_neighbors/oneapi/bf_knn_classification_model_ucapi_impl.h" +#include "src/algorithms/k_nearest_neighbors/bf_knn_classification_model_impl.h" #include "src/services/daal_strings.h" using namespace daal::data_management; diff --git a/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_predict_dense_default_batch_container.h b/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_predict_dense_default_batch_container.h old mode 100755 new mode 100644 index 2f1e0ecae5a..b33d391c427 --- a/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_predict_dense_default_batch_container.h +++ b/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_predict_dense_default_batch_container.h @@ -16,9 +16,7 @@ *******************************************************************************/ #include "algorithms/k_nearest_neighbors/bf_knn_classification_predict.h" -#include "src/algorithms/k_nearest_neighbors/oneapi/bf_knn_classification_predict_kernel_ucapi.h" #include "src/algorithms/k_nearest_neighbors/bf_knn_classification_predict_kernel.h" -#include "src/algorithms/k_nearest_neighbors/oneapi/bf_knn_classification_model_ucapi_impl.h" #include "services/error_indexes.h" namespace daal @@ -32,17 +30,7 @@ namespace prediction template BatchContainer::BatchContainer(daal::services::Environment::env * daalEnv) : PredictionContainerIface() { - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - - if (deviceInfo.isCpu) - { - __DAAL_INITIALIZE_KERNELS(internal::KNNClassificationPredictKernel, algorithmFpType); - } - else - { - __DAAL_INITIALIZE_KERNELS_SYCL(internal::KNNClassificationPredictKernelUCAPI, algorithmFpType); - } + __DAAL_INITIALIZE_KERNELS(internal::KNNClassificationPredictKernel, 
algorithmFpType); } template @@ -61,8 +49,6 @@ services::Status BatchContainer::compute() const data_management::NumericTablePtr label = result->get(bf_knn_classification::prediction::prediction); const data_management::NumericTablePtr indices = result->get(bf_knn_classification::prediction::indices); const data_management::NumericTablePtr distances = result->get(bf_knn_classification::prediction::distances); - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); const Parameter * const par = static_cast(_par); internal::KernelParameter kernelPar; @@ -75,16 +61,8 @@ services::Status BatchContainer::compute() kernelPar.engine = par->engine->clone(); kernelPar.resultsToEvaluate = par->resultsToEvaluate; - if (deviceInfo.isCpu) - { - __DAAL_CALL_KERNEL(env, internal::KNNClassificationPredictKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFpType), compute, a.get(), m.get(), - label.get(), indices.get(), distances.get(), &kernelPar); - } - else - { - __DAAL_CALL_KERNEL_SYCL(env, internal::KNNClassificationPredictKernelUCAPI, __DAAL_KERNEL_ARGUMENTS(algorithmFpType), compute, a.get(), - m.get(), label.get(), indices.get(), distances.get(), par); - } + __DAAL_CALL_KERNEL(env, internal::KNNClassificationPredictKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFpType), compute, a.get(), m.get(), + label.get(), indices.get(), distances.get(), &kernelPar); } } // namespace prediction diff --git a/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_predict_dense_default_batch_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_predict_dense_default_batch_fpt_dispatcher.cpp index d3e4a3ad220..fe4644ac4f3 100644 --- a/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_predict_dense_default_batch_fpt_dispatcher.cpp +++ b/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_predict_dense_default_batch_fpt_dispatcher.cpp @@ -22,8 +22,8 @@ namespace daal { namespace 
algorithms { -__DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL(bf_knn_classification::prediction::BatchContainer, batch, DAAL_FPTYPE, - bf_knn_classification::prediction::defaultDense) +__DAAL_INSTANTIATE_DISPATCH_CONTAINER(bf_knn_classification::prediction::BatchContainer, batch, DAAL_FPTYPE, + bf_knn_classification::prediction::defaultDense) namespace bf_knn_classification { diff --git a/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_predict_fpt_ucapi.cpp b/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_predict_fpt_ucapi.cpp deleted file mode 100644 index 666b57adf50..00000000000 --- a/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_predict_fpt_ucapi.cpp +++ /dev/null @@ -1,37 +0,0 @@ -/* file: bf_knn_classification_predict_fpt_ucapi.cpp */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -#include "src/algorithms/k_nearest_neighbors/oneapi/bf_knn_classification_predict_kernel_ucapi_impl.i" -#include "src/algorithms/k_nearest_neighbors/bf_knn_classification_predict_dense_default_batch_container.h" - -namespace daal -{ -namespace algorithms -{ -namespace bf_knn_classification -{ -namespace prediction -{ -namespace internal -{ -template class DAAL_EXPORT KNNClassificationPredictKernelUCAPI; - -} // namespace internal -} // namespace prediction -} // namespace bf_knn_classification -} // namespace algorithms -} // namespace daal diff --git a/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_predict_kernel.h b/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_predict_kernel.h index d0d1efbb69f..55f4f3a14ba 100755 --- a/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_predict_kernel.h +++ b/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_predict_kernel.h @@ -21,7 +21,7 @@ #include "src/algorithms/kernel.h" #include "data_management/data/numeric_table.h" #include "src/algorithms/service_kernel_math.h" -#include "src/algorithms/k_nearest_neighbors/oneapi/bf_knn_classification_model_ucapi_impl.h" +#include "src/algorithms/k_nearest_neighbors/bf_knn_classification_model_impl.h" #include "algorithms/k_nearest_neighbors/bf_knn_classification_predict_types.h" namespace daal diff --git a/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_predict_kernel_impl.i b/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_predict_kernel_impl.i index 31da6ed69bc..5e42208a232 100755 --- a/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_predict_kernel_impl.i +++ b/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_predict_kernel_impl.i @@ -22,8 +22,8 @@ #include "services/daal_defines.h" #include "algorithms/k_nearest_neighbors/bf_knn_classification_model.h" 
+#include "src/algorithms/k_nearest_neighbors/bf_knn_classification_model_impl.h" #include "src/algorithms/k_nearest_neighbors/bf_knn_classification_predict_kernel.h" -#include "src/algorithms/k_nearest_neighbors/oneapi/bf_knn_classification_model_ucapi_impl.h" #include "src/algorithms/k_nearest_neighbors/bf_knn_impl.i" #include "src/services/service_data_utils.h" #include "src/data_management/service_numeric_table.h" diff --git a/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_predict_result.h b/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_predict_result.h index 3daf3005881..1af051b6950 100644 --- a/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_predict_result.h +++ b/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_predict_result.h @@ -25,7 +25,7 @@ #define __BF_KNN_CLASSIFICATION_TRAINING_RESULT_ #include "algorithms/classifier/classifier_model.h" -#include "data_management/data/internal/numeric_table_sycl_homogen.h" +#include "data_management/data/homogen_numeric_table.h" #include "algorithms/k_nearest_neighbors/bf_knn_classification_predict_types.h" namespace daal @@ -53,48 +53,20 @@ DAAL_EXPORT services::Status Result::allocate(const daal::algorithms::Input * in const size_t nRows = (static_cast(input))->getNumberOfRows(); - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - if (par->resultsToEvaluate & daal::algorithms::classifier::computeClassLabels) { - if (deviceInfo.isCpu) - { - set(prediction, - data_management::HomogenNumericTable::create(1, nRows, data_management::NumericTableIface::doAllocate, &s)); - } - else - { - set(prediction, data_management::internal::SyclHomogenNumericTable::create( - 1, nRows, data_management::NumericTableIface::doAllocate, &s)); - } + set(prediction, data_management::HomogenNumericTable::create(1, nRows, data_management::NumericTableIface::doAllocate, &s)); } if (s.ok() && (par->resultsToCompute 
& computeIndicesOfNeighbors)) { - if (deviceInfo.isCpu) - { - set(indices, data_management::HomogenNumericTable::create(par->k, nRows, data_management::NumericTableIface::doAllocate, &s)); - } - else - { - set(indices, - data_management::internal::SyclHomogenNumericTable::create(par->k, nRows, data_management::NumericTableIface::doAllocate, &s)); - } + set(indices, data_management::HomogenNumericTable::create(par->k, nRows, data_management::NumericTableIface::doAllocate, &s)); } if (s.ok() && (par->resultsToCompute & computeDistances)) { - if (deviceInfo.isCpu) - { - set(distances, - data_management::HomogenNumericTable::create(par->k, nRows, data_management::NumericTableIface::doAllocate, &s)); - } - else - { - set(distances, data_management::internal::SyclHomogenNumericTable::create( - par->k, nRows, data_management::NumericTableIface::doAllocate, &s)); - } + set(distances, + data_management::HomogenNumericTable::create(par->k, nRows, data_management::NumericTableIface::doAllocate, &s)); } return s; diff --git a/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_train_container.h b/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_train_container.h old mode 100755 new mode 100644 index 4825d269a89..950fab05c37 --- a/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_train_container.h +++ b/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_train_container.h @@ -18,14 +18,12 @@ #ifndef __BF_KNN_CLASSIFICATION_TRAIN_CONTAINER_H__ #define __BF_KNN_CLASSIFICATION_TRAIN_CONTAINER_H__ -#include "services/internal/sycl/execution_context.h" #include "src/algorithms/kernel.h" #include "data_management/data/numeric_table.h" #include "services/daal_shared_ptr.h" #include "algorithms/classifier/classifier_model.h" #include "algorithms/k_nearest_neighbors/bf_knn_classification_training_batch.h" -#include "src/algorithms/k_nearest_neighbors/oneapi/bf_knn_classification_train_kernel_ucapi.h" -#include 
"src/algorithms/k_nearest_neighbors/oneapi/bf_knn_classification_model_ucapi_impl.h" +#include "src/algorithms/k_nearest_neighbors/bf_knn_classification_model_impl.h" #include "src/algorithms/k_nearest_neighbors/bf_knn_classification_train_kernel.h" namespace daal @@ -41,17 +39,7 @@ using namespace daal::data_management; template BatchContainer::BatchContainer(daal::services::Environment::env * daalEnv) { - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - - if (deviceInfo.isCpu) - { - __DAAL_INITIALIZE_KERNELS(internal::KNNClassificationTrainKernel, algorithmFpType); - } - else - { - __DAAL_INITIALIZE_KERNELS_SYCL(internal::KNNClassificationTrainKernelUCAPI, DAAL_FPTYPE); - } + __DAAL_INITIALIZE_KERNELS(internal::KNNClassificationTrainKernel, algorithmFpType); } template @@ -83,19 +71,8 @@ services::Status BatchContainer::compute() } DAAL_CHECK_STATUS_VAR(status); - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - - if (deviceInfo.isCpu) - { - __DAAL_CALL_KERNEL(env, internal::KNNClassificationTrainKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFpType), compute, r->impl()->getData().get(), - r->impl()->getLabels().get(), r.get(), *par, *par->engine); - } - else - { - __DAAL_CALL_KERNEL_SYCL(env, internal::KNNClassificationTrainKernelUCAPI, __DAAL_KERNEL_ARGUMENTS(algorithmFpType), compute, - r->impl()->getData().get(), r->impl()->getLabels().get(), r.get(), *par, *par->engine); - } + __DAAL_CALL_KERNEL(env, internal::KNNClassificationTrainKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFpType), compute, r->impl()->getData().get(), + r->impl()->getLabels().get(), r.get(), *par, *par->engine); } } // namespace training diff --git a/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_train_dense_default_batch_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_train_dense_default_batch_fpt_dispatcher.cpp index 
fcd751e1a81..919607d081e 100644 --- a/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_train_dense_default_batch_fpt_dispatcher.cpp +++ b/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_train_dense_default_batch_fpt_dispatcher.cpp @@ -21,8 +21,8 @@ namespace daal { namespace algorithms { -__DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL(bf_knn_classification::training::BatchContainer, batch, DAAL_FPTYPE, - bf_knn_classification::training::defaultDense) +__DAAL_INSTANTIATE_DISPATCH_CONTAINER(bf_knn_classification::training::BatchContainer, batch, DAAL_FPTYPE, + bf_knn_classification::training::defaultDense) namespace bf_knn_classification { namespace training diff --git a/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_train_fpt_ucapi.cpp b/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_train_fpt_ucapi.cpp deleted file mode 100644 index 00cf1e4ba1f..00000000000 --- a/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_train_fpt_ucapi.cpp +++ /dev/null @@ -1,37 +0,0 @@ -/* file: bf_knn_classification_train_fpt_ucapi.cpp */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -#include "src/algorithms/k_nearest_neighbors/oneapi/bf_knn_classification_train_kernel_ucapi_impl.i" -#include "src/algorithms/k_nearest_neighbors/bf_knn_classification_predict_dense_default_batch_container.h" - -namespace daal -{ -namespace algorithms -{ -namespace bf_knn_classification -{ -namespace training -{ -namespace internal -{ -template class DAAL_EXPORT KNNClassificationTrainKernelUCAPI; - -} // namespace internal -} // namespace training -} // namespace bf_knn_classification -} // namespace algorithms -} // namespace daal diff --git a/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_train_kernel.h b/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_train_kernel.h index c4ff48fb224..4120cadfbe7 100755 --- a/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_train_kernel.h +++ b/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_train_kernel.h @@ -21,7 +21,7 @@ #include "data_management/data/numeric_table.h" #include "src/algorithms/kernel.h" #include "algorithms/k_nearest_neighbors/bf_knn_classification_training_types.h" -#include "src/algorithms/k_nearest_neighbors/oneapi/bf_knn_classification_model_ucapi_impl.h" +#include "src/algorithms/k_nearest_neighbors/bf_knn_classification_model_impl.h" namespace daal { diff --git a/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_train_kernel_impl.i b/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_train_kernel_impl.i index ec2760f66a2..4e13d9f82c2 100755 --- a/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_train_kernel_impl.i +++ b/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_classification_train_kernel_impl.i @@ -22,7 +22,7 @@ #include "services/daal_defines.h" #include "src/algorithms/k_nearest_neighbors/bf_knn_classification_train_kernel.h" -#include 
"src/algorithms/k_nearest_neighbors/oneapi/bf_knn_classification_model_ucapi_impl.h" +#include "src/algorithms/k_nearest_neighbors/bf_knn_classification_model_impl.h" #include "src/algorithms/k_nearest_neighbors/bf_knn_impl.i" namespace daal diff --git a/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_impl.i b/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_impl.i index 08ee8fcd5bb..59e2f7deb15 100644 --- a/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_impl.i +++ b/cpp/daal/src/algorithms/k_nearest_neighbors/bf_knn_impl.i @@ -24,7 +24,7 @@ #include "algorithms/k_nearest_neighbors/bf_knn_classification_model.h" #include "src/algorithms/k_nearest_neighbors/bf_knn_classification_train_kernel.h" #include "src/algorithms/k_nearest_neighbors/bf_knn_classification_predict_kernel.h" -#include "src/algorithms/k_nearest_neighbors/oneapi/bf_knn_classification_model_ucapi_impl.h" +#include "src/algorithms/k_nearest_neighbors/bf_knn_classification_model_impl.h" #include "data_management/data/numeric_table.h" #include "services/env_detect.h" #include "src/services/service_data_utils.h" diff --git a/cpp/daal/src/algorithms/k_nearest_neighbors/oneapi/bf_knn_classification_predict_kernel_ucapi.h b/cpp/daal/src/algorithms/k_nearest_neighbors/oneapi/bf_knn_classification_predict_kernel_ucapi.h deleted file mode 100755 index 3c956db1d6c..00000000000 --- a/cpp/daal/src/algorithms/k_nearest_neighbors/oneapi/bf_knn_classification_predict_kernel_ucapi.h +++ /dev/null @@ -1,90 +0,0 @@ -/* file: bf_knn_classification_predict_kernel_ucapi.h */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#ifndef __BF_KNN_CLASSIFICATION_PREDICT_KERNEL_UCAPI_H__ -#define __BF_KNN_CLASSIFICATION_PREDICT_KERNEL_UCAPI_H__ - -#include "src/algorithms/kernel.h" -#include "data_management/data/numeric_table.h" -#include "src/algorithms/k_nearest_neighbors/oneapi/bf_knn_classification_model_ucapi_impl.h" -#include "algorithms/k_nearest_neighbors/bf_knn_classification_predict_types.h" -#include "services/internal/sycl/execution_context.h" - -namespace daal -{ -namespace algorithms -{ -namespace bf_knn_classification -{ -namespace prediction -{ -namespace internal -{ -using namespace daal::data_management; - -template -class KNNClassificationPredictKernelUCAPI : public daal::algorithms::Kernel -{ -public: - services::Status compute(const NumericTable * x, const classifier::Model * m, NumericTable * y, NumericTable * outIndices, - NumericTable * outDistances, const daal::algorithms::Parameter * par); - - services::Status compute(const NumericTable * x, const classifier::Model * m, NumericTable * y, const daal::algorithms::Parameter * par); - -private: - services::Status copyPartialDistancesAndLabels(services::internal::sycl::ExecutionContextIface & context, - const services::internal::sycl::UniversalBuffer & distances, - const services::internal::sycl::UniversalBuffer & labels, - services::internal::sycl::UniversalBuffer & partialDistances, - services::internal::sycl::UniversalBuffer & partialLabels, uint32_t curQueryBlockRows, uint32_t k, - uint32_t nChunk, uint32_t 
totalNumberOfChunks); - - services::Status scatterSumOfSquares(services::internal::sycl::ExecutionContextIface & context, - const services::internal::sycl::UniversalBuffer & dataSumOfSquares, uint32_t dataBlockRowCount, - uint32_t queryBlockRowCount, services::internal::sycl::UniversalBuffer & distances); - - services::Status scatterBothL2Norms(services::internal::sycl::ExecutionContextIface & context, - const services::internal::sycl::UniversalBuffer & dataSumOfSquares, - const services::internal::sycl::UniversalBuffer & querySumOfSquares, uint32_t dataBlockRowCount, - uint32_t queryBlockRowCount, services::internal::sycl::UniversalBuffer & distances); - - services::Status computeDistances(services::internal::sycl::ExecutionContextIface & context, - const services::internal::Buffer & data, - const services::internal::Buffer & query, - services::internal::sycl::UniversalBuffer & distances, uint32_t dataBlockRowCount, uint32_t queryBlockRowCount, - uint32_t nFeatures); - - services::Status initializeIndices(services::internal::sycl::ExecutionContextIface & context, const uint32_t dataBlockRowCount, - const uint32_t fromDataBlockRow, services::internal::sycl::UniversalBuffer & indices); - - services::Status computeWinners(services::internal::sycl::ExecutionContextIface & context, - const services::internal::sycl::UniversalBuffer & labels, uint32_t queryBlockRowCount, uint32_t k, - services::internal::sycl::UniversalBuffer labelsOut); - - services::Status distancesFromSquares(services::internal::sycl::ExecutionContextIface & context, services::internal::sycl::UniversalBuffer & data, - const uint32_t distancesCount); - - services::Status buildProgram(services::internal::sycl::ClKernelFactoryIface & kernel_factory); -}; - -} // namespace internal -} // namespace prediction -} // namespace bf_knn_classification -} // namespace algorithms -} // namespace daal - -#endif diff --git 
a/cpp/daal/src/algorithms/k_nearest_neighbors/oneapi/bf_knn_classification_predict_kernel_ucapi_impl.i b/cpp/daal/src/algorithms/k_nearest_neighbors/oneapi/bf_knn_classification_predict_kernel_ucapi_impl.i deleted file mode 100644 index 0ca3cf93bee..00000000000 --- a/cpp/daal/src/algorithms/k_nearest_neighbors/oneapi/bf_knn_classification_predict_kernel_ucapi_impl.i +++ /dev/null @@ -1,637 +0,0 @@ -/* file: bf_knn_classification_predict_kernel_ucapi_impl.i */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -#ifndef __BF_KNN_CLASSIFICATION_PREDICT_KERNEL_UCAPI_IMPL_I__ -#define __BF_KNN_CLASSIFICATION_PREDICT_KERNEL_UCAPI_IMPL_I__ - -#include "algorithms/engines/engine.h" -#include "src/sycl/reducer.h" -#include "src/sycl/select_indexed.h" -#include "src/sycl/sorter.h" -#include "src/services/service_data_utils.h" -#include "services/daal_defines.h" - -#include "src/algorithms/k_nearest_neighbors/oneapi/bf_knn_classification_predict_kernel_ucapi.h" -#include "src/algorithms/k_nearest_neighbors/oneapi/bf_knn_classification_model_ucapi_impl.h" - -#include "src/sycl/blas_gpu.h" -#include "src/algorithms/k_nearest_neighbors/oneapi/cl_kernels/bf_knn_cl_kernels.cl" - -#include "src/externals/service_profiler.h" - -constexpr size_t maxInt32AsSizeT = static_cast(daal::services::internal::MaxVal::get()); -constexpr uint32_t maxInt32AsUint32T = static_cast(daal::services::internal::MaxVal::get()); - -namespace daal -{ -namespace algorithms -{ -namespace bf_knn_classification -{ -namespace prediction -{ -namespace internal -{ -using namespace daal::services::internal::sycl; -using namespace services; -using sort::RadixSort; -using selection::QuickSelectIndexed; -using selection::SelectIndexed; -using selection::SelectIndexedFactory; - -class Range -{ -public: - static Range createFromBlock(uint32_t blockIndex, uint32_t maxBlockSize, uint32_t sumOfBlocksSize) - { - // TODO: check that arguments are correct - - const uint32_t startIndex = blockIndex * maxBlockSize; - const uint32_t endIndex = startIndex + maxBlockSize; - return Range { startIndex, endIndex > sumOfBlocksSize ? 
sumOfBlocksSize : endIndex }; - } - - uint32_t startIndex; - uint32_t endIndex; - uint32_t count; - -private: - Range(uint32_t startIndex, uint32_t endIndex) : startIndex(startIndex), endIndex(endIndex), count(endIndex - startIndex) {} -}; - -template -services::Status KNNClassificationPredictKernelUCAPI::compute(const NumericTable * x, const classifier::Model * m, NumericTable * y, - NumericTable * outIndices, NumericTable * outDistances, - const daal::algorithms::Parameter * par) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute); - - services::Status st; - - auto & context = services::Environment::getInstance()->getDefaultExecutionContext(); - - const Model * model = static_cast(m); - DAAL_CHECK(model, services::ErrorNullModel); - - NumericTable * ntData = const_cast(x); - NumericTable * points = const_cast(model->impl()->getData().get()); - NumericTable * labels = const_cast(model->impl()->getLabels().get()); - - const Parameter * const parameter = static_cast(par); - DAAL_CHECK(par, services::ErrorNullParameterNotSupported); - - const bool computeOutputLabels = bool(parameter->resultsToEvaluate & daal::algorithms::classifier::computeClassLabels); - const bool computeOutputIndices = bool(parameter->resultsToCompute & ResultToComputeId::computeIndicesOfNeighbors); - const bool computeOutputDistances = bool(parameter->resultsToCompute & ResultToComputeId::computeDistances); - const bool isOwnDistances = computeOutputDistances && !(computeOutputLabels || computeOutputIndices); - - DAAL_CHECK(ntData != NULL, services::ErrorNullInputNumericTable); - DAAL_CHECK(points != NULL, services::ErrorNullInputNumericTable); - if (computeOutputLabels) - { - DAAL_CHECK(y != NULL, services::ErrorNullOutputNumericTable); - DAAL_CHECK(labels != NULL, services::ErrorNullInputNumericTable); - } - if (computeOutputIndices) - { - DAAL_CHECK(outIndices != NULL, services::ErrorNullOutputNumericTable); - } - if (computeOutputDistances) - { - DAAL_CHECK(outDistances != NULL, 
services::ErrorNullOutputNumericTable); - } - - const size_t kAsSizeT = parameter->k; - DAAL_CHECK(kAsSizeT <= maxInt32AsSizeT, services::ErrorIncorrectParameter); - const uint32_t k = static_cast(kAsSizeT); - - const size_t nQueryRowsSizeT = ntData->getNumberOfRows(); - const size_t nQueryFeaturesSizeT = ntData->getNumberOfColumns(); - const size_t nDataRowsSizeT = points->getNumberOfRows(); - const size_t nTrainFeaturesSizeT = points->getNumberOfColumns(); - const size_t nLabelRowsSizeT = computeOutputLabels ? labels->getNumberOfRows() : size_t(0); - - DAAL_CHECK(nQueryRowsSizeT <= maxInt32AsSizeT, services::ErrorIncorrectNumberOfRowsInInputNumericTable); - DAAL_CHECK(nDataRowsSizeT <= maxInt32AsSizeT, services::ErrorIncorrectNumberOfRowsInInputNumericTable) - DAAL_CHECK(nTrainFeaturesSizeT <= maxInt32AsSizeT, services::ErrorIncorrectNumberOfColumnsInInputNumericTable); - DAAL_CHECK(nTrainFeaturesSizeT == nQueryFeaturesSizeT, services::ErrorIncorrectNumberOfColumnsInInputNumericTable); - - if (computeOutputLabels) - { - DAAL_CHECK(nLabelRowsSizeT <= maxInt32AsSizeT, services::ErrorIncorrectNumberOfRowsInInputNumericTable); - DAAL_CHECK(labels->getNumberOfColumns() == size_t(1), services::ErrorIncorrectNumberOfColumnsInOutputNumericTable); - DAAL_CHECK(y->getNumberOfColumns() == size_t(1), services::ErrorIncorrectNumberOfColumnsInOutputNumericTable); - } - - if (computeOutputIndices) - { - DAAL_CHECK(outIndices->getNumberOfColumns() == kAsSizeT, services::ErrorIncorrectNumberOfColumnsInOutputNumericTable); - DAAL_CHECK(outIndices->getNumberOfRows() == nQueryRowsSizeT, services::ErrorIncorrectNumberOfRowsInOutputNumericTable); - } - - if (computeOutputDistances) - { - DAAL_CHECK(outDistances->getNumberOfColumns() == kAsSizeT, services::ErrorIncorrectNumberOfColumnsInOutputNumericTable); - DAAL_CHECK(outDistances->getNumberOfRows() == nQueryRowsSizeT, services::ErrorIncorrectNumberOfRowsInOutputNumericTable); - } - - const uint32_t nQueryRows = 
static_cast(nQueryRowsSizeT); - const uint32_t nLabelRows = static_cast(nLabelRowsSizeT); - const uint32_t nDataRows = computeOutputLabels ? static_cast(nDataRowsSizeT < nLabelRowsSizeT ? nDataRowsSizeT : nLabelRowsSizeT) : - static_cast(nDataRowsSizeT); - const uint32_t nFeatures = static_cast(nTrainFeaturesSizeT); - - // Block dimensions below are optimal for GEN9 - // Number of doubles is to 2X less against floats - // to keep the same block size in bytes - const uint32_t maxDataBlockRowCount = 4 * 4096; - const uint32_t maxQueryBlockRowCount = 4 * 2048 / sizeof(algorithmFpType); - DAAL_CHECK(k <= maxDataBlockRowCount, services::ErrorIncorrectParameter); - - // Maximal number of partial selections to be merged at once - const uint32_t selectionMaxNumberOfChunks = 16; - const uint32_t histogramSize = 256; - - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(uint32_t, maxDataBlockRowCount, maxQueryBlockRowCount); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(uint32_t, maxQueryBlockRowCount, k); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(uint32_t, maxQueryBlockRowCount * k, selectionMaxNumberOfChunks); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(uint32_t, maxQueryBlockRowCount, histogramSize); - - // Allocation section - - auto dataSumOfSquares = context.allocate(TypeIds::id(), maxDataBlockRowCount, st); - DAAL_CHECK_STATUS_VAR(st); - UniversalBuffer querySumOfSquares; - if (computeOutputDistances) - { - querySumOfSquares = context.allocate(TypeIds::id(), maxQueryBlockRowCount, st); - DAAL_CHECK_STATUS_VAR(st); - } - auto distances = context.allocate(TypeIds::id(), maxDataBlockRowCount * maxQueryBlockRowCount, st); - DAAL_CHECK_STATUS_VAR(st); - auto partialDistances = context.allocate(TypeIds::id(), maxQueryBlockRowCount * k * selectionMaxNumberOfChunks, st); - DAAL_CHECK_STATUS_VAR(st); - - UniversalBuffer partialLabels; - if (computeOutputLabels) - { - partialLabels = context.allocate(TypeIds::id(), maxQueryBlockRowCount * k * selectionMaxNumberOfChunks, st); - 
DAAL_CHECK_STATUS_VAR(st); - } - // temporary buffer for indices - UniversalBuffer blockIndices; - if (computeOutputIndices || isOwnDistances) - { - blockIndices = context.allocate(TypeIds::id(), maxDataBlockRowCount, st); - DAAL_CHECK_STATUS_VAR(st); - } - UniversalBuffer partialIndices; - if (computeOutputIndices || isOwnDistances) - { - partialIndices = context.allocate(TypeIds::id(), maxQueryBlockRowCount * k * selectionMaxNumberOfChunks, st); - DAAL_CHECK_STATUS_VAR(st); - } - UniversalBuffer sortedLabels; - if (computeOutputLabels) - { - sortedLabels = context.allocate(TypeIds::id(), maxQueryBlockRowCount * k, st); - DAAL_CHECK_STATUS_VAR(st); - } - // temporary buffer for RADIX sort - UniversalBuffer radixBuffer; - if (computeOutputLabels) - { - radixBuffer = context.allocate(TypeIds::id(), maxQueryBlockRowCount * histogramSize, st); - DAAL_CHECK_STATUS_VAR(st); - } - - const uint32_t nDataBlockCount = nDataRows / maxDataBlockRowCount + uint32_t(nDataRows % maxDataBlockRowCount != 0); - const uint32_t nQueryBlockCount = nQueryRows / maxQueryBlockRowCount + uint32_t(nQueryRows % maxQueryBlockRowCount != 0); - const uint32_t nSelectionBlockCount = nDataBlockCount / selectionMaxNumberOfChunks + uint32_t(nDataBlockCount % selectionMaxNumberOfChunks != 0); - - SelectIndexed::Result selectResult(context, k, maxQueryBlockRowCount, distances.type(), st); - DAAL_CHECK_STATUS_VAR(st); - SelectIndexed::Result selectResultIndices(context, k, maxQueryBlockRowCount, distances.type(), st); - DAAL_CHECK_STATUS_VAR(st); - - SelectIndexed::Params params(k, TypeIds::id(), maxDataBlockRowCount, parameter->engine); - SelectIndexedFactory factory; - SharedPtr selector(factory.create(k, params, st)); - DAAL_CHECK_STATUS_VAR(st); - - for (uint32_t qblock = 0; qblock < nQueryBlockCount; qblock++) - { - Range curQueryRange = Range::createFromBlock(qblock, maxQueryBlockRowCount, nQueryRows); - BlockDescriptor queryRows; - 
DAAL_CHECK_STATUS_VAR(ntData->getBlockOfRows(curQueryRange.startIndex, curQueryRange.count, readOnly, queryRows)); - auto curQuery = queryRows.getBuffer(); - UniversalBuffer queryNormsBuffer; - if (computeOutputDistances) - { - // Collect sums of squares from query data - // necessary only if full distances are required - auto querySumResult = math::SumReducer::sum(math::Layout::RowMajor, curQuery, curQueryRange.count, nFeatures, st); - DAAL_CHECK_STATUS_VAR(st); - queryNormsBuffer = querySumResult.sumOfSquares; - } - for (uint32_t sblock = 0; sblock < nSelectionBlockCount; sblock++) - { - uint32_t curSelectionMaxNumberOfChunks = sblock == 0 ? selectionMaxNumberOfChunks : selectionMaxNumberOfChunks - 1; - uint32_t selectionChunkCount = uint32_t(sblock != 0); - Range curDataBlockRange = Range::createFromBlock(sblock, curSelectionMaxNumberOfChunks, nDataBlockCount); - for (uint32_t dblock = curDataBlockRange.startIndex; dblock < curDataBlockRange.endIndex; dblock++) - { - Range curDataRange = Range::createFromBlock(dblock, maxDataBlockRowCount, nDataRows); - BlockDescriptor labelRows; - if (computeOutputLabels) - { - DAAL_CHECK_STATUS_VAR(labels->getBlockOfRows(curDataRange.startIndex, curDataRange.count, readOnly, labelRows)); - } - if (computeOutputIndices) - { - DAAL_CHECK_STATUS_VAR(initializeIndices(context, curDataRange.startIndex, curDataRange.count, blockIndices)); - } - BlockDescriptor dataRows; - DAAL_CHECK_STATUS_VAR(points->getBlockOfRows(curDataRange.startIndex, curDataRange.count, readOnly, dataRows)); - UniversalBuffer dataNormsBuffer; - { - // Collect sums of squares from train data - auto dataSumResult = math::SumReducer::sum(math::Layout::RowMajor, dataRows.getBuffer(), curDataRange.count, nFeatures, st); - DAAL_CHECK_STATUS_VAR(st); - dataNormsBuffer = dataSumResult.sumOfSquares; - } - // Initialize GEMM distances - if (computeOutputDistances) - { - // slightly less performace but needed if distances are required - DAAL_CHECK_STATUS_VAR( - 
scatterBothL2Norms(context, dataNormsBuffer, queryNormsBuffer, curDataRange.count, curQueryRange.count, distances)); - } - else - { - // slightly more performance suitable for all other cases - DAAL_CHECK_STATUS_VAR(scatterSumOfSquares(context, dataNormsBuffer, curDataRange.count, curQueryRange.count, distances)); - } - // Let's calculate distances using GEMM - DAAL_CHECK_STATUS_VAR( - computeDistances(context, dataRows.getBuffer(), curQuery, distances, curDataRange.count, curQueryRange.count, nFeatures)); - if (computeOutputLabels) - { - // Select k smallest distances and their labels from every row of the [curQueryRange.count]x[curDataRange.count] block - DAAL_CHECK_STATUS_VAR(selector->selectNearestDistancesAndLabels(distances, labelRows.getBuffer(), k, curQueryRange.count, - curDataRange.count, curDataRange.count, 0, selectResult)); - DAAL_CHECK_STATUS_VAR(st); - // copy block results to buffer in order to get merged with the same selection algorithm (up to selectionMaxNumberOfChunks of partial results) - // and keep the first part containing previously merged result if exists - DAAL_CHECK_STATUS_VAR(copyPartialDistancesAndLabels(context, selectResult.values, selectResult.indices, partialDistances, - partialLabels, curQueryRange.count, k, selectionChunkCount, - selectionMaxNumberOfChunks)); - DAAL_CHECK_STATUS_VAR(labels->releaseBlockOfRows(labelRows)); - } - if (computeOutputIndices || isOwnDistances) - { - // Select k smallest distances and their indices from every row of the [curQueryRange.count]x[curDataRange.count] block - DAAL_CHECK_STATUS_VAR(selector->selectNearestDistancesAndLabels(distances, blockIndices, k, curQueryRange.count, - curDataRange.count, curDataRange.count, 0, selectResultIndices)); - DAAL_CHECK_STATUS_VAR(st); - // copy block results to buffer in order to get merged with the same selection algorithm (up to selectionMaxNumberOfChunks of partial results) - // and keep the first part containing previously merged result if exists - 
DAAL_CHECK_STATUS_VAR(copyPartialDistancesAndLabels(context, selectResultIndices.values, selectResultIndices.indices, - partialDistances, partialIndices, curQueryRange.count, k, selectionChunkCount, - selectionMaxNumberOfChunks)); - } - - DAAL_CHECK_STATUS_VAR(points->releaseBlockOfRows(dataRows)); - selectionChunkCount++; - } - if (computeOutputLabels) - { - // merge partial data by one more K-selection - DAAL_CHECK_STATUS_VAR(selector->selectNearestDistancesAndLabels(partialDistances, partialLabels, k, curQueryRange.count, - k * curDataBlockRange.count, k * selectionMaxNumberOfChunks, - k * selectionMaxNumberOfChunks, selectResult)); - } - if (computeOutputIndices || isOwnDistances) - { - // merge partial data by one more K-selection - DAAL_CHECK_STATUS_VAR(selector->selectNearestDistancesAndLabels(partialDistances, partialIndices, k, curQueryRange.count, - k * curDataBlockRange.count, k * selectionMaxNumberOfChunks, - k * selectionMaxNumberOfChunks, selectResultIndices)); - } - } - if (computeOutputLabels) - { - // sort labels of closest neighbors - st |= RadixSort::sort(selectResult.indices, sortedLabels, radixBuffer, curQueryRange.count, k, k); - DAAL_CHECK_STATUS_VAR(st); - BlockDescriptor labelsBlock; - DAAL_CHECK_STATUS_VAR(y->getBlockOfRows(curQueryRange.startIndex, curQueryRange.count, writeOnly, labelsBlock)); - // search for maximum occurrence label - DAAL_CHECK_STATUS_VAR(computeWinners(context, sortedLabels, curQueryRange.count, k, labelsBlock.getBuffer())); - DAAL_CHECK_STATUS_VAR(y->releaseBlockOfRows(labelsBlock)); - } - if (computeOutputIndices) - { - BlockDescriptor indicesBlock; - DAAL_CHECK_STATUS_VAR(outIndices->getBlockOfRows(curQueryRange.startIndex, curQueryRange.count, writeOnly, indicesBlock)); - auto outBuff = indicesBlock.getBuffer(); - const size_t indicesCount = outBuff.size(); - auto inpBuff = selectResultIndices.indices; - context.copy(outBuff, size_t(0), inpBuff, size_t(0), indicesCount, st); - DAAL_CHECK_STATUS_VAR(st); - 
DAAL_CHECK_STATUS_VAR(outIndices->releaseBlockOfRows(indicesBlock)); - } - if ((computeOutputDistances && computeOutputIndices) || isOwnDistances) - { - BlockDescriptor distancesBlock; - DAAL_CHECK_STATUS_VAR(outDistances->getBlockOfRows(curQueryRange.startIndex, curQueryRange.count, writeOnly, distancesBlock)); - auto outBuff = distancesBlock.getBuffer(); - const size_t distancesCount = outBuff.size(); - auto inpBuff = selectResultIndices.values; - DAAL_CHECK_STATUS_VAR(distancesFromSquares(context, inpBuff, distancesCount)); - context.copy(outBuff, size_t(0), inpBuff, size_t(0), distancesCount, st); - DAAL_CHECK_STATUS_VAR(st); - DAAL_CHECK_STATUS_VAR(outDistances->releaseBlockOfRows(distancesBlock)); - } - else if (computeOutputDistances && computeOutputLabels) - { - BlockDescriptor distancesBlock; - DAAL_CHECK_STATUS_VAR(outDistances->getBlockOfRows(curQueryRange.startIndex, curQueryRange.count, writeOnly, distancesBlock)); - auto outBuff = distancesBlock.getBuffer(); - const size_t distancesCount = outBuff.size(); - auto inpBuff = selectResult.values; - DAAL_CHECK_STATUS_VAR(distancesFromSquares(context, inpBuff, distancesCount)); - context.copy(outBuff, size_t(0), inpBuff, size_t(0), distancesCount, st); - DAAL_CHECK_STATUS_VAR(st); - DAAL_CHECK_STATUS_VAR(outDistances->releaseBlockOfRows(distancesBlock)); - } - DAAL_CHECK_STATUS_VAR(ntData->releaseBlockOfRows(queryRows)); - } - return st; -} - -template -services::Status KNNClassificationPredictKernelUCAPI::compute(const NumericTable * x, const classifier::Model * m, NumericTable * y, - const daal::algorithms::Parameter * par) -{ - return compute(x, m, y, NULL, NULL, par); -} - -template -services::Status KNNClassificationPredictKernelUCAPI::initializeIndices(services::internal::sycl::ExecutionContextIface & context, - const uint32_t fromDataBlockRow, - const uint32_t dataBlockRowCount, - services::internal::sycl::UniversalBuffer & indices) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.initializeIndices); - - 
services::Status st; - auto & kernelFactory = context.getClKernelFactory(); - DAAL_CHECK_STATUS_VAR(buildProgram(kernelFactory)); - auto kernel = kernelFactory.getKernel("initialize_indices", st); - DAAL_CHECK_STATUS_VAR(st); - - DAAL_OVERFLOW_CHECK_BY_ADDING(int32_t, fromDataBlockRow, dataBlockRowCount); - DAAL_ASSERT_UNIVERSAL_BUFFER(indices, int, dataBlockRowCount); - - KernelArguments args(2, st); - DAAL_CHECK_STATUS_VAR(st); - - args.set(0, indices, AccessModeIds::write); - args.set(1, static_cast(fromDataBlockRow)); - - KernelRange range(dataBlockRowCount); - - context.run(range, kernel, args, st); - DAAL_CHECK_STATUS_VAR(st); - - return st; -} - -template -services::Status KNNClassificationPredictKernelUCAPI::distancesFromSquares(services::internal::sycl::ExecutionContextIface & context, - services::internal::sycl::UniversalBuffer & data, - const uint32_t distancesCount) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.distancesFromSquare); - - services::Status st; - auto & kernelFactory = context.getClKernelFactory(); - DAAL_CHECK_STATUS_VAR(buildProgram(kernelFactory)); - auto kernel = kernelFactory.getKernel("distances_from_squares", st); - DAAL_CHECK_STATUS_VAR(st); - - DAAL_ASSERT_UNIVERSAL_BUFFER(data, algorithmFpType, distancesCount); - - KernelArguments args(1, st); - DAAL_CHECK_STATUS_VAR(st); - - args.set(0, data, AccessModeIds::readwrite); - - KernelRange range(distancesCount); - - context.run(range, kernel, args, st); - DAAL_CHECK_STATUS_VAR(st); - - return st; -} - -template -services::Status KNNClassificationPredictKernelUCAPI::copyPartialDistancesAndLabels( - ExecutionContextIface & context, const UniversalBuffer & distances, const UniversalBuffer & labels, UniversalBuffer & partialDistances, - UniversalBuffer & partialLabels, uint32_t queryBlockRows, uint32_t k, uint32_t nChunk, uint32_t totalNumberOfChunks) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.copyPartialSelections); - - services::Status st; - auto & kernelFactory = context.getClKernelFactory(); - 
DAAL_CHECK_STATUS_VAR(buildProgram(kernelFactory)); - auto kernel = kernelFactory.getKernel("copy_partial_selection", st); - DAAL_CHECK_STATUS_VAR(st); - - DAAL_ASSERT_UNIVERSAL_BUFFER(distances, algorithmFpType, queryBlockRows * k); - DAAL_ASSERT_UNIVERSAL_BUFFER(labels, int, queryBlockRows * k); - DAAL_ASSERT_UNIVERSAL_BUFFER(partialDistances, algorithmFpType, queryBlockRows * k * totalNumberOfChunks); - DAAL_ASSERT_UNIVERSAL_BUFFER(partialLabels, int, queryBlockRows * k * totalNumberOfChunks); - - KernelArguments args(7, st); - DAAL_CHECK_STATUS_VAR(st); - - args.set(0, distances, AccessModeIds::read); - args.set(1, labels, AccessModeIds::read); - args.set(2, partialDistances, AccessModeIds::readwrite); - args.set(3, partialLabels, AccessModeIds::readwrite); - args.set(4, static_cast(k)); - args.set(5, static_cast(nChunk)); - args.set(6, static_cast(totalNumberOfChunks)); - - KernelRange localRange(1, 1); - KernelRange globalRange(queryBlockRows, k); - - KernelNDRange range(2); - range.global(globalRange, st); - DAAL_CHECK_STATUS_VAR(st); - range.local(localRange, st); - DAAL_CHECK_STATUS_VAR(st); - context.run(range, kernel, args, st); - DAAL_CHECK_STATUS_VAR(st); - return st; -} - -template -services::Status KNNClassificationPredictKernelUCAPI::scatterSumOfSquares(ExecutionContextIface & context, - const UniversalBuffer & dataSumOfSquares, - uint32_t dataBlockRowCount, uint32_t queryBlockRowCount, - UniversalBuffer & distances) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.scatterSumOfSquares); - DAAL_CHECK(dataBlockRowCount <= maxInt32AsUint32T, services::ErrorBufferSizeIntegerOverflow); - - services::Status st; - auto & kernelFactory = context.getClKernelFactory(); - DAAL_CHECK_STATUS_VAR(buildProgram(kernelFactory)); - auto kernel = kernelFactory.getKernel("scatter_row", st); - DAAL_CHECK_STATUS_VAR(st); - - DAAL_ASSERT_UNIVERSAL_BUFFER(dataSumOfSquares, algorithmFpType, dataBlockRowCount); - DAAL_ASSERT_UNIVERSAL_BUFFER(distances, algorithmFpType, 
dataBlockRowCount * queryBlockRowCount); - - KernelArguments args(3, st); - DAAL_CHECK_STATUS_VAR(st); - args.set(0, dataSumOfSquares, AccessModeIds::read); - args.set(1, distances, AccessModeIds::write); - args.set(2, static_cast(dataBlockRowCount)); - - KernelRange globalRange(dataBlockRowCount, queryBlockRowCount); - context.run(globalRange, kernel, args, st); - DAAL_CHECK_STATUS_VAR(st); - return st; -} - -template -services::Status KNNClassificationPredictKernelUCAPI::scatterBothL2Norms(ExecutionContextIface & context, - const UniversalBuffer & dataSumOfSquares, - const UniversalBuffer & querySumOfSquares, - uint32_t dataBlockRowCount, uint32_t queryBlockRowCount, - UniversalBuffer & distances) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.scatterSumOfSquares); - DAAL_CHECK(dataBlockRowCount <= maxInt32AsUint32T, services::ErrorBufferSizeIntegerOverflow); - - services::Status st; - auto & kernelFactory = context.getClKernelFactory(); - DAAL_CHECK_STATUS_VAR(buildProgram(kernelFactory)); - auto kernel = kernelFactory.getKernel("scatter_row_col", st); - DAAL_CHECK_STATUS_VAR(st); - - DAAL_ASSERT_UNIVERSAL_BUFFER(dataSumOfSquares, algorithmFpType, dataBlockRowCount); - DAAL_ASSERT_UNIVERSAL_BUFFER(querySumOfSquares, algorithmFpType, queryBlockRowCount); - DAAL_ASSERT_UNIVERSAL_BUFFER(distances, algorithmFpType, dataBlockRowCount * queryBlockRowCount); - - KernelArguments args(4, st); - DAAL_CHECK_STATUS_VAR(st); - - args.set(0, dataSumOfSquares, AccessModeIds::read); - args.set(1, querySumOfSquares, AccessModeIds::read); - args.set(2, distances, AccessModeIds::write); - args.set(3, static_cast(dataBlockRowCount)); - - KernelRange globalRange(dataBlockRowCount, queryBlockRowCount); - context.run(globalRange, kernel, args, st); - DAAL_CHECK_STATUS_VAR(st); - return st; -} - -template -services::Status KNNClassificationPredictKernelUCAPI::computeDistances(ExecutionContextIface & context, - const services::internal::Buffer & data, - const services::internal::Buffer & 
query, - UniversalBuffer & distances, uint32_t dataBlockRowCount, - uint32_t queryBlockRowCount, uint32_t nFeatures) - -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.GEMM); - DAAL_ASSERT_UNIVERSAL_BUFFER(distances, algorithmFpType, queryBlockRowCount * dataBlockRowCount); - DAAL_ASSERT(data.size() >= dataBlockRowCount * nFeatures); - DAAL_ASSERT(query.size() >= queryBlockRowCount * nFeatures); - return BlasGpu::xgemm(math::Layout::RowMajor, math::Transpose::NoTrans, math::Transpose::Trans, queryBlockRowCount, - dataBlockRowCount, nFeatures, algorithmFpType(-2.0), query, nFeatures, 0, data, nFeatures, 0, - algorithmFpType(1.0), distances.get(), dataBlockRowCount, 0); -} - -template -services::Status KNNClassificationPredictKernelUCAPI::computeWinners(ExecutionContextIface & context, const UniversalBuffer & labels, - uint32_t queryBlockRowCount, uint32_t k, - UniversalBuffer labelsOut) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.computeWinners); - - services::Status st; - auto & kernelFactory = context.getClKernelFactory(); - DAAL_CHECK_STATUS_VAR(buildProgram(kernelFactory)); - auto kernel = kernelFactory.getKernel("find_max_occurance", st); - DAAL_CHECK_STATUS_VAR(st); - - DAAL_ASSERT_UNIVERSAL_BUFFER(labels, int, queryBlockRowCount * k); - DAAL_ASSERT_UNIVERSAL_BUFFER(labelsOut, algorithmFpType, queryBlockRowCount); - - KernelArguments args(3, st); - DAAL_CHECK_STATUS_VAR(st); - args.set(0, labels, AccessModeIds::read); - args.set(1, labelsOut, AccessModeIds::write); - args.set(2, static_cast(k)); - - KernelRange localRange(1); - KernelRange globalRange(queryBlockRowCount); - - KernelNDRange range(1); - range.global(globalRange, st); - DAAL_CHECK_STATUS_VAR(st); - range.local(localRange, st); - DAAL_CHECK_STATUS_VAR(st); - context.run(range, kernel, args, st); - DAAL_CHECK_STATUS_VAR(st); - return st; -} - -template -services::Status KNNClassificationPredictKernelUCAPI::buildProgram(ClKernelFactoryIface & kernelFactory) -{ - auto fptypeName = 
services::internal::sycl::getKeyFPType(); - auto buildOptions = fptypeName; - buildOptions.add(" -D sortedType=int -D NumParts=16 "); - - services::String cachekey("__daal_algorithms_bf_knn_block_"); - cachekey.add(fptypeName); - cachekey.add(buildOptions); - - services::Status st; - { - DAAL_ITTNOTIFY_SCOPED_TASK(compute.buildProgram); - kernelFactory.build(ExecutionTargetIds::device, cachekey.c_str(), bf_knn_cl_kernels, buildOptions.c_str(), st); - DAAL_CHECK_STATUS_VAR(st); - } - return st; -} - -} // namespace internal -} // namespace prediction -} // namespace bf_knn_classification -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/k_nearest_neighbors/oneapi/bf_knn_classification_train_kernel_ucapi.h b/cpp/daal/src/algorithms/k_nearest_neighbors/oneapi/bf_knn_classification_train_kernel_ucapi.h deleted file mode 100644 index 7201b4ede83..00000000000 --- a/cpp/daal/src/algorithms/k_nearest_neighbors/oneapi/bf_knn_classification_train_kernel_ucapi.h +++ /dev/null @@ -1,52 +0,0 @@ -/* file: bf_knn_classification_train_kernel_ucapi.h */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -#ifndef __BF_KNN_CLASSIFICATION_TRAIN_KERNEL_UCAPI_H__ -#define __BF_KNN_CLASSIFICATION_TRAIN_KERNEL_UCAPI_H__ - -#include "data_management/data/numeric_table.h" -#include "src/algorithms/kernel.h" -#include "algorithms/k_nearest_neighbors/bf_knn_classification_training_types.h" -#include "src/algorithms/k_nearest_neighbors/oneapi/bf_knn_classification_model_ucapi_impl.h" - -namespace daal -{ -namespace algorithms -{ -namespace bf_knn_classification -{ -namespace training -{ -namespace internal -{ -using namespace daal::data_management; -using namespace daal::services; - -template -class KNNClassificationTrainKernelUCAPI : public daal::algorithms::Kernel -{ -public: - services::Status compute(NumericTable * x, NumericTable * y, Model * r, const Parameter & par, engines::BatchBase & engine); -}; - -} // namespace internal -} // namespace training -} // namespace bf_knn_classification -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/k_nearest_neighbors/oneapi/bf_knn_classification_train_kernel_ucapi_impl.i b/cpp/daal/src/algorithms/k_nearest_neighbors/oneapi/bf_knn_classification_train_kernel_ucapi_impl.i deleted file mode 100644 index e6dc8f142c5..00000000000 --- a/cpp/daal/src/algorithms/k_nearest_neighbors/oneapi/bf_knn_classification_train_kernel_ucapi_impl.i +++ /dev/null @@ -1,46 +0,0 @@ -/* file: bf_knn_classification_train_kernel_ucapi_impl.i */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#ifndef __BF_KNN_CLASSIFICATION_TRAIN_KERNEL_UCAPI_IMPL_I__ -#define __BF_KNN_CLASSIFICATION_TRAIN_KERNEL_UCAPI_IMPL_I__ - -#include "src/algorithms/k_nearest_neighbors/oneapi/bf_knn_classification_train_kernel_ucapi.h" - -namespace daal -{ -namespace algorithms -{ -namespace bf_knn_classification -{ -namespace training -{ -namespace internal -{ -template -Status KNNClassificationTrainKernelUCAPI::compute(NumericTable * x, NumericTable * y, Model * r, const Parameter & par, - engines::BatchBase & engine) -{ - return Status(); -} - -} // namespace internal -} // namespace training -} // namespace bf_knn_classification -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/k_nearest_neighbors/oneapi/cl_kernels/bf_knn_cl_kernels.cl b/cpp/daal/src/algorithms/k_nearest_neighbors/oneapi/cl_kernels/bf_knn_cl_kernels.cl deleted file mode 100644 index 40705ee9e6b..00000000000 --- a/cpp/daal/src/algorithms/k_nearest_neighbors/oneapi/cl_kernels/bf_knn_cl_kernels.cl +++ /dev/null @@ -1,102 +0,0 @@ -/* file: bf_knn_cl_kernels.cl */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of BF KNN OpenCL kernels. -//-- -*/ - -#ifndef __KNN_CL_KERNELS_CL__ -#define __KNN_CL_KERNELS_CL__ - -#include - -#define DECLARE_SOURCE(name, src) static const char * name = #src; - -DECLARE_SOURCE( - bf_knn_cl_kernels, - - __kernel void scatter_row(__global const algorithmFPType * dataSq, __global algorithmFPType * distances, int N) { - const int global_id_0 = get_global_id(0); - const int global_id_1 = get_global_id(1); - - distances[global_id_0 + global_id_1 * N] = dataSq[global_id_0]; - } - - __kernel void scatter_row_col(__global const algorithmFPType * dataSq, __global const algorithmFPType * querySq, - __global algorithmFPType * distances, int N) { - const int global_id_0 = get_global_id(0); - const int global_id_1 = get_global_id(1); - - distances[global_id_0 + global_id_1 * N] = dataSq[global_id_0] + querySq[global_id_1]; - } - - __kernel void distances_from_squares(__global algorithmFPType * data) { - const int global_id_0 = get_global_id(0); - const algorithmFPType val = data[global_id_0]; - data[global_id_0] = (val > 0) ? 
sqrt(val) : 0; - } - - __kernel void initialize_indices(__global int * indices, const int from) { - const int global_id_0 = get_global_id(0); - - indices[global_id_0] = global_id_0 + from; - } - - __kernel void copy_partial_selection(__global const algorithmFPType * distances, __global const int * categories, - __global algorithmFPType * partialDistances, __global int * partialCategories, int K, int Part, - int TotalParts) { - const int global_id_0 = get_global_id(0); - const int global_id_1 = get_global_id(1); - - partialDistances[global_id_0 * K * TotalParts + Part * K + global_id_1] = distances[global_id_0 * K + global_id_1]; - partialCategories[global_id_0 * K * TotalParts + Part * K + global_id_1] = categories[global_id_0 * K + global_id_1]; - } - - __kernel void find_max_occurance(__global const sortedType * data, __global algorithmFPType * result, int K) { - const int global_id_0 = get_global_id(0); - __global const sortedType * array = &data[global_id_0 * K]; - - sortedType maxVal = -1; - sortedType curVal = -1; - int maxCount = 0; - int curCount = 0; - - for (int i = 0; i < K; i++) - { - sortedType val = array[i]; - if (val == curVal) - curCount++; - else - { - if (curCount > maxCount) - { - maxCount = curCount; - maxVal = curVal; - } - curVal = val; - curCount = 1; - } - } - if (curCount > maxCount) maxVal = curVal; - result[global_id_0] = maxVal; - } - -); - -#endif diff --git a/cpp/daal/src/algorithms/kernel.h b/cpp/daal/src/algorithms/kernel.h index 2a0a7caee1f..6058f077ec5 100644 --- a/cpp/daal/src/algorithms/kernel.h +++ b/cpp/daal/src/algorithms/kernel.h @@ -24,10 +24,10 @@ #ifndef __KERNEL_H__ #define __KERNEL_H__ +#include "algorithms/algorithm_kernel.h" #include "services/daal_defines.h" #include "src/services/service_defines.h" #include "services/internal/daal_kernel_defines.h" -#include "services/internal/gpu_support_checker.h" #include "src/algorithms/kernel_config.h" @@ -37,12 +37,6 @@ _kernel = (new KernelClass<__VA_ARGS__, cpu>); \ } 
-#undef __DAAL_INITIALIZE_KERNELS_SYCL -#define __DAAL_INITIALIZE_KERNELS_SYCL(KernelClass, ...) \ - { \ - _kernel = (new KernelClass<__VA_ARGS__>); \ - } - #undef __DAAL_DEINITIALIZE_KERNELS #define __DAAL_DEINITIALIZE_KERNELS() \ { \ @@ -58,18 +52,8 @@ return ((KernelClass *)(_kernel))->method(__VA_ARGS__); \ } -#undef __DAAL_CALL_KERNEL_SYCL -#define __DAAL_CALL_KERNEL_SYCL(env, KernelClass, templateArguments, method, ...) \ - { \ - return ((KernelClass *)(_kernel))->method(__VA_ARGS__); \ - } - #undef __DAAL_CALL_KERNEL_STATUS #define __DAAL_CALL_KERNEL_STATUS(env, KernelClass, templateArguments, method, ...) \ ((KernelClass *)(_kernel))->method(__VA_ARGS__); -#undef __DAAL_CALL_KERNEL_STATUS_SYCL -#define __DAAL_CALL_KERNEL_STATUS_SYCL(env, KernelClass, templateArguments, method, ...) \ - ((KernelClass *)(_kernel))->method(__VA_ARGS__); - #endif diff --git a/cpp/daal/src/algorithms/kernel_config.h b/cpp/daal/src/algorithms/kernel_config.h index f2ee49d5057..b02d774daa4 100644 --- a/cpp/daal/src/algorithms/kernel_config.h +++ b/cpp/daal/src/algorithms/kernel_config.h @@ -28,7 +28,6 @@ #include "services/daal_defines.h" #include "src/services/service_defines.h" #include "services/internal/daal_kernel_defines.h" -#include "services/internal/gpu_support_checker.h" #if defined(TARGET_X86_64) #include "src/algorithms/kernel_inst_x86.h" @@ -51,12 +50,4 @@ #define __DAAL_INSTANTIATE_DISPATCH_CONTAINER(ContainerTemplate, Mode, ...) \ __DAAL_INSTANTIATE_DISPATCH_IMPL(ContainerTemplate, Mode, AlgorithmDispatchContainer, AlgorithmContainerImpl, __DAAL_GET_CPUID, __VA_ARGS__) -#define __DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL(ContainerTemplate, Mode, ...) \ - __DAAL_INSTANTIATE_DISPATCH_SYCL_IMPL(ContainerTemplate, Mode, AlgorithmDispatchContainer, AlgorithmContainerImpl, __DAAL_GET_CPUID, \ - __VA_ARGS__) - -#define __DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL_SAFE(ContainerTemplate, Mode, ...) 
\ - __DAAL_INSTANTIATE_DISPATCH_SYCL_IMPL(ContainerTemplate, Mode, AlgorithmDispatchContainer, AlgorithmContainerImpl, __DAAL_GET_CPUID_SAFE, \ - __VA_ARGS__) - #endif diff --git a/cpp/daal/src/algorithms/kernel_function/BUILD b/cpp/daal/src/algorithms/kernel_function/BUILD index 9f64eecd83b..9810bc00230 100644 --- a/cpp/daal/src/algorithms/kernel_function/BUILD +++ b/cpp/daal/src/algorithms/kernel_function/BUILD @@ -4,9 +4,8 @@ load("@onedal//dev/bazel:daal.bzl", "daal_module") daal_module( name = "kernel", auto = True, - opencl = True, deps = [ "@onedal//cpp/daal:core", - "@onedal//cpp/daal:sycl", + "@onedal//cpp/daal:engines", ], ) diff --git a/cpp/daal/src/algorithms/kernel_function/kernel_function_fpt.cpp b/cpp/daal/src/algorithms/kernel_function/kernel_function_fpt.cpp index 5538faae2bc..0f919e814cc 100644 --- a/cpp/daal/src/algorithms/kernel_function/kernel_function_fpt.cpp +++ b/cpp/daal/src/algorithms/kernel_function/kernel_function_fpt.cpp @@ -22,7 +22,7 @@ */ #include "algorithms/kernel_function/kernel_function_types.h" -#include "data_management/data/internal/numeric_table_sycl_homogen.h" +#include "data_management/data/homogen_numeric_table.h" namespace daal { @@ -47,19 +47,9 @@ DAAL_EXPORT services::Status Result::allocate(const daal::algorithms::Input * in const size_t nVectors2 = algInput->get(Y)->getNumberOfRows(); services::Status status; - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - if (!deviceInfo.isCpu) - { - set(values, data_management::internal::SyclHomogenNumericTable::create(nVectors2, nVectors1, - data_management::NumericTable::doAllocate, &status)); - } - else - { - set(values, - data_management::HomogenNumericTable::create(nVectors2, nVectors1, data_management::NumericTable::doAllocate, &status)); - } + set(values, + data_management::HomogenNumericTable::create(nVectors2, nVectors1, data_management::NumericTable::doAllocate, &status)); return status; } diff --git 
a/cpp/daal/src/algorithms/kernel_function/kernel_function_linear_batch_container.h b/cpp/daal/src/algorithms/kernel_function/kernel_function_linear_batch_container.h index 94dc45a2b66..91eb604eed3 100644 --- a/cpp/daal/src/algorithms/kernel_function/kernel_function_linear_batch_container.h +++ b/cpp/daal/src/algorithms/kernel_function/kernel_function_linear_batch_container.h @@ -25,7 +25,6 @@ #include "src/algorithms/kernel_function/polynomial/kernel_function_polynomial.h" #include "src/algorithms/kernel_function/polynomial/kernel_function_polynomial_dense_default_kernel.h" #include "src/algorithms/kernel_function/polynomial/kernel_function_polynomial_csr_fast_kernel.h" -#include "src/algorithms/kernel_function/oneapi/kernel_function_linear_kernel_oneapi.h" namespace daal { @@ -41,16 +40,7 @@ namespace poly = daal::algorithms::kernel_function::polynomial::internal; template BatchContainer::BatchContainer(services::Environment::env * daalEnv) { - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - if (!deviceInfo.isCpu) - { - __DAAL_INITIALIZE_KERNELS_SYCL(internal::KernelImplLinearOneAPI, method, algorithmFPType); - } - else - { - __DAAL_INITIALIZE_KERNELS(poly::KernelImplPolynomial, (method == defaultDense) ? poly::defaultDense : poly::fastCSR, algorithmFPType); - } + __DAAL_INITIALIZE_KERNELS(poly::KernelImplPolynomial, (method == defaultDense) ? 
poly::defaultDense : poly::fastCSR, algorithmFPType); } template @@ -85,20 +75,9 @@ services::Status BatchContainer::compute() kernelPar.degree = 1; kernelPar.kernelType = KernelType::linear; - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - - if (!deviceInfo.isCpu) - { - __DAAL_CALL_KERNEL_SYCL(env, internal::KernelImplLinearOneAPI, __DAAL_KERNEL_ARGUMENTS(method, algorithmFPType), compute, a[0], a[1], r[0], - par); - } - else - { - __DAAL_CALL_KERNEL(env, poly::KernelImplPolynomial, - __DAAL_KERNEL_ARGUMENTS((method == defaultDense) ? poly::defaultDense : poly::fastCSR, algorithmFPType), compute, a[0], - a[1], r[0], &kernelPar); - } + __DAAL_CALL_KERNEL(env, poly::KernelImplPolynomial, + __DAAL_KERNEL_ARGUMENTS((method == defaultDense) ? poly::defaultDense : poly::fastCSR, algorithmFPType), compute, a[0], a[1], + r[0], &kernelPar); } } // namespace linear diff --git a/cpp/daal/src/algorithms/kernel_function/kernel_function_linear_csr_fast_batch_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/kernel_function/kernel_function_linear_csr_fast_batch_fpt_dispatcher.cpp index 3ce700b68e9..d471f32c472 100644 --- a/cpp/daal/src/algorithms/kernel_function/kernel_function_linear_csr_fast_batch_fpt_dispatcher.cpp +++ b/cpp/daal/src/algorithms/kernel_function/kernel_function_linear_csr_fast_batch_fpt_dispatcher.cpp @@ -28,6 +28,6 @@ namespace daal { namespace algorithms { -__DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL(kernel_function::linear::BatchContainer, batch, DAAL_FPTYPE, kernel_function::linear::fastCSR) +__DAAL_INSTANTIATE_DISPATCH_CONTAINER(kernel_function::linear::BatchContainer, batch, DAAL_FPTYPE, kernel_function::linear::fastCSR) } // namespace algorithms } // namespace daal diff --git a/cpp/daal/src/algorithms/kernel_function/kernel_function_linear_csr_fast_batch_oneapi_fpt.cpp b/cpp/daal/src/algorithms/kernel_function/kernel_function_linear_csr_fast_batch_oneapi_fpt.cpp deleted file mode 100755 index 
809a2292ab2..00000000000 --- a/cpp/daal/src/algorithms/kernel_function/kernel_function_linear_csr_fast_batch_oneapi_fpt.cpp +++ /dev/null @@ -1,43 +0,0 @@ -/* file: kernel_function_linear_csr_fast_batch_oneapi_fpt.cpp */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of linear kernel functions for CSR input data. 
-//-- -*/ - -#include "src/algorithms/kernel_function/oneapi/kernel_function_linear_kernel_oneapi.h" -#include "src/algorithms/kernel_function/oneapi/kernel_function_linear_csr_fast_oneapi_impl.i" - -namespace daal -{ -namespace algorithms -{ -namespace kernel_function -{ -namespace linear -{ -namespace internal -{ -template class KernelImplLinearOneAPI; - -} // namespace internal -} // namespace linear -} // namespace kernel_function -} // namespace algorithms -} // namespace daal diff --git a/cpp/daal/src/algorithms/kernel_function/kernel_function_linear_dense_default_batch_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/kernel_function/kernel_function_linear_dense_default_batch_fpt_dispatcher.cpp index cc5055ed240..b46abf581c6 100644 --- a/cpp/daal/src/algorithms/kernel_function/kernel_function_linear_dense_default_batch_fpt_dispatcher.cpp +++ b/cpp/daal/src/algorithms/kernel_function/kernel_function_linear_dense_default_batch_fpt_dispatcher.cpp @@ -28,6 +28,6 @@ namespace daal { namespace algorithms { -__DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL(kernel_function::linear::BatchContainer, batch, DAAL_FPTYPE, kernel_function::linear::defaultDense) +__DAAL_INSTANTIATE_DISPATCH_CONTAINER(kernel_function::linear::BatchContainer, batch, DAAL_FPTYPE, kernel_function::linear::defaultDense) } // namespace algorithms } // namespace daal diff --git a/cpp/daal/src/algorithms/kernel_function/kernel_function_linear_dense_default_batch_oneapi_fpt.cpp b/cpp/daal/src/algorithms/kernel_function/kernel_function_linear_dense_default_batch_oneapi_fpt.cpp deleted file mode 100644 index 529aaeca73f..00000000000 --- a/cpp/daal/src/algorithms/kernel_function/kernel_function_linear_dense_default_batch_oneapi_fpt.cpp +++ /dev/null @@ -1,43 +0,0 @@ -/* file: kernel_function_linear_dense_default_batch_oneapi_fpt.cpp */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the 
"License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of linear kernel functions for dense input data. -//-- -*/ - -#include "src/algorithms/kernel_function/oneapi/kernel_function_linear_kernel_oneapi.h" -#include "src/algorithms/kernel_function/oneapi/kernel_function_linear_dense_default_oneapi_impl.i" - -namespace daal -{ -namespace algorithms -{ -namespace kernel_function -{ -namespace linear -{ -namespace internal -{ -template class DAAL_EXPORT KernelImplLinearOneAPI; - -} // namespace internal -} // namespace linear -} // namespace kernel_function -} // namespace algorithms -} // namespace daal diff --git a/cpp/daal/src/algorithms/kernel_function/kernel_function_rbf_batch_container.h b/cpp/daal/src/algorithms/kernel_function/kernel_function_rbf_batch_container.h index b57690fab0b..0d83c462cab 100644 --- a/cpp/daal/src/algorithms/kernel_function/kernel_function_rbf_batch_container.h +++ b/cpp/daal/src/algorithms/kernel_function/kernel_function_rbf_batch_container.h @@ -24,7 +24,6 @@ #include "algorithms/kernel_function/kernel_function_rbf.h" #include "src/algorithms/kernel_function/kernel_function_rbf_dense_default_kernel.h" #include "src/algorithms/kernel_function/kernel_function_rbf_csr_fast_kernel.h" -#include "src/algorithms/kernel_function/oneapi/kernel_function_rbf_kernel_oneapi.h" namespace daal { @@ -39,16 +38,7 @@ using namespace daal::data_management; template 
BatchContainer::BatchContainer(daal::services::Environment::env * daalEnv) { - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - if (!deviceInfo.isCpu) - { - __DAAL_INITIALIZE_KERNELS_SYCL(internal::KernelImplRBFOneAPI, method, algorithmFPType); - } - else - { - __DAAL_INITIALIZE_KERNELS(internal::KernelImplRBF, method, algorithmFPType); - } + __DAAL_INITIALIZE_KERNELS(internal::KernelImplRBF, method, algorithmFPType); } template @@ -87,17 +77,7 @@ services::Status BatchContainer::compute() return services::Status(services::ErrorIncorrectTypeOfInputNumericTable); } - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - - if (!deviceInfo.isCpu) - { - __DAAL_CALL_KERNEL_SYCL(env, internal::KernelImplRBFOneAPI, __DAAL_KERNEL_ARGUMENTS(method, algorithmFPType), compute, a[0], a[1], r[0], par); - } - else - { - __DAAL_CALL_KERNEL(env, internal::KernelImplRBF, __DAAL_KERNEL_ARGUMENTS(method, algorithmFPType), compute, a[0], a[1], r[0], &kernelPar); - } + __DAAL_CALL_KERNEL(env, internal::KernelImplRBF, __DAAL_KERNEL_ARGUMENTS(method, algorithmFPType), compute, a[0], a[1], r[0], &kernelPar); } } // namespace rbf diff --git a/cpp/daal/src/algorithms/kernel_function/kernel_function_rbf_csr_batch_oneapi_fpt.cpp b/cpp/daal/src/algorithms/kernel_function/kernel_function_rbf_csr_batch_oneapi_fpt.cpp deleted file mode 100755 index e9ae90c894e..00000000000 --- a/cpp/daal/src/algorithms/kernel_function/kernel_function_rbf_csr_batch_oneapi_fpt.cpp +++ /dev/null @@ -1,43 +0,0 @@ -/* file: kernel_function_rbf_csr_batch_oneapi_fpt.cpp */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of RBF kernel functions for dense input data. -//-- -*/ - -#include "src/algorithms/kernel_function/oneapi/kernel_function_rbf_kernel_oneapi.h" -#include "src/algorithms/kernel_function/oneapi/kernel_function_rbf_csr_fast_oneapi_impl.i" - -namespace daal -{ -namespace algorithms -{ -namespace kernel_function -{ -namespace rbf -{ -namespace internal -{ -template class DAAL_EXPORT KernelImplRBFOneAPI; - -} // namespace internal -} // namespace rbf -} // namespace kernel_function -} // namespace algorithms -} // namespace daal diff --git a/cpp/daal/src/algorithms/kernel_function/kernel_function_rbf_csr_fast_batch_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/kernel_function/kernel_function_rbf_csr_fast_batch_fpt_dispatcher.cpp index d21da4bd6db..11e426ab21e 100644 --- a/cpp/daal/src/algorithms/kernel_function/kernel_function_rbf_csr_fast_batch_fpt_dispatcher.cpp +++ b/cpp/daal/src/algorithms/kernel_function/kernel_function_rbf_csr_fast_batch_fpt_dispatcher.cpp @@ -29,6 +29,6 @@ namespace daal { namespace algorithms { -__DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL(kernel_function::rbf::BatchContainer, batch, DAAL_FPTYPE, kernel_function::rbf::fastCSR) +__DAAL_INSTANTIATE_DISPATCH_CONTAINER(kernel_function::rbf::BatchContainer, batch, DAAL_FPTYPE, kernel_function::rbf::fastCSR) } // namespace algorithms } // namespace daal diff --git a/cpp/daal/src/algorithms/kernel_function/kernel_function_rbf_dense_default_batch_fpt_dispatcher.cpp 
b/cpp/daal/src/algorithms/kernel_function/kernel_function_rbf_dense_default_batch_fpt_dispatcher.cpp index 6d8b2c94341..5e054e5a162 100644 --- a/cpp/daal/src/algorithms/kernel_function/kernel_function_rbf_dense_default_batch_fpt_dispatcher.cpp +++ b/cpp/daal/src/algorithms/kernel_function/kernel_function_rbf_dense_default_batch_fpt_dispatcher.cpp @@ -29,6 +29,6 @@ namespace daal { namespace algorithms { -__DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL(kernel_function::rbf::BatchContainer, batch, DAAL_FPTYPE, kernel_function::rbf::defaultDense) +__DAAL_INSTANTIATE_DISPATCH_CONTAINER(kernel_function::rbf::BatchContainer, batch, DAAL_FPTYPE, kernel_function::rbf::defaultDense) } // namespace algorithms } // namespace daal diff --git a/cpp/daal/src/algorithms/kernel_function/kernel_function_rbf_dense_default_batch_oneapi_fpt.cpp b/cpp/daal/src/algorithms/kernel_function/kernel_function_rbf_dense_default_batch_oneapi_fpt.cpp deleted file mode 100644 index 862ab064f5d..00000000000 --- a/cpp/daal/src/algorithms/kernel_function/kernel_function_rbf_dense_default_batch_oneapi_fpt.cpp +++ /dev/null @@ -1,43 +0,0 @@ -/* file: kernel_function_rbf_dense_default_batch_oneapi_fpt.cpp */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -/* -//++ -// Implementation of RBF kernel functions for dense input data. -//-- -*/ - -#include "src/algorithms/kernel_function/oneapi/kernel_function_rbf_kernel_oneapi.h" -#include "src/algorithms/kernel_function/oneapi/kernel_function_rbf_dense_default_oneapi_impl.i" - -namespace daal -{ -namespace algorithms -{ -namespace kernel_function -{ -namespace rbf -{ -namespace internal -{ -template class DAAL_EXPORT KernelImplRBFOneAPI; - -} // namespace internal -} // namespace rbf -} // namespace kernel_function -} // namespace algorithms -} // namespace daal diff --git a/cpp/daal/src/algorithms/kernel_function/oneapi/cl_kernels/kernel_function.cl b/cpp/daal/src/algorithms/kernel_function/oneapi/cl_kernels/kernel_function.cl deleted file mode 100644 index 007c7fd077f..00000000000 --- a/cpp/daal/src/algorithms/kernel_function/oneapi/cl_kernels/kernel_function.cl +++ /dev/null @@ -1,67 +0,0 @@ -/* file: kernel_function.cl */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of Kernel Function OpenCL kernels. 
-//-- -*/ - -#ifndef __KERNEL_FUNCTION_KERNELS_CL__ -#define __KERNEL_FUNCTION_KERNELS_CL__ - -#include - -#define DECLARE_SOURCE_DAAL(name, src) static const char *(name) = #src; - -DECLARE_SOURCE_DAAL( - clKernelKF, - - __kernel void computeRBF(const __global algorithmFPType * const sqrA1, const __global algorithmFPType * const sqrA2, const uint ld, - const algorithmFPType expThreshold, const algorithmFPType coeff, __global algorithmFPType * rbf) { - const uint i = get_global_id(0); - const uint j = get_global_id(1); - - const algorithmFPType sqrA1i = sqrA1[i]; - const algorithmFPType sqrA2j = sqrA2[j]; - const algorithmFPType rbfij = rbf[i * ld + j]; - const algorithmFPType arg = fmax((rbfij + sqrA1i + sqrA2j) * coeff, expThreshold); - - rbf[i * ld + j] = exp(arg); - } - - __kernel void sumOfSquaresCSR(__global const algorithmFPType * const values, __global const ulong * const rowInd, - __global algorithmFPType * sumOfSquares) { - const ulong i = get_global_id(0); - - const ulong rowCur = rowInd[i] - 1; - const ulong rowEnd = rowInd[i + 1] - 1; - - algorithmFPType sum = (algorithmFPType)0; - for (ulong j = rowCur; j < rowEnd; ++j) - { - sum += values[j] * values[j]; - } - - sumOfSquares[i] = sum; - } - -); - -#undef DECLARE_SOURCE_DAAL - -#endif diff --git a/cpp/daal/src/algorithms/kernel_function/oneapi/kernel_function_helper_oneapi.h b/cpp/daal/src/algorithms/kernel_function/oneapi/kernel_function_helper_oneapi.h deleted file mode 100755 index 7006e7840f6..00000000000 --- a/cpp/daal/src/algorithms/kernel_function/oneapi/kernel_function_helper_oneapi.h +++ /dev/null @@ -1,145 +0,0 @@ -/* file: kernel_function_helper_oneapi.h */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#ifndef __KERNEL_FUNCTION_HELPER_ONEAPI_H__ -#define __KERNEL_FUNCTION_HELPER_ONEAPI_H__ - -#include "src/externals/service_profiler.h" -#include "src/externals/service_math.h" -#include "src/services/service_data_utils.h" -#include "src/algorithms/kernel_function/oneapi/cl_kernels/kernel_function.cl" -#include "src/sycl/math_service_types.h" - -namespace daal -{ -namespace algorithms -{ -namespace kernel_function -{ -namespace rbf -{ -namespace internal -{ -using namespace daal::services::internal; -using namespace daal::services::internal::sycl; -using namespace daal::services::internal::sycl::math; - -template -class HelperKernel -{ -public: - static services::Status buildProgram(ClKernelFactoryIface & factory) - { - services::String options = getKeyFPType(); - - services::String cachekey("__daal_algorithms_kernel_function_rbf"); - cachekey.add(options); - - services::Status status; - factory.build(ExecutionTargetIds::device, cachekey.c_str(), clKernelKF, options.c_str(), status); - return status; - } - - static services::Status lazyAllocate(UniversalBuffer & x, const size_t n) - { - services::Status status; - ExecutionContextIface & ctx = services::internal::getDefaultContext(); - const TypeIds::Id idType = TypeIds::id(); - if (x.empty() || x.get().size() < n) - { - x = ctx.allocate(idType, n, status); - } - - return status; - } - - static services::Status sumOfSquaresCSR(const services::internal::Buffer & valuesBuff, - const services::internal::Buffer & rowIndBuff, 
UniversalBuffer & sumOfSquaresBuff, const size_t n) - - { - auto & context = services::internal::getDefaultContext(); - auto & factory = context.getClKernelFactory(); - - services::Status status = buildProgram(factory); - DAAL_CHECK_STATUS_VAR(status); - - auto kernel = factory.getKernel("sumOfSquaresCSR", status); - DAAL_CHECK_STATUS_VAR(status); - - KernelArguments args(3, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_ASSERT(rowIndBuff.size() == n + 1); - DAAL_ASSERT_UNIVERSAL_BUFFER(sumOfSquaresBuff, algorithmFPType, n); - - args.set(0, valuesBuff, AccessModeIds::read); - args.set(1, rowIndBuff, AccessModeIds::read); - args.set(2, sumOfSquaresBuff, AccessModeIds::readwrite); - - KernelRange range(n); - - context.run(range, kernel, args, status); - return status; - } - - static services::Status computeRBF(const UniversalBuffer & sqrMatLeft, const UniversalBuffer & sqrMatRight, const uint32_t ld, - const algorithmFPType coeff, services::internal::Buffer & rbf, const size_t n, const size_t m) - - { - DAAL_ITTNOTIFY_SCOPED_TASK(KernelRBF.computeRBF); - - auto & context = services::internal::getDefaultContext(); - auto & factory = context.getClKernelFactory(); - - services::Status status = buildProgram(factory); - DAAL_CHECK_STATUS_VAR(status); - - auto kernel = factory.getKernel("computeRBF", status); - DAAL_CHECK_STATUS_VAR(status); - - const algorithmFPType threshold = math::expThreshold(); - - KernelArguments args(6, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_ASSERT_UNIVERSAL_BUFFER(sqrMatLeft, algorithmFPType, n); - DAAL_ASSERT_UNIVERSAL_BUFFER(sqrMatRight, algorithmFPType, m); - DAAL_ASSERT(rbf.size() == n * m); - - args.set(0, sqrMatLeft, AccessModeIds::read); - args.set(1, sqrMatRight, AccessModeIds::read); - args.set(2, ld); - args.set(3, threshold); - args.set(4, coeff); - args.set(5, rbf, AccessModeIds::readwrite); - - KernelRange range(n, m); - - context.run(range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - - return status; - } 
-}; - -} // namespace internal -} // namespace rbf -} // namespace kernel_function -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/kernel_function/oneapi/kernel_function_linear_csr_fast_oneapi_impl.i b/cpp/daal/src/algorithms/kernel_function/oneapi/kernel_function_linear_csr_fast_oneapi_impl.i deleted file mode 100755 index 197ebed0188..00000000000 --- a/cpp/daal/src/algorithms/kernel_function/oneapi/kernel_function_linear_csr_fast_oneapi_impl.i +++ /dev/null @@ -1,128 +0,0 @@ -/* file: kernel_function_linear_csr_fast_oneapi_impl.i */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -/* -//++ -// Linear kernel functions implementation -//-- -*/ - -#ifndef __KERNEL_FUNCTION_LINEAR_CSR_FAST_ONEAPI_IMPL_I__ -#define __KERNEL_FUNCTION_LINEAR_CSR_FAST_ONEAPI_IMPL_I__ - -#include "algorithms/kernel_function/kernel_function_types_linear.h" - -#include "src/externals/service_stat.h" -#include "src/algorithms/service_error_handling.h" -#include "src/externals/service_profiler.h" -#include "src/sycl/spblas_gpu.h" - -namespace daal -{ -namespace algorithms -{ -namespace kernel_function -{ -namespace linear -{ -namespace internal -{ -using namespace daal::services::internal::sycl; - -template -services::Status KernelImplLinearOneAPI::computeInternalVectorVector(NumericTable * vecLeft, NumericTable * vecRight, - NumericTable * result, const ParameterBase * par) -{ - return services::ErrorMethodNotImplemented; -} - -template -services::Status KernelImplLinearOneAPI::computeInternalMatrixVector(NumericTable * matLeft, NumericTable * vecRight, - NumericTable * result, const ParameterBase * par) -{ - return services::ErrorMethodNotImplemented; -} - -template -services::Status KernelImplLinearOneAPI::computeInternalMatrixMatrix(NumericTable * matLeft, NumericTable * matRight, - NumericTable * result, const ParameterBase * par) -{ - services::Status status; - - auto & context = services::internal::getDefaultContext(); - - const size_t nMatLeft = matLeft->getNumberOfRows(); - const size_t nMatRight = matRight->getNumberOfRows(); - - const size_t pMatLeft = matLeft->getNumberOfColumns(); - const size_t pMatRight = matRight->getNumberOfColumns(); - DAAL_ASSERT(pMatLeft == pMatRight); - - const Parameter * linPar = static_cast(par); - const algorithmFPType alpha = algorithmFPType(linPar->k); - const algorithmFPType beta = algorithmFPType(linPar->b); - - CSRBlockDescriptor matLeftBD, matRightBD; - { - DAAL_ITTNOTIFY_SCOPED_TASK(KernelLinearCSROneAPI.gemm); - - CSRNumericTableIface * 
matLeftCSR = dynamic_cast(matLeft); - DAAL_CHECK(matLeftCSR, services::ErrorIncorrectTypeOfInputNumericTable); - CSRNumericTableIface * matRightCSR = dynamic_cast(matRight); - DAAL_CHECK(matRightCSR, services::ErrorIncorrectTypeOfInputNumericTable); - DAAL_CHECK_STATUS(status, matLeftCSR->getSparseBlock(0, nMatLeft, readOnly, matLeftBD)); - DAAL_CHECK_STATUS(status, matRightCSR->getSparseBlock(0, nMatRight, readOnly, matRightBD)); - - const auto matLeftValuesBuff = matLeftBD.getBlockValuesBuffer(); - const auto matLeftColumnIndicesBuff = matLeftBD.getBlockColumnIndicesBuffer(); - const auto matLeftRowIndicesBuff = matLeftBD.getBlockRowIndicesBuffer(); - - const auto matRightValuesBuff = matRightBD.getBlockValuesBuffer(); - const auto matRightColumnIndicesBuff = matRightBD.getBlockColumnIndicesBuffer(); - const auto matRightRowIndicesBuff = matRightBD.getBlockRowIndicesBuffer(); - - BlockDescriptor resultBlock; - DAAL_CHECK_STATUS(status, result->getBlockOfRows(0, nMatLeft, ReadWriteMode::writeOnly, resultBlock)); - - auto resultBuff = resultBlock.getBuffer(); - - if (beta != 0.0) - { - context.fill(resultBuff, 1.0, status); - DAAL_CHECK_STATUS_VAR(status); - } - - DAAL_CHECK_STATUS( - status, math::SpBlasGpu::xgemm(math::Transpose::Trans, math::Transpose::NoTrans, nMatLeft, nMatRight, pMatLeft, alpha, - matLeftValuesBuff, matLeftColumnIndicesBuff, matLeftRowIndicesBuff, matRightValuesBuff, - matRightColumnIndicesBuff, matRightRowIndicesBuff, beta, resultBuff, nMatRight, 0)); - - DAAL_CHECK_STATUS(status, matLeftCSR->releaseSparseBlock(matLeftBD)); - DAAL_CHECK_STATUS(status, matRightCSR->releaseSparseBlock(matRightBD)); - DAAL_CHECK_STATUS(status, result->releaseBlockOfRows(resultBlock)); - } - - return status; -} - -} // namespace internal -} // namespace linear -} // namespace kernel_function -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/kernel_function/oneapi/kernel_function_linear_dense_default_oneapi_impl.i 
b/cpp/daal/src/algorithms/kernel_function/oneapi/kernel_function_linear_dense_default_oneapi_impl.i deleted file mode 100644 index 001f494d5c7..00000000000 --- a/cpp/daal/src/algorithms/kernel_function/oneapi/kernel_function_linear_dense_default_oneapi_impl.i +++ /dev/null @@ -1,122 +0,0 @@ -/* file: kernel_function_linear_dense_default_oneapi_impl.i */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -/* -//++ -// Linear kernel functions implementation -//-- -*/ - -#ifndef __KERNEL_FUNCTION_LINEAR_DENSE_DEFAULT_ONEAPI_IMPL_I__ -#define __KERNEL_FUNCTION_LINEAR_DENSE_DEFAULT_ONEAPI_IMPL_I__ - -#include "algorithms/kernel_function/kernel_function_types_linear.h" - -#include "src/externals/service_stat.h" -#include "src/algorithms/service_error_handling.h" -#include "src/algorithms/kernel_function/oneapi/cl_kernels/kernel_function.cl" -#include "src/externals/service_profiler.h" -#include "src/sycl/blas_gpu.h" -#include "src/sycl/reducer.h" - -namespace daal -{ -namespace algorithms -{ -namespace kernel_function -{ -namespace linear -{ -namespace internal -{ -using namespace daal::services::internal::sycl; - -template -services::Status KernelImplLinearOneAPI::computeInternalVectorVector(NumericTable * vecLeft, NumericTable * vecRight, - NumericTable * result, const ParameterBase * par) -{ - return services::ErrorMethodNotImplemented; -} - -template -services::Status KernelImplLinearOneAPI::computeInternalMatrixVector(NumericTable * matLeft, NumericTable * vecRight, - NumericTable * result, const ParameterBase * par) -{ - return services::ErrorMethodNotImplemented; -} - -template -services::Status KernelImplLinearOneAPI::computeInternalMatrixMatrix(NumericTable * matLeft, NumericTable * matRight, - NumericTable * result, const ParameterBase * par) -{ - services::Status status; - - auto & context = services::internal::getDefaultContext(); - - const size_t nMatLeft = matLeft->getNumberOfRows(); - const size_t nMatRight = matRight->getNumberOfRows(); - - const size_t pMatLeft = matLeft->getNumberOfColumns(); - const size_t pMatRight = matRight->getNumberOfColumns(); - DAAL_ASSERT(pMatLeft == pMatRight); - - const Parameter * linPar = static_cast(par); - const algorithmFPType alpha = algorithmFPType(linPar->k); - const algorithmFPType beta = algorithmFPType(linPar->b); - - { - 
DAAL_ITTNOTIFY_SCOPED_TASK(KernelLinearOneAPI.gemm); - - BlockDescriptor matLeftBlock; - BlockDescriptor matRightBlock; - BlockDescriptor resultBlock; - - DAAL_CHECK_STATUS(status, matLeft->getBlockOfRows(0, nMatLeft, ReadWriteMode::readOnly, matLeftBlock)); - DAAL_CHECK_STATUS(status, matRight->getBlockOfRows(0, nMatRight, ReadWriteMode::readOnly, matRightBlock)); - - DAAL_CHECK_STATUS(status, result->getBlockOfRows(0, nMatLeft, ReadWriteMode::writeOnly, resultBlock)); - - const services::internal::Buffer matLeftBuff = matLeftBlock.getBuffer(); - const services::internal::Buffer matRightBuff = matRightBlock.getBuffer(); - - services::internal::Buffer resultBuff = resultBlock.getBuffer(); - - if (beta != 0.0) - { - context.fill(resultBuff, 1.0, status); - DAAL_CHECK_STATUS_VAR(status); - } - - DAAL_CHECK_STATUS(status, BlasGpu::xgemm(math::Layout::RowMajor, math::Transpose::NoTrans, math::Transpose::Trans, nMatLeft, - nMatRight, pMatLeft, alpha, matLeftBuff, pMatLeft, 0, matRightBuff, pMatRight, 0, - beta, resultBuff, nMatRight, 0)); - - DAAL_CHECK_STATUS(status, matLeft->releaseBlockOfRows(matLeftBlock)); - DAAL_CHECK_STATUS(status, matRight->releaseBlockOfRows(matRightBlock)); - DAAL_CHECK_STATUS(status, result->releaseBlockOfRows(resultBlock)); - } - - return status; -} - -} // namespace internal -} // namespace linear -} // namespace kernel_function -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/kernel_function/oneapi/kernel_function_linear_kernel_oneapi.h b/cpp/daal/src/algorithms/kernel_function/oneapi/kernel_function_linear_kernel_oneapi.h deleted file mode 100644 index 940cc149b49..00000000000 --- a/cpp/daal/src/algorithms/kernel_function/oneapi/kernel_function_linear_kernel_oneapi.h +++ /dev/null @@ -1,104 +0,0 @@ -/* file: kernel_function_linear_kernel_oneapi.h */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the 
Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Declaration of template structs that calculate SVM Linear Kernel functions. -//-- -*/ - -#ifndef __KERNEL_FUNCTION_DENSE_LINEAR_KERNEL_ONEAPI_H__ -#define __KERNEL_FUNCTION_DENSE_LINEAR_KERNEL_ONEAPI_H__ - -#include "src/algorithms/kernel.h" -#include "data_management/data/numeric_table.h" -#include "algorithms/kernel_function/kernel_function_types_linear.h" - -namespace daal -{ -namespace algorithms -{ -namespace kernel_function -{ -namespace linear -{ -namespace internal -{ -using namespace daal::data_management; -using namespace daal::services; - -template -class KernelImplLinearOneAPI : public Kernel -{ -public: - services::Status compute(NumericTable * ntLeft, NumericTable * ntRight, NumericTable * result, const ParameterBase * par) - { - return services::ErrorMethodNotImplemented; - } -}; - -template -class KernelImplLinearOneAPI : public Kernel -{ -public: - services::Status compute(NumericTable * ntLeft, NumericTable * ntRight, NumericTable * result, const ParameterBase * par) - { - ComputationMode computationMode = par->computationMode; - switch (computationMode) - { - case vectorVector: return computeInternalVectorVector(ntLeft, ntRight, result, par); - case matrixVector: return computeInternalMatrixVector(ntLeft, ntRight, result, par); - case matrixMatrix: return computeInternalMatrixMatrix(ntLeft, ntRight, result, par); - default: return 
services::ErrorIncorrectParameter; - } - } - -protected: - services::Status computeInternalVectorVector(NumericTable * vecLeft, NumericTable * vecRight, NumericTable * result, const ParameterBase * par); - services::Status computeInternalMatrixVector(NumericTable * matLeft, NumericTable * vecRight, NumericTable * result, const ParameterBase * par); - services::Status computeInternalMatrixMatrix(NumericTable * matLeft, NumericTable * matRight, NumericTable * result, const ParameterBase * par); -}; - -template -class KernelImplLinearOneAPI : public Kernel -{ -public: - services::Status compute(NumericTable * ntLeft, NumericTable * ntRight, NumericTable * result, const ParameterBase * par) - { - ComputationMode computationMode = par->computationMode; - switch (computationMode) - { - case vectorVector: return computeInternalVectorVector(ntLeft, ntRight, result, par); - case matrixVector: return computeInternalMatrixVector(ntLeft, ntRight, result, par); - case matrixMatrix: return computeInternalMatrixMatrix(ntLeft, ntRight, result, par); - default: return services::ErrorIncorrectParameter; - } - } - -protected: - services::Status computeInternalVectorVector(NumericTable * vecLeft, NumericTable * vecRight, NumericTable * result, const ParameterBase * par); - services::Status computeInternalMatrixVector(NumericTable * matLeft, NumericTable * vecRight, NumericTable * result, const ParameterBase * par); - services::Status computeInternalMatrixMatrix(NumericTable * matLeft, NumericTable * matRight, NumericTable * result, const ParameterBase * par); -}; - -} // namespace internal -} // namespace linear -} // namespace kernel_function -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/kernel_function/oneapi/kernel_function_rbf_csr_fast_oneapi_impl.i b/cpp/daal/src/algorithms/kernel_function/oneapi/kernel_function_rbf_csr_fast_oneapi_impl.i deleted file mode 100644 index ab67b10cc1d..00000000000 --- 
a/cpp/daal/src/algorithms/kernel_function/oneapi/kernel_function_rbf_csr_fast_oneapi_impl.i +++ /dev/null @@ -1,135 +0,0 @@ -/* file: kernel_function_rbf_csr_fast_oneapi_impl.i */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// RBF kernel functions implementation -//-- -*/ - -#ifndef __KERNEL_FUNCTION_RBF_CSR_FAST_IMPL_ONEAPI_I__ -#define __KERNEL_FUNCTION_RBF_CSR_FAST_IMPL_ONEAPI_I__ - -#include "algorithms/kernel_function/kernel_function_types_rbf.h" -#include "src/data_management/service_numeric_table.h" -#include "src/externals/service_math.h" -#include "src/externals/service_profiler.h" -#include "src/sycl/spblas_gpu.h" -#include "src/sycl/reducer.h" - -namespace daal -{ -namespace algorithms -{ -namespace kernel_function -{ -namespace rbf -{ -namespace internal -{ -using namespace daal::services::internal::sycl; -using namespace daal::services::internal::sycl::math; - -template -services::Status KernelImplRBFOneAPI::computeInternalVectorVector(NumericTable * vecLeft, NumericTable * vecRight, - NumericTable * result, const ParameterBase * par) -{ - return services::ErrorMethodNotImplemented; -} - -template -services::Status KernelImplRBFOneAPI::computeInternalMatrixVector(NumericTable * matLeft, NumericTable * vecRight, - NumericTable * result, const 
ParameterBase * par) -{ - return services::ErrorMethodNotImplemented; -} - -template -services::Status KernelImplRBFOneAPI::computeInternalMatrixMatrix(NumericTable * matLeft, NumericTable * matRight, - NumericTable * result, const ParameterBase * par) -{ - services::Status status; - auto & context = services::internal::getDefaultContext(); - - const size_t nMatLeft = matLeft->getNumberOfRows(); - const size_t nMatRight = matRight->getNumberOfRows(); - - const size_t pMatLeft = matLeft->getNumberOfColumns(); - const size_t pMatRight = matRight->getNumberOfColumns(); - DAAL_ASSERT(pMatLeft == pMatRight); - - const Parameter * rbfPar = static_cast(par); - const algorithmFPType coeff = algorithmFPType(-0.5 / (rbfPar->sigma * rbfPar->sigma)); - - DAAL_CHECK_STATUS(status, Helper::lazyAllocate(_sqrMatLeft, nMatLeft)); - DAAL_CHECK_STATUS(status, Helper::lazyAllocate(_sqrMatRight, nMatRight)); - - CSRBlockDescriptor matLeftBD, matRightBD; - - CSRNumericTableIface * matLeftCSR = dynamic_cast(matLeft); - DAAL_CHECK(matLeftCSR, services::ErrorIncorrectTypeOfInputNumericTable); - CSRNumericTableIface * matRightCSR = dynamic_cast(matRight); - DAAL_CHECK(matRightCSR, services::ErrorIncorrectTypeOfInputNumericTable); - - DAAL_CHECK_STATUS(status, matLeftCSR->getSparseBlock(0, nMatLeft, readOnly, matLeftBD)); - DAAL_CHECK_STATUS(status, matRightCSR->getSparseBlock(0, nMatRight, readOnly, matRightBD)); - - const auto matLeftValuesBuff = matLeftBD.getBlockValuesBuffer(); - const auto matLeftColumnIndicesBuff = matLeftBD.getBlockColumnIndicesBuffer(); - const auto matLeftRowIndicesBuff = matLeftBD.getBlockRowIndicesBuffer(); - - const auto matRightValuesBuff = matRightBD.getBlockValuesBuffer(); - const auto matRightColumnIndicesBuff = matRightBD.getBlockColumnIndicesBuffer(); - const auto matRightRowIndicesBuff = matRightBD.getBlockRowIndicesBuffer(); - - { - DAAL_ITTNOTIFY_SCOPED_TASK(KernelRBF.sumOfSquaresCSR); - - DAAL_CHECK_STATUS(status, 
Helper::sumOfSquaresCSR(matLeftValuesBuff, matLeftRowIndicesBuff, _sqrMatLeft, nMatLeft)); - DAAL_CHECK_STATUS_VAR(status); - DAAL_CHECK_STATUS(status, Helper::sumOfSquaresCSR(matRightValuesBuff, matRightRowIndicesBuff, _sqrMatRight, nMatRight)); - DAAL_CHECK_STATUS_VAR(status); - } - BlockDescriptor resultBlock; - DAAL_CHECK_STATUS(status, result->getBlockOfRows(0, nMatLeft, ReadWriteMode::writeOnly, resultBlock)); - services::internal::Buffer resultBuff = resultBlock.getBuffer(); - - { - DAAL_ITTNOTIFY_SCOPED_TASK(KernelRBF.gemmCSR); - - DAAL_CHECK_STATUS(status, math::SpBlasGpu::xgemm( - math::Transpose::Trans, math::Transpose::NoTrans, nMatLeft, nMatRight, pMatLeft, algorithmFPType(-2.0), - matLeftValuesBuff, matLeftColumnIndicesBuff, matLeftRowIndicesBuff, matRightValuesBuff, - matRightColumnIndicesBuff, matRightRowIndicesBuff, algorithmFPType(0.0), resultBuff, nMatRight, 0)); - } - - DAAL_CHECK_STATUS(status, Helper::computeRBF(_sqrMatLeft, _sqrMatRight, nMatRight, coeff, resultBuff, nMatLeft, nMatRight)); - - DAAL_CHECK_STATUS(status, matLeftCSR->releaseSparseBlock(matLeftBD)); - DAAL_CHECK_STATUS(status, matRightCSR->releaseSparseBlock(matRightBD)); - DAAL_CHECK_STATUS(status, result->releaseBlockOfRows(resultBlock)); - - return status; -} - -} // namespace internal -} // namespace rbf -} // namespace kernel_function -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/kernel_function/oneapi/kernel_function_rbf_dense_default_oneapi_impl.i b/cpp/daal/src/algorithms/kernel_function/oneapi/kernel_function_rbf_dense_default_oneapi_impl.i deleted file mode 100644 index 3538b4e21f8..00000000000 --- a/cpp/daal/src/algorithms/kernel_function/oneapi/kernel_function_rbf_dense_default_oneapi_impl.i +++ /dev/null @@ -1,128 +0,0 @@ -/* file: kernel_function_rbf_dense_default_oneapi_impl.i */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under 
the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// RBF kernel functions implementation -//-- -*/ - -#ifndef __KERNEL_FUNCTION_RBF_DENSE_DEFAULT_IMPL_ONEAPI_I__ -#define __KERNEL_FUNCTION_RBF_DENSE_DEFAULT_IMPL_ONEAPI_I__ - -#include "algorithms/kernel_function/kernel_function_types_rbf.h" -#include "src/data_management/service_numeric_table.h" -#include "src/externals/service_math.h" -#include "src/externals/service_profiler.h" -#include "src/sycl/blas_gpu.h" -#include "src/sycl/reducer.h" - -namespace daal -{ -namespace algorithms -{ -namespace kernel_function -{ -namespace rbf -{ -namespace internal -{ -using namespace daal::services::internal::sycl; -using namespace daal::services::internal::sycl::math; - -template -services::Status KernelImplRBFOneAPI::computeInternalVectorVector(NumericTable * vecLeft, NumericTable * vecRight, - NumericTable * result, const ParameterBase * par) -{ - return services::ErrorMethodNotImplemented; -} - -template -services::Status KernelImplRBFOneAPI::computeInternalMatrixVector(NumericTable * matLeft, NumericTable * vecRight, - NumericTable * result, const ParameterBase * par) -{ - return services::ErrorMethodNotImplemented; -} - -template -services::Status KernelImplRBFOneAPI::computeInternalMatrixMatrix(NumericTable * matLeft, NumericTable * matRight, - NumericTable * result, const ParameterBase * par) -{ - services::Status status; - - auto & context = 
services::internal::getDefaultContext(); - - const size_t nMatLeft = matLeft->getNumberOfRows(); - const size_t nMatRight = matRight->getNumberOfRows(); - - const size_t pMatLeft = matLeft->getNumberOfColumns(); - const size_t pMatRight = matRight->getNumberOfColumns(); - DAAL_ASSERT(pMatLeft == pMatRight); - - const Parameter * rbfPar = static_cast(par); - const algorithmFPType coeff = algorithmFPType(-0.5 / (rbfPar->sigma * rbfPar->sigma)); - - BlockDescriptor matLeftBlock; - BlockDescriptor matRightBlock; - BlockDescriptor resultBlock; - - DAAL_CHECK_STATUS(status, matLeft->getBlockOfRows(0, nMatLeft, ReadWriteMode::readOnly, matLeftBlock)); - DAAL_CHECK_STATUS(status, matRight->getBlockOfRows(0, nMatRight, ReadWriteMode::readOnly, matRightBlock)); - - DAAL_CHECK_STATUS(status, result->getBlockOfRows(0, nMatLeft, ReadWriteMode::writeOnly, resultBlock)); - - const services::internal::Buffer matLeftBuf = matLeftBlock.getBuffer(); - const services::internal::Buffer matRightBuf = matRightBlock.getBuffer(); - - services::internal::Buffer rBuf = resultBlock.getBuffer(); - - DAAL_CHECK_STATUS(status, Helper::lazyAllocate(_sqrMatLeft, nMatLeft)); - DAAL_CHECK_STATUS(status, Helper::lazyAllocate(_sqrMatRight, nMatRight)); - - { - DAAL_ITTNOTIFY_SCOPED_TASK(KernelRBF.sumOfSquares); - - Reducer::reduce(Reducer::BinaryOp::SUM_OF_SQUARES, Layout::RowMajor, matLeftBuf, _sqrMatLeft, nMatLeft, pMatLeft, status); - DAAL_CHECK_STATUS_VAR(status); - Reducer::reduce(Reducer::BinaryOp::SUM_OF_SQUARES, Layout::RowMajor, matRightBuf, _sqrMatRight, nMatRight, pMatRight, status); - DAAL_CHECK_STATUS_VAR(status); - } - - { - DAAL_ITTNOTIFY_SCOPED_TASK(KernelRBF.gemm); - DAAL_CHECK_STATUS(status, BlasGpu::xgemm(math::Layout::RowMajor, math::Transpose::NoTrans, math::Transpose::Trans, nMatLeft, - nMatRight, pMatLeft, algorithmFPType(-2.0), matLeftBuf, pMatLeft, 0, matRightBuf, - pMatRight, 0, algorithmFPType(0.0), rBuf, nMatRight, 0)); - } - - 
DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(uint32_t, nMatLeft, nMatRight); - DAAL_CHECK_STATUS(status, Helper::computeRBF(_sqrMatLeft, _sqrMatRight, nMatRight, coeff, rBuf, nMatLeft, nMatRight)); - - DAAL_CHECK_STATUS(status, matLeft->releaseBlockOfRows(matLeftBlock)); - DAAL_CHECK_STATUS(status, matRight->releaseBlockOfRows(matRightBlock)); - DAAL_CHECK_STATUS(status, result->releaseBlockOfRows(resultBlock)); - - return status; -} - -} // namespace internal -} // namespace rbf -} // namespace kernel_function -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/kernel_function/oneapi/kernel_function_rbf_kernel_oneapi.h b/cpp/daal/src/algorithms/kernel_function/oneapi/kernel_function_rbf_kernel_oneapi.h deleted file mode 100644 index 82a337b7d75..00000000000 --- a/cpp/daal/src/algorithms/kernel_function/oneapi/kernel_function_rbf_kernel_oneapi.h +++ /dev/null @@ -1,118 +0,0 @@ -/* file: kernel_function_rbf_kernel_oneapi.h */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Declaration of template structs that calculate SVM RBF Kernel functions. 
-//-- -*/ - -#ifndef __KERNEL_FUNCTION_DENSE_RBF_KERNEL_ONEAPI_H__ -#define __KERNEL_FUNCTION_DENSE_RBF_KERNEL_ONEAPI_H__ - -#include "src/algorithms/kernel.h" -#include "data_management/data/numeric_table.h" -#include "algorithms/kernel_function/kernel_function_rbf.h" -#include "src/algorithms/kernel_function/oneapi/kernel_function_helper_oneapi.h" - -namespace daal -{ -namespace algorithms -{ -namespace kernel_function -{ -namespace rbf -{ -namespace internal -{ -using namespace daal::data_management; -using namespace daal::services; -using namespace daal::services::internal::sycl; - -template -class KernelImplRBFOneAPI : public Kernel -{ -public: - services::Status compute(NumericTable * ntLeft, NumericTable * ntRight, NumericTable * result, const ParameterBase * par) - { - return services::ErrorMethodNotImplemented; - } -}; - -template -class KernelImplRBFOneAPI : public Kernel -{ -public: - using Helper = HelperKernel; - - services::Status compute(NumericTable * ntLeft, NumericTable * ntRight, NumericTable * result, const ParameterBase * par) - { - ComputationMode computationMode = par->computationMode; - switch (computationMode) - { - case vectorVector: return computeInternalVectorVector(ntLeft, ntRight, result, par); - case matrixVector: return computeInternalMatrixVector(ntLeft, ntRight, result, par); - case matrixMatrix: return computeInternalMatrixMatrix(ntLeft, ntRight, result, par); - default: return services::ErrorIncorrectParameter; - } - } - -protected: - services::Status computeInternalVectorVector(NumericTable * vecLeft, NumericTable * vecRight, NumericTable * result, const ParameterBase * par); - services::Status computeInternalMatrixVector(NumericTable * matLeft, NumericTable * vecRight, NumericTable * result, const ParameterBase * par); - services::Status computeInternalMatrixMatrix(NumericTable * matLeft, NumericTable * matRight, NumericTable * result, const ParameterBase * par); - -private: - UniversalBuffer _sqrMatLeft; - UniversalBuffer 
_sqrMatRight; -}; - -template -class KernelImplRBFOneAPI : public Kernel -{ -public: - using Helper = HelperKernel; - - services::Status compute(NumericTable * ntLeft, NumericTable * ntRight, NumericTable * result, const ParameterBase * par) - { - ComputationMode computationMode = par->computationMode; - switch (computationMode) - { - case vectorVector: return computeInternalVectorVector(ntLeft, ntRight, result, par); - case matrixVector: return computeInternalMatrixVector(ntLeft, ntRight, result, par); - case matrixMatrix: return computeInternalMatrixMatrix(ntLeft, ntRight, result, par); - default: return services::ErrorIncorrectParameter; - } - } - -protected: - services::Status computeInternalVectorVector(NumericTable * vecLeft, NumericTable * vecRight, NumericTable * result, const ParameterBase * par); - services::Status computeInternalMatrixVector(NumericTable * matLeft, NumericTable * vecRight, NumericTable * result, const ParameterBase * par); - services::Status computeInternalMatrixMatrix(NumericTable * matLeft, NumericTable * matRight, NumericTable * result, const ParameterBase * par); - -private: - UniversalBuffer _sqrMatLeft; - UniversalBuffer _sqrMatRight; -}; - -} // namespace internal -} // namespace rbf -} // namespace kernel_function -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/kernel_inst_arm.h b/cpp/daal/src/algorithms/kernel_inst_arm.h index e72d94ef019..12b48bc460d 100644 --- a/cpp/daal/src/algorithms/kernel_inst_arm.h +++ b/cpp/daal/src/algorithms/kernel_inst_arm.h @@ -43,29 +43,4 @@ template class ClassName DAAL_KERNEL_SVE_CONTAINER(ContainerTemplate, __VA_ARGS__)>; \ } -#define __DAAL_INSTANTIATE_DISPATCH_SYCL_IMPL(ContainerTemplate, Mode, ClassName, BaseClassName, GetCpuid, ...) 
\ - DAAL_KERNEL_SVE_CONTAINER1(ContainerTemplate, __VA_ARGS__) \ - namespace interface1 \ - { \ - template <> \ - ClassName DAAL_KERNEL_SVE_CONTAINER(ContainerTemplate, __VA_ARGS__)>::ClassName( \ - daal::services::Environment::env * daalEnv) \ - : BaseClassName(daalEnv), _cntr(NULL) \ - { \ - GetCpuid switch (__DAAL_KERNEL_MIN(DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID, cpuid)) \ - { \ - DAAL_KERNEL_SVE_CONTAINER_CASE(ContainerTemplate, __VA_ARGS__) \ - default: \ - { \ - using cntrTemplateInst = ContainerTemplate<__VA_ARGS__, sve>; \ - static volatile services::internal::GpuSupportRegistrar registrar; \ - _cntr = (new cntrTemplateInst(daalEnv)); \ - break; \ - } \ - } \ - } \ - \ - template class ClassName DAAL_KERNEL_SVE_CONTAINER(ContainerTemplate, __VA_ARGS__)>; \ - } - #endif diff --git a/cpp/daal/src/algorithms/kernel_inst_riscv64.h b/cpp/daal/src/algorithms/kernel_inst_riscv64.h index 216d4857c4f..f127c80ef1f 100644 --- a/cpp/daal/src/algorithms/kernel_inst_riscv64.h +++ b/cpp/daal/src/algorithms/kernel_inst_riscv64.h @@ -43,29 +43,4 @@ template class ClassName DAAL_KERNEL_RV64_CONTAINER(ContainerTemplate, __VA_ARGS__)>; \ } -#define __DAAL_INSTANTIATE_DISPATCH_SYCL_IMPL(ContainerTemplate, Mode, ClassName, BaseClassName, GetCpuid, ...) 
\ - DAAL_KERNEL_RV64_CONTAINER1(ContainerTemplate, __VA_ARGS__) \ - namespace interface1 \ - { \ - template <> \ - ClassName DAAL_KERNEL_RV64_CONTAINER(ContainerTemplate, __VA_ARGS__)>::ClassName( \ - daal::services::Environment::env * daalEnv) \ - : BaseClassName(daalEnv), _cntr(NULL) \ - { \ - GetCpuid switch (__DAAL_KERNEL_MIN(DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID, cpuid)) \ - { \ - DAAL_KERNEL_RV64_CONTAINER_CASE(ContainerTemplate, __VA_ARGS__) \ - default: \ - { \ - using cntrTemplateInst = ContainerTemplate<__VA_ARGS__, rv64>; \ - static volatile services::internal::GpuSupportRegistrar registrar; \ - _cntr = (new cntrTemplateInst(daalEnv)); \ - break; \ - } \ - } \ - } \ - \ - template class ClassName DAAL_KERNEL_RV64_CONTAINER(ContainerTemplate, __VA_ARGS__)>; \ - } - #endif diff --git a/cpp/daal/src/algorithms/kernel_inst_x86.h b/cpp/daal/src/algorithms/kernel_inst_x86.h index 1b30c74ccb1..8b68e8a810a 100644 --- a/cpp/daal/src/algorithms/kernel_inst_x86.h +++ b/cpp/daal/src/algorithms/kernel_inst_x86.h @@ -51,37 +51,4 @@ DAAL_KERNEL_AVX512_CONTAINER(ContainerTemplate, __VA_ARGS__)>; \ } -#define __DAAL_INSTANTIATE_DISPATCH_SYCL_IMPL(ContainerTemplate, Mode, ClassName, BaseClassName, GetCpuid, ...) 
\ - DAAL_KERNEL_SSE2_CONTAINER1(ContainerTemplate, __VA_ARGS__) \ - DAAL_KERNEL_SSE42_CONTAINER1(ContainerTemplate, __VA_ARGS__) \ - DAAL_KERNEL_AVX2_CONTAINER1(ContainerTemplate, __VA_ARGS__) \ - DAAL_KERNEL_AVX512_CONTAINER1(ContainerTemplate, __VA_ARGS__) \ - namespace interface1 \ - { \ - template <> \ - ClassName DAAL_KERNEL_SSE42_CONTAINER(ContainerTemplate, __VA_ARGS__) \ - DAAL_KERNEL_AVX2_CONTAINER(ContainerTemplate, __VA_ARGS__) \ - DAAL_KERNEL_AVX512_CONTAINER(ContainerTemplate, __VA_ARGS__)>::ClassName(daal::services::Environment::env * daalEnv) \ - : BaseClassName(daalEnv), _cntr(NULL) \ - { \ - GetCpuid switch (__DAAL_KERNEL_MIN(DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID, cpuid)) \ - { \ - DAAL_KERNEL_SSE42_CONTAINER_CASE_SYCL(ContainerTemplate, __VA_ARGS__) \ - DAAL_KERNEL_AVX2_CONTAINER_CASE_SYCL(ContainerTemplate, __VA_ARGS__) \ - DAAL_KERNEL_AVX512_CONTAINER_CASE_SYCL(ContainerTemplate, __VA_ARGS__) \ - default: \ - { \ - using cntrTemplateInst = ContainerTemplate<__VA_ARGS__, sse2>; \ - static volatile services::internal::GpuSupportRegistrar registrar; \ - _cntr = (new cntrTemplateInst(daalEnv)); \ - break; \ - } \ - } \ - } \ - \ - template class ClassName DAAL_KERNEL_SSE42_CONTAINER(ContainerTemplate, __VA_ARGS__) \ - DAAL_KERNEL_AVX2_CONTAINER(ContainerTemplate, __VA_ARGS__) \ - DAAL_KERNEL_AVX512_CONTAINER(ContainerTemplate, __VA_ARGS__)>; \ - } - #endif diff --git a/cpp/daal/src/algorithms/kmeans/BUILD b/cpp/daal/src/algorithms/kmeans/BUILD index 67490444c08..1eac908e378 100644 --- a/cpp/daal/src/algorithms/kmeans/BUILD +++ b/cpp/daal/src/algorithms/kmeans/BUILD @@ -4,10 +4,9 @@ load("@onedal//dev/bazel:daal.bzl", "daal_module") daal_module( name = "kernel", auto = True, - opencl = True, deps = [ "@onedal//cpp/daal:core", - "@onedal//cpp/daal:sycl", + "@onedal//cpp/daal:engines", "@onedal//cpp/daal/src/algorithms/distributions:kernel", ], ) diff --git a/cpp/daal/src/algorithms/kmeans/kmeans_container.h 
b/cpp/daal/src/algorithms/kmeans/kmeans_container.h index fd1c82eaafb..f5a22410a87 100644 --- a/cpp/daal/src/algorithms/kmeans/kmeans_container.h +++ b/cpp/daal/src/algorithms/kmeans/kmeans_container.h @@ -29,10 +29,6 @@ #include "algorithms/kmeans/kmeans_batch.h" #include "algorithms/kmeans/kmeans_distributed.h" #include "src/algorithms/kmeans/kmeans_lloyd_kernel.h" -#include "src/algorithms/kmeans/oneapi/kmeans_dense_lloyd_batch_kernel_ucapi.h" -#include "src/algorithms/kmeans/oneapi/kmeans_lloyd_distr_step1_kernel_ucapi.h" -#include "src/algorithms/kmeans/oneapi/kmeans_lloyd_distr_step2_kernel_ucapi.h" -#include "services/internal/sycl/execution_context.h" #include "src/data_management/service_numeric_table.h" @@ -47,17 +43,7 @@ namespace interface2 template BatchContainer::BatchContainer(daal::services::Environment::env * daalEnv) { - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - - if (deviceInfo.isCpu) - { - __DAAL_INITIALIZE_KERNELS(internal::KMeansBatchKernel, method, algorithmFPType); - } - else - { - __DAAL_INITIALIZE_KERNELS_SYCL(internal::KMeansDenseLloydBatchKernelUCAPI, algorithmFPType); - } + __DAAL_INITIALIZE_KERNELS(internal::KMeansBatchKernel, method, algorithmFPType); } template @@ -69,9 +55,6 @@ BatchContainer::~BatchContainer() template services::Status BatchContainer::compute() { - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - Input * input = static_cast(_in); Result * result = static_cast(_res); @@ -83,30 +66,13 @@ services::Status BatchContainer::compute() Parameter * par = static_cast(_par); daal::services::Environment::env & env = *_env; - if (deviceInfo.isCpu || method != lloydDense) - { - __DAAL_CALL_KERNEL(env, internal::KMeansBatchKernel, __DAAL_KERNEL_ARGUMENTS(method, algorithmFPType), compute, a, r, par); - } - else - { - __DAAL_CALL_KERNEL_SYCL(env, internal::KMeansDenseLloydBatchKernelUCAPI, 
__DAAL_KERNEL_ARGUMENTS(algorithmFPType), compute, a, r, par); - } + __DAAL_CALL_KERNEL(env, internal::KMeansBatchKernel, __DAAL_KERNEL_ARGUMENTS(method, algorithmFPType), compute, a, r, par); } template DistributedContainer::DistributedContainer(daal::services::Environment::env * daalEnv) { - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - - if (deviceInfo.isCpu) - { - __DAAL_INITIALIZE_KERNELS(internal::KMeansDistributedStep1Kernel, method, algorithmFPType); - } - else - { - __DAAL_INITIALIZE_KERNELS_SYCL(internal::KMeansDistributedStep1KernelUCAPI, algorithmFPType); - } + __DAAL_INITIALIZE_KERNELS(internal::KMeansDistributedStep1Kernel, method, algorithmFPType); } template @@ -140,20 +106,9 @@ services::Status DistributedContainer: r[5] = static_cast(pres->get(partialAssignments).get()); } - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - daal::services::Environment::env & env = *_env; - if (deviceInfo.isCpu) - { - __DAAL_CALL_KERNEL(env, internal::KMeansDistributedStep1Kernel, __DAAL_KERNEL_ARGUMENTS(method, algorithmFPType), compute, na, a, nr, r, par); - } - else - { - __DAAL_CALL_KERNEL_SYCL(env, internal::KMeansDistributedStep1KernelUCAPI, __DAAL_KERNEL_ARGUMENTS(algorithmFPType), compute, na, a, nr, r, - par); - } + __DAAL_CALL_KERNEL(env, internal::KMeansDistributedStep1Kernel, __DAAL_KERNEL_ARGUMENTS(method, algorithmFPType), compute, na, a, nr, r, par); } template @@ -172,35 +127,15 @@ services::Status DistributedContainer: r[0] = static_cast(res->get(assignments).get()); daal::services::Environment::env & env = *_env; - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - if (deviceInfo.isCpu) - { - __DAAL_CALL_KERNEL(env, internal::KMeansDistributedStep1Kernel, __DAAL_KERNEL_ARGUMENTS(method, algorithmFPType), finalizeCompute, na, a, nr, - r, par); - } - else - { - 
__DAAL_CALL_KERNEL_SYCL(env, internal::KMeansDistributedStep1KernelUCAPI, __DAAL_KERNEL_ARGUMENTS(algorithmFPType), finalizeCompute, na, a, - nr, r, par); - } + __DAAL_CALL_KERNEL(env, internal::KMeansDistributedStep1Kernel, __DAAL_KERNEL_ARGUMENTS(method, algorithmFPType), finalizeCompute, na, a, nr, r, + par); } template DistributedContainer::DistributedContainer(daal::services::Environment::env * daalEnv) { - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - - if (deviceInfo.isCpu) - { - __DAAL_INITIALIZE_KERNELS(internal::KMeansDistributedStep2Kernel, method, algorithmFPType); - } - else - { - _kernel = new internal::KMeansDistributedStep2KernelUCAPI(); - } + __DAAL_INITIALIZE_KERNELS(internal::KMeansDistributedStep2Kernel, method, algorithmFPType); } template @@ -242,21 +177,11 @@ services::Status DistributedContainer Parameter * par = static_cast(_par); daal::services::Environment::env & env = *_env; - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); services::Status s; - if (deviceInfo.isCpu) - { - s = __DAAL_CALL_KERNEL_STATUS(env, internal::KMeansDistributedStep2Kernel, __DAAL_KERNEL_ARGUMENTS(method, algorithmFPType), compute, na, a, - nr, r, par); - } - else - { - s = __DAAL_CALL_KERNEL_STATUS_SYCL(env, internal::KMeansDistributedStep2KernelUCAPI, __DAAL_KERNEL_ARGUMENTS(algorithmFPType), compute, na, a, - nr, r, par); - } + s = __DAAL_CALL_KERNEL_STATUS(env, internal::KMeansDistributedStep2Kernel, __DAAL_KERNEL_ARGUMENTS(method, algorithmFPType), compute, na, a, nr, + r, par); dcInput->clear(); return s; } @@ -282,19 +207,9 @@ services::Status DistributedContainer Parameter * par = static_cast(_par); daal::services::Environment::env & env = *_env; - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - if (deviceInfo.isCpu) - { - __DAAL_CALL_KERNEL(env, 
internal::KMeansDistributedStep2Kernel, __DAAL_KERNEL_ARGUMENTS(method, algorithmFPType), finalizeCompute, na, a, nr, - r, par); - } - else - { - __DAAL_CALL_KERNEL_SYCL(env, internal::KMeansDistributedStep2KernelUCAPI, __DAAL_KERNEL_ARGUMENTS(algorithmFPType), finalizeCompute, na, a, - nr, r, par); - } + __DAAL_CALL_KERNEL(env, internal::KMeansDistributedStep2Kernel, __DAAL_KERNEL_ARGUMENTS(method, algorithmFPType), finalizeCompute, na, a, nr, r, + par); } } // namespace interface2 diff --git a/cpp/daal/src/algorithms/kmeans/kmeans_dense_lloyd_base_kernel_ucapi_fpt.cpp b/cpp/daal/src/algorithms/kmeans/kmeans_dense_lloyd_base_kernel_ucapi_fpt.cpp deleted file mode 100755 index 8c5f3b01239..00000000000 --- a/cpp/daal/src/algorithms/kmeans/kmeans_dense_lloyd_base_kernel_ucapi_fpt.cpp +++ /dev/null @@ -1,39 +0,0 @@ -/* file: kmeans_dense_lloyd_base_kernel_ucapi_fpt.cpp */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of Lloyd method for K-means algorithm. 
-//-- -*/ - -#include "src/algorithms/kmeans/oneapi/kmeans_dense_lloyd_kernel_base_ucapi.h" -#include "src/algorithms/kmeans/oneapi/kmeans_dense_lloyd_kernel_base_ucapi_impl.i" - -namespace daal -{ -namespace algorithms -{ -namespace kmeans -{ -namespace internal -{ -template class KMeansDenseLloydKernelBaseUCAPI; -} // namespace internal -} // namespace kmeans -} // namespace algorithms -} // namespace daal diff --git a/cpp/daal/src/algorithms/kmeans/kmeans_dense_lloyd_batch_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/kmeans/kmeans_dense_lloyd_batch_fpt_dispatcher.cpp index 9fd7c6f68e0..0c09415ba2d 100644 --- a/cpp/daal/src/algorithms/kmeans/kmeans_dense_lloyd_batch_fpt_dispatcher.cpp +++ b/cpp/daal/src/algorithms/kmeans/kmeans_dense_lloyd_batch_fpt_dispatcher.cpp @@ -28,7 +28,7 @@ namespace daal { namespace algorithms { -__DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL(kmeans::interface2::BatchContainer, batch, DAAL_FPTYPE, kmeans::lloydDense); +__DAAL_INSTANTIATE_DISPATCH_CONTAINER(kmeans::interface2::BatchContainer, batch, DAAL_FPTYPE, kmeans::lloydDense); namespace kmeans { diff --git a/cpp/daal/src/algorithms/kmeans/kmeans_dense_lloyd_batch_kernel_ucapi_fpt.cpp b/cpp/daal/src/algorithms/kmeans/kmeans_dense_lloyd_batch_kernel_ucapi_fpt.cpp deleted file mode 100644 index 4c8e0c1f1bd..00000000000 --- a/cpp/daal/src/algorithms/kmeans/kmeans_dense_lloyd_batch_kernel_ucapi_fpt.cpp +++ /dev/null @@ -1,39 +0,0 @@ -/* file: kmeans_dense_lloyd_batch_kernel_ucapi_fpt.cpp */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of K-means Batch Kernel for GPU. -//-- -*/ - -#include "src/algorithms/kmeans/oneapi/kmeans_dense_lloyd_batch_kernel_ucapi.h" -#include "src/algorithms/kmeans/oneapi/kmeans_dense_lloyd_batch_kernel_ucapi_impl.i" - -namespace daal -{ -namespace algorithms -{ -namespace kmeans -{ -namespace internal -{ -template class DAAL_EXPORT KMeansDenseLloydBatchKernelUCAPI; -} // namespace internal -} // namespace kmeans -} // namespace algorithms -} // namespace daal diff --git a/cpp/daal/src/algorithms/kmeans/kmeans_dense_lloyd_distr_step1_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/kmeans/kmeans_dense_lloyd_distr_step1_fpt_dispatcher.cpp index ea3b7482a0d..e55da154977 100644 --- a/cpp/daal/src/algorithms/kmeans/kmeans_dense_lloyd_distr_step1_fpt_dispatcher.cpp +++ b/cpp/daal/src/algorithms/kmeans/kmeans_dense_lloyd_distr_step1_fpt_dispatcher.cpp @@ -28,7 +28,7 @@ namespace daal { namespace algorithms { -__DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL(kmeans::interface2::DistributedContainer, distributed, step1Local, DAAL_FPTYPE, kmeans::lloydDense); +__DAAL_INSTANTIATE_DISPATCH_CONTAINER(kmeans::interface2::DistributedContainer, distributed, step1Local, DAAL_FPTYPE, kmeans::lloydDense); namespace kmeans { diff --git a/cpp/daal/src/algorithms/kmeans/kmeans_dense_lloyd_distr_step1_kernel_ucapi_fpt.cpp b/cpp/daal/src/algorithms/kmeans/kmeans_dense_lloyd_distr_step1_kernel_ucapi_fpt.cpp deleted file mode 100755 index 597f14ef6a1..00000000000 --- 
a/cpp/daal/src/algorithms/kmeans/kmeans_dense_lloyd_distr_step1_kernel_ucapi_fpt.cpp +++ /dev/null @@ -1,39 +0,0 @@ -/* file: kmeans_dense_lloyd_distr_step1_kernel_ucapi_fpt.cpp */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of K-means Distr Step1 Kernel for GPU. 
-//-- -*/ - -#include "src/algorithms/kmeans/oneapi/kmeans_lloyd_distr_step1_kernel_ucapi.h" -#include "src/algorithms/kmeans/oneapi/kmeans_lloyd_distr_step1_ucapi_impl.i" - -namespace daal -{ -namespace algorithms -{ -namespace kmeans -{ -namespace internal -{ -template class KMeansDistributedStep1KernelUCAPI; -} // namespace internal -} // namespace kmeans -} // namespace algorithms -} // namespace daal diff --git a/cpp/daal/src/algorithms/kmeans/kmeans_dense_lloyd_distr_step2_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/kmeans/kmeans_dense_lloyd_distr_step2_fpt_dispatcher.cpp index 54a0cbe652a..2bd0e9b85ed 100644 --- a/cpp/daal/src/algorithms/kmeans/kmeans_dense_lloyd_distr_step2_fpt_dispatcher.cpp +++ b/cpp/daal/src/algorithms/kmeans/kmeans_dense_lloyd_distr_step2_fpt_dispatcher.cpp @@ -28,7 +28,7 @@ namespace daal { namespace algorithms { -__DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL(kmeans::interface2::DistributedContainer, distributed, step2Master, DAAL_FPTYPE, kmeans::lloydDense); +__DAAL_INSTANTIATE_DISPATCH_CONTAINER(kmeans::interface2::DistributedContainer, distributed, step2Master, DAAL_FPTYPE, kmeans::lloydDense); namespace kmeans { diff --git a/cpp/daal/src/algorithms/kmeans/kmeans_dense_lloyd_distr_step2_kernel_ucapi_fpt.cpp b/cpp/daal/src/algorithms/kmeans/kmeans_dense_lloyd_distr_step2_kernel_ucapi_fpt.cpp deleted file mode 100755 index 2079d0395dc..00000000000 --- a/cpp/daal/src/algorithms/kmeans/kmeans_dense_lloyd_distr_step2_kernel_ucapi_fpt.cpp +++ /dev/null @@ -1,39 +0,0 @@ -/* file: kmeans_dense_lloyd_distr_step2_kernel_ucapi_fpt.cpp */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of Lloyd method for K-means algorithm. -//-- -*/ - -#include "src/algorithms/kmeans/kmeans_lloyd_kernel.h" -#include "src/algorithms/kmeans/oneapi/kmeans_lloyd_distr_step2_ucapi_impl.i" - -namespace daal -{ -namespace algorithms -{ -namespace kmeans -{ -namespace internal -{ -template class KMeansDistributedStep2KernelUCAPI; -} // namespace internal -} // namespace kmeans -} // namespace algorithms -} // namespace daal diff --git a/cpp/daal/src/algorithms/kmeans/kmeans_init_container.h b/cpp/daal/src/algorithms/kmeans/kmeans_init_container.h index cb02f5091ae..5ee35a0d1c1 100644 --- a/cpp/daal/src/algorithms/kmeans/kmeans_init_container.h +++ b/cpp/daal/src/algorithms/kmeans/kmeans_init_container.h @@ -30,8 +30,6 @@ #include "algorithms/kmeans/kmeans_init_distributed.h" #include "src/algorithms/kmeans/kmeans_init_kernel.h" #include "src/algorithms/kmeans/kmeans_init_impl.h" -#include "src/algorithms/kmeans/oneapi/kmeans_init_dense_batch_kernel_ucapi.h" -#include "services/internal/sycl/execution_context.h" namespace daal { @@ -44,17 +42,7 @@ namespace init template BatchContainer::BatchContainer(daal::services::Environment::env * daalEnv) { - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - - if (deviceInfo.isCpu || (method != deterministicDense && method != randomDense)) - { - __DAAL_INITIALIZE_KERNELS(internal::KMeansInitKernel, method, algorithmFPType); - } - else - { - _kernel = new 
internal::KMeansInitDenseBatchKernelUCAPI(); - } + __DAAL_INITIALIZE_KERNELS(internal::KMeansInitKernel, method, algorithmFPType); } template @@ -66,9 +54,6 @@ BatchContainer::~BatchContainer() template services::Status BatchContainer::compute() { - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - Input * input = static_cast(_in); Result * result = static_cast(_res); @@ -84,15 +69,7 @@ services::Status BatchContainer::compute() daal::services::Environment::env & env = *_env; - if (deviceInfo.isCpu || (method != deterministicDense && method != randomDense)) - { - __DAAL_CALL_KERNEL(env, internal::KMeansInitKernel, __DAAL_KERNEL_ARGUMENTS(method, algorithmFPType), compute, na, a, nr, r, par, - *par->engine); - } - else - { - return ((internal::KMeansInitDenseBatchKernelUCAPI *)(_kernel))->compute(na, a, nr, r, par, *par->engine); - } + __DAAL_CALL_KERNEL(env, internal::KMeansInitKernel, __DAAL_KERNEL_ARGUMENTS(method, algorithmFPType), compute, na, a, nr, r, par, *par->engine); } template diff --git a/cpp/daal/src/algorithms/kmeans/kmeans_init_dense_batch_kernel_ucapi_fpt.cpp b/cpp/daal/src/algorithms/kmeans/kmeans_init_dense_batch_kernel_ucapi_fpt.cpp deleted file mode 100644 index fc0b3657e2c..00000000000 --- a/cpp/daal/src/algorithms/kmeans/kmeans_init_dense_batch_kernel_ucapi_fpt.cpp +++ /dev/null @@ -1,49 +0,0 @@ -/* file: kmeans_init_dense_batch_kernel_ucapi_fpt.cpp */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of K-means initialization Batch Kernel for GPU. -//-- -*/ - -#include "src/algorithms/kmeans/oneapi/kmeans_init_dense_batch_kernel_ucapi.h" -#include "src/algorithms/kmeans/oneapi/kmeans_init_dense_batch_kernel_ucapi_impl.i" - -namespace daal -{ -namespace algorithms -{ -namespace kmeans -{ -namespace init -{ -namespace internal -{ -template class DAAL_EXPORT KMeansInitDenseBatchKernelUCAPI; -template class DAAL_EXPORT KMeansInitDenseBatchKernelUCAPI; -template class DAAL_EXPORT KMeansInitDenseBatchKernelUCAPI; -template class DAAL_EXPORT KMeansInitDenseBatchKernelUCAPI; -template class DAAL_EXPORT KMeansInitDenseBatchKernelUCAPI; -template class DAAL_EXPORT KMeansInitDenseBatchKernelUCAPI; -template class DAAL_EXPORT KMeansInitDenseBatchKernelUCAPI; -template class DAAL_EXPORT KMeansInitDenseBatchKernelUCAPI; -} // namespace internal -} // namespace init -} // namespace kmeans -} // namespace algorithms -} // namespace daal diff --git a/cpp/daal/src/algorithms/kmeans/kmeans_init_dense_deterministic_batch_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/kmeans/kmeans_init_dense_deterministic_batch_fpt_dispatcher.cpp index 0fbca4be693..7e4f09f2a0e 100644 --- a/cpp/daal/src/algorithms/kmeans/kmeans_init_dense_deterministic_batch_fpt_dispatcher.cpp +++ b/cpp/daal/src/algorithms/kmeans/kmeans_init_dense_deterministic_batch_fpt_dispatcher.cpp @@ -27,7 +27,7 @@ namespace daal { namespace algorithms { 
-__DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL(kmeans::init::BatchContainer, batch, DAAL_FPTYPE, kmeans::init::deterministicDense) +__DAAL_INSTANTIATE_DISPATCH_CONTAINER(kmeans::init::BatchContainer, batch, DAAL_FPTYPE, kmeans::init::deterministicDense) namespace kmeans { diff --git a/cpp/daal/src/algorithms/kmeans/kmeans_init_dense_random_batch_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/kmeans/kmeans_init_dense_random_batch_fpt_dispatcher.cpp index aefaefba5ab..dea4bb5d503 100644 --- a/cpp/daal/src/algorithms/kmeans/kmeans_init_dense_random_batch_fpt_dispatcher.cpp +++ b/cpp/daal/src/algorithms/kmeans/kmeans_init_dense_random_batch_fpt_dispatcher.cpp @@ -27,7 +27,7 @@ namespace daal { namespace algorithms { -__DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL(kmeans::init::BatchContainer, batch, DAAL_FPTYPE, kmeans::init::randomDense) +__DAAL_INSTANTIATE_DISPATCH_CONTAINER(kmeans::init::BatchContainer, batch, DAAL_FPTYPE, kmeans::init::randomDense) namespace kmeans { diff --git a/cpp/daal/src/algorithms/kmeans/kmeans_init_dense_random_distr_step1_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/kmeans/kmeans_init_dense_random_distr_step1_fpt_dispatcher.cpp index c90172a95c6..f18a7d86649 100644 --- a/cpp/daal/src/algorithms/kmeans/kmeans_init_dense_random_distr_step1_fpt_dispatcher.cpp +++ b/cpp/daal/src/algorithms/kmeans/kmeans_init_dense_random_distr_step1_fpt_dispatcher.cpp @@ -27,7 +27,7 @@ namespace daal { namespace algorithms { -__DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL(kmeans::init::DistributedContainer, distributed, step1Local, DAAL_FPTYPE, kmeans::init::randomDense) +__DAAL_INSTANTIATE_DISPATCH_CONTAINER(kmeans::init::DistributedContainer, distributed, step1Local, DAAL_FPTYPE, kmeans::init::randomDense) namespace kmeans { diff --git a/cpp/daal/src/algorithms/kmeans/kmeans_init_dense_random_distr_step2_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/kmeans/kmeans_init_dense_random_distr_step2_fpt_dispatcher.cpp index 6a333221bfe..0dba642ec04 100644 --- 
a/cpp/daal/src/algorithms/kmeans/kmeans_init_dense_random_distr_step2_fpt_dispatcher.cpp +++ b/cpp/daal/src/algorithms/kmeans/kmeans_init_dense_random_distr_step2_fpt_dispatcher.cpp @@ -27,7 +27,7 @@ namespace daal { namespace algorithms { -__DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL(kmeans::init::DistributedContainer, distributed, step2Master, DAAL_FPTYPE, kmeans::init::randomDense) +__DAAL_INSTANTIATE_DISPATCH_CONTAINER(kmeans::init::DistributedContainer, distributed, step2Master, DAAL_FPTYPE, kmeans::init::randomDense) namespace kmeans { diff --git a/cpp/daal/src/algorithms/kmeans/kmeans_init_result.h b/cpp/daal/src/algorithms/kmeans/kmeans_init_result.h index 3f466f2b2a1..420d56282a0 100644 --- a/cpp/daal/src/algorithms/kmeans/kmeans_init_result.h +++ b/cpp/daal/src/algorithms/kmeans/kmeans_init_result.h @@ -25,12 +25,7 @@ #define __KMEANS_INIT_RESULT_ #include "algorithms/kmeans/kmeans_init_types.h" -#include "services/internal/sycl/execution_context.h" -#include "services/internal/sycl/types.h" -#include "data_management/data/internal/numeric_table_sycl_homogen.h" - -using namespace daal::services::internal::sycl; -using daal::data_management::internal::SyclHomogenNumericTable; +#include "data_management/data/homogen_numeric_table.h" namespace daal { @@ -68,21 +63,10 @@ DAAL_EXPORT services::Status Result::allocate(const daal::algorithms::Input * in else nFeatures = (static_cast(input))->get(data)->getNumberOfColumns(); - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - services::Status status; - if (deviceInfo.isCpu || (method != deterministicDense && method != randomDense)) - { - Argument::set(centroids, data_management::SerializationIfacePtr(new data_management::HomogenNumericTable( - nFeatures, kmPar->nClusters, data_management::NumericTable::doAllocate))); - } - else - { - Argument::set(centroids, SyclHomogenNumericTable::create(nFeatures, kmPar->nClusters, - 
data_management::NumericTable::doAllocate, &status)); - } + Argument::set(centroids, data_management::SerializationIfacePtr(new data_management::HomogenNumericTable( + nFeatures, kmPar->nClusters, data_management::NumericTable::doAllocate))); return status; } diff --git a/cpp/daal/src/algorithms/kmeans/kmeans_result.h b/cpp/daal/src/algorithms/kmeans/kmeans_result.h index a57d4d23ac8..e3505317104 100644 --- a/cpp/daal/src/algorithms/kmeans/kmeans_result.h +++ b/cpp/daal/src/algorithms/kmeans/kmeans_result.h @@ -25,9 +25,7 @@ #define __KMEANS_RESULT_ #include "algorithms/kmeans/kmeans_types.h" -#include "services/internal/sycl/execution_context.h" -#include "services/internal/sycl/types.h" -#include "data_management/data/internal/numeric_table_sycl_homogen.h" +#include "data_management/data/homogen_numeric_table.h" namespace daal { @@ -35,9 +33,8 @@ namespace algorithms { namespace kmeans { -namespace dm = daal::data_management; -namespace dmi = daal::data_management::internal; -namespace si_sycl = daal::services::internal::sycl; +namespace dm = daal::data_management; +namespace dmi = daal::data_management::internal; /** * Allocates memory to store the results of the K-Means algorithm @@ -48,9 +45,6 @@ namespace si_sycl = daal::services::internal::sycl; template DAAL_EXPORT services::Status Result::allocate(const daal::algorithms::Input * input, const daal::algorithms::Parameter * parameter, const int method) { - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - const interface2::Parameter * kmPar2 = dynamic_cast(parameter); if (kmPar2 == nullptr) return services::Status(daal::services::ErrorNullParameterNotSupported); @@ -63,26 +57,16 @@ DAAL_EXPORT services::Status Result::allocate(const daal::algorithms::Input * in { size_t nClusters = kmPar2->nClusters; - if (deviceInfo.isCpu) - { - set(objectiveFunction, dm::HomogenNumericTable::create(1, 1, dm::NumericTable::doAllocate, &status)); - set(nIterations, 
dm::HomogenNumericTable::create(1, 1, dm::NumericTable::doAllocate, &status)); + set(objectiveFunction, dm::HomogenNumericTable::create(1, 1, dm::NumericTable::doAllocate, &status)); + set(nIterations, dm::HomogenNumericTable::create(1, 1, dm::NumericTable::doAllocate, &status)); - if (kmPar2->resultsToEvaluate & computeCentroids) - { - set(centroids, dm::HomogenNumericTable::create(nFeatures, nClusters, dm::NumericTable::doAllocate, &status)); - } - if (kmPar2->resultsToEvaluate & computeAssignments || kmPar2->assignFlag) - { - set(assignments, dm::HomogenNumericTable::create(1, nRows, dm::NumericTable::doAllocate, &status)); - } + if (kmPar2->resultsToEvaluate & computeCentroids) + { + set(centroids, dm::HomogenNumericTable::create(nFeatures, nClusters, dm::NumericTable::doAllocate, &status)); } - else + if (kmPar2->resultsToEvaluate & computeAssignments || kmPar2->assignFlag) { - set(centroids, dmi::SyclHomogenNumericTable::create(nFeatures, nClusters, dm::NumericTable::doAllocate, &status)); - set(objectiveFunction, dm::HomogenNumericTable::create(1, 1, dm::NumericTable::doAllocate, &status)); - set(nIterations, dm::HomogenNumericTable::create(1, 1, dm::NumericTable::doAllocate, &status)); - set(assignments, dmi::SyclHomogenNumericTable::create(1, nRows, dm::NumericTable::doAllocate, &status)); + set(assignments, dm::HomogenNumericTable::create(1, nRows, dm::NumericTable::doAllocate, &status)); } } diff --git a/cpp/daal/src/algorithms/kmeans/oneapi/cl_kernels/kmeans_cl_kernels.cl b/cpp/daal/src/algorithms/kmeans/oneapi/cl_kernels/kmeans_cl_kernels.cl deleted file mode 100644 index 96845cd2cf0..00000000000 --- a/cpp/daal/src/algorithms/kmeans/oneapi/cl_kernels/kmeans_cl_kernels.cl +++ /dev/null @@ -1,337 +0,0 @@ -/* file: kmeans_cl_kernels.cl */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file 
except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of K-means OpenCL kernels. -//-- -*/ - -#ifndef __KMEANS_CL_KERNELS_CL__ -#define __KMEANS_CL_KERNELS_CL__ - -#include - -#define DECLARE_SOURCE(name, src) static const char * name = #src; - -DECLARE_SOURCE( - kmeans_cl_kernels, - - void __sum_reduce(__local algorithmFPType * local_sum, uint local_id, uint local_size) { - for (uint stride = local_size / 2; stride > 0; stride /= 2) - { - barrier(CLK_LOCAL_MEM_FENCE); - if (local_id < stride) - { - local_sum[local_id] += local_sum[local_id + stride]; - } - } - } - - __kernel void reduce_assignments(__global const algorithmFPType * centroidsSq, __global const algorithmFPType * distances, int N, int K, - algorithmFPType huge, __global int * assignments, __global algorithmFPType * mindistances) { - const int global_id = get_global_id(0); - const int size = get_local_size(1); - const int local_id = get_local_id(1); - - algorithmFPType minVal = huge; - int minIdx = -1; - for (int i = local_id; i < K; i += size) - { - algorithmFPType dist = distances[global_id + N * i]; - algorithmFPType sq = centroidsSq[i]; - algorithmFPType curVal = dist + 0.5 * sq; - minIdx = curVal < minVal ? i : minIdx; - minVal = curVal < minVal ? curVal : minVal; - } - algorithmFPType groupMin = sub_group_reduce_min(minVal); - minIdx = sub_group_reduce_min(minVal == groupMin ? 
minIdx : K); - if (local_id == 0) - { - assignments[global_id] = minIdx; - mindistances[global_id] = groupMin; - } - } - - int insert_subgroup_shift_right(__local int * data, int index, int offset, int newVal, int sub_group_id, int sub_group_size, int rem) { - if (index < 0) index = 0; - int ret = data[offset + sub_group_size - 1]; - int curVal = data[offset + sub_group_id]; - int prevVal = curVal; - int delta = index < sub_group_id ? 1 : 0; - int res = intel_sub_group_shuffle_up(prevVal, curVal, delta); - if (rem == 0 || sub_group_id < rem) - { - int v = index == sub_group_id ? newVal : res; - data[offset + sub_group_id] = v; - } - return ret; - } - - algorithmFPType insert_subgroup_shift_right_fp(__local algorithmFPType * data, int index, int offset, algorithmFPType newVal, int sub_group_id, - int sub_group_size, int rem) { - if (index < 0) index = 0; - algorithmFPType ret = data[offset + sub_group_size - 1]; - algorithmFPType curVal = data[offset + sub_group_id]; - algorithmFPType prevVal = curVal; - int delta = index < sub_group_id ? 1 : 0; - algorithmFPType res = intel_sub_group_shuffle_up(prevVal, curVal, delta); - if (rem == 0 || sub_group_id < rem) - { - data[offset + sub_group_id] = index == sub_group_id ? newVal : res; - } - return ret; - } - - void insert_shift_right(__local int * data, int index, int offset, int newVal, int sub_group_num, int rem, int sub_group_id, int sub_group_size) { - for (int i = index / sub_group_size; i < sub_group_num; i++) - { - int curRem = (i == sub_group_num - 1 && rem > 0) ? 
rem : sub_group_size; - newVal = insert_subgroup_shift_right(data, index - i * sub_group_size, offset + i * sub_group_size, newVal, sub_group_id, sub_group_size, - curRem); - } - } - - void insert_shift_right_fp(__local algorithmFPType * data, int index, int offset, algorithmFPType newVal, int sub_group_num, int rem, - int sub_group_id, int sub_group_size) { - for (int i = index / sub_group_size; i < sub_group_num; i++) - { - int curRem = (i == sub_group_num - 1 && rem > 0) ? rem : sub_group_size; - newVal = insert_subgroup_shift_right_fp(data, index - i * sub_group_size, offset + i * sub_group_size, newVal, sub_group_id, - sub_group_size, curRem); - } - } - - __kernel void partial_candidates(__global const int * assignments, __global const algorithmFPType * mindistances, - __global const algorithmFPType * distSq, __global const int * candidates, - __global const algorithmFPType * candidateDistances, __global int * candidates_tmp, - __global algorithmFPType * candidateDistances_tmp, int N, int K, int Reset) { - const int global_id = get_global_id(0); - const int gsize = get_global_size(0); - const int lsize = get_sub_group_size(); - const int local_id = get_sub_group_local_id(); - const int local_id_2 = get_local_id(1); - const int sg_size = get_max_sub_group_size(); - if (lsize < sg_size || get_sub_group_id() > 0 || global_id >= NUM_PARTS_CND) return; - - const algorithmFPType HUGE = -1.0e-15; - - int numgrp = K / lsize; - const int rem = K % lsize; - if (rem > 0 || numgrp == 0) numgrp++; - - __local algorithmFPType maxDist[CND_PART_SIZE]; - __local int maxItem[CND_PART_SIZE]; - - for (int i = 0; i < numgrp; i++) - { - if (i < numgrp - 1 || local_id < rem || rem == 0) - { - int offset = local_id + lsize * i; - algorithmFPType initValue = (global_id == 0 && !Reset) ? candidateDistances[offset] : HUGE; - int initIndex = (global_id == 0 && !Reset) ? 
candidates[offset] : -1; - maxDist[offset] = initValue; - maxItem[offset] = initIndex; - } - } - for (int iblock = global_id; iblock < N; iblock += gsize) - { - algorithmFPType newVal = 2.0 * mindistances[iblock] + distSq[iblock]; - if (newVal <= maxDist[K - 1]) continue; - int valCentroid = iblock; - int maxInd = -1; - for (int i = 0; i < numgrp; i++) - { - algorithmFPType curVal = HUGE; - if (i < numgrp - 1 || local_id < rem) curVal = maxDist[local_id + i * lsize] - newVal; - int valInd = curVal > 0.0 ? 1 : local_id - lsize; - int locInd = sub_group_reduce_min(valInd); - if (locInd < 0) - { - maxInd = i * lsize + lsize + locInd; - break; - } - } - if (maxInd > -1) - { - insert_shift_right_fp(maxDist, maxInd, 0, newVal, numgrp, rem, local_id, lsize); - insert_shift_right(maxItem, maxInd, 0, valCentroid, numgrp, rem, local_id, lsize); - } - } - - if (local_id == 0 && global_id < NUM_PARTS_CND) - { - for (int i = 0; i < K; i++) - { - candidateDistances_tmp[global_id * K + i] = maxDist[i]; - candidates_tmp[global_id * K + i] = maxItem[i]; - } - } - } - - __kernel void merge_candidates(__global int * candidates, __global algorithmFPType * candidateDistances, __global const int * candidates_tmp, - __global const algorithmFPType * candidateDistances_tmp, int K) { - const int global_id = get_global_id(0); - const int local_id = get_sub_group_local_id(); - - __local int curInd[NUM_PARTS_CND]; - if (global_id == 0) - { - curInd[local_id] = 0; - for (int i = 0; i < K; i++) - { - algorithmFPType curVal = local_id < NUM_PARTS_CND ? -candidateDistances_tmp[local_id * K + curInd[local_id]] : 1.0; - algorithmFPType maxVal = -sub_group_reduce_min(curVal); - if (maxVal < 0) - { - if (local_id == 0) candidates[i] = -1; - curInd[local_id]++; - continue; - } - int counterInd = -sub_group_reduce_min(-curVal < maxVal ? 
1 : -local_id); - if (counterInd > NUM_PARTS_CND - 1) - { - if (local_id == 0) candidates[i] = -1; - curInd[local_id]++; - continue; - } - if (local_id == counterInd) - { - candidates[i] = candidates_tmp[curInd[counterInd] + counterInd * K]; - candidateDistances[i] = candidateDistances_tmp[curInd[counterInd] + counterInd * K]; - curInd[local_id]++; - } - } - } - } - - __kernel void partial_reduce_centroids(__global const algorithmFPType * data, __global const algorithmFPType * distances, - __global const int * assignments, __global algorithmFPType * partialCentroids, - __global int * partialCentroidsCounters, int N, int K, int P, int doReset) { - const int local_id = get_global_id(0) % P; - - const int global_id = get_global_id(0) / P; - const int global_size = get_global_size(0) / P; - - if (doReset) - { - for (int i = 0; i < K; i++) - { - partialCentroids[global_id * K * P + i * P + local_id] = 0.0; - } - - if (local_id == 0) - { - for (int i = 0; i < K; i++) - { - partialCentroidsCounters[global_id * K + i] = 0; - } - } - } - - for (int i = global_id; i < N; i += global_size) - { - int cl = assignments[i]; - if (local_id == 0) - { - partialCentroidsCounters[global_id * K + cl]++; - } - partialCentroids[global_id * K * P + cl * P + local_id] += data[i * P + local_id]; - } - } - - __kernel void merge_reduce_centroids(__global algorithmFPType * partialCentroids, __global int * partialCentroidsCounters, - __global algorithmFPType * centroids, int K, int P, int parts) { - const int local_id = get_local_id(0); - const int local_size = get_local_size(0); - - const int cl_id = get_group_id(0); - - for (int i = local_id + local_size; i < parts; i += local_size) - { - for (int j = 0; j < P; j++) - { - partialCentroids[local_id * K * P + cl_id * P + j] += partialCentroids[i * K * P + cl_id * P + j]; - } - partialCentroidsCounters[local_id * K + cl_id] += partialCentroidsCounters[i * K + cl_id]; - } - - for (int len = local_size / 2; len > 0; len >>= 1) - { - 
barrier(CLK_LOCAL_MEM_FENCE); - if (local_id < len) - { - for (int j = 0; j < P; j++) - { - partialCentroids[local_id * K * P + cl_id * P + j] += partialCentroids[(local_id + len) * K * P + cl_id * P + j]; - } - partialCentroidsCounters[local_id * K + cl_id] += partialCentroidsCounters[(local_id + len) * K + cl_id]; - } - } - - if (local_id == 0 && partialCentroidsCounters[cl_id] != 0) - { - for (int j = 0; j < P; j++) - { - centroids[cl_id * P + j] = partialCentroids[cl_id * P + j] / partialCentroidsCounters[cl_id]; - } - } - } - - __kernel void count_empty_clusters(__global const int * partialCentroidsCounters, int K, int nPartialCentroids, __global int * numEmptyClusters) { - const int local_id = get_local_id(1); - const int local_size = get_local_size(1); - - int numEmpty = 0; - for (int i = local_id; i < K; i += local_size) - { - int count = 0; - for (int j = 0; j < nPartialCentroids; j++) - { - count += partialCentroidsCounters[j * K + i]; - } - numEmpty += count > 0 ? 0 : 1; - } - numEmpty = sub_group_reduce_add(numEmpty); - if (local_id == 0) numEmptyClusters[0] = numEmpty; - } - - __kernel void update_objective_function(__global const algorithmFPType * dataSq, __global const algorithmFPType * distances, int N, int K, - __global algorithmFPType * objFunction) { - const int local_id = get_local_id(0); - const int local_size = get_local_size(0); - - __local algorithmFPType local_sum[LOCAL_SUM_SIZE]; - - local_sum[local_id] = 0.0f; - - for (int i = local_id; i < N; i += local_size) - { - local_sum[local_id] += dataSq[i] + 2.0 * distances[i]; - } - - __sum_reduce(local_sum, local_id, local_size); - - if (local_id == 0) - { - objFunction[0] += local_sum[0]; - } - } - -); - -#endif diff --git a/cpp/daal/src/algorithms/kmeans/oneapi/cl_kernels/kmeans_cl_kernels_distr_steps.cl b/cpp/daal/src/algorithms/kmeans/oneapi/cl_kernels/kmeans_cl_kernels_distr_steps.cl deleted file mode 100644 index a6483616618..00000000000 --- 
a/cpp/daal/src/algorithms/kmeans/oneapi/cl_kernels/kmeans_cl_kernels_distr_steps.cl +++ /dev/null @@ -1,87 +0,0 @@ -/* file: kmeans_cl_kernels_distr_steps.cl */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of K-means OpenCL kernels. -//-- -*/ - -#ifndef __KMEANS_CL_KERNELS_DISTR_STEPS_CL__ -#define __KMEANS_CL_KERNELS_DISTR_STEPS_CL__ - -#include - -#define DECLARE_SOURCE(name, src) static const char * name = #src; - -DECLARE_SOURCE( - kmeans_cl_kernels_distr_steps, - - __kernel void init_clusters(__global int * partialCentroidsCounters, __global algorithmFPType * partialCentroids, __global int * cCounters, - __global algorithmFPType * centroids, int P) { - const int global_id = get_global_id(0); - const int local_id = get_local_id(1); - centroids[global_id * P + local_id] = partialCentroids[global_id * P + local_id]; - if (local_id == 0) cCounters[global_id] = partialCentroidsCounters[global_id]; - } - - __kernel void update_clusters(__global int * partialCentroidsCounters, __global algorithmFPType * partialCentroids, __global int * cCounters, - __global algorithmFPType * centroids, int P) { - const int global_id = get_global_id(0); - const int local_id = get_local_id(1); - const int oldN = partialCentroidsCounters[global_id]; - const int 
newN = cCounters[global_id]; - const algorithmFPType oldContrib = oldN > 0 ? oldN * partialCentroids[global_id * P + local_id] : 0.0; - const algorithmFPType newContrib = newN > 0 ? newN * centroids[global_id * P + local_id] : 0.0; - centroids[global_id * P + local_id] = (oldN + newN > 0) ? (oldContrib + newContrib) / (oldN + newN) : 0.0; - if (local_id == 0) cCounters[global_id] = oldN + newN; - } - - __kernel void init_candidates(__global int * partialCandidates, __global algorithmFPType * partialCValues, __global int * candidates, - __global algorithmFPType * cValues, int K) { - const int local_id = get_local_id(1); - const int local_size = get_local_size(1); - for (int i = local_id; i < K; i += local_size) - { - candidates[i] = partialCandidates[i]; - cValues[i] = partialCValues[i]; - } - } - - __kernel void update_candidates(__global int * partialCandidates, __global algorithmFPType * partialCValues, __global int * candidates, - __global algorithmFPType * cValues, int K) { - for (int i = K - 1; i >= 0; i--) - { - int j; - algorithmFPType last = cValues[K - 1]; - for (j = K - 2; j >= 0 && cValues[j] < partialCValues[i]; j--) - { - cValues[j + 1] = cValues[j]; - candidates[j + 1] = candidates[j]; - } - - if (j != K - 2 || last < partialCValues[i]) - { - cValues[j + 1] = partialCValues[i]; - candidates[j + 1] = partialCandidates[i]; - } - } - } - -); - -#endif diff --git a/cpp/daal/src/algorithms/kmeans/oneapi/cl_kernels/kmeans_init_cl_kernels.cl b/cpp/daal/src/algorithms/kmeans/oneapi/cl_kernels/kmeans_init_cl_kernels.cl deleted file mode 100644 index 6a7dc894c70..00000000000 --- a/cpp/daal/src/algorithms/kmeans/oneapi/cl_kernels/kmeans_init_cl_kernels.cl +++ /dev/null @@ -1,54 +0,0 @@ -/* file: kmeans_init_cl_kernels.cl */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with 
the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of K-means Init OpenCL kernels. -//-- -*/ - -#ifndef __KMEANS_INIT_CL_KERNELS_CL__ -#define __KMEANS_INIT_CL_KERNELS_CL__ - -#include - -#define DECLARE_SOURCE(name, src) static const char * name = #src; - -DECLARE_SOURCE( - kmeans_init_cl_kernels, - - __kernel void gather_random(__global const algorithmFPType * data, __global algorithmFPType * centroids, __global const int * indices, int N, - int K, int P) { - const int global_id_0 = get_global_id(0); - - const int local_id_1 = get_local_id(1); - const int local_size_1 = get_local_size(1); - - int ind = indices[global_id_0]; - - if (ind >= 0 && ind < N) - { - for (int i = local_id_1; i < P; i += local_size_1) - { - centroids[global_id_0 * P + i] = data[ind * P + i]; - } - } - } - -); - -#endif diff --git a/cpp/daal/src/algorithms/kmeans/oneapi/kmeans_dense_lloyd_batch_kernel_ucapi.h b/cpp/daal/src/algorithms/kmeans/oneapi/kmeans_dense_lloyd_batch_kernel_ucapi.h deleted file mode 100644 index 69ec2ed5bbb..00000000000 --- a/cpp/daal/src/algorithms/kmeans/oneapi/kmeans_dense_lloyd_batch_kernel_ucapi.h +++ /dev/null @@ -1,56 +0,0 @@ -/* file: kmeans_dense_lloyd_batch_kernel_ucapi.h */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of K-means Batch Kernel for GPU. -//-- -*/ - -#ifndef __KMEANS_DENSE_LLOYD_BATCH_KERNEL_UCAPI_H__ -#define __KMEANS_DENSE_LLOYD_BATCH_KERNEL_UCAPI_H__ - -#include "services/internal/sycl/types.h" -#include "services/internal/sycl/execution_context.h" -#include "algorithms/kmeans/kmeans_types.h" -#include "src/algorithms/kernel.h" -#include "data_management/data/numeric_table.h" -#include "src/algorithms/kmeans/oneapi/kmeans_dense_lloyd_kernel_base_ucapi.h" - -using namespace daal::data_management; - -namespace daal -{ -namespace algorithms -{ -namespace kmeans -{ -namespace internal -{ -template -class KMeansDenseLloydBatchKernelUCAPI : public KMeansDenseLloydKernelBaseUCAPI -{ -public: - services::Status compute(const NumericTable * const * a, const NumericTable * const * r, const Parameter * par); -}; - -} // namespace internal -} // namespace kmeans -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/kmeans/oneapi/kmeans_dense_lloyd_batch_kernel_ucapi_impl.i b/cpp/daal/src/algorithms/kmeans/oneapi/kmeans_dense_lloyd_batch_kernel_ucapi_impl.i deleted file mode 100644 index 5f70179a507..00000000000 --- a/cpp/daal/src/algorithms/kmeans/oneapi/kmeans_dense_lloyd_batch_kernel_ucapi_impl.i +++ /dev/null @@ -1,227 +0,0 @@ -/* file: kmeans_dense_lloyd_batch_kernel_ucapi_impl.i */ -/******************************************************************************* -* Copyright 2019 Intel Corporation -* -* 
Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of K-means Batch Kernel for GPU. -//-- -*/ - -#ifndef __KMEANS_DENSE_LLOYD_BATCH_KERNEL_UCAPI_IMPL__ -#define __KMEANS_DENSE_LLOYD_BATCH_KERNEL_UCAPI_IMPL__ - -#include "services/env_detect.h" -#include "src/algorithms/kmeans/oneapi/cl_kernels/kmeans_cl_kernels.cl" -#include "services/internal/sycl/execution_context.h" -#include "services/internal/sycl/types.h" -#include "src/services/service_data_utils.h" -#include "src/sycl/blas_gpu.h" -#include "src/algorithms/kmeans/oneapi/kmeans_dense_lloyd_kernel_base_ucapi_impl.i" - -#include "src/externals/service_profiler.h" - -using namespace daal::services; -using namespace daal::services::internal::sycl; -using namespace daal::data_management; - -constexpr size_t maxInt32AsSizeT = static_cast(daal::services::internal::MaxVal::get()); - -namespace daal -{ -namespace algorithms -{ -namespace kmeans -{ -namespace internal -{ -using namespace daal::services::internal::sycl; -template -Status KMeansDenseLloydBatchKernelUCAPI::compute(const NumericTable * const * a, const NumericTable * const * r, - const Parameter * par) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute); - - Status st; - - NumericTable * ntData = const_cast(a[0]); - NumericTable * ntInCentroids = const_cast(a[1]); - NumericTable * ntOutCentroids = const_cast(r[0]); - NumericTable * ntAssignments = const_cast(r[1]); - 
NumericTable * ntObjFunction = const_cast(r[2]); - NumericTable * ntNIterations = const_cast(r[3]); - - const size_t nDataRowsAsSizeT = ntData->getNumberOfRows(); - const size_t nDataColumnsAsSizeT = ntData->getNumberOfColumns(); - DAAL_CHECK(nDataRowsAsSizeT <= maxInt32AsSizeT, services::ErrorIncorrectNumberOfRowsInInputNumericTable); - DAAL_CHECK(nDataColumnsAsSizeT <= maxInt32AsSizeT, services::ErrorIncorrectNumberOfColumnsInInputNumericTable); - const uint32_t nRows = static_cast(nDataRowsAsSizeT); - const uint32_t nFeatures = static_cast(nDataColumnsAsSizeT); - - const size_t nIterAsSizeT = par->maxIterations; - const size_t nClustersAsSizeT = par->nClusters; - DAAL_CHECK(nIterAsSizeT <= maxInt32AsSizeT, services::ErrorIncorrectParameter); - DAAL_CHECK(nClustersAsSizeT <= maxInt32AsSizeT, services::ErrorIncorrectParameter); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(uint32_t, nClustersAsSizeT, nDataColumnsAsSizeT); - const uint32_t nIter = static_cast(nIterAsSizeT); - const uint32_t nClusters = static_cast(nClustersAsSizeT); - - DAAL_ASSERT(ntObjFunction->getNumberOfRows() == 1 && ntObjFunction->getNumberOfColumns() == 1); - DAAL_ASSERT(ntNIterations->getNumberOfRows() == 1 && ntNIterations->getNumberOfColumns() == 1); - DAAL_ASSERT(ntAssignments->getNumberOfRows() == nDataRowsAsSizeT && ntAssignments->getNumberOfColumns() == 1); - - auto & context = Environment::getInstance()->getDefaultExecutionContext(); - auto & kernel_factory = context.getClKernelFactory(); - DAAL_CHECK_STATUS_VAR(this->buildProgram(kernel_factory, nClusters)); - - uint32_t blockSize = 0; - DAAL_CHECK_STATUS_VAR(this->getBlockSize(nRows, nClusters, nFeatures, blockSize)); - DAAL_CHECK_STATUS_VAR(this->fitPartialCentroidSize(nClusters, nFeatures)); - DAAL_CHECK_STATUS_VAR(this->initializeBuffers(nClusters, nFeatures, blockSize)); - - BlockDescriptor inCentroidsRows; - DAAL_CHECK_STATUS_VAR(ntInCentroids->getBlockOfRows(0, nClusters, readOnly, inCentroidsRows)); - auto inCentroids = 
inCentroidsRows.getBuffer(); - - BlockDescriptor outCentroidsRows; - DAAL_CHECK_STATUS_VAR(ntOutCentroids->getBlockOfRows(0, nClusters, readWrite, outCentroidsRows)); - auto outCentroids = outCentroidsRows.getBuffer(); - - BlockDescriptor objFunctionRows; - DAAL_CHECK_STATUS_VAR(ntObjFunction->getBlockOfRows(0, 1, readWrite, objFunctionRows)); - auto objFunction = objFunctionRows.getBuffer(); - - math::SumReducer::Result dataSums(context, blockSize, TypeIds::id(), st); - DAAL_CHECK_STATUS_VAR(st); - math::SumReducer::Result centroidsSums(context, blockSize, TypeIds::id(), st); - DAAL_CHECK_STATUS_VAR(st); - - algorithmFPType prevObjFunction = (algorithmFPType)0.0; - - uint32_t iter = 0; - uint32_t nBlocks = nRows / blockSize + int32_t(nRows % blockSize != 0); - - for (; iter < nIter; iter++) - { - bool needCandidates = true; - for (uint32_t block = 0; block < nBlocks; block++) - { - auto range = Range::createFromBlock(block, blockSize, nRows); - - BlockDescriptor dataRows; - DAAL_CHECK_STATUS_VAR(ntData->getBlockOfRows(range.startIndex, range.count, readOnly, dataRows)); - auto data = dataRows.getBuffer(); - BlockDescriptor assignmentsRows; - DAAL_CHECK_STATUS_VAR(ntAssignments->getBlockOfRows(range.startIndex, range.count, writeOnly, assignmentsRows)); - auto assignments = assignmentsRows.getBuffer(); - DAAL_CHECK_STATUS_VAR(this->computeSquares(inCentroids, centroidsSums, this->_centroidsSq, nClusters, nFeatures)); - DAAL_CHECK_STATUS_VAR(this->computeDistances(data, inCentroids, range.count, nClusters, nFeatures)); - DAAL_CHECK_STATUS_VAR(this->computeAssignments(assignments, range.count, nClusters)); - DAAL_CHECK_STATUS_VAR(this->computeSquares(data, dataSums, this->_dataSq, range.count, nFeatures)); - DAAL_CHECK_STATUS_VAR(this->partialReduceCentroids(data, assignments, range.count, nClusters, nFeatures, int(block == 0))); - if (needCandidates) - { - DAAL_CHECK_STATUS_VAR(this->getNumEmptyClusters(nClusters)); - DAAL_CHECK_STATUS_VAR(st); - int numEmpty = 0; - 
{ - DAAL_ASSERT_UNIVERSAL_BUFFER(this->_numEmptyClusters, int, 1); - auto num = this->_numEmptyClusters.template get().toHost(ReadWriteMode::readOnly, st); - DAAL_CHECK_STATUS_VAR(st); - numEmpty = *num.get(); - } - bool hasEmptyClusters = numEmpty > 0; - if (hasEmptyClusters) - { - DAAL_CHECK_STATUS_VAR(this->computePartialCandidates(assignments, range.count, nClusters, int(block == 0))); - DAAL_CHECK_STATUS_VAR(this->mergePartialCandidates(nClusters)); - } - needCandidates = hasEmptyClusters; - } - DAAL_CHECK_STATUS_VAR(this->updateObjectiveFunction(objFunction, range.count, nClusters, int(block == 0))); - DAAL_CHECK_STATUS_VAR(ntData->releaseBlockOfRows(dataRows)); - DAAL_CHECK_STATUS_VAR(ntAssignments->releaseBlockOfRows(assignmentsRows)); - } - - DAAL_CHECK_STATUS_VAR(this->mergeReduceCentroids(outCentroids, nClusters, nFeatures)); - algorithmFPType objFuncCorrection = 0.0; - if (needCandidates) - { - DAAL_CHECK_STATUS_VAR(this->setEmptyClusters(ntData, nRows, nClusters, nFeatures, outCentroids, objFuncCorrection)); - } - algorithmFPType curObjFunction = (algorithmFPType)0.0; - { - DAAL_ASSERT(objFunction.size() >= 1); - auto hostPtr = objFunction.toHost(data_management::readOnly, st); - DAAL_CHECK_STATUS_VAR(st); - curObjFunction = *hostPtr; - curObjFunction -= objFuncCorrection; - } - - if (par->accuracyThreshold > (algorithmFPType)0.0) - { - algorithmFPType objFuncDiff = - curObjFunction - prevObjFunction > 0 ? 
curObjFunction - prevObjFunction : -(curObjFunction - prevObjFunction); - if (objFuncDiff < par->accuracyThreshold) - { - iter++; - break; - } - } - prevObjFunction = curObjFunction; - inCentroids = outCentroids; - } - for (uint32_t block = 0; block < nBlocks; block++) - { - auto range = Range::createFromBlock(block, blockSize, nRows); - - BlockDescriptor dataRows; - DAAL_CHECK_STATUS_VAR(ntData->getBlockOfRows(range.startIndex, range.count, readOnly, dataRows)); - auto data = dataRows.getBuffer(); - - BlockDescriptor assignmentsRows; - DAAL_CHECK_STATUS_VAR(ntAssignments->getBlockOfRows(range.startIndex, range.count, writeOnly, assignmentsRows)); - auto assignments = assignmentsRows.getBuffer(); - - DAAL_CHECK_STATUS_VAR(this->computeSquares(inCentroids, centroidsSums, this->_centroidsSq, nClusters, nFeatures)); - DAAL_CHECK_STATUS_VAR(this->computeDistances(data, inCentroids, range.count, nClusters, nFeatures)); - DAAL_CHECK_STATUS_VAR(this->computeAssignments(assignments, range.count, nClusters)); - DAAL_CHECK_STATUS_VAR(this->computeSquares(data, dataSums, this->_dataSq, range.count, nFeatures)); - DAAL_CHECK_STATUS_VAR(this->updateObjectiveFunction(objFunction, range.count, nClusters, int(block == 0))); - DAAL_CHECK_STATUS_VAR(ntData->releaseBlockOfRows(dataRows)); - DAAL_CHECK_STATUS_VAR(ntAssignments->releaseBlockOfRows(assignmentsRows)); - } - - DAAL_CHECK_STATUS_VAR(ntInCentroids->releaseBlockOfRows(inCentroidsRows)); - DAAL_CHECK_STATUS_VAR(ntOutCentroids->releaseBlockOfRows(outCentroidsRows)); - DAAL_CHECK_STATUS_VAR(ntObjFunction->releaseBlockOfRows(objFunctionRows)); - { - BlockDescriptor nIterationsRows; - DAAL_CHECK_STATUS_VAR(ntNIterations->getBlockOfRows(0, 1, writeOnly, nIterationsRows)); - auto nIterationsHostPtr = nIterationsRows.getBlockSharedPtr(); - *nIterationsHostPtr.get() = iter; - DAAL_CHECK_STATUS_VAR(ntNIterations->releaseBlockOfRows(nIterationsRows)); - } - - return st; -} - -} // namespace internal -} // namespace kmeans -} // 
namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/kmeans/oneapi/kmeans_dense_lloyd_kernel_base_ucapi.h b/cpp/daal/src/algorithms/kmeans/oneapi/kmeans_dense_lloyd_kernel_base_ucapi.h deleted file mode 100644 index e22ec116bbd..00000000000 --- a/cpp/daal/src/algorithms/kmeans/oneapi/kmeans_dense_lloyd_kernel_base_ucapi.h +++ /dev/null @@ -1,125 +0,0 @@ -/* file: kmeans_dense_lloyd_kernel_base_ucapi.h */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of K-means BASE Batch Kernel for GPU. 
-//-- -*/ - -#ifndef __KMEANS_DENSE_LLOYD_KERNEL_BASE_UCAPI_H__ -#define __KMEANS_DENSE_LLOYD_KERNEL_BASE_UCAPI_H__ - -#include "services/internal/sycl/types.h" -#include "services/internal/sycl/execution_context.h" -#include "algorithms/kmeans/kmeans_types.h" -#include "src/algorithms/kernel.h" -#include "data_management/data/numeric_table.h" -#include "src/sycl/reducer.h" - -using namespace daal::data_management; - -namespace daal -{ -namespace algorithms -{ -namespace kmeans -{ -namespace internal -{ -class Range -{ -public: - static Range createFromBlock(uint32_t blockIndex, uint32_t maxBlockSize, uint32_t sumOfBlocksSize) - { - const uint32_t startIndex = blockIndex * maxBlockSize; - const uint32_t endIndex = startIndex + maxBlockSize; - return Range { startIndex, endIndex > sumOfBlocksSize ? sumOfBlocksSize : endIndex }; - } - - uint32_t startIndex; - uint32_t endIndex; - uint32_t count; - -private: - Range(uint32_t startIndex, uint32_t endIndex) : startIndex(startIndex), endIndex(endIndex), count(endIndex - startIndex) {} -}; - -template -class KMeansDenseLloydKernelBaseUCAPI : public Kernel -{ -protected: - services::Status computeSquares(const services::internal::Buffer & data, - daal::services::internal::sycl::math::SumReducer::Result & result, - services::internal::sycl::UniversalBuffer & dataSq, uint32_t nRows, uint32_t nFeatures); - - services::Status computeDistances(const services::internal::Buffer & data, - const services::internal::Buffer & centroids, uint32_t blockSize, uint32_t nClusters, - uint32_t nFeatures); - - services::Status computeAssignments(const services::internal::sycl::UniversalBuffer & assignments, uint32_t blockSize, uint32_t nClusters); - - services::Status computePartialCandidates(const services::internal::sycl::UniversalBuffer & assignments, uint32_t blockSize, uint32_t nClusters, - uint32_t reset); - - services::Status mergePartialCandidates(uint32_t nClusters); - - services::Status partialReduceCentroids(const 
services::internal::Buffer & data, - const services::internal::sycl::UniversalBuffer & assignments, uint32_t blockSize, uint32_t nClusters, - uint32_t nFeatures, uint32_t doReset); - - services::Status mergeReduceCentroids(const services::internal::Buffer & centroids, uint32_t nClusters, uint32_t nFeatures); - - services::Status updateObjectiveFunction(const services::internal::Buffer & objFunction, uint32_t blockSize, uint32_t nClusters, - uint32_t doReset); - services::Status getNumEmptyClusters(uint32_t nClusters); - services::Status buildProgram(services::internal::sycl::ClKernelFactoryIface & kernelFactory, uint32_t nClusters); - services::Status setEmptyClusters(NumericTable * const ntData, uint32_t nRows, uint32_t nClusters, uint32_t nFeatures, - services::internal::Buffer & outCentroids, algorithmFPType & objFuncCorrection); - services::Status initializeBuffers(uint32_t nClusters, uint32_t nFeatures, uint32_t blockSize); - services::Status getBlockSize(uint32_t nRows, uint32_t nClusters, uint32_t nFeatures, uint32_t & blockSize); - services::Status fitPartialCentroidSize(uint32_t nClusters, uint32_t nFeatures); - uint32_t getCandidatePartNum(uint32_t nClusters); - uint32_t getWorkgroupsCount(uint32_t rows); - services::String getBuildOptions(uint32_t nClusters); - - services::internal::sycl::UniversalBuffer _dataSq; - services::internal::sycl::UniversalBuffer _centroidsSq; - services::internal::sycl::UniversalBuffer _distances; - services::internal::sycl::UniversalBuffer _mindistances; - services::internal::sycl::UniversalBuffer _candidates; - services::internal::sycl::UniversalBuffer _candidateDistances; - services::internal::sycl::UniversalBuffer _partialCandidates; - services::internal::sycl::UniversalBuffer _partialCandidateDistances; - services::internal::sycl::UniversalBuffer _partialCentroids; - services::internal::sycl::UniversalBuffer _partialCentroidsCounters; - services::internal::sycl::UniversalBuffer _numEmptyClusters; - - const uint32_t 
_maxWorkItemsPerGroup = 128; // should be a power of two for interal needs - const uint32_t _maxLocalBuffer = 30000; // should be less than a half of local memory (two buffers) - const uint32_t _preferableSubGroup = 16; // preferable maximal sub-group size - uint32_t _nPartialCentroids = 128; // Recommended number of partial centroids - const uint32_t _nValuesInBlock = 1024 * 1024 * 1024 / sizeof(algorithmFPType); // Max block size is 1GB - const uint32_t _nMinRows = 1; // At least a single row should fit into block -}; - -} // namespace internal -} // namespace kmeans -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/kmeans/oneapi/kmeans_dense_lloyd_kernel_base_ucapi_impl.i b/cpp/daal/src/algorithms/kmeans/oneapi/kmeans_dense_lloyd_kernel_base_ucapi_impl.i deleted file mode 100644 index 3a7c4eb38a9..00000000000 --- a/cpp/daal/src/algorithms/kmeans/oneapi/kmeans_dense_lloyd_kernel_base_ucapi_impl.i +++ /dev/null @@ -1,563 +0,0 @@ -/* file: kmeans_dense_lloyd_kernel_base_ucapi_impl.i */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of K-means Base Kernel for GPU. 
-//-- -*/ - -#ifndef __KMEANS_DENSE_LLOYD_KERNEL_BASE_UCAPI_IMPL__ -#define __KMEANS_DENSE_LLOYD_KERNEL_BASE_UCAPI_IMPL__ - -#include "services/env_detect.h" -#include "src/algorithms/kmeans/oneapi/cl_kernels/kmeans_cl_kernels.cl" -#include "services/internal/sycl/execution_context.h" -#include "services/internal/sycl/types.h" -#include "src/services/service_data_utils.h" -#include "src/sycl/blas_gpu.h" - -#include "src/externals/service_profiler.h" - -using namespace daal::services; -using namespace daal::services::internal::sycl; -using namespace daal::data_management; - -constexpr uint32_t maxInt32AsUint32T = static_cast(daal::services::internal::MaxVal::get()); - -namespace daal -{ -namespace algorithms -{ -namespace kmeans -{ -namespace internal -{ -template -Status KMeansDenseLloydKernelBaseUCAPI::initializeBuffers(uint32_t nClusters, uint32_t nFeatures, uint32_t blockSize) -{ - DAAL_ASSERT(_nPartialCentroids <= maxInt32AsUint32T); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(uint32_t, blockSize, nClusters); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(uint32_t, _nPartialCentroids, nClusters); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(uint32_t, _nPartialCentroids * nClusters, nFeatures); - uint32_t nCandidateParts = getCandidatePartNum(nClusters); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(uint32_t, nCandidateParts, nClusters); - Status st; - auto & context = Environment::getInstance()->getDefaultExecutionContext(); - _distances = context.allocate(TypeIds::id(), blockSize * nClusters, st); - DAAL_CHECK_STATUS_VAR(st); - _mindistances = context.allocate(TypeIds::id(), blockSize, st); - DAAL_CHECK_STATUS_VAR(st); - _candidates = context.allocate(TypeIds::id(), nClusters, st); - DAAL_CHECK_STATUS_VAR(st); - _candidateDistances = context.allocate(TypeIds::id(), nClusters, st); - DAAL_CHECK_STATUS_VAR(st); - _partialCandidates = context.allocate(TypeIds::id(), nClusters * nCandidateParts, st); - DAAL_CHECK_STATUS_VAR(st); - _partialCandidateDistances = 
context.allocate(TypeIds::id(), nClusters * nCandidateParts, st); - DAAL_CHECK_STATUS_VAR(st); - _partialCentroids = context.allocate(TypeIds::id(), _nPartialCentroids * nClusters * nFeatures, st); - DAAL_CHECK_STATUS_VAR(st); - _partialCentroidsCounters = context.allocate(TypeIds::id(), _nPartialCentroids * nClusters, st); - DAAL_CHECK_STATUS_VAR(st); - _numEmptyClusters = context.allocate(TypeIds::id(), 1, st); - DAAL_CHECK_STATUS_VAR(st); - return Status(); -} - -template -uint32_t KMeansDenseLloydKernelBaseUCAPI::getCandidatePartNum(uint32_t nClusters) -{ - DAAL_ASSERT(_maxLocalBuffer / nClusters / sizeof(algorithmFPType) > 0); - return _maxLocalBuffer / nClusters / sizeof(algorithmFPType); -} -template -services::String KMeansDenseLloydKernelBaseUCAPI::getBuildOptions(uint32_t nClusters) -{ - uint32_t numParts = getCandidatePartNum(nClusters); - if (numParts > _preferableSubGroup) numParts = _preferableSubGroup; - char buffer[DAAL_MAX_STRING_SIZE]; - services::String buildOptions; - buildOptions.add("-cl-std=CL1.2 -D LOCAL_SUM_SIZE="); - daal::services::daal_int_to_string(buffer, DAAL_MAX_STRING_SIZE, _maxWorkItemsPerGroup); - buildOptions.add(buffer); - buildOptions.add(" -D CND_PART_SIZE="); - daal::services::daal_int_to_string(buffer, DAAL_MAX_STRING_SIZE, nClusters); - buildOptions.add(buffer); - buildOptions.add(" -D NUM_PARTS_CND="); - daal::services::daal_int_to_string(buffer, DAAL_MAX_STRING_SIZE, numParts); - buildOptions.add(buffer); - return buildOptions; -} - -template -uint32_t KMeansDenseLloydKernelBaseUCAPI::getWorkgroupsCount(uint32_t rows) -{ - const uint32_t elementsPerGroup = _maxWorkItemsPerGroup; - uint32_t workgroupsCount = rows / elementsPerGroup; - - if (workgroupsCount * elementsPerGroup < rows) workgroupsCount++; - - return workgroupsCount; -} - -template -Status KMeansDenseLloydKernelBaseUCAPI::computeSquares(const services::internal::Buffer & data, - math::SumReducer::Result & result, UniversalBuffer & dataSq, uint32_t nRows, - 
uint32_t nFeatures) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.computeSquares); - DAAL_ASSERT(data.size() >= nRows * nFeatures); - DAAL_ASSERT(nRows <= maxInt32AsUint32T); - DAAL_ASSERT(nFeatures <= maxInt32AsUint32T); - Status st; - dataSq = math::SumReducer::sum(math::Layout::RowMajor, data, nRows, nFeatures, result, st).sumOfSquares; - return st; -} - -template -Status KMeansDenseLloydKernelBaseUCAPI::getNumEmptyClusters(uint32_t nClusters) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.countEmptyClusters); - Status st; - auto & context = Environment::getInstance()->getDefaultExecutionContext(); - auto & kernelFactory = context.getClKernelFactory(); - DAAL_CHECK_STATUS_VAR(this->buildProgram(kernelFactory, nClusters)); - auto kernel = kernelFactory.getKernel("count_empty_clusters", st); - DAAL_CHECK_STATUS_VAR(st); - - DAAL_ASSERT_UNIVERSAL_BUFFER(_partialCentroidsCounters, int, _nPartialCentroids * nClusters); - DAAL_ASSERT_UNIVERSAL_BUFFER(_numEmptyClusters, int, 1); - DAAL_ASSERT(nClusters <= maxInt32AsUint32T); - - KernelArguments args(4, st); - DAAL_CHECK_STATUS_VAR(st); - args.set(0, _partialCentroidsCounters, AccessModeIds::read); - args.set(1, static_cast(nClusters)); - args.set(2, static_cast(_nPartialCentroids)); - args.set(3, _numEmptyClusters, AccessModeIds::write); - - KernelRange local_range(1, _maxWorkItemsPerGroup); - KernelRange global_range(1, _maxWorkItemsPerGroup); - - KernelNDRange range(2); - range.global(global_range, st); - DAAL_CHECK_STATUS_VAR(st); - range.local(local_range, st); - DAAL_CHECK_STATUS_VAR(st); - context.run(range, kernel, args, st); - return st; -} - -template -Status KMeansDenseLloydKernelBaseUCAPI::computeDistances(const services::internal::Buffer & data, - const services::internal::Buffer & centroids, - uint32_t blockSize, uint32_t nClusters, uint32_t nFeatures) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.computeDistances); - DAAL_ASSERT(data.size() >= blockSize * nFeatures); - DAAL_ASSERT(centroids.size() >= nClusters * nFeatures); 
- Status st = BlasGpu::xgemm(math::Layout::ColMajor, math::Transpose::Trans, math::Transpose::NoTrans, blockSize, nClusters, - nFeatures, algorithmFPType(-1.0), data, nFeatures, 0, centroids, nFeatures, 0, algorithmFPType(0.0), - _distances.get(), blockSize, 0); - return st; -} - -template -Status KMeansDenseLloydKernelBaseUCAPI::computeAssignments(const UniversalBuffer & assignments, uint32_t blockSize, - uint32_t nClusters) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.computeAssignments); - Status st; - auto & context = Environment::getInstance()->getDefaultExecutionContext(); - auto & kernelFactory = context.getClKernelFactory(); - DAAL_CHECK_STATUS_VAR(this->buildProgram(kernelFactory, nClusters)); - auto kernel = kernelFactory.getKernel("reduce_assignments", st); - DAAL_CHECK_STATUS_VAR(st); - - DAAL_ASSERT_UNIVERSAL_BUFFER(_centroidsSq, algorithmFPType, nClusters); - DAAL_ASSERT_UNIVERSAL_BUFFER(_distances, algorithmFPType, blockSize * nClusters); - DAAL_ASSERT_UNIVERSAL_BUFFER(assignments, int, blockSize); - DAAL_ASSERT_UNIVERSAL_BUFFER(_mindistances, algorithmFPType, blockSize); - - DAAL_ASSERT(blockSize <= maxInt32AsUint32T); - DAAL_ASSERT(nClusters <= maxInt32AsUint32T); - - KernelArguments args(7, st); - DAAL_CHECK_STATUS_VAR(st); - args.set(0, _centroidsSq, AccessModeIds::read); - args.set(1, _distances, AccessModeIds::read); - args.set(2, static_cast(blockSize)); - args.set(3, static_cast(nClusters)); - if (TypeIds::id() == TypeIds::float32) - { - args.set(4, FLT_MAX); - } - else - { - args.set(4, DBL_MAX); - } - args.set(5, assignments, AccessModeIds::write); - args.set(6, _mindistances, AccessModeIds::write); - - KernelRange local_range(1, _preferableSubGroup); - KernelRange global_range(blockSize, _preferableSubGroup); - - KernelNDRange range(2); - range.global(global_range, st); - DAAL_CHECK_STATUS_VAR(st); - range.local(local_range, st); - DAAL_CHECK_STATUS_VAR(st); - context.run(range, kernel, args, st); - return st; -} - -template -Status 
KMeansDenseLloydKernelBaseUCAPI::computePartialCandidates(const UniversalBuffer & assignments, uint32_t blockSize, - uint32_t nClusters, uint32_t reset) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.computePartialCandidates); - Status st; - auto & context = Environment::getInstance()->getDefaultExecutionContext(); - auto & kernelFactory = context.getClKernelFactory(); - DAAL_CHECK_STATUS_VAR(this->buildProgram(kernelFactory, nClusters)); - auto kernel = kernelFactory.getKernel("partial_candidates", st); - DAAL_CHECK_STATUS_VAR(st); - - int numParts = getCandidatePartNum(nClusters); - if (numParts > _preferableSubGroup) numParts = _preferableSubGroup; - DAAL_ASSERT_UNIVERSAL_BUFFER(assignments, int, blockSize); - DAAL_ASSERT_UNIVERSAL_BUFFER(_mindistances, algorithmFPType, blockSize); - DAAL_ASSERT_UNIVERSAL_BUFFER(_dataSq, algorithmFPType, blockSize); - DAAL_ASSERT_UNIVERSAL_BUFFER(_candidates, int, nClusters); - DAAL_ASSERT_UNIVERSAL_BUFFER(_candidateDistances, algorithmFPType, nClusters); - DAAL_ASSERT_UNIVERSAL_BUFFER(_partialCandidates, int, nClusters * numParts); - DAAL_ASSERT_UNIVERSAL_BUFFER(_partialCandidateDistances, algorithmFPType, nClusters * numParts); - - DAAL_ASSERT(blockSize <= maxInt32AsUint32T); - DAAL_ASSERT(nClusters <= maxInt32AsUint32T); - DAAL_ASSERT(reset <= maxInt32AsUint32T); - - KernelArguments args(10, st); - DAAL_CHECK_STATUS_VAR(st); - args.set(0, assignments, AccessModeIds::read); - args.set(1, _mindistances, AccessModeIds::read); - args.set(2, _dataSq, AccessModeIds::read); - args.set(3, _candidates, AccessModeIds::read); - args.set(4, _candidateDistances, AccessModeIds::read); - args.set(5, _partialCandidates, AccessModeIds::write); - args.set(6, _partialCandidateDistances, AccessModeIds::write); - args.set(7, static_cast(blockSize)); - args.set(8, static_cast(nClusters)); - args.set(9, static_cast(reset)); - - KernelRange local_range(1, _preferableSubGroup); - KernelRange global_range(numParts, _preferableSubGroup); - - KernelNDRange 
range(2); - range.global(global_range, st); - DAAL_CHECK_STATUS_VAR(st); - range.local(local_range, st); - DAAL_CHECK_STATUS_VAR(st); - context.run(range, kernel, args, st); - return st; -} - -template -Status KMeansDenseLloydKernelBaseUCAPI::mergePartialCandidates(uint32_t nClusters) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.mergePartialCandidates); - Status st; - auto & context = Environment::getInstance()->getDefaultExecutionContext(); - auto & kernelFactory = context.getClKernelFactory(); - DAAL_CHECK_STATUS_VAR(this->buildProgram(kernelFactory, nClusters)); - auto kernel = kernelFactory.getKernel("merge_candidates", st); - DAAL_CHECK_STATUS_VAR(st); - - int numParts = getCandidatePartNum(nClusters); - if (numParts > _preferableSubGroup) numParts = _preferableSubGroup; - DAAL_ASSERT_UNIVERSAL_BUFFER(_candidates, int, nClusters); - DAAL_ASSERT_UNIVERSAL_BUFFER(_candidateDistances, algorithmFPType, nClusters); - DAAL_ASSERT_UNIVERSAL_BUFFER(_partialCandidates, int, numParts * nClusters); - DAAL_ASSERT_UNIVERSAL_BUFFER(_partialCandidateDistances, algorithmFPType, numParts * nClusters); - - DAAL_ASSERT(nClusters <= maxInt32AsUint32T); - - KernelArguments args(5, st); - DAAL_CHECK_STATUS_VAR(st); - args.set(0, _candidates, AccessModeIds::write); - args.set(1, _candidateDistances, AccessModeIds::write); - args.set(2, _partialCandidates, AccessModeIds::read); - args.set(3, _partialCandidateDistances, AccessModeIds::read); - args.set(4, static_cast(nClusters)); - - KernelRange local_range(1, numParts); - KernelRange global_range(1, numParts); - - KernelNDRange range(2); - range.global(global_range, st); - DAAL_CHECK_STATUS_VAR(st); - range.local(local_range, st); - DAAL_CHECK_STATUS_VAR(st); - context.run(range, kernel, args, st); - DAAL_CHECK_STATUS_VAR(st); - return st; -} - -template -Status KMeansDenseLloydKernelBaseUCAPI::partialReduceCentroids(const services::internal::Buffer & data, - const UniversalBuffer & assignments, uint32_t blockSize, - uint32_t nClusters, 
uint32_t nFeatures, uint32_t doReset) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.partialReduceCentroids); - Status st; - auto & context = Environment::getInstance()->getDefaultExecutionContext(); - auto & kernelFactory = context.getClKernelFactory(); - DAAL_CHECK_STATUS_VAR(this->buildProgram(kernelFactory, nClusters)); - auto kernel = kernelFactory.getKernel("partial_reduce_centroids", st); - DAAL_CHECK_STATUS_VAR(st); - - DAAL_ASSERT(data.size() >= blockSize * nFeatures); - DAAL_ASSERT_UNIVERSAL_BUFFER(_distances, algorithmFPType, blockSize * nClusters); - DAAL_ASSERT_UNIVERSAL_BUFFER(assignments, int, blockSize); - DAAL_ASSERT_UNIVERSAL_BUFFER(_partialCentroids, algorithmFPType, _nPartialCentroids * nClusters * nFeatures); - DAAL_ASSERT_UNIVERSAL_BUFFER(_partialCentroidsCounters, int, _nPartialCentroids * nClusters); - - DAAL_ASSERT(blockSize <= maxInt32AsUint32T); - DAAL_ASSERT(nClusters <= maxInt32AsUint32T); - DAAL_ASSERT(nFeatures <= maxInt32AsUint32T); - DAAL_ASSERT(doReset <= maxInt32AsUint32T); - - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(uint32_t, _nPartialCentroids * nClusters, nFeatures); - - KernelArguments args(9, st); - DAAL_CHECK_STATUS_VAR(st); - args.set(0, data, AccessModeIds::read); - args.set(1, _distances, AccessModeIds::read); - args.set(2, assignments, AccessModeIds::read); - args.set(3, _partialCentroids, AccessModeIds::write); - args.set(4, _partialCentroidsCounters, AccessModeIds::write); - args.set(5, static_cast(blockSize)); - args.set(6, static_cast(nClusters)); - args.set(7, static_cast(nFeatures)); - args.set(8, static_cast(doReset)); - - KernelRange global_range(_nPartialCentroids * nFeatures); - context.run(global_range, kernel, args, st); - return st; -} - -template -Status KMeansDenseLloydKernelBaseUCAPI::mergeReduceCentroids(const services::internal::Buffer & centroids, - uint32_t nClusters, uint32_t nFeatures) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.mergeReduceCentroids); - Status st; - auto & context = 
Environment::getInstance()->getDefaultExecutionContext(); - auto & kernelFactory = context.getClKernelFactory(); - DAAL_CHECK_STATUS_VAR(this->buildProgram(kernelFactory, nClusters)); - auto kernel = kernelFactory.getKernel("merge_reduce_centroids", st); - DAAL_CHECK_STATUS_VAR(st); - - DAAL_ASSERT(centroids.size() >= nClusters * nFeatures); - DAAL_ASSERT_UNIVERSAL_BUFFER(_partialCentroids, algorithmFPType, _nPartialCentroids * nClusters * nFeatures); - DAAL_ASSERT_UNIVERSAL_BUFFER(_partialCentroidsCounters, int, _nPartialCentroids * nClusters); - - DAAL_ASSERT(nClusters <= maxInt32AsUint32T); - DAAL_ASSERT(nFeatures <= maxInt32AsUint32T); - - KernelArguments args(6, st); - DAAL_CHECK_STATUS_VAR(st); - args.set(0, _partialCentroids, AccessModeIds::readwrite); - args.set(1, _partialCentroidsCounters, AccessModeIds::readwrite); - args.set(2, centroids, AccessModeIds::write); - args.set(3, static_cast(nClusters)); - args.set(4, static_cast(nFeatures)); - args.set(5, static_cast(_nPartialCentroids)); - - KernelRange local_range(_nPartialCentroids); - KernelRange global_range(_nPartialCentroids * nClusters); - - KernelNDRange range(1); - range.global(global_range, st); - DAAL_CHECK_STATUS_VAR(st); - range.local(local_range, st); - DAAL_CHECK_STATUS_VAR(st); - context.run(range, kernel, args, st); - return st; -} - -template -Status KMeansDenseLloydKernelBaseUCAPI::updateObjectiveFunction(const services::internal::Buffer & objFunction, - uint32_t blockSize, uint32_t nClusters, uint32_t doReset) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.updateObjectiveFunction); - Status st; - if (doReset) - { - auto hostPtr = objFunction.toHost(data_management::writeOnly, st); - DAAL_CHECK_STATUS_VAR(st); - *hostPtr = 0.0f; - } - - auto & context = Environment::getInstance()->getDefaultExecutionContext(); - auto & kernelFactory = context.getClKernelFactory(); - DAAL_CHECK_STATUS_VAR(this->buildProgram(kernelFactory, nClusters)); - auto kernel = 
kernelFactory.getKernel("update_objective_function", st); - DAAL_CHECK_STATUS_VAR(st); - - DAAL_ASSERT(objFunction.size() >= 1); - DAAL_ASSERT_UNIVERSAL_BUFFER(_dataSq, algorithmFPType, blockSize); - DAAL_ASSERT_UNIVERSAL_BUFFER(_mindistances, algorithmFPType, blockSize); - - DAAL_ASSERT(blockSize <= maxInt32AsUint32T); - DAAL_ASSERT(nClusters <= maxInt32AsUint32T); - - KernelArguments args(5, st); - DAAL_CHECK_STATUS_VAR(st); - args.set(0, _dataSq, AccessModeIds::read); - args.set(1, _mindistances, AccessModeIds::read); - args.set(2, blockSize); - args.set(3, nClusters); - args.set(4, objFunction, AccessModeIds::readwrite); - - KernelRange local_range(_maxWorkItemsPerGroup); - KernelRange global_range(_maxWorkItemsPerGroup); - - KernelNDRange range(1); - range.global(global_range, st); - DAAL_CHECK_STATUS_VAR(st); - range.local(local_range, st); - DAAL_CHECK_STATUS_VAR(st); - context.run(range, kernel, args, st); - return st; -} - -template -Status KMeansDenseLloydKernelBaseUCAPI::buildProgram(ClKernelFactoryIface & kernelFactory, uint32_t nClusters) -{ - auto fptypeName = services::internal::sycl::getKeyFPType(); - auto buildOptions = fptypeName; - buildOptions.add(getBuildOptions(nClusters)); - services::String cachekey("__daal_algorithms_kmeans_lloyd_dense_batch_"); - cachekey.add(buildOptions.c_str()); - - Status st; - { - DAAL_ITTNOTIFY_SCOPED_TASK(compute.buildProgram); - kernelFactory.build(ExecutionTargetIds::device, cachekey.c_str(), kmeans_cl_kernels, buildOptions.c_str(), st); - } - return st; -} - -template -Status KMeansDenseLloydKernelBaseUCAPI::getBlockSize(uint32_t nRows, uint32_t nClusters, uint32_t nFeatures, uint32_t & blockSize) -{ - uint32_t gemmBlockSize = _nValuesInBlock; - while (gemmBlockSize > _nValuesInBlock / nClusters) - { - gemmBlockSize >>= 1; - } - if (gemmBlockSize < _nMinRows) - { - return Status(ErrorKMeansNumberOfClustersIsTooLarge); - } - uint32_t datasetBlockSize = _nValuesInBlock; - while (datasetBlockSize > _nValuesInBlock / 
nFeatures) - { - datasetBlockSize >>= 1; - } - if (datasetBlockSize < _nMinRows) - { - return Status(ErrorIncorrectNumberOfFeatures); - } - - blockSize = datasetBlockSize > gemmBlockSize ? gemmBlockSize : datasetBlockSize; - if (blockSize > nRows) - { - blockSize = nRows; - } - return Status(); -} - -template -Status KMeansDenseLloydKernelBaseUCAPI::fitPartialCentroidSize(uint32_t nClusters, uint32_t nFeatures) -{ - while (_nPartialCentroids * nClusters * nFeatures > _nValuesInBlock) - { - _nPartialCentroids >>= 1; - } - if (_nPartialCentroids < _nMinRows) - { - return Status(ErrorKMeansNumberOfClustersIsTooLarge); - } - return Status(); -} - -template -Status KMeansDenseLloydKernelBaseUCAPI::setEmptyClusters(NumericTable * const ntData, uint32_t nRows, uint32_t nClusters, - uint32_t nFeatures, - services::internal::Buffer & outCentroids, - algorithmFPType & objFuncCorrection) -{ - services::Status status; - DAAL_ASSERT(outCentroids.size() >= nClusters * nFeatures); - DAAL_ASSERT_UNIVERSAL_BUFFER(_partialCentroidsCounters, int, nClusters); - DAAL_ASSERT_UNIVERSAL_BUFFER(_candidates, int, nClusters); - DAAL_ASSERT_UNIVERSAL_BUFFER(_candidateDistances, algorithmFPType, nClusters); - - auto counters = _partialCentroidsCounters.template get().toHost(ReadWriteMode::readOnly, status); - auto candidatesIds = _candidates.template get().toHost(ReadWriteMode::readOnly, status); - auto candidatesDists = _candidateDistances.template get().toHost(ReadWriteMode::readOnly, status); - auto clusterFeatures = outCentroids.toHost(ReadWriteMode::readWrite, status); - DAAL_CHECK_STATUS_VAR(status); - - uint32_t cPos = 0; - for (uint32_t iCl = 0; iCl < nClusters; iCl++) - if (counters.get()[iCl] == 0) - { - if (cPos >= nClusters) - { - continue; - } - int id = candidatesIds.get()[cPos]; - if (id < 0 || id >= nRows) - { - continue; - } - objFuncCorrection += candidatesDists.get()[cPos]; - BlockDescriptor singleRow; - DAAL_CHECK_STATUS_VAR(ntData->getBlockOfRows(0, nRows, readOnly, 
singleRow)); - auto rowData = singleRow.getBlockPtr(); - if (!rowData) - { - return Status(ErrorNullPtr); - } - for (uint32_t iFeature = 0; iFeature < nFeatures; iFeature++) - clusterFeatures.get()[iCl * nFeatures + iFeature] = rowData[id * nFeatures + iFeature]; - cPos++; - DAAL_CHECK_STATUS_VAR(ntData->releaseBlockOfRows(singleRow)); - } - return status; -} - -} // namespace internal -} // namespace kmeans -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/kmeans/oneapi/kmeans_init_dense_batch_kernel_ucapi.h b/cpp/daal/src/algorithms/kmeans/oneapi/kmeans_init_dense_batch_kernel_ucapi.h deleted file mode 100644 index 3db5037a1da..00000000000 --- a/cpp/daal/src/algorithms/kmeans/oneapi/kmeans_init_dense_batch_kernel_ucapi.h +++ /dev/null @@ -1,72 +0,0 @@ -/* file: kmeans_init_dense_batch_kernel_ucapi.h */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Declaration of template function that computes K-means. 
-//-- -*/ - -#ifndef _KMEANS_INIT_DENSE_BATCH_KERNEL_UCAPI_H -#define _KMEANS_INIT_DENSE_BATCH_KERNEL_UCAPI_H - -#include "services/internal/sycl/types.h" -#include "services/internal/sycl/execution_context.h" -#include "algorithms/kmeans/kmeans_init_types.h" -#include "src/algorithms/kernel.h" -#include "data_management/data/numeric_table.h" -#include "data_management/data/memory_block.h" - -using namespace daal::data_management; - -namespace daal -{ -namespace algorithms -{ -namespace kmeans -{ -namespace init -{ -namespace internal -{ -template -class KMeansInitDenseBatchKernelUCAPI : public Kernel -{ -public: - services::Status compute(size_t na, const NumericTable * const * a, size_t nr, const NumericTable * const * r, const Parameter * par, - engines::BatchBase & engine); - -private: - services::Status init(size_t p, size_t n, size_t nRowsTotal, size_t nClusters, NumericTable * ntClusters, NumericTable * ntData, - unsigned int seed, engines::BatchBase & engine, size_t & clustersFound); - - services::Status gatherRandom(const services::internal::Buffer & data, - const services::internal::Buffer & clusters, services::internal::sycl::UniversalBuffer & indices, - uint32_t nRows, uint32_t nClusters, uint32_t nFeatures); - services::Status buildProgram(services::internal::sycl::ClKernelFactoryIface & kernelFactory); - uint32_t getWorkgroupsCount(uint32_t rows); - - const uint32_t _maxWorkItemsPerGroup = 256; // should be a power of two for interal needs -}; - -} // namespace internal -} // namespace init -} // namespace kmeans -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/kmeans/oneapi/kmeans_init_dense_batch_kernel_ucapi_impl.i b/cpp/daal/src/algorithms/kmeans/oneapi/kmeans_init_dense_batch_kernel_ucapi_impl.i deleted file mode 100644 index 3073ffa4d8f..00000000000 --- a/cpp/daal/src/algorithms/kmeans/oneapi/kmeans_init_dense_batch_kernel_ucapi_impl.i +++ /dev/null @@ -1,237 +0,0 @@ -/* file: 
kmeans_init_dense_batch_kernel_ucapi_impl.i */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of Lloyd method for K-means algorithm. -//-- -*/ - -#include "src/algorithms/kmeans/oneapi/cl_kernels/kmeans_init_cl_kernels.cl" -#include "services/internal/sycl/execution_context.h" -#include "services/internal/sycl/types.h" - -#include "data_management/data/numeric_table.h" -#include "services/daal_defines.h" -#include "src/services/service_data_utils.h" -#include "src/externals/service_memory.h" -#include "src/data_management/service_numeric_table.h" -#include "src/algorithms/distributions/uniform/uniform_kernel.h" -#include "src/algorithms/distributions/uniform/uniform_impl.i" - -namespace daal -{ -namespace algorithms -{ -namespace kmeans -{ -namespace init -{ -namespace internal -{ -using namespace daal::internal; -using namespace daal::services::internal; -using namespace daal::services::internal::sycl; -using namespace daal::data_management; -using namespace daal::algorithms::distributions::uniform::internal; - -constexpr uint32_t maxInt32AsUint32T = static_cast(daal::services::internal::MaxVal::get()); -constexpr size_t maxInt32AsSizeT = static_cast(daal::services::internal::MaxVal::get()); - -template -Status 
KMeansInitDenseBatchKernelUCAPI::init(size_t p, size_t n, size_t nRowsTotal, size_t nClusters, - NumericTable * ntClusters, NumericTable * ntData, unsigned int seed, - engines::BatchBase & engine, size_t & clustersFound) -{ - Status st; - - auto & context = Environment::getInstance()->getDefaultExecutionContext(); - - if (method == deterministicDense) - { - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(uint32_t, nClusters, p); - BlockDescriptor dataRows; - DAAL_CHECK_STATUS_VAR(ntData->getBlockOfRows(0, nClusters, readOnly, dataRows)); - auto data = dataRows.getBuffer(); - - BlockDescriptor clustersRows; - DAAL_CHECK_STATUS_VAR(ntClusters->getBlockOfRows(0, nClusters, writeOnly, clustersRows)); - auto clusters = clustersRows.getBuffer(); - - DAAL_ASSERT(clusters.size() >= nClusters * p); - DAAL_ASSERT(data.size() >= nClusters * p); - context.copy(clusters, 0, data, 0, nClusters * p, st); - DAAL_CHECK_STATUS_VAR(st); - - DAAL_CHECK_STATUS_VAR(ntData->releaseBlockOfRows(dataRows)); - DAAL_CHECK_STATUS_VAR(ntClusters->releaseBlockOfRows(clustersRows)); - - clustersFound = nClusters; - - return st; - } - - if (method == randomDense) - { - DAAL_CHECK(nClusters <= maxInt32AsSizeT, services::ErrorIncorrectParameter); - DAAL_CHECK(nRowsTotal <= maxInt32AsSizeT, services::ErrorIncorrectNumberOfColumnsInInputNumericTable); - auto indices = context.allocate(TypeIds::id(), nClusters, st); - DAAL_CHECK_STATUS_VAR(st); - { - DAAL_ASSERT_UNIVERSAL_BUFFER(indices, int, nClusters); - auto indicesHostPtr = indices.get().toHost(data_management::readWrite, st); - DAAL_CHECK_STATUS_VAR(st); - auto * indicesHost = indicesHostPtr.get(); - - uint32_t k = 0; - Status s; - for (uint32_t i = 0; i < nClusters; i++) - { - DAAL_CHECK_STATUS( - s, (UniformKernelDefault::compute(i, static_cast(nRowsTotal), engine, 1, indicesHost + k))); - uint32_t c = (size_t)indicesHost[k]; - int & value = indicesHost[k]; - for (uint32_t j = k; j > 0; j--) - { - if (value == indicesHost[j - 1]) - { - c = (uint32_t)(j - 
1); - value = c; - } - } - if (c >= n) continue; - k++; - } - - clustersFound = k; - } - - BlockDescriptor dataRows; - DAAL_CHECK_STATUS_VAR(ntData->getBlockOfRows(0, nRowsTotal, readOnly, dataRows)); - auto data = dataRows.getBuffer(); - - BlockDescriptor clustersRows; - DAAL_CHECK_STATUS_VAR(ntClusters->getBlockOfRows(0, clustersFound, writeOnly, clustersRows)); - auto clusters = clustersRows.getBuffer(); - - DAAL_CHECK_STATUS_VAR(gatherRandom(data, clusters, indices, nRowsTotal, clustersFound, p)); - - DAAL_CHECK_STATUS_VAR(ntData->releaseBlockOfRows(dataRows)); - DAAL_CHECK_STATUS_VAR(ntClusters->releaseBlockOfRows(clustersRows)); - - return st; - } - - DAAL_ASSERT(false && "should never happen"); - return Status(); -} - -template -services::Status KMeansInitDenseBatchKernelUCAPI::compute(size_t na, const NumericTable * const * a, size_t nr, - const NumericTable * const * r, const Parameter * par, - engines::BatchBase & engine) -{ - NumericTable * ntData = const_cast(a[0]); - NumericTable * ntClusters = const_cast(r[0]); - - const size_t p = ntData->getNumberOfColumns(); - const size_t n = ntData->getNumberOfRows(); - const size_t nClusters = par->nClusters; - - size_t clustersFound = 0; - - return init(p, n, n, nClusters, ntClusters, ntData, par->seed, engine, clustersFound); -} - -template -uint32_t KMeansInitDenseBatchKernelUCAPI::getWorkgroupsCount(uint32_t rows) -{ - const uint32_t elementsPerGroup = _maxWorkItemsPerGroup; - uint32_t workgroupsCount = rows / elementsPerGroup; - - if (workgroupsCount * elementsPerGroup < rows) workgroupsCount++; - - return workgroupsCount; -} - -template -Status KMeansInitDenseBatchKernelUCAPI::gatherRandom(const services::internal::Buffer & data, - const services::internal::Buffer & clusters, - UniversalBuffer & indices, uint32_t nRows, uint32_t nClusters, - uint32_t nFeatures) -{ - Status st; - auto & context = Environment::getInstance()->getDefaultExecutionContext(); - auto & kernelFactory = context.getClKernelFactory(); 
- DAAL_CHECK_STATUS_VAR(buildProgram(kernelFactory)); - - auto kernel = kernelFactory.getKernel("gather_random", st); - DAAL_CHECK_STATUS_VAR(st); - - DAAL_ASSERT(nRows <= maxInt32AsUint32T); - DAAL_ASSERT(nClusters <= maxInt32AsUint32T); - DAAL_ASSERT(nFeatures <= maxInt32AsUint32T); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(uint32_t, nClusters, nFeatures); - DAAL_ASSERT(data.size() >= nRows); - DAAL_ASSERT(clusters.size() >= nClusters * nFeatures); - DAAL_ASSERT_UNIVERSAL_BUFFER(indices, int, nClusters); - - KernelArguments args(6, st); - DAAL_CHECK_STATUS_VAR(st); - args.set(0, data, AccessModeIds::read); - args.set(1, clusters, AccessModeIds::write); - args.set(2, indices, AccessModeIds::read); - args.set(3, static_cast(nRows)); - args.set(4, static_cast(nClusters)); - args.set(5, static_cast(nFeatures)); - - KernelRange local_range(1, _maxWorkItemsPerGroup); - KernelRange global_range(nClusters, _maxWorkItemsPerGroup); - - KernelNDRange range(2); - range.global(global_range, st); - DAAL_CHECK_STATUS_VAR(st); - range.local(local_range, st); - DAAL_CHECK_STATUS_VAR(st); - - { - context.run(range, kernel, args, st); - } - return st; -} - -template -Status KMeansInitDenseBatchKernelUCAPI::buildProgram(ClKernelFactoryIface & kernelFactory) -{ - auto fptypeName = services::internal::sycl::getKeyFPType(); - auto buildOptions = fptypeName; - buildOptions.add("-cl-std=CL1.2 -D LOCAL_SUM_SIZE=256"); // should be not less than _maxWorkitemsPerGroup - - services::String cachekey("__daal_algorithms_kmeans_init_dense_batch_"); - cachekey.add(fptypeName); - cachekey.add(buildOptions); - - Status st; - kernelFactory.build(ExecutionTargetIds::device, cachekey.c_str(), kmeans_init_cl_kernels, buildOptions.c_str(), st); - return st; -} - -} // namespace internal -} // namespace init -} // namespace kmeans -} // namespace algorithms -} // namespace daal diff --git a/cpp/daal/src/algorithms/kmeans/oneapi/kmeans_lloyd_distr_step1_kernel_ucapi.h 
b/cpp/daal/src/algorithms/kmeans/oneapi/kmeans_lloyd_distr_step1_kernel_ucapi.h deleted file mode 100644 index b2202c6b23b..00000000000 --- a/cpp/daal/src/algorithms/kmeans/oneapi/kmeans_lloyd_distr_step1_kernel_ucapi.h +++ /dev/null @@ -1,52 +0,0 @@ -/* file: kmeans_lloyd_distr_step1_kernel_ucapi.h */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of K-means Distr Step1 Kernel for GPU. 
-//-- -*/ - -#ifndef __KMEANS_LLOYD_DISTR_STEP1_KERNEL_UCAPI_H__ -#define __KMEANS_LLOYD_DISTR_STEP1_KERNEL_UCAPI_H__ - -#include "src/algorithms/kmeans/oneapi/kmeans_dense_lloyd_kernel_base_ucapi.h" - -using namespace daal::data_management; - -namespace daal -{ -namespace algorithms -{ -namespace kmeans -{ -namespace internal -{ -template -class KMeansDistributedStep1KernelUCAPI : public KMeansDenseLloydKernelBaseUCAPI -{ -public: - services::Status compute(size_t na, const NumericTable * const * a, size_t nr, const NumericTable * const * r, const Parameter * par); - services::Status finalizeCompute(size_t na, const NumericTable * const * a, size_t nr, const NumericTable * const * r, const Parameter * par); -}; - -} // namespace internal -} // namespace kmeans -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/kmeans/oneapi/kmeans_lloyd_distr_step1_ucapi_impl.i b/cpp/daal/src/algorithms/kmeans/oneapi/kmeans_lloyd_distr_step1_ucapi_impl.i deleted file mode 100644 index 039e5e9712d..00000000000 --- a/cpp/daal/src/algorithms/kmeans/oneapi/kmeans_lloyd_distr_step1_ucapi_impl.i +++ /dev/null @@ -1,246 +0,0 @@ -/* file: kmeans_lloyd_distr_step1_ucapi_impl.i */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -/* -//++ -// Implementation of Lloyd method for K-means algorithm. -//-- -*/ - -#include "services/env_detect.h" -#include "services/internal/sycl/execution_context.h" -#include "services/internal/sycl/types.h" -#include "src/services/service_data_utils.h" -#include "src/sycl/blas_gpu.h" -#include "src/algorithms/kmeans/oneapi/kmeans_lloyd_distr_step1_kernel_ucapi.h" - -#include "src/externals/service_profiler.h" - -constexpr size_t maxInt32AsSizeT = static_cast(daal::services::internal::MaxVal::get()); - -using namespace daal::internal; -using namespace daal::services::internal; -using namespace daal::services; -using namespace daal::services::internal::sycl; -using namespace daal::data_management; - -namespace daal -{ -namespace algorithms -{ -namespace kmeans -{ -namespace internal -{ -using namespace daal::services::internal::sycl; -template -Status KMeansDistributedStep1KernelUCAPI::compute(size_t na, const NumericTable * const * a, size_t nr, - const NumericTable * const * r, const Parameter * par) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute); - - Status st; - - auto & context = Environment::getInstance()->getDefaultExecutionContext(); - auto & kernelFactory = context.getClKernelFactory(); - - NumericTable * ntData = const_cast(a[0]); - NumericTable * ntInCentroids = const_cast(a[1]); - NumericTable * ntClusterS0 = const_cast(r[0]); - NumericTable * ntClusterS1 = const_cast(r[1]); - NumericTable * ntObjFunction = const_cast(r[2]); - NumericTable * ntCValues = const_cast(r[3]); - NumericTable * ntCCentroids = const_cast(r[4]); - NumericTable * ntAssignments = const_cast(r[5]); - - const size_t nDataRowsAsSizeT = ntData->getNumberOfRows(); - const size_t nDataColumnsAsSizeT = ntData->getNumberOfColumns(); - DAAL_CHECK(nDataRowsAsSizeT <= maxInt32AsSizeT, services::ErrorIncorrectNumberOfRowsInInputNumericTable); - DAAL_CHECK(nDataColumnsAsSizeT <= maxInt32AsSizeT, 
services::ErrorIncorrectNumberOfColumnsInInputNumericTable); - const uint32_t nRows = static_cast(nDataRowsAsSizeT); - const uint32_t nFeatures = static_cast(nDataColumnsAsSizeT); - - const size_t nClustersAsSizeT = par->nClusters; - DAAL_CHECK(nClustersAsSizeT <= maxInt32AsSizeT, services::ErrorIncorrectParameter); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(uint32_t, nClustersAsSizeT, nDataColumnsAsSizeT); - const uint32_t nClusters = static_cast(nClustersAsSizeT); - - DAAL_ASSERT(ntObjFunction->getNumberOfRows() == 1 && ntObjFunction->getNumberOfColumns() == 1); - - uint32_t blockSize = 0; - DAAL_CHECK_STATUS_VAR(this->getBlockSize(nRows, nClusters, nFeatures, blockSize)); - DAAL_CHECK_STATUS_VAR(this->fitPartialCentroidSize(nClusters, nFeatures)); - DAAL_CHECK_STATUS_VAR(this->initializeBuffers(nClusters, nFeatures, blockSize)); - DAAL_ASSERT_UNIVERSAL_BUFFER(this->_numEmptyClusters, int, 1); - - BlockDescriptor inCentroidsRows; - DAAL_CHECK_STATUS_VAR(ntInCentroids->getBlockOfRows(0, nClusters, readOnly, inCentroidsRows)); - auto inCentroids = inCentroidsRows.getBuffer(); - - BlockDescriptor ntClusterS0Rows; - DAAL_CHECK_STATUS_VAR(ntClusterS0->getBlockOfRows(0, nClusters, writeOnly, ntClusterS0Rows)); - auto outCCounters = ntClusterS0Rows.getBuffer(); - - BlockDescriptor ntClusterS1Rows; - DAAL_CHECK_STATUS_VAR(ntClusterS1->getBlockOfRows(0, nClusters, writeOnly, ntClusterS1Rows)); - auto outCentroids = ntClusterS1Rows.getBuffer(); - - BlockDescriptor ntObjFunctionRows; - DAAL_CHECK_STATUS_VAR(ntObjFunction->getBlockOfRows(0, nClusters, writeOnly, ntObjFunctionRows)); - auto outObjFunction = ntObjFunctionRows.getBuffer(); - - BlockDescriptor ntCValuesRows; - DAAL_CHECK_STATUS_VAR(ntCValues->getBlockOfRows(0, nClusters, writeOnly, ntCValuesRows)); - auto outCValues = UniversalBuffer(ntCValuesRows.getBuffer()); - - BlockDescriptor ntCCentroidsRows; - DAAL_CHECK_STATUS_VAR(ntCCentroids->getBlockOfRows(0, nClusters, writeOnly, ntCCentroidsRows)); - auto outCCentroids 
= UniversalBuffer(ntCCentroidsRows.getBuffer()); - - DAAL_ASSERT_UNIVERSAL_BUFFER(outCValues, algorithmFPType, nClusters); - context.fill(outCValues, sizeof(algorithmFPType) == 4 ? FLT_MAX : DBL_MAX, st); - DAAL_CHECK_STATUS_VAR(st); - - DAAL_CHECK_STATUS_VAR(this->buildProgram(kernelFactory, nClusters)); - - auto assignments = context.allocate(TypeIds::id(), blockSize, st); - DAAL_CHECK_STATUS_VAR(st); - - math::SumReducer::Result dataSums(context, blockSize, TypeIds::id(), st); - DAAL_CHECK_STATUS_VAR(st); - math::SumReducer::Result centroidsSums(context, blockSize, TypeIds::id(), st); - DAAL_CHECK_STATUS_VAR(st); - - size_t nPartNum = this->getCandidatePartNum(nClusters); - size_t nBlocks = nRows / blockSize + int(nRows % blockSize != 0); - - bool needCandidates = true; - for (size_t block = 0; block < nBlocks; block++) - { - auto range = Range::createFromBlock(block, blockSize, nRows); - BlockDescriptor dataRows; - DAAL_CHECK_STATUS_VAR(ntData->getBlockOfRows(range.startIndex, range.count, readOnly, dataRows)); - auto data = dataRows.getBuffer(); - DAAL_CHECK_STATUS_VAR(this->computeSquares(inCentroids, centroidsSums, this->_centroidsSq, nClusters, nFeatures)); - DAAL_CHECK_STATUS_VAR(this->computeDistances(data, inCentroids, range.count, nClusters, nFeatures)); - DAAL_CHECK_STATUS_VAR(this->computeAssignments(assignments, range.count, nClusters)); - DAAL_CHECK_STATUS_VAR(this->computeSquares(data, dataSums, this->_dataSq, range.count, nFeatures)); - DAAL_CHECK_STATUS_VAR(this->partialReduceCentroids(data, assignments, range.count, nClusters, nFeatures, int(block == 0))); - if (needCandidates) - { - DAAL_CHECK_STATUS_VAR(this->getNumEmptyClusters(nClusters)); - DAAL_CHECK_STATUS_VAR(st); - int numEmpty = 0; - { - DAAL_ASSERT_UNIVERSAL_BUFFER(this->_numEmptyClusters, int, 1); - auto num = this->_numEmptyClusters.template get().toHost(ReadWriteMode::readOnly, st); - DAAL_CHECK_STATUS_VAR(st); - numEmpty = num.get()[0]; - } - bool hasEmptyClusters = numEmpty > 0; 
- if (hasEmptyClusters) - { - DAAL_CHECK_STATUS_VAR(this->computePartialCandidates(assignments, range.count, nClusters, int(block == 0))); - DAAL_CHECK_STATUS_VAR(this->mergePartialCandidates(nClusters)); - } - needCandidates = hasEmptyClusters; - } - DAAL_CHECK_STATUS_VAR(this->updateObjectiveFunction(outObjFunction, range.count, nClusters, int(block == 0))); - DAAL_CHECK_STATUS_VAR(ntData->releaseBlockOfRows(dataRows)); - if (par->assignFlag) - { - BlockDescriptor assignmentsRows; - DAAL_CHECK_STATUS_VAR(ntAssignments->getBlockOfRows(0, nRows, writeOnly, assignmentsRows)); - auto finalAssignments = assignmentsRows.getBuffer(); - DAAL_ASSERT(finalAssignments.size() >= range.startIndex + range.count); - DAAL_ASSERT_UNIVERSAL_BUFFER(assignments, int, range.count); - context.copy(finalAssignments, range.startIndex, assignments, 0, range.count, st); - DAAL_CHECK_STATUS_VAR(st); - DAAL_CHECK_STATUS_VAR(ntAssignments->releaseBlockOfRows(assignmentsRows)); - } - } - DAAL_CHECK_STATUS_VAR(this->mergeReduceCentroids(outCentroids, nClusters, nFeatures)); - DAAL_ASSERT(outCCounters.size() >= nClusters); - DAAL_ASSERT_UNIVERSAL_BUFFER(this->_partialCentroidsCounters, int, nClusters); - context.copy(outCCounters, 0, this->_partialCentroidsCounters, 0, nClusters, st); - DAAL_CHECK_STATUS_VAR(st); - DAAL_CHECK_STATUS_VAR(ntInCentroids->releaseBlockOfRows(inCentroidsRows)); - DAAL_CHECK_STATUS_VAR(ntClusterS0->releaseBlockOfRows(ntClusterS0Rows)); - DAAL_CHECK_STATUS_VAR(ntClusterS1->releaseBlockOfRows(ntClusterS1Rows)); - DAAL_CHECK_STATUS_VAR(ntObjFunction->releaseBlockOfRows(ntObjFunctionRows)); - if (needCandidates) - { - DAAL_ASSERT_UNIVERSAL_BUFFER(outCValues, algorithmFPType, nClusters); - DAAL_ASSERT_UNIVERSAL_BUFFER(this->_candidateDistances, algorithmFPType, nClusters); - context.copy(outCValues, 0, this->_candidateDistances, 0, nClusters, st); - } - DAAL_CHECK_STATUS_VAR(st); - DAAL_CHECK_STATUS_VAR(ntCValues->releaseBlockOfRows(ntCValuesRows)); - if (needCandidates) - 
{ - DAAL_ASSERT_UNIVERSAL_BUFFER(this->_candidates, int, nClusters); - auto hostCandidates = this->_candidates.template get().toHost(ReadWriteMode::readOnly, st); - DAAL_CHECK_STATUS_VAR(st); - for (uint32_t cPos = 0; cPos < nClusters; cPos++) - { - int index = hostCandidates.get()[cPos]; - if (index < 0 || index >= nRows) - { - continue; - } - BlockDescriptor dataRows; - DAAL_CHECK_STATUS_VAR(ntData->getBlockOfRows(index, 1, readOnly, dataRows)); - DAAL_ASSERT_UNIVERSAL_BUFFER(outCCentroids, algorithmFPType, cPos * nFeatures + nFeatures); - DAAL_ASSERT(dataRows.getBuffer().size() >= nFeatures); - context.copy(outCCentroids, cPos * nFeatures, dataRows.getBuffer(), 0, nFeatures, st); - DAAL_CHECK_STATUS_VAR(st); - } - } - DAAL_CHECK_STATUS_VAR(ntCCentroids->releaseBlockOfRows(ntCCentroidsRows)); - return st; -} - -template -Status KMeansDistributedStep1KernelUCAPI::finalizeCompute(size_t na, const NumericTable * const * a, size_t nr, - const NumericTable * const * r, const Parameter * par) -{ - if (!par->assignFlag) return Status(); - - NumericTable * ntPartialAssignments = const_cast(a[0]); - NumericTable * ntAssignments = const_cast(r[0]); - const size_t n = ntPartialAssignments->getNumberOfRows(); - - BlockDescriptor inBlock; - DAAL_CHECK_STATUS_VAR(ntPartialAssignments->getBlockOfRows(0, n, readOnly, inBlock)); - - BlockDescriptor outBlock; - DAAL_CHECK_STATUS_VAR(ntAssignments->getBlockOfRows(0, n, writeOnly, outBlock)); - - auto & context = Environment::getInstance()->getDefaultExecutionContext(); - Status status; - DAAL_ASSERT(outBlock.getBuffer().size() >= n); - DAAL_ASSERT(inBlock.getBuffer().size() >= n); - context.copy(outBlock.getBuffer(), 0, inBlock.getBuffer(), 0, n, status); - DAAL_CHECK_STATUS_VAR(ntPartialAssignments->releaseBlockOfRows(inBlock)); - DAAL_CHECK_STATUS_VAR(ntAssignments->releaseBlockOfRows(outBlock)); - return status; -} - -} // namespace internal -} // namespace kmeans -} // namespace algorithms -} // namespace daal diff --git 
a/cpp/daal/src/algorithms/kmeans/oneapi/kmeans_lloyd_distr_step2_kernel_ucapi.h b/cpp/daal/src/algorithms/kmeans/oneapi/kmeans_lloyd_distr_step2_kernel_ucapi.h deleted file mode 100644 index 4a8120dccae..00000000000 --- a/cpp/daal/src/algorithms/kmeans/oneapi/kmeans_lloyd_distr_step2_kernel_ucapi.h +++ /dev/null @@ -1,63 +0,0 @@ -/* file: kmeans_lloyd_distr_step2_kernel_ucapi.h */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of K-means Distr Step1 Kernel for GPU. 
-//-- -*/ - -#ifndef __KMEANS_LLOYD_DISTR_STEP2_KERNEL_UCAPI_H__ -#define __KMEANS_LLOYD_DISTR_STEP2_KERNEL_UCAPI_H__ - -#include "src/algorithms/kmeans/oneapi/kmeans_dense_lloyd_kernel_base_ucapi.h" - -using namespace daal::data_management; - -namespace daal -{ -namespace algorithms -{ -namespace kmeans -{ -namespace internal -{ -template -class KMeansDistributedStep2KernelUCAPI : public Kernel -{ -public: - services::Status compute(size_t na, const NumericTable * const * a, size_t nr, const NumericTable * const * r, const Parameter * par); - services::Status finalizeCompute(size_t na, const NumericTable * const * a, size_t nr, const NumericTable * const * r, const Parameter * par); - services::Status updateClusters(bool init, const services::internal::Buffer & partialCentroidsCounters, - const services::internal::Buffer & partialCentroids, - const services::internal::Buffer & centroidCounters, - const services::internal::Buffer & centroids, uint32_t nClusters, uint32_t nFeatures); - - services::Status updateCandidates(bool init, const services::internal::Buffer & partialCandidates, - const services::internal::Buffer & partialCValues, - const services::internal::Buffer & candidates, const services::internal::Buffer & cValues, - uint32_t nClusters); - services::Status buildProgram(services::internal::sycl::ClKernelFactoryIface & kernelFactory); - uint32_t _maxWGSize = 256; -}; - -} // namespace internal -} // namespace kmeans -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/kmeans/oneapi/kmeans_lloyd_distr_step2_ucapi_impl.i b/cpp/daal/src/algorithms/kmeans/oneapi/kmeans_lloyd_distr_step2_ucapi_impl.i deleted file mode 100644 index 64007c60422..00000000000 --- a/cpp/daal/src/algorithms/kmeans/oneapi/kmeans_lloyd_distr_step2_ucapi_impl.i +++ /dev/null @@ -1,341 +0,0 @@ -/* file: kmeans_lloyd_distr_step2_ucapi_impl.i */ -/******************************************************************************* -* Copyright 2020 
Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of Lloyd method for K-means algorithm. -//-- -*/ - -#include "services/env_detect.h" -#include "services/internal/sycl/execution_context.h" -#include "services/internal/sycl/types.h" -#include "src/services/service_data_utils.h" -#include "src/sycl/blas_gpu.h" -#include "src/algorithms/kmeans/oneapi/kmeans_lloyd_distr_step2_kernel_ucapi.h" -#include "src/algorithms/kmeans/oneapi/cl_kernels/kmeans_cl_kernels_distr_steps.cl" -#include "src/data_management/service_numeric_table.h" - -#include "src/externals/service_profiler.h" -//#include - -using namespace daal::internal; -using namespace daal::services::internal; -using namespace daal::services; -using namespace daal::services::internal::sycl; -using namespace daal::data_management; - -constexpr size_t maxInt32AsSizeT = static_cast(daal::services::internal::MaxVal::get()); -constexpr uint32_t maxInt32AsUint32T = static_cast(daal::services::internal::MaxVal::get()); - -namespace daal -{ -namespace algorithms -{ -namespace kmeans -{ -namespace internal -{ -template -Status KMeansDistributedStep2KernelUCAPI::compute(size_t na, const NumericTable * const * a, size_t nr, - const NumericTable * const * r, const Parameter * par) -{ - Status st; - - auto & context = Environment::getInstance()->getDefaultExecutionContext(); - auto & kernelFactory = 
context.getClKernelFactory(); - DAAL_CHECK_STATUS_VAR(this->buildProgram(kernelFactory)); - - const size_t nClustersAsSizeT = par->nClusters; - const size_t nDataColumnsAsSizeT = r[1]->getNumberOfColumns(); - DAAL_CHECK(nClustersAsSizeT <= maxInt32AsSizeT, services::ErrorIncorrectParameter); - DAAL_CHECK(nDataColumnsAsSizeT <= maxInt32AsSizeT, services::ErrorIncorrectNumberOfColumnsInInputNumericTable); - DAAL_CHECK(na <= maxInt32AsSizeT, services::ErrorIncorrectParameter); - const uint32_t nClusters = static_cast(nClustersAsSizeT); - const uint32_t nFeatures = static_cast(nDataColumnsAsSizeT); - - NumericTable * ntClusterS0 = const_cast(r[0]); - NumericTable * ntClusterS1 = const_cast(r[1]); - NumericTable * ntObjFunction = const_cast(r[2]); - NumericTable * ntCValues = const_cast(r[3]); - NumericTable * ntCCentroids = const_cast(r[4]); - NumericTable * ntAssignments = const_cast(r[5]); - - BlockDescriptor ntClusterS0Rows; - DAAL_CHECK_STATUS_VAR(ntClusterS0->getBlockOfRows(0, nClusters, writeOnly, ntClusterS0Rows)); - auto outCCounters = ntClusterS0Rows.getBuffer(); - - BlockDescriptor ntClusterS1Rows; - DAAL_CHECK_STATUS_VAR(ntClusterS1->getBlockOfRows(0, nClusters, writeOnly, ntClusterS1Rows)); - auto outCentroids = ntClusterS1Rows.getBuffer(); - - BlockDescriptor ntObjFunctionRows; - DAAL_CHECK_STATUS_VAR(ntObjFunction->getBlockOfRows(0, 1, writeOnly, ntObjFunctionRows)); - - DAAL_ASSERT(ntObjFunctionRows.getBuffer().size() >= 1); - auto outObjFunction = ntObjFunctionRows.getBuffer().toHost(data_management::writeOnly, st); - DAAL_CHECK_STATUS_VAR(st); - - BlockDescriptor ntCValuesRows; - DAAL_CHECK_STATUS_VAR(ntCValues->getBlockOfRows(0, nClusters, writeOnly, ntCValuesRows)); - auto outCValues = ntCValuesRows.getBuffer(); - - BlockDescriptor ntCCentroidsRows; - DAAL_CHECK_STATUS_VAR(ntCCentroids->getBlockOfRows(0, nClusters, writeOnly, ntCCentroidsRows)); - auto outCCentroids = ntCCentroidsRows.getBuffer(); - - const uint32_t nBlocks = static_cast(na) / 5; - - 
algorithmFPType tmpObjValue = 0.0; - - for (uint32_t i = 0; i < nBlocks; i++) - { - BlockDescriptor ntParClusterS0Rows; - DAAL_CHECK_STATUS_VAR(const_cast(a[i * 5 + 0])->getBlockOfRows(0, nClusters, readOnly, ntParClusterS0Rows)); - auto inParClusterS0 = ntParClusterS0Rows.getBuffer(); - - BlockDescriptor ntParClusterS1Rows; - DAAL_CHECK_STATUS_VAR(const_cast(a[i * 5 + 1])->getBlockOfRows(0, nClusters, readOnly, ntParClusterS1Rows)); - auto inParClusterS1 = ntParClusterS1Rows.getBuffer(); - - DAAL_CHECK_STATUS_VAR(updateClusters(i == 0, inParClusterS0, inParClusterS1, outCCounters, outCentroids, nClusters, nFeatures)); - - BlockDescriptor ntParObjFunctionRows; - DAAL_CHECK_STATUS_VAR(const_cast(a[i * 5 + 2])->getBlockOfRows(0, 1, readOnly, ntParObjFunctionRows)); - DAAL_ASSERT(ntParObjFunctionRows.getBuffer().size() > 0); - auto inParObjFunction = ntParObjFunctionRows.getBuffer().toHost(data_management::readOnly, st); - DAAL_CHECK_STATUS_VAR(st); - tmpObjValue += *inParObjFunction.get(); - - BlockDescriptor ntParCValuesRows; - DAAL_CHECK_STATUS_VAR(const_cast(a[i * 5 + 3])->getBlockOfRows(0, nClusters, readOnly, ntParCValuesRows)); - auto inParCValues = ntParCValuesRows.getBuffer(); - - BlockDescriptor ntParCCandidates; - DAAL_CHECK_STATUS_VAR(const_cast(a[i * 5 + 4])->getBlockOfRows(0, nClusters, readOnly, ntParCCandidates)); - auto intParCCandidates = ntParCCandidates.getBuffer(); - - DAAL_CHECK_STATUS_VAR(updateCandidates(i == 0, intParCCandidates, inParCValues, outCCentroids, outCValues, nClusters)); - - DAAL_CHECK_STATUS_VAR(const_cast(a[i * 5 + 0])->releaseBlockOfRows(ntParClusterS0Rows)); - DAAL_CHECK_STATUS_VAR(const_cast(a[i * 5 + 1])->releaseBlockOfRows(ntParClusterS1Rows)); - DAAL_CHECK_STATUS_VAR(const_cast(a[i * 5 + 2])->releaseBlockOfRows(ntParObjFunctionRows)); - DAAL_CHECK_STATUS_VAR(const_cast(a[i * 5 + 3])->releaseBlockOfRows(ntParCValuesRows)); - DAAL_CHECK_STATUS_VAR(const_cast(a[i * 5 + 4])->releaseBlockOfRows(ntParCCandidates)); - } - 
*outObjFunction.get() = tmpObjValue; - { - DAAL_ASSERT(outCentroids.size() >= nFeatures * nClusters); - auto retCentroids = outCentroids.toHost(ReadWriteMode::readWrite, st); - DAAL_CHECK_STATUS_VAR(st); - DAAL_ASSERT(outCCounters.size() >= nClusters); - auto retCCounters = outCCounters.toHost(ReadWriteMode::readOnly, st); - DAAL_CHECK_STATUS_VAR(st); - - for (int j = 0; j < nClusters; j++) - { - int count = retCCounters.get()[j]; - if (!count) continue; - for (int k = 0; k < nFeatures; k++) retCentroids.get()[j * nFeatures + k] *= count; - } - } - DAAL_CHECK_STATUS_VAR(ntClusterS0->releaseBlockOfRows(ntClusterS0Rows)); - DAAL_CHECK_STATUS_VAR(ntClusterS1->releaseBlockOfRows(ntClusterS1Rows)); - DAAL_CHECK_STATUS_VAR(ntObjFunction->releaseBlockOfRows(ntObjFunctionRows)); - DAAL_CHECK_STATUS_VAR(ntCValues->releaseBlockOfRows(ntCValuesRows)); - DAAL_CHECK_STATUS_VAR(ntCCentroids->releaseBlockOfRows(ntCCentroidsRows)); - - return st; -} - -template -Status KMeansDistributedStep2KernelUCAPI::finalizeCompute(size_t na, const NumericTable * const * a, size_t nr, - const NumericTable * const * r, const Parameter * par) -{ - Status st; - const size_t nClustersAsSizeT = par->nClusters; - const size_t nDataColumnsAsSizeT = a[1]->getNumberOfColumns(); - DAAL_CHECK(nClustersAsSizeT <= maxInt32AsSizeT, services::ErrorIncorrectParameter); - DAAL_CHECK(nDataColumnsAsSizeT <= maxInt32AsSizeT, services::ErrorIncorrectNumberOfColumnsInInputNumericTable); - const uint32_t nClusters = static_cast(nClustersAsSizeT); - const uint32_t p = static_cast(nDataColumnsAsSizeT); - int result = 0; - - ReadRows mtInClusterS0(*const_cast(a[0]), 0, nClusters); - DAAL_CHECK_BLOCK_STATUS(mtInClusterS0); - ReadRows mtInClusterS1(*const_cast(a[1]), 0, nClusters); - DAAL_CHECK_BLOCK_STATUS(mtInClusterS1); - ReadRows mtInTargetFunc(*const_cast(a[2]), 0, 1); - DAAL_CHECK_BLOCK_STATUS(mtInTargetFunc); - - ReadRows mtCValues(*const_cast(a[3]), 0, nClusters); - DAAL_CHECK_BLOCK_STATUS(mtCValues); - ReadRows 
mtCCentroids(*const_cast(a[4]), 0, nClusters); - DAAL_CHECK_BLOCK_STATUS(mtCCentroids); - - const int * clusterS0 = mtInClusterS0.get(); - const algorithmFPType * clusterS1 = mtInClusterS1.get(); - const algorithmFPType * inTarget = mtInTargetFunc.get(); - - const algorithmFPType * cValues = mtCValues.get(); - const algorithmFPType * cCentroids = mtCCentroids.get(); - - WriteOnlyRows mtClusters(*const_cast(r[0]), 0, nClusters); - DAAL_CHECK_BLOCK_STATUS(mtClusters); - WriteOnlyRows mtTargetFunct(*const_cast(r[1]), 0, 1); - DAAL_CHECK_BLOCK_STATUS(mtTargetFunct); - - algorithmFPType * clusters = mtClusters.get(); - algorithmFPType * outTarget = mtTargetFunct.get(); - - *outTarget = *inTarget; - - uint32_t cPos = 0; - - for (uint32_t i = 0; i < nClusters; i++) - { - if (clusterS0[i] > 0) - { - algorithmFPType coeff = 1.0 / clusterS0[i]; - - for (uint32_t j = 0; j < p; j++) - { - clusters[i * p + j] = clusterS1[i * p + j] * coeff; - } - } - else - { - DAAL_CHECK(!(cValues[cPos] < (algorithmFPType)0.0), services::ErrorKMeansNumberOfClustersIsTooLarge); - outTarget[0] -= cValues[cPos]; - result |= daal::services::internal::daal_memcpy_s(&clusters[i * p], p * sizeof(algorithmFPType), &cCentroids[cPos * p], - p * sizeof(algorithmFPType)); - cPos++; - } - } - - return (!result) ? services::Status() : services::Status(services::ErrorMemoryCopyFailedInternal); -} - -template -Status KMeansDistributedStep2KernelUCAPI::updateClusters(bool init, const services::internal::Buffer & partialCentroidsCounters, - const services::internal::Buffer & partialCentroids, - const services::internal::Buffer & centroidCounters, - const services::internal::Buffer & centroids, - uint32_t nClusters, uint32_t nFeatures) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.mergeReduceCentroids); - Status st; - auto & context = Environment::getInstance()->getDefaultExecutionContext(); - auto & kernelFactory = context.getClKernelFactory(); - auto kernelUpdateClusters = init ? 
kernelFactory.getKernel("init_clusters", st) : kernelFactory.getKernel("update_clusters", st); - DAAL_CHECK_STATUS_VAR(st); - - DAAL_ASSERT(nFeatures <= maxInt32AsUint32T); - DAAL_ASSERT(nClusters <= maxInt32AsUint32T); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(uint32_t, nFeatures, nClusters); - DAAL_ASSERT(partialCentroidsCounters.size() >= nClusters); - DAAL_ASSERT(partialCentroids.size() >= nClusters * nFeatures); - DAAL_ASSERT(centroidCounters.size() >= nClusters); - DAAL_ASSERT(centroids.size() >= nClusters * nFeatures); - - KernelArguments args(5, st); - DAAL_CHECK_STATUS_VAR(st); - args.set(0, partialCentroidsCounters, AccessModeIds::read); - args.set(1, partialCentroids, AccessModeIds::read); - args.set(2, centroidCounters, AccessModeIds::readwrite); - args.set(3, centroids, AccessModeIds::readwrite); - args.set(4, static_cast(nFeatures)); - - KernelRange local_range(1, nFeatures > _maxWGSize ? _maxWGSize : nFeatures); - KernelRange global_range(nClusters, nFeatures); - - KernelNDRange range(2); - range.global(global_range, st); - DAAL_CHECK_STATUS_VAR(st); - range.local(local_range, st); - DAAL_CHECK_STATUS_VAR(st); - { - DAAL_ITTNOTIFY_SCOPED_TASK(compute.updateClusters.run); - context.run(range, kernelUpdateClusters, args, st); - } - return st; -} - -template -Status KMeansDistributedStep2KernelUCAPI::updateCandidates(bool init, const services::internal::Buffer & partialCandidates, - const services::internal::Buffer & partialCValues, - const services::internal::Buffer & candidates, - const services::internal::Buffer & cValues, - uint32_t nClusters) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.mergeReduceCentroids); - Status st; - auto & context = Environment::getInstance()->getDefaultExecutionContext(); - auto & kernelFactory = context.getClKernelFactory(); - auto kernelUpdateCandidates = init ? 
kernelFactory.getKernel("init_candidates", st) : kernelFactory.getKernel("update_candidates", st); - DAAL_CHECK_STATUS_VAR(st); - - DAAL_ASSERT(nClusters <= maxInt32AsUint32T); - DAAL_ASSERT(partialCandidates.size() >= nClusters); - DAAL_ASSERT(partialCValues.size() >= nClusters); - DAAL_ASSERT(candidates.size() >= nClusters); - DAAL_ASSERT(cValues.size() >= nClusters); - - KernelArguments args(5, st); - DAAL_CHECK_STATUS_VAR(st); - args.set(0, partialCandidates, AccessModeIds::read); - args.set(1, partialCValues, AccessModeIds::read); - args.set(2, candidates, AccessModeIds::readwrite); - args.set(3, cValues, AccessModeIds::readwrite); - args.set(4, static_cast(nClusters)); - - KernelRange local_range(1, 1); - KernelRange global_range(1, 1); - - KernelNDRange range(2); - range.global(global_range, st); - DAAL_CHECK_STATUS_VAR(st); - range.local(local_range, st); - DAAL_CHECK_STATUS_VAR(st); - - { - DAAL_ITTNOTIFY_SCOPED_TASK(compute.updateClusters.run); - context.run(range, kernelUpdateCandidates, args, st); - } - return st; -} - -template -Status KMeansDistributedStep2KernelUCAPI::buildProgram(ClKernelFactoryIface & kernelFactory) -{ - Status st; - auto fptypeName = services::internal::sycl::getKeyFPType(); - auto buildOptions = fptypeName; - services::String cachekey("__daal_algorithms_kmeans_lloyd_dense_distr_step2_"); - cachekey.add(fptypeName); - cachekey.add(buildOptions); - { - DAAL_ITTNOTIFY_SCOPED_TASK(compute.buildProgram); - kernelFactory.build(ExecutionTargetIds::device, cachekey.c_str(), kmeans_cl_kernels_distr_steps, buildOptions.c_str(), st); - } - return st; -} - -} // namespace internal -} // namespace kmeans -} // namespace algorithms -} // namespace daal diff --git a/cpp/daal/src/algorithms/linear_model/BUILD b/cpp/daal/src/algorithms/linear_model/BUILD index 3824272fc2c..7a591f08d34 100644 --- a/cpp/daal/src/algorithms/linear_model/BUILD +++ b/cpp/daal/src/algorithms/linear_model/BUILD @@ -4,10 +4,9 @@ load("@onedal//dev/bazel:daal.bzl", 
"daal_module") daal_module( name = "kernel", auto = True, - opencl = True, deps = [ "@onedal//cpp/daal:core", - "@onedal//cpp/daal:sycl", + "@onedal//cpp/daal:engines", "@onedal//cpp/daal/src/algorithms/regression:kernel", ], ) diff --git a/cpp/daal/src/algorithms/linear_model/linear_model_model_fpt.cpp b/cpp/daal/src/algorithms/linear_model/linear_model_model_fpt.cpp index 36d9b4be738..ccaeb974c4b 100644 --- a/cpp/daal/src/algorithms/linear_model/linear_model_model_fpt.cpp +++ b/cpp/daal/src/algorithms/linear_model/linear_model_model_fpt.cpp @@ -15,7 +15,6 @@ * limitations under the License. *******************************************************************************/ -#include "data_management/data/internal/numeric_table_sycl_homogen.h" #include "src/algorithms/linear_model/linear_model_model_impl.h" #include "data_management/data/homogen_numeric_table.h" @@ -28,24 +27,14 @@ namespace linear_model namespace internal { using namespace daal::data_management; -using daal::data_management::internal::SyclHomogenNumericTable; template ModelInternal::ModelInternal(size_t nFeatures, size_t nResponses, const Parameter & par, modelFPType dummy) : _interceptFlag(par.interceptFlag) { services::Status st; - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); + _beta = HomogenNumericTable::create(nFeatures + 1, nResponses, NumericTable::doAllocate, 0, &st); - if (deviceInfo.isCpu) - { - _beta = HomogenNumericTable::create(nFeatures + 1, nResponses, NumericTable::doAllocate, 0, &st); - } - else - { - _beta = SyclHomogenNumericTable::create(nFeatures + 1, nResponses, NumericTable::doAllocate, 0, &st); - } if (!st) return; } diff --git a/cpp/daal/src/algorithms/linear_model/linear_model_predict_batch_fpt.cpp b/cpp/daal/src/algorithms/linear_model/linear_model_predict_batch_fpt.cpp index 46103fb4735..329cd671cf2 100644 --- a/cpp/daal/src/algorithms/linear_model/linear_model_predict_batch_fpt.cpp +++ 
b/cpp/daal/src/algorithms/linear_model/linear_model_predict_batch_fpt.cpp @@ -21,7 +21,6 @@ //-- */ -#include "data_management/data/internal/numeric_table_sycl_homogen.h" #include "algorithms/linear_model/linear_model_predict_types.h" #include "data_management/data/homogen_numeric_table.h" @@ -35,7 +34,6 @@ namespace prediction { using namespace daal::services; using namespace daal::data_management; -using daal::data_management::internal::SyclHomogenNumericTable; template DAAL_EXPORT Status Result::allocate(const daal::algorithms::Input * input, const daal::algorithms::Parameter * par, const int method) @@ -45,17 +43,8 @@ DAAL_EXPORT Status Result::allocate(const daal::algorithms::Input * input, const size_t nDependentVariables = in->get(model)->getNumberOfResponses(); Status st; - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); + set(prediction, HomogenNumericTable::create(nDependentVariables, nVectors, NumericTable::doAllocate, &st)); - if (deviceInfo.isCpu) - { - set(prediction, HomogenNumericTable::create(nDependentVariables, nVectors, NumericTable::doAllocate, &st)); - } - else - { - set(prediction, SyclHomogenNumericTable::create(nDependentVariables, nVectors, NumericTable::doAllocate, &st)); - } return st; } diff --git a/cpp/daal/src/algorithms/linear_model/linear_model_predict_container.h b/cpp/daal/src/algorithms/linear_model/linear_model_predict_container.h index b2626d49ca8..a2b425a673b 100644 --- a/cpp/daal/src/algorithms/linear_model/linear_model_predict_container.h +++ b/cpp/daal/src/algorithms/linear_model/linear_model_predict_container.h @@ -29,8 +29,6 @@ #include "algorithms/linear_model/linear_model_predict.h" #include "src/algorithms/linear_model/linear_model_predict_kernel.h" -#include "src/algorithms/linear_model/oneapi/linear_model_predict_kernel_oneapi.h" - namespace daal { namespace algorithms @@ -42,17 +40,7 @@ namespace prediction template 
BatchContainer::BatchContainer(daal::services::Environment::env * daalEnv) : PredictionContainerIface() { - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - - if (deviceInfo.isCpu) - { - __DAAL_INITIALIZE_KERNELS(internal::PredictKernel, algorithmFPType, method); - } - else - { - __DAAL_INITIALIZE_KERNELS_SYCL(internal::PredictKernelOneAPI, algorithmFPType, method); - } + __DAAL_INITIALIZE_KERNELS(internal::PredictKernel, algorithmFPType, method); } template @@ -73,17 +61,7 @@ services::Status BatchContainer::compute() daal::services::Environment::env & env = *_env; - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - - if (deviceInfo.isCpu) - { - __DAAL_CALL_KERNEL(env, internal::PredictKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, a, m, r); - } - else - { - __DAAL_CALL_KERNEL_SYCL(env, internal::PredictKernelOneAPI, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, a, m, r) - } + __DAAL_CALL_KERNEL(env, internal::PredictKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, a, m, r); } } // namespace prediction diff --git a/cpp/daal/src/algorithms/linear_model/linear_model_predict_dense_default_batch_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/linear_model/linear_model_predict_dense_default_batch_fpt_dispatcher.cpp index 7c80f4a755c..eede44d34d1 100644 --- a/cpp/daal/src/algorithms/linear_model/linear_model_predict_dense_default_batch_fpt_dispatcher.cpp +++ b/cpp/daal/src/algorithms/linear_model/linear_model_predict_dense_default_batch_fpt_dispatcher.cpp @@ -29,6 +29,6 @@ namespace daal { namespace algorithms { -__DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL(linear_model::prediction::BatchContainer, batch, DAAL_FPTYPE, linear_model::prediction::defaultDense) +__DAAL_INSTANTIATE_DISPATCH_CONTAINER(linear_model::prediction::BatchContainer, batch, DAAL_FPTYPE, linear_model::prediction::defaultDense) } } // namespace 
daal diff --git a/cpp/daal/src/algorithms/linear_model/linear_model_predict_dense_default_batch_oneapi_fpt.cpp b/cpp/daal/src/algorithms/linear_model/linear_model_predict_dense_default_batch_oneapi_fpt.cpp deleted file mode 100644 index 70e726b4b8a..00000000000 --- a/cpp/daal/src/algorithms/linear_model/linear_model_predict_dense_default_batch_oneapi_fpt.cpp +++ /dev/null @@ -1,42 +0,0 @@ -/* file: linear_model_predict_dense_default_batch_oneapi_fpt.cpp */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of prediction stage of linear regression algorithm. 
-//-- -*/ - -#include "src/algorithms/linear_model/oneapi/linear_model_predict_kernel_oneapi.h" -#include "src/algorithms/linear_model/oneapi/linear_model_predict_dense_default_batch_oneapi_impl.i" - -namespace daal -{ -namespace algorithms -{ -namespace linear_model -{ -namespace prediction -{ -namespace internal -{ -template class DAAL_EXPORT PredictKernelOneAPI; -} // namespace internal -} // namespace prediction -} // namespace linear_model -} // namespace algorithms -} // namespace daal diff --git a/cpp/daal/src/algorithms/linear_model/linear_model_train_normeq_finalize_oneapi_fpt.cpp b/cpp/daal/src/algorithms/linear_model/linear_model_train_normeq_finalize_oneapi_fpt.cpp deleted file mode 100644 index 1106ee0d260..00000000000 --- a/cpp/daal/src/algorithms/linear_model/linear_model_train_normeq_finalize_oneapi_fpt.cpp +++ /dev/null @@ -1,39 +0,0 @@ -/* file: linear_model_train_normeq_finalize_oneapi_fpt.cpp */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -#include "src/algorithms/linear_model/oneapi/linear_model_train_normeq_kernel_oneapi.h" -#include "src/algorithms/linear_model/oneapi/linear_model_train_normeq_finalize_oneapi_impl.i" - -namespace daal -{ -namespace algorithms -{ -namespace linear_model -{ -namespace normal_equations -{ -namespace training -{ -namespace internal -{ -template class FinalizeKernelOneAPI; -} -} // namespace training -} // namespace normal_equations -} // namespace linear_model -} // namespace algorithms -} // namespace daal diff --git a/cpp/daal/src/algorithms/linear_model/linear_model_train_normeq_update_oneapi_fpt.cpp b/cpp/daal/src/algorithms/linear_model/linear_model_train_normeq_update_oneapi_fpt.cpp deleted file mode 100644 index b8f884dec88..00000000000 --- a/cpp/daal/src/algorithms/linear_model/linear_model_train_normeq_update_oneapi_fpt.cpp +++ /dev/null @@ -1,39 +0,0 @@ -/* file: linear_model_train_normeq_update_oneapi_fpt.cpp */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -#include "src/algorithms/linear_model/oneapi/linear_model_train_normeq_kernel_oneapi.h" -#include "src/algorithms/linear_model/oneapi/linear_model_train_normeq_update_oneapi_impl.i" - -namespace daal -{ -namespace algorithms -{ -namespace linear_model -{ -namespace normal_equations -{ -namespace training -{ -namespace internal -{ -template class UpdateKernelOneAPI; -} // namespace internal -} // namespace training -} // namespace normal_equations -} // namespace linear_model -} // namespace algorithms -} // namespace daal diff --git a/cpp/daal/src/algorithms/linear_model/oneapi/cl_kernel/linear_model_prediction.cl b/cpp/daal/src/algorithms/linear_model/oneapi/cl_kernel/linear_model_prediction.cl deleted file mode 100644 index b1624272f2a..00000000000 --- a/cpp/daal/src/algorithms/linear_model/oneapi/cl_kernel/linear_model_prediction.cl +++ /dev/null @@ -1,45 +0,0 @@ -/* file: linear_model_prediction.cl */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of Linear Regression predict kernels. 
-//-- -*/ - -#ifndef __LINEAR_MODEL_PREDICTION_CL__ -#define __LINEAR_MODEL_PREDICTION_CL__ - -#include - -#define DECLARE_SOURCE(name, src) static const char * name = #src; - -DECLARE_SOURCE( - clKernelPrediction, - - __kernel void addBetaIntercept(const __global algorithmFPType * beta, uint nBetas, __global algorithmFPType * yTable, uint nResponses) { - const uint rowIdx = get_global_id(0); - const uint colIdx = get_global_id(1); - - const algorithmFPType value = yTable[rowIdx * nResponses + colIdx]; - - yTable[rowIdx * nResponses + colIdx] = value + beta[colIdx * nBetas]; - } - -); - -#endif // __LINEAR_MODEL_PREDICTION_CL__ diff --git a/cpp/daal/src/algorithms/linear_model/oneapi/cl_kernel/reduce_results.cl b/cpp/daal/src/algorithms/linear_model/oneapi/cl_kernel/reduce_results.cl deleted file mode 100644 index 71352c8ae7e..00000000000 --- a/cpp/daal/src/algorithms/linear_model/oneapi/cl_kernel/reduce_results.cl +++ /dev/null @@ -1,43 +0,0 @@ -/* file: reduce_results.cl */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of copy kernels. 
-//-- -*/ - -#ifndef __REDUCE_RESULTS_CL__ -#define __REDUCE_RESULTS_CL__ - -#include - -#define DECLARE_SOURCE(name, src) static const char * name = #src; - -DECLARE_SOURCE( - clKernelCopy, - - __kernel void reduceResults(__global algorithmFPType * dst, uint dstOffset, uint dstStride, const __global algorithmFPType * src, uint srcOffset, - uint srcStride) { - const uint valIdx = get_global_id(0); - - dst[dstStride * valIdx + dstOffset] += src[srcStride * valIdx + srcOffset]; - } - -); - -#endif // __REDUCE_RESULTS_CL__ diff --git a/cpp/daal/src/algorithms/linear_model/oneapi/linear_model_predict_dense_default_batch_oneapi_impl.i b/cpp/daal/src/algorithms/linear_model/oneapi/linear_model_predict_dense_default_batch_oneapi_impl.i deleted file mode 100644 index ca501d1e9f0..00000000000 --- a/cpp/daal/src/algorithms/linear_model/oneapi/linear_model_predict_dense_default_batch_oneapi_impl.i +++ /dev/null @@ -1,174 +0,0 @@ -/* file: linear_model_predict_dense_default_batch_oneapi_impl.i */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -/* -//++ -// Common functions for linear regression predictions calculation -//-- -*/ - -#ifndef __LINEAR_MODEL_PREDICT_DENSE_DEFAULT_BATCH_ONEAPI_IMPL_I__ -#define __LINEAR_MODEL_PREDICT_DENSE_DEFAULT_BATCH_ONEAPI_IMPL_I__ - -#include "src/algorithms/linear_model/oneapi/linear_model_predict_kernel_oneapi.h" -#include "src/data_management/service_numeric_table.h" -#include "src/sycl/blas_gpu.h" -#include "services/internal/execution_context.h" -#include "src/services/service_data_utils.h" -#include "src/algorithms/linear_model/oneapi/cl_kernel/linear_model_prediction.cl" - -namespace daal -{ -namespace algorithms -{ -namespace linear_model -{ -namespace prediction -{ -namespace internal -{ -using namespace daal::services::internal::sycl; - -template -services::Status PredictKernelOneAPI::addBetaIntercept(const services::internal::Buffer & betaTable, - const size_t nBetas, - services::internal::Buffer & yTable, - const size_t yNRows, const size_t yNCols) -{ - services::Status status; - - ExecutionContextIface & ctx = services::internal::getDefaultContext(); - ClKernelFactoryIface & factory = ctx.getClKernelFactory(); - - const services::String options = getKeyFPType(); - services::String cachekey("__daal_algorithms_linear_model_prediction_"); - cachekey.add(options); - factory.build(ExecutionTargetIds::device, cachekey.c_str(), clKernelPrediction, options.c_str(), status); - DAAL_CHECK_STATUS_VAR(status); - - const char * const kernelName = "addBetaIntercept"; - KernelPtr kernel = factory.getKernel(kernelName, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_ASSERT(yNCols <= services::internal::MaxVal::get()); - DAAL_ASSERT(nBetas <= services::internal::MaxVal::get()); - - DAAL_ASSERT(betaTable.size() >= nBetas * yNCols); - DAAL_ASSERT(yTable.size() >= yNRows * yNCols); - - KernelArguments args(4, status); - DAAL_CHECK_STATUS_VAR(status); - - args.set(0, betaTable, 
AccessModeIds::read); - args.set(1, static_cast(nBetas)); - args.set(2, yTable, AccessModeIds::write); - args.set(3, static_cast(yNCols)); - - KernelRange range(yNRows, yNCols); - - ctx.run(range, kernel, args, status); - - return status; -} - -template -services::Status PredictKernelOneAPI::compute_impl(const NumericTable * a, const NumericTable * b, NumericTable * r, - bool interceptFlag) -{ - services::Status status; - - NumericTable * xTable = const_cast(a); - NumericTable * yTable = const_cast(r); - NumericTable * betaTable = const_cast(b); - - const size_t nRows = xTable->getNumberOfRows(); - const size_t nBetas = betaTable->getNumberOfColumns(); - const size_t nResponses = betaTable->getNumberOfRows(); - - const size_t nRowsPerBlock = 90000; - - const size_t nBlocks = (nRows / nRowsPerBlock) + (bool(nRows % nRowsPerBlock) ? 1 : 0); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, nBlocks, nRowsPerBlock); - - BlockDescriptor betaBlock; - DAAL_CHECK_STATUS(status, betaTable->getBlockOfRows(0, nResponses, ReadWriteMode::readOnly, betaBlock)); - - const services::internal::Buffer betaBuf = betaBlock.getBuffer(); - - for (size_t blockIdx = 0; blockIdx < nBlocks; ++blockIdx) - { - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, blockIdx, nRowsPerBlock); - const size_t startRow = blockIdx * nRowsPerBlock; - DAAL_OVERFLOW_CHECK_BY_ADDING(size_t, startRow, nRowsPerBlock); - const size_t endRow = ((startRow + nRowsPerBlock) > nRows) ? 
nRows : (startRow + nRowsPerBlock); - DAAL_ASSERT(endRow >= startRow); - - BlockDescriptor xBlock; - BlockDescriptor yBlock; - - DAAL_CHECK_STATUS(status, xTable->getBlockOfRows(startRow, endRow - startRow, ReadWriteMode::readOnly, xBlock)); - DAAL_CHECK_STATUS(status, yTable->getBlockOfRows(startRow, endRow - startRow, ReadWriteMode::readWrite, yBlock)); - - const services::internal::Buffer xBuf = xBlock.getBuffer(); - services::internal::Buffer yBuf = yBlock.getBuffer(); - - const size_t xNRows = endRow - startRow; - DAAL_ASSERT(nBetas >= 1); - const size_t xNCols = nBetas - 1; - const size_t yNCols = nResponses; - - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, xNRows, xNCols); - DAAL_ASSERT(xBuf.size() >= xNRows * xNCols); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, yNCols, xNCols); - DAAL_ASSERT(betaBuf.size() >= yNCols * xNCols); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, xNRows, yNCols); - DAAL_ASSERT(yBuf.size() >= xNRows * yNCols); - - /* SYRK: Compute beta*xTable for each block */ - status = BlasGpu::xgemm(math::Layout::RowMajor, math::Transpose::NoTrans, math::Transpose::Trans, xNRows, yNCols, xNCols, - algorithmFPType(1.0), xBuf, xNCols, 0, betaBuf, nBetas, 1, algorithmFPType(0.0), yBuf, yNCols, 0); - - DAAL_CHECK_STATUS_VAR(status); - - if (interceptFlag) - { - DAAL_CHECK_STATUS(status, addBetaIntercept(betaBuf, nBetas, yBuf, xNRows, yNCols)); - } - - DAAL_CHECK_STATUS(status, xTable->releaseBlockOfRows(xBlock)); - DAAL_CHECK_STATUS(status, yTable->releaseBlockOfRows(yBlock)); - } - - DAAL_CHECK_STATUS(status, betaTable->releaseBlockOfRows(betaBlock)); - - return status; -} - -template -services::Status PredictKernelOneAPI::compute(const NumericTable * a, const linear_model::Model * m, NumericTable * r) -{ - linear_model::Model * model = const_cast(m); - return compute_impl(a, model->getBeta().get(), r, model->getInterceptFlag()); -} - -} // namespace internal -} // namespace prediction -} // namespace linear_model -} // namespace algorithms 
-} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/linear_model/oneapi/linear_model_predict_kernel_oneapi.h b/cpp/daal/src/algorithms/linear_model/oneapi/linear_model_predict_kernel_oneapi.h deleted file mode 100644 index 934007559dc..00000000000 --- a/cpp/daal/src/algorithms/linear_model/oneapi/linear_model_predict_kernel_oneapi.h +++ /dev/null @@ -1,77 +0,0 @@ -/* file: linear_model_predict_kernel_oneapi.h */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Declaration of template function that computes linear regression -// prediction results. -//-- -*/ - -#ifndef __LINEAR_MODEL_PREDICT_KERNEL_ONEAPI_H__ -#define __LINEAR_MODEL_PREDICT_KERNEL_ONEAPI_H__ - -#include "algorithms/linear_model/linear_model_predict.h" -#include "src/externals/service_memory.h" -#include "src/algorithms/kernel.h" -#include "data_management/data/numeric_table.h" - -using namespace daal::data_management; - -namespace daal -{ -namespace algorithms -{ -namespace linear_model -{ -namespace prediction -{ -namespace internal -{ -template -class PredictKernelOneAPI : public daal::algorithms::Kernel -{ -public: - /** - * \brief Compute linear regression prediction results. 
- * - * \param a[in] Matrix of input variables X - * \param m[in] Linear regression model obtained on training stage - * \param r[out] Prediction results - */ - services::Status compute(const NumericTable * a, const linear_model::Model * m, NumericTable * r); -}; - -template -class PredictKernelOneAPI : public daal::algorithms::Kernel -{ -public: - services::Status compute(const NumericTable * a, const linear_model::Model * m, NumericTable * r); - services::Status compute_impl(const NumericTable * a, const NumericTable * b, NumericTable * r, bool interceptFlag); - -protected: - services::Status addBetaIntercept(const services::internal::Buffer & betaTable, const size_t nBetas, - services::internal::Buffer & yTable, const size_t yNRows, const size_t yNCols); -}; - -} // namespace internal -} // namespace prediction -} // namespace linear_model -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/linear_model/oneapi/linear_model_train_normeq_finalize_oneapi_impl.i b/cpp/daal/src/algorithms/linear_model/oneapi/linear_model_train_normeq_finalize_oneapi_impl.i deleted file mode 100644 index 5214a5d3b3f..00000000000 --- a/cpp/daal/src/algorithms/linear_model/oneapi/linear_model_train_normeq_finalize_oneapi_impl.i +++ /dev/null @@ -1,193 +0,0 @@ -/* file: linear_model_train_normeq_finalize_oneapi_impl.i */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of common base classes for normal equations model training. -//-- -*/ - -#include "src/algorithms/linear_model/oneapi/linear_model_train_normeq_kernel_oneapi.h" -#include "services/internal/sycl/math/types.h" -#include "src/sycl/lapack_gpu.h" -#include "src/externals/service_lapack.h" -#include "src/externals/service_profiler.h" -#include "src/services/service_data_utils.h" - -namespace daal -{ -namespace algorithms -{ -namespace linear_model -{ -namespace normal_equations -{ -namespace training -{ -namespace internal -{ -using namespace daal::services::internal::sycl; - -template -services::Status FinalizeKernelOneAPI::compute(NumericTable & xtxTable, NumericTable & xtyTable, NumericTable & xtxFinalTable, - NumericTable & xtyFinalTable, NumericTable & betaTable, bool interceptFlag, - const KernelHelperOneAPIIface & helper) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(computeFinalize); - services::Status status; - - const size_t nBetasIntercept = xtxTable.getNumberOfRows(); - const size_t nBetas = interceptFlag ? 
nBetasIntercept : (nBetasIntercept + 1); - const size_t nResponses = xtyTable.getNumberOfRows(); - - { - if (&xtxTable != &xtxFinalTable) - { - DAAL_ITTNOTIFY_SCOPED_TASK(computeFinalize.copyToxtxFinalTable); - DAAL_CHECK_STATUS(status, copyDataToFinalTable(xtxTable, xtxFinalTable)); - } - - if (&xtyTable != &xtyFinalTable) - { - DAAL_ITTNOTIFY_SCOPED_TASK(computeFinalize.copyToxtyFinalTable); - DAAL_CHECK_STATUS(status, copyDataToFinalTable(xtyTable, xtyFinalTable)); - } - } - - auto & context = services::internal::getDefaultContext(); - - { - BlockDescriptor xtxBlock; - BlockDescriptor xtyBlock; - - DAAL_CHECK_STATUS(status, xtxTable.getBlockOfRows(0, nBetasIntercept, ReadWriteMode::readOnly, xtxBlock)); - DAAL_CHECK_STATUS(status, xtyTable.getBlockOfRows(0, nResponses, ReadWriteMode::readOnly, xtyBlock)); - - const services::internal::Buffer xtxBuf = xtxBlock.getBuffer(); - const services::internal::Buffer xtyBuf = xtyBlock.getBuffer(); - - DAAL_CHECK_STATUS(status, xtxTable.releaseBlockOfRows(xtxBlock)); - DAAL_CHECK_STATUS(status, xtyTable.releaseBlockOfRows(xtyBlock)); - - const TypeIds::Id idType = TypeIds::id(); - - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, nBetasIntercept, nBetasIntercept); - UniversalBuffer xtxCopyAlloc = context.allocate(idType, nBetasIntercept * nBetasIntercept, status); - DAAL_CHECK_STATUS_VAR(status); - - services::internal::Buffer xtxBufCopy = xtxCopyAlloc.get(); - { - DAAL_ITTNOTIFY_SCOPED_TASK(computeFinalize.xtxCopy); - DAAL_ASSERT(xtxBuf.size() >= nBetasIntercept * nBetasIntercept); - DAAL_ASSERT(xtxBufCopy.size() >= nBetasIntercept * nBetasIntercept); - context.copy(xtxBufCopy, 0, xtxBuf, 0, nBetasIntercept * nBetasIntercept, status); - } - DAAL_CHECK_STATUS_VAR(status); - - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, nResponses, nBetasIntercept); - UniversalBuffer xtyCopyAlloc = context.allocate(idType, nResponses * nBetasIntercept, status); - DAAL_CHECK_STATUS_VAR(status); - - services::internal::Buffer betaBuf = 
xtyCopyAlloc.get(); - { - DAAL_ITTNOTIFY_SCOPED_TASK(computeFinalize.betaBufCopy); - DAAL_ASSERT(xtyBuf.size() >= nResponses * nBetasIntercept); - DAAL_ASSERT(betaBuf.size() >= nResponses * nBetasIntercept); - context.copy(betaBuf, 0, xtyBuf, 0, nResponses * nBetasIntercept, status); - } - DAAL_CHECK_STATUS_VAR(status); - - DAAL_CHECK_STATUS(status, helper.computeBetasImpl(nBetasIntercept, xtxBufCopy, nResponses, betaBuf, interceptFlag)); - - BlockDescriptor betaBlock; - DAAL_CHECK_STATUS(status, betaTable.getBlockOfRows(0, nResponses, ReadWriteMode::readWrite, betaBlock)); - services::internal::Buffer betaResBuf = betaBlock.getBuffer(); - - DAAL_ITTNOTIFY_SCOPED_TASK(computeFinalize.copyBetaToResult); - DAAL_CHECK_STATUS(status, helper.copyBetaToResult(betaBuf, betaResBuf, nBetas, nResponses, interceptFlag)); - DAAL_CHECK_STATUS(status, betaTable.releaseBlockOfRows(betaBlock)); - } - - return status; -} - -template -services::Status FinalizeKernelOneAPI::copyDataToFinalTable(NumericTable & srcTable, NumericTable & dstTable) -{ - services::Status status; - BlockDescriptor srcBlock; - BlockDescriptor dstBlock; - - const size_t nRows = srcTable.getNumberOfRows(); - const size_t nCols = srcTable.getNumberOfColumns(); - - DAAL_CHECK_STATUS(status, srcTable.getBlockOfRows(0, nRows, ReadWriteMode::readOnly, srcBlock)); - DAAL_CHECK_STATUS(status, dstTable.getBlockOfRows(0, nRows, ReadWriteMode::readWrite, dstBlock)); - - const services::internal::Buffer srcBuf = srcBlock.getBuffer(); - services::internal::Buffer dstBuf = dstBlock.getBuffer(); - - auto & context = services::internal::getDefaultContext(); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, nCols, nRows); - DAAL_ASSERT(dstBuf.size() >= nCols * nRows); - DAAL_ASSERT(srcBuf.size() >= nCols * nRows); - context.copy(dstBuf, 0, srcBuf, 0, nCols * nRows, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_CHECK_STATUS(status, srcTable.releaseBlockOfRows(srcBlock)); - DAAL_CHECK_STATUS(status, 
dstTable.releaseBlockOfRows(dstBlock)); - - return status; -} - -template -services::Status FinalizeKernelOneAPI::solveSystem(const size_t p, services::internal::Buffer & a, const size_t ny, - services::internal::Buffer & b) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(solveSystem); - services::Status status; - - const math::UpLo uplo = math::UpLo::Upper; - - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, ny, p); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, p, p); - DAAL_ASSERT(a.size() >= p * p); - DAAL_ASSERT(b.size() >= p * ny); - - { - DAAL_ITTNOTIFY_SCOPED_TASK(solveSystem.xpotrf); - /* Perform L*L' decomposition of X'*X */ - status = LapackGpu::xpotrf(uplo, p, a, p); - } - DAAL_CHECK_STATUS_VAR(status); - - { - DAAL_ITTNOTIFY_SCOPED_TASK(solveSystem.xpotrs); - /* Solve L*L'*b=Y */ - status = LapackGpu::xpotrs(uplo, p, ny, a, p, b, p); - } - DAAL_CHECK_STATUS_VAR(status); - - return status; -} - -} // namespace internal -} // namespace training -} // namespace normal_equations -} // namespace linear_model -} // namespace algorithms -} // namespace daal diff --git a/cpp/daal/src/algorithms/linear_model/oneapi/linear_model_train_normeq_kernel_oneapi.h b/cpp/daal/src/algorithms/linear_model/oneapi/linear_model_train_normeq_kernel_oneapi.h deleted file mode 100644 index af874b0d01e..00000000000 --- a/cpp/daal/src/algorithms/linear_model/oneapi/linear_model_train_normeq_kernel_oneapi.h +++ /dev/null @@ -1,147 +0,0 @@ -/* file: linear_model_train_normeq_kernel_oneapi.h */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Declaration of common base classes for normal equations model training. -//-- -*/ - -#ifndef __LINEAR_MODEL_TRAIN_NORMEQ_KERNEL_ONEAPI_H__ -#define __LINEAR_MODEL_TRAIN_NORMEQ_KERNEL_ONEAPI_H__ - -#include "services/env_detect.h" -#include "data_management/data/numeric_table.h" -#include "src/data_management/service_numeric_table.h" - -namespace daal -{ -namespace algorithms -{ -namespace linear_model -{ -namespace normal_equations -{ -namespace training -{ -namespace internal -{ -/** - * Abstract class that defines interface for the helper function that computes the regression coefficients. - */ -template -class KernelHelperOneAPIIface -{ -public: - /** - * Computes regression coefficients by solving the system of linear equations - * \param[in] p Size of the system of linear equations - * \param[in] a Matrix of size P x P with semifinished left hand side of the system - * \param[in] ny Number of right hand sides of the system - * \param[in,out] b Matrix of size Ny x P. - * On input, the right hand sides of the system of linear equations - * On output, the regression coefficients - * \param[in] interceptFlag Flag. 
If true, then it is required to compute an intercept term - * \return Status of the computations - */ - virtual services::Status computeBetasImpl(size_t p, services::internal::Buffer & a, size_t ny, - services::internal::Buffer & b, bool inteceptFlag) const = 0; - - virtual services::Status copyBetaToResult(const services::internal::Buffer & betaTmp, - services::internal::Buffer & betaRes, const size_t nBetas, const size_t nResponses, - const bool interceptFlag) const = 0; -}; - -/** - * Implements the common part of the regression coefficients computation from partial result - */ -template -class FinalizeKernelOneAPI -{ -public: - /** - * Computes regression coefficients by solving the symmetric system of linear equations - * - X' - matrix of size N x P' that contains input data set of size N x P - * and optionally a column of 1's. - * Column of 1's is added when it is required to compute an intercept term - * - P' - number of columns in X'. - * P' = P + 1, when it is required to compute an intercept term; - * P' = P, otherwise - * \param[in] xtx Input matrix \f$X'^T \times X'\f$ of size P' x P' - * \param[in] xty Input matrix \f$X'^T \times Y\f$ of size Ny x P' - * \param[out] xtxFinal Resulting matrix \f$X'^T \times X'\f$ of size P' x P' - * \param[out] xtyFinal Resulting matrix \f$X'^T \times Y\f$ of size Ny x P' - * \param[out] beta Matrix with regression coefficients of size Ny x (P + 1) - * \param[in] interceptFlag Flag. 
True if intercept term is not zero, false otherwise - * \param[in] helper Object that implements the differences in the regression - * coefficients computation - * \return Status of the computations - */ - static services::Status compute(NumericTable & xtx, NumericTable & xty, NumericTable & xtxFinal, NumericTable & xtyFinal, NumericTable & beta, - bool interceptFlag, const KernelHelperOneAPIIface & helper); - - static services::Status copyDataToFinalTable(NumericTable & srcTable, NumericTable & dstTable); - - /** - * Solves the symmetric system of linear equations - * \param[in] p Size of the system of linear equations - * \param[in] a Matrix of size P x P with the left hand side of the system - * \param[in] ny Number of right hand sides of the system - * \param[in,out] b Matrix of size Ny x P. - * On input, the right hand sides of the system of linear equations - * On output, the regression coefficients - * \param[in] internalError Error code that have to be returned in case incorrect parameters - * are passed into lapack routines - * \return Status of the computations - */ - static services::Status solveSystem(const size_t p, services::internal::Buffer & a, const size_t ny, - services::internal::Buffer & b); -}; - -/** - * Implements the common part of the partial results update with new block of input data - */ -template -class UpdateKernelOneAPI -{ -public: - /** - * Updates normal equations model with the new block of data - * \param[in] x Input data set of size N x P - * \param[in] y Input responses of size N x Ny - * \param[out] xtx Matrix \f$X'^T \times X'\f$ of size P' x P' - * \param[out] xty Matrix \f$X'^T \times Y\f$ of size Ny x P' - * \param[in] interceptFlag Flag. 
- * - True if it is required to compute an intercept term and P' = P + 1 - * - False otherwis, P' = P - * \return Status of the computations - */ - static services::Status compute(NumericTable & x, NumericTable & y, NumericTable & xtx, NumericTable & xty, bool interceptFlag); - -private: - static services::Status reduceResults(services::internal::Buffer & dst, size_t dstOffset, size_t dstStride, - const services::internal::Buffer & src, size_t srcOffset, size_t srcStride, size_t count); -}; - -} // namespace internal -} // namespace training -} // namespace normal_equations -} // namespace linear_model -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/linear_model/oneapi/linear_model_train_normeq_update_oneapi_impl.i b/cpp/daal/src/algorithms/linear_model/oneapi/linear_model_train_normeq_update_oneapi_impl.i deleted file mode 100644 index 293db00adf5..00000000000 --- a/cpp/daal/src/algorithms/linear_model/oneapi/linear_model_train_normeq_update_oneapi_impl.i +++ /dev/null @@ -1,270 +0,0 @@ -/* file: linear_model_train_normeq_update_oneapi_impl.i */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of common base classes for normal equations model training. 
-//-- -*/ - -#include "src/algorithms/linear_model/oneapi/linear_model_train_normeq_kernel_oneapi.h" -#include "src/sycl/blas_gpu.h" -#include "services/internal/execution_context.h" -#include "src/externals/service_profiler.h" -#include "src/algorithms/linear_model/oneapi/cl_kernel/reduce_results.cl" -#include "src/services/service_data_utils.h" - -namespace daal -{ -namespace algorithms -{ -namespace linear_model -{ -namespace normal_equations -{ -namespace training -{ -namespace internal -{ -using namespace daal::services::internal::sycl; - -template -services::Status UpdateKernelOneAPI::compute(NumericTable & xTable, NumericTable & yTable, NumericTable & xtx, NumericTable & xty, - bool interceptFlag) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(computeUpdate); - services::Status status; - - const size_t nRows = xTable.getNumberOfRows(); - const size_t nCols = xTable.getNumberOfColumns(); - const size_t nResponses = yTable.getNumberOfColumns(); - const size_t nBetas = nCols + 1; - DAAL_ASSERT((interceptFlag ? (nBetas >= 0) : (nBetas >= 1))); - const size_t nBetasIntercept = (interceptFlag ? nBetas : (nBetas - 1)); - - BlockDescriptor xtxBlock; - BlockDescriptor xtyBlock; - - DAAL_CHECK_STATUS(status, xtx.getBlockOfRows(0, nBetasIntercept, ReadWriteMode::readWrite, xtxBlock)); - DAAL_CHECK_STATUS(status, xty.getBlockOfRows(0, nResponses, ReadWriteMode::readWrite, xtyBlock)); - - auto & context = services::internal::getDefaultContext(); - services::internal::Buffer xtxBuff = xtxBlock.getBuffer(); - services::internal::Buffer xtyBuff = xtyBlock.getBuffer(); - - DAAL_CHECK_STATUS(status, xtx.releaseBlockOfRows(xtxBlock)); - DAAL_CHECK_STATUS(status, xty.releaseBlockOfRows(xtyBlock)); - - const size_t nRowsPerBlock = 90000; - const size_t nBlocks = (nRows / nRowsPerBlock) + (bool(nRows % nRowsPerBlock) ? 
1 : 0); - - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, nBlocks, nRowsPerBlock); - - services::internal::Buffer sumXBuf; - services::internal::Buffer sumYBuf; - services::internal::Buffer onesBuf; - - if (interceptFlag) - { - const TypeIds::Id idType = TypeIds::id(); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, nBetasIntercept, nCols); - DAAL_OVERFLOW_CHECK_BY_ADDING(size_t, nBetasIntercept, (nBetasIntercept * nCols)); - DAAL_ASSERT(xtxBuff.size() >= (nBetasIntercept + nBetasIntercept * nCols)); - sumXBuf = xtxBuff.getSubBuffer(nBetasIntercept * nCols, nBetasIntercept, status); - DAAL_CHECK_STATUS_VAR(status); - - UniversalBuffer sumYBufTmp = context.allocate(idType, nResponses, status); - DAAL_CHECK_STATUS_VAR(status); - - sumYBuf = sumYBufTmp.get(); - context.fill(sumYBuf, algorithmFPType(0), status); - DAAL_CHECK_STATUS_VAR(status); - - UniversalBuffer onesBufTmp = context.allocate(idType, nRowsPerBlock, status); - DAAL_CHECK_STATUS_VAR(status); - onesBuf = onesBufTmp.get(); - - context.fill(onesBuf, algorithmFPType(1), status); - DAAL_CHECK_STATUS_VAR(status); - } - - for (size_t blockIdx = 0; blockIdx < nBlocks; ++blockIdx) - { - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, blockIdx, nRowsPerBlock); - const size_t startRow = blockIdx * nRowsPerBlock; - DAAL_OVERFLOW_CHECK_BY_ADDING(size_t, startRow, nRowsPerBlock); - const size_t endRow = ((startRow + nRowsPerBlock) > nRows) ? 
nRows : (startRow + nRowsPerBlock); - - BlockDescriptor xBlock; - BlockDescriptor yBlock; - - DAAL_CHECK_STATUS(status, xTable.getBlockOfRows(startRow, endRow - startRow, ReadWriteMode::readOnly, xBlock)); - DAAL_CHECK_STATUS(status, yTable.getBlockOfRows(startRow, endRow - startRow, ReadWriteMode::readOnly, yBlock)); - - const services::internal::Buffer xBuf = xBlock.getBuffer(); - const services::internal::Buffer yBuf = yBlock.getBuffer(); - - DAAL_ASSERT(endRow >= startRow); - const size_t xNRows = endRow - startRow; - const size_t xNCols = nCols; - const size_t xtxNCols = nBetasIntercept; - const size_t yNCols = nResponses; - { - DAAL_ITTNOTIFY_SCOPED_TASK(computeUpdate.syrkX); - - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, xNRows, xNCols); - DAAL_ASSERT(xBuf.size() >= xNRows * xNCols); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, xNCols, xNCols); - DAAL_ASSERT(xtxBuff.size() >= xNCols * xNCols); - - /* Compute XTX for each block and reduce to final result */ - status = BlasGpu::xsyrk(math::Layout::RowMajor, math::UpLo::Upper, math::Transpose::Trans, xNCols, xNRows, - algorithmFPType(1.0), xBuf, xNCols, 0, algorithmFPType(1.0), xtxBuff, xtxNCols, 0); - } - DAAL_CHECK_STATUS_VAR(status); - - { - DAAL_ITTNOTIFY_SCOPED_TASK(computeUpdate.gemmXY); - - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, xNRows, xNCols); - DAAL_ASSERT(xBuf.size() >= xNRows * xNCols); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, yNCols, xNCols); - DAAL_ASSERT(xtyBuff.size() >= yNCols * xNCols); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, xNRows, yNCols); - DAAL_ASSERT(yBuf.size() >= xNRows * yNCols); - - /* Compute XTY (in real YTX) for each block and reduce to final result*/ - status = - BlasGpu::xgemm(math::Layout::RowMajor, math::Transpose::Trans, math::Transpose::NoTrans, yNCols, xNCols, xNRows, - algorithmFPType(1.0), yBuf, yNCols, 0, xBuf, xNCols, 0, algorithmFPType(1.0), xtyBuff, xtxNCols, 0); - } - DAAL_CHECK_STATUS_VAR(status); - - if (interceptFlag) - { - { - 
DAAL_ITTNOTIFY_SCOPED_TASK(computeUpdate.gemm1X); - - DAAL_ASSERT(onesBuf.size() >= xNRows); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, xNCols, xNRows); - DAAL_ASSERT(xBuf.size() >= xNCols * xNRows); - DAAL_ASSERT(sumXBuf.size() >= xNCols); - - /* Compute reduce X in columns for each block and reduce it to final result*/ - status = BlasGpu::xgemm(math::Layout::RowMajor, math::Transpose::NoTrans, math::Transpose::NoTrans, 1, xNCols, - xNRows, algorithmFPType(1.0), onesBuf, nRowsPerBlock, 0, xBuf, xNCols, 0, - algorithmFPType(1.0), sumXBuf, xNCols, 0); - } - DAAL_CHECK_STATUS_VAR(status); - - { - DAAL_ITTNOTIFY_SCOPED_TASK(computeUpdate.gemm1Y); - - DAAL_ASSERT(onesBuf.size() >= xNRows); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, yNCols, xNRows); - DAAL_ASSERT(yBuf.size() >= yNCols * xNRows); - DAAL_ASSERT(sumYBuf.size() >= yNCols); - - /* Compute reduce Y in columns for each block and reduce it to final result*/ - status = BlasGpu::xgemm(math::Layout::RowMajor, math::Transpose::NoTrans, math::Transpose::NoTrans, 1, yNCols, - xNRows, algorithmFPType(1.0), onesBuf, nRowsPerBlock, 0, yBuf, yNCols, 0, - algorithmFPType(1.0), sumYBuf, yNCols, 0); - } - DAAL_CHECK_STATUS_VAR(status); - } - - DAAL_CHECK_STATUS(status, xTable.releaseBlockOfRows(xBlock)); - DAAL_CHECK_STATUS(status, yTable.releaseBlockOfRows(yBlock)); - } - - if (interceptFlag) - { - DAAL_ITTNOTIFY_SCOPED_TASK(computeUpdate.copyResults); - - algorithmFPType nrowsVal = static_cast(nRows); - const services::internal::Buffer nrowsBuf(&nrowsVal, 1, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_CHECK_STATUS(status, reduceResults(sumXBuf, nCols, 1, nrowsBuf, 0, 1, 1)); - - DAAL_CHECK_STATUS(status, reduceResults(xtyBuff, nCols, nBetasIntercept, sumYBuf, 0, 1, nResponses)); - } - - return status; -} - -template -services::Status UpdateKernelOneAPI::reduceResults(services::internal::Buffer & dst, size_t dstOffset, - size_t dstStride, const services::internal::Buffer & src, - size_t srcOffset, size_t 
srcStride, size_t count) -{ - services::Status status; - - ExecutionContextIface & ctx = services::internal::getDefaultContext(); - ClKernelFactoryIface & factory = ctx.getClKernelFactory(); - - const services::String options = getKeyFPType(); - services::String cachekey("__daal_algorithms_linear_model_copy_"); - cachekey.add(options); - factory.build(ExecutionTargetIds::device, cachekey.c_str(), clKernelCopy, options.c_str(), status); - DAAL_CHECK_STATUS_VAR(status); - - const char * const kernelName = "reduceResults"; - KernelPtr kernel = factory.getKernel(kernelName, status); - DAAL_CHECK_STATUS_VAR(status); - - KernelArguments args(6, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_ASSERT(count <= services::internal::MaxVal::get()); - DAAL_ASSERT(count >= 1); - - DAAL_ASSERT(dstStride <= services::internal::MaxVal::get()); - DAAL_ASSERT(dstOffset <= services::internal::MaxVal::get()); - - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(uint32_t, dstStride, (count - 1)); - DAAL_OVERFLOW_CHECK_BY_ADDING(uint32_t, dstOffset, (dstStride * (count - 1))); - DAAL_ASSERT(dst.size() >= (dstStride * (count - 1) + dstOffset)); - - args.set(0, dst, AccessModeIds::write); - args.set(1, static_cast(dstOffset)); - args.set(2, static_cast(dstStride)); - - DAAL_ASSERT(srcStride <= services::internal::MaxVal::get()); - DAAL_ASSERT(srcOffset <= services::internal::MaxVal::get()); - - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(uint32_t, srcStride, (count - 1)); - DAAL_OVERFLOW_CHECK_BY_ADDING(uint32_t, srcOffset, (srcStride * (count - 1))); - DAAL_ASSERT(src.size() >= (srcStride * (count - 1) + srcOffset)); - - args.set(3, src, AccessModeIds::read); - args.set(4, static_cast(srcOffset)); - args.set(5, static_cast(srcStride)); - - KernelRange range(count); - - ctx.run(range, kernel, args, status); - - return status; -} - -} // namespace internal -} // namespace training -} // namespace normal_equations -} // namespace linear_model -} // namespace algorithms -} // namespace daal diff --git 
a/cpp/daal/src/algorithms/linear_regression/BUILD b/cpp/daal/src/algorithms/linear_regression/BUILD index 0fa3f657e3e..fd8621c6bfa 100644 --- a/cpp/daal/src/algorithms/linear_regression/BUILD +++ b/cpp/daal/src/algorithms/linear_regression/BUILD @@ -4,10 +4,9 @@ load("@onedal//dev/bazel:daal.bzl", "daal_module") daal_module( name = "kernel", auto = True, - opencl = True, deps = [ "@onedal//cpp/daal:core", - "@onedal//cpp/daal:sycl", + "@onedal//cpp/daal:engines", "@onedal//cpp/daal/src/algorithms/regression:kernel", "@onedal//cpp/daal/src/algorithms/linear_model:kernel", ], diff --git a/cpp/daal/src/algorithms/linear_regression/linear_regression_ne_model_fpt.cpp b/cpp/daal/src/algorithms/linear_regression/linear_regression_ne_model_fpt.cpp index 38ca129acd7..a07ad42acdb 100644 --- a/cpp/daal/src/algorithms/linear_regression/linear_regression_ne_model_fpt.cpp +++ b/cpp/daal/src/algorithms/linear_regression/linear_regression_ne_model_fpt.cpp @@ -23,8 +23,6 @@ #include "src/algorithms/linear_regression/linear_regression_ne_model_impl.h" #include "data_management/data/homogen_numeric_table.h" -#include "data_management/data/internal/numeric_table_sycl_homogen.h" -#include "services/internal/execution_context.h" namespace daal { @@ -35,7 +33,6 @@ namespace linear_regression namespace internal { using namespace daal::data_management; -using daal::data_management::internal::SyclHomogenNumericTable; /** * Constructs the linear regression model for the normal equations method @@ -54,23 +51,10 @@ ModelNormEqInternal::ModelNormEqInternal(size_t featnum, size_t nrhs, const line dimWithoutBeta--; } - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - - if (deviceInfo.isCpu) - { - _xtxTable = HomogenNumericTable::create(dimWithoutBeta, dimWithoutBeta, NumericTable::doAllocate, 0, &st); - if (!st) return; - _xtyTable = HomogenNumericTable::create(dimWithoutBeta, nrhs, NumericTable::doAllocate, 0, &st); - if (!st) return; - } 
- else - { - _xtxTable = SyclHomogenNumericTable::create(dimWithoutBeta, dimWithoutBeta, NumericTable::doAllocate, 0, &st); - if (!st) return; - _xtyTable = SyclHomogenNumericTable::create(dimWithoutBeta, nrhs, NumericTable::doAllocate, 0, &st); - if (!st) return; - } + _xtxTable = HomogenNumericTable::create(dimWithoutBeta, dimWithoutBeta, NumericTable::doAllocate, 0, &st); + if (!st) return; + _xtyTable = HomogenNumericTable::create(dimWithoutBeta, nrhs, NumericTable::doAllocate, 0, &st); + if (!st) return; } template ModelNormEqInternal::ModelNormEqInternal(size_t featnum, size_t nrhs, const linear_regression::Parameter & par, DAAL_FPTYPE dummy, diff --git a/cpp/daal/src/algorithms/linear_regression/linear_regression_train_container.h b/cpp/daal/src/algorithms/linear_regression/linear_regression_train_container.h index 13717120396..01afe340db3 100644 --- a/cpp/daal/src/algorithms/linear_regression/linear_regression_train_container.h +++ b/cpp/daal/src/algorithms/linear_regression/linear_regression_train_container.h @@ -32,9 +32,6 @@ #include "algorithms/linear_regression/linear_regression_ne_model.h" #include "algorithms/linear_regression/linear_regression_qr_model.h" #include "src/data_management/service_numeric_table.h" -#include "services/internal/execution_context.h" - -#include "src/algorithms/linear_regression/oneapi/linear_regression_train_kernel_oneapi.h" namespace daal { @@ -55,17 +52,7 @@ using namespace daal::internal; template BatchContainer::BatchContainer(Environment::env * daalEnv) { - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - - if ((method == training::normEqDense) && (!deviceInfo.isCpu)) - { - __DAAL_INITIALIZE_KERNELS_SYCL(internal::BatchKernelOneAPI, algorithmFPType, training::normEqDense); - } - else - { - __DAAL_INITIALIZE_KERNELS(internal::BatchKernel, algorithmFPType, method); - } + __DAAL_INITIALIZE_KERNELS(internal::BatchKernel, algorithmFPType, method); } template @@ 
-91,25 +78,12 @@ Status BatchContainer::compute() Environment::env & env = *_env; - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - if (method == training::normEqDense) { linear_regression::ModelNormEqPtr m = linear_regression::ModelNormEq::cast(result->get(model)); - if (deviceInfo.isCpu) - { - __DAAL_CALL_KERNEL(env, internal::BatchKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, training::normEqDense), compute, - *(input->get(data)), *(input->get(dependentVariables)), *(m->getXTXTable()), *(m->getXTYTable()), *(m->getBeta()), - par->interceptFlag); - } - else - { - __DAAL_CALL_KERNEL_SYCL(env, internal::BatchKernelOneAPI, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, training::normEqDense), compute, - *(input->get(data)), *(input->get(dependentVariables)), *(m->getXTXTable()), *(m->getXTYTable()), *(m->getBeta()), - par->interceptFlag); - } + __DAAL_CALL_KERNEL(env, internal::BatchKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, training::normEqDense), compute, *(input->get(data)), + *(input->get(dependentVariables)), *(m->getXTXTable()), *(m->getXTYTable()), *(m->getBeta()), par->interceptFlag); } else { @@ -127,17 +101,7 @@ Status BatchContainer::compute() template OnlineContainer::OnlineContainer(Environment::env * daalEnv) { - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - - if ((method == training::normEqDense) && (!deviceInfo.isCpu)) - { - __DAAL_INITIALIZE_KERNELS_SYCL(internal::OnlineKernelOneAPI, algorithmFPType, training::normEqDense); - } - else - { - __DAAL_INITIALIZE_KERNELS(internal::OnlineKernel, algorithmFPType, method); - } + __DAAL_INITIALIZE_KERNELS(internal::OnlineKernel, algorithmFPType, method); } template @@ -163,24 +127,12 @@ Status OnlineContainer::compute() Environment::env & env = *_env; - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - if (method == 
training::normEqDense) { linear_regression::ModelNormEqPtr m = linear_regression::ModelNormEq::cast(partialResult->get(training::partialModel)); - if (deviceInfo.isCpu) - { - __DAAL_CALL_KERNEL(env, internal::OnlineKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, training::normEqDense), compute, - *(input->get(data)), *(input->get(dependentVariables)), *(m->getXTXTable()), *(m->getXTYTable()), par->interceptFlag); - } - else - { - __DAAL_CALL_KERNEL_SYCL(env, internal::OnlineKernelOneAPI, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, training::normEqDense), compute, - *(input->get(data)), *(input->get(dependentVariables)), *(m->getXTXTable()), *(m->getXTYTable()), - par->interceptFlag); - } + __DAAL_CALL_KERNEL(env, internal::OnlineKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, training::normEqDense), compute, *(input->get(data)), + *(input->get(dependentVariables)), *(m->getXTXTable()), *(m->getXTYTable()), par->interceptFlag); } else { @@ -207,26 +159,13 @@ Status OnlineContainer::finalizeCompute() Parameter * par = static_cast(_par); Environment::env & env = *_env; - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - if (method == training::normEqDense) { linear_regression::ModelNormEqPtr pm = linear_regression::ModelNormEq::cast(partialResult->get(training::partialModel)); linear_regression::ModelNormEqPtr m = linear_regression::ModelNormEq::cast(result->get(training::model)); - if (deviceInfo.isCpu) - { - __DAAL_CALL_KERNEL(env, internal::OnlineKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, training::normEqDense), finalizeCompute, - *(pm->getXTXTable()), *(pm->getXTYTable()), *(m->getXTXTable()), *(m->getXTYTable()), *(m->getBeta()), - par->interceptFlag); - } - else - { - __DAAL_CALL_KERNEL_SYCL(env, internal::OnlineKernelOneAPI, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, training::normEqDense), - finalizeCompute, *(pm->getXTXTable()), *(pm->getXTYTable()), *(m->getXTXTable()), *(m->getXTYTable()), - 
*(m->getBeta()), par->interceptFlag); - } + __DAAL_CALL_KERNEL(env, internal::OnlineKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, training::normEqDense), finalizeCompute, + *(pm->getXTXTable()), *(pm->getXTYTable()), *(m->getXTXTable()), *(m->getXTYTable()), *(m->getBeta()), par->interceptFlag); } else { diff --git a/cpp/daal/src/algorithms/linear_regression/linear_regression_train_dense_normeq_batch_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/linear_regression/linear_regression_train_dense_normeq_batch_fpt_dispatcher.cpp index 6dababfe722..2a66ba0cddb 100644 --- a/cpp/daal/src/algorithms/linear_regression/linear_regression_train_dense_normeq_batch_fpt_dispatcher.cpp +++ b/cpp/daal/src/algorithms/linear_regression/linear_regression_train_dense_normeq_batch_fpt_dispatcher.cpp @@ -27,7 +27,7 @@ namespace daal { namespace algorithms { -__DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL(linear_regression::training::BatchContainer, batch, DAAL_FPTYPE, linear_regression::training::normEqDense) +__DAAL_INSTANTIATE_DISPATCH_CONTAINER(linear_regression::training::BatchContainer, batch, DAAL_FPTYPE, linear_regression::training::normEqDense) } // namespace algorithms } // namespace daal diff --git a/cpp/daal/src/algorithms/linear_regression/linear_regression_train_dense_normeq_batch_oneapi_fpt.cpp b/cpp/daal/src/algorithms/linear_regression/linear_regression_train_dense_normeq_batch_oneapi_fpt.cpp deleted file mode 100644 index d962990fde3..00000000000 --- a/cpp/daal/src/algorithms/linear_regression/linear_regression_train_dense_normeq_batch_oneapi_fpt.cpp +++ /dev/null @@ -1,44 +0,0 @@ -/* file: linear_regression_train_dense_normeq_batch_oneapi_fpt.cpp */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of linear regression training functions for the method -// of normal equations for GPU. -//-- -*/ - -#include "src/algorithms/linear_regression/oneapi/linear_regression_train_kernel_oneapi.h" -#include "src/algorithms/linear_regression/oneapi/linear_regression_train_dense_normeq_oneapi_impl.i" - -namespace daal -{ -namespace algorithms -{ -namespace linear_regression -{ -namespace training -{ -namespace internal -{ -template class BatchKernelOneAPI; - -} // namespace internal -} // namespace training -} // namespace linear_regression -} // namespace algorithms -} // namespace daal diff --git a/cpp/daal/src/algorithms/linear_regression/linear_regression_train_dense_normeq_helper_oneapi_fpt.cpp b/cpp/daal/src/algorithms/linear_regression/linear_regression_train_dense_normeq_helper_oneapi_fpt.cpp deleted file mode 100644 index d6b1ae4685d..00000000000 --- a/cpp/daal/src/algorithms/linear_regression/linear_regression_train_dense_normeq_helper_oneapi_fpt.cpp +++ /dev/null @@ -1,44 +0,0 @@ -/* file: linear_regression_train_dense_normeq_helper_oneapi_fpt.cpp */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of linear regression training functions for the method -// of normal equations. -//-- -*/ - -#include "src/algorithms/linear_regression/oneapi/linear_regression_train_kernel_oneapi.h" -#include "src/algorithms/linear_regression/oneapi/linear_regression_train_dense_normeq_helper_oneapi_impl.i" - -namespace daal -{ -namespace algorithms -{ -namespace linear_regression -{ -namespace training -{ -namespace internal -{ -template class KernelHelperOneAPI; - -} // namespace internal -} // namespace training -} // namespace linear_regression -} // namespace algorithms -} // namespace daal diff --git a/cpp/daal/src/algorithms/linear_regression/linear_regression_train_dense_normeq_online_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/linear_regression/linear_regression_train_dense_normeq_online_fpt_dispatcher.cpp index f005b909ba4..6062d9497b8 100644 --- a/cpp/daal/src/algorithms/linear_regression/linear_regression_train_dense_normeq_online_fpt_dispatcher.cpp +++ b/cpp/daal/src/algorithms/linear_regression/linear_regression_train_dense_normeq_online_fpt_dispatcher.cpp @@ -27,7 +27,6 @@ namespace daal { namespace algorithms { -__DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL(linear_regression::training::OnlineContainer, online, DAAL_FPTYPE, - linear_regression::training::normEqDense) +__DAAL_INSTANTIATE_DISPATCH_CONTAINER(linear_regression::training::OnlineContainer, online, DAAL_FPTYPE, linear_regression::training::normEqDense) } // namespace algorithms } // 
namespace daal diff --git a/cpp/daal/src/algorithms/linear_regression/linear_regression_train_dense_normeq_online_oneapi_fpt.cpp b/cpp/daal/src/algorithms/linear_regression/linear_regression_train_dense_normeq_online_oneapi_fpt.cpp deleted file mode 100644 index 06c6da43c8f..00000000000 --- a/cpp/daal/src/algorithms/linear_regression/linear_regression_train_dense_normeq_online_oneapi_fpt.cpp +++ /dev/null @@ -1,44 +0,0 @@ -/* file: linear_regression_train_dense_normeq_online_oneapi_fpt.cpp */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of linear regression training functions for the method -// of normal equations for GPU in online compute mode. 
-//-- -*/ - -#include "src/algorithms/linear_regression/oneapi/linear_regression_train_kernel_oneapi.h" -#include "src/algorithms/linear_regression/oneapi/linear_regression_train_dense_normeq_oneapi_impl.i" - -namespace daal -{ -namespace algorithms -{ -namespace linear_regression -{ -namespace training -{ -namespace internal -{ -template class DAAL_EXPORT OnlineKernelOneAPI; - -} // namespace internal -} // namespace training -} // namespace linear_regression -} // namespace algorithms -} // namespace daal diff --git a/cpp/daal/src/algorithms/linear_regression/oneapi/cl_kernel/helper_beta_copy.cl b/cpp/daal/src/algorithms/linear_regression/oneapi/cl_kernel/helper_beta_copy.cl deleted file mode 100644 index c36b7a947de..00000000000 --- a/cpp/daal/src/algorithms/linear_regression/oneapi/cl_kernel/helper_beta_copy.cl +++ /dev/null @@ -1,53 +0,0 @@ -/* file: helper_beta_copy.cl */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of Linead Regression OpenCL kernels. 
-//-- -*/ - -#ifndef __HELPER_BETA_COPY_CL__ -#define __HELPER_BETA_COPY_CL__ - -#include - -#define DECLARE_SOURCE(name, src) static const char * name = #src; - -DECLARE_SOURCE( - clKernelHelperBetaCopy, - - __kernel void copyBeta(const __global algorithmFPType * src, uint nCols, uint nColsSrc, __global algorithmFPType * dst, uint intercept) { - const uint idxY = get_global_id(0); - const uint idxX = get_global_id(1); - - if (idxX == 0) - { - if (intercept == 1) - { - dst[idxY * nCols] = src[idxY * nColsSrc + nColsSrc - 1]; - } - } - else - { - dst[idxY * nCols + idxX] = src[idxY * nColsSrc + idxX - 1]; - } - } - -); - -#endif // __HELPER_BETA_COPY_CL__ diff --git a/cpp/daal/src/algorithms/linear_regression/oneapi/linear_regression_train_dense_normeq_helper_oneapi_impl.i b/cpp/daal/src/algorithms/linear_regression/oneapi/linear_regression_train_dense_normeq_helper_oneapi_impl.i deleted file mode 100644 index fc42556ff9f..00000000000 --- a/cpp/daal/src/algorithms/linear_regression/oneapi/linear_regression_train_dense_normeq_helper_oneapi_impl.i +++ /dev/null @@ -1,107 +0,0 @@ -/* file: linear_regression_train_dense_normeq_helper_oneapi_impl.i */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -/* -//++ -// Implementation of auxiliary functions for linear regression -// Normal Equations (normEqDense) method. -//-- -*/ - -#ifndef __LINEAR_REGRESSION_TRAIN_DENSE_NORMEQ_HELPER_ONEAPI_IMPL_I__ -#define __LINEAR_REGRESSION_TRAIN_DENSE_NORMEQ_HELPER_ONEAPI_IMPL_I__ - -#include "src/algorithms/linear_regression/oneapi/linear_regression_train_kernel_oneapi.h" -#include "services/internal/execution_context.h" -#include "src/algorithms/linear_regression/oneapi/cl_kernel/helper_beta_copy.cl" -#include "src/services/service_data_utils.h" - -namespace daal -{ -namespace algorithms -{ -namespace linear_regression -{ -namespace training -{ -namespace internal -{ -using namespace daal::services::internal::sycl; - -template -services::Status KernelHelperOneAPI::computeBetasImpl(const size_t p, services::internal::Buffer & a, - const size_t ny, services::internal::Buffer & b, - const bool inteceptFlag) const -{ - return linear_model::normal_equations::training::internal::FinalizeKernelOneAPI::solveSystem(p, a, ny, b); -} - -template -services::Status KernelHelperOneAPI::copyBetaToResult(const services::internal::Buffer & betaTmp, - services::internal::Buffer & betaRes, const size_t nBetas, - const size_t nResponses, const bool interceptFlag) const -{ - services::Status status; - - const size_t nBetasIntercept = interceptFlag ? nBetas : (nBetas - 1); - const size_t intercept = interceptFlag ? 
1 : 0; - - ExecutionContextIface & ctx = services::internal::getDefaultContext(); - ClKernelFactoryIface & factory = ctx.getClKernelFactory(); - - const services::String options = getKeyFPType(); - services::String cachekey("__daal_algorithms_linear_regression_training_helper_"); - cachekey.add(options); - - factory.build(ExecutionTargetIds::device, cachekey.c_str(), clKernelHelperBetaCopy, options.c_str(), status); - DAAL_CHECK_STATUS_VAR(status); - - const char * const kernelName = "copyBeta"; - KernelPtr kernel = factory.getKernel(kernelName, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_ASSERT(nBetas <= services::internal::MaxVal::get()); - DAAL_ASSERT(nBetasIntercept <= services::internal::MaxVal::get()); - DAAL_ASSERT(intercept <= services::internal::MaxVal::get()); - DAAL_ASSERT(nResponses <= services::internal::MaxVal::get()); - - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(uint32_t, nResponses, nBetasIntercept); - DAAL_ASSERT(betaTmp.size() >= nResponses * nBetasIntercept); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(uint32_t, nResponses, nBetas); - DAAL_ASSERT(betaRes.size() >= nResponses * nBetas); - - KernelArguments args(5, status); - args.set(0, betaTmp, AccessModeIds::read); - args.set(1, static_cast(nBetas)); - args.set(2, static_cast(nBetasIntercept)); - args.set(3, betaRes, AccessModeIds::write); - args.set(4, static_cast(intercept)); - - KernelRange range(nResponses, nBetas); - - ctx.run(range, kernel, args, status); - - return status; -} - -} // namespace internal -} // namespace training -} // namespace linear_regression -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/linear_regression/oneapi/linear_regression_train_dense_normeq_oneapi_impl.i b/cpp/daal/src/algorithms/linear_regression/oneapi/linear_regression_train_dense_normeq_oneapi_impl.i deleted file mode 100644 index 5ac9c0e38de..00000000000 --- a/cpp/daal/src/algorithms/linear_regression/oneapi/linear_regression_train_dense_normeq_oneapi_impl.i 
+++ /dev/null @@ -1,70 +0,0 @@ -/* file: linear_regression_train_dense_normeq_oneapi_impl.i */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of auxiliary functions for linear regression -// Normal Equations (normEqDense) method. -//-- -*/ - -#ifndef __LINEAR_REGRESSION_TRAIN_DENSE_NORMEQ_ONEAPI_IMPL_I__ -#define __LINEAR_REGRESSION_TRAIN_DENSE_NORMEQ_ONEAPI_IMPL_I__ - -#include "src/algorithms/linear_regression/oneapi/linear_regression_train_kernel_oneapi.h" - -namespace daal -{ -namespace algorithms -{ -namespace linear_regression -{ -namespace training -{ -namespace internal -{ -template -services::Status BatchKernelOneAPI::compute(NumericTable & x, NumericTable & y, NumericTable & xtx, - NumericTable & xty, NumericTable & beta, bool interceptFlag) const -{ - services::Status status = UpdateKernelType::compute(x, y, xtx, xty, interceptFlag); - if (status) status = FinalizeKernelType::compute(xtx, xty, xtx, xty, beta, interceptFlag, KernelHelperOneAPI()); - return status; -} - -template -services::Status OnlineKernelOneAPI::compute(NumericTable & x, NumericTable & y, NumericTable & xtx, - NumericTable & xty, bool interceptFlag) const -{ - return UpdateKernelType::compute(x, y, xtx, xty, interceptFlag); -} - -template -services::Status 
OnlineKernelOneAPI::finalizeCompute(NumericTable & xtx, NumericTable & xty, - NumericTable & xtxFinal, NumericTable & xtyFinal, - NumericTable & beta, bool interceptFlag) const -{ - return FinalizeKernelType::compute(xtx, xty, xtxFinal, xtyFinal, beta, interceptFlag, KernelHelperOneAPI()); -} - -} // namespace internal -} // namespace training -} // namespace linear_regression -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/linear_regression/oneapi/linear_regression_train_kernel_oneapi.h b/cpp/daal/src/algorithms/linear_regression/oneapi/linear_regression_train_kernel_oneapi.h deleted file mode 100644 index a3ab43e3d64..00000000000 --- a/cpp/daal/src/algorithms/linear_regression/oneapi/linear_regression_train_kernel_oneapi.h +++ /dev/null @@ -1,92 +0,0 @@ -/* file: linear_regression_train_kernel_oneapi.h */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Declaration of structure containing kernels for linear regression -// training. 
-//-- -*/ - -#ifndef __LINEAR_REGRESSION_TRAIN_KERNEL_ONEAPI_H__ -#define __LINEAR_REGRESSION_TRAIN_KERNEL_ONEAPI_H__ - -#include "data_management/data/numeric_table.h" -#include "algorithms/algorithm_base_common.h" -#include "algorithms/linear_regression/linear_regression_training_types.h" -#include "src/algorithms/linear_model/oneapi/linear_model_train_normeq_kernel_oneapi.h" -#include "algorithms/algorithm_kernel.h" - -namespace daal -{ -namespace algorithms -{ -namespace linear_regression -{ -namespace training -{ -namespace internal -{ -template -class BatchKernelOneAPI -{}; - -template -class KernelHelperOneAPI : public linear_model::normal_equations::training::internal::KernelHelperOneAPIIface -{ -public: - services::Status computeBetasImpl(const size_t p, services::internal::Buffer & a, const size_t ny, - services::internal::Buffer & b, const bool inteceptFlag) const; - services::Status copyBetaToResult(const services::internal::Buffer & betaTmp, - services::internal::Buffer & betaRes, const size_t nBetas, const size_t nResponses, - const bool interceptFlag) const; -}; - -template -class BatchKernelOneAPI : public daal::algorithms::Kernel -{ - typedef linear_model::normal_equations::training::internal::UpdateKernelOneAPI UpdateKernelType; - typedef linear_model::normal_equations::training::internal::FinalizeKernelOneAPI FinalizeKernelType; - -public: - services::Status compute(NumericTable & x, NumericTable & y, NumericTable & xtx, NumericTable & xty, NumericTable & beta, - bool interceptFlag) const; -}; - -template -class OnlineKernelOneAPI -{}; - -template -class OnlineKernelOneAPI : public daal::algorithms::Kernel -{ - typedef linear_model::normal_equations::training::internal::UpdateKernelOneAPI UpdateKernelType; - typedef linear_model::normal_equations::training::internal::FinalizeKernelOneAPI FinalizeKernelType; - -public: - services::Status compute(NumericTable & x, NumericTable & y, NumericTable & xtx, NumericTable & xty, bool interceptFlag) const; 
- services::Status finalizeCompute(NumericTable & xtx, NumericTable & xty, NumericTable & xtxFinal, NumericTable & xtyFinal, NumericTable & beta, - bool interceptFlag) const; -}; - -} // namespace internal -} // namespace training -} // namespace linear_regression -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/logistic_regression/BUILD b/cpp/daal/src/algorithms/logistic_regression/BUILD index 34bd6de4f24..356c57cd80c 100644 --- a/cpp/daal/src/algorithms/logistic_regression/BUILD +++ b/cpp/daal/src/algorithms/logistic_regression/BUILD @@ -4,10 +4,9 @@ load("@onedal//dev/bazel:daal.bzl", "daal_module") daal_module( name = "kernel", auto = True, - opencl = True, deps = [ "@onedal//cpp/daal:core", - "@onedal//cpp/daal:sycl", + "@onedal//cpp/daal:engines", "@onedal//cpp/daal/src/algorithms/classifier:kernel", "@onedal//cpp/daal/src/algorithms/objective_function:kernel", "@onedal//cpp/daal/src/algorithms/objective_function/logistic_loss:kernel", diff --git a/cpp/daal/src/algorithms/logistic_regression/logistic_regression_predict_container.h b/cpp/daal/src/algorithms/logistic_regression/logistic_regression_predict_container.h index 07651cc5213..bc0b40927d0 100644 --- a/cpp/daal/src/algorithms/logistic_regression/logistic_regression_predict_container.h +++ b/cpp/daal/src/algorithms/logistic_regression/logistic_regression_predict_container.h @@ -27,7 +27,6 @@ #include "algorithms/classifier/classifier_model.h" #include "src/services/service_algo_utils.h" #include "src/algorithms/logistic_regression/logistic_regression_predict_kernel.h" -#include "src/algorithms/logistic_regression/oneapi/logistic_regression_predict_kernel_oneapi.h" namespace daal { @@ -42,17 +41,7 @@ namespace interface2 template BatchContainer::BatchContainer(daal::services::Environment::env * daalEnv) : PredictionContainerIface() { - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - - if 
(deviceInfo.isCpu) - { - __DAAL_INITIALIZE_KERNELS(internal::PredictKernel, algorithmFPType, method); - } - else - { - __DAAL_INITIALIZE_KERNELS_SYCL(internal::PredictBatchKernelOneAPI, algorithmFPType, method); - } + __DAAL_INITIALIZE_KERNELS(internal::PredictKernel, algorithmFPType, method); } template @@ -79,19 +68,8 @@ services::Status BatchContainer::compute() daal::services::Environment::env & env = *_env; - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - - if (deviceInfo.isCpu) - { - __DAAL_CALL_KERNEL(env, internal::PredictKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, - daal::services::internal::hostApp(*input), a, m, par->nClasses, r, prob, logProb); - } - else - { - __DAAL_CALL_KERNEL_SYCL(env, internal::PredictBatchKernelOneAPI, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, - daal::services::internal::hostApp(*input), a, m, par->nClasses, r, prob, logProb); - } + __DAAL_CALL_KERNEL(env, internal::PredictKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, + daal::services::internal::hostApp(*input), a, m, par->nClasses, r, prob, logProb); } } // namespace interface2 diff --git a/cpp/daal/src/algorithms/logistic_regression/logistic_regression_predict_dense_default_batch_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/logistic_regression/logistic_regression_predict_dense_default_batch_fpt_dispatcher.cpp index 330be3e7079..5d677300d06 100644 --- a/cpp/daal/src/algorithms/logistic_regression/logistic_regression_predict_dense_default_batch_fpt_dispatcher.cpp +++ b/cpp/daal/src/algorithms/logistic_regression/logistic_regression_predict_dense_default_batch_fpt_dispatcher.cpp @@ -29,8 +29,8 @@ namespace daal { namespace algorithms { -__DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL(logistic_regression::prediction::BatchContainer, batch, DAAL_FPTYPE, - logistic_regression::prediction::defaultDense) 
+__DAAL_INSTANTIATE_DISPATCH_CONTAINER(logistic_regression::prediction::BatchContainer, batch, DAAL_FPTYPE, + logistic_regression::prediction::defaultDense) namespace logistic_regression { namespace prediction diff --git a/cpp/daal/src/algorithms/logistic_regression/logistic_regression_predict_dense_default_batch_oneapi_fpt.cpp b/cpp/daal/src/algorithms/logistic_regression/logistic_regression_predict_dense_default_batch_oneapi_fpt.cpp deleted file mode 100755 index 19ddd069a7e..00000000000 --- a/cpp/daal/src/algorithms/logistic_regression/logistic_regression_predict_dense_default_batch_oneapi_fpt.cpp +++ /dev/null @@ -1,42 +0,0 @@ -/* file: logistic_regression_predict_dense_default_batch_oneapi_fpt.cpp */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of prediction stage of logistic regression classification algorithm. 
-//-- -*/ - -#include "src/algorithms/logistic_regression/oneapi/logistic_regression_predict_kernel_oneapi.h" -#include "src/algorithms/logistic_regression/oneapi/logistic_regression_predict_dense_default_batch_oneapi_impl.i" - -namespace daal -{ -namespace algorithms -{ -namespace logistic_regression -{ -namespace prediction -{ -namespace internal -{ -template class PredictBatchKernelOneAPI; -} -} // namespace prediction -} // namespace logistic_regression -} // namespace algorithms -} // namespace daal diff --git a/cpp/daal/src/algorithms/logistic_regression/logistic_regression_train_container.h b/cpp/daal/src/algorithms/logistic_regression/logistic_regression_train_container.h index 047e470920e..206ce8cd772 100644 --- a/cpp/daal/src/algorithms/logistic_regression/logistic_regression_train_container.h +++ b/cpp/daal/src/algorithms/logistic_regression/logistic_regression_train_container.h @@ -32,8 +32,6 @@ #include "algorithms/optimization_solver/sgd/sgd_batch.h" #include "src/services/service_algo_utils.h" -#include "src/algorithms/logistic_regression/oneapi/logistic_regression_train_kernel_oneapi.h" - namespace daal { namespace algorithms @@ -47,17 +45,7 @@ namespace interface3 template BatchContainer::BatchContainer(daal::services::Environment::env * daalEnv) { - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - - if (deviceInfo.isCpu) - { - __DAAL_INITIALIZE_KERNELS(internal::TrainBatchKernel, algorithmFPType, method); - } - else - { - __DAAL_INITIALIZE_KERNELS_SYCL(internal::TrainBatchKernelOneAPI, algorithmFPType, method); - } + __DAAL_INITIALIZE_KERNELS(internal::TrainBatchKernel, algorithmFPType, method); } template @@ -77,19 +65,8 @@ services::Status BatchContainer::compute() const logistic_regression::training::Parameter * par = static_cast(_par); daal::services::Environment::env & env = *_env; - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = 
context.getInfoDevice(); - - if (deviceInfo.isCpu) - { - __DAAL_CALL_KERNEL(env, internal::TrainBatchKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, - daal::services::internal::getHostApp(*input), x, y, *m, *result, *par); - } - else - { - __DAAL_CALL_KERNEL_SYCL(env, internal::TrainBatchKernelOneAPI, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, - daal::services::internal::getHostApp(*input), x, y, *m, *result, *par); - } + __DAAL_CALL_KERNEL(env, internal::TrainBatchKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, + daal::services::internal::getHostApp(*input), x, y, *m, *result, *par); } template diff --git a/cpp/daal/src/algorithms/logistic_regression/logistic_regression_train_dense_default_batch_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/logistic_regression/logistic_regression_train_dense_default_batch_fpt_dispatcher.cpp index 085da0ef501..065549930c1 100644 --- a/cpp/daal/src/algorithms/logistic_regression/logistic_regression_train_dense_default_batch_fpt_dispatcher.cpp +++ b/cpp/daal/src/algorithms/logistic_regression/logistic_regression_train_dense_default_batch_fpt_dispatcher.cpp @@ -28,8 +28,7 @@ namespace daal { namespace algorithms { -__DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL(logistic_regression::training::BatchContainer, batch, DAAL_FPTYPE, - logistic_regression::training::defaultDense) +__DAAL_INSTANTIATE_DISPATCH_CONTAINER(logistic_regression::training::BatchContainer, batch, DAAL_FPTYPE, logistic_regression::training::defaultDense) namespace logistic_regression { diff --git a/cpp/daal/src/algorithms/logistic_regression/logistic_regression_train_dense_default_batch_oneapi_fpt.cpp b/cpp/daal/src/algorithms/logistic_regression/logistic_regression_train_dense_default_batch_oneapi_fpt.cpp deleted file mode 100644 index aab91e0d844..00000000000 --- a/cpp/daal/src/algorithms/logistic_regression/logistic_regression_train_dense_default_batch_oneapi_fpt.cpp +++ /dev/null @@ -1,43 +0,0 @@ -/* file: 
logistic_regression_train_dense_default_batch_oneapi_fpt.cpp */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of Logistic Regression training Batch Kernel for GPU. -//-- -*/ - -#include "src/algorithms/logistic_regression/oneapi/logistic_regression_train_kernel_oneapi.h" -#include "src/algorithms/logistic_regression/oneapi/logistic_regression_train_dense_default_oneapi_impl.i" - -namespace daal -{ -namespace algorithms -{ -namespace logistic_regression -{ -namespace training -{ -namespace internal -{ -template class TrainBatchKernelOneAPI; - -} // namespace internal -} // namespace training -} // namespace logistic_regression -} // namespace algorithms -} // namespace daal diff --git a/cpp/daal/src/algorithms/logistic_regression/logistic_regression_training_result_fpt.cpp b/cpp/daal/src/algorithms/logistic_regression/logistic_regression_training_result_fpt.cpp index b5cc45d0b4a..e4055ec298d 100644 --- a/cpp/daal/src/algorithms/logistic_regression/logistic_regression_training_result_fpt.cpp +++ b/cpp/daal/src/algorithms/logistic_regression/logistic_regression_training_result_fpt.cpp @@ -23,7 +23,7 @@ #include "algorithms/logistic_regression/logistic_regression_training_types.h" #include 
"src/algorithms/logistic_regression/logistic_regression_model_impl.h" -#include "data_management/data/internal/numeric_table_sycl_homogen.h" +#include "data_management/data/homogen_numeric_table.h" namespace daal { @@ -33,7 +33,6 @@ namespace logistic_regression { namespace internal { -using daal::data_management::internal::SyclHomogenNumericTable; template ModelImpl::ModelImpl(size_t nFeatures, bool interceptFlag, size_t nClasses, modelFPType dummy, services::Status * st) @@ -42,17 +41,7 @@ ModelImpl::ModelImpl(size_t nFeatures, bool interceptFlag, size_t nClasses, mode const size_t nRows = nClasses == 2 ? 1 : nClasses; const size_t nCols = nFeatures + 1; - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - - if (deviceInfo.isCpu) - { - _beta = data_management::HomogenNumericTable::create(nCols, nRows, data_management::NumericTable::doAllocate, 0, st); - } - else - { - _beta = SyclHomogenNumericTable::create(nCols, nRows, data_management::NumericTable::doAllocate, 0, st); - } + _beta = data_management::HomogenNumericTable::create(nCols, nRows, data_management::NumericTable::doAllocate, 0, st); } template ModelImpl::ModelImpl(size_t nFeatures, bool interceptFlag, size_t nClasses, DAAL_FPTYPE dummy, services::Status * st); diff --git a/cpp/daal/src/algorithms/logistic_regression/oneapi/cl_kernel/logistic_regression_dense_default.cl b/cpp/daal/src/algorithms/logistic_regression/oneapi/cl_kernel/logistic_regression_dense_default.cl deleted file mode 100644 index 3ed520768e7..00000000000 --- a/cpp/daal/src/algorithms/logistic_regression/oneapi/cl_kernel/logistic_regression_dense_default.cl +++ /dev/null @@ -1,65 +0,0 @@ -/* file: logistic_regression_dense_default.cl */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the 
License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of Logistic Regression OpenCL kernels. -//-- -*/ - -#ifndef __LOGISTIC_REGRESSION_KERNELS_CL__ -#define __LOGISTIC_REGRESSION_KERNELS_CL__ - -#include - -#define DECLARE_SOURCE_DAAL(name, src) static const char *(name) = #src; - -DECLARE_SOURCE_DAAL( - clKernelLogisticResgression, - - __kernel void heaviside(const __global algorithmFPType * const x, __global algorithmFPType * result) { - const uint i = get_global_id(0); - - const algorithmFPType zero = (algorithmFPType)0; - const algorithmFPType one = (algorithmFPType)1; - - result[i] = x[i] >= zero ? 
one : zero; - } - - __kernel void argMax(const __global algorithmFPType * const x, __global algorithmFPType * result, const uint p) { - const uint i = get_global_id(0); - - algorithmFPType maxVal = x[i * p + 0]; - uint maxIdx = 0; - - for (uint j = 1; j < p; j++) - { - if (maxVal < x[i * p + j]) - { - maxVal = x[i * p + j]; - maxIdx = j; - } - } - - result[i] = (algorithmFPType)maxIdx; - } - -); - -#undef DECLARE_SOURCE_DAAL - -#endif diff --git a/cpp/daal/src/algorithms/logistic_regression/oneapi/logistic_regression_predict_dense_default_batch_oneapi_impl.i b/cpp/daal/src/algorithms/logistic_regression/oneapi/logistic_regression_predict_dense_default_batch_oneapi_impl.i deleted file mode 100644 index c170e6cba78..00000000000 --- a/cpp/daal/src/algorithms/logistic_regression/oneapi/logistic_regression_predict_dense_default_batch_oneapi_impl.i +++ /dev/null @@ -1,258 +0,0 @@ -/* file: logistic_regression_predict_dense_default_batch_oneapi_impl.i */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -/* -//++ -// Implemetation prediction of logistic regression on the GPU -//-- -*/ - -#ifndef __LOGISTIC_REGRESSION_PREDICT_DENSE_DEFAULT_BATCH_ONEAPI_IMPL_I__ -#define __LOGISTIC_REGRESSION_PREDICT_DENSE_DEFAULT_BATCH_ONEAPI_IMPL_I__ - -#include "src/algorithms/logistic_regression/logistic_regression_model_impl.h" -#include "src/algorithms/logistic_regression/oneapi/cl_kernel/logistic_regression_dense_default.cl" - -#include "src/services/service_data_utils.h" - -namespace daal -{ -namespace algorithms -{ -namespace logistic_regression -{ -namespace prediction -{ -namespace internal -{ -using namespace daal::services::internal::sycl; - -// Heaviside step function -template -services::Status PredictBatchKernelOneAPI::heaviside(const services::internal::Buffer & x, - services::internal::Buffer & result, const uint32_t n) -{ - services::Status status; - - DAAL_CHECK(x.size() == n, services::ErrorIncorrectParameter); - DAAL_CHECK(result.size() == n, services::ErrorIncorrectParameter); - - ExecutionContextIface & ctx = services::internal::getDefaultContext(); - ClKernelFactoryIface & factory = ctx.getClKernelFactory(); - - const services::String options = getKeyFPType(); - services::String cachekey("__daal_algorithms_logistic_regression_prediction_"); - cachekey.add(options); - factory.build(ExecutionTargetIds::device, cachekey.c_str(), clKernelLogisticResgression, options.c_str(), status); - DAAL_CHECK_STATUS_VAR(status); - - const char * const kernelName = "heaviside"; - KernelPtr kernel = factory.getKernel(kernelName, status); - DAAL_CHECK_STATUS_VAR(status); - - KernelArguments args(2, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, x, AccessModeIds::read); - args.set(1, result, AccessModeIds::write); - - KernelRange range(n); - - ctx.run(range, kernel, args, status); - - return status; -} - -// Index max elements for each row -template -services::Status 
PredictBatchKernelOneAPI::argMax(const services::internal::Buffer & x, - services::internal::Buffer & result, const uint32_t n, - const uint32_t p) -{ - services::Status status; - - DAAL_CHECK(x.size() == n * p, services::ErrorIncorrectParameter); // overflow checked in compute() - DAAL_CHECK(result.size() == n, services::ErrorIncorrectParameter); - - ExecutionContextIface & ctx = services::internal::getDefaultContext(); - ClKernelFactoryIface & factory = ctx.getClKernelFactory(); - - const services::String options = getKeyFPType(); - services::String cachekey("__daal_algorithms_logistic_regression_prediction_"); - cachekey.add(options); - factory.build(ExecutionTargetIds::device, cachekey.c_str(), clKernelLogisticResgression, options.c_str(), status); - DAAL_CHECK_STATUS_VAR(status); - - const char * const kernelName = "argMax"; - KernelPtr kernel = factory.getKernel(kernelName, status); - DAAL_CHECK_STATUS_VAR(status); - - KernelArguments args(3, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, x, AccessModeIds::read); - args.set(1, result, AccessModeIds::write); - args.set(2, p); - - KernelRange range(n); - - ctx.run(range, kernel, args, status); - - return status; -} - -template -services::Status PredictBatchKernelOneAPI::compute(services::HostAppIface * pHostApp, NumericTable * x, - const logistic_regression::Model * m, size_t nClasses, - NumericTable * pRes, NumericTable * pProb, NumericTable * pLogProb) -{ - constexpr size_t maxInt32Value = static_cast(daal::services::internal::MaxVal::get()); - - services::Status status; - - auto & ctx = services::internal::getDefaultContext(); - - const daal::algorithms::logistic_regression::internal::ModelImpl * pModel = - static_cast(m); - - const size_t n = x->getNumberOfRows(); - const size_t p = x->getNumberOfColumns(); - - DAAL_CHECK(n <= maxInt32Value, services::ErrorIncorrectNumberOfRows); - DAAL_CHECK(p <= maxInt32Value, services::ErrorIncorrectNumberOfColumns); - DAAL_CHECK(nClasses <= maxInt32Value, 
services::ErrorIncorrectNumberOfClasses); - DAAL_OVERFLOW_CHECK_BY_ADDING(uint32_t, p, 1); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(uint32_t, n, (p + 1)); - - const bool isBinary = nClasses == 2; - - NumericTablePtr beta = pModel->getBeta(); - - // X - BlockDescriptor xBlock; - DAAL_CHECK_STATUS(status, x->getBlockOfRows(0, n, ReadWriteMode::readOnly, xBlock)); - const services::internal::Buffer xBuff = xBlock.getBuffer(); - - // Beta - DAAL_ASSERT(beta->getNumberOfRows() == (nClasses == 2) ? 1 : nClasses); - DAAL_ASSERT(beta->getNumberOfColumns() == p + 1); - - BlockDescriptor betaBlock; - DAAL_CHECK_STATUS(status, beta->getBlockOfRows(0, beta->getNumberOfRows(), ReadWriteMode::readOnly, betaBlock)); - const services::internal::Buffer betaBuff = betaBlock.getBuffer(); - - //compute - DAAL_CHECK_STATUS(status, HelperObjectiveFunction::lazyAllocate(_fUniversal, n * beta->getNumberOfRows())); - services::internal::Buffer fBuf = _fUniversal.get(); - - const uint32_t offset = uint32_t(1); - - if (isBinary) - { - DAAL_CHECK_STATUS(status, LogisticLoss::applyBeta(xBuff, betaBuff, fBuf, n, p, offset)); - DAAL_CHECK_STATUS(status, LogisticLoss::betaIntercept(betaBuff, fBuf, n)); - } - else - { - DAAL_CHECK_STATUS(status, CrossEntropyLoss::applyBeta(xBuff, betaBuff, fBuf, n, nClasses, p, p + 1, offset)); - DAAL_CHECK_STATUS(status, HelperObjectiveFunction::lazyAllocate(_oneVector, n)); - services::internal::Buffer oneVectorBuf = _oneVector.get(); - ctx.fill(_oneVector, 1.0, status); - - DAAL_CHECK_STATUS(status, CrossEntropyLoss::betaIntercept(oneVectorBuf, betaBuff, fBuf, n, nClasses, p + 1)); - } - - if (pProb || pLogProb) - { - // before transforming raw values to sigmoid and logarithm, predict labels - - NumericTable * pRaw = pProb ? 
pProb : pLogProb; - - DAAL_ASSERT(pRaw->getNumberOfRows() == n); - DAAL_ASSERT(pRaw->getNumberOfColumns() == nClasses); - - BlockDescriptor rawBlock; - DAAL_CHECK_STATUS(status, pRaw->getBlockOfRows(0, n, ReadWriteMode::writeOnly, rawBlock)); - services::internal::Buffer aRawBuff = rawBlock.getBuffer(); - - if (isBinary) - { - bool calculateInverse = true; - DAAL_CHECK_STATUS(status, LogisticLoss::sigmoids(fBuf, aRawBuff, n, calculateInverse)); - } - else - { - DAAL_CHECK_STATUS(status, CrossEntropyLoss::softmax(fBuf, aRawBuff, n, nClasses)); - } - - if (pLogProb) - { - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(uint32_t, n, nClasses); - - if (pProb) - { - DAAL_ASSERT(pLogProb->getNumberOfRows() == n); - DAAL_ASSERT(pLogProb->getNumberOfColumns() == nClasses); - - BlockDescriptor logProbBlock; - DAAL_CHECK_STATUS(status, pLogProb->getBlockOfRows(0, n, ReadWriteMode::writeOnly, logProbBlock)); - services::internal::Buffer logProbBuff = logProbBlock.getBuffer(); - - DAAL_CHECK_STATUS(status, math::vLog(aRawBuff, logProbBuff, n * nClasses)); - - DAAL_CHECK_STATUS(status, pLogProb->releaseBlockOfRows(logProbBlock)); - } - else - { - DAAL_CHECK_STATUS(status, math::vLog(aRawBuff, aRawBuff, n * nClasses)); - } - } - DAAL_CHECK_STATUS(status, pRaw->releaseBlockOfRows(rawBlock)); - } - - if (pRes) - { - DAAL_ASSERT(pRes->getNumberOfRows() == n); - DAAL_ASSERT(pRes->getNumberOfColumns() == 1); - - BlockDescriptor yBlock; - DAAL_CHECK_STATUS(status, pRes->getBlockOfRows(0, n, ReadWriteMode::writeOnly, yBlock)); - services::internal::Buffer yBuff = yBlock.getBuffer(); - - if (isBinary) - { - DAAL_CHECK_STATUS(status, heaviside(fBuf, yBuff, n)); - } - else - { - DAAL_CHECK_STATUS(status, argMax(fBuf, yBuff, n, nClasses)); - } - - DAAL_CHECK_STATUS(status, pRes->releaseBlockOfRows(yBlock)); - } - - DAAL_CHECK_STATUS(status, x->releaseBlockOfRows(xBlock)); - DAAL_CHECK_STATUS(status, beta->releaseBlockOfRows(betaBlock)); - - return status; -} - -} // namespace internal -} // 
namespace prediction -} // namespace logistic_regression -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/logistic_regression/oneapi/logistic_regression_predict_kernel_oneapi.h b/cpp/daal/src/algorithms/logistic_regression/oneapi/logistic_regression_predict_kernel_oneapi.h deleted file mode 100644 index 827254f61b7..00000000000 --- a/cpp/daal/src/algorithms/logistic_regression/oneapi/logistic_regression_predict_kernel_oneapi.h +++ /dev/null @@ -1,76 +0,0 @@ -/* file: logistic_regression_predict_kernel_oneapi.h */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Declaration of structure containing kernels for logistic regression -// training. 
-//-- -*/ - -#ifndef __LOGISTIC_REGRESSION_PREDICT_KERNEL_ONEAPI_H__ -#define __LOGISTIC_REGRESSION_PREDICT_KERNEL_ONEAPI_H__ - -#include "algorithms/logistic_regression/logistic_regression_training_types.h" -#include "algorithms/logistic_regression/logistic_regression_predict.h" -#include "src/algorithms/objective_function/common/oneapi/objective_function_utils_oneapi.h" -#include "src/algorithms/objective_function/logistic_loss/oneapi/logistic_loss_dense_default_kernel_oneapi.h" -#include "src/algorithms/objective_function/cross_entropy_loss/oneapi/cross_entropy_loss_dense_default_kernel_oneapi.h" - -namespace daal -{ -namespace algorithms -{ -namespace logistic_regression -{ -namespace prediction -{ -namespace internal -{ -template -class PredictBatchKernelOneAPI : public daal::algorithms::Kernel -{ -public: - using LogisticLoss = - optimization_solver::logistic_loss::internal::LogLossKernelOneAPI; - - using CrossEntropyLoss = optimization_solver::cross_entropy_loss::internal::CrossEntropyLossKernelOneAPI< - algorithmFPType, optimization_solver::cross_entropy_loss::Method::defaultDense>; - - using HelperObjectiveFunction = optimization_solver::objective_function::internal::HelperObjectiveFunction; - - services::Status compute(services::HostAppIface * pHost, NumericTable * x, const logistic_regression::Model * m, size_t nClasses, - NumericTable * pRes, NumericTable * pProbab, NumericTable * pLogProbab); - - static services::Status heaviside(const services::internal::Buffer & x, services::internal::Buffer & result, - const uint32_t n); - - static services::Status argMax(const services::internal::Buffer & x, services::internal::Buffer & result, - const uint32_t n, const uint32_t p); - -private: - services::internal::sycl::UniversalBuffer _fUniversal; - services::internal::sycl::UniversalBuffer _oneVector; -}; - -} // namespace internal -} // namespace prediction -} // namespace logistic_regression -} // namespace algorithms -} // namespace daal - -#endif diff --git 
a/cpp/daal/src/algorithms/logistic_regression/oneapi/logistic_regression_train_dense_default_oneapi_impl.i b/cpp/daal/src/algorithms/logistic_regression/oneapi/logistic_regression_train_dense_default_oneapi_impl.i deleted file mode 100644 index 5585fe3d54f..00000000000 --- a/cpp/daal/src/algorithms/logistic_regression/oneapi/logistic_regression_train_dense_default_oneapi_impl.i +++ /dev/null @@ -1,184 +0,0 @@ -/* file: logistic_regression_train_dense_default_oneapi_impl.i */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of auxiliary functions for logistic regression classification -// (defaultDense) method. 
-//-- -*/ - -#ifndef __LOGISTIC_REGRESSION_TRAIN_DENSE_DEFAULT_ONEAPI_IMPL_I__ -#define __LOGISTIC_REGRESSION_TRAIN_DENSE_DEFAULT_ONEAPI_IMPL_I__ - -#include "algorithms/optimization_solver/objective_function/logistic_loss_batch.h" -#include "algorithms/optimization_solver/objective_function/cross_entropy_loss_batch.h" -#include "data_management/data/internal/numeric_table_sycl_homogen.h" - -#include "src/services/service_data_utils.h" - -#include "src/externals/service_profiler.h" - -namespace daal -{ -namespace algorithms -{ -namespace logistic_regression -{ -namespace training -{ -namespace internal -{ -using namespace daal::algorithms::logistic_regression::training::internal; -using namespace daal::algorithms::optimization_solver; -using namespace daal::data_management; -using namespace daal::services::internal::sycl; - -template -services::Status TrainBatchKernelOneAPI::compute(const services::HostAppIfacePtr & pHost, const NumericTablePtr & x, - const NumericTablePtr & y, logistic_regression::Model & m, Result & res, - const Parameter & par) -{ - services::Status status; - - const size_t p = x->getNumberOfColumns(); - const size_t n = x->getNumberOfRows(); - - constexpr size_t maxInt32Value = static_cast(daal::services::internal::MaxVal::get()); - - DAAL_OVERFLOW_CHECK_BY_ADDING(size_t, p, 1); - const size_t nBeta = p + 1; - - DAAL_ASSERT(nBeta == m.getNumberOfBetas()); - const size_t nClasses = par.nClasses; - const TypeIds::Id idType = TypeIds::id(); - - DAAL_CHECK(nClasses <= maxInt32Value, services::ErrorIncorrectNumberOfClasses); - DAAL_CHECK(p <= maxInt32Value, services::ErrorIncorrectNumberOfFeatures); - - auto & ctx = services::internal::getDefaultContext(); - - services::SharedPtr pSolver = par.optimizationSolver->clone(); - DAAL_ASSERT(pSolver == true); - pSolver->setHostApp(pHost); - if (nClasses == 2) - { - services::SharedPtr > objFunc(logistic_loss::Batch::create(n)); - - DAAL_ASSERT(objFunc == true); - objFunc->input.set(logistic_loss::data, 
x); - objFunc->input.set(logistic_loss::dependentVariables, y); - objFunc->parameter().interceptFlag = par.interceptFlag; - objFunc->parameter().penaltyL1 = par.penaltyL1; - objFunc->parameter().penaltyL2 = par.penaltyL2; - pSolver->getParameter()->function = objFunc; - } - else - { - DAAL_CHECK(nClasses > 2, services::ErrorIncorrectParameter); - - services::SharedPtr > objFunc(cross_entropy_loss::Batch::create(nClasses, n)); - - DAAL_ASSERT(objFunc == true); - objFunc->input.set(cross_entropy_loss::data, x); - objFunc->input.set(cross_entropy_loss::dependentVariables, y); - objFunc->parameter().interceptFlag = par.interceptFlag; - objFunc->parameter().penaltyL1 = par.penaltyL1; - objFunc->parameter().penaltyL2 = par.penaltyL2; - pSolver->getParameter()->function = objFunc; - } - - const size_t nBetaRows = m.getBeta()->getNumberOfRows(); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, nBeta, nBetaRows); - const size_t nBetaTotal = nBeta * nBetaRows; - - UniversalBuffer argumentU = ctx.allocate(idType, nBetaTotal, status); - services::internal::Buffer argumentBuff = argumentU.get(); - - auto argumentSNT = data_management::internal::SyclHomogenNumericTable::create(argumentBuff, 1, nBetaTotal, &status); - DAAL_CHECK_STATUS_VAR(status); - - ctx.fill(argumentU, 0.0, status); - DAAL_CHECK_STATUS_VAR(status); - - //initialization - if (nClasses != 2) - { - const algorithmFPType initialVal = algorithmFPType(1e-3); - DAAL_CHECK_STATUS(status, HelperObjectiveFunction::setColElem(0, initialVal, argumentBuff, nClasses, nBeta)); - } - - //initialize solver arguments - pSolver->getInput()->set(optimization_solver::iterative_solver::inputArgument, argumentSNT); - - DAAL_CHECK_STATUS(status, pSolver->computeNoThrow()); - - { - NumericTablePtr nIterationsNT = pSolver->getResult()->get(optimization_solver::iterative_solver::nIterations); - - BlockDescriptor nIterationsBlock; - DAAL_CHECK_STATUS(status, nIterationsNT->getBlockOfRows(0, 1, ReadWriteMode::readOnly, nIterationsBlock)); 
- const int * pnIterations = nIterationsBlock.getBlockPtr(); - - NumericTablePtr nIterationsOut = data_management::HomogenNumericTable::create(1, 1, NumericTable::doAllocate, pnIterations[0], &status); - DAAL_CHECK_STATUS_VAR(status); - - par.optimizationSolver->getResult()->set(optimization_solver::iterative_solver::nIterations, nIterationsOut); - DAAL_CHECK_STATUS(status, nIterationsNT->releaseBlockOfRows(nIterationsBlock)); - } - - data_management::NumericTablePtr minimumSNT = pSolver->getResult()->get(optimization_solver::iterative_solver::minimum); - BlockDescriptor minimumBlock; - DAAL_CHECK_STATUS(status, minimumSNT->getBlockOfRows(0, nBetaTotal, ReadWriteMode::readOnly, minimumBlock)); - - services::internal::Buffer minimumBuff = minimumBlock.getBuffer(); - - data_management::NumericTablePtr betaNT = m.getBeta(); - { - BlockDescriptor dataRows; - - DAAL_CHECK_STATUS(status, betaNT->getBlockOfRows(0, nBetaRows, ReadWriteMode::writeOnly, dataRows)); - - services::internal::Buffer betaBuff = dataRows.getBuffer(); - - DAAL_ASSERT(betaBuff.size() == nBetaTotal); - DAAL_ASSERT(minimumBuff.size() == nBetaTotal); - ctx.copy(betaBuff, 0, minimumBuff, 0, nBetaTotal, status); - DAAL_CHECK_STATUS_VAR(status); - - if (!par.interceptFlag) - { - DAAL_CHECK(nBeta <= maxInt32Value, services::ErrorIncorrectNumberOfBetas); - DAAL_CHECK(nBetaRows <= maxInt32Value, services::ErrorIncorrectNumberOfRows); - DAAL_CHECK_STATUS(status, HelperObjectiveFunction::setColElem(0, algorithmFPType(0), betaBuff, nBetaRows, nBeta)); - } - - DAAL_CHECK_STATUS(status, betaNT->releaseBlockOfRows(dataRows)); - } - - DAAL_CHECK_STATUS(status, minimumSNT->releaseBlockOfRows(minimumBlock)); - - return status; -} - -} // namespace internal -} // namespace training -} // namespace logistic_regression -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/logistic_regression/oneapi/logistic_regression_train_kernel_oneapi.h 
b/cpp/daal/src/algorithms/logistic_regression/oneapi/logistic_regression_train_kernel_oneapi.h deleted file mode 100644 index 2c3e929605b..00000000000 --- a/cpp/daal/src/algorithms/logistic_regression/oneapi/logistic_regression_train_kernel_oneapi.h +++ /dev/null @@ -1,57 +0,0 @@ -/* file: logistic_regression_train_kernel_oneapi.h */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Declaration of structure containing kernels for logistic regression -// training. 
-//-- -*/ - -#ifndef __LOGISTIC_REGRESSION_TRAIN_KERNEL_ONEAPI_H__ -#define __LOGISTIC_REGRESSION_TRAIN_KERNEL_ONEAPI_H__ - -#include "algorithms/logistic_regression/logistic_regression_training_types.h" -#include "src/algorithms/objective_function/common/oneapi/objective_function_utils_oneapi.h" - -namespace daal -{ -namespace algorithms -{ -namespace logistic_regression -{ -namespace training -{ -namespace internal -{ -template -class TrainBatchKernelOneAPI : public daal::algorithms::Kernel -{ - using HelperObjectiveFunction = optimization_solver::objective_function::internal::HelperObjectiveFunction; - -public: - services::Status compute(const services::HostAppIfacePtr & pHost, const NumericTablePtr & x, const NumericTablePtr & y, - logistic_regression::Model & m, Result & res, const Parameter & par); -}; - -} // namespace internal -} // namespace training -} // namespace logistic_regression -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/logistic_regression/oneapi/logistic_regression_train_kernel_oneapi_instance.h b/cpp/daal/src/algorithms/logistic_regression/oneapi/logistic_regression_train_kernel_oneapi_instance.h deleted file mode 100644 index cbd128c747f..00000000000 --- a/cpp/daal/src/algorithms/logistic_regression/oneapi/logistic_regression_train_kernel_oneapi_instance.h +++ /dev/null @@ -1,30 +0,0 @@ -/* file: logistic_regression_train_kernel_oneapi_instance.h */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of logistic regression Batch Kernel Adapter for GPU. -//-- -*/ - -#ifndef __LOGISTIC_REGRESSION_TRAIN_KERNEL_ONEAPI_INSTANCE_H__ -#define __LOGISTIC_REGRESSION_TRAIN_KERNEL_ONEAPI_INSTANCE_H__ - -#include "src/algorithms/logistic_regression/oneapi/logistic_regression_train_kernel_oneapi.h" -#include "src/algorithms/logistic_regression/oneapi/logistic_regression_train_dense_default_oneapi_impl.i" - -#endif diff --git a/cpp/daal/src/algorithms/low_order_moments/BUILD b/cpp/daal/src/algorithms/low_order_moments/BUILD index 9f64eecd83b..9810bc00230 100644 --- a/cpp/daal/src/algorithms/low_order_moments/BUILD +++ b/cpp/daal/src/algorithms/low_order_moments/BUILD @@ -4,9 +4,8 @@ load("@onedal//dev/bazel:daal.bzl", "daal_module") daal_module( name = "kernel", auto = True, - opencl = True, deps = [ "@onedal//cpp/daal:core", - "@onedal//cpp/daal:sycl", + "@onedal//cpp/daal:engines", ], ) diff --git a/cpp/daal/src/algorithms/low_order_moments/low_order_moments_container.h b/cpp/daal/src/algorithms/low_order_moments/low_order_moments_container.h index e73f8d40482..27450b452ad 100644 --- a/cpp/daal/src/algorithms/low_order_moments/low_order_moments_container.h +++ b/cpp/daal/src/algorithms/low_order_moments/low_order_moments_container.h @@ -29,10 +29,6 @@ #include "algorithms/moments/low_order_moments_online.h" #include "algorithms/moments/low_order_moments_distributed.h" #include "src/algorithms/low_order_moments/low_order_moments_kernel.h" -#include 
"src/algorithms/low_order_moments/oneapi/low_order_moments_kernel_batch_oneapi.h" -#include "src/algorithms/low_order_moments/oneapi/low_order_moments_kernel_online_oneapi.h" -#include "src/algorithms/low_order_moments/oneapi/low_order_moments_kernel_distributed_oneapi.h" -#include "services/internal/execution_context.h" namespace daal { @@ -43,17 +39,7 @@ namespace low_order_moments template BatchContainer::BatchContainer(daal::services::Environment::env * daalEnv) { - auto & context = daal::services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - - if (method != defaultDense || deviceInfo.isCpu) - { - __DAAL_INITIALIZE_KERNELS(internal::LowOrderMomentsBatchKernel, algorithmFPType, method); - } - else - { - __DAAL_INITIALIZE_KERNELS_SYCL(oneapi::internal::LowOrderMomentsBatchKernelOneAPI, algorithmFPType, method); - } + __DAAL_INITIALIZE_KERNELS(internal::LowOrderMomentsBatchKernel, algorithmFPType, method); } template @@ -73,35 +59,13 @@ services::Status BatchContainer::compute() Parameter * par = static_cast(_par); daal::services::Environment::env & env = *_env; - auto & context = daal::services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - - if (method != defaultDense || deviceInfo.isCpu) - { - __DAAL_CALL_KERNEL(env, internal::LowOrderMomentsBatchKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, dataTable, result, - par); - } - else - { - __DAAL_CALL_KERNEL_SYCL(env, oneapi::internal::LowOrderMomentsBatchKernelOneAPI, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, - dataTable, result, par); - } + __DAAL_CALL_KERNEL(env, internal::LowOrderMomentsBatchKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, dataTable, result, par); } template OnlineContainer::OnlineContainer(daal::services::Environment::env * daalEnv) { - auto & context = daal::services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - - if (method != 
defaultDense || deviceInfo.isCpu) - { - __DAAL_INITIALIZE_KERNELS(internal::LowOrderMomentsOnlineKernel, algorithmFPType, method); - } - else - { - __DAAL_INITIALIZE_KERNELS_SYCL(oneapi::internal::LowOrderMomentsOnlineKernelOneAPI, algorithmFPType, method); - } + __DAAL_INITIALIZE_KERNELS(internal::LowOrderMomentsOnlineKernel, algorithmFPType, method); } template @@ -122,19 +86,8 @@ services::Status OnlineContainer::compute() Parameter * par = static_cast(_par); daal::services::Environment::env & env = *_env; - auto & context = daal::services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - - if (method != defaultDense || deviceInfo.isCpu) - { - __DAAL_CALL_KERNEL(env, internal::LowOrderMomentsOnlineKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, dataTable, - partialResult, par, isOnline); - } - else - { - __DAAL_CALL_KERNEL_SYCL(env, oneapi::internal::LowOrderMomentsOnlineKernelOneAPI, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, - dataTable, partialResult, par, isOnline); - } + __DAAL_CALL_KERNEL(env, internal::LowOrderMomentsOnlineKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, dataTable, + partialResult, par, isOnline); } template @@ -146,54 +99,32 @@ services::Status OnlineContainer::finalizeCompute( Parameter * par = static_cast(_par); daal::services::Environment::env & env = *_env; - // for other methods oneapi isn't implemented yet - auto & context = daal::services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - result->set(minimum, partialResult->get(partialMinimum)); result->set(maximum, partialResult->get(partialMaximum)); result->set(sum, partialResult->get(partialSum)); result->set(sumSquares, partialResult->get(partialSumSquares)); result->set(sumSquaresCentered, partialResult->get(partialSumSquaresCentered)); - if (method != defaultDense || deviceInfo.isCpu) - { - NumericTable * nObservationsTable = 
partialResult->get(nObservations).get(); - NumericTable * sumTable = partialResult->get(partialSum).get(); - NumericTable * sumSqTable = partialResult->get(partialSumSquares).get(); - NumericTable * sumSqCenTable = partialResult->get(partialSumSquaresCentered).get(); - - NumericTable * meanTable = result->get(mean).get(); - NumericTable * raw2MomTable = result->get(secondOrderRawMoment).get(); - NumericTable * varianceTable = result->get(variance).get(); - NumericTable * stDevTable = result->get(standardDeviation).get(); - NumericTable * variationTable = result->get(variation).get(); - - __DAAL_CALL_KERNEL(env, internal::LowOrderMomentsOnlineKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), finalizeCompute, - nObservationsTable, sumTable, sumSqTable, sumSqCenTable, meanTable, raw2MomTable, varianceTable, stDevTable, - variationTable, par); - } - else - { - __DAAL_CALL_KERNEL_SYCL(env, oneapi::internal::LowOrderMomentsOnlineKernelOneAPI, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), - finalizeCompute, partialResult, result, par); - } + NumericTable * nObservationsTable = partialResult->get(nObservations).get(); + NumericTable * sumTable = partialResult->get(partialSum).get(); + NumericTable * sumSqTable = partialResult->get(partialSumSquares).get(); + NumericTable * sumSqCenTable = partialResult->get(partialSumSquaresCentered).get(); + + NumericTable * meanTable = result->get(mean).get(); + NumericTable * raw2MomTable = result->get(secondOrderRawMoment).get(); + NumericTable * varianceTable = result->get(variance).get(); + NumericTable * stDevTable = result->get(standardDeviation).get(); + NumericTable * variationTable = result->get(variation).get(); + + __DAAL_CALL_KERNEL(env, internal::LowOrderMomentsOnlineKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), finalizeCompute, + nObservationsTable, sumTable, sumSqTable, sumSqCenTable, meanTable, raw2MomTable, varianceTable, stDevTable, variationTable, + par); } template 
DistributedContainer::DistributedContainer(daal::services::Environment::env * daalEnv) { - auto & context = daal::services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - - if (method != defaultDense || deviceInfo.isCpu) - { - __DAAL_INITIALIZE_KERNELS(internal::LowOrderMomentsDistributedKernel, algorithmFPType, method); - } - else - { - __DAAL_INITIALIZE_KERNELS_SYCL(oneapi::internal::LowOrderMomentsDistributedKernelOneAPI, algorithmFPType, method); - } + __DAAL_INITIALIZE_KERNELS(internal::LowOrderMomentsDistributedKernel, algorithmFPType, method); } template @@ -212,22 +143,10 @@ services::Status DistributedContainer Parameter * par = static_cast(_par); daal::services::Environment::env & env = *_env; - // for other methods oneapi isn't implemented yet - auto & context = daal::services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - services::Status s; - if (method != defaultDense || deviceInfo.isCpu) - { - s = __DAAL_CALL_KERNEL_STATUS(env, internal::LowOrderMomentsDistributedKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, - collection, partialResult, par); - } - else - { - s = __DAAL_CALL_KERNEL_STATUS_SYCL(env, oneapi::internal::LowOrderMomentsDistributedKernelOneAPI, - __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, collection, partialResult, par); - } + s = __DAAL_CALL_KERNEL_STATUS(env, internal::LowOrderMomentsDistributedKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, + collection, partialResult, par); collection->clear(); return s; @@ -242,10 +161,6 @@ services::Status DistributedContainer Parameter * par = static_cast(_par); daal::services::Environment::env & env = *_env; - // for other methods oneapi isn't implemented yet - auto & context = daal::services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - result->set(minimum, partialResult->get(partialMinimum)); result->set(maximum, 
partialResult->get(partialMaximum)); result->set(sum, partialResult->get(partialSum)); @@ -254,28 +169,20 @@ services::Status DistributedContainer services::Status s; - if (method != defaultDense || deviceInfo.isCpu) - { - NumericTable * nObservationsTable = partialResult->get(nObservations).get(); - NumericTable * sumTable = partialResult->get(partialSum).get(); - NumericTable * sumSqTable = partialResult->get(partialSumSquares).get(); - NumericTable * sumSqCenTable = partialResult->get(partialSumSquaresCentered).get(); - - NumericTable * meanTable = result->get(mean).get(); - NumericTable * raw2MomTable = result->get(secondOrderRawMoment).get(); - NumericTable * varianceTable = result->get(variance).get(); - NumericTable * stDevTable = result->get(standardDeviation).get(); - NumericTable * variationTable = result->get(variation).get(); - - s = __DAAL_CALL_KERNEL_STATUS(env, internal::LowOrderMomentsDistributedKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), - finalizeCompute, nObservationsTable, sumTable, sumSqTable, sumSqCenTable, meanTable, raw2MomTable, - varianceTable, stDevTable, variationTable, par); - } - else - { - s = __DAAL_CALL_KERNEL_STATUS_SYCL(env, oneapi::internal::LowOrderMomentsDistributedKernelOneAPI, - __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), finalizeCompute, partialResult, result, par); - } + NumericTable * nObservationsTable = partialResult->get(nObservations).get(); + NumericTable * sumTable = partialResult->get(partialSum).get(); + NumericTable * sumSqTable = partialResult->get(partialSumSquares).get(); + NumericTable * sumSqCenTable = partialResult->get(partialSumSquaresCentered).get(); + + NumericTable * meanTable = result->get(mean).get(); + NumericTable * raw2MomTable = result->get(secondOrderRawMoment).get(); + NumericTable * varianceTable = result->get(variance).get(); + NumericTable * stDevTable = result->get(standardDeviation).get(); + NumericTable * variationTable = result->get(variation).get(); + + s = 
__DAAL_CALL_KERNEL_STATUS(env, internal::LowOrderMomentsDistributedKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), finalizeCompute, + nObservationsTable, sumTable, sumSqTable, sumSqCenTable, meanTable, raw2MomTable, varianceTable, stDevTable, + variationTable, par); return s; } diff --git a/cpp/daal/src/algorithms/low_order_moments/low_order_moments_dense_default_batch_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/low_order_moments/low_order_moments_dense_default_batch_fpt_dispatcher.cpp index 1b25fd6edbb..2843069ad1a 100644 --- a/cpp/daal/src/algorithms/low_order_moments/low_order_moments_dense_default_batch_fpt_dispatcher.cpp +++ b/cpp/daal/src/algorithms/low_order_moments/low_order_moments_dense_default_batch_fpt_dispatcher.cpp @@ -27,6 +27,6 @@ namespace daal { namespace algorithms { -__DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL(low_order_moments::BatchContainer, batch, DAAL_FPTYPE, low_order_moments::defaultDense) +__DAAL_INSTANTIATE_DISPATCH_CONTAINER(low_order_moments::BatchContainer, batch, DAAL_FPTYPE, low_order_moments::defaultDense) } } // namespace daal diff --git a/cpp/daal/src/algorithms/low_order_moments/low_order_moments_dense_default_batch_oneapi_fpt.cpp b/cpp/daal/src/algorithms/low_order_moments/low_order_moments_dense_default_batch_oneapi_fpt.cpp deleted file mode 100644 index bc956377363..00000000000 --- a/cpp/daal/src/algorithms/low_order_moments/low_order_moments_dense_default_batch_oneapi_fpt.cpp +++ /dev/null @@ -1,49 +0,0 @@ -/* file: low_order_moments_dense_default_batch_oneapi_fpt.cpp */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of low order moments kernel. -//-- -*/ - -#include "src/externals/service_profiler.h" - -#include "src/algorithms/low_order_moments/low_order_moments_container.h" -#include "src/algorithms/low_order_moments/oneapi/low_order_moments_batch_oneapi_impl.i" - -namespace daal -{ -namespace algorithms -{ -namespace low_order_moments -{ -namespace oneapi -{ -namespace internal -{ -template class DAAL_EXPORT LowOrderMomentsBatchKernelOneAPI; -template class LowOrderMomentsBatchKernelOneAPI; -template class LowOrderMomentsBatchKernelOneAPI; -template class LowOrderMomentsBatchKernelOneAPI; -template class LowOrderMomentsBatchKernelOneAPI; -template class LowOrderMomentsBatchKernelOneAPI; -} // namespace internal -} // namespace oneapi -} // namespace low_order_moments -} // namespace algorithms -} // namespace daal diff --git a/cpp/daal/src/algorithms/low_order_moments/low_order_moments_dense_default_distr_step2_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/low_order_moments/low_order_moments_dense_default_distr_step2_fpt_dispatcher.cpp index 1d76f8421dd..8a6b22fa03a 100644 --- a/cpp/daal/src/algorithms/low_order_moments/low_order_moments_dense_default_distr_step2_fpt_dispatcher.cpp +++ b/cpp/daal/src/algorithms/low_order_moments/low_order_moments_dense_default_distr_step2_fpt_dispatcher.cpp @@ -27,7 +27,6 @@ namespace daal { namespace algorithms { -__DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL(low_order_moments::DistributedContainer, distributed, step2Master, 
DAAL_FPTYPE, - low_order_moments::defaultDense) +__DAAL_INSTANTIATE_DISPATCH_CONTAINER(low_order_moments::DistributedContainer, distributed, step2Master, DAAL_FPTYPE, low_order_moments::defaultDense) } } // namespace daal diff --git a/cpp/daal/src/algorithms/low_order_moments/low_order_moments_dense_default_distr_step2_oneapi_fpt.cpp b/cpp/daal/src/algorithms/low_order_moments/low_order_moments_dense_default_distr_step2_oneapi_fpt.cpp deleted file mode 100644 index cb41958fc8e..00000000000 --- a/cpp/daal/src/algorithms/low_order_moments/low_order_moments_dense_default_distr_step2_oneapi_fpt.cpp +++ /dev/null @@ -1,49 +0,0 @@ -/* file: low_order_moments_dense_default_distr_step2_oneapi_fpt.cpp*/ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of low order moments kernel. 
-//-- -*/ - -#include "src/externals/service_profiler.h" - -#include "src/algorithms/low_order_moments/low_order_moments_container.h" -#include "src/algorithms/low_order_moments/oneapi/low_order_moments_distributed_oneapi_impl.i" - -namespace daal -{ -namespace algorithms -{ -namespace low_order_moments -{ -namespace oneapi -{ -namespace internal -{ -template class LowOrderMomentsDistributedKernelOneAPI; -template class LowOrderMomentsDistributedKernelOneAPI; -template class LowOrderMomentsDistributedKernelOneAPI; -template class LowOrderMomentsDistributedKernelOneAPI; -template class LowOrderMomentsDistributedKernelOneAPI; -template class LowOrderMomentsDistributedKernelOneAPI; -} // namespace internal -} // namespace oneapi -} // namespace low_order_moments -} // namespace algorithms -} // namespace daal diff --git a/cpp/daal/src/algorithms/low_order_moments/low_order_moments_dense_default_online_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/low_order_moments/low_order_moments_dense_default_online_fpt_dispatcher.cpp index db83cfa27d5..adbc37d600f 100644 --- a/cpp/daal/src/algorithms/low_order_moments/low_order_moments_dense_default_online_fpt_dispatcher.cpp +++ b/cpp/daal/src/algorithms/low_order_moments/low_order_moments_dense_default_online_fpt_dispatcher.cpp @@ -27,6 +27,6 @@ namespace daal { namespace algorithms { -__DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL(low_order_moments::OnlineContainer, online, DAAL_FPTYPE, low_order_moments::defaultDense) +__DAAL_INSTANTIATE_DISPATCH_CONTAINER(low_order_moments::OnlineContainer, online, DAAL_FPTYPE, low_order_moments::defaultDense) } } // namespace daal diff --git a/cpp/daal/src/algorithms/low_order_moments/low_order_moments_dense_default_online_oneapi_fpt.cpp b/cpp/daal/src/algorithms/low_order_moments/low_order_moments_dense_default_online_oneapi_fpt.cpp deleted file mode 100644 index 8087ef17d49..00000000000 --- a/cpp/daal/src/algorithms/low_order_moments/low_order_moments_dense_default_online_oneapi_fpt.cpp +++ 
/dev/null @@ -1,49 +0,0 @@ -/* file: low_order_moments_dense_default_online_oneapi_fpt.cpp */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of low order moments kernel. -//-- -*/ - -#include "src/externals/service_profiler.h" - -#include "src/algorithms/low_order_moments/low_order_moments_container.h" -#include "src/algorithms/low_order_moments/oneapi/low_order_moments_online_oneapi_impl.i" - -namespace daal -{ -namespace algorithms -{ -namespace low_order_moments -{ -namespace oneapi -{ -namespace internal -{ -template class DAAL_EXPORT LowOrderMomentsOnlineKernelOneAPI; -template class LowOrderMomentsOnlineKernelOneAPI; -template class LowOrderMomentsOnlineKernelOneAPI; -template class LowOrderMomentsOnlineKernelOneAPI; -template class LowOrderMomentsOnlineKernelOneAPI; -template class LowOrderMomentsOnlineKernelOneAPI; -} // namespace internal -} // namespace oneapi -} // namespace low_order_moments -} // namespace algorithms -} // namespace daal diff --git a/cpp/daal/src/algorithms/low_order_moments/moments_batch.h b/cpp/daal/src/algorithms/low_order_moments/moments_batch.h index 62b2e52a9c4..ceb6e0a2804 100644 --- a/cpp/daal/src/algorithms/low_order_moments/moments_batch.h +++ b/cpp/daal/src/algorithms/low_order_moments/moments_batch.h 
@@ -24,8 +24,7 @@ #define __MOMENTS_BATCH__ #include "algorithms/moments/low_order_moments_types.h" -#include "data_management/data/internal/numeric_table_sycl_homogen.h" -#include "services/internal/execution_context.h" +#include "data_management/data/homogen_numeric_table.h" using namespace daal::data_management; @@ -48,24 +47,10 @@ DAAL_EXPORT services::Status Result::allocate(const daal::algorithms::Input * in size_t nFeatures = 0; DAAL_CHECK_STATUS(s, static_cast(input)->getNumberOfColumns(nFeatures)); - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - - if (method != defaultDense || deviceInfo.isCpu) - { - for (size_t i = 0; i < lastResultId + 1; i++) - { - Argument::set(i, HomogenNumericTable::create(nFeatures, 1, NumericTable::doAllocate, &s)); - DAAL_CHECK_STATUS_VAR(s); - } - } - else + for (size_t i = 0; i < lastResultId + 1; i++) { - for (size_t i = 0; i < lastResultId + 1; i++) - { - Argument::set(i, data_management::internal::SyclHomogenNumericTable::create(nFeatures, 1, NumericTable::doAllocate, &s)); - DAAL_CHECK_STATUS_VAR(s); - } + Argument::set(i, HomogenNumericTable::create(nFeatures, 1, NumericTable::doAllocate, &s)); + DAAL_CHECK_STATUS_VAR(s); } return s; @@ -85,24 +70,10 @@ DAAL_EXPORT services::Status Result::allocate(const daal::algorithms::PartialRes services::Status s; DAAL_CHECK_STATUS(s, static_cast(partialResult)->getNumberOfColumns(nFeatures)); - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - - if (method != defaultDense || deviceInfo.isCpu) - { - for (size_t i = 0; i < lastResultId + 1; i++) - { - Argument::set(i, HomogenNumericTable::create(nFeatures, 1, NumericTable::doAllocate, &s)); - DAAL_CHECK_STATUS_VAR(s); - } - } - else + for (size_t i = 0; i < lastResultId + 1; i++) { - for (size_t i = 0; i < lastResultId + 1; i++) - { - Argument::set(i, data_management::internal::SyclHomogenNumericTable::create(nFeatures, 
1, NumericTable::doAllocate, &s)); - DAAL_CHECK_STATUS_VAR(s); - } + Argument::set(i, HomogenNumericTable::create(nFeatures, 1, NumericTable::doAllocate, &s)); + DAAL_CHECK_STATUS_VAR(s); } return s; } diff --git a/cpp/daal/src/algorithms/low_order_moments/moments_online.h b/cpp/daal/src/algorithms/low_order_moments/moments_online.h index fcbf76c8725..9b228b12ae3 100644 --- a/cpp/daal/src/algorithms/low_order_moments/moments_online.h +++ b/cpp/daal/src/algorithms/low_order_moments/moments_online.h @@ -25,12 +25,10 @@ #include "algorithms/moments/low_order_moments_types.h" #include "src/data_management/service_numeric_table.h" -#include "data_management/data/internal/numeric_table_sycl_homogen.h" -#include "services/internal/execution_context.h" +#include "data_management/data/homogen_numeric_table.h" using namespace daal::internal; using namespace daal::data_management; -using daal::data_management::internal::SyclHomogenNumericTable; namespace daal { @@ -52,24 +50,10 @@ DAAL_EXPORT services::Status PartialResult::allocate(const daal::algorithms::Inp size_t nFeatures = 0; DAAL_CHECK_STATUS(s, static_cast(input)->getNumberOfColumns(nFeatures)); - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - - if (method != defaultDense || deviceInfo.isCpu) - { - set(nObservations, HomogenNumericTable::create(1, 1, NumericTable::doAllocate, &s)); - for (size_t i = 1; i < lastPartialResultId + 1; i++) - { - Argument::set(i, HomogenNumericTable::create(nFeatures, 1, NumericTable::doAllocate, &s)); - } - } - else + set(nObservations, HomogenNumericTable::create(1, 1, NumericTable::doAllocate, &s)); + for (size_t i = 1; i < lastPartialResultId + 1; i++) { - set(nObservations, SyclHomogenNumericTable::create(1, 1, NumericTable::doAllocate, &s)); - for (size_t i = 1; i < lastPartialResultId + 1; i++) - { - Argument::set(i, SyclHomogenNumericTable::create(nFeatures, 1, NumericTable::doAllocate, &s)); - } + Argument::set(i, 
HomogenNumericTable::create(nFeatures, 1, NumericTable::doAllocate, &s)); } return s; } @@ -82,36 +66,30 @@ DAAL_EXPORT services::Status PartialResult::initialize(const daal::algorithms::I services::Status s; - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - DAAL_CHECK_STATUS(s, get(nObservations)->assign((algorithmFPType)0.0)) - if (method != defaultDense || deviceInfo.isCpu) - { - DAAL_CHECK_STATUS(s, get(partialSum)->assign((algorithmFPType)0.0)) - DAAL_CHECK_STATUS(s, get(partialSumSquares)->assign((algorithmFPType)0.0)) - DAAL_CHECK_STATUS(s, get(partialSumSquaresCentered)->assign((algorithmFPType)0.0)) + DAAL_CHECK_STATUS(s, get(partialSum)->assign((algorithmFPType)0.0)) + DAAL_CHECK_STATUS(s, get(partialSumSquares)->assign((algorithmFPType)0.0)) + DAAL_CHECK_STATUS(s, get(partialSumSquaresCentered)->assign((algorithmFPType)0.0)) - ReadRows dataBlock(input->get(data).get(), 0, 1); - DAAL_CHECK_BLOCK_STATUS(dataBlock) - const algorithmFPType * firstRow = dataBlock.get(); + ReadRows dataBlock(input->get(data).get(), 0, 1); + DAAL_CHECK_BLOCK_STATUS(dataBlock) + const algorithmFPType * firstRow = dataBlock.get(); - WriteOnlyRows partialMinimumBlock(get(partialMinimum).get(), 0, 1); - DAAL_CHECK_BLOCK_STATUS(partialMinimumBlock) - algorithmFPType * partialMinimumArray = partialMinimumBlock.get(); + WriteOnlyRows partialMinimumBlock(get(partialMinimum).get(), 0, 1); + DAAL_CHECK_BLOCK_STATUS(partialMinimumBlock) + algorithmFPType * partialMinimumArray = partialMinimumBlock.get(); - WriteOnlyRows partialMaximumBlock(get(partialMaximum).get(), 0, 1); - DAAL_CHECK_BLOCK_STATUS(partialMaximumBlock) - algorithmFPType * partialMaximumArray = partialMaximumBlock.get(); + WriteOnlyRows partialMaximumBlock(get(partialMaximum).get(), 0, 1); + DAAL_CHECK_BLOCK_STATUS(partialMaximumBlock) + algorithmFPType * partialMaximumArray = partialMaximumBlock.get(); - size_t nColumns = input->get(data)->getNumberOfColumns(); + 
size_t nColumns = input->get(data)->getNumberOfColumns(); - for (size_t j = 0; j < nColumns; j++) - { - partialMinimumArray[j] = firstRow[j]; - partialMaximumArray[j] = firstRow[j]; - } + for (size_t j = 0; j < nColumns; j++) + { + partialMinimumArray[j] = firstRow[j]; + partialMaximumArray[j] = firstRow[j]; } return s; diff --git a/cpp/daal/src/algorithms/low_order_moments/oneapi/cl_kernels/low_order_moments_kernels_all.cl b/cpp/daal/src/algorithms/low_order_moments/oneapi/cl_kernels/low_order_moments_kernels_all.cl deleted file mode 100644 index 9f349275ee3..00000000000 --- a/cpp/daal/src/algorithms/low_order_moments/oneapi/cl_kernels/low_order_moments_kernels_all.cl +++ /dev/null @@ -1,958 +0,0 @@ -/* file: low_order_moments_kernels_all.cl */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of low order moments kernels. 
-//-- -*/ - -#define CONCAT(n, suff) n##suff -#define FULLNAME(n, p) CONCAT(n, p) - -#define singlePassBlockProcessor FULLNAME(singlePassBlockProcessor, FNAMESUFF) -#define singlePass FULLNAME(singlePass, FNAMESUFF) -#define blockProcessor FULLNAME(blockProcessor, FNAMESUFF) -#define processBlocks FULLNAME(processBlocks, FNAMESUFF) -#define mergeBlocks FULLNAME(mergeBlocks, FNAMESUFF) -#define finalize FULLNAME(finalize, FNAMESUFF) - -/* single pass kernels common */ -void singlePassBlockProcessor(__global const algorithmFPType * vectors, const uint nVectors, const uint vectorSize -#if (defined _ONLINE_) - , - const algorithmFPType nObservations -#endif -#if (defined _RMIN_) - , - __global algorithmFPType * gMin -#endif -#if (defined _RMAX_) - , - __global algorithmFPType * gMax -#endif -#if (defined _RSUM_) \ - || (defined _ONLINE_) && ((defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)) - , - __global algorithmFPType * gSum -#endif -#if (defined _RSUM2_) || (defined _ONLINE_) && (defined _RSORM_) - , - __global algorithmFPType * gSum2 -#endif -#if (defined _RSUM2C_) || (defined _ONLINE_) && ((defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)) - , - __global algorithmFPType * gSum2Cent -#endif -#if !(defined _ONLINE_) - #if (defined _RMEAN_) - , - __global algorithmFPType * gMean - #endif - #if (defined _RSORM_) - , - __global algorithmFPType * gSecondOrderRawMoment - #endif - #if (defined _RVARC_) - , - __global algorithmFPType * gVariance - #endif - #if (defined _RSTDEV_) - , - __global algorithmFPType * gStDev - #endif - #if (defined _RVART_) - , - __global algorithmFPType * gVariation - #endif -#endif - , - const uint rowPartIndex, const uint rowParts, const uint colPartIndex, const uint colParts, const uint tid, - const uint tnum) -{ - const uint colOffset = colPartIndex * tnum; - const uint x = tid + colOffset; - - if (x < nVectors) - { - uint rowPartSize = (vectorSize + rowParts - 1) / rowParts; 
- const uint rowOffset = rowPartSize * rowPartIndex; - - if (rowPartSize + rowOffset > vectorSize) - { - rowPartSize = vectorSize - rowOffset; - } - -#if (defined _ONLINE_) - // for online mode initial values of min/max are defined later depending on nObservations - #if (defined _RMIN_) - algorithmFPType min = (algorithmFPType)0; - #endif - #if (defined _RMAX_) - algorithmFPType max = (algorithmFPType)0; - #endif -#else - #if (defined _RMIN_) - algorithmFPType min = vectors[rowOffset * nVectors + x]; - #endif - #if (defined _RMAX_) - algorithmFPType max = vectors[rowOffset * nVectors + x]; - #endif -#endif - -#if (defined _RSUM_) || (defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - algorithmFPType sum = (algorithmFPType)0; -#endif -#if (defined _RSUM2_) || (defined _RSORM_) - algorithmFPType sum2 = (algorithmFPType)0; -#endif -#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - algorithmFPType sum2Cent = (algorithmFPType)0; - algorithmFPType mean = (algorithmFPType)0; -#endif - -#if (defined _ONLINE_) - if ((algorithmFPType)0 == nObservations) - { - #if (defined _RMIN_) - min = vectors[rowOffset * nVectors + x]; - #endif - #if (defined _RMAX_) - max = vectors[rowOffset * nVectors + x]; - #endif - #if (defined _RSUM_) || (defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - sum = (algorithmFPType)0; - #endif - #if (defined _RSUM2_) || (defined _RSORM_) - sum2 = (algorithmFPType)0; - #endif - #if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - sum2Cent = (algorithmFPType)0; - mean = (algorithmFPType)0; - #endif - } - else - { - #if (defined _RMIN_) - min = gMin[x * rowParts + rowPartIndex]; - #endif - #if (defined _RMAX_) - max = gMax[x * rowParts + rowPartIndex]; - #endif - #if (defined _RSUM_) || (defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || 
(defined _RVART_) - sum = gSum[x * rowParts + rowPartIndex]; - #endif - #if (defined _RSUM2_) || (defined _RSORM_) - sum2 = gSum2[x * rowParts + rowPartIndex]; - #endif - #if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - sum2Cent = gSum2Cent[x * rowParts + rowPartIndex]; - mean = sum / nObservations; - #endif - } -#endif - - for (int row = 0; row < rowPartSize; row++) - { - const uint y = (row + rowOffset) * nVectors; - const algorithmFPType el = vectors[y + x]; -#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - #if (defined _ONLINE_) - algorithmFPType invN = ((algorithmFPType)1) / (nObservations + (algorithmFPType)(row + 1)); - #else - algorithmFPType invN = ((algorithmFPType)1) / (algorithmFPType)(row + 1); - #endif - algorithmFPType delta = el - mean; -#endif - -#if (defined _RMIN_) - min = fmin(el, min); -#endif -#if (defined _RMAX_) - max = fmax(el, max); -#endif -#if (defined _RSUM_) || (defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - sum += el; -#endif -#if (defined _RSUM2_) || (defined _RSORM_) - sum2 += el * el; -#endif -#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - mean += delta * invN; - sum2Cent += delta * (el - mean); -#endif - } - -#if (defined _RMIN_) - gMin[x * rowParts + rowPartIndex] = min; -#endif -#if (defined _RMAX_) - gMax[x * rowParts + rowPartIndex] = max; -#endif -#if (defined _RSUM_) \ - || (defined _ONLINE_) && ((defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)) - gSum[x * rowParts + rowPartIndex] = sum; -#endif -#if (defined _RSUM2_) || (defined _ONLINE_) && (defined _RSORM_) - gSum2[x * rowParts + rowPartIndex] = sum2; -#endif -#if (defined _RSUM2C_) || (defined _ONLINE_) && ((defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)) - gSum2Cent[x * rowParts + rowPartIndex] = sum2Cent; -#endif - -#if 
!(defined _ONLINE_) - // common vars calculation - #if (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - algorithmFPType variance = sum2Cent / (rowPartSize - (algorithmFPType)1); - #endif - #if (defined _RSTDEV_) || (defined _RVART_) - algorithmFPType stDev = (algorithmFPType)sqrt(variance); - #endif - - // output assignment - #if (defined _RMEAN_) - gMean[x * rowParts + rowPartIndex] = mean; - #endif - #if (defined _RSORM_) - gSecondOrderRawMoment[x * rowParts + rowPartIndex] = sum2 / rowPartSize; - #endif - #if (defined _RVARC_) - gVariance[x * rowParts + rowPartIndex] = variance; - #endif - #if (defined _RSTDEV_) - gStDev[x * rowParts + rowPartIndex] = stDev; - #endif - #if (defined _RVART_) - gVariation[x * rowParts + rowPartIndex] = stDev / mean; - #endif -#endif - } -} - -__kernel void singlePass(__global const algorithmFPType * vectors, const uint nVectors, const uint vectorSize -#if (defined _ONLINE_) - , - const algorithmFPType nObservations -#endif -#if (defined _RMIN_) - , - __global algorithmFPType * gMin -#endif -#if (defined _RMAX_) - , - __global algorithmFPType * gMax -#endif -#if (defined _RSUM_) \ - || (defined _ONLINE_) && ((defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)) - , - __global algorithmFPType * gSum -#endif -#if (defined _RSUM2_) || (defined _ONLINE_) && (defined _RSORM_) - , - __global algorithmFPType * gSum2 -#endif -#if (defined _RSUM2C_) || (defined _ONLINE_) && ((defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)) - , - __global algorithmFPType * gSum2Cent -#endif -#if !(defined _ONLINE_) - #if (defined _RMEAN_) - , - __global algorithmFPType * gMean - #endif - #if (defined _RSORM_) - , - __global algorithmFPType * gSecondOrderRawMoment - #endif - #if (defined _RVARC_) - , - __global algorithmFPType * gVariance - #endif - #if (defined _RSTDEV_) - , - __global algorithmFPType * gStDev - #endif - #if (defined _RVART_) - , - __global algorithmFPType * 
gVariation - #endif -#endif -) -{ - const uint tid = get_local_id(0); - const uint tnum = get_local_size(0); - const uint gid = get_group_id(0); - const uint gnum = get_num_groups(0); - - const uint colParts = (nVectors + tnum - 1) / tnum; - const uint rowParts = gnum / colParts; - - const uint rowPartIndex = gid / colParts; - const uint colPartIndex = gid - rowPartIndex * colParts; - - singlePassBlockProcessor(vectors, nVectors, vectorSize -#if (defined _ONLINE_) - , - nObservations -#endif -#if (defined _RMIN_) - , - gMin -#endif -#if (defined _RMAX_) - , - gMax -#endif -#if (defined _RSUM_) \ - || (defined _ONLINE_) && ((defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)) - , - gSum -#endif -#if (defined _RSUM2_) || (defined _ONLINE_) && (defined _RSORM_) - , - gSum2 -#endif -#if (defined _RSUM2C_) || (defined _ONLINE_) && ((defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)) - , - gSum2Cent -#endif -#if !(defined _ONLINE_) - #if (defined _RMEAN_) - , - gMean - #endif - #if (defined _RSORM_) - , - gSecondOrderRawMoment - #endif - #if (defined _RVARC_) - , - gVariance - #endif - #if (defined _RSTDEV_) - , - gStDev - #endif - #if (defined _RVART_) - , - gVariation - #endif -#endif - , - rowPartIndex, rowParts, colPartIndex, colParts, tid, tnum); -} - -/* common kernels for blocks processing */ - -void blockProcessor(__global const algorithmFPType * vectors, const uint nVectors, const uint vectorSize -#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - , - __global uint * bNVec -#endif -#if (defined _RMIN_) - , - __global algorithmFPType * bMin -#endif -#if (defined _RMAX_) - , - __global algorithmFPType * bMax -#endif -#if (defined _RSUM_) || (defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - , - __global algorithmFPType * bSum -#endif -#if (defined _RSUM2_) || (defined _RSORM_) - , - __global algorithmFPType * 
bSum2 -#endif -#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - , - __global algorithmFPType * bSum2Cent -#endif - , - const uint rowPartIndex, const uint rowParts, const uint colPartIndex, const uint colParts, const uint tid, const uint tnum) -{ - const uint colOffset = colPartIndex * tnum; - const uint x = tid + colOffset; - - if (x < nVectors) - { - uint rowPartSize = (vectorSize + rowParts - 1) / rowParts; - const uint rowOffset = rowPartSize * rowPartIndex; - - if (rowPartSize + rowOffset > vectorSize) - { - rowPartSize = vectorSize - rowOffset; - } - -#if (defined _RMIN_) - algorithmFPType min = vectors[rowOffset * nVectors + x]; -#endif -#if (defined _RMAX_) - algorithmFPType max = vectors[rowOffset * nVectors + x]; -#endif -#if (defined _RSUM_) || (defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - algorithmFPType sum = (algorithmFPType)0; -#endif -#if (defined _RSUM2_) || (defined _RSORM_) - algorithmFPType sum2 = (algorithmFPType)0; -#endif -#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - algorithmFPType sum2Cent = (algorithmFPType)0; - algorithmFPType mean = (algorithmFPType)0; -#endif - - for (int row = 0; row < rowPartSize; row++) - { - const uint y = (row + rowOffset) * nVectors; - const algorithmFPType el = vectors[y + x]; -#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - algorithmFPType invN = ((algorithmFPType)1) / (algorithmFPType)(row + 1); - algorithmFPType delta = el - mean; -#endif - -#if (defined _RMIN_) - min = fmin(el, min); -#endif -#if (defined _RMAX_) - max = fmax(el, max); -#endif -#if (defined _RSUM_) || (defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - sum += el; -#endif -#if (defined _RSUM2_) || (defined _RSORM_) - sum2 += el * el; -#endif -#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) 
|| (defined _RVART_) - mean += delta * invN; - sum2Cent += delta * (el - mean); -#endif - } - -#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - bNVec[x * rowParts + rowPartIndex] = (uint)rowPartSize; -#endif -#if (defined _RMIN_) - bMin[x * rowParts + rowPartIndex] = min; -#endif -#if (defined _RMAX_) - bMax[x * rowParts + rowPartIndex] = max; -#endif -#if (defined _RSUM_) || (defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - bSum[x * rowParts + rowPartIndex] = sum; -#endif -#if (defined _RSUM2_) || (defined _RSORM_) - bSum2[x * rowParts + rowPartIndex] = sum2; -#endif -#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - bSum2Cent[x * rowParts + rowPartIndex] = sum2Cent; -#endif - } -} - -__kernel void processBlocks(__global const algorithmFPType * vectors, const uint nVectors, const uint vectorSize -#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - , - __global uint * bNVec -#endif -#if (defined _RMIN_) - , - __global algorithmFPType * bMin -#endif -#if (defined _RMAX_) - , - __global algorithmFPType * bMax -#endif -#if (defined _RSUM_) || (defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - , - __global algorithmFPType * bSum -#endif -#if (defined _RSUM2_) || (defined _RSORM_) - , - __global algorithmFPType * bSum2 -#endif -#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - , - __global algorithmFPType * bSum2Cent -#endif -) -{ - const uint tid = get_local_id(0); - const uint tnum = get_local_size(0); - const uint gid = get_group_id(0); - const uint gnum = get_num_groups(0); - - const uint colParts = (nVectors + tnum - 1) / tnum; - const uint rowParts = gnum / colParts; - - const uint rowPartIndex = gid / colParts; - const uint colPartIndex = gid - rowPartIndex * colParts; - - blockProcessor(vectors, 
nVectors, vectorSize -#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - , - bNVec -#endif -#if (defined _RMIN_) - , - bMin -#endif -#if (defined _RMAX_) - , - bMax -#endif -#if (defined _RSUM_) || (defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - , - bSum -#endif -#if (defined _RSUM2_) || (defined _RSORM_) - , - bSum2 -#endif -#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - , - bSum2Cent -#endif - , - rowPartIndex, rowParts, colPartIndex, colParts, tid, tnum); -} - -/* merge blocks kernel */ -__kernel void mergeBlocks(const uint vectorSize -#if (defined _ONLINE_) - , - const algorithmFPType nObservations -#endif -#if (defined _RMIN_) - , - __global algorithmFPType * gMin -#endif -#if (defined _RMAX_) - , - __global algorithmFPType * gMax -#endif -#if (defined _RSUM_) \ - || (defined _ONLINE_) && ((defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)) - , - __global algorithmFPType * gSum -#endif -#if (defined _RSUM2_) || (defined _ONLINE_) && (defined _RSORM_) - , - __global algorithmFPType * gSum2 -#endif -#if (defined _RSUM2C_) || (defined _ONLINE_) && ((defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)) - , - __global algorithmFPType * gSum2Cent -#endif -#if !(defined _ONLINE_) - #if (defined _RMEAN_) - , - __global algorithmFPType * gMean - #endif - #if (defined _RSORM_) - , - __global algorithmFPType * gSecondOrderRawMoment - #endif - #if (defined _RVARC_) - , - __global algorithmFPType * gVariance - #endif - #if (defined _RSTDEV_) - , - __global algorithmFPType * gStDev - #endif - #if (defined _RVART_) - , - __global algorithmFPType * gVariation - #endif -#endif -#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - , - __global uint * bNVec -#endif -#if (defined _RMIN_) - , - __global algorithmFPType * bMin -#endif -#if (defined 
_RMAX_) - , - __global algorithmFPType * bMax -#endif -#if (defined _RSUM_) || (defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - , - __global algorithmFPType * bSum -#endif -#if (defined _RSUM2_) || (defined _RSORM_) - , - __global algorithmFPType * bSum2 -#endif -#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - , - __global algorithmFPType * bSum2Cent -#endif -) -{ -#if (defined _RMIN_) - __local algorithmFPType lMin[LOCAL_BUFFER_SIZE]; -#endif -#if (defined _RMAX_) - __local algorithmFPType lMax[LOCAL_BUFFER_SIZE]; -#endif -#if (defined _RSUM_) || (defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - __local algorithmFPType lSum[LOCAL_BUFFER_SIZE]; -#endif -#if (defined _RSUM2_) || (defined _RSORM_) - __local algorithmFPType lSum2[LOCAL_BUFFER_SIZE]; -#endif -#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - __local uint lNVec[LOCAL_BUFFER_SIZE]; - __local algorithmFPType lSum2Cent[LOCAL_BUFFER_SIZE]; - __local algorithmFPType lMean[LOCAL_BUFFER_SIZE]; -#endif - - const uint localSize = get_local_size(0); - const uint globalDim = vectorSize; - const uint localDim = 1; - const uint itemId = get_local_id(0); - const uint groupId = get_group_id(0); - -#if (defined _RMIN_) - algorithmFPType mrgMin = bMin[groupId * globalDim + itemId * localDim]; -#endif -#if (defined _RMAX_) - algorithmFPType mrgMax = bMax[groupId * globalDim + itemId * localDim]; -#endif -#if (defined _RSUM_) || (defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - algorithmFPType mrgSum = (algorithmFPType)0; -#endif -#if (defined _RSUM2_) || (defined _RSORM_) - algorithmFPType mrgSum2 = (algorithmFPType)0; -#endif -#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - algorithmFPType mrgVectors = (algorithmFPType)0; - 
algorithmFPType mrgSum2Cent = (algorithmFPType)0; - algorithmFPType mrgMean = (algorithmFPType)0; -#endif - -#if (defined _ONLINE_) - if (0 == itemId && (algorithmFPType)0 != nObservations) - { - // item 0 in each group performs merge of previous results - #if (defined _RMIN_) - mrgMin = gMin[groupId]; - #endif - #if (defined _RMAX_) - mrgMax = gMax[groupId]; - #endif - #if (defined _RSUM_) || (defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - mrgSum = gSum[groupId]; - #endif - #if (defined _RSUM2_) || (defined _RSORM_) - mrgSum2 = gSum2[groupId]; - #endif - #if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - mrgVectors = nObservations; - mrgSum2Cent = gSum2Cent[groupId]; - mrgMean = mrgSum / mrgVectors; - #endif - } -#endif - -#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - lNVec[itemId] = mrgVectors; -#endif - - for (uint i = itemId; i < vectorSize; i += localSize) - { - uint offset = groupId * globalDim + i * localDim; - -#if (defined _RMIN_) - algorithmFPType min = bMin[offset]; -#endif -#if (defined _RMAX_) - algorithmFPType max = bMax[offset]; -#endif -#if (defined _RSUM_) || (defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - algorithmFPType sum = bSum[offset]; -#endif -#if (defined _RSUM2_) || (defined _RSORM_) - algorithmFPType sum2 = bSum2[offset]; -#endif -#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - uint nVec = bNVec[offset]; - algorithmFPType sum2Cent = bSum2Cent[offset]; - algorithmFPType mean = sum / (algorithmFPType)nVec; - - algorithmFPType sumN1N2 = mrgVectors + (algorithmFPType)nVec; - algorithmFPType mulN1N2 = mrgVectors * (algorithmFPType)nVec; - algorithmFPType deltaScale = mulN1N2 / sumN1N2; - algorithmFPType meanScale = (algorithmFPType)1 / sumN1N2; - algorithmFPType delta = mean - mrgMean; -#endif - -#if (defined 
_RMIN_) - mrgMin = fmin(min, mrgMin); -#endif -#if (defined _RMAX_) - mrgMax = fmax(max, mrgMax); -#endif -#if (defined _RSUM_) || (defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - mrgSum += sum; -#endif -#if (defined _RSUM2_) || (defined _RSORM_) - mrgSum2 += sum2; -#endif -#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - mrgSum2Cent = mrgSum2Cent + sum2Cent + delta * delta * deltaScale; - mrgMean = (mrgMean * mrgVectors + mean * (algorithmFPType)nVec) * meanScale; - mrgVectors = sumN1N2; -#endif - -#if (defined _RMIN_) - lMin[itemId] = mrgMin; -#endif -#if (defined _RMAX_) - lMax[itemId] = mrgMax; -#endif -#if (defined _RSUM_) || (defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - lSum[itemId] = mrgSum; -#endif -#if (defined _RSUM2_) || (defined _RSORM_) - lSum2[itemId] = mrgSum2; -#endif -#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - lNVec[itemId] += nVec; - lSum2Cent[itemId] = mrgSum2Cent; - lMean[itemId] = mrgMean; -#endif - } - - barrier(CLK_LOCAL_MEM_FENCE); - - for (uint stride = localSize / 2; stride > 0; stride /= 2) - { - if (stride > itemId) - { - uint offset = itemId + stride; - -#if (defined _RMIN_) - algorithmFPType min = lMin[offset]; -#endif -#if (defined _RMAX_) - algorithmFPType max = lMax[offset]; -#endif -#if (defined _RSUM_) || (defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - algorithmFPType sum = lSum[offset]; -#endif -#if (defined _RSUM2_) || (defined _RSORM_) - algorithmFPType sum2 = lSum2[offset]; -#endif -#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - uint nVec = lNVec[offset]; - algorithmFPType sum2Cent = lSum2Cent[offset]; - algorithmFPType mean = lMean[offset]; - - algorithmFPType sumN1N2 = mrgVectors + (algorithmFPType)nVec; - algorithmFPType 
mulN1N2 = mrgVectors * (algorithmFPType)nVec; - algorithmFPType deltaScale = mulN1N2 / sumN1N2; - algorithmFPType meanScale = (algorithmFPType)1 / sumN1N2; - algorithmFPType delta = mean - mrgMean; -#endif - -#if (defined _RMIN_) - mrgMin = fmin(min, mrgMin); -#endif -#if (defined _RMAX_) - mrgMax = fmax(max, mrgMax); -#endif -#if (defined _RSUM_) || (defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - mrgSum += sum; -#endif -#if (defined _RSUM2_) || (defined _RSORM_) - mrgSum2 += sum2; -#endif -#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - mrgSum2Cent = mrgSum2Cent + sum2Cent + delta * delta * deltaScale; - mrgMean = (mrgMean * mrgVectors + mean * (algorithmFPType)nVec) * meanScale; - mrgVectors = sumN1N2; -#endif - - // item 0 collects all results in private vars - // but all others need to store it - if (0 < itemId) - { -#if (defined _RMIN_) - lMin[itemId] = mrgMin; -#endif -#if (defined _RMAX_) - lMax[itemId] = mrgMax; -#endif -#if (defined _RSUM_) || (defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - lSum[itemId] = mrgSum; -#endif -#if (defined _RSUM2_) || (defined _RSORM_) - lSum2[itemId] = mrgSum2; -#endif -#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - lNVec[itemId] += nVec; - lSum2Cent[itemId] = mrgSum2Cent; - lMean[itemId] = mrgMean; -#endif - } - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (0 == itemId) - { -#if (defined _RMIN_) - gMin[groupId] = mrgMin; -#endif -#if (defined _RMAX_) - gMax[groupId] = mrgMax; -#endif -#if (defined _RSUM_) \ - || (defined _ONLINE_) && ((defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)) - gSum[groupId] = mrgSum; -#endif -#if (defined _RSUM2_) || (defined _ONLINE_) && (defined _RSORM_) - gSum2[groupId] = mrgSum2; -#endif -#if (defined _RSUM2C_) || (defined _ONLINE_) && ((defined 
_RVARC_) || (defined _RSTDEV_) || (defined _RVART_)) - gSum2Cent[groupId] = mrgSum2Cent; -#endif - -#if !(defined _ONLINE_) - // common vars calculation - #if (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - algorithmFPType mrgVariance = mrgSum2Cent / (mrgVectors - (algorithmFPType)1); - #endif - #if (defined _RSTDEV_) || (defined _RVART_) - algorithmFPType mrgStDev = (algorithmFPType)sqrt(mrgVariance); - #endif - - // output assignment - #if (defined _RMEAN_) - gMean[groupId] = mrgMean; - #endif - #if (defined _RSORM_) - gSecondOrderRawMoment[groupId] = mrgSum2 / mrgVectors; - #endif - #if (defined _RVARC_) - gVariance[groupId] = mrgVariance; - #endif - #if (defined _RSTDEV_) - gStDev[groupId] = mrgStDev; - #endif - #if (defined _RVART_) - gVariation[groupId] = mrgStDev / mrgMean; - #endif -#endif - } -} - -/* finalize kernel */ - -__kernel void finalize(const algorithmFPType nObservations -#if (defined _RMIN_) - , - __global algorithmFPType * gMin -#endif -#if (defined _RMAX_) - , - __global algorithmFPType * gMax -#endif -#if (defined _RMEAN_) - , - __global algorithmFPType * gSum -#endif -#if (defined _RSORM_) - , - __global algorithmFPType * gSum2 -#endif -#if (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - , - __global algorithmFPType * gSum2Cent -#endif -#if (defined _RMEAN_) - , - __global algorithmFPType * gMean -#endif -#if (defined _RSORM_) - , - __global algorithmFPType * gSecondOrderRawMoment -#endif -#if (defined _RVARC_) - , - __global algorithmFPType * gVariance -#endif -#if (defined _RSTDEV_) - , - __global algorithmFPType * gStDev -#endif -#if (defined _RVART_) - , - __global algorithmFPType * gVariation -#endif -) -{ - const uint tid = get_global_id(0); - -#if (defined _RMEAN_) - algorithmFPType sum = gSum[tid]; -#endif -#if (defined _RSORM_) - algorithmFPType sum2 = gSum2[tid]; -#endif -#if (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - algorithmFPType sum2Cent = gSum2Cent[tid]; -#endif -#if (defined 
_RMEAN_) || (defined _RVART_) - algorithmFPType mean = sum / nObservations; -#endif -#if (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - algorithmFPType variance = (algorithmFPType)0; -#endif -#if (defined _RSTDEV_) || (defined _RVART_) - algorithmFPType stDev = (algorithmFPType)0; -#endif - -// common vars calculation -#if (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - variance = sum2Cent / (nObservations - (algorithmFPType)1); -#endif -#if (defined _RSTDEV_) || (defined _RVART_) - stDev = (algorithmFPType)sqrt(variance); -#endif - -// output assignment -#if (defined _RMEAN_) - gMean[tid] = mean; -#endif -#if (defined _RSORM_) - gSecondOrderRawMoment[tid] = sum2 / nObservations; -#endif -#if (defined _RVARC_) - gVariance[tid] = variance; -#endif -#if (defined _RSTDEV_) - gStDev[tid] = stDev; -#endif -#if (defined _RVART_) - gVariation[tid] = stDev / mean; -#endif -} diff --git a/cpp/daal/src/algorithms/low_order_moments/oneapi/cl_kernels/low_order_moments_kernels_all.h b/cpp/daal/src/algorithms/low_order_moments/oneapi/cl_kernels/low_order_moments_kernels_all.h deleted file mode 100644 index 2f9f6244f35..00000000000 --- a/cpp/daal/src/algorithms/low_order_moments/oneapi/cl_kernels/low_order_moments_kernels_all.h +++ /dev/null @@ -1,906 +0,0 @@ -/* file: low_order_moments_kernels_all.h */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of low order moments kernels. -//-- -*/ - -#ifndef __low_order_moments_kernels_all__ -#define __low_order_moments_kernels_all__ - -static const char * low_order_moments_kernels_all_cl = - "\n" - "#define CONCAT(n, suff) n ## suff\n" - "#define FULLNAME(n, p) CONCAT(n, p)\n" - "\n" - "#define singlePassBlockProcessor FULLNAME(singlePassBlockProcessor, FNAMESUFF)\n" - "#define singlePass FULLNAME(singlePass, FNAMESUFF)\n" - "#define blockProcessor FULLNAME(blockProcessor, FNAMESUFF)\n" - "#define processBlocks FULLNAME(processBlocks, FNAMESUFF)\n" - "#define mergeBlocks FULLNAME(mergeBlocks, FNAMESUFF)\n" - "#define finalize FULLNAME(finalize, FNAMESUFF)\n" - "\n" - "/* single pass kernels common */\n" - "void singlePassBlockProcessor(__global const algorithmFPType* vectors,\n" - " const uint nVectors,\n" - " const uint vectorSize\n" - " #if (defined _ONLINE_)\n" - " ,const algorithmFPType nObservations\n" - " #endif\n" - " #if (defined _RMIN_)\n" - " ,__global algorithmFPType* gMin\n" - " #endif\n" - " #if (defined _RMAX_)\n" - " ,__global algorithmFPType* gMax\n" - " #endif\n" - " #if (defined _RSUM_) || (defined _ONLINE_) && ((defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined " - "_RSTDEV_) || (defined _RVART_))\n" - " ,__global algorithmFPType* gSum\n" - " #endif\n" - " #if (defined _RSUM2_) || (defined _ONLINE_) && (defined _RSORM_)\n" - " ,__global algorithmFPType* gSum2\n" - " #endif\n" - " #if (defined _RSUM2C_) || (defined _ONLINE_) && ((defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_))\n" - " ,__global algorithmFPType* gSum2Cent\n" - " #endif\n" - " #if !(defined _ONLINE_)\n" - " #if (defined _RMEAN_)\n" - " ,__global algorithmFPType* gMean\n" - " #endif\n" - " #if (defined _RSORM_)\n" - " ,__global 
algorithmFPType* gSecondOrderRawMoment\n" - " #endif\n" - " #if (defined _RVARC_)\n" - " ,__global algorithmFPType* gVariance\n" - " #endif\n" - " #if (defined _RSTDEV_)\n" - " ,__global algorithmFPType* gStDev\n" - " #endif\n" - " #if (defined _RVART_)\n" - " ,__global algorithmFPType* gVariation\n" - " #endif\n" - " #endif\n" - " ,const uint rowPartIndex,\n" - " const uint rowParts,\n" - " const uint colPartIndex,\n" - " const uint colParts,\n" - " const uint tid,\n" - " const uint tnum)\n" - "{\n" - " const uint colOffset = colPartIndex * tnum;\n" - " const uint x = tid + colOffset;\n" - "\n" - " if (x < nVectors)\n" - " {\n" - " uint rowPartSize = (vectorSize + rowParts - 1) / rowParts;\n" - " const uint rowOffset = rowPartSize * rowPartIndex;\n" - "\n" - " if (rowPartSize + rowOffset > vectorSize)\n" - " {\n" - " rowPartSize = vectorSize - rowOffset;\n" - " }\n" - "\n" - "#if (defined _ONLINE_)\n" - " // for online mode initial values of min/max are defined later depending on nObservations \n" - " #if (defined _RMIN_)\n" - " algorithmFPType min = (algorithmFPType)0;\n" - " #endif\n" - " #if (defined _RMAX_)\n" - " algorithmFPType max = (algorithmFPType)0;\n" - " #endif\n" - "#else\n" - " #if (defined _RMIN_)\n" - " algorithmFPType min = vectors[rowOffset * nVectors + x];\n" - " #endif\n" - " #if (defined _RMAX_)\n" - " algorithmFPType max = vectors[rowOffset * nVectors + x];\n" - " #endif\n" - "#endif\n" - "\n" - "#if (defined _RSUM_) || (defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " algorithmFPType sum = (algorithmFPType)0; \n" - "#endif\n" - "#if (defined _RSUM2_) || (defined _RSORM_)\n" - " algorithmFPType sum2 = (algorithmFPType)0; \n" - "#endif\n" - "#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " algorithmFPType sum2Cent = (algorithmFPType)0; \n" - " algorithmFPType mean = (algorithmFPType)0; \n" - "#endif\n" - "\n" - "#if (defined _ONLINE_)\n" - 
" if((algorithmFPType)0 == nObservations)\n" - " {\n" - " #if (defined _RMIN_)\n" - " min = vectors[rowOffset * nVectors + x];\n" - " #endif\n" - " #if (defined _RMAX_)\n" - " max = vectors[rowOffset * nVectors + x];\n" - " #endif\n" - " #if (defined _RSUM_) || (defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " sum = (algorithmFPType)0; \n" - " #endif\n" - " #if (defined _RSUM2_) || (defined _RSORM_)\n" - " sum2 = (algorithmFPType)0; \n" - " #endif\n" - " #if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " sum2Cent = (algorithmFPType)0; \n" - " mean = (algorithmFPType)0; \n" - " #endif\n" - " }\n" - " else\n" - " {\n" - " #if (defined _RMIN_)\n" - " min = gMin [x * rowParts + rowPartIndex];\n" - " #endif\n" - " #if (defined _RMAX_)\n" - " max = gMax [x * rowParts + rowPartIndex];\n" - " #endif\n" - " #if (defined _RSUM_) || (defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " sum = gSum [x * rowParts + rowPartIndex]; \n" - " #endif\n" - " #if (defined _RSUM2_) || (defined _RSORM_)\n" - " sum2 = gSum2[x * rowParts + rowPartIndex]; \n" - " #endif\n" - " #if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " sum2Cent = gSum2Cent[x * rowParts + rowPartIndex]; \n" - " mean = sum/nObservations; \n" - " #endif\n" - " }\n" - "#endif \n" - "\n" - " for (int row = 0; row < rowPartSize; row++)\n" - " {\n" - " const uint y = (row + rowOffset) * nVectors;\n" - " const algorithmFPType el = vectors[y + x];\n" - "#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " #if (defined _ONLINE_)\n" - " algorithmFPType invN = ((algorithmFPType)1) / (nObservations + (algorithmFPType)(row + 1));\n" - " #else \n" - " algorithmFPType invN = ((algorithmFPType)1) / (algorithmFPType)(row + 1);\n" - " #endif\n" - " algorithmFPType delta = el - mean;\n" - 
"#endif\n" - "\n" - "#if (defined _RMIN_)\n" - " min = fmin(el, min);\n" - "#endif\n" - "#if (defined _RMAX_)\n" - " max = fmax(el, max);\n" - "#endif\n" - "#if (defined _RSUM_) || (defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " sum += el; \n" - "#endif\n" - "#if (defined _RSUM2_) || (defined _RSORM_)\n" - " sum2 += el * el; \n" - "#endif\n" - "#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " mean += delta * invN;\n" - " sum2Cent += delta * (el - mean);\n" - "#endif\n" - " }\n" - "\n" - "#if (defined _RMIN_)\n" - " gMin [x * rowParts + rowPartIndex] = min;\n" - "#endif\n" - "#if (defined _RMAX_)\n" - " gMax [x * rowParts + rowPartIndex] = max;\n" - "#endif\n" - "#if (defined _RSUM_) || (defined _ONLINE_) && ((defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined " - "_RVART_))\n" - " gSum [x * rowParts + rowPartIndex] = sum; \n" - "#endif\n" - "#if (defined _RSUM2_) || (defined _ONLINE_) && (defined _RSORM_)\n" - " gSum2[x * rowParts + rowPartIndex] = sum2;\n" - "#endif\n" - "#if (defined _RSUM2C_) || (defined _ONLINE_) && ((defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_))\n" - " gSum2Cent[x * rowParts + rowPartIndex] = sum2Cent;\n" - "#endif\n" - "\n" - "#if !(defined _ONLINE_)\n" - " // common vars calculation\n" - " #if (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " algorithmFPType variance = sum2Cent / (rowPartSize - (algorithmFPType)1);\n" - " #endif\n" - " #if (defined _RSTDEV_) || (defined _RVART_)\n" - " algorithmFPType stDev = (algorithmFPType)sqrt(variance);\n" - " #endif\n" - "\n" - " // output assignment\n" - " #if (defined _RMEAN_)\n" - " gMean[x * rowParts + rowPartIndex] = mean;\n" - " #endif\n" - " #if (defined _RSORM_)\n" - " gSecondOrderRawMoment[x * rowParts + rowPartIndex] = sum2/rowPartSize;\n" - " #endif\n" - " #if (defined _RVARC_)\n" - " gVariance[x * rowParts + 
rowPartIndex] = variance;\n" - " #endif\n" - " #if (defined _RSTDEV_)\n" - " gStDev[x * rowParts + rowPartIndex] = stDev; \n" - " #endif\n" - " #if (defined _RVART_)\n" - " gVariation[x * rowParts + rowPartIndex] = stDev/mean;\n" - " #endif\n" - "#endif\n" - " }\n" - "}\n" - "\n" - "__kernel void singlePass(__global const algorithmFPType* vectors,\n" - " const uint nVectors,\n" - " const uint vectorSize\n" - " #if (defined _ONLINE_)\n" - " ,const algorithmFPType nObservations\n" - " #endif\n" - " #if (defined _RMIN_)\n" - " ,__global algorithmFPType* gMin\n" - " #endif\n" - " #if (defined _RMAX_)\n" - " ,__global algorithmFPType* gMax\n" - " #endif\n" - " #if (defined _RSUM_) || (defined _ONLINE_) && ((defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined " - "_RSTDEV_) || (defined _RVART_))\n" - " ,__global algorithmFPType* gSum\n" - " #endif\n" - " #if (defined _RSUM2_) || (defined _ONLINE_) && (defined _RSORM_)\n" - " ,__global algorithmFPType* gSum2\n" - " #endif\n" - " #if (defined _RSUM2C_) || (defined _ONLINE_) && ((defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_))\n" - " ,__global algorithmFPType* gSum2Cent\n" - " #endif\n" - " #if !(defined _ONLINE_)\n" - " #if (defined _RMEAN_)\n" - " ,__global algorithmFPType* gMean\n" - " #endif\n" - " #if (defined _RSORM_)\n" - " ,__global algorithmFPType* gSecondOrderRawMoment\n" - " #endif\n" - " #if (defined _RVARC_)\n" - " ,__global algorithmFPType* gVariance\n" - " #endif\n" - " #if (defined _RSTDEV_)\n" - " ,__global algorithmFPType* gStDev\n" - " #endif\n" - " #if (defined _RVART_)\n" - " ,__global algorithmFPType* gVariation\n" - " #endif\n" - " #endif\n" - " )\n" - "{\n" - " const uint tid = get_local_id(0);\n" - " const uint tnum = get_local_size(0);\n" - " const uint gid = get_group_id(0);\n" - " const uint gnum = get_num_groups(0);\n" - "\n" - " const uint colParts = (nVectors + tnum - 1) / tnum;\n" - " const uint rowParts = gnum / colParts;\n" - "\n" - " const uint rowPartIndex 
= gid / colParts;\n" - " const uint colPartIndex = gid - rowPartIndex * colParts;\n" - "\n" - " singlePassBlockProcessor(vectors, nVectors, vectorSize \n" - " #if (defined _ONLINE_)\n" - " ,nObservations\n" - " #endif\n" - " #if (defined _RMIN_)\n" - " ,gMin\n" - " #endif\n" - " #if (defined _RMAX_)\n" - " ,gMax\n" - " #endif\n" - " #if (defined _RSUM_) || (defined _ONLINE_) && ((defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined " - "_RSTDEV_) || (defined _RVART_))\n" - " ,gSum\n" - " #endif\n" - " #if (defined _RSUM2_) || (defined _ONLINE_) && (defined _RSORM_)\n" - " ,gSum2\n" - " #endif\n" - " #if (defined _RSUM2C_) || (defined _ONLINE_) && ((defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_))\n" - " ,gSum2Cent\n" - " #endif\n" - " #if !(defined _ONLINE_)\n" - " #if (defined _RMEAN_)\n" - " ,gMean\n" - " #endif\n" - " #if (defined _RSORM_)\n" - " ,gSecondOrderRawMoment\n" - " #endif\n" - " #if (defined _RVARC_)\n" - " ,gVariance\n" - " #endif\n" - " #if (defined _RSTDEV_)\n" - " ,gStDev\n" - " #endif\n" - " #if (defined _RVART_)\n" - " ,gVariation\n" - " #endif\n" - " #endif\n" - " ,rowPartIndex, rowParts,\n" - " colPartIndex, colParts,\n" - " tid, tnum);\n" - "}\n" - "\n" - "/* common kernels for blocks processing */\n" - "\n" - "void blockProcessor(__global const algorithmFPType* vectors,\n" - " const uint nVectors,\n" - " const uint vectorSize\n" - " #if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " ,__global uint* bNVec\n" - " #endif\n" - " #if (defined _RMIN_)\n" - " ,__global algorithmFPType* bMin\n" - " #endif\n" - " #if (defined _RMAX_)\n" - " ,__global algorithmFPType* bMax\n" - " #endif\n" - " #if (defined _RSUM_) || (defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " ,__global algorithmFPType* bSum\n" - " #endif\n" - " #if (defined _RSUM2_) || (defined _RSORM_)\n" - " ,__global algorithmFPType* bSum2\n" - " #endif\n" - 
" #if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " ,__global algorithmFPType* bSum2Cent\n" - " #endif\n" - " ,const uint rowPartIndex,\n" - " const uint rowParts,\n" - " const uint colPartIndex,\n" - " const uint colParts,\n" - " const uint tid,\n" - " const uint tnum)\n" - "{\n" - " const uint colOffset = colPartIndex * tnum;\n" - " const uint x = tid + colOffset;\n" - "\n" - " if (x < nVectors)\n" - " {\n" - " uint rowPartSize = (vectorSize + rowParts - 1) / rowParts;\n" - " const uint rowOffset = rowPartSize * rowPartIndex;\n" - "\n" - " if (rowPartSize + rowOffset > vectorSize)\n" - " {\n" - " rowPartSize = vectorSize - rowOffset;\n" - " }\n" - "\n" - "#if (defined _RMIN_)\n" - " algorithmFPType min = vectors[rowOffset * nVectors + x];\n" - "#endif\n" - "#if (defined _RMAX_)\n" - " algorithmFPType max = vectors[rowOffset * nVectors + x];\n" - "#endif\n" - "#if (defined _RSUM_) || (defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " algorithmFPType sum = (algorithmFPType)0; \n" - "#endif\n" - "#if (defined _RSUM2_) || (defined _RSORM_)\n" - " algorithmFPType sum2 = (algorithmFPType)0; \n" - "#endif\n" - "#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " algorithmFPType sum2Cent = (algorithmFPType)0; \n" - " algorithmFPType mean = (algorithmFPType)0; \n" - "#endif\n" - "\n" - " for (int row = 0; row < rowPartSize; row++)\n" - " {\n" - " const uint y = (row + rowOffset) * nVectors;\n" - " const algorithmFPType el = vectors[y + x];\n" - "#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " algorithmFPType invN = ((algorithmFPType)1) / (algorithmFPType)(row + 1);\n" - " algorithmFPType delta = el - mean;\n" - "#endif\n" - "\n" - "#if (defined _RMIN_)\n" - " min = fmin(el, min);\n" - "#endif\n" - "#if (defined _RMAX_)\n" - " max = fmax(el, max);\n" - "#endif\n" - "#if (defined 
_RSUM_) || (defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " sum += el; \n" - "#endif\n" - "#if (defined _RSUM2_) || (defined _RSORM_)\n" - " sum2 += el * el; \n" - "#endif\n" - "#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " mean += delta * invN;\n" - " sum2Cent += delta * (el - mean);\n" - "#endif\n" - " }\n" - "\n" - "#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " bNVec[x * rowParts + rowPartIndex] = (uint)rowPartSize;\n" - "#endif\n" - "#if (defined _RMIN_)\n" - " bMin [x * rowParts + rowPartIndex] = min;\n" - "#endif\n" - "#if (defined _RMAX_)\n" - " bMax [x * rowParts + rowPartIndex] = max;\n" - "#endif\n" - "#if (defined _RSUM_) || (defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " bSum [x * rowParts + rowPartIndex] = sum; \n" - "#endif\n" - "#if (defined _RSUM2_) || (defined _RSORM_)\n" - " bSum2[x * rowParts + rowPartIndex] = sum2;\n" - "#endif\n" - "#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " bSum2Cent[x * rowParts + rowPartIndex] = sum2Cent;\n" - "#endif\n" - " }\n" - "}\n" - "\n" - "__kernel void processBlocks(__global const algorithmFPType* vectors,\n" - " const uint nVectors,\n" - " const uint vectorSize\n" - " #if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " ,__global uint* bNVec\n" - " #endif\n" - " #if (defined _RMIN_)\n" - " ,__global algorithmFPType* bMin\n" - " #endif\n" - " #if (defined _RMAX_)\n" - " ,__global algorithmFPType* bMax\n" - " #endif\n" - " #if (defined _RSUM_) || (defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined " - "_RVART_)\n" - " ,__global algorithmFPType* bSum\n" - " #endif\n" - " #if (defined _RSUM2_) || (defined _RSORM_)\n" - " ,__global algorithmFPType* bSum2\n" - " #endif\n" 
- " #if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " ,__global algorithmFPType* bSum2Cent\n" - " #endif\n" - " )\n" - "{\n" - " const uint tid = get_local_id(0);\n" - " const uint tnum = get_local_size(0);\n" - " const uint gid = get_group_id(0);\n" - " const uint gnum = get_num_groups(0);\n" - "\n" - " const uint colParts = (nVectors + tnum - 1) / tnum;\n" - " const uint rowParts = gnum / colParts;\n" - "\n" - " const uint rowPartIndex = gid / colParts;\n" - " const uint colPartIndex = gid - rowPartIndex * colParts;\n" - "\n" - " blockProcessor(vectors, nVectors, vectorSize\n" - " #if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " ,bNVec\n" - " #endif\n" - " #if (defined _RMIN_)\n" - " ,bMin\n" - " #endif\n" - " #if (defined _RMAX_)\n" - " ,bMax\n" - " #endif\n" - " #if (defined _RSUM_) || (defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " ,bSum\n" - " #endif\n" - " #if (defined _RSUM2_) || (defined _RSORM_)\n" - " ,bSum2\n" - " #endif\n" - " #if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " ,bSum2Cent\n" - " #endif\n" - " ,rowPartIndex, rowParts,\n" - " colPartIndex, colParts,\n" - " tid, tnum);\n" - "}\n" - "\n" - "/* merge blocks kernel */\n" - "__kernel void mergeBlocks(const uint vectorSize\n" - " #if (defined _ONLINE_)\n" - " ,const algorithmFPType nObservations\n" - " #endif\n" - " #if (defined _RMIN_)\n" - " ,__global algorithmFPType* gMin\n" - " #endif\n" - " #if (defined _RMAX_)\n" - " ,__global algorithmFPType* gMax\n" - " #endif\n" - " #if (defined _RSUM_) || (defined _ONLINE_) && ((defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined " - "_RSTDEV_) || (defined _RVART_))\n" - " ,__global algorithmFPType* gSum\n" - " #endif\n" - " #if (defined _RSUM2_) || (defined _ONLINE_) && (defined _RSORM_)\n" - " ,__global algorithmFPType* gSum2\n" - " 
#endif\n" - " #if (defined _RSUM2C_) || (defined _ONLINE_) && ((defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_))\n" - " ,__global algorithmFPType* gSum2Cent\n" - " #endif\n" - " #if !(defined _ONLINE_)\n" - " #if (defined _RMEAN_)\n" - " ,__global algorithmFPType* gMean\n" - " #endif\n" - " #if (defined _RSORM_)\n" - " ,__global algorithmFPType* gSecondOrderRawMoment\n" - " #endif\n" - " #if (defined _RVARC_)\n" - " ,__global algorithmFPType* gVariance\n" - " #endif\n" - " #if (defined _RSTDEV_)\n" - " ,__global algorithmFPType* gStDev\n" - " #endif\n" - " #if (defined _RVART_)\n" - " ,__global algorithmFPType* gVariation\n" - " #endif\n" - " #endif\n" - " #if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " ,__global uint* bNVec\n" - " #endif\n" - " #if (defined _RMIN_)\n" - " ,__global algorithmFPType* bMin\n" - " #endif\n" - " #if (defined _RMAX_)\n" - " ,__global algorithmFPType* bMax\n" - " #endif\n" - " #if (defined _RSUM_) || (defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined " - "_RVART_)\n" - " ,__global algorithmFPType* bSum\n" - " #endif\n" - " #if (defined _RSUM2_) || (defined _RSORM_)\n" - " ,__global algorithmFPType* bSum2\n" - " #endif\n" - " #if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " ,__global algorithmFPType* bSum2Cent\n" - " #endif\n" - " )\n" - "{\n" - "#if (defined _RMIN_)\n" - " __local algorithmFPType lMin[LOCAL_BUFFER_SIZE];\n" - "#endif\n" - "#if (defined _RMAX_)\n" - " __local algorithmFPType lMax[LOCAL_BUFFER_SIZE];\n" - "#endif\n" - "#if (defined _RSUM_) || (defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " __local algorithmFPType lSum[LOCAL_BUFFER_SIZE];\n" - "#endif\n" - "#if (defined _RSUM2_) || (defined _RSORM_)\n" - " __local algorithmFPType lSum2[LOCAL_BUFFER_SIZE];\n" - "#endif\n" - "#if (defined _RSUM2C_) || (defined _RVARC_) 
|| (defined _RSTDEV_) || (defined _RVART_)\n" - " __local uint lNVec[LOCAL_BUFFER_SIZE];\n" - " __local algorithmFPType lSum2Cent[LOCAL_BUFFER_SIZE];\n" - " __local algorithmFPType lMean[LOCAL_BUFFER_SIZE];\n" - "#endif\n" - "\n" - " const uint localSize = get_local_size(0);\n" - " const uint globalDim = vectorSize;\n" - " const uint localDim = 1;\n" - " const uint itemId = get_local_id(0);\n" - " const uint groupId = get_group_id(0);\n" - "\n" - "#if (defined _RMIN_)\n" - " algorithmFPType mrgMin = bMin[groupId*globalDim + itemId*localDim];\n" - "#endif\n" - "#if (defined _RMAX_)\n" - " algorithmFPType mrgMax = bMax[groupId*globalDim + itemId*localDim];\n" - "#endif\n" - "#if (defined _RSUM_) || (defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " algorithmFPType mrgSum = (algorithmFPType)0;\n" - "#endif\n" - "#if (defined _RSUM2_) || (defined _RSORM_)\n" - " algorithmFPType mrgSum2 = (algorithmFPType)0;\n" - "#endif\n" - "#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " algorithmFPType mrgVectors = (algorithmFPType)0;\n" - " algorithmFPType mrgSum2Cent = (algorithmFPType)0;\n" - " algorithmFPType mrgMean = (algorithmFPType)0;\n" - "#endif\n" - "\n" - "#if (defined _ONLINE_)\n" - " if(0 == itemId && (algorithmFPType)0 != nObservations)\n" - " {\n" - " // item 0 in each group performs merge of previous results\n" - " #if (defined _RMIN_)\n" - " mrgMin = gMin[groupId];\n" - " #endif\n" - " #if (defined _RMAX_)\n" - " mrgMax = gMax[groupId];\n" - " #endif\n" - " #if (defined _RSUM_) || (defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " mrgSum = gSum[groupId];\n" - " #endif\n" - " #if (defined _RSUM2_) || (defined _RSORM_)\n" - " mrgSum2 = gSum2[groupId];\n" - " #endif\n" - " #if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " mrgVectors = nObservations; \n" - " 
mrgSum2Cent = gSum2Cent[groupId];\n" - " mrgMean = mrgSum/mrgVectors;\n" - " #endif\n" - " }\n" - "#endif\n" - "\n" - "#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " lNVec[itemId] = mrgVectors; \n" - "#endif\n" - "\n" - " for(uint i = itemId; i < vectorSize; i+= localSize)\n" - " {\n" - " uint offset = groupId*globalDim + i*localDim;\n" - "\n" - "#if (defined _RMIN_)\n" - " algorithmFPType min = bMin[offset]; \n" - "#endif\n" - "#if (defined _RMAX_)\n" - " algorithmFPType max = bMax[offset];\n" - "#endif\n" - "#if (defined _RSUM_) || (defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " algorithmFPType sum = bSum[offset]; \n" - "#endif\n" - "#if (defined _RSUM2_) || (defined _RSORM_)\n" - " algorithmFPType sum2 = bSum2[offset]; \n" - "#endif\n" - "#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " uint nVec = bNVec[offset];\n" - " algorithmFPType sum2Cent = bSum2Cent[offset]; \n" - " algorithmFPType mean = sum/(algorithmFPType)nVec;\n" - " \n" - " algorithmFPType sumN1N2 = mrgVectors + (algorithmFPType)nVec;\n" - " algorithmFPType mulN1N2 = mrgVectors * (algorithmFPType)nVec;\n" - " algorithmFPType deltaScale = mulN1N2 / sumN1N2;\n" - " algorithmFPType meanScale = (algorithmFPType)1 / sumN1N2;\n" - " algorithmFPType delta = mean - mrgMean;\n" - "#endif\n" - "\n" - "#if (defined _RMIN_)\n" - " mrgMin = fmin(min, mrgMin);\n" - "#endif\n" - "#if (defined _RMAX_)\n" - " mrgMax = fmax(max, mrgMax);\n" - "#endif\n" - "#if (defined _RSUM_) || (defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " mrgSum += sum; \n" - "#endif\n" - "#if (defined _RSUM2_) || (defined _RSORM_)\n" - " mrgSum2 += sum2;\n" - "#endif\n" - "#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " mrgSum2Cent = mrgSum2Cent + sum2Cent + delta*delta*deltaScale;\n" 
- " mrgMean = (mrgMean * mrgVectors + mean * (algorithmFPType)nVec)* meanScale;\n" - " mrgVectors = sumN1N2;\n" - "#endif\n" - "\n" - "#if (defined _RMIN_)\n" - " lMin[itemId] = mrgMin; \n" - "#endif\n" - "#if (defined _RMAX_)\n" - " lMax[itemId] = mrgMax; \n" - "#endif\n" - "#if (defined _RSUM_) || (defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " lSum[itemId] = mrgSum; \n" - "#endif\n" - "#if (defined _RSUM2_) || (defined _RSORM_)\n" - " lSum2[itemId] = mrgSum2; \n" - "#endif\n" - "#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " lNVec[itemId] += nVec; \n" - " lSum2Cent[itemId] = mrgSum2Cent;\n" - " lMean[itemId] = mrgMean;\n" - "#endif\n" - " }\n" - "\n" - " barrier(CLK_LOCAL_MEM_FENCE);\n" - "\n" - " for (uint stride = localSize / 2; stride > 0; stride /= 2)\n" - " {\n" - " if (stride > itemId)\n" - " {\n" - " uint offset = itemId + stride;\n" - "\n" - "#if (defined _RMIN_)\n" - " algorithmFPType min = lMin[offset]; \n" - "#endif\n" - "#if (defined _RMAX_)\n" - " algorithmFPType max = lMax[offset];\n" - "#endif\n" - "#if (defined _RSUM_) || (defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " algorithmFPType sum = lSum[offset]; \n" - "#endif\n" - "#if (defined _RSUM2_) || (defined _RSORM_)\n" - " algorithmFPType sum2 = lSum2[offset]; \n" - "#endif\n" - "#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " uint nVec = lNVec[offset];\n" - " algorithmFPType sum2Cent = lSum2Cent[offset]; \n" - " algorithmFPType mean = lMean[offset];\n" - "\n" - " algorithmFPType sumN1N2 = mrgVectors + (algorithmFPType) nVec;\n" - " algorithmFPType mulN1N2 = mrgVectors * (algorithmFPType) nVec;\n" - " algorithmFPType deltaScale = mulN1N2 / sumN1N2;\n" - " algorithmFPType meanScale = (algorithmFPType)1 / sumN1N2;\n" - " algorithmFPType delta = mean - mrgMean;\n" - "#endif\n" - "\n" 
- "#if (defined _RMIN_)\n" - " mrgMin = fmin(min, mrgMin);\n" - "#endif\n" - "#if (defined _RMAX_)\n" - " mrgMax = fmax(max, mrgMax);\n" - "#endif\n" - "#if (defined _RSUM_) || (defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " mrgSum += sum; \n" - "#endif\n" - "#if (defined _RSUM2_) || (defined _RSORM_)\n" - " mrgSum2 += sum2;\n" - "#endif\n" - "#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " mrgSum2Cent = mrgSum2Cent + sum2Cent + delta*delta*deltaScale;\n" - " mrgMean = (mrgMean * mrgVectors + mean * (algorithmFPType)nVec)* meanScale;\n" - " mrgVectors = sumN1N2;\n" - "#endif\n" - "\n" - " // item 0 collects all results in private vars\n" - " // but all others need to store it\n" - " if(0 < itemId)\n" - " {\n" - "#if (defined _RMIN_)\n" - " lMin[itemId] = mrgMin; \n" - "#endif\n" - "#if (defined _RMAX_)\n" - " lMax[itemId] = mrgMax; \n" - "#endif\n" - "#if (defined _RSUM_) || (defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " lSum[itemId] = mrgSum; \n" - "#endif\n" - "#if (defined _RSUM2_) || (defined _RSORM_)\n" - " lSum2[itemId] = mrgSum2; \n" - "#endif\n" - "#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " lNVec[itemId] += nVec; \n" - " lSum2Cent[itemId] = mrgSum2Cent;\n" - " lMean[itemId] = mrgMean;\n" - "#endif\n" - " }\n" - " }\n" - " barrier(CLK_LOCAL_MEM_FENCE);\n" - " }\n" - " \n" - " if (0 == itemId)\n" - " {\n" - "#if (defined _RMIN_)\n" - " gMin[groupId] = mrgMin;\n" - "#endif\n" - "#if (defined _RMAX_)\n" - " gMax[groupId] = mrgMax;\n" - "#endif\n" - "#if (defined _RSUM_) || (defined _ONLINE_) && ((defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined " - "_RVART_))\n" - " gSum[groupId] = mrgSum; \n" - "#endif\n" - "#if (defined _RSUM2_) || (defined _ONLINE_) && (defined _RSORM_)\n" - " gSum2[groupId] = 
mrgSum2;\n" - "#endif\n" - "#if (defined _RSUM2C_) || (defined _ONLINE_) && ((defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_))\n" - " gSum2Cent[groupId] = mrgSum2Cent;\n" - "#endif\n" - "\n" - "#if !(defined _ONLINE_)\n" - " // common vars calculation\n" - " #if (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " algorithmFPType mrgVariance = mrgSum2Cent / (mrgVectors - (algorithmFPType)1);\n" - " #endif\n" - " #if (defined _RSTDEV_) || (defined _RVART_)\n" - " algorithmFPType mrgStDev = (algorithmFPType)sqrt(mrgVariance);\n" - " #endif\n" - "\n" - " // output assignment\n" - " #if (defined _RMEAN_)\n" - " gMean[groupId] = mrgMean;\n" - " #endif\n" - " #if (defined _RSORM_)\n" - " gSecondOrderRawMoment[groupId] = mrgSum2/mrgVectors;\n" - " #endif\n" - " #if (defined _RVARC_)\n" - " gVariance[groupId] = mrgVariance;\n" - " #endif\n" - " #if (defined _RSTDEV_)\n" - " gStDev[groupId] = mrgStDev; \n" - " #endif\n" - " #if (defined _RVART_)\n" - " gVariation[groupId] = mrgStDev/mrgMean;\n" - " #endif\n" - "#endif\n" - " }\n" - "}\n" - "\n" - "/* finalize kernel */\n" - "\n" - "__kernel void finalize(const algorithmFPType nObservations\n" - " #if (defined _RMIN_)\n" - " ,__global algorithmFPType* gMin\n" - " #endif\n" - " #if (defined _RMAX_)\n" - " ,__global algorithmFPType* gMax\n" - " #endif\n" - " #if (defined _RMEAN_)\n" - " ,__global algorithmFPType* gSum\n" - " #endif\n" - " #if (defined _RSORM_)\n" - " ,__global algorithmFPType* gSum2\n" - " #endif\n" - " #if (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " ,__global algorithmFPType* gSum2Cent\n" - " #endif\n" - " #if (defined _RMEAN_)\n" - " ,__global algorithmFPType* gMean\n" - " #endif\n" - " #if (defined _RSORM_)\n" - " ,__global algorithmFPType* gSecondOrderRawMoment\n" - " #endif\n" - " #if (defined _RVARC_)\n" - " ,__global algorithmFPType* gVariance\n" - " #endif\n" - " #if (defined _RSTDEV_)\n" - " ,__global algorithmFPType* gStDev\n" - " #endif\n" - " #if 
(defined _RVART_)\n" - " ,__global algorithmFPType* gVariation\n" - " #endif\n" - " )\n" - "{\n" - " const uint tid = get_global_id(0);\n" - "\n" - "#if (defined _RMEAN_)\n" - " algorithmFPType sum = gSum [tid]; \n" - "#endif\n" - "#if (defined _RSORM_)\n" - " algorithmFPType sum2 = gSum2[tid]; \n" - "#endif\n" - "#if (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " algorithmFPType sum2Cent = gSum2Cent[tid]; \n" - "#endif\n" - "#if (defined _RMEAN_) || (defined _RVART_)\n" - " algorithmFPType mean = sum / nObservations;\n" - "#endif\n" - "#if (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " algorithmFPType variance = (algorithmFPType)0; \n" - "#endif\n" - "#if (defined _RSTDEV_) || (defined _RVART_)\n" - " algorithmFPType stDev = (algorithmFPType)0; \n" - "#endif\n" - "\n" - "// common vars calculation\n" - "#if (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " variance = sum2Cent / (nObservations - (algorithmFPType)1);\n" - "#endif\n" - "#if (defined _RSTDEV_) || (defined _RVART_)\n" - " stDev = (algorithmFPType)sqrt(variance);\n" - "#endif\n" - "\n" - "// output assignment\n" - "#if (defined _RMEAN_)\n" - " gMean[tid] = mean;\n" - "#endif\n" - "#if (defined _RSORM_)\n" - " gSecondOrderRawMoment[tid] = sum2 / nObservations;\n" - "#endif\n" - "#if (defined _RVARC_)\n" - " gVariance[tid] = variance;\n" - "#endif\n" - "#if (defined _RSTDEV_)\n" - " gStDev[tid] = stDev; \n" - "#endif\n" - "#if (defined _RVART_)\n" - " gVariation[tid] = stDev / mean;\n" - "#endif\n" - "}\n" - "\n"; -#endif diff --git a/cpp/daal/src/algorithms/low_order_moments/oneapi/cl_kernels/low_order_moments_kernels_distr.cl b/cpp/daal/src/algorithms/low_order_moments/oneapi/cl_kernels/low_order_moments_kernels_distr.cl deleted file mode 100644 index 9f2df848a0c..00000000000 --- a/cpp/daal/src/algorithms/low_order_moments/oneapi/cl_kernels/low_order_moments_kernels_distr.cl +++ /dev/null @@ -1,252 +0,0 @@ -/* file: 
low_order_moments_kernels_distr.cl */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of low order moments kernels. -//-- -*/ - -#define CONCAT(n, suff) n##suff -#define FULLNAME(n, p) CONCAT(n, p) - -#define mergeDistrBlocks FULLNAME(mergeDistrBlocks, FNAMESUFF) -#define finalize FULLNAME(finalize, FNAMESUFF) - -/* merge distributed blocks kernel */ -__kernel void mergeDistrBlocks(uint nFeatures, uint nBlocks, uint stride -#if (defined _RMIN_) - , - __global algorithmFPType * gMin -#endif -#if (defined _RMAX_) - , - __global algorithmFPType * gMax -#endif -#if (defined _RSUM_) || (defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - , - __global algorithmFPType * gSum -#endif -#if (defined _RSUM2_) || (defined _RSORM_) - , - __global algorithmFPType * gSum2 -#endif -#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - , - __global algorithmFPType * gSum2Cent -#endif -#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - , - const __global algorithmFPType * bNVec -#endif -#if (defined _RMIN_) - , - const __global algorithmFPType * bMin -#endif -#if (defined _RMAX_) - , - const __global algorithmFPType * bMax -#endif -#if 
(defined _RSUM_) || (defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - , - const __global algorithmFPType * bSum -#endif -#if (defined _RSUM2_) || (defined _RSORM_) - , - const __global algorithmFPType * bSum2 -#endif -#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - , - const __global algorithmFPType * bSum2Cent -#endif -) -{ - const uint itemId = get_global_id(0); - -#if (defined _RMIN_) - algorithmFPType mrgMin = bMin[itemId]; -#endif -#if (defined _RMAX_) - algorithmFPType mrgMax = bMax[itemId]; -#endif -#if (defined _RSUM_) || (defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - algorithmFPType mrgSum = (algorithmFPType)0; -#endif -#if (defined _RSUM2_) || (defined _RSORM_) - algorithmFPType mrgSum2 = (algorithmFPType)0; -#endif -#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - algorithmFPType mrgVectors = (algorithmFPType)0; - algorithmFPType mrgSum2Cent = (algorithmFPType)0; - algorithmFPType mrgMean = (algorithmFPType)0; -#endif - - for (uint i = 0; i < nBlocks; i++) - { - uint offset = i * stride; - -#if (defined _RMIN_) - algorithmFPType min = bMin[offset]; -#endif -#if (defined _RMAX_) - algorithmFPType max = bMax[offset]; -#endif -#if (defined _RSUM_) || (defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - algorithmFPType sum = bSum[offset]; -#endif -#if (defined _RSUM2_) || (defined _RSORM_) - algorithmFPType sum2 = bSum2[offset]; -#endif -#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - algorithmFPType nVec = bNVec[i]; - algorithmFPType sum2Cent = bSum2Cent[offset]; - algorithmFPType mean = sum / nVec; - - algorithmFPType sumN1N2 = mrgVectors + nVec; - algorithmFPType mulN1N2 = mrgVectors * nVec; - algorithmFPType deltaScale = mulN1N2 / sumN1N2; - algorithmFPType meanScale = 
(algorithmFPType)1 / sumN1N2; - algorithmFPType delta = mean - mrgMean; -#endif - -#if (defined _RMIN_) - mrgMin = fmin(min, mrgMin); -#endif -#if (defined _RMAX_) - mrgMax = fmax(max, mrgMax); -#endif -#if (defined _RSUM_) || (defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - mrgSum += sum; -#endif -#if (defined _RSUM2_) || (defined _RSORM_) - mrgSum2 += sum2; -#endif -#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - mrgSum2Cent = mrgSum2Cent + sum2Cent + delta * delta * deltaScale; - mrgMean = (mrgMean * mrgVectors + mean * nVec) * meanScale; - mrgVectors = sumN1N2; -#endif - } - -#if (defined _RMIN_) - gMin[itemId] = mrgMin; -#endif -#if (defined _RMAX_) - gMax[itemId] = mrgMax; -#endif -#if (defined _RSUM_) || (defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - gSum[itemId] = mrgSum; -#endif -#if (defined _RSUM2_) || (defined _RSORM_) - gSum2[itemId] = mrgSum2; -#endif -#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - gSum2Cent[itemId] = mrgSum2Cent; -#endif -} - -/* finalize kernel */ - -__kernel void finalize(const algorithmFPType nObservations -#if (defined _RMIN_) - , - __global algorithmFPType * gMin -#endif -#if (defined _RMAX_) - , - __global algorithmFPType * gMax -#endif -#if (defined _RMEAN_) - , - __global algorithmFPType * gSum -#endif -#if (defined _RSORM_) - , - __global algorithmFPType * gSum2 -#endif -#if (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - , - __global algorithmFPType * gSum2Cent -#endif -#if (defined _RMEAN_) - , - __global algorithmFPType * gMean -#endif -#if (defined _RSORM_) - , - __global algorithmFPType * gSecondOrderRawMoment -#endif -#if (defined _RVARC_) - , - __global algorithmFPType * gVariance -#endif -#if (defined _RSTDEV_) - , - __global algorithmFPType * gStDev -#endif -#if (defined _RVART_) - , - __global 
algorithmFPType * gVariation -#endif -) -{ - const uint tid = get_global_id(0); - -#if (defined _RMEAN_) - algorithmFPType sum = gSum[tid]; -#endif -#if (defined _RSORM_) - algorithmFPType sum2 = gSum2[tid]; -#endif -#if (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - algorithmFPType sum2Cent = gSum2Cent[tid]; -#endif -#if (defined _RMEAN_) || (defined _RVART_) - algorithmFPType mean = sum / nObservations; -#endif -#if (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - algorithmFPType variance = (algorithmFPType)0; -#endif -#if (defined _RSTDEV_) || (defined _RVART_) - algorithmFPType stDev = (algorithmFPType)0; -#endif - -// common vars calculation -#if (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_) - variance = sum2Cent / (nObservations - (algorithmFPType)1); -#endif -#if (defined _RSTDEV_) || (defined _RVART_) - stDev = (algorithmFPType)sqrt(variance); -#endif - -// output assignment -#if (defined _RMEAN_) - gMean[tid] = mean; -#endif -#if (defined _RSORM_) - gSecondOrderRawMoment[tid] = sum2 / nObservations; -#endif -#if (defined _RVARC_) - gVariance[tid] = variance; -#endif -#if (defined _RSTDEV_) - gStDev[tid] = stDev; -#endif -#if (defined _RVART_) - gVariation[tid] = stDev / mean; -#endif -} diff --git a/cpp/daal/src/algorithms/low_order_moments/oneapi/cl_kernels/low_order_moments_kernels_distr.h b/cpp/daal/src/algorithms/low_order_moments/oneapi/cl_kernels/low_order_moments_kernels_distr.h deleted file mode 100644 index 7517b9bdaa2..00000000000 --- a/cpp/daal/src/algorithms/low_order_moments/oneapi/cl_kernels/low_order_moments_kernels_distr.h +++ /dev/null @@ -1,248 +0,0 @@ -/* file: low_order_moments_kernels_distr.h */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of low order moments kernels. -//-- -*/ - -#ifndef __low_order_moments_kernels_distr__ -#define __low_order_moments_kernels_distr__ - -static const char * low_order_moments_kernels_distr_cl = - "\n" - "#define CONCAT(n, suff) n ## suff\n" - "#define FULLNAME(n, p) CONCAT(n, p)\n" - "\n" - "#define mergeDistrBlocks FULLNAME(mergeDistrBlocks, FNAMESUFF)\n" - "#define finalize FULLNAME(finalize, FNAMESUFF)\n" - "\n" - "/* merge distributed blocks kernel */\n" - "__kernel void mergeDistrBlocks(uint nFeatures, uint nBlocks, uint stride\n" - "#if (defined _RMIN_)\n" - " ,\n" - " __global algorithmFPType * gMin\n" - "#endif\n" - "#if (defined _RMAX_)\n" - " ,\n" - " __global algorithmFPType * gMax\n" - "#endif\n" - "#if (defined _RSUM_) || (defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " ,\n" - " __global algorithmFPType * gSum\n" - "#endif\n" - "#if (defined _RSUM2_) || (defined _RSORM_)\n" - " ,\n" - " __global algorithmFPType * gSum2\n" - "#endif\n" - "#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " ,\n" - " __global algorithmFPType * gSum2Cent\n" - "#endif\n" - "#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " ,\n" - " const __global algorithmFPType * bNVec\n" - "#endif\n" - "#if (defined _RMIN_)\n" - " ,\n" - " const __global algorithmFPType * bMin\n" - "#endif\n" - "#if (defined _RMAX_)\n" - " 
,\n" - " const __global algorithmFPType * bMax\n" - "#endif\n" - "#if (defined _RSUM_) || (defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " ,\n" - " const __global algorithmFPType * bSum\n" - "#endif\n" - "#if (defined _RSUM2_) || (defined _RSORM_)\n" - " ,\n" - " const __global algorithmFPType * bSum2\n" - "#endif\n" - "#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " ,\n" - " const __global algorithmFPType * bSum2Cent\n" - "#endif\n" - ")\n" - "{\n" - " const uint itemId = get_global_id(0);\n" - "\n" - "#if (defined _RMIN_)\n" - " algorithmFPType mrgMin = bMin[itemId];\n" - "#endif\n" - "#if (defined _RMAX_)\n" - " algorithmFPType mrgMax = bMax[itemId];\n" - "#endif\n" - "#if (defined _RSUM_) || (defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " algorithmFPType mrgSum = (algorithmFPType)0;\n" - "#endif\n" - "#if (defined _RSUM2_) || (defined _RSORM_)\n" - " algorithmFPType mrgSum2 = (algorithmFPType)0;\n" - "#endif\n" - "#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " algorithmFPType mrgVectors = (algorithmFPType)0;\n" - " algorithmFPType mrgSum2Cent = (algorithmFPType)0;\n" - " algorithmFPType mrgMean = (algorithmFPType)0;\n" - "#endif\n" - "\n" - " for (uint i = 0; i < nBlocks; i++)\n" - " {\n" - " uint offset = i * stride + itemId;\n" - "\n" - "#if (defined _RMIN_)\n" - " algorithmFPType min = bMin[offset];\n" - "#endif\n" - "#if (defined _RMAX_)\n" - " algorithmFPType max = bMax[offset];\n" - "#endif\n" - "#if (defined _RSUM_) || (defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " algorithmFPType sum = bSum[offset];\n" - "#endif\n" - "#if (defined _RSUM2_) || (defined _RSORM_)\n" - " algorithmFPType sum2 = bSum2[offset];\n" - "#endif\n" - "#if (defined _RSUM2C_) || (defined _RVARC_) || 
(defined _RSTDEV_) || (defined _RVART_)\n" - " algorithmFPType nVec = bNVec[i];\n" - " algorithmFPType sum2Cent = bSum2Cent[offset];\n" - " algorithmFPType mean = sum / nVec;\n" - "\n" - " algorithmFPType sumN1N2 = mrgVectors + nVec;\n" - " algorithmFPType mulN1N2 = mrgVectors * nVec;\n" - " algorithmFPType deltaScale = mulN1N2 / sumN1N2;\n" - " algorithmFPType meanScale = (algorithmFPType)1 / sumN1N2;\n" - " algorithmFPType delta = mean - mrgMean;\n" - "#endif\n" - "\n" - "#if (defined _RMIN_)\n" - " mrgMin = fmin(min, mrgMin);\n" - "#endif\n" - "#if (defined _RMAX_)\n" - " mrgMax = fmax(max, mrgMax);\n" - "#endif\n" - "#if (defined _RSUM_) || (defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " mrgSum += sum;\n" - "#endif\n" - "#if (defined _RSUM2_) || (defined _RSORM_)\n" - " mrgSum2 += sum2;\n" - "#endif\n" - "#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " mrgSum2Cent = mrgSum2Cent + sum2Cent + delta * delta * deltaScale;\n" - " mrgMean = (mrgMean * mrgVectors + mean * nVec) * meanScale;\n" - " mrgVectors = sumN1N2;\n" - "#endif\n" - " }\n" - "\n" - "#if (defined _RMIN_)\n" - " gMin[itemId] = mrgMin;\n" - "#endif\n" - "#if (defined _RMAX_)\n" - " gMax[itemId] = mrgMax;\n" - "#endif\n" - "#if (defined _RSUM_) || (defined _RMEAN_) || (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " gSum[itemId] = mrgSum;\n" - "#endif\n" - "#if (defined _RSUM2_) || (defined _RSORM_)\n" - " gSum2[itemId] = mrgSum2;\n" - "#endif\n" - "#if (defined _RSUM2C_) || (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " gSum2Cent[itemId] = mrgSum2Cent;\n" - "#endif\n" - "}\n" - "/* finalize kernel */\n" - "\n" - "__kernel void finalize(const algorithmFPType nObservations\n" - " #if (defined _RMIN_)\n" - " ,__global algorithmFPType* gMin\n" - " #endif\n" - " #if (defined _RMAX_)\n" - " ,__global algorithmFPType* gMax\n" - " #endif\n" 
- " #if (defined _RMEAN_)\n" - " ,__global algorithmFPType* gSum\n" - " #endif\n" - " #if (defined _RSORM_)\n" - " ,__global algorithmFPType* gSum2\n" - " #endif\n" - " #if (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " ,__global algorithmFPType* gSum2Cent\n" - " #endif\n" - " #if (defined _RMEAN_)\n" - " ,__global algorithmFPType* gMean\n" - " #endif\n" - " #if (defined _RSORM_)\n" - " ,__global algorithmFPType* gSecondOrderRawMoment\n" - " #endif\n" - " #if (defined _RVARC_)\n" - " ,__global algorithmFPType* gVariance\n" - " #endif\n" - " #if (defined _RSTDEV_)\n" - " ,__global algorithmFPType* gStDev\n" - " #endif\n" - " #if (defined _RVART_)\n" - " ,__global algorithmFPType* gVariation\n" - " #endif\n" - " )\n" - "{\n" - " const uint tid = get_global_id(0);\n" - "\n" - "#if (defined _RMEAN_)\n" - " algorithmFPType sum = gSum [tid]; \n" - "#endif\n" - "#if (defined _RSORM_)\n" - " algorithmFPType sum2 = gSum2[tid]; \n" - "#endif\n" - "#if (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " algorithmFPType sum2Cent = gSum2Cent[tid]; \n" - "#endif\n" - "#if (defined _RMEAN_) || (defined _RVART_)\n" - " algorithmFPType mean = sum / nObservations;\n" - "#endif\n" - "#if (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " algorithmFPType variance = (algorithmFPType)0; \n" - "#endif\n" - "#if (defined _RSTDEV_) || (defined _RVART_)\n" - " algorithmFPType stDev = (algorithmFPType)0; \n" - "#endif\n" - "\n" - "// common vars calculation\n" - "#if (defined _RVARC_) || (defined _RSTDEV_) || (defined _RVART_)\n" - " variance = sum2Cent / (nObservations - (algorithmFPType)1);\n" - "#endif\n" - "#if (defined _RSTDEV_) || (defined _RVART_)\n" - " stDev = (algorithmFPType)sqrt(variance);\n" - "#endif\n" - "\n" - "// output assignment\n" - "#if (defined _RMEAN_)\n" - " gMean[tid] = mean;\n" - "#endif\n" - "#if (defined _RSORM_)\n" - " gSecondOrderRawMoment[tid] = sum2 / nObservations;\n" - "#endif\n" - "#if (defined 
_RVARC_)\n" - " gVariance[tid] = variance;\n" - "#endif\n" - "#if (defined _RSTDEV_)\n" - " gStDev[tid] = stDev; \n" - "#endif\n" - "#if (defined _RVART_)\n" - " gVariation[tid] = stDev / mean;\n" - "#endif\n" - "}\n" - "\n"; -#endif diff --git a/cpp/daal/src/algorithms/low_order_moments/oneapi/low_order_moments_batch_oneapi_impl.i b/cpp/daal/src/algorithms/low_order_moments/oneapi/low_order_moments_batch_oneapi_impl.i deleted file mode 100644 index b914ad5ac0a..00000000000 --- a/cpp/daal/src/algorithms/low_order_moments/oneapi/low_order_moments_batch_oneapi_impl.i +++ /dev/null @@ -1,415 +0,0 @@ -/* file: low_order_moments_batch_oneapi_impl.i */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Low order moments algorithm implementation in batch mode. 
-//-- -*/ - -#ifndef __LOW_ORDER_MOMENTS_BATCH_ONEAPI_IMPL_I__ -#define __LOW_ORDER_MOMENTS_BATCH_ONEAPI_IMPL_I__ - -#include "services/internal/buffer.h" -#include "data_management/data/numeric_table.h" -#include "services/env_detect.h" -#include "services/error_indexes.h" -#include "src/algorithms/low_order_moments/oneapi/cl_kernels/low_order_moments_kernels_all.h" -#include "src/algorithms/low_order_moments/oneapi/low_order_moments_kernel_batch_oneapi.h" -#include "src/externals/service_profiler.h" -#include "services/internal/execution_context.h" -#include "services/daal_defines.h" - -using namespace daal::services::internal; -using namespace daal::services::internal::sycl; - -namespace daal -{ -namespace algorithms -{ -namespace low_order_moments -{ -namespace oneapi -{ -namespace internal -{ -#define CHECK_AND_RET_IF_FAIL(st, expr) \ - (st) |= (expr); \ - if (!st) \ - { \ - return; \ - } - -template <> -const char * TaskInfoBatch::kSinglePassName = "singlePassMinMax"; - -template <> -const char * TaskInfoBatch::kProcessBlocksName = "processBlocksMinMax"; - -template <> -const char * TaskInfoBatch::kMergeBlocksName = "mergeBlocksMinMax"; - -template <> -const char * TaskInfoBatch::kBldOptFNameSuff = " -DFNAMESUFF=MinMax "; - -template <> -const char * TaskInfoBatch::kBldOptScope = " -D_RMIN_ -D_RMAX_ "; - -template <> -const char * TaskInfoBatch::kCacheKey = "__daal_algorithms_low_order_moments_batch_kernels_minmax"; - -template <> -const char * TaskInfoBatch::kSinglePassName = "singlePassMeanVariance"; - -template <> -const char * TaskInfoBatch::kProcessBlocksName = "processBlocksMeanVariance"; - -template <> -const char * TaskInfoBatch::kMergeBlocksName = "mergeBlocksMeanVariance"; - -template <> -const char * TaskInfoBatch::kBldOptFNameSuff = " -DFNAMESUFF=MeanVariance "; - -template <> -const char * TaskInfoBatch::kBldOptScope = " -D_RMEAN_ -D_RVARC_ "; - -template <> -const char * TaskInfoBatch::kCacheKey = 
"__daal_algorithms_low_order_moments_batch_kernels_mean_variance"; - -template <> -const char * TaskInfoBatch::kSinglePassName = "singlePassAll"; - -template <> -const char * TaskInfoBatch::kProcessBlocksName = "processBlocksAll"; - -template <> -const char * TaskInfoBatch::kMergeBlocksName = "mergeBlocksAll"; - -template <> -const char * TaskInfoBatch::kBldOptFNameSuff = " -DFNAMESUFF=All "; - -template <> -const char * TaskInfoBatch::kBldOptScope = - " -D_RMIN_ -D_RMAX_ -D_RSUM_ -D_RSUM2_ -D_RSUM2C_ -D_RMEAN_ -D_RSORM_ -D_RVARC_ -D_RSTDEV_ -D_RVART_ "; - -template <> -const char * TaskInfoBatch::kCacheKey = "__daal_algorithms_low_order_moments_batch_kernels_all"; - -/* - Kernel methods implementation -*/ -template -services::Status LowOrderMomentsBatchKernelOneAPI::compute(NumericTable * dataTable, Result * result, - const Parameter * parameter) -{ - services::Status status; - - auto & context = daal::services::internal::getDefaultContext(); - - if (method == defaultDense) - { - if (parameter->estimatesToCompute == estimatesMinMax) - { - LowOrderMomentsBatchTaskOneAPI task(context, dataTable, result, status); - DAAL_CHECK_STATUS_VAR(status); - return task.compute(); - } - else if (parameter->estimatesToCompute == estimatesMeanVariance) - { - LowOrderMomentsBatchTaskOneAPI task(context, dataTable, result, status); - DAAL_CHECK_STATUS_VAR(status); - return task.compute(); - } - else - { - /* estimatesAll */ - LowOrderMomentsBatchTaskOneAPI task(context, dataTable, result, status); - DAAL_CHECK_STATUS_VAR(status); - return task.compute(); - } - } - - return services::Status(ErrorMethodNotImplemented); -} - -template -static inline services::Status overflowCheckByMultiplication(const Q & v1, const P & v2) -{ - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(T, v1, v2); - return services::Status(); -} - -template -static inline services::Status buildProgram(ClKernelFactoryIface & factory, const char * buildOptions = nullptr) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(buildProgram); - - 
services::Status status; - auto fptype_name = getKeyFPType(); - auto build_options = fptype_name; - - build_options.add(" -cl-std=CL1.2 -D LOCAL_BUFFER_SIZE=256 "); - build_options.add(TaskInfoBatch::kBldOptFNameSuff); - build_options.add(TaskInfoBatch::kBldOptScope); - - if (buildOptions) - { - build_options.add(buildOptions); - } - - services::String cachekey(TaskInfoBatch::kCacheKey); - cachekey.add(fptype_name); - - factory.build(ExecutionTargetIds::device, cachekey.c_str(), low_order_moments_kernels_all_cl, build_options.c_str(), status); - - return status; -} - -/* - Batch task methods implementations -*/ -template -LowOrderMomentsBatchTaskOneAPI::LowOrderMomentsBatchTaskOneAPI(ExecutionContextIface & context, NumericTable * dataTable, - Result * result, services::Status & status) - : dataTable(dataTable) -{ - if (dataTable->getNumberOfRows() > _uint32max) - { - status |= services::ErrorIncorrectNumberOfRowsInInputNumericTable; - return; - } - if (dataTable->getNumberOfColumns() > _uint32max) - { - status |= services::ErrorIncorrectNumberOfColumnsInInputNumericTable; - return; - } - - nVectors = static_cast(dataTable->getNumberOfRows()); - nFeatures = static_cast(dataTable->getNumberOfColumns()); - - nColsBlocks = (nFeatures + maxWorkItemsPerGroup - 1) / maxWorkItemsPerGroup; - - nRowsBlocks = 128; - if (nVectors < 5000) - nRowsBlocks = 1; - else if (nVectors < 10000) - nRowsBlocks = 8; - else if (nVectors < 20000) - nRowsBlocks = 16; - else if (nVectors < 50000) - nRowsBlocks = 32; - else if (nVectors < 100000) - nRowsBlocks = 64; - - workItemsPerGroup = (maxWorkItemsPerGroup < nFeatures) ? 
maxWorkItemsPerGroup : nFeatures; - - CHECK_AND_RET_IF_FAIL(status, dataTable->getBlockOfRows(0, nVectors, readOnly, dataBD)); - - for (uint32_t i = 0; i < TaskInfoBatch::nResults; i++) - { - resultTable[i] = result->get((ResultId)TaskInfoBatch::resIds[i]); - CHECK_AND_RET_IF_FAIL(status, resultTable[i]->getBlockOfRows(0, 1, writeOnly, resultBD[i])); - } - - status |= overflowCheckByMultiplication(nRowsBlocks, nFeatures); - DAAL_CHECK_STATUS_RETURN_VOID_IF_FAIL(status); - if (TaskInfoBatch::isRowsInBlockInfoRequired) - { - if (nRowsBlocks > 1) - { - bNVec = context.allocate(TypeIds::uint32, nFeatures * nRowsBlocks, status); - DAAL_CHECK_STATUS_RETURN_VOID_IF_FAIL(status); - } - } - - if (nRowsBlocks > 1) - { - for (uint32_t i = 0; i < TaskInfoBatch::nBuffers; i++) - { - bAuxBuffers[i] = context.allocate(TypeIds::id(), nFeatures * nRowsBlocks, status); - DAAL_CHECK_STATUS_RETURN_VOID_IF_FAIL(status); - } - } -} - -template -LowOrderMomentsBatchTaskOneAPI::~LowOrderMomentsBatchTaskOneAPI() -{ - if (dataTable) - { - dataTable->releaseBlockOfRows(dataBD); - } - - for (uint32_t i = 0; i < TaskInfoBatch::nResults; i++) - { - if (resultTable[i]) - { - resultTable[i]->releaseBlockOfRows(resultBD[i]); - } - } -} - -template -services::Status LowOrderMomentsBatchTaskOneAPI::compute() -{ - DAAL_ITTNOTIFY_SCOPED_TASK(LowOrderMomentsBatchTaskOneAPI.compute); - - services::Status status; - - auto & context = daal::services::internal::getDefaultContext(); - auto & factory = context.getClKernelFactory(); - - status = buildProgram(factory); - DAAL_CHECK_STATUS_VAR(status); - - if (nRowsBlocks > 1) - { - /* process rows by blocks first */ - auto kProcessBlocks = factory.getKernel(TaskInfoBatch::kProcessBlocksName, status); - DAAL_CHECK_STATUS_VAR(status); - { - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, nRowsBlocks, nColsBlocks); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, nRowsBlocks * nColsBlocks, workItemsPerGroup); - KernelRange localRange(workItemsPerGroup); - 
KernelRange globalRange(nRowsBlocks * nColsBlocks * workItemsPerGroup); - - KernelNDRange range(1); - range.global(globalRange, status); - DAAL_CHECK_STATUS_VAR(status); - range.local(localRange, status); - DAAL_CHECK_STATUS_VAR(status); - - KernelArguments args( - 3 + TaskInfoBatch::nBuffers + (TaskInfoBatch::isRowsInBlockInfoRequired ? 1 : 0), - status); - DAAL_CHECK_STATUS_VAR(status); - - uint32_t argsI = 0; - DAAL_ASSERT(dataBD.getBuffer().size() == nVectors * nFeatures); - args.set(argsI++, dataBD.getBuffer(), AccessModeIds::read); - args.set(argsI++, nFeatures); - args.set(argsI++, nVectors); - - if (TaskInfoBatch::isRowsInBlockInfoRequired) - { - DAAL_ASSERT_UNIVERSAL_BUFFER(bNVec, uint32_t, nFeatures * nRowsBlocks); - args.set(argsI++, bNVec, AccessModeIds::write); - } - - for (uint32_t i = 0; i < TaskInfoBatch::nBuffers; i++) - { - DAAL_ASSERT_UNIVERSAL_BUFFER(bAuxBuffers[i], algorithmFPType, nFeatures * nRowsBlocks); - args.set(argsI++, bAuxBuffers[i], AccessModeIds::write); - } - - { - DAAL_ITTNOTIFY_SCOPED_TASK(LowOrderMomentsBatchTaskOneAPI.ProcessBlocks); - context.run(range, kProcessBlocks, args, status); - } - DAAL_CHECK_STATUS_VAR(status); - } - - /* merge blocks */ - auto kMergeBlocks = factory.getKernel(TaskInfoBatch::kMergeBlocksName, status); - DAAL_CHECK_STATUS_VAR(status); - { - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, nFeatures, maxWorkItemsPerGroupToMerge); - KernelRange localRange(maxWorkItemsPerGroupToMerge); - KernelRange globalRange(maxWorkItemsPerGroupToMerge * nFeatures); - - KernelNDRange range(1); - range.global(globalRange, status); - DAAL_CHECK_STATUS_VAR(status); - range.local(localRange, status); - DAAL_CHECK_STATUS_VAR(status); - - KernelArguments args(1 + TaskInfoBatch::nResults + TaskInfoBatch::nBuffers - + (TaskInfoBatch::isRowsInBlockInfoRequired ? 
1 : 0), - status); - DAAL_CHECK_STATUS_VAR(status); - - uint32_t argsI = 0; - args.set(argsI++, nRowsBlocks); // num of values to merge - for (uint32_t i = 0; i < TaskInfoBatch::nResults; i++) - { - DAAL_ASSERT(resultBD[i].getBuffer().size() == nFeatures); - args.set(argsI++, resultBD[i].getBuffer(), AccessModeIds::readwrite); - } - - if (TaskInfoBatch::isRowsInBlockInfoRequired) - { - DAAL_ASSERT_UNIVERSAL_BUFFER(bNVec, uint32_t, nFeatures * nRowsBlocks); - args.set(argsI++, bNVec, AccessModeIds::write); - } - - for (uint32_t i = 0; i < TaskInfoBatch::nBuffers; i++) - { - DAAL_ASSERT_UNIVERSAL_BUFFER(bAuxBuffers[i], algorithmFPType, nFeatures * nRowsBlocks); - args.set(argsI++, bAuxBuffers[i], AccessModeIds::write); - } - - { - DAAL_ITTNOTIFY_SCOPED_TASK(LowOrderMomentsBatchTaskOneAPI.MergeBlocks); - context.run(range, kMergeBlocks, args, status); - } - DAAL_CHECK_STATUS_VAR(status); - } - } - else - { - auto kSinglePass = factory.getKernel(TaskInfoBatch::kSinglePassName, status); - DAAL_CHECK_STATUS_VAR(status); - { - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, nColsBlocks, workItemsPerGroup); - KernelRange localRange(workItemsPerGroup); - KernelRange globalRange(nColsBlocks * workItemsPerGroup); - - KernelNDRange range(1); - range.global(globalRange, status); - DAAL_CHECK_STATUS_VAR(status); - range.local(localRange, status); - DAAL_CHECK_STATUS_VAR(status); - - KernelArguments args(3 + TaskInfoBatch::nResults, status); - DAAL_CHECK_STATUS_VAR(status); - - uint32_t argsI = 0; - DAAL_ASSERT(dataBD.getBuffer().size() == nVectors * nFeatures); - args.set(argsI++, dataBD.getBuffer(), AccessModeIds::read); - args.set(argsI++, nFeatures); - args.set(argsI++, nVectors); - for (uint32_t i = 0; i < TaskInfoBatch::nResults; i++) - { - DAAL_ASSERT(resultBD[i].getBuffer().size() == nFeatures); - args.set(argsI++, resultBD[i].getBuffer(), AccessModeIds::readwrite); - } - - context.run(range, kSinglePass, args, status); - DAAL_CHECK_STATUS_VAR(status); - } - } - - return 
status; -} - -} // namespace internal -} // namespace oneapi -} // namespace low_order_moments -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/low_order_moments/oneapi/low_order_moments_distributed_oneapi_impl.i b/cpp/daal/src/algorithms/low_order_moments/oneapi/low_order_moments_distributed_oneapi_impl.i deleted file mode 100644 index a9122903764..00000000000 --- a/cpp/daal/src/algorithms/low_order_moments/oneapi/low_order_moments_distributed_oneapi_impl.i +++ /dev/null @@ -1,479 +0,0 @@ -/* file: low_order_moments_distributed_oneapi_impl.i */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Low order moments algorithm implementation in distributed mode. 
-//-- -*/ - -#ifndef __LOW_ORDER_MOMENTS_DISTRIBUTED_ONEAPI_IMPL_I__ -#define __LOW_ORDER_MOMENTS_DISTRIBUTED_ONEAPI_IMPL_I__ - -#include "services/internal/buffer.h" -#include "data_management/data/numeric_table.h" -#include "services/env_detect.h" -#include "services/error_indexes.h" -#include "src/algorithms/low_order_moments/oneapi/cl_kernels/low_order_moments_kernels_distr.h" -#include "src/algorithms/low_order_moments/oneapi/low_order_moments_kernel_distributed_oneapi.h" -#include "src/externals/service_profiler.h" -#include "services/internal/execution_context.h" -#include "services/daal_defines.h" - -using namespace daal::services::internal; -using namespace daal::services::internal::sycl; - -namespace daal -{ -namespace algorithms -{ -namespace low_order_moments -{ -namespace oneapi -{ -namespace internal -{ -/* task info MinMax parameters definitions */ -template <> -const char * TaskInfoDistributed::kMergeDistrBlocksName = "mergeDistrBlocksMinMax"; - -template <> -const char * TaskInfoDistributed::kBldOptFNameSuff = " -DFNAMESUFF=MinMax "; - -template <> -const char * TaskInfoDistributed::kBldOptScope = " -D_RMIN_ -D_RMAX_ "; - -template <> -const char * TaskInfoDistributed::kCacheKey = "__daal_algorithms_low_order_moments_distributed_kernels_minmax"; - -/* itask info MeanVariance parameters definitions */ -template <> -const char * TaskInfoDistributed::kMergeDistrBlocksName = "mergeDistrBlocksMeanVariance"; - -template <> -const char * TaskInfoDistributed::kFinalizeName = "finalizeMeanVariance"; - -template <> -const char * TaskInfoDistributed::kBldOptFNameSuff = " -DFNAMESUFF=MeanVariance "; - -template <> -const char * TaskInfoDistributed::kBldOptScope = " -D_RMEAN_ -D_RVARC_ "; - -template <> -const char * TaskInfoDistributed::kCacheKey = - "__daal_algorithms_low_order_moments_distributed_kernels_mean_variance"; - -/* All task info estimatesAll parameters definitions */ -template <> -const char * TaskInfoDistributed::kMergeDistrBlocksName = 
"mergeDistrBlocksAll"; - -template <> -const char * TaskInfoDistributed::kFinalizeName = "finalizeAll"; - -template <> -const char * TaskInfoDistributed::kBldOptFNameSuff = " -DFNAMESUFF=All "; - -template <> -const char * TaskInfoDistributed::kBldOptScope = - " -D_RMIN_ -D_RMAX_ -D_RSUM_ -D_RSUM2_ -D_RSUM2C_ -D_RMEAN_ -D_RSORM_ -D_RVARC_ -D_RSTDEV_ -D_RVART_ "; -template <> -const char * TaskInfoDistributed::kCacheKey = "__daal_algorithms_low_order_moments_distributed_kernels_all"; - -/* - Kernel methods implementation -*/ - -template -services::Status LowOrderMomentsDistributedKernelOneAPI::compute(data_management::DataCollection * partialResultsCollection, - PartialResult * partialResult, const Parameter * parameter) -{ - services::Status status; - - auto & context = daal::services::internal::getDefaultContext(); - - if (method == defaultDense) - { - if (parameter->estimatesToCompute == estimatesMinMax) - { - LowOrderMomentsDistributedTaskOneAPI task(context, partialResultsCollection, partialResult, status); - DAAL_CHECK_STATUS_VAR(status); - return task.compute(); - } - else if (parameter->estimatesToCompute == estimatesMeanVariance) - { - LowOrderMomentsDistributedTaskOneAPI task(context, partialResultsCollection, partialResult, - status); - DAAL_CHECK_STATUS_VAR(status); - return task.compute(); - } - else - { - /* estimatesAll */ - LowOrderMomentsDistributedTaskOneAPI task(context, partialResultsCollection, partialResult, status); - DAAL_CHECK_STATUS_VAR(status); - return task.compute(); - } - } - - return services::Status(ErrorMethodNotImplemented); -} - -template -services::Status LowOrderMomentsDistributedKernelOneAPI::finalizeCompute(PartialResult * partialResult, Result * result, - const Parameter * parameter) -{ - services::Status status; - - auto & context = daal::services::internal::getDefaultContext(); - if (method == defaultDense) - { - /*nothing is done in case of finalizing results which are already available in partialResult (i.e. 
min max ...), - due to they should be already assigned into result from partial results by level up caller - */ - if (parameter->estimatesToCompute == estimatesMeanVariance) - { - LowOrderMomentsDistributedFinalizeTaskOneAPI task(context, partialResult, result, status); - DAAL_CHECK_STATUS_VAR(status); - return task.compute(); - } - else if (parameter->estimatesToCompute == estimatesAll) - { - /* estimatesAll */ - LowOrderMomentsDistributedFinalizeTaskOneAPI task(context, partialResult, result, status); - DAAL_CHECK_STATUS_VAR(status); - return task.compute(); - } - } - - return status; -} - -template -static inline services::Status overflowCheckByMultiplication(const Q & v1, const P & v2) -{ - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(T, v1, v2); - return services::Status(); -} - -template -static inline services::Status buildProgram(ClKernelFactoryIface & factory, const char * buildOptions = nullptr) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(buildProgram); - - services::Status status; - auto fptype_name = getKeyFPType(); - auto build_options = fptype_name; - - build_options.add(" -cl-std=CL1.2 -D LOCAL_BUFFER_SIZE=256 "); - build_options.add(TaskInfoDistributed::kBldOptFNameSuff); - build_options.add(TaskInfoDistributed::kBldOptScope); - - if (buildOptions) - { - build_options.add(buildOptions); - } - - services::String cachekey(TaskInfoDistributed::kCacheKey); - cachekey.add(fptype_name); - - factory.build(ExecutionTargetIds::device, cachekey.c_str(), low_order_moments_kernels_distr_cl, build_options.c_str(), status); - DAAL_CHECK_STATUS_VAR(status); - - return status; -} - -/* - Distributed task methods implementations -*/ -template -LowOrderMomentsDistributedTaskOneAPI::LowOrderMomentsDistributedTaskOneAPI( - ExecutionContextIface & context, data_management::DataCollection * partialResultsCollection, PartialResult * partialResult, - services::Status & status) - : partResultsCollection(partialResultsCollection) -{ - auto pRTable = 
partialResult->get((PartialResultId)TaskInfoDistributed::resPartialIds[0]); - status |= pRTable ? services::Status() : services::Status(ErrorNullPartialResult); - DAAL_CHECK_STATUS_RETURN_VOID_IF_FAIL(status); - - nFeatures = pRTable->getNumberOfColumns(); - - if (partialResultsCollection->size() > _uint32max) - { - status |= services::ErrorIncorrectNumberOfElementsInInputCollection; - return; - } - nDistrBlocks = partialResultsCollection->size(); - bNVec = context.allocate(TypeIds::id(), nDistrBlocks, status); - DAAL_CHECK_STATUS_RETURN_VOID_IF_FAIL(status); - - status |= overflowCheckByMultiplication(nFeatures, sizeof(algorithmFPType)); - DAAL_CHECK_STATUS_RETURN_VOID_IF_FAIL(status); - - size_t nElemsInStrideST = (((nFeatures * sizeof(algorithmFPType)) + _blockAlignment - 1) & ~(_blockAlignment - 1)) / sizeof(algorithmFPType); - if (nElemsInStrideST > _uint32max) - { - status |= services::ErrorIncorrectNumberOfColumnsInInputNumericTable; - return; - } - - nElemsInStride = static_cast(nElemsInStrideST); - - status |= overflowCheckByMultiplication(nDistrBlocks, nElemsInStride); - DAAL_CHECK_STATUS_RETURN_VOID_IF_FAIL(status); - - for (uint32_t i = 0; i < TaskInfoDistributed::nBuffers; i++) - { - bAuxHostBuffers[i].reset(nDistrBlocks * nElemsInStride); - status |= bAuxHostBuffers[i].get() ? services::Status() : services::Status(services::ErrorMemoryAllocationFailed); - DAAL_CHECK_STATUS_RETURN_VOID_IF_FAIL(status); - - bAuxBuffers[i] = context.allocate(TypeIds::id(), nDistrBlocks * nElemsInStride, status); - DAAL_CHECK_STATUS_RETURN_VOID_IF_FAIL(status); - } - - nObservationsTable = partialResult->get((PartialResultId)nObservations); - status |= nObservationsTable ? 
nObservationsTable->getBlockOfRows(0, 1, readWrite, nObservationsBD) : services::Status(ErrorNullPartialResult); - DAAL_CHECK_STATUS_RETURN_VOID_IF_FAIL(status); - pNObservations = nObservationsBD.getBlockPtr(); - - for (uint32_t i = 0; i < TaskInfoDistributed::nPartialResults; i++) - { - resultTable[i] = partialResult->get((PartialResultId)TaskInfoDistributed::resPartialIds[i]); - status |= resultTable[i] ? resultTable[i]->getBlockOfRows(0, 1, readWrite, resultBD[i]) : services::Status(ErrorNullPartialResult); - DAAL_CHECK_STATUS_RETURN_VOID_IF_FAIL(status); - } -} - -template -LowOrderMomentsDistributedTaskOneAPI::~LowOrderMomentsDistributedTaskOneAPI() -{ - if (nObservationsTable) - { - nObservationsTable->releaseBlockOfRows(nObservationsBD); - } - - for (uint32_t i = 0; i < TaskInfoOnline::nPartialResults; i++) - { - if (resultTable[i]) - { - resultTable[i]->releaseBlockOfRows(resultBD[i]); - } - } -} - -template -services::Status LowOrderMomentsDistributedTaskOneAPI::compute() -{ - DAAL_ITTNOTIFY_SCOPED_TASK(LowOrderMomentsDistributedTaskOneAPI.compute); - - DAAL_ASSERT(partResultsCollection); - - services::Status status; - - auto & context = daal::services::internal::getDefaultContext(); - auto & factory = context.getClKernelFactory(); - - status = buildProgram(factory); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_CHECK_MALLOC(pNObservations); - *pNObservations = (algorithmFPType)0; - - DAAL_ASSERT_UNIVERSAL_BUFFER(bNVec, algorithmFPType, nDistrBlocks); - - { - auto bNVecHost = bNVec.template get().toHost(ReadWriteMode::writeOnly, status); - DAAL_CHECK_STATUS_VAR(status); - - for (uint32_t distrBlockId = 0; distrBlockId < nDistrBlocks; distrBlockId++) - { - PartialResult * inputPartialResult = static_cast((*partResultsCollection)[distrBlockId].get()); - BlockDescriptor blockDesc; - - NumericTablePtr tablePtr = inputPartialResult->get((PartialResultId)nObservations); - DAAL_CHECK_STATUS_VAR(tablePtr ? 
tablePtr->getBlockOfRows(0, 1, readOnly, blockDesc) : services::Status(ErrorNullPartialResult)); - algorithmFPType * pBlockObsCount = blockDesc.getBlockPtr(); - DAAL_CHECK_MALLOC(pBlockObsCount); - - bNVecHost.get()[distrBlockId] = *pBlockObsCount; - - *pNObservations += *pBlockObsCount; - tablePtr->releaseBlockOfRows(blockDesc); - } - } - - const size_t rowSize = nFeatures * sizeof(algorithmFPType); - - for (uint32_t i = 0; i < TaskInfoDistributed::nPartialResults; i++) - { - // copy partial results from each block into one common buffer - DAAL_ASSERT(bAuxHostBuffers[i].size() == nDistrBlocks * nElemsInStride); - for (uint32_t distrBlockId = 0; distrBlockId < nDistrBlocks; distrBlockId++) - { - PartialResult * inputPartialResult = static_cast((*partResultsCollection)[distrBlockId].get()); - BlockDescriptor blockDesc; - NumericTablePtr tablePtr = inputPartialResult->get((PartialResultId)TaskInfoDistributed::resPartialIds[i]); - DAAL_CHECK_STATUS_VAR(tablePtr ? tablePtr->getBlockOfRows(0, 1, readOnly, blockDesc) : services::Status(ErrorNullPartialResult)); - DAAL_ASSERT(blockDesc.getBuffer().size() == nFeatures); - int result = daal::services::internal::daal_memcpy_s(bAuxHostBuffers[i].get() + distrBlockId * nElemsInStride, rowSize, - blockDesc.getBlockPtr(), rowSize); - DAAL_CHECK(!result, services::ErrorMemoryCopyFailedInternal); - tablePtr->releaseBlockOfRows(blockDesc); - } - DAAL_ASSERT_UNIVERSAL_BUFFER(bAuxBuffers[i], algorithmFPType, nDistrBlocks * nElemsInStride); - context.copy(bAuxBuffers[i], 0, (void *)bAuxHostBuffers[i].get(), nDistrBlocks * nElemsInStride, 0, nDistrBlocks * nElemsInStride, status); - DAAL_CHECK_STATUS_VAR(status); - } - - /* merge blocks */ - auto kMergeDistrBlocks = factory.getKernel(TaskInfoDistributed::kMergeDistrBlocksName, status); - DAAL_CHECK_STATUS_VAR(status); - { - KernelRange range(nFeatures); - - KernelArguments args(3 + TaskInfoDistributed::nPartialResults + TaskInfoDistributed::nBuffers - + 1 /*rows in block info*/, - 
status); - DAAL_CHECK_STATUS_VAR(status); - - uint32_t argsI = 0; - args.set(argsI++, nFeatures); - args.set(argsI++, nDistrBlocks); // num of values to merge - args.set(argsI++, nElemsInStride); // stride between feature values from different blocks - for (uint32_t i = 0; i < TaskInfoDistributed::nPartialResults; i++) - { - DAAL_ASSERT(resultBD[i].getBuffer().size() == nFeatures); - args.set(argsI++, resultBD[i].getBuffer(), AccessModeIds::readwrite); - } - - DAAL_ASSERT_UNIVERSAL_BUFFER(bNVec, algorithmFPType, nDistrBlocks); - args.set(argsI++, bNVec, AccessModeIds::read); - - for (uint32_t i = 0; i < TaskInfoDistributed::nBuffers; i++) - { - DAAL_ASSERT_UNIVERSAL_BUFFER(bAuxBuffers[i], algorithmFPType, nFeatures * nDistrBlocks); - args.set(argsI++, bAuxBuffers[i], AccessModeIds::read); - } - - { - DAAL_ITTNOTIFY_SCOPED_TASK(LowOrderMomentsDistributedTaskOneAPI.MergeDistrBlocks); - context.run(range, kMergeDistrBlocks, args, status); - } - DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} -/* - finalize task methods implementations -*/ -template -LowOrderMomentsDistributedFinalizeTaskOneAPI::LowOrderMomentsDistributedFinalizeTaskOneAPI(ExecutionContextIface & context, - PartialResult * partialResult, - Result * result, - services::Status & status) -{ - uint32_t resIdx = 0; - for (uint32_t i = 0; i < TaskInfoDistributed::nPartialResults; i++) - { - resultTable[resIdx] = partialResult->get((PartialResultId)TaskInfoDistributed::resPartialIds[i]); - status |= - resultTable[resIdx] ? resultTable[resIdx]->getBlockOfRows(0, 1, readWrite, resultBD[resIdx]) : services::Status(ErrorNullPartialResult); - DAAL_CHECK_STATUS_RETURN_VOID_IF_FAIL(status); - resIdx++; - } - - for (uint32_t i = 0; i < TaskInfoDistributed::nFinalizeResults; i++) - { - resultTable[resIdx] = result->get((ResultId)TaskInfoDistributed::resFinalizeIds[i]); - status |= - resultTable[resIdx] ? 
resultTable[resIdx]->getBlockOfRows(0, 1, readWrite, resultBD[resIdx]) : services::Status(ErrorNullPartialResult); - DAAL_CHECK_STATUS_RETURN_VOID_IF_FAIL(status); - resIdx++; - } - - nFeatures = resultTable[0]->getNumberOfColumns(); - - nObservationsTable = partialResult->get((PartialResultId)nObservations); - status |= nObservationsTable ? nObservationsTable->getBlockOfRows(0, 1, readWrite, nObservationsBD) : services::Status(ErrorNullPartialResult); - DAAL_CHECK_STATUS_RETURN_VOID_IF_FAIL(status); - pNObservations = nObservationsBD.getBlockPtr(); -} - -template -LowOrderMomentsDistributedFinalizeTaskOneAPI::~LowOrderMomentsDistributedFinalizeTaskOneAPI() -{ - if (nObservationsTable) - { - nObservationsTable->releaseBlockOfRows(nObservationsBD); - } - - for (uint32_t i = 0; i < nTotalResults; i++) - { - if (resultTable[i]) - { - resultTable[i]->releaseBlockOfRows(resultBD[i]); - } - } -} - -template -services::Status LowOrderMomentsDistributedFinalizeTaskOneAPI::compute() -{ - DAAL_ITTNOTIFY_SCOPED_TASK(LowOrderMomentsDistributedTaskOneAPI.finalize); - - services::Status status; - - auto & context = daal::services::internal::getDefaultContext(); - auto & factory = context.getClKernelFactory(); - - status = buildProgram(factory); - DAAL_CHECK_STATUS_VAR(status); - - auto kFinalize = factory.getKernel(TaskInfoDistributed::kFinalizeName, status); - DAAL_CHECK_STATUS_VAR(status); - { - KernelRange range(nFeatures); - - KernelArguments args(1 + nTotalResults, status); - DAAL_CHECK_STATUS_VAR(status); - - uint32_t argsI = 0; - DAAL_CHECK_MALLOC(pNObservations); - args.set(argsI++, *pNObservations); - - for (uint32_t i = 0; i < nTotalResults; i++) - { - DAAL_ASSERT(resultBD[i].getBuffer().size() == nFeatures); - args.set(argsI++, resultBD[i].getBuffer(), - (i < TaskInfoDistributed::nPartialResults ? 
AccessModeIds::read : AccessModeIds::write)); - } - - context.run(range, kFinalize, args, status); - DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -} // namespace internal -} // namespace oneapi -} // namespace low_order_moments -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/low_order_moments/oneapi/low_order_moments_kernel_batch_oneapi.h b/cpp/daal/src/algorithms/low_order_moments/oneapi/low_order_moments_kernel_batch_oneapi.h deleted file mode 100644 index 10e96341666..00000000000 --- a/cpp/daal/src/algorithms/low_order_moments/oneapi/low_order_moments_kernel_batch_oneapi.h +++ /dev/null @@ -1,153 +0,0 @@ -/* file: low_order_moments_kernel_batch_oneapi.h */ -/******************************************************************************* -* Copyright 2019 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Declaration of template function that calculate low order moments. 
-//-- -*/ - -#ifndef __LOW_ORDER_MOMENTS_KERNEL_BATCH_ONEAPI_H__ -#define __LOW_ORDER_MOMENTS_KERNEL_BATCH_ONEAPI_H__ - -#include "src/algorithms/kernel.h" -#include "data_management/data/numeric_table.h" -#include "algorithms/algorithm_base_common.h" -#include "algorithms/moments/low_order_moments_types.h" -#include "src/services/service_data_utils.h" - -namespace daal -{ -namespace algorithms -{ -namespace low_order_moments -{ -namespace oneapi -{ -namespace internal -{ -template -struct TaskInfoBatch; - -template -struct TaskInfoBatch -{ - constexpr static uint32_t nResults = 2; - constexpr static uint32_t nBuffers = 2; - constexpr static bool isRowsInBlockInfoRequired = false; - // names of used kernels - static const char * kSinglePassName; - static const char * kProcessBlocksName; - static const char * kMergeBlocksName; - // kernels build options - static const char * kBldOptFNameSuff; - static const char * kBldOptScope; - static const char * kCacheKey; - - int resIds[nResults]; // required set of results' ids - TaskInfoBatch() : resIds { minimum, maximum } {} -}; - -template -struct TaskInfoBatch -{ - constexpr static uint32_t nResults = 2; - constexpr static uint32_t nBuffers = 2; - constexpr static bool isRowsInBlockInfoRequired = true; - // names of used kernels - static const char * kSinglePassName; - static const char * kProcessBlocksName; - static const char * kMergeBlocksName; - // kernels build options - static const char * kBldOptFNameSuff; - static const char * kBldOptScope; - static const char * kCacheKey; - - int resIds[nResults]; // required set of results' ids - TaskInfoBatch() : resIds { mean, variance } {} -}; - -template -struct TaskInfoBatch -{ - constexpr static uint32_t nResults = lastResultId + 1; - constexpr static uint32_t nBuffers = 5; - constexpr static bool isRowsInBlockInfoRequired = true; - // names of used kernels - static const char * kSinglePassName; - static const char * kProcessBlocksName; - static const char * 
kMergeBlocksName; - // kernels build options - static const char * kBldOptFNameSuff; - static const char * kBldOptScope; - static const char * kCacheKey; - - int resIds[nResults]; // required set of results' ids - TaskInfoBatch() - : resIds { minimum, maximum, sum, sumSquares, sumSquaresCentered, mean, secondOrderRawMoment, variance, standardDeviation, variation } - {} -}; - -template -class LowOrderMomentsBatchKernelOneAPI : public daal::algorithms::Kernel -{ -public: - services::Status compute(data_management::NumericTable * dataTable, Result * result, const Parameter * parameter); -}; - -template -class LowOrderMomentsBatchTaskOneAPI : public TaskInfoBatch -{ -public: - LowOrderMomentsBatchTaskOneAPI(services::internal::sycl::ExecutionContextIface & context, data_management::NumericTable * dataTable, - Result * result, services::Status & status); - LowOrderMomentsBatchTaskOneAPI(const LowOrderMomentsBatchTaskOneAPI &) = delete; - LowOrderMomentsBatchTaskOneAPI & operator=(const LowOrderMomentsBatchTaskOneAPI &) = delete; - virtual ~LowOrderMomentsBatchTaskOneAPI(); - services::Status compute(); - -private: - static constexpr size_t _uint32max = static_cast(services::internal::MaxVal::get()); - - uint32_t nVectors; - uint32_t nFeatures; - - const uint32_t maxWorkItemsPerGroup = 256; - const uint32_t maxWorkItemsPerGroupToMerge = 16; - - uint32_t nRowsBlocks; - uint32_t nColsBlocks; - uint32_t workItemsPerGroup; - - data_management::NumericTable * dataTable; - data_management::BlockDescriptor dataBD; - - services::internal::sycl::UniversalBuffer bNVec; // contains info about num of vectors in block - - data_management::NumericTablePtr resultTable[TaskInfoBatch::nResults]; - services::internal::sycl::UniversalBuffer bAuxBuffers[TaskInfoBatch::nBuffers]; - - data_management::BlockDescriptor resultBD[TaskInfoBatch::nResults]; -}; - -} // namespace internal -} // namespace oneapi -} // namespace low_order_moments -} // namespace algorithms -} // namespace daal - 
-#endif diff --git a/cpp/daal/src/algorithms/low_order_moments/oneapi/low_order_moments_kernel_distributed_oneapi.h b/cpp/daal/src/algorithms/low_order_moments/oneapi/low_order_moments_kernel_distributed_oneapi.h deleted file mode 100644 index d1237bcfdaf..00000000000 --- a/cpp/daal/src/algorithms/low_order_moments/oneapi/low_order_moments_kernel_distributed_oneapi.h +++ /dev/null @@ -1,191 +0,0 @@ -/* file: low_order_moments_kernel_distributed_oneapi.h */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Declaration of template function that calculate low order moments. 
-//-- -*/ - -#ifndef __LOW_ORDER_MOMENTS_KERNEL_DISTRIBUTED_ONEAPI_H__ -#define __LOW_ORDER_MOMENTS_KERNEL_DISTRIBUTED_ONEAPI_H__ - -#include "data_management/data/numeric_table.h" -#include "algorithms/algorithm_base_common.h" -#include "algorithms/moments/low_order_moments_types.h" -#include "src/services/service_data_utils.h" -#include "src/services/service_arrays.h" - -namespace daal -{ -namespace algorithms -{ -namespace low_order_moments -{ -namespace oneapi -{ -namespace internal -{ -template -struct TaskInfoDistributed; - -template -struct TaskInfoDistributed -{ - constexpr static uint32_t nResults = 2; - constexpr static uint32_t nBuffers = 2; - constexpr static uint32_t nPartialResults = 2; - // names of used kernels - static const char * kMergeDistrBlocksName; - // kernels build options - static const char * kBldOptFNameSuff; - static const char * kBldOptScope; - static const char * kCacheKey; - - int resPartialIds[nPartialResults]; // required set of partial results' ids - int resIds[nResults]; // required set of results' ids - - TaskInfoDistributed() : resPartialIds { partialMinimum, partialMaximum }, resIds { minimum, maximum } {} -}; - -template -struct TaskInfoDistributed -{ - constexpr static uint32_t nResults = 2; - constexpr static uint32_t nBuffers = 2; - constexpr static uint32_t nPartialResults = 2; - constexpr static uint32_t nFinalizeResults = 2; - // names of used kernels - static const char * kMergeDistrBlocksName; - static const char * kFinalizeName; - // kernels build options - static const char * kBldOptFNameSuff; - static const char * kBldOptScope; - static const char * kCacheKey; - - int resIds[nResults]; // required set of results' ids - int resPartialIds[nPartialResults]; // required set of partial results' ids - int resFinalizeIds[nFinalizeResults]; // set of results' ids which will be processed on finalize stage - - TaskInfoDistributed() : resPartialIds { partialSum, partialSumSquaresCentered }, resIds { mean, variance }, 
resFinalizeIds { mean, variance } {} -}; - -template -struct TaskInfoDistributed -{ - constexpr static uint32_t nResults = lastResultId + 1; - constexpr static uint32_t nBuffers = 5; - constexpr static uint32_t nPartialResults = lastPartialResultId; // removed '+1' due to nObservations is mapped separately - constexpr static uint32_t nFinalizeResults = 5; - // names of used kernels - static const char * kMergeDistrBlocksName; - static const char * kFinalizeName; - // kernels build options - static const char * kBldOptFNameSuff; - static const char * kBldOptScope; - static const char * kCacheKey; - - int resIds[nResults]; // required set of results' ids - int resPartialIds[nPartialResults]; // required set of partial results' ids - int resFinalizeIds[nFinalizeResults]; // set of results' ids which will be processed on finalize stage - - TaskInfoDistributed() - : resPartialIds { partialMinimum, partialMaximum, partialSum, partialSumSquares, partialSumSquaresCentered }, - resIds { minimum, maximum, sum, sumSquares, sumSquaresCentered, mean, secondOrderRawMoment, variance, standardDeviation, variation }, - resFinalizeIds { mean, secondOrderRawMoment, variance, standardDeviation, variation } - {} -}; - -/* distributed kernel class */ -template -class LowOrderMomentsDistributedKernelOneAPI : public daal::algorithms::Kernel -{ -public: - services::Status compute(data_management::DataCollection * partialResultsCollection, PartialResult * partialResult, const Parameter * parameter); - services::Status finalizeCompute(PartialResult * partialResult, Result * result, const Parameter * parameter); -}; - -/* distributed task class */ -template -class LowOrderMomentsDistributedTaskOneAPI : public TaskInfoDistributed -{ -public: - LowOrderMomentsDistributedTaskOneAPI(services::internal::sycl::ExecutionContextIface & context, - data_management::DataCollection * partialResultsCollection, PartialResult * partialResult, - services::Status & status); - 
LowOrderMomentsDistributedTaskOneAPI(const LowOrderMomentsDistributedTaskOneAPI &) = delete; - LowOrderMomentsDistributedTaskOneAPI & operator=(const LowOrderMomentsDistributedTaskOneAPI &) = delete; - virtual ~LowOrderMomentsDistributedTaskOneAPI(); - Status compute(); - -private: - static constexpr size_t _uint32max = static_cast(services::internal::MaxVal::get()); - - static constexpr uint32_t _blockAlignment = 64; // alignment (in bytes) for distibuted data blocks - - uint32_t nDistrBlocks; - uint32_t nFeatures; - uint32_t nElemsInStride; // num of elems between feature values from different distributed blocks - - data_management::DataCollection * partResultsCollection; - - NumericTablePtr nObservationsTable; - BlockDescriptor nObservationsBD; - algorithmFPType * pNObservations; - - services::internal::sycl::UniversalBuffer bNVec; // contains info about num of vectors in distributed block - - NumericTablePtr resultTable[TaskInfoDistributed::nPartialResults]; - services::internal::sycl::UniversalBuffer bAuxBuffers[TaskInfoDistributed::nBuffers]; - daal::services::internal::TArray bAuxHostBuffers[TaskInfoDistributed::nBuffers]; - - BlockDescriptor resultBD[TaskInfoDistributed::nPartialResults]; -}; - -/* finalize task class */ -template -class LowOrderMomentsDistributedFinalizeTaskOneAPI : public TaskInfoDistributed -{ -public: - LowOrderMomentsDistributedFinalizeTaskOneAPI(services::internal::sycl::ExecutionContextIface & context, PartialResult * partialResult, - Result * result, services::Status & status); - LowOrderMomentsDistributedFinalizeTaskOneAPI(const LowOrderMomentsDistributedFinalizeTaskOneAPI &) = delete; - LowOrderMomentsDistributedFinalizeTaskOneAPI & operator=(const LowOrderMomentsDistributedFinalizeTaskOneAPI &) = delete; - virtual ~LowOrderMomentsDistributedFinalizeTaskOneAPI(); - Status compute(); - -private: - uint32_t nFeatures; - constexpr static uint32_t nTotalResults = - TaskInfoDistributed::nPartialResults + 
TaskInfoDistributed::nFinalizeResults; - - NumericTablePtr nObservationsTable; - BlockDescriptor nObservationsBD; - algorithmFPType * pNObservations; - - NumericTablePtr resultTable[nTotalResults]; - - BlockDescriptor resultBD[nTotalResults]; -}; - -} // namespace internal -} // namespace oneapi -} // namespace low_order_moments -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/low_order_moments/oneapi/low_order_moments_kernel_online_oneapi.h b/cpp/daal/src/algorithms/low_order_moments/oneapi/low_order_moments_kernel_online_oneapi.h deleted file mode 100644 index 1d64b7e625f..00000000000 --- a/cpp/daal/src/algorithms/low_order_moments/oneapi/low_order_moments_kernel_online_oneapi.h +++ /dev/null @@ -1,202 +0,0 @@ -/* file: low_order_moments_kernel_online_oneapi.h */ -/******************************************************************************* -* Copyright 2019 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Declaration of template function that calculate low order moments. 
-//-- -*/ - -#ifndef __LOW_ORDER_MOMENTS_KERNEL_ONLINE_ONEAPI_H__ -#define __LOW_ORDER_MOMENTS_KERNEL_ONLINE_ONEAPI_H__ - -#include "data_management/data/numeric_table.h" -#include "algorithms/algorithm_base_common.h" -#include "algorithms/moments/low_order_moments_types.h" -#include "src/services/service_data_utils.h" - -namespace daal -{ -namespace algorithms -{ -namespace low_order_moments -{ -namespace oneapi -{ -namespace internal -{ -template -struct TaskInfoOnline; - -template -struct TaskInfoOnline -{ - constexpr static uint32_t nResults = 2; - constexpr static uint32_t nBuffers = 2; - constexpr static bool isRowsInBlockInfoRequired = false; - constexpr static uint32_t nPartialResults = 2; - // names of used kernels - static const char * kSinglePassName; - static const char * kProcessBlocksName; - static const char * kMergeBlocksName; - // kernels build options - static const char * kBldOptFNameSuff; - static const char * kBldOptScope; - static const char * kCacheKey; - - int resPartialIds[nPartialResults]; // required set of partial results' ids - int resIds[nResults]; // required set of results' ids - - TaskInfoOnline() : resPartialIds { partialMinimum, partialMaximum }, resIds { minimum, maximum } {} -}; - -template -struct TaskInfoOnline -{ - constexpr static uint32_t nResults = 2; - constexpr static uint32_t nBuffers = 2; - constexpr static bool isRowsInBlockInfoRequired = true; - constexpr static uint32_t nPartialResults = 2; - constexpr static uint32_t nFinalizeResults = 2; - // names of used kernels - static const char * kSinglePassName; - static const char * kProcessBlocksName; - static const char * kMergeBlocksName; - static const char * kFinalizeName; - // kernels build options - static const char * kBldOptFNameSuff; - static const char * kBldOptScope; - static const char * kCacheKey; - - int resIds[nResults]; // required set of results' ids - int resPartialIds[nPartialResults]; // required set of partial results' ids - int 
resFinalizeIds[nFinalizeResults]; // set of results' ids which will be processed on finalize stage - - TaskInfoOnline() : resPartialIds { partialSum, partialSumSquaresCentered }, resIds { mean, variance }, resFinalizeIds { mean, variance } {} -}; - -template -struct TaskInfoOnline -{ - constexpr static uint32_t nResults = lastResultId + 1; - constexpr static uint32_t nBuffers = 5; - constexpr static bool isRowsInBlockInfoRequired = true; - constexpr static uint32_t nPartialResults = lastPartialResultId; // removed '+1' due to nObservations is mapped separately - constexpr static uint32_t nFinalizeResults = 5; - // names of used kernels - static const char * kSinglePassName; - static const char * kProcessBlocksName; - static const char * kMergeBlocksName; - static const char * kFinalizeName; - // kernels build options - static const char * kBldOptFNameSuff; - static const char * kBldOptScope; - static const char * kCacheKey; - - int resIds[nResults]; // required set of results' ids - int resPartialIds[nPartialResults]; // required set of partial results' ids - int resFinalizeIds[nFinalizeResults]; // set of results' ids which will be processed on finalize stage - - TaskInfoOnline() - : resPartialIds { partialMinimum, partialMaximum, partialSum, partialSumSquares, partialSumSquaresCentered }, - resIds { minimum, maximum, sum, sumSquares, sumSquaresCentered, mean, secondOrderRawMoment, variance, standardDeviation, variation }, - resFinalizeIds { mean, secondOrderRawMoment, variance, standardDeviation, variation } - {} -}; - -/* online kernel class */ -template -class LowOrderMomentsOnlineKernelOneAPI : public daal::algorithms::Kernel -{ -public: - services::Status compute(NumericTable * dataTable, PartialResult * partialResult, const Parameter * parameter, bool isOnline); - services::Status finalizeCompute(PartialResult * partialResult, Result * result, const Parameter * parameter); -}; - -/* online task class */ -template -class LowOrderMomentsOnlineTaskOneAPI : 
public TaskInfoOnline -{ -public: - LowOrderMomentsOnlineTaskOneAPI(services::internal::sycl::ExecutionContextIface & context, NumericTable * dataTable, - PartialResult * partialResult, services::Status & status); - LowOrderMomentsOnlineTaskOneAPI(const LowOrderMomentsOnlineTaskOneAPI &) = delete; - LowOrderMomentsOnlineTaskOneAPI & operator=(const LowOrderMomentsOnlineTaskOneAPI &) = delete; - virtual ~LowOrderMomentsOnlineTaskOneAPI(); - Status compute(); - -private: - static constexpr size_t _uint32max = static_cast(services::internal::MaxVal::get()); - - uint32_t nVectors; - uint32_t nFeatures; - - const uint32_t maxWorkItemsPerGroup = 256; - const uint32_t maxWorkItemsPerGroupToMerge = 16; - - uint32_t nRowsBlocks; - uint32_t nColsBlocks; - uint32_t workItemsPerGroup; - - NumericTable * dataTable; - BlockDescriptor dataBD; - - NumericTablePtr nObservationsTable; - BlockDescriptor nObservationsBD; - algorithmFPType * pNObservations; - - services::internal::sycl::UniversalBuffer bNVec; // contains info about num of vectors in block - - NumericTablePtr resultTable[TaskInfoOnline::nPartialResults]; - services::internal::sycl::UniversalBuffer bAuxBuffers[TaskInfoOnline::nBuffers]; - - BlockDescriptor resultBD[TaskInfoOnline::nPartialResults]; -}; - -/* finalize task class */ -template -class LowOrderMomentsOnlineFinalizeTaskOneAPI : public TaskInfoOnline -{ -public: - LowOrderMomentsOnlineFinalizeTaskOneAPI(services::internal::sycl::ExecutionContextIface & context, PartialResult * partialResult, Result * result, - services::Status & status); - LowOrderMomentsOnlineFinalizeTaskOneAPI(const LowOrderMomentsOnlineFinalizeTaskOneAPI &) = delete; - LowOrderMomentsOnlineFinalizeTaskOneAPI & operator=(const LowOrderMomentsOnlineFinalizeTaskOneAPI &) = delete; - virtual ~LowOrderMomentsOnlineFinalizeTaskOneAPI(); - Status compute(); - -private: - uint32_t nFeatures; - constexpr static uint32_t nTotalResults = - TaskInfoOnline::nPartialResults + 
TaskInfoOnline::nFinalizeResults; - - NumericTablePtr nObservationsTable; - BlockDescriptor nObservationsBD; - algorithmFPType * pNObservations; - - NumericTablePtr resultTable[nTotalResults]; - - BlockDescriptor resultBD[nTotalResults]; -}; - -} // namespace internal -} // namespace oneapi -} // namespace low_order_moments -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/low_order_moments/oneapi/low_order_moments_online_oneapi_impl.i b/cpp/daal/src/algorithms/low_order_moments/oneapi/low_order_moments_online_oneapi_impl.i deleted file mode 100644 index b418783dced..00000000000 --- a/cpp/daal/src/algorithms/low_order_moments/oneapi/low_order_moments_online_oneapi_impl.i +++ /dev/null @@ -1,561 +0,0 @@ -/* file: low_order_moments_online_oneapi_impl.i */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Low order moments algorithm implementation in online mode. 
-//-- -*/ - -#ifndef __LOW_ORDER_MOMENTS_ONLINE_ONEAPI_IMPL_I__ -#define __LOW_ORDER_MOMENTS_ONLINE_ONEAPI_IMPL_I__ - -#include "services/internal/buffer.h" -#include "data_management/data/numeric_table.h" -#include "services/env_detect.h" -#include "services/error_indexes.h" -#include "src/algorithms/low_order_moments/oneapi/cl_kernels/low_order_moments_kernels_all.h" -#include "src/algorithms/low_order_moments/oneapi/low_order_moments_kernel_online_oneapi.h" -#include "src/externals/service_profiler.h" -#include "services/internal/execution_context.h" -#include "services/daal_defines.h" - -using namespace daal::services::internal; -using namespace daal::services::internal::sycl; - -namespace daal -{ -namespace algorithms -{ -namespace low_order_moments -{ -namespace oneapi -{ -namespace internal -{ -#define CHECK_AND_RET_IF_FAIL(st, expr) \ - (st) |= (expr); \ - if (!st) \ - { \ - return; \ - } - -/* task info MinMax parameters definitions */ -template <> -const char * TaskInfoOnline::kSinglePassName = "singlePassMinMax"; - -template <> -const char * TaskInfoOnline::kProcessBlocksName = "processBlocksMinMax"; - -template <> -const char * TaskInfoOnline::kMergeBlocksName = "mergeBlocksMinMax"; - -template <> -const char * TaskInfoOnline::kBldOptFNameSuff = " -DFNAMESUFF=MinMax "; - -template <> -const char * TaskInfoOnline::kBldOptScope = " -D_ONLINE_ -D_RMIN_ -D_RMAX_ "; - -template <> -const char * TaskInfoOnline::kCacheKey = "__daal_algorithms_low_order_moments_online_kernels_minmax"; - -/* itask info MeanVariance parameters definitions */ -template <> -const char * TaskInfoOnline::kSinglePassName = "singlePassMeanVariance"; - -template <> -const char * TaskInfoOnline::kProcessBlocksName = "processBlocksMeanVariance"; - -template <> -const char * TaskInfoOnline::kMergeBlocksName = "mergeBlocksMeanVariance"; - -template <> -const char * TaskInfoOnline::kFinalizeName = "finalizeMeanVariance"; - -template <> -const char * TaskInfoOnline::kBldOptFNameSuff = " 
-DFNAMESUFF=MeanVariance "; - -template <> -const char * TaskInfoOnline::kBldOptScope = " -D_ONLINE_ -D_RMEAN_ -D_RVARC_ "; - -template <> -const char * TaskInfoOnline::kCacheKey = "__daal_algorithms_low_order_moments_online_kernels_mean_variance"; - -/* All task info estimatesAll parameters definitions */ -template <> -const char * TaskInfoOnline::kSinglePassName = "singlePassAll"; - -template <> -const char * TaskInfoOnline::kProcessBlocksName = "processBlocksAll"; - -template <> -const char * TaskInfoOnline::kMergeBlocksName = "mergeBlocksAll"; - -template <> -const char * TaskInfoOnline::kFinalizeName = "finalizeAll"; - -template <> -const char * TaskInfoOnline::kBldOptFNameSuff = " -DFNAMESUFF=All "; - -template <> -const char * TaskInfoOnline::kBldOptScope = - " -D_ONLINE_ -D_RMIN_ -D_RMAX_ -D_RSUM_ -D_RSUM2_ -D_RSUM2C_ -D_RMEAN_ -D_RSORM_ -D_RVARC_ -D_RSTDEV_ -D_RVART_ "; -template <> -const char * TaskInfoOnline::kCacheKey = "__daal_algorithms_low_order_moments_online_kernels_all"; - -/* - Kernel methods implementation -*/ -template -services::Status LowOrderMomentsOnlineKernelOneAPI::compute(NumericTable * dataTable, PartialResult * partialResult, - const Parameter * parameter, bool isOnline) -{ - services::Status status; - - auto & context = daal::services::internal::getDefaultContext(); - - if (method == defaultDense) - { - if (parameter->estimatesToCompute == estimatesMinMax) - { - LowOrderMomentsOnlineTaskOneAPI task(context, dataTable, partialResult, status); - DAAL_CHECK_STATUS_VAR(status); - return task.compute(); - } - else if (parameter->estimatesToCompute == estimatesMeanVariance) - { - LowOrderMomentsOnlineTaskOneAPI task(context, dataTable, partialResult, status); - DAAL_CHECK_STATUS_VAR(status); - return task.compute(); - } - else - { - /* estimatesAll */ - LowOrderMomentsOnlineTaskOneAPI task(context, dataTable, partialResult, status); - DAAL_CHECK_STATUS_VAR(status); - return task.compute(); - } - } - - return 
services::Status(ErrorMethodNotImplemented); -} - -template -services::Status LowOrderMomentsOnlineKernelOneAPI::finalizeCompute(PartialResult * partialResult, Result * result, - const Parameter * parameter) -{ - services::Status status; - - auto & context = daal::services::internal::getDefaultContext(); - - if (method == defaultDense) - { - /*nothing is done in case of finalizing results which are already available in partialResult (i.e. min max ...), - due to they should be already assigned into result from partial results by level up caller - */ - if (parameter->estimatesToCompute == estimatesMeanVariance) - { - LowOrderMomentsOnlineFinalizeTaskOneAPI task(context, partialResult, result, status); - DAAL_CHECK_STATUS_VAR(status); - return task.compute(); - } - else if (parameter->estimatesToCompute == estimatesAll) - { - /* estimatesAll */ - LowOrderMomentsOnlineFinalizeTaskOneAPI task(context, partialResult, result, status); - DAAL_CHECK_STATUS_VAR(status); - return task.compute(); - } - } - - return status; -} - -template -static inline services::Status overflowCheckByMultiplication(const Q & v1, const P & v2) -{ - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(T, v1, v2); - return services::Status(); -} - -template -static inline services::Status buildProgram(ClKernelFactoryIface & factory, const char * buildOptions = nullptr) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(buildProgram); - - services::Status status; - auto fptype_name = getKeyFPType(); - auto build_options = fptype_name; - - build_options.add(" -cl-std=CL1.2 -D LOCAL_BUFFER_SIZE=256 "); - build_options.add(TaskInfoOnline::kBldOptFNameSuff); - build_options.add(TaskInfoOnline::kBldOptScope); - - if (buildOptions) - { - build_options.add(buildOptions); - } - - services::String cachekey(TaskInfoOnline::kCacheKey); - cachekey.add(fptype_name); - - factory.build(ExecutionTargetIds::device, cachekey.c_str(), low_order_moments_kernels_all_cl, build_options.c_str(), status); - DAAL_CHECK_STATUS_VAR(status); - - return 
status; -} - -/* - Online task methods implementations -*/ -template -LowOrderMomentsOnlineTaskOneAPI::LowOrderMomentsOnlineTaskOneAPI(ExecutionContextIface & context, NumericTable * dataTable, - PartialResult * partialResult, services::Status & status) - : dataTable(dataTable) -{ - if (dataTable->getNumberOfRows() > _uint32max) - { - status |= services::ErrorIncorrectNumberOfRowsInInputNumericTable; - return; - } - if (dataTable->getNumberOfColumns() > _uint32max) - { - status |= services::ErrorIncorrectNumberOfColumnsInInputNumericTable; - return; - } - - nVectors = static_cast(dataTable->getNumberOfRows()); - nFeatures = static_cast(dataTable->getNumberOfColumns()); - - nColsBlocks = (nFeatures + maxWorkItemsPerGroup - 1) / maxWorkItemsPerGroup; - - nRowsBlocks = 128; - if (nVectors < 5000) - nRowsBlocks = 1; - else if (nVectors < 10000) - nRowsBlocks = 8; - else if (nVectors < 20000) - nRowsBlocks = 16; - else if (nVectors < 50000) - nRowsBlocks = 32; - else if (nVectors < 100000) - nRowsBlocks = 64; - - workItemsPerGroup = (maxWorkItemsPerGroup < nFeatures) ? maxWorkItemsPerGroup : nFeatures; - - status |= dataTable->getBlockOfRows(0, nVectors, readOnly, dataBD); - DAAL_CHECK_STATUS_RETURN_VOID_IF_FAIL(status); - - nObservationsTable = partialResult->get((PartialResultId)nObservations); - status |= nObservationsTable ? 
nObservationsTable->getBlockOfRows(0, 1, readWrite, nObservationsBD) : services::Status(ErrorNullPartialResult); - DAAL_CHECK_STATUS_RETURN_VOID_IF_FAIL(status); - pNObservations = nObservationsBD.getBlockPtr(); - - for (uint32_t i = 0; i < TaskInfoOnline::nPartialResults; i++) - { - resultTable[i] = partialResult->get((PartialResultId)TaskInfoOnline::resPartialIds[i]); - status |= resultTable[i]->getBlockOfRows(0, 1, readWrite, resultBD[i]); - DAAL_CHECK_STATUS_RETURN_VOID_IF_FAIL(status); - } - - status |= overflowCheckByMultiplication(nRowsBlocks, nFeatures); - DAAL_CHECK_STATUS_RETURN_VOID_IF_FAIL(status); - - if (TaskInfoOnline::isRowsInBlockInfoRequired) - { - if (nRowsBlocks > 1) - { - bNVec = context.allocate(TypeIds::uint32, nFeatures * nRowsBlocks, status); - DAAL_CHECK_STATUS_RETURN_VOID_IF_FAIL(status); - } - } - - if (nRowsBlocks > 1) - { - for (uint32_t i = 0; i < TaskInfoOnline::nBuffers; i++) - { - bAuxBuffers[i] = context.allocate(TypeIds::id(), nFeatures * nRowsBlocks, status); - DAAL_CHECK_STATUS_RETURN_VOID_IF_FAIL(status); - } - } -} - -template -LowOrderMomentsOnlineTaskOneAPI::~LowOrderMomentsOnlineTaskOneAPI() -{ - if (dataTable) - { - dataTable->releaseBlockOfRows(dataBD); - } - - if (nObservationsTable) - { - nObservationsTable->releaseBlockOfRows(nObservationsBD); - } - - for (uint32_t i = 0; i < TaskInfoOnline::nPartialResults; i++) - { - if (resultTable[i]) - { - resultTable[i]->releaseBlockOfRows(resultBD[i]); - } - } -} - -template -services::Status LowOrderMomentsOnlineTaskOneAPI::compute() -{ - DAAL_ITTNOTIFY_SCOPED_TASK(LowOrderMomentsOnlineTaskOneAPI.compute); - - services::Status status; - - auto & context = daal::services::internal::getDefaultContext(); - auto & factory = context.getClKernelFactory(); - - status = buildProgram(factory); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_CHECK_MALLOC(pNObservations); - - if (nRowsBlocks > 1) - { - /* process rows by blocks first */ - auto kProcessBlocks = 
factory.getKernel(TaskInfoOnline::kProcessBlocksName, status); - DAAL_CHECK_STATUS_VAR(status); - { - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, nRowsBlocks, nColsBlocks); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, nRowsBlocks * nColsBlocks, workItemsPerGroup); - KernelRange localRange(workItemsPerGroup); - KernelRange globalRange(nRowsBlocks * nColsBlocks * workItemsPerGroup); - - KernelNDRange range(1); - range.global(globalRange, status); - DAAL_CHECK_STATUS_VAR(status); - range.local(localRange, status); - DAAL_CHECK_STATUS_VAR(status); - - KernelArguments args( - 3 + TaskInfoOnline::nBuffers + (TaskInfoOnline::isRowsInBlockInfoRequired ? 1 : 0), - status); - DAAL_CHECK_STATUS_VAR(status); - - uint32_t argsI = 0; - DAAL_ASSERT(dataBD.getBuffer().size() == nVectors * nFeatures); - args.set(argsI++, dataBD.getBuffer(), AccessModeIds::read); - args.set(argsI++, nFeatures); - args.set(argsI++, nVectors); - - if (TaskInfoOnline::isRowsInBlockInfoRequired) - { - DAAL_ASSERT_UNIVERSAL_BUFFER(bNVec, uint32_t, nFeatures * nRowsBlocks); - args.set(argsI++, bNVec, AccessModeIds::write); - } - - for (uint32_t i = 0; i < TaskInfoOnline::nBuffers; i++) - { - DAAL_ASSERT_UNIVERSAL_BUFFER(bAuxBuffers[i], algorithmFPType, nFeatures * nRowsBlocks); - args.set(argsI++, bAuxBuffers[i], AccessModeIds::write); - } - - { - DAAL_ITTNOTIFY_SCOPED_TASK(LowOrderMomentsOnlineTaskOneAPI.ProcessBlocks); - context.run(range, kProcessBlocks, args, status); - } - DAAL_CHECK_STATUS_VAR(status); - } - - /* merge blocks */ - auto kMergeBlocks = factory.getKernel(TaskInfoOnline::kMergeBlocksName, status); - DAAL_CHECK_STATUS_VAR(status); - { - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, nFeatures, maxWorkItemsPerGroupToMerge); - KernelRange localRange(maxWorkItemsPerGroupToMerge); - KernelRange globalRange(maxWorkItemsPerGroupToMerge * nFeatures); - - KernelNDRange range(1); - range.global(globalRange, status); - DAAL_CHECK_STATUS_VAR(status); - range.local(localRange, status); - 
DAAL_CHECK_STATUS_VAR(status); - - KernelArguments args(2 + TaskInfoOnline::nPartialResults + TaskInfoOnline::nBuffers - + (TaskInfoOnline::isRowsInBlockInfoRequired ? 1 : 0), - status); - DAAL_CHECK_STATUS_VAR(status); - - uint32_t argsI = 0; - args.set(argsI++, nRowsBlocks); // num of values to merge - args.set(argsI++, *pNObservations); - for (uint32_t i = 0; i < TaskInfoOnline::nPartialResults; i++) - { - DAAL_ASSERT(resultBD[i].getBuffer().size() == nFeatures); - args.set(argsI++, resultBD[i].getBuffer(), AccessModeIds::readwrite); - } - - if (TaskInfoOnline::isRowsInBlockInfoRequired) - { - DAAL_ASSERT_UNIVERSAL_BUFFER(bNVec, uint32_t, nFeatures * nRowsBlocks); - args.set(argsI++, bNVec, AccessModeIds::write); - } - - for (uint32_t i = 0; i < TaskInfoOnline::nBuffers; i++) - { - DAAL_ASSERT_UNIVERSAL_BUFFER(bAuxBuffers[i], algorithmFPType, nFeatures * nRowsBlocks); - args.set(argsI++, bAuxBuffers[i], AccessModeIds::write); - } - - { - DAAL_ITTNOTIFY_SCOPED_TASK(LowOrderMomentsOnlineTaskOneAPI.MergeBlocks); - context.run(range, kMergeBlocks, args, status); - } - DAAL_CHECK_STATUS_VAR(status); - } - } - else - { - auto kSinglePass = factory.getKernel(TaskInfoOnline::kSinglePassName, status); - DAAL_CHECK_STATUS_VAR(status); - { - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, nColsBlocks, workItemsPerGroup); - KernelRange localRange(workItemsPerGroup); - KernelRange globalRange(nColsBlocks * workItemsPerGroup); - - KernelNDRange range(1); - range.global(globalRange, status); - DAAL_CHECK_STATUS_VAR(status); - range.local(localRange, status); - DAAL_CHECK_STATUS_VAR(status); - - KernelArguments args(4 + TaskInfoOnline::nPartialResults, status); - DAAL_CHECK_STATUS_VAR(status); - - uint32_t argsI = 0; - DAAL_ASSERT(dataBD.getBuffer().size() == nVectors * nFeatures); - args.set(argsI++, dataBD.getBuffer(), AccessModeIds::read); - args.set(argsI++, nFeatures); - args.set(argsI++, nVectors); - args.set(argsI++, *pNObservations); - for (uint32_t i = 0; i < 
TaskInfoOnline::nPartialResults; i++) - { - DAAL_ASSERT(resultBD[i].getBuffer().size() == nFeatures); - args.set(argsI++, resultBD[i].getBuffer(), AccessModeIds::readwrite); - } - - context.run(range, kSinglePass, args, status); - DAAL_CHECK_STATUS_VAR(status); - } - } - - *pNObservations += static_cast(nVectors); - - return status; -} -/* - finalize task methods implementations -*/ -template -LowOrderMomentsOnlineFinalizeTaskOneAPI::LowOrderMomentsOnlineFinalizeTaskOneAPI(ExecutionContextIface & context, - PartialResult * partialResult, - Result * result, services::Status & status) -{ - uint32_t resIdx = 0; - for (uint32_t i = 0; i < TaskInfoOnline::nPartialResults; i++) - { - resultTable[resIdx] = partialResult->get((PartialResultId)TaskInfoOnline::resPartialIds[i]); - status |= resultTable[resIdx]->getBlockOfRows(0, 1, readOnly, resultBD[resIdx]); - DAAL_CHECK_STATUS_RETURN_VOID_IF_FAIL(status); - resIdx++; - } - for (uint32_t i = 0; i < TaskInfoOnline::nFinalizeResults; i++) - { - resultTable[resIdx] = result->get((ResultId)TaskInfoOnline::resFinalizeIds[i]); - status |= resultTable[resIdx]->getBlockOfRows(0, 1, readWrite, resultBD[resIdx]); - DAAL_CHECK_STATUS_RETURN_VOID_IF_FAIL(status); - resIdx++; - } - - nFeatures = resultTable[0]->getNumberOfColumns(); - - nObservationsTable = partialResult->get((PartialResultId)nObservations); - status |= nObservationsTable ? 
nObservationsTable->getBlockOfRows(0, 1, readWrite, nObservationsBD) : services::Status(ErrorNullPartialResult); - DAAL_CHECK_STATUS_RETURN_VOID_IF_FAIL(status); - pNObservations = nObservationsBD.getBlockPtr(); -} - -template -LowOrderMomentsOnlineFinalizeTaskOneAPI::~LowOrderMomentsOnlineFinalizeTaskOneAPI() -{ - if (nObservationsTable) - { - nObservationsTable->releaseBlockOfRows(nObservationsBD); - } - - for (uint32_t i = 0; i < nTotalResults; i++) - { - if (resultTable[i]) - { - resultTable[i]->releaseBlockOfRows(resultBD[i]); - } - } -} - -template -services::Status LowOrderMomentsOnlineFinalizeTaskOneAPI::compute() -{ - DAAL_ITTNOTIFY_SCOPED_TASK(LowOrderMomentsOnlineTaskOneAPI.finalize); - - services::Status status; - - auto & context = daal::services::internal::getDefaultContext(); - auto & factory = context.getClKernelFactory(); - - status = buildProgram(factory); - DAAL_CHECK_STATUS_VAR(status); - - auto kFinalize = factory.getKernel(TaskInfoOnline::kFinalizeName, status); - DAAL_CHECK_STATUS_VAR(status); - { - KernelRange range(nFeatures); - - KernelArguments args(1 + nTotalResults, status); - DAAL_CHECK_STATUS_VAR(status); - - uint32_t argsI = 0; - DAAL_CHECK_MALLOC(pNObservations); - args.set(argsI++, *pNObservations); - - for (uint32_t i = 0; i < nTotalResults; i++) - { - DAAL_ASSERT(resultBD[i].getBuffer().size() == nFeatures); - args.set(argsI++, resultBD[i].getBuffer(), - (i < TaskInfoOnline::nPartialResults ? 
AccessModeIds::read : AccessModeIds::write)); - } - - context.run(range, kFinalize, args, status); - DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -} // namespace internal -} // namespace oneapi -} // namespace low_order_moments -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/objective_function/BUILD b/cpp/daal/src/algorithms/objective_function/BUILD index 3e5d70dd75a..d834b277691 100644 --- a/cpp/daal/src/algorithms/objective_function/BUILD +++ b/cpp/daal/src/algorithms/objective_function/BUILD @@ -4,7 +4,6 @@ load("@onedal//dev/bazel:daal.bzl", "daal_module") daal_module( name = "kernel", auto = True, - opencl = True, deps = [ "@onedal//cpp/daal:core", ], diff --git a/cpp/daal/src/algorithms/objective_function/common/oneapi/cl_kernel/objective_function_utils.cl b/cpp/daal/src/algorithms/objective_function/common/oneapi/cl_kernel/objective_function_utils.cl deleted file mode 100644 index 3022ba44545..00000000000 --- a/cpp/daal/src/algorithms/objective_function/common/oneapi/cl_kernel/objective_function_utils.cl +++ /dev/null @@ -1,144 +0,0 @@ -/* file: objective_function_utils.cl */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of Log Loss OpenCL kernels. 
-//-- -*/ - -#ifndef __OBJECTIVE_FUCTION_KERNELS_CL__ -#define __OBJECTIVE_FUCTION_KERNELS_CL__ - -#include - -#define DECLARE_SOURCE_DAAL(name, src) static const char *(name) = #src; - -DECLARE_SOURCE_DAAL( - clKernelObjectiveFunction, - - inline void __sum(__global algorithmFPType * partialSums, __local algorithmFPType * localSum) { - const uint global_group_id = get_group_id(0); - const uint group_size = get_local_size(0); - const uint local_id = get_local_id(0); - - for (uint stride = group_size / 2; stride > 0; stride /= 2) - { - barrier(CLK_LOCAL_MEM_FENCE); - - if (local_id < stride) - { - localSum[local_id] += localSum[local_id + stride]; - } - } - - if (local_id == 0) - { - partialSums[global_group_id] = localSum[0]; - } - } - - __kernel void regularization(const __global algorithmFPType * const beta, const uint nBeta, const uint n, __global algorithmFPType * partialSums, - const algorithmFPType l1, const algorithmFPType l2) { - __local algorithmFPType localSum[LOCAL_SUM_SIZE]; - - const uint global_id = get_global_id(0); - const uint local_id = get_local_id(0); - - if (global_id % nBeta == 0 || global_id >= n) - { - localSum[local_id] = (algorithmFPType)0; - } - else - { - localSum[local_id] = l1 * fabs(beta[global_id]) + l2 * beta[global_id] * beta[global_id]; - } - - __sum(partialSums, localSum); - } - - __kernel void transpose(const __global float * x, __global float * xt, const int n, const int m) { - const uint i = get_global_id(0); - const uint j = get_global_id(1); - xt[i + m * j] = x[i * n + j]; - } - - __kernel void setElem(const uint index, const algorithmFPType elem, __global algorithmFPType * buffer) { buffer[index] = elem; } - - __kernel void setColElem(const uint icol, const algorithmFPType elem, __global algorithmFPType * buffer, const uint ld) { - const uint i = get_global_id(0); - buffer[i * ld + icol] = elem; - } - - __kernel void subVectors(const __global algorithmFPType * const x, const __global algorithmFPType * const y, __global 
algorithmFPType * c) { - const uint i = get_global_id(0); - c[i] = x[i] - y[i]; - } - - __kernel void addVectorScalar(__global algorithmFPType * x, const algorithmFPType alpha) { - const uint i = get_global_id(0); - x[i] += alpha; - } - - __kernel void addVectorScalar2(__global algorithmFPType * x, const __global algorithmFPType * const y, const uint id) { - const uint i = get_global_id(0); - x[i] += y[id]; - } - - // TODO: replace local sum reduction - __kernel void sumReduction(const __global algorithmFPType * const values, const uint n, __global algorithmFPType * partialSums) { - __local algorithmFPType localSum[LOCAL_SUM_SIZE]; - - uint local_id = get_local_id(0); - uint global_id = get_global_id(0); - - if (global_id >= n) - { - localSum[local_id] = (algorithmFPType)0; - } - else - { - localSum[local_id] = values[global_id]; - } - - __sum(partialSums, localSum); - } - - __kernel void getXY(const __global algorithmFPType * const x, const __global algorithmFPType * const y, const __global int * const ind, - const uint ldx, const algorithmFPType interceptValue, __global algorithmFPType * newX, __global algorithmFPType * newY) { - const uint index = get_global_id(1); - const uint jCol = get_global_id(0); - - const int iRow = ind[index]; - - const __global algorithmFPType * const xi = &x[iRow * ldx]; - __global algorithmFPType * newXi = &newX[index * (ldx + 1)]; - - newXi[jCol + 1] = xi[jCol]; - - if (jCol == 0) - { - newY[index] = y[iRow]; - newXi[0] = interceptValue; - } - } - -); - -#undef DECLARE_SOURCE_DAAL - -#endif diff --git a/cpp/daal/src/algorithms/objective_function/common/oneapi/objective_function_utils_oneapi.h b/cpp/daal/src/algorithms/objective_function/common/oneapi/objective_function_utils_oneapi.h deleted file mode 100644 index 40f7df06f8c..00000000000 --- a/cpp/daal/src/algorithms/objective_function/common/oneapi/objective_function_utils_oneapi.h +++ /dev/null @@ -1,459 +0,0 @@ -/* file: objective_function_utils_oneapi.h */ 
-/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#ifndef __OBJECTIVE_FUNCTION_UTILS_H__ -#define __OBJECTIVE_FUNCTION_UTILS_H__ - -#include "src/algorithms/objective_function/common/oneapi/cl_kernel/objective_function_utils.cl" -#include "src/data_management/service_numeric_table.h" - -namespace daal -{ -namespace algorithms -{ -namespace optimization_solver -{ -namespace objective_function -{ -namespace internal -{ -using namespace daal::services::internal; - -template -struct HelperObjectiveFunction -{ - static services::Status lazyAllocate(services::internal::sycl::UniversalBuffer & x, const uint32_t n) - { - services::Status status; - services::internal::sycl::ExecutionContextIface & ctx = services::internal::getDefaultContext(); - const services::internal::sycl::TypeIds::Id idType = services::internal::sycl::TypeIds::id(); - - if (x.empty() || x.get().size() < n) - { - x = ctx.allocate(idType, n, status); - } - - return status; - } - - static uint32_t getWorkgroupsCount(const uint32_t n, const uint32_t localWorkSize) - { - DAAL_ASSERT(localWorkSize > 0); - const uint32_t elementsPerGroup = localWorkSize; - uint32_t workgroupsCount = n / elementsPerGroup; - - if (workgroupsCount * elementsPerGroup < n) - { - workgroupsCount++; // no need on overflow check since its 
always smaller than n - } - return workgroupsCount; - } - - // sigma = (y - sigma) - static services::Status subVectors(const services::internal::Buffer & x, const services::internal::Buffer & y, - services::internal::Buffer & result, const uint32_t n) - { - services::Status status; - - services::internal::sycl::ExecutionContextIface & ctx = services::internal::getDefaultContext(); - services::internal::sycl::ClKernelFactoryIface & factory = ctx.getClKernelFactory(); - - status |= buildProgram(factory); - DAAL_CHECK_STATUS_VAR(status); - - const char * const kernelName = "subVectors"; - services::internal::sycl::KernelPtr kernel = factory.getKernel(kernelName, status); - DAAL_CHECK_STATUS_VAR(status); - - services::internal::sycl::KernelArguments args(3, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_ASSERT(x.size() >= n); - DAAL_ASSERT(y.size() >= n); - DAAL_ASSERT(result.size() >= n); - - args.set(0, x, services::internal::sycl::AccessModeIds::read); - args.set(1, y, services::internal::sycl::AccessModeIds::read); - args.set(2, result, services::internal::sycl::AccessModeIds::write); - - services::internal::sycl::KernelRange range(n); - - ctx.run(range, kernel, args, status); - - return status; - } - - static services::Status setElem(const uint32_t index, const algorithmFPType element, services::internal::Buffer & buffer) - { - services::Status status; - - services::internal::sycl::ExecutionContextIface & ctx = services::internal::getDefaultContext(); - services::internal::sycl::ClKernelFactoryIface & factory = ctx.getClKernelFactory(); - - status |= buildProgram(factory); - DAAL_CHECK_STATUS_VAR(status); - - const char * const kernelName = "setElem"; - services::internal::sycl::KernelPtr kernel = factory.getKernel(kernelName, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_ASSERT(buffer.size() > index); - - services::internal::sycl::KernelArguments args(3, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, index); - args.set(1, element); - 
args.set(2, buffer, services::internal::sycl::AccessModeIds::write); - - services::internal::sycl::KernelRange range(1); - - ctx.run(range, kernel, args, status); - - return status; - } - - static services::Status setColElem(const uint32_t icol, const algorithmFPType element, services::internal::Buffer & buffer, - const uint32_t n, const uint32_t m) - { - services::Status status; - services::internal::sycl::ExecutionContextIface & ctx = services::internal::getDefaultContext(); - services::internal::sycl::ClKernelFactoryIface & factory = ctx.getClKernelFactory(); - - status |= buildProgram(factory); - DAAL_CHECK_STATUS_VAR(status); - - const char * const kernelName = "setColElem"; - services::internal::sycl::KernelPtr kernel = factory.getKernel(kernelName, status); - DAAL_CHECK_STATUS_VAR(status); - - services::internal::sycl::KernelArguments args(4, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_CHECK(icol < m, services::ErrorIncorrectParameter); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(uint32_t, n, m); - DAAL_ASSERT(buffer.size() >= n * m); - - args.set(0, icol); - args.set(1, element); - args.set(2, buffer, services::internal::sycl::AccessModeIds::write); - args.set(3, m); - - services::internal::sycl::KernelRange range(n); - - ctx.run(range, kernel, args, status); - - return status; - } - - static services::Status transpose(const services::internal::Buffer & x, services::internal::Buffer & xt, - const uint32_t n, const uint32_t p) - { - services::Status status; - - services::internal::sycl::ExecutionContextIface & ctx = services::internal::getDefaultContext(); - services::internal::sycl::ClKernelFactoryIface & factory = ctx.getClKernelFactory(); - - status |= buildProgram(factory); - DAAL_CHECK_STATUS_VAR(status); - - const char * const kernelName = "transpose"; - services::internal::sycl::KernelPtr kernel = factory.getKernel(kernelName, status); - DAAL_CHECK_STATUS_VAR(status); - - services::internal::sycl::KernelArguments args(4, status); - 
DAAL_CHECK_STATUS_VAR(status); - - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(uint32_t, n, p); - DAAL_ASSERT(x.size() == n * p); - DAAL_ASSERT(xt.size() == n * p); - - args.set(0, x, services::internal::sycl::AccessModeIds::read); - args.set(1, xt, services::internal::sycl::AccessModeIds::write); - args.set(2, n); - args.set(3, p); - - services::internal::sycl::KernelRange range(n, p); - - ctx.run(range, kernel, args, status); - - return status; - } - - static services::Status sumReduction(const services::internal::Buffer & reductionBuffer, const size_t nWorkGroups, - algorithmFPType & result) - { - services::Status status; - - auto sumReductionArrayPtr = reductionBuffer.toHost(data_management::readOnly, status); - DAAL_CHECK_STATUS_VAR(status); - DAAL_ASSERT(reductionBuffer.size() == nWorkGroups); - - const auto * sumReductionArray = sumReductionArrayPtr.get(); - - // Final summation with CPU - for (size_t i = 0; i < nWorkGroups; i++) - { - result += sumReductionArray[i]; - } - - return status; - } - - // l1*||beta|| + l2*||beta||**2 - static services::Status regularization(const services::internal::Buffer & beta, const uint32_t nBeta, const uint32_t nClasses, - algorithmFPType & reg, const algorithmFPType l1, const algorithmFPType l2) - { - services::Status status; - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(uint32_t, nBeta, nClasses); - const uint32_t n = nBeta * nClasses; - - const services::internal::sycl::TypeIds::Id idType = services::internal::sycl::TypeIds::id(); - - services::internal::sycl::ExecutionContextIface & ctx = services::internal::getDefaultContext(); - services::internal::sycl::ClKernelFactoryIface & factory = ctx.getClKernelFactory(); - - status |= buildProgram(factory); - DAAL_CHECK_STATUS_VAR(status); - - const char * const kernelName = "regularization"; - services::internal::sycl::KernelPtr kernel = factory.getKernel(kernelName, status); - DAAL_CHECK_STATUS_VAR(status); - - services::internal::sycl::KernelNDRange range(1); - - size_t 
workItemsPerGroup = 256; - - const size_t nWorkGroups = getWorkgroupsCount(n, workItemsPerGroup); - - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, workItemsPerGroup, nWorkGroups); - services::internal::sycl::KernelRange localRange(workItemsPerGroup); - services::internal::sycl::KernelRange globalRange(workItemsPerGroup * nWorkGroups); - - range.local(localRange, status); - range.global(globalRange, status); - DAAL_CHECK_STATUS_VAR(status); - - services::internal::sycl::UniversalBuffer buffer = ctx.allocate(idType, nWorkGroups, status); - DAAL_CHECK_STATUS_VAR(status); - services::internal::Buffer reductionBuffer = buffer.get(); - - services::internal::sycl::KernelArguments args(6, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_ASSERT(beta.size() == n); - - args.set(0, beta, services::internal::sycl::AccessModeIds::read); - args.set(1, nBeta); - args.set(2, n); - args.set(3, reductionBuffer, services::internal::sycl::AccessModeIds::write); - args.set(4, l1); - args.set(5, l2); - - ctx.run(range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_CHECK_STATUS(status, sumReduction(reductionBuffer, nWorkGroups, reg)); - - return status; - } - - // s1 + s2 + .. 
+ sn - static services::Status sum(const services::internal::Buffer & x, algorithmFPType & result, const uint32_t n) - { - services::Status status; - const services::internal::sycl::TypeIds::Id idType = services::internal::sycl::TypeIds::id(); - - services::internal::sycl::ExecutionContextIface & ctx = services::internal::getDefaultContext(); - services::internal::sycl::ClKernelFactoryIface & factory = ctx.getClKernelFactory(); - - status |= buildProgram(factory); - DAAL_CHECK_STATUS_VAR(status); - - const char * const kernelName = "sumReduction"; - services::internal::sycl::KernelPtr kernel = factory.getKernel(kernelName, status); - DAAL_CHECK_STATUS_VAR(status); - - services::internal::sycl::KernelNDRange range(1); - - size_t workItemsPerGroup = 256; - - const size_t nWorkGroups = getWorkgroupsCount(n, workItemsPerGroup); - - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, workItemsPerGroup, nWorkGroups); - services::internal::sycl::KernelRange localRange(workItemsPerGroup); - services::internal::sycl::KernelRange globalRange(workItemsPerGroup * nWorkGroups); - - range.local(localRange, status); - range.global(globalRange, status); - DAAL_CHECK_STATUS_VAR(status); - - services::internal::sycl::UniversalBuffer buffer = ctx.allocate(idType, nWorkGroups, status); - DAAL_CHECK_STATUS_VAR(status); - services::internal::Buffer reductionBuffer = buffer.get(); - - services::internal::sycl::KernelArguments args(3, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_ASSERT(x.size() >= n); - - args.set(0, x, services::internal::sycl::AccessModeIds::read); - args.set(1, n); - args.set(2, reductionBuffer, services::internal::sycl::AccessModeIds::write); - - ctx.run(range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_CHECK_STATUS(status, sumReduction(reductionBuffer, nWorkGroups, result)); - - return status; - } - - // x = x + alpha - // Where x - vector; alpha - scalar - static services::Status addVectorScalar(services::internal::Buffer & x, const 
algorithmFPType alpha, const uint32_t n) - { - services::Status status; - - services::internal::sycl::ExecutionContextIface & ctx = services::internal::getDefaultContext(); - services::internal::sycl::ClKernelFactoryIface & factory = ctx.getClKernelFactory(); - - status |= buildProgram(factory); - DAAL_CHECK_STATUS_VAR(status); - - const char * const kernelName = "addVectorScalar"; - services::internal::sycl::KernelPtr kernel = factory.getKernel(kernelName, status); - DAAL_CHECK_STATUS_VAR(status); - - services::internal::sycl::KernelArguments args(2, status); - DAAL_CHECK_STATUS_VAR(status); - DAAL_ASSERT(x.size() >= n); - - args.set(0, x, services::internal::sycl::AccessModeIds::write); - args.set(1, alpha); - - services::internal::sycl::KernelRange range(n); - - ctx.run(range, kernel, args, status); - - return status; - } - - // x = x + y[id] - // Where x - vector; y - vector, id - index - static services::Status addVectorScalar(services::internal::Buffer & x, const services::internal::Buffer & y, - const uint32_t id, const uint32_t n) - { - services::Status status; - - services::internal::sycl::ExecutionContextIface & ctx = services::internal::getDefaultContext(); - services::internal::sycl::ClKernelFactoryIface & factory = ctx.getClKernelFactory(); - - status |= buildProgram(factory); - DAAL_CHECK_STATUS_VAR(status); - - const char * const kernelName = "addVectorScalar2"; - services::internal::sycl::KernelPtr kernel = factory.getKernel(kernelName, status); - DAAL_CHECK_STATUS_VAR(status); - - services::internal::sycl::KernelArguments args(3, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_ASSERT(x.size() >= n); - DAAL_ASSERT(y.size() > id); - - args.set(0, x, services::internal::sycl::AccessModeIds::write); - args.set(1, y, services::internal::sycl::AccessModeIds::read); - args.set(2, id); - - services::internal::sycl::KernelRange range(n); - - ctx.run(range, kernel, args, status); - - return status; - } - - static services::Status getXY(const 
services::internal::Buffer & xBuff, - const services::internal::Buffer & yBuff, const services::internal::Buffer & indBuff, - services::internal::Buffer & aX, services::internal::Buffer & aY, uint32_t nBatch, - uint32_t p, bool interceptFlag) - { - services::Status status; - - services::internal::sycl::ExecutionContextIface & ctx = services::internal::getDefaultContext(); - services::internal::sycl::ClKernelFactoryIface & factory = ctx.getClKernelFactory(); - - status |= buildProgram(factory); - DAAL_CHECK_STATUS_VAR(status); - - const algorithmFPType interceptValue = interceptFlag ? algorithmFPType(1) : algorithmFPType(0); - - const char * const kernelName = "getXY"; - services::internal::sycl::KernelPtr kernel = factory.getKernel(kernelName, status); - DAAL_CHECK_STATUS_VAR(status); - - services::internal::sycl::KernelArguments args(7, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(uint32_t, nBatch, (p + 1)); - DAAL_ASSERT(xBuff.size() >= nBatch * p); - DAAL_ASSERT(aX.size() == nBatch * (p + 1)); - DAAL_ASSERT(indBuff.size() == nBatch); - // we do not check index values since they do not exceed n by the construction algorithm. 
- - DAAL_ASSERT(yBuff.size() >= nBatch); - DAAL_ASSERT(aY.size() == nBatch); - - args.set(0, xBuff, services::internal::sycl::AccessModeIds::read); - args.set(1, yBuff, services::internal::sycl::AccessModeIds::read); - args.set(2, indBuff, services::internal::sycl::AccessModeIds::read); - args.set(3, p); - args.set(4, interceptValue); - args.set(5, aX, services::internal::sycl::AccessModeIds::write); - args.set(6, aY, services::internal::sycl::AccessModeIds::write); - - services::internal::sycl::KernelRange range(p, nBatch); - - ctx.run(range, kernel, args, status); - - return status; - } - -private: - static services::Status buildProgram(services::internal::sycl::ClKernelFactoryIface & factory) - { - services::Status status; - services::String options = services::internal::sycl::getKeyFPType(); - - services::String cachekey("__daal_algorithms_optimization_solver_objective_function_"); - cachekey.add(options); - - options.add(" -D LOCAL_SUM_SIZE=256 "); //depends on workItemsPerGroup value - - factory.build(services::internal::sycl::ExecutionTargetIds::device, cachekey.c_str(), clKernelObjectiveFunction, options.c_str(), status); - DAAL_CHECK_STATUS_VAR(status); - - return status; - } -}; - -} // namespace internal -} // namespace objective_function -} // namespace optimization_solver -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/objective_function/cross_entropy_loss/BUILD b/cpp/daal/src/algorithms/objective_function/cross_entropy_loss/BUILD index def0fe672b9..17933cd2b5e 100644 --- a/cpp/daal/src/algorithms/objective_function/cross_entropy_loss/BUILD +++ b/cpp/daal/src/algorithms/objective_function/cross_entropy_loss/BUILD @@ -4,10 +4,9 @@ load("@onedal//dev/bazel:daal.bzl", "daal_module") daal_module( name = "kernel", auto = True, - opencl = True, deps = [ "@onedal//cpp/daal:core", - "@onedal//cpp/daal:sycl", + "@onedal//cpp/daal:engines", "@onedal//cpp/daal/src/algorithms/objective_function:kernel", ], ) diff 
--git a/cpp/daal/src/algorithms/objective_function/cross_entropy_loss/cross_entropy_loss_dense_default_batch_container.h b/cpp/daal/src/algorithms/objective_function/cross_entropy_loss/cross_entropy_loss_dense_default_batch_container.h index 09b46c97c27..6518e0818e6 100644 --- a/cpp/daal/src/algorithms/objective_function/cross_entropy_loss/cross_entropy_loss_dense_default_batch_container.h +++ b/cpp/daal/src/algorithms/objective_function/cross_entropy_loss/cross_entropy_loss_dense_default_batch_container.h @@ -26,7 +26,6 @@ #include "algorithms/optimization_solver/objective_function/cross_entropy_loss_batch.h" #include "src/algorithms/objective_function/cross_entropy_loss/cross_entropy_loss_dense_default_batch_kernel.h" -#include "src/algorithms/objective_function/cross_entropy_loss/oneapi/cross_entropy_loss_dense_default_kernel_oneapi.h" namespace daal { @@ -41,17 +40,7 @@ namespace interface2 template BatchContainer::BatchContainer(daal::services::Environment::env * daalEnv) { - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - - if (deviceInfo.isCpu) - { - __DAAL_INITIALIZE_KERNELS(internal::CrossEntropyLossKernel, algorithmFPType, method); - } - else - { - _kernel = new internal::CrossEntropyLossKernelOneAPI(); - } + __DAAL_INITIALIZE_KERNELS(internal::CrossEntropyLossKernel, algorithmFPType, method); } template @@ -104,22 +93,10 @@ services::Status BatchContainer::compute() lipschitzConstant = result->get(objective_function::lipschitzConstantIdx).get(); } - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - if (deviceInfo.isCpu) - { - __DAAL_CALL_KERNEL(env, internal::CrossEntropyLossKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, - input->get(cross_entropy_loss::data).get(), input->get(cross_entropy_loss::dependentVariables).get(), - input->get(cross_entropy_loss::argument).get(), value, hessian, gradient, nonSmoothTermValue, 
proximalProjection, - lipschitzConstant, parameter); - } - else - { - return ((internal::CrossEntropyLossKernelOneAPI *)(_kernel)) - ->compute(input->get(cross_entropy_loss::data).get(), input->get(cross_entropy_loss::dependentVariables).get(), - input->get(cross_entropy_loss::argument).get(), value, hessian, gradient, nonSmoothTermValue, proximalProjection, - lipschitzConstant, parameter); - } + __DAAL_CALL_KERNEL(env, internal::CrossEntropyLossKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, + input->get(cross_entropy_loss::data).get(), input->get(cross_entropy_loss::dependentVariables).get(), + input->get(cross_entropy_loss::argument).get(), value, hessian, gradient, nonSmoothTermValue, proximalProjection, + lipschitzConstant, parameter); } } // namespace interface2 diff --git a/cpp/daal/src/algorithms/objective_function/cross_entropy_loss/cross_entropy_loss_dense_default_batch_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/objective_function/cross_entropy_loss/cross_entropy_loss_dense_default_batch_fpt_dispatcher.cpp index d470ae7350c..359d25bd317 100644 --- a/cpp/daal/src/algorithms/objective_function/cross_entropy_loss/cross_entropy_loss_dense_default_batch_fpt_dispatcher.cpp +++ b/cpp/daal/src/algorithms/objective_function/cross_entropy_loss/cross_entropy_loss_dense_default_batch_fpt_dispatcher.cpp @@ -25,8 +25,8 @@ namespace daal { namespace algorithms { -__DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL(optimization_solver::cross_entropy_loss::interface2::BatchContainer, batch, DAAL_FPTYPE, - optimization_solver::cross_entropy_loss::defaultDense) +__DAAL_INSTANTIATE_DISPATCH_CONTAINER(optimization_solver::cross_entropy_loss::interface2::BatchContainer, batch, DAAL_FPTYPE, + optimization_solver::cross_entropy_loss::defaultDense) namespace optimization_solver { diff --git a/cpp/daal/src/algorithms/objective_function/cross_entropy_loss/cross_entropy_loss_dense_default_kernel_oneapi_fpt.cpp 
b/cpp/daal/src/algorithms/objective_function/cross_entropy_loss/cross_entropy_loss_dense_default_kernel_oneapi_fpt.cpp deleted file mode 100644 index 014539a451f..00000000000 --- a/cpp/daal/src/algorithms/objective_function/cross_entropy_loss/cross_entropy_loss_dense_default_kernel_oneapi_fpt.cpp +++ /dev/null @@ -1,43 +0,0 @@ -/* file: cross_entropy_loss_dense_default_kernel_oneapi_fpt.cpp */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of Logistic Loss Batch Kernel for GPU. 
-//-- -*/ - -#include "src/algorithms/objective_function/cross_entropy_loss/oneapi/cross_entropy_loss_dense_default_kernel_oneapi.h" -#include "src/algorithms/objective_function/cross_entropy_loss/oneapi/cross_entropy_loss_dense_default_oneapi_impl.i" - -namespace daal -{ -namespace algorithms -{ -namespace optimization_solver -{ -namespace cross_entropy_loss -{ -namespace internal -{ -template class CrossEntropyLossKernelOneAPI; - -} // namespace internal -} // namespace cross_entropy_loss -} // namespace optimization_solver -} // namespace algorithms -} // namespace daal diff --git a/cpp/daal/src/algorithms/objective_function/cross_entropy_loss/oneapi/cl_kernel/cross_entropy_loss_dense_default.cl b/cpp/daal/src/algorithms/objective_function/cross_entropy_loss/oneapi/cl_kernel/cross_entropy_loss_dense_default.cl deleted file mode 100644 index aa4a992aaf7..00000000000 --- a/cpp/daal/src/algorithms/objective_function/cross_entropy_loss/oneapi/cl_kernel/cross_entropy_loss_dense_default.cl +++ /dev/null @@ -1,94 +0,0 @@ -/* file: cross_entropy_loss_dense_default.cl */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of Cross-Entropy Loss OpenCL kernels. 
-//-- -*/ - -#ifndef __CROSS_ENTROPY_LOSS_KERNELS_CL__ -#define __CROSS_ENTROPY_LOSS_KERNELS_CL__ - -#include - -#define DECLARE_SOURCE_DAAL(name, src) static const char *(name) = #src; - -DECLARE_SOURCE_DAAL( - clKernelCrossEntropyLoss, - - void __softmax(const __global algorithmFPType * const xi, __global algorithmFPType * resi, const uint nClasses, - const algorithmFPType expThreshold) { - algorithmFPType maxxi = xi[0]; - for (uint j = 1; j < nClasses; j++) - { - maxxi = fmax(maxxi, xi[j]); - } - - algorithmFPType sum = (algorithmFPType)0; - for (uint j = 0; j < nClasses; j++) - { - const algorithmFPType arg = fmax(xi[j] - maxxi, expThreshold); - const algorithmFPType value = exp(arg); - sum += value; - resi[j] = value; - } - - for (uint j = 0; j < nClasses; j++) - { - resi[j] = resi[j] / sum; - } - } - - __kernel void softmax(const __global algorithmFPType * const x, __global algorithmFPType * result, const uint nClasses, - const algorithmFPType expThreshold) { - const uint i = get_global_id(0); - const __global algorithmFPType * const xi = &x[i * nClasses]; - __global algorithmFPType * resi = &result[i * nClasses]; - __softmax(xi, resi, nClasses, expThreshold); - } - - __kernel void softmaxAndUpdateProba(const __global algorithmFPType * const x, const __global algorithmFPType * const y, - __global algorithmFPType * result, const uint nClasses, const algorithmFPType expThreshold) { - const uint i = get_global_id(0); - const __global algorithmFPType * const xi = &x[i * nClasses]; - __global algorithmFPType * resi = &result[i * nClasses]; - - __softmax(xi, resi, nClasses, expThreshold); - - resi[(uint)y[i]] -= 1; - } - - __kernel void crossEntropy(const __global algorithmFPType * const y, const __global algorithmFPType * const s, __global algorithmFPType * result, - const uint nClasses) { - const uint i = get_global_id(0); - - result[i] = log(s[i * nClasses + (uint)y[i]]); - } - - __kernel void updateProba(const __global algorithmFPType * const y, __global 
algorithmFPType * sigma, const uint nClasses, - const algorithmFPType value) { - const uint i = get_global_id(0); - - sigma[i * nClasses + (uint)y[i]] += value; - } - -); - -#undef DECLARE_SOURCE_DAAL - -#endif diff --git a/cpp/daal/src/algorithms/objective_function/cross_entropy_loss/oneapi/cross_entropy_loss_dense_default_kernel_oneapi.h b/cpp/daal/src/algorithms/objective_function/cross_entropy_loss/oneapi/cross_entropy_loss_dense_default_kernel_oneapi.h deleted file mode 100644 index b5982165b27..00000000000 --- a/cpp/daal/src/algorithms/objective_function/cross_entropy_loss/oneapi/cross_entropy_loss_dense_default_kernel_oneapi.h +++ /dev/null @@ -1,108 +0,0 @@ -/* file: cross_entropy_loss_dense_default_kernel_oneapi.h */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of Cross-entopy Loss Batch Kernel for GPU. 
-//-- -*/ - -#ifndef __CROSS_ENTROPY_LOSS_DENSE_DEFAULT_KERNEL_ONEAPI_H__ -#define __CROSS_ENTROPY_LOSS_DENSE_DEFAULT_KERNEL_ONEAPI_H__ - -#include "src/sycl/blas_gpu.h" -#include "src/algorithms/objective_function/common/oneapi/objective_function_utils_oneapi.h" -#include "algorithms/optimization_solver/objective_function/cross_entropy_loss_types.h" - -namespace daal -{ -namespace algorithms -{ -namespace optimization_solver -{ -namespace cross_entropy_loss -{ -namespace internal -{ -template -class CrossEntropyLossKernelOneAPI : public Kernel -{}; - -template -class CrossEntropyLossKernelOneAPI : public Kernel -{ - using HelperObjectiveFunction = objective_function::internal::HelperObjectiveFunction; - -public: - services::Status compute(NumericTable * data, NumericTable * dependentVariables, NumericTable * argument, NumericTable * value, - NumericTable * hessian, NumericTable * gradient, NumericTable * nonSmoothTermValue, NumericTable * proximalProjection, - NumericTable * lipschitzConstant, Parameter * parameter); - - static services::Status applyBeta(const services::internal::Buffer & x, const services::internal::Buffer & beta, - services::internal::Buffer & xb, const uint32_t n, const uint32_t nClasses, const uint32_t ldX, - const uint32_t nBeta, const uint32_t offset); - - static services::Status applyGradient(const services::internal::Buffer & x, - const services::internal::Buffer & g, - services::internal::Buffer & gradient, const algorithmFPType alpha, const uint32_t n, - const uint32_t p, const uint32_t nBeta, uint32_t nClasses, const algorithmFPType beta, - const uint32_t offset); - - static services::Status betaIntercept(const services::internal::Buffer & one, - const services::internal::Buffer & arg, services::internal::Buffer & f, - const uint32_t n, const uint32_t nClasses, const uint32_t nBeta); - - static services::Status softmax(const services::internal::Buffer & x, services::internal::Buffer & result, - const uint32_t n, const uint32_t 
nClasses); - - static services::Status softmaxAndUpdateProba(const services::internal::Buffer & x, - const services::internal::Buffer & y, - services::internal::Buffer & result, const uint32_t n, const uint32_t nClasses); - - static services::Status crossEntropy(const services::internal::Buffer & y, - const services::internal::Buffer & sigma, - services::internal::Buffer & result, const uint32_t n, const uint32_t nClasses); - - static services::Status updateProba(const services::internal::Buffer & y, services::internal::Buffer & sigma, - const uint32_t n, const uint32_t nClasses, const algorithmFPType value); - -private: - services::Status doCompute(const uint32_t nBatch, const uint32_t nFeatures, const uint32_t nClasses, - const daal::services::internal::Buffer & xBuff, - const daal::services::internal::Buffer & yBuff, - const daal::services::internal::Buffer & argBuff, NumericTable * valueNT, NumericTable * gradientNT, - NumericTable * hessianNT, NumericTable * nonSmoothTermValueNT, NumericTable * proximalProjectionNT, - NumericTable * lipschitzConstantNT, const algorithmFPType l1reg, const algorithmFPType l2reg, const bool interceptFlag, - const bool isSourceData); - - static services::Status buildProgram(services::internal::sycl::ClKernelFactoryIface & factory); - - services::internal::sycl::UniversalBuffer _uX; - services::internal::sycl::UniversalBuffer _uY; - services::internal::sycl::UniversalBuffer _fUniversal; - services::internal::sycl::UniversalBuffer _softmaxUniversal; - services::internal::sycl::UniversalBuffer _oneVector; - services::internal::sycl::UniversalBuffer _crossEntropyUniversal; -}; - -} // namespace internal -} // namespace cross_entropy_loss -} // namespace optimization_solver -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/objective_function/cross_entropy_loss/oneapi/cross_entropy_loss_dense_default_oneapi_impl.i 
b/cpp/daal/src/algorithms/objective_function/cross_entropy_loss/oneapi/cross_entropy_loss_dense_default_oneapi_impl.i deleted file mode 100644 index 927390f523c..00000000000 --- a/cpp/daal/src/algorithms/objective_function/cross_entropy_loss/oneapi/cross_entropy_loss_dense_default_oneapi_impl.i +++ /dev/null @@ -1,497 +0,0 @@ -/* file: cross_entropy_loss_dense_default_oneapi_impl.i */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of Cross-Entropy Loss algorithm for GPU. 
-//-- -*/ - -#include "src/sycl/math_service_types.h" -#include "src/sycl/blas_gpu.h" -#include "src/algorithms/objective_function/cross_entropy_loss/oneapi/cl_kernel/cross_entropy_loss_dense_default.cl" -#include "src/externals/service_profiler.h" - -namespace daal -{ -namespace algorithms -{ -namespace optimization_solver -{ -namespace cross_entropy_loss -{ -namespace internal -{ -using namespace daal::services::internal::sycl; -using namespace daal::internal; - -// Calculate X^T*beta -template -services::Status CrossEntropyLossKernelOneAPI::applyBeta(const services::internal::Buffer & x, - const services::internal::Buffer & beta, - services::internal::Buffer & xb, - const uint32_t n, const uint32_t nClasses, const uint32_t ldX, - const uint32_t nBeta, const uint32_t offset) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(applyBeta); - DAAL_ASSERT(x.size() >= size_t(n) * size_t(ldX)); // overflows checked in the algorithm - DAAL_ASSERT(beta.size() >= size_t(offset) + size_t(ldX) * size_t(nClasses)); - DAAL_ASSERT(xb.size() >= size_t(n) * size_t(nClasses)); - - return BlasGpu::xgemm(math::Layout::RowMajor, math::Transpose::NoTrans, math::Transpose::Trans, n, nClasses, ldX, - algorithmFPType(1), x, ldX, 0, beta, nBeta, offset, algorithmFPType(0), xb, nClasses, 0); -} - -template -services::Status CrossEntropyLossKernelOneAPI::betaIntercept(const services::internal::Buffer & one, - const services::internal::Buffer & arg, - services::internal::Buffer & f, - const uint32_t n, const uint32_t nClasses, - const uint32_t nBeta) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(betaIntercept); - DAAL_ASSERT(one.size() >= size_t(n)); - DAAL_ASSERT(arg.size() >= size_t(nClasses)); - DAAL_ASSERT(f.size() >= size_t(n) * size_t(nClasses)); // overflows checked in the algorithm - - return BlasGpu::xgemm(math::Layout::RowMajor, math::Transpose::NoTrans, math::Transpose::Trans, n, nClasses, 1, - algorithmFPType(1), one, 1, 0, arg, nBeta, 0, algorithmFPType(1), f, nClasses, 0); -} - -// Calculate (y - sigma)^T*X + 
2*L2*beta -template -services::Status CrossEntropyLossKernelOneAPI::applyGradient(const services::internal::Buffer & x, - const services::internal::Buffer & g, - services::internal::Buffer & gradient, - const algorithmFPType alpha, const uint32_t n, - const uint32_t p, const uint32_t nBeta, uint32_t nClasses, - const algorithmFPType beta, const uint32_t offset) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(applyGradient); - DAAL_ASSERT(g.size() >= size_t(n) * size_t(nClasses)); - DAAL_ASSERT(x.size() >= size_t(n) * size_t(p)); - DAAL_ASSERT(gradient.size() >= size_t(offset) + size_t(p) * size_t(nClasses)); // overflows checked in the algorithm - - return BlasGpu::xgemm(math::Layout::RowMajor, math::Transpose::Trans, math::Transpose::NoTrans, nClasses, p, n, alpha, g, - nClasses, 0, x, p, 0, beta, gradient, nBeta, offset); -} - -template -services::Status CrossEntropyLossKernelOneAPI::softmax(const services::internal::Buffer & x, - services::internal::Buffer & result, - const uint32_t n, const uint32_t nClasses) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(softmax); - - DAAL_ASSERT(x.size() >= size_t(n) * size_t(nClasses)); // overflows checked in the algorithm - DAAL_ASSERT(result.size() >= size_t(n) * size_t(nClasses)); - - services::Status status; - - ExecutionContextIface & ctx = services::internal::getDefaultContext(); - ClKernelFactoryIface & factory = ctx.getClKernelFactory(); - - status |= buildProgram(factory); - DAAL_CHECK_STATUS_VAR(status); - - const char * const kernelName = "softmax"; - KernelPtr kernel = factory.getKernel(kernelName, status); - DAAL_CHECK_STATUS_VAR(status); - - const algorithmFPType expThreshold = math::expThreshold(); - - KernelArguments args(4, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, x, AccessModeIds::read); - args.set(1, result, AccessModeIds::readwrite); - args.set(2, nClasses); - args.set(3, expThreshold); - - KernelRange range(n); - - ctx.run(range, kernel, args, status); - - return status; -} - -template -services::Status 
CrossEntropyLossKernelOneAPI::softmaxAndUpdateProba( - const services::internal::Buffer & x, const services::internal::Buffer & y, - services::internal::Buffer & result, const uint32_t n, const uint32_t nClasses) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(softmaxAndUpdateProba); - - DAAL_ASSERT(x.size() >= size_t(n) * size_t(nClasses)); // overflows checked in the algorithm - DAAL_ASSERT(result.size() >= size_t(n) * size_t(nClasses)); - DAAL_ASSERT(y.size() >= n); - - services::Status status; - - ExecutionContextIface & ctx = services::internal::getDefaultContext(); - ClKernelFactoryIface & factory = ctx.getClKernelFactory(); - - status |= buildProgram(factory); - DAAL_CHECK_STATUS_VAR(status); - - const char * const kernelName = "softmaxAndUpdateProba"; - KernelPtr kernel = factory.getKernel(kernelName, status); - DAAL_CHECK_STATUS_VAR(status); - - const algorithmFPType expThreshold = math::expThreshold(); - - KernelArguments args(5, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, x, AccessModeIds::read); - args.set(1, y, AccessModeIds::read); - args.set(2, result, AccessModeIds::readwrite); - args.set(3, nClasses); - args.set(4, expThreshold); - - KernelRange range(n); - - ctx.run(range, kernel, args, status); - - return status; -} - -// resulti = [yi=K]*log(sigmai) -template -services::Status CrossEntropyLossKernelOneAPI::crossEntropy(const services::internal::Buffer & y, - const services::internal::Buffer & sigma, - services::internal::Buffer & result, - const uint32_t n, const uint32_t nClasses) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(crossEntropy); - services::Status status; - - DAAL_ASSERT(sigma.size() == size_t(n) * size_t(nClasses)); // overflows checked in the algorithm - DAAL_ASSERT(result.size() == n); - DAAL_ASSERT(y.size() == n); - - ExecutionContextIface & ctx = services::internal::getDefaultContext(); - ClKernelFactoryIface & factory = ctx.getClKernelFactory(); - - status |= buildProgram(factory); - DAAL_CHECK_STATUS_VAR(status); - - const char * const 
kernelName = "crossEntropy"; - KernelPtr kernel = factory.getKernel(kernelName, status); - DAAL_CHECK_STATUS_VAR(status); - - KernelArguments args(4, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, y, AccessModeIds::read); - args.set(1, sigma, AccessModeIds::read); - args.set(2, result, AccessModeIds::write); - args.set(3, nClasses); - - KernelRange range(n); - - ctx.run(range, kernel, args, status); - - return status; -} - -// resulti = [yi=K]*log(sigmai) -template -services::Status CrossEntropyLossKernelOneAPI::updateProba(const services::internal::Buffer & y, - services::internal::Buffer & sigma, - const uint32_t n, const uint32_t nClasses, - const algorithmFPType value) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(updateProba); - services::Status status; - - DAAL_ASSERT(sigma.size() == size_t(n) * size_t(nClasses)); // overflows checked in the algorithm - DAAL_ASSERT(y.size() == n); - - ExecutionContextIface & ctx = services::internal::getDefaultContext(); - ClKernelFactoryIface & factory = ctx.getClKernelFactory(); - - status |= buildProgram(factory); - DAAL_CHECK_STATUS_VAR(status); - - const char * const kernelName = "updateProba"; - KernelPtr kernel = factory.getKernel(kernelName, status); - DAAL_CHECK_STATUS_VAR(status); - - KernelArguments args(4, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, y, AccessModeIds::read); - args.set(1, sigma, AccessModeIds::readwrite); - args.set(2, nClasses); - args.set(3, value); - - KernelRange range(n); - { - ctx.run(range, kernel, args, status); - } - - return status; -} - -template -services::Status CrossEntropyLossKernelOneAPI::buildProgram(ClKernelFactoryIface & factory) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(buildProgram); - services::Status status; - services::String options = getKeyFPType(); - services::String cachekey("__daal_algorithms_optimization_solver_cross_entropy_loss_"); - cachekey.add(options); - factory.build(ExecutionTargetIds::device, cachekey.c_str(), clKernelCrossEntropyLoss, options.c_str(), status); - 
DAAL_CHECK_STATUS_VAR(status); - - return status; -} - -template -services::Status CrossEntropyLossKernelOneAPI::doCompute( - const uint32_t nBatch, const uint32_t nFeatures, const uint32_t nClasses, const daal::services::internal::Buffer & xBuff, - const daal::services::internal::Buffer & yBuff, const daal::services::internal::Buffer & argBuff, - NumericTable * valueNT, NumericTable * gradientNT, NumericTable * hessianNT, NumericTable * nonSmoothTermValueNT, - NumericTable * proximalProjectionNT, NumericTable * lipschitzConstantNT, const algorithmFPType l1reg, const algorithmFPType l2reg, - const bool interceptFlag, const bool isSourceData) -{ - services::Status status; - - ExecutionContextIface & ctx = services::internal::getDefaultContext(); - - const uint32_t nBeta = nFeatures + 1; - DAAL_ASSERT(nBeta > nFeatures); - - const uint32_t ldX = isSourceData || interceptFlag ? nFeatures : nBeta; - const uint32_t offsetX = isSourceData || interceptFlag ? 1 : 0; - - const uint32_t n = nBatch; - - if (hessianNT || nonSmoothTermValueNT || proximalProjectionNT || lipschitzConstantNT) - { - return services::ErrorMethodNotImplemented; - } - - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(uint32_t, n, nClasses); - - DAAL_CHECK_STATUS(status, HelperObjectiveFunction::lazyAllocate(_fUniversal, n * nClasses)); - services::internal::Buffer fBuf = _fUniversal.get(); - - //f = X*W + W0 - DAAL_CHECK_STATUS(status, applyBeta(xBuff, argBuff, fBuf, n, nClasses, ldX, nBeta, offsetX)); - - if (interceptFlag) - { - DAAL_CHECK_STATUS(status, HelperObjectiveFunction::lazyAllocate(_oneVector, n)); - services::internal::Buffer oneVectorBuf = _oneVector.get(); - - ctx.fill(_oneVector, 1.0, status); - DAAL_CHECK_STATUS_VAR(status); - DAAL_CHECK_STATUS(status, betaIntercept(oneVectorBuf, argBuff, fBuf, n, nClasses, nBeta)); - } - - const bool isNotOnlyGrad = valueNT != nullptr || hessianNT != nullptr; - - services::internal::Buffer softmaxBuf; - if (isNotOnlyGrad) - { - DAAL_CHECK_STATUS(status, 
HelperObjectiveFunction::lazyAllocate(_softmaxUniversal, n * nClasses)); - softmaxBuf = _softmaxUniversal.get(); - DAAL_CHECK_STATUS(status, softmax(fBuf, softmaxBuf, n, nClasses)); - } - - const algorithmFPType div = algorithmFPType(1) / algorithmFPType(n); - - if (valueNT) - { - DAAL_ITTNOTIFY_SCOPED_TASK(doCompute.valueNT); - DAAL_ASSERT(valueNT->getNumberOfRows() == 1); - - BlockDescriptor vr; - DAAL_CHECK_STATUS(status, valueNT->getBlockOfRows(0, 1, ReadWriteMode::readWrite, vr)); - algorithmFPType & value = *vr.getBlockPtr(); - - DAAL_CHECK_STATUS(status, HelperObjectiveFunction::lazyAllocate(_crossEntropyUniversal, n)); - services::internal::Buffer crossEntropyBuff = _crossEntropyUniversal.get(); - - DAAL_CHECK_STATUS(status, crossEntropy(yBuff, softmaxBuf, crossEntropyBuff, n, nClasses)); - - // TODO replace mean - DAAL_CHECK_STATUS(status, HelperObjectiveFunction::sum(crossEntropyBuff, value, n)); - value *= -div; - - if (l1reg > 0 || l2reg > 0) - { - algorithmFPType reg = algorithmFPType(0); - DAAL_CHECK_STATUS(status, HelperObjectiveFunction::regularization(argBuff, nBeta, nClasses, reg, l1reg, l2reg)); - - value += reg; - } - DAAL_CHECK_STATUS(status, valueNT->releaseBlockOfRows(vr)); - } - - if (gradientNT) - { - DAAL_ASSERT(gradientNT->getNumberOfRows() == nClasses * nBeta); - - BlockDescriptor gr; - DAAL_CHECK_STATUS(status, gradientNT->getBlockOfRows(0, nClasses * nBeta, ReadWriteMode::readWrite, gr)); - daal::services::internal::Buffer gradientBuff = gr.getBuffer(); - - const algorithmFPType zero = algorithmFPType(0); - - // fBuf = fBuf - 1 if y_i == K - if (isNotOnlyGrad) - { - const algorithmFPType minusOne = algorithmFPType(-1); - DAAL_CHECK_STATUS(status, updateProba(yBuff, softmaxBuf, n, nClasses, minusOne)); - } - else - { - softmaxBuf = fBuf; - DAAL_CHECK_STATUS(status, softmaxAndUpdateProba(fBuf, yBuff, softmaxBuf, n, nClasses)); - } - - const algorithmFPType coeffBeta = algorithmFPType(2) * l2reg; - if (l2reg > 0) - { - // overflow checked 
in compute() - ctx.copy(gradientBuff, 0, argBuff, 0, nBeta * nClasses, status); - DAAL_CHECK_STATUS(status, HelperObjectiveFunction::setColElem(0, zero, gradientBuff, nClasses, nBeta)); - } - - // gradient = (X^T(sigmoid(Xb) - y)/n + 2*l2*||Beta|| - DAAL_CHECK_STATUS(status, applyGradient(xBuff, softmaxBuf, gradientBuff, div, n, ldX, nBeta, nClasses, coeffBeta, offsetX)); - - if (interceptFlag) - { - DAAL_ITTNOTIFY_SCOPED_TASK(doCompute.gradientNT.interceptFlag); - // g[0] = sum(sigmoid(Xb) - y)/n - DAAL_CHECK_STATUS(status, - applyGradient(_oneVector.get(), softmaxBuf, gradientBuff, div, n, 1, nBeta, nClasses, zero, 0u)); - } - - if (hessianNT) - { - // fBuf = fBuf + 1 if y_i == K - const algorithmFPType one = algorithmFPType(1); - DAAL_CHECK_STATUS(status, updateProba(yBuff, softmaxBuf, n, nClasses, one)); - } - DAAL_CHECK_STATUS(status, gradientNT->releaseBlockOfRows(gr)); - } - - return status; -} - -template -services::Status CrossEntropyLossKernelOneAPI::compute(NumericTable * data, NumericTable * dependentVariables, - NumericTable * argument, NumericTable * value, - NumericTable * hessian, NumericTable * gradient, - NumericTable * nonSmoothTermValue, - NumericTable * proximalProjectionNT, - NumericTable * lipschitzConstantNT, Parameter * parameter) -{ - services::Status status; - DAAL_ASSERT(data != nullptr); - DAAL_ASSERT(parameter != nullptr); - DAAL_ASSERT(dependentVariables != nullptr); - DAAL_ASSERT(argument != nullptr); - - const size_t nRows = data->getNumberOfRows(); - const size_t p = data->getNumberOfColumns(); - DAAL_OVERFLOW_CHECK_BY_ADDING(size_t, p, 1); - const size_t nBeta = p + 1; - const size_t nClasses = parameter->nClasses; - - DAAL_ASSERT(argument->getNumberOfColumns() == 1); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(uint32_t, nBeta, nClasses); - DAAL_ASSERT(argument->getNumberOfRows() == nClasses * nBeta); - - BlockDescriptor agrBlock; - DAAL_CHECK_STATUS(status, argument->getBlockOfRows(0, nClasses * nBeta, ReadWriteMode::readOnly, 
agrBlock)); - - services::internal::Buffer argBuff = agrBlock.getBuffer(); - - NumericTable * ntInd = parameter->batchIndices.get(); - const algorithmFPType l1reg = parameter->penaltyL1; - const algorithmFPType l2reg = parameter->penaltyL2; - - if (ntInd == nullptr || (ntInd != nullptr && ntInd->getNumberOfColumns() == nRows)) - { - BlockDescriptor xBlock; - BlockDescriptor yBlock; - - DAAL_CHECK_STATUS(status, data->getBlockOfRows(0, nRows, ReadWriteMode::readOnly, xBlock)); - DAAL_CHECK_STATUS(status, dependentVariables->getBlockOfRows(0, nRows, ReadWriteMode::readOnly, yBlock)); - - const services::internal::Buffer xBuff = xBlock.getBuffer(); - const services::internal::Buffer yBuff = yBlock.getBuffer(); - - const size_t nBatch = nRows; - const bool isSourceData = true; - const bool interceptFlag = parameter->interceptFlag; - - status = doCompute(nBatch, p, nClasses, xBuff, yBuff, argBuff, value, gradient, hessian, nonSmoothTermValue, proximalProjectionNT, - lipschitzConstantNT, l1reg, l2reg, interceptFlag, isSourceData); - - DAAL_CHECK_STATUS(status, data->releaseBlockOfRows(xBlock)); - DAAL_CHECK_STATUS(status, dependentVariables->releaseBlockOfRows(yBlock)); - } - else - { - const size_t nBatch = ntInd->getNumberOfColumns(); - - DAAL_CHECK_STATUS(status, HelperObjectiveFunction::lazyAllocate(_uX, nBatch * nBeta)); - DAAL_CHECK_STATUS(status, HelperObjectiveFunction::lazyAllocate(_uY, nBatch)); - - services::internal::Buffer xBuff = _uX.get(); - services::internal::Buffer yBuff = _uY.get(); - - const bool isSourceData = false; - const bool interceptFlag = false; - - BlockDescriptor rInd; - DAAL_CHECK_STATUS(status, ntInd->getBlockOfRows(0, 1, ReadWriteMode::readOnly, rInd)); - services::internal::Buffer indBuff = rInd.getBuffer(); - - BlockDescriptor xBlock; - BlockDescriptor yBlock; - - DAAL_CHECK_STATUS(status, data->getBlockOfRows(0, nRows, ReadWriteMode::readOnly, xBlock)); - DAAL_CHECK_STATUS(status, dependentVariables->getBlockOfRows(0, nRows, 
ReadWriteMode::readOnly, yBlock)); - - { - DAAL_ITTNOTIFY_SCOPED_TASK(getXY); - DAAL_CHECK_STATUS(status, HelperObjectiveFunction::getXY(xBlock.getBuffer(), yBlock.getBuffer(), indBuff, xBuff, yBuff, nBatch, p, - parameter->interceptFlag)); - } - - DAAL_CHECK_STATUS(status, ntInd->releaseBlockOfRows(rInd)); - - status = doCompute(nBatch, p, nClasses, xBuff, yBuff, argBuff, value, gradient, hessian, nonSmoothTermValue, proximalProjectionNT, - lipschitzConstantNT, l1reg, l2reg, interceptFlag, isSourceData); - - DAAL_CHECK_STATUS(status, data->releaseBlockOfRows(xBlock)); - DAAL_CHECK_STATUS(status, dependentVariables->releaseBlockOfRows(yBlock)); - } - - DAAL_CHECK_STATUS(status, argument->releaseBlockOfRows(agrBlock)); - return status; -} - -} // namespace internal -} // namespace cross_entropy_loss -} // namespace optimization_solver -} // namespace algorithms -} // namespace daal diff --git a/cpp/daal/src/algorithms/objective_function/logistic_loss/BUILD b/cpp/daal/src/algorithms/objective_function/logistic_loss/BUILD index def0fe672b9..17933cd2b5e 100644 --- a/cpp/daal/src/algorithms/objective_function/logistic_loss/BUILD +++ b/cpp/daal/src/algorithms/objective_function/logistic_loss/BUILD @@ -4,10 +4,9 @@ load("@onedal//dev/bazel:daal.bzl", "daal_module") daal_module( name = "kernel", auto = True, - opencl = True, deps = [ "@onedal//cpp/daal:core", - "@onedal//cpp/daal:sycl", + "@onedal//cpp/daal:engines", "@onedal//cpp/daal/src/algorithms/objective_function:kernel", ], ) diff --git a/cpp/daal/src/algorithms/objective_function/logistic_loss/logistic_loss_dense_default_batch_container.h b/cpp/daal/src/algorithms/objective_function/logistic_loss/logistic_loss_dense_default_batch_container.h index 731c5093bc8..f9d8f8cf999 100644 --- a/cpp/daal/src/algorithms/objective_function/logistic_loss/logistic_loss_dense_default_batch_container.h +++ b/cpp/daal/src/algorithms/objective_function/logistic_loss/logistic_loss_dense_default_batch_container.h @@ -26,7 +26,6 @@ 
#include "algorithms/optimization_solver/objective_function/logistic_loss_batch.h" #include "src/algorithms/objective_function/logistic_loss/logistic_loss_dense_default_batch_kernel.h" -#include "src/algorithms/objective_function/logistic_loss/oneapi/logistic_loss_dense_default_kernel_oneapi.h" namespace daal { @@ -41,17 +40,7 @@ namespace interface2 template BatchContainer::BatchContainer(daal::services::Environment::env * daalEnv) { - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - - if (deviceInfo.isCpu) - { - __DAAL_INITIALIZE_KERNELS(internal::LogLossKernel, algorithmFPType, method); - } - else - { - _kernel = new internal::LogLossKernelOneAPI(); - } + __DAAL_INITIALIZE_KERNELS(internal::LogLossKernel, algorithmFPType, method); } template @@ -104,23 +93,9 @@ services::Status BatchContainer::compute() lipschitzConstant = result->get(objective_function::lipschitzConstantIdx).get(); } - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - - if (deviceInfo.isCpu || nonSmoothTermValue || proximalProjection || lipschitzConstant) - { - __DAAL_CALL_KERNEL(env, internal::LogLossKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, - input->get(logistic_loss::data).get(), input->get(logistic_loss::dependentVariables).get(), - input->get(logistic_loss::argument).get(), value, hessian, gradient, nonSmoothTermValue, proximalProjection, - lipschitzConstant, parameter); - } - else - { - return ((internal::LogLossKernelOneAPI *)(_kernel)) - ->compute(input->get(logistic_loss::data).get(), input->get(logistic_loss::dependentVariables).get(), - input->get(logistic_loss::argument).get(), value, hessian, gradient, nonSmoothTermValue, proximalProjection, lipschitzConstant, - parameter); - } + __DAAL_CALL_KERNEL(env, internal::LogLossKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, input->get(logistic_loss::data).get(), + 
input->get(logistic_loss::dependentVariables).get(), input->get(logistic_loss::argument).get(), value, hessian, gradient, + nonSmoothTermValue, proximalProjection, lipschitzConstant, parameter); } } // namespace interface2 diff --git a/cpp/daal/src/algorithms/objective_function/logistic_loss/logistic_loss_dense_default_batch_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/objective_function/logistic_loss/logistic_loss_dense_default_batch_fpt_dispatcher.cpp index 9e92158b390..41aa6656be4 100644 --- a/cpp/daal/src/algorithms/objective_function/logistic_loss/logistic_loss_dense_default_batch_fpt_dispatcher.cpp +++ b/cpp/daal/src/algorithms/objective_function/logistic_loss/logistic_loss_dense_default_batch_fpt_dispatcher.cpp @@ -25,8 +25,8 @@ namespace daal { namespace algorithms { -__DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL(optimization_solver::logistic_loss::interface2::BatchContainer, batch, DAAL_FPTYPE, - optimization_solver::logistic_loss::defaultDense) +__DAAL_INSTANTIATE_DISPATCH_CONTAINER(optimization_solver::logistic_loss::interface2::BatchContainer, batch, DAAL_FPTYPE, + optimization_solver::logistic_loss::defaultDense) namespace optimization_solver { namespace logistic_loss diff --git a/cpp/daal/src/algorithms/objective_function/logistic_loss/logistic_loss_dense_default_kernel_oneapi_fpt.cpp b/cpp/daal/src/algorithms/objective_function/logistic_loss/logistic_loss_dense_default_kernel_oneapi_fpt.cpp deleted file mode 100644 index 50970e60472..00000000000 --- a/cpp/daal/src/algorithms/objective_function/logistic_loss/logistic_loss_dense_default_kernel_oneapi_fpt.cpp +++ /dev/null @@ -1,44 +0,0 @@ -/* file: logistic_loss_dense_default_kernel_oneapi_fpt.cpp */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of Logistic Loss Batch Kernel for GPU. -//-- -*/ - -#include "src/algorithms/objective_function/logistic_loss/oneapi/logistic_loss_dense_default_kernel_oneapi.h" -#include "src/algorithms/objective_function/logistic_loss/oneapi/logistic_loss_dense_default_oneapi_impl.i" -#include "src/algorithms/objective_function/logistic_loss/logistic_loss_dense_default_batch_container.h" - -namespace daal -{ -namespace algorithms -{ -namespace optimization_solver -{ -namespace logistic_loss -{ -namespace internal -{ -template class LogLossKernelOneAPI; - -} // namespace internal -} // namespace logistic_loss -} // namespace optimization_solver -} // namespace algorithms -} // namespace daal diff --git a/cpp/daal/src/algorithms/objective_function/logistic_loss/oneapi/cl_kernel/logistic_loss_dense_default.cl b/cpp/daal/src/algorithms/objective_function/logistic_loss/oneapi/cl_kernel/logistic_loss_dense_default.cl deleted file mode 100644 index 53be50d6115..00000000000 --- a/cpp/daal/src/algorithms/objective_function/logistic_loss/oneapi/cl_kernel/logistic_loss_dense_default.cl +++ /dev/null @@ -1,146 +0,0 @@ -/* file: logistic_loss_dense_default.cl */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of Log Loss OpenCL kernels. -//-- -*/ - -#ifndef __OBJECTIVE_LOGISTIC_LOSS_KERNELS_CL__ -#define __OBJECTIVE_LOGISTIC_LOSS_KERNELS_CL__ - -#include - -#define DECLARE_SOURCE_DAAL(name, src) static const char *(name) = #src; - -DECLARE_SOURCE_DAAL( - clKernelLogLoss, - - inline void __sum(__global algorithmFPType * partialSums, __local algorithmFPType * localSum) { - const uint global_group_id = get_group_id(0); - const uint group_size = get_local_size(0); - const uint local_id = get_local_id(0); - - for (uint stride = group_size / 2; stride > 0; stride /= 2) - { - barrier(CLK_LOCAL_MEM_FENCE); - - if (local_id < stride) - { - localSum[local_id] += localSum[local_id + stride]; - } - } - - if (local_id == 0) - { - partialSums[global_group_id] = localSum[0]; - } - } - - __kernel void logLoss(const __global algorithmFPType * const y, const __global algorithmFPType * const sigma, __global algorithmFPType * result) { - const uint i = get_global_id(0); - const algorithmFPType one = (algorithmFPType)1.0; - - result[i] = y[i] * log(sigma[i]) + (one - y[i]) * log(one - sigma[i]); - } - - __kernel void sigmoid(const __global algorithmFPType * const xb, const algorithmFPType expThreshold, const uint calculateInverse, - __global algorithmFPType * result) { - const uint i = get_global_id(0); - const algorithmFPType one = (algorithmFPType)1.0; - - const algorithmFPType f = fmax(-xb[i], expThreshold); - const algorithmFPType p = one / (one + exp(f)); - - if 
(calculateInverse != 0) - { - const uint firstColIdx = 2 * i; - - result[firstColIdx] = one - p; - result[firstColIdx + 1] = p; - } - else - { - result[i] = p; - } - } - - __kernel void hessian(const __global algorithmFPType * const x, const uint ldx, const __global algorithmFPType * const sigma, const uint n, - __global algorithmFPType * h, const uint ldh, const uint offset, const algorithmFPType alpha) { - const uint row = get_global_id(0); - const uint col = get_global_id(1); - const algorithmFPType one = (algorithmFPType)1.0; - - if (col < row) return; - - algorithmFPType sum = (algorithmFPType)0.0; - - for (uint i = 0; i < n; i++) - { - sum += x[i * ldx + row] * x[i * ldx + col] * sigma[i] * (one - sigma[i]); - } - - h[(row + offset) * ldh + (col + offset)] = sum * alpha; - h[(col + offset) * ldh + (row + offset)] = sum * alpha; - } - - __kernel void hessianIntercept(const __global algorithmFPType * const x, const uint ldx, const __global algorithmFPType * const sigma, - const uint n, __global algorithmFPType * h, const uint ldh, const algorithmFPType alpha) { - const uint row = get_global_id(0); - const algorithmFPType one = (algorithmFPType)1.0; - - algorithmFPType sum = (algorithmFPType)0.0; - for (uint i = 0; i < n; i++) - { - sum += x[i * ldx + row] * sigma[i] * (one - sigma[i]); - } - - h[(row + 1) * ldh] = sum * alpha; - h[(row + 1)] = sum * alpha; - } - - __kernel void hessianInterceptH0(const __global algorithmFPType * const sigma, const uint n, __global algorithmFPType * partialSums) { - __local algorithmFPType localSum[LOCAL_SUM_SIZE]; - - uint local_id = get_local_id(0); - uint global_id = get_global_id(0); - const algorithmFPType one = (algorithmFPType)1.0; - - if (global_id >= n) - { - localSum[local_id] = (algorithmFPType)0; - } - else - { - localSum[local_id] = sigma[global_id] * (one - sigma[global_id]); - } - - __sum(partialSums, localSum); - } - - __kernel void hessianRegulization(__global algorithmFPType * h, const uint ldh, const 
algorithmFPType beta) { - // not regulated b0 - const uint row = get_global_id(0) + 1; - h[row * ldh + row] += beta; - } - -); - -#undef DECLARE_SOURCE_DAAL - -#endif diff --git a/cpp/daal/src/algorithms/objective_function/logistic_loss/oneapi/logistic_loss_dense_default_kernel_oneapi.h b/cpp/daal/src/algorithms/objective_function/logistic_loss/oneapi/logistic_loss_dense_default_kernel_oneapi.h deleted file mode 100644 index 6270695dc25..00000000000 --- a/cpp/daal/src/algorithms/objective_function/logistic_loss/oneapi/logistic_loss_dense_default_kernel_oneapi.h +++ /dev/null @@ -1,108 +0,0 @@ -/* file: logistic_loss_dense_default_kernel_oneapi.h */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of Logistic Loss Batch Kernel for GPU. 
-//-- -*/ - -#ifndef __LOGISTIC_LOSS_DENSE_DEFAULT_KERNEL_ONEAPI_H__ -#define __LOGISTIC_LOSS_DENSE_DEFAULT_KERNEL_ONEAPI_H__ - -#include "src/sycl/blas_gpu.h" -#include "src/algorithms/objective_function/common/oneapi/objective_function_utils_oneapi.h" -#include "algorithms/optimization_solver/objective_function/logistic_loss_types.h" - -namespace daal -{ -namespace algorithms -{ -namespace optimization_solver -{ -namespace logistic_loss -{ -namespace internal -{ -template -class LogLossKernelOneAPI : public Kernel -{}; - -template -class LogLossKernelOneAPI : public Kernel -{ - using HelperObjectiveFunction = objective_function::internal::HelperObjectiveFunction; - -public: - services::Status compute(NumericTable * data, NumericTable * dependentVariables, NumericTable * argument, NumericTable * value, - NumericTable * hessian, NumericTable * gradient, NumericTable * nonSmoothTermValue, NumericTable * proximalProjection, - NumericTable * lipschitzConstant, Parameter * parameter); - - static services::Status applyBeta(const services::internal::Buffer & x, const services::internal::Buffer & beta, - services::internal::Buffer & xb, const uint32_t n, const uint32_t p, const uint32_t offset); - - static services::Status applyGradient(const services::internal::Buffer & x, - const services::internal::Buffer & sub, - services::internal::Buffer & gradient, const algorithmFPType alpha, const uint32_t n, - const uint32_t p, const algorithmFPType beta, const uint32_t offset); - - static services::Status applyHessian(const services::internal::Buffer & x, - const services::internal::Buffer & sigma, const uint32_t n, const uint32_t p, - services::internal::Buffer & h, const uint32_t nBeta, const uint32_t offset, - const algorithmFPType alpha); - - // TODO: move in common services - static services::Status logLoss(const services::internal::Buffer & y, const services::internal::Buffer & sigma, - services::internal::Buffer & result, const uint32_t n); - - // TODO: move in common 
services - static services::Status sigmoids(const services::internal::Buffer & x, services::internal::Buffer & result, - const uint32_t n, bool calculateInverse = false); - - static services::Status betaIntercept(const services::internal::Buffer & arg, services::internal::Buffer & x, - const uint32_t n); - -private: - services::Status doCompute(const uint32_t nBatch, const uint32_t nFeatures, const daal::services::internal::Buffer & xBuff, - const daal::services::internal::Buffer & yBuff, - const daal::services::internal::Buffer & argBuff, NumericTable * valueNT, NumericTable * gradientNT, - NumericTable * hessianNT, NumericTable * nonSmoothTermValueNT, NumericTable * proximalProjectionNT, - NumericTable * lipschitzConstantNT, const algorithmFPType l1reg, const algorithmFPType l2reg, const bool interceptFlag, - const bool isSourceData); - - static services::Status hessianRegulization(services::internal::Buffer & h, const uint32_t nBeta, const algorithmFPType l2); - - static services::Status hessianIntercept(const services::internal::Buffer & x, - const services::internal::Buffer & sigma, const uint32_t n, const uint32_t p, - services::internal::Buffer & h, const uint32_t nBeta, const algorithmFPType alpha); - - static services::Status buildProgram(services::internal::sycl::ClKernelFactoryIface & factory); - - services::internal::sycl::UniversalBuffer _uX; - services::internal::sycl::UniversalBuffer _uY; - services::internal::sycl::UniversalBuffer _fUniversal; - services::internal::sycl::UniversalBuffer _sigmoidUniversal; - services::internal::sycl::UniversalBuffer _subSigmoidYUniversal; -}; - -} // namespace internal -} // namespace logistic_loss -} // namespace optimization_solver -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/objective_function/logistic_loss/oneapi/logistic_loss_dense_default_oneapi_impl.i b/cpp/daal/src/algorithms/objective_function/logistic_loss/oneapi/logistic_loss_dense_default_oneapi_impl.i 
deleted file mode 100644 index 69e124d1ed4..00000000000 --- a/cpp/daal/src/algorithms/objective_function/logistic_loss/oneapi/logistic_loss_dense_default_oneapi_impl.i +++ /dev/null @@ -1,576 +0,0 @@ -/* file: logistic_loss_dense_default_oneapi_impl.i */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of Logistic Loss algorithm for GPU. 
-//-- -*/ - -#include "src/algorithms/objective_function/logistic_loss/oneapi/cl_kernel/logistic_loss_dense_default.cl" -#include "src/services/service_utils.h" -#include "src/externals/service_math.h" - -#include "src/externals/service_profiler.h" - -namespace daal -{ -namespace algorithms -{ -namespace optimization_solver -{ -namespace logistic_loss -{ -namespace internal -{ -using namespace daal::services::internal; -using namespace daal::services::internal::sycl; -using namespace daal::internal; - -// Calculate X^T*beta -template -services::Status LogLossKernelOneAPI::applyBeta(const services::internal::Buffer & x, - const services::internal::Buffer & beta, - services::internal::Buffer & xb, const uint32_t n, - const uint32_t p, const uint32_t offset) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(applyBeta); - DAAL_ASSERT(x.size() == size_t(n) * size_t(p)); // overflows checked in the algorithm - DAAL_ASSERT(beta.size() >= size_t(offset) + size_t(p)); - DAAL_ASSERT(xb.size() >= size_t(n)); - return BlasGpu::xgemm(math::Layout::RowMajor, math::Transpose::NoTrans, math::Transpose::NoTrans, n, 1, p, algorithmFPType(1), x, - p, 0, beta, 1, offset, algorithmFPType(0), xb, 1, 0); -} - -// Calculate X^T*(y - sigma) + 2*L2*beta -template -services::Status LogLossKernelOneAPI::applyGradient(const services::internal::Buffer & x, - const services::internal::Buffer & sub, - services::internal::Buffer & gradient, - const algorithmFPType alpha, const uint32_t n, const uint32_t p, - const algorithmFPType beta, const uint32_t offset) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(applyGradient); - DAAL_ASSERT(x.size() == size_t(n) * size_t(p)); // overflows checked in the algorithm - DAAL_ASSERT(sub.size() >= size_t(n)); - DAAL_ASSERT(gradient.size() >= size_t(offset) + size_t(p)); - return BlasGpu::xgemm(math::Layout::RowMajor, math::Transpose::Trans, math::Transpose::NoTrans, p, 1, n, alpha, x, p, 0, sub, 1, - 0, beta, gradient, 1, offset); -} - -template -services::Status 
LogLossKernelOneAPI::applyHessian( - const services::internal::Buffer & x, const services::internal::Buffer & sigma, const uint32_t n, - const uint32_t p, services::internal::Buffer & h, const uint32_t nBeta, const uint32_t offset, const algorithmFPType alpha) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(applyHessian); - services::Status status; - - DAAL_ASSERT(x.size() == n * p); //overflows checked in the algorithm - DAAL_ASSERT(h.size() == nBeta * nBeta); - DAAL_ASSERT(sigma.size() == n); - - ExecutionContextIface & ctx = services::internal::getDefaultContext(); - ClKernelFactoryIface & factory = ctx.getClKernelFactory(); - - status |= buildProgram(factory); - DAAL_CHECK_STATUS_VAR(status); - - const char * const kernelName = "hessian"; - KernelPtr kernel = factory.getKernel(kernelName, status); - DAAL_CHECK_STATUS_VAR(status); - - KernelArguments args(8, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, x, AccessModeIds::read); - args.set(1, p); - args.set(2, sigma, AccessModeIds::read); - args.set(3, n); - args.set(4, h, AccessModeIds::write); - args.set(5, nBeta); - args.set(6, offset); - args.set(7, alpha); - - KernelRange range(p, p); - - ctx.run(range, kernel, args, status); - - return services::Status(); -} - -// ylog(sigm) + (1-y)log(1-sigma) -template -services::Status LogLossKernelOneAPI::logLoss(const services::internal::Buffer & y, - const services::internal::Buffer & sigma, - services::internal::Buffer & result, const uint32_t n) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(logLoss); - services::Status status; - - DAAL_ASSERT(y.size() == n); - DAAL_ASSERT(sigma.size() == n); - DAAL_ASSERT(result.size() == n); - - ExecutionContextIface & ctx = services::internal::getDefaultContext(); - ClKernelFactoryIface & factory = ctx.getClKernelFactory(); - - status |= buildProgram(factory); - DAAL_CHECK_STATUS_VAR(status); - - const char * const kernelName = "logLoss"; - KernelPtr kernel = factory.getKernel(kernelName, status); - DAAL_CHECK_STATUS_VAR(status); - - KernelArguments 
args(3, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, y, AccessModeIds::read); - args.set(1, sigma, AccessModeIds::read); - args.set(2, result, AccessModeIds::write); - - KernelRange range(n); - ctx.run(range, kernel, args, status); - return status; -} - -// sigmoid(x) = 1/(1+exp(-x)) -// if calculateInverse = true, x[i][1] = 1 - sigmoid(x[i][0]) -template -services::Status LogLossKernelOneAPI::sigmoids(const services::internal::Buffer & x, - services::internal::Buffer & result, const uint32_t n, - bool calculateInverse) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(sigmoids); - services::Status status; - - DAAL_ASSERT(x.size() >= n); - DAAL_ASSERT(calculateInverse ? result.size() >= 2 * n : result.size() >= n); - - ExecutionContextIface & ctx = services::internal::getDefaultContext(); - ClKernelFactoryIface & factory = ctx.getClKernelFactory(); - - status |= buildProgram(factory); - DAAL_CHECK_STATUS_VAR(status); - - const char * const kernelName = "sigmoid"; - KernelPtr kernel = factory.getKernel(kernelName, status); - DAAL_CHECK_STATUS_VAR(status); - - const algorithmFPType expThreshold = math::expThreshold(); - - KernelArguments args(4, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, x, AccessModeIds::read); - args.set(1, expThreshold); - args.set(2, uint32_t(calculateInverse)); - args.set(3, result, AccessModeIds::write); - - KernelRange range(n); - ctx.run(range, kernel, args, status); - return status; -} - -template -services::Status LogLossKernelOneAPI::betaIntercept(const services::internal::Buffer & arg, - services::internal::Buffer & x, const uint32_t n) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(betaIntercept); - services::Status status; - - // xb += b0 - const algorithmFPType zero = algorithmFPType(0); - DAAL_CHECK_STATUS(status, HelperObjectiveFunction::setElem(0, zero, x)); - DAAL_CHECK_STATUS(status, HelperObjectiveFunction::addVectorScalar(x, arg, 0, n)); - - return status; -} - -template -services::Status LogLossKernelOneAPI::hessianIntercept(const 
services::internal::Buffer & x, - const services::internal::Buffer & sigma, - const uint32_t n, const uint32_t p, - services::internal::Buffer & h, - const uint32_t nBeta, const algorithmFPType alpha) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(hessianIntercept); - services::Status status; - - ExecutionContextIface & ctx = services::internal::getDefaultContext(); - ClKernelFactoryIface & factory = ctx.getClKernelFactory(); - - status |= buildProgram(factory); - DAAL_CHECK_STATUS_VAR(status); - - const TypeIds::Id idType = TypeIds::id(); - - DAAL_ASSERT(x.size() == n * p); //overflows checked in the algorithm - DAAL_ASSERT(h.size() == nBeta * nBeta); - DAAL_ASSERT(sigma.size() == n); - - { - const char * const kernelName = "hessianIntercept"; - KernelPtr kernel = factory.getKernel(kernelName, status); - DAAL_CHECK_STATUS_VAR(status); - - KernelArguments args(7, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, x, AccessModeIds::read); - args.set(1, p); - args.set(2, sigma, AccessModeIds::read); - args.set(3, n); - args.set(4, h, AccessModeIds::write); - args.set(5, nBeta); - args.set(6, alpha); - - KernelRange range(nBeta); - - ctx.run(range, kernel, args, status); - } - { - // h[0][0] = alpha*sigma[i]*(1-sima[i]) - algorithmFPType h00 = algorithmFPType(0); - const char * const kernelName = "hessianInterceptH0"; - - KernelPtr kernel = factory.getKernel(kernelName, status); - DAAL_CHECK_STATUS_VAR(status); - - KernelNDRange range(1); - - size_t workItemsPerGroup = 256; - - const size_t nWorkGroups = HelperObjectiveFunction::getWorkgroupsCount(n, workItemsPerGroup); - - KernelRange localRange(workItemsPerGroup); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, nWorkGroups, workItemsPerGroup); - KernelRange globalRange(nWorkGroups * workItemsPerGroup); - - range.local(localRange, status); - range.global(globalRange, status); - DAAL_CHECK_STATUS_VAR(status); - - UniversalBuffer buffer = ctx.allocate(idType, nWorkGroups, status); - DAAL_CHECK_STATUS_VAR(status); - 
services::internal::Buffer reductionBuffer = buffer.get(); - - KernelArguments args(3, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, sigma, AccessModeIds::read); - args.set(1, n); - args.set(2, reductionBuffer, AccessModeIds::write); - - ctx.run(range, kernel, args, status); - - DAAL_CHECK_STATUS(status, HelperObjectiveFunction::sumReduction(reductionBuffer, nWorkGroups, h00)); - - h00 *= alpha; - DAAL_CHECK_STATUS(status, HelperObjectiveFunction::setElem(0, h00, h)); - } - return services::Status(); -} - -// ylog(sigm) + (1-y)log(1-sigma) -template -services::Status LogLossKernelOneAPI::hessianRegulization(services::internal::Buffer & h, - const uint32_t nBeta, const algorithmFPType l2) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(hessianRegulization); - - DAAL_ASSERT(h.size() == nBeta * nBeta); //overflows checked in the algorithm - - services::Status status; - - ExecutionContextIface & ctx = services::internal::getDefaultContext(); - ClKernelFactoryIface & factory = ctx.getClKernelFactory(); - - status |= buildProgram(factory); - DAAL_CHECK_STATUS_VAR(status); - - const char * const kernelName = "hessianRegulization"; - KernelPtr kernel = factory.getKernel(kernelName, status); - DAAL_CHECK_STATUS_VAR(status); - - const algorithmFPType beta = l2 * algorithmFPType(2); - - KernelArguments args(3, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, h, AccessModeIds::write); - args.set(1, nBeta); - args.set(2, beta); - - KernelRange range(nBeta - 1); - - ctx.run(range, kernel, args, status); - - return status; -} - -template -services::Status LogLossKernelOneAPI::buildProgram(ClKernelFactoryIface & factory) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(buildProgram); - - services::Status status; - services::String options = getKeyFPType(); - - services::String cachekey("__daal_algorithms_optimization_solver_logistic_loss_"); - cachekey.add(options); - - options.add(" -D LOCAL_SUM_SIZE=256 "); - - factory.build(ExecutionTargetIds::device, cachekey.c_str(), clKernelLogLoss, 
options.c_str(), status); - return status; -} - -template -services::Status LogLossKernelOneAPI::doCompute( - const uint32_t nBatch, const uint32_t nFeatures, const daal::services::internal::Buffer & xBuff, - const daal::services::internal::Buffer & yBuff, const daal::services::internal::Buffer & argBuff, - NumericTable * valueNT, NumericTable * gradientNT, NumericTable * hessianNT, NumericTable * nonSmoothTermValueNT, - NumericTable * proximalProjectionNT, NumericTable * lipschitzConstantNT, const algorithmFPType l1reg, const algorithmFPType l2reg, - const bool interceptFlag, const bool isSourceData) -{ - services::Status status; - - ExecutionContextIface & ctx = services::internal::getDefaultContext(); - - const uint32_t nBeta = nFeatures + 1; - DAAL_ASSERT(nBeta > nFeatures); - const uint32_t ldX = isSourceData ? nFeatures : nBeta; - const uint32_t offsetX = isSourceData ? 1 : 0; - - const uint32_t n = nBatch; - - const TypeIds::Id idType = TypeIds::id(); - - if (valueNT == nullptr && gradientNT == nullptr && hessianNT == nullptr) - { - return services::ErrorMethodNotImplemented; - } - - DAAL_CHECK_STATUS(status, HelperObjectiveFunction::lazyAllocate(_fUniversal, n)); - services::internal::Buffer fBuf = _fUniversal.get(); - - //f = X*b + b0 - DAAL_CHECK_STATUS(status, applyBeta(xBuff, argBuff, fBuf, n, ldX, offsetX)); - - if (interceptFlag) - { - DAAL_CHECK_STATUS(status, betaIntercept(argBuff, fBuf, n)); - } - - DAAL_CHECK_STATUS(status, HelperObjectiveFunction::lazyAllocate(_sigmoidUniversal, n)); - services::internal::Buffer sigmoidBuf = _sigmoidUniversal.get(); - - //s = exp(-f) - DAAL_CHECK_STATUS(status, sigmoids(fBuf, sigmoidBuf, n)); - const algorithmFPType div = algorithmFPType(1) / algorithmFPType(n); - - if (valueNT) - { - DAAL_ASSERT(valueNT->getNumberOfRows() == 1); - - BlockDescriptor vr; - DAAL_CHECK_STATUS(status, valueNT->getBlockOfRows(0, 1, ReadWriteMode::readWrite, vr)); - algorithmFPType & value = *vr.getBlockPtr(); - - UniversalBuffer 
logLosUniversal = ctx.allocate(idType, n, status); - DAAL_CHECK_STATUS_VAR(status); - services::internal::Buffer logLossBuff = logLosUniversal.get(); - - value = algorithmFPType(0); - DAAL_CHECK_STATUS(status, logLoss(yBuff, sigmoidBuf, logLossBuff, n)); - DAAL_CHECK_STATUS(status, HelperObjectiveFunction::sum(logLossBuff, value, n)); - value *= -div; - - if (l1reg > 0 || l2reg > 0) - { - algorithmFPType reg = algorithmFPType(0); - // + l1*||Beta|| + l2*||Beta||**2 - // Beta = (B1, B2, ... Bk) - DAAL_CHECK_STATUS(status, HelperObjectiveFunction::regularization(argBuff, nBeta, 1, reg, l1reg, l2reg)); - - value += reg; - } - DAAL_CHECK_STATUS(status, valueNT->releaseBlockOfRows(vr)); - } - - if (gradientNT) - { - DAAL_ASSERT(gradientNT->getNumberOfRows() == nBeta); - - BlockDescriptor gr; - DAAL_CHECK_STATUS(status, gradientNT->getBlockOfRows(0, nBeta, ReadWriteMode::readWrite, gr)); - daal::services::internal::Buffer gradientBuff = gr.getBuffer(); - - DAAL_CHECK_STATUS(status, HelperObjectiveFunction::lazyAllocate(_subSigmoidYUniversal, n)); - services::internal::Buffer subSigmoidYBuff = _subSigmoidYUniversal.get(); - - // diff = sigmoid(Xb) - y - { - DAAL_ITTNOTIFY_SCOPED_TASK(subVectors); - DAAL_CHECK_STATUS(status, HelperObjectiveFunction::subVectors(sigmoidBuf, yBuff, subSigmoidYBuff, n)); - } - - const algorithmFPType coeffBeta = algorithmFPType(2) * l2reg; - if (l2reg > 0) - { - ctx.copy(gradientBuff, 1, argBuff, 1, nBeta - 1, status); - DAAL_CHECK_STATUS_VAR(status); - const algorithmFPType zero = algorithmFPType(0); - DAAL_CHECK_STATUS(status, HelperObjectiveFunction::setElem(0, zero, gradientBuff)); - } - - // gradient = (X^T(sigmoid(Xb) - y)/n + 2*l2*||Beta|| - DAAL_CHECK_STATUS(status, applyGradient(xBuff, subSigmoidYBuff, gradientBuff, div, n, ldX, coeffBeta, offsetX)); - - if (interceptFlag) - { - DAAL_ITTNOTIFY_SCOPED_TASK(interceptCalculate); - // g[0] = sum(sigmoid(Xb) - y)/n - algorithmFPType g0 = algorithmFPType(0); - DAAL_CHECK_STATUS(status, 
HelperObjectiveFunction::sum(subSigmoidYBuff, g0, n)); - g0 *= div; - DAAL_CHECK_STATUS(status, HelperObjectiveFunction::setElem(0, g0, gradientBuff)); - } - - DAAL_CHECK_STATUS(status, gradientNT->releaseBlockOfRows(gr)); - } - - if (hessianNT) - { - DAAL_ASSERT(hessianNT->getNumberOfRows() == nBeta); - DAAL_ASSERT(hessianNT->getNumberOfColumns() == nBeta); - - BlockDescriptor hr; - DAAL_CHECK_STATUS(status, hessianNT->getBlockOfRows(0, nBeta, ReadWriteMode::readWrite, hr)); - daal::services::internal::Buffer hessianBuff = hr.getBuffer(); - - DAAL_CHECK_STATUS(status, applyHessian(xBuff, sigmoidBuf, n, ldX, hessianBuff, nBeta, offsetX, div)); - - if (interceptFlag) - { - DAAL_CHECK_STATUS(status, hessianIntercept(xBuff, sigmoidBuf, n, ldX, hessianBuff, nBeta, div)); - } - - if (l2reg > 0) - { - DAAL_CHECK_STATUS(status, hessianRegulization(hessianBuff, nBeta, l2reg)); - } - DAAL_CHECK_STATUS(status, hessianNT->releaseBlockOfRows(hr)); - } - - return status; -} - -template -services::Status LogLossKernelOneAPI::compute(NumericTable * data, NumericTable * dependentVariables, - NumericTable * argument, NumericTable * value, NumericTable * hessian, - NumericTable * gradient, NumericTable * nonSmoothTermValue, - NumericTable * proximalProjectionNT, NumericTable * lipschitzConstantNT, - Parameter * parameter) -{ - services::Status status; - DAAL_ASSERT(data != nullptr); - DAAL_ASSERT(parameter != nullptr); - DAAL_ASSERT(dependentVariables != nullptr); - DAAL_ASSERT(argument != nullptr); - - const size_t nRows = data->getNumberOfRows(); - const size_t p = data->getNumberOfColumns(); - DAAL_OVERFLOW_CHECK_BY_ADDING(size_t, p, 1); - const size_t nBeta = p + 1; - - DAAL_ASSERT(argument->getNumberOfColumns() == 1); - DAAL_ASSERT(argument->getNumberOfRows() == nBeta); - - BlockDescriptor agrBlock; - DAAL_CHECK_STATUS(status, argument->getBlockOfRows(0, nBeta, ReadWriteMode::readOnly, agrBlock)); - - const services::internal::Buffer argBuff = agrBlock.getBuffer(); - - 
NumericTable * ntInd = parameter->batchIndices.get(); - const algorithmFPType l1reg = parameter->penaltyL1; - const algorithmFPType l2reg = parameter->penaltyL2; - - if (ntInd == nullptr || (ntInd != nullptr && ntInd->getNumberOfColumns() == nRows)) - { - BlockDescriptor xBlock; - BlockDescriptor yBlock; - - DAAL_CHECK_STATUS(status, data->getBlockOfRows(0, nRows, ReadWriteMode::readOnly, xBlock)); - DAAL_CHECK_STATUS(status, dependentVariables->getBlockOfRows(0, nRows, ReadWriteMode::readOnly, yBlock)); - - const services::internal::Buffer xBuff = xBlock.getBuffer(); - const services::internal::Buffer yBuff = yBlock.getBuffer(); - - const size_t nBatch = nRows; - const bool isSourceData = true; - const bool interceptFlag = parameter->interceptFlag; - - status = doCompute(nBatch, p, xBuff, yBuff, argBuff, value, gradient, hessian, nonSmoothTermValue, proximalProjectionNT, lipschitzConstantNT, - l1reg, l2reg, interceptFlag, isSourceData); - - DAAL_CHECK_STATUS(status, data->releaseBlockOfRows(xBlock)); - DAAL_CHECK_STATUS(status, dependentVariables->releaseBlockOfRows(yBlock)); - } - else - { - const size_t nBatch = ntInd->getNumberOfColumns(); - // TODO: if (nBatch == 1) - - DAAL_CHECK_STATUS(status, HelperObjectiveFunction::lazyAllocate(_uX, nBatch * nBeta)); - DAAL_CHECK_STATUS(status, HelperObjectiveFunction::lazyAllocate(_uY, nBatch)); - - services::internal::Buffer xBuff = _uX.get(); - services::internal::Buffer yBuff = _uY.get(); - - const bool isSourceData = false; - const bool interceptFlag = false; - - BlockDescriptor rInd; - DAAL_CHECK_STATUS(status, ntInd->getBlockOfRows(0, 1, ReadWriteMode::readOnly, rInd)); - services::internal::Buffer indBuff = rInd.getBuffer(); - - BlockDescriptor xBlock; - BlockDescriptor yBlock; - - DAAL_CHECK_STATUS(status, data->getBlockOfRows(0, nRows, ReadWriteMode::readOnly, xBlock)); - DAAL_CHECK_STATUS(status, dependentVariables->getBlockOfRows(0, nRows, ReadWriteMode::readOnly, yBlock)); - - { - 
DAAL_ITTNOTIFY_SCOPED_TASK(getXY); - DAAL_CHECK_STATUS(status, HelperObjectiveFunction::getXY(xBlock.getBuffer(), yBlock.getBuffer(), indBuff, xBuff, yBuff, nBatch, p, - parameter->interceptFlag)); - } - - DAAL_CHECK_STATUS(status, ntInd->releaseBlockOfRows(rInd)); - - status = doCompute(nBatch, p, xBuff, yBuff, argBuff, value, gradient, hessian, nonSmoothTermValue, proximalProjectionNT, lipschitzConstantNT, - l1reg, l2reg, interceptFlag, isSourceData); - - DAAL_CHECK_STATUS(status, data->releaseBlockOfRows(xBlock)); - DAAL_CHECK_STATUS(status, dependentVariables->releaseBlockOfRows(yBlock)); - } - - DAAL_CHECK_STATUS(status, argument->releaseBlockOfRows(agrBlock)); - return status; -} - -} // namespace internal -} // namespace logistic_loss -} // namespace optimization_solver -} // namespace algorithms -} // namespace daal diff --git a/cpp/daal/src/algorithms/objective_function/mse/mse_dense_default_batch_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/objective_function/mse/mse_dense_default_batch_fpt_dispatcher.cpp index a7d545ec925..06b6df63e7b 100644 --- a/cpp/daal/src/algorithms/objective_function/mse/mse_dense_default_batch_fpt_dispatcher.cpp +++ b/cpp/daal/src/algorithms/objective_function/mse/mse_dense_default_batch_fpt_dispatcher.cpp @@ -25,8 +25,8 @@ namespace daal { namespace algorithms { -__DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL(optimization_solver::mse::interface2::BatchContainer, batch, DAAL_FPTYPE, - optimization_solver::mse::defaultDense) +__DAAL_INSTANTIATE_DISPATCH_CONTAINER(optimization_solver::mse::interface2::BatchContainer, batch, DAAL_FPTYPE, + optimization_solver::mse::defaultDense) namespace optimization_solver { diff --git a/cpp/daal/src/algorithms/objective_function/objective_function_types_fpt.cpp b/cpp/daal/src/algorithms/objective_function/objective_function_types_fpt.cpp index 581b7bef99c..31277f37fe7 100644 --- a/cpp/daal/src/algorithms/objective_function/objective_function_types_fpt.cpp +++ 
b/cpp/daal/src/algorithms/objective_function/objective_function_types_fpt.cpp @@ -22,7 +22,7 @@ */ #include "algorithms/optimization_solver/objective_function/objective_function_types.h" -#include "data_management/data/internal/numeric_table_sycl_homogen.h" +#include "data_management/data/homogen_numeric_table.h" namespace daal { @@ -34,7 +34,6 @@ namespace objective_function { namespace interface1 { -using daal::data_management::internal::SyclHomogenNumericTable; /** * Allocates memory for storing results of the Objective function @@ -55,23 +54,13 @@ DAAL_EXPORT services::Status Result::allocate(const daal::algorithms::Input * in services::Status status; - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - const size_t nCols = algInput->get(argument)->getNumberOfColumns(); const size_t nRows = algInput->get(argument)->getNumberOfRows(); if (algParameter->resultsToCompute & gradient && !Argument::get(gradientIdx)) { NumericTablePtr nt; - if (deviceInfo.isCpu) - { - nt = HomogenNumericTable::create(1, nRows, NumericTable::doAllocate, zero, &status); - } - else - { - nt = SyclHomogenNumericTable::create(1, nRows, NumericTable::doAllocate, zero, &status); - } + nt = HomogenNumericTable::create(1, nRows, NumericTable::doAllocate, zero, &status); Argument::set(gradientIdx, staticPointerCast(nt)); } if (algParameter->resultsToCompute & value && !Argument::get(valueIdx)) diff --git a/cpp/daal/src/algorithms/optimization_solver/adagrad/adagrad_dense_default_batch_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/optimization_solver/adagrad/adagrad_dense_default_batch_fpt_dispatcher.cpp index a3c0e5a6bf5..864a48bac9d 100644 --- a/cpp/daal/src/algorithms/optimization_solver/adagrad/adagrad_dense_default_batch_fpt_dispatcher.cpp +++ b/cpp/daal/src/algorithms/optimization_solver/adagrad/adagrad_dense_default_batch_fpt_dispatcher.cpp @@ -25,8 +25,7 @@ namespace daal { namespace algorithms { 
-__DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL(optimization_solver::adagrad::BatchContainer, batch, DAAL_FPTYPE, - optimization_solver::adagrad::defaultDense) +__DAAL_INSTANTIATE_DISPATCH_CONTAINER(optimization_solver::adagrad::BatchContainer, batch, DAAL_FPTYPE, optimization_solver::adagrad::defaultDense) namespace optimization_solver { diff --git a/cpp/daal/src/algorithms/optimization_solver/iterative_solver_types_fpt.cpp b/cpp/daal/src/algorithms/optimization_solver/iterative_solver_types_fpt.cpp index d2c941954e9..d017fd8ddf6 100644 --- a/cpp/daal/src/algorithms/optimization_solver/iterative_solver_types_fpt.cpp +++ b/cpp/daal/src/algorithms/optimization_solver/iterative_solver_types_fpt.cpp @@ -22,7 +22,7 @@ */ #include "algorithms/optimization_solver/iterative_solver/iterative_solver_types.h" -#include "data_management/data/internal/numeric_table_sycl_homogen.h" +#include "data_management/data/homogen_numeric_table.h" namespace daal { @@ -51,20 +51,10 @@ DAAL_EXPORT services::Status Result::allocate(const daal::algorithms::Input * in services::Status status; - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - if (!get(minimum)) { dm::NumericTablePtr nt; - if (deviceInfo.isCpu) - { - nt = dm::HomogenNumericTable::create(1, nRows, dm::NumericTable::doAllocate, &status); - } - else - { - nt = dmi::SyclHomogenNumericTable::create(1, nRows, dm::NumericTable::doAllocate, &status); - } + nt = dm::HomogenNumericTable::create(1, nRows, dm::NumericTable::doAllocate, &status); set(minimum, nt); } if (!get(nIterations)) diff --git a/cpp/daal/src/algorithms/optimization_solver/lbfgs/lbfgs_dense_default_batch_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/optimization_solver/lbfgs/lbfgs_dense_default_batch_fpt_dispatcher.cpp index 77c21550768..ccf756db9de 100644 --- a/cpp/daal/src/algorithms/optimization_solver/lbfgs/lbfgs_dense_default_batch_fpt_dispatcher.cpp +++ 
b/cpp/daal/src/algorithms/optimization_solver/lbfgs/lbfgs_dense_default_batch_fpt_dispatcher.cpp @@ -25,7 +25,7 @@ namespace daal { namespace algorithms { -__DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL(optimization_solver::lbfgs::BatchContainer, batch, DAAL_FPTYPE, optimization_solver::lbfgs::defaultDense) +__DAAL_INSTANTIATE_DISPATCH_CONTAINER(optimization_solver::lbfgs::BatchContainer, batch, DAAL_FPTYPE, optimization_solver::lbfgs::defaultDense) namespace optimization_solver { diff --git a/cpp/daal/src/algorithms/optimization_solver/saga/saga_dense_default_batch_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/optimization_solver/saga/saga_dense_default_batch_fpt_dispatcher.cpp index 88897fd2f70..3517fa462b6 100644 --- a/cpp/daal/src/algorithms/optimization_solver/saga/saga_dense_default_batch_fpt_dispatcher.cpp +++ b/cpp/daal/src/algorithms/optimization_solver/saga/saga_dense_default_batch_fpt_dispatcher.cpp @@ -25,7 +25,7 @@ namespace daal { namespace algorithms { -__DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL(optimization_solver::saga::BatchContainer, batch, DAAL_FPTYPE, optimization_solver::saga::defaultDense) +__DAAL_INSTANTIATE_DISPATCH_CONTAINER(optimization_solver::saga::BatchContainer, batch, DAAL_FPTYPE, optimization_solver::saga::defaultDense) namespace optimization_solver { diff --git a/cpp/daal/src/algorithms/optimization_solver/sgd/BUILD b/cpp/daal/src/algorithms/optimization_solver/sgd/BUILD index d20d73079d4..b55f4a38598 100644 --- a/cpp/daal/src/algorithms/optimization_solver/sgd/BUILD +++ b/cpp/daal/src/algorithms/optimization_solver/sgd/BUILD @@ -4,7 +4,6 @@ load("@onedal//dev/bazel:daal.bzl", "daal_module") daal_module( name = "kernel", auto = True, - opencl = True, deps = [ "@onedal//cpp/daal:core", "@onedal//cpp/daal/src/algorithms/optimization_solver:kernel", diff --git a/cpp/daal/src/algorithms/optimization_solver/sgd/oneapi/cl_kernel/sgd_dense_minibatch.cl b/cpp/daal/src/algorithms/optimization_solver/sgd/oneapi/cl_kernel/sgd_dense_minibatch.cl 
deleted file mode 100644 index cebafcbad85..00000000000 --- a/cpp/daal/src/algorithms/optimization_solver/sgd/oneapi/cl_kernel/sgd_dense_minibatch.cl +++ /dev/null @@ -1,83 +0,0 @@ -/* file: sgd_dense_minibatch.cl */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of SGD dense minibatch OpenCL kernels. 
-//-- -*/ - -#ifndef __SGD_DENSE_MINIBATCH_KERNELS_CL__ -#define __SGD_DENSE_MINIBATCH_KERNELS_CL__ - -#include - -#define DECLARE_SOURCE_DAAL(name, src) static const char *(name) = #src; - -DECLARE_SOURCE_DAAL( - clKernelSGDMiniBatch, - - inline void __sum(__global algorithmFPType * partialSums, __local algorithmFPType * localSum) { - const uint global_group_id = get_group_id(0); - const uint group_size = get_local_size(0); - const uint local_id = get_local_id(0); - - for (uint stride = group_size / 2; stride > 0; stride /= 2) - { - barrier(CLK_LOCAL_MEM_FENCE); - - if (local_id < stride) - { - localSum[local_id] += localSum[local_id + stride]; - } - } - - if (local_id == 0) - { - partialSums[global_group_id] = localSum[0]; - } - } - - __kernel void makeStep(const __global algorithmFPType * const gradient, const __global algorithmFPType * const prevWorkValue, - __global algorithmFPType * workValue, const algorithmFPType learningRate, const algorithmFPType consCoeff) { - const uint j = get_global_id(0); - - workValue[j] = workValue[j] - learningRate * (gradient[j] + consCoeff * (workValue[j] - prevWorkValue[j])); - } - - __kernel void sumSq(const __global algorithmFPType * const x, const uint n, __global algorithmFPType * partialSums) { - __local algorithmFPType localSum[LOCAL_SUM_SIZE]; - const uint global_id = get_global_id(0); - const uint local_id = get_local_id(0); - - if (global_id >= n) - { - localSum[local_id] = (algorithmFPType)0; - } - else - { - localSum[local_id] = x[global_id] * x[global_id]; - } - - __sum(partialSums, localSum); - } - -); - -#undef DECLARE_SOURCE_DAAL - -#endif diff --git a/cpp/daal/src/algorithms/optimization_solver/sgd/oneapi/sgd_dense_kernel_oneapi.h b/cpp/daal/src/algorithms/optimization_solver/sgd/oneapi/sgd_dense_kernel_oneapi.h deleted file mode 100644 index 819afda70e5..00000000000 --- a/cpp/daal/src/algorithms/optimization_solver/sgd/oneapi/sgd_dense_kernel_oneapi.h +++ /dev/null @@ -1,88 +0,0 @@ -/* file: 
sgd_dense_kernel_oneapi.h */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -//++ -// Declaration of template function that calculate sgd. -//-- - -#ifndef __SGD_DENSE_KERNEL_ONEAPI_H__ -#define __SGD_DENSE_KERNEL_ONEAPI_H__ - -#include "algorithms/optimization_solver/sgd/sgd_batch.h" -#include "src/algorithms/kernel.h" -#include "data_management/data/numeric_table.h" -#include "src/algorithms/optimization_solver/iterative_solver_kernel.h" -#include "src/algorithms/optimization_solver/sgd/sgd_dense_kernel.h" -#include "src/services/service_algo_utils.h" - -namespace daal -{ -namespace algorithms -{ -namespace optimization_solver -{ -namespace sgd -{ -namespace internal -{ -using namespace daal::data_management; - -template -class SGDKernelOneAPI : public Kernel -{ -public: - services::Status compute(HostAppIface * pHost, NumericTable * inputArgument, NumericTablePtr minimum, NumericTable * nIterations, - Parameter * parameter, NumericTable * learningRateSequence, NumericTable * batchIndices, - OptionalArgument * optionalArgument, OptionalArgument * optionalResult, engines::BatchBase & engine) - { - return services::ErrorMethodNotImplemented; - } -}; - -template -class SGDKernelOneAPI : public Kernel -{ -public: - services::Status compute(HostAppIface * pHost, 
NumericTable * inputArgument, NumericTablePtr minimum, NumericTable * nIterations, - Parameter * parameter, NumericTable * learningRateSequence, NumericTable * batchIndices, - OptionalArgument * optionalArgument, OptionalArgument * optionalResult, engines::BatchBase & engine); - -private: - static services::Status makeStep(const uint32_t argumentSize, const services::internal::Buffer & prevWorkValueBuff, - const services::internal::Buffer & gradientBuff, - services::internal::Buffer & workValueBuff, const algorithmFPType learningRate, - const algorithmFPType consCoeff); - - static services::Status vectorNorm(const services::internal::Buffer & x, const uint32_t n, algorithmFPType & norm); - - static services::Status buildProgram(services::internal::sycl::ClKernelFactoryIface & factory); - - enum IndicesStatus - { - random = 0, /*!< Indices of the terms are generated randomly */ - user = 1, /*!< Indices of the terms are provided by user */ - all = 2 /*!< All objective function terms are used for computations */ - }; -}; - -} // namespace internal -} // namespace sgd -} // namespace optimization_solver -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/optimization_solver/sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i b/cpp/daal/src/algorithms/optimization_solver/sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i deleted file mode 100644 index 87912b8bf11..00000000000 --- a/cpp/daal/src/algorithms/optimization_solver/sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i +++ /dev/null @@ -1,416 +0,0 @@ -/* file: sgd_dense_minibatch_oneapi_impl.i */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of SGD dense minibatch Batch algorithm on GPU. -//-- -*/ - -#include "src/algorithms/optimization_solver/sgd/oneapi/cl_kernel/sgd_dense_minibatch.cl" -#include "src/algorithms/optimization_solver/iterative_solver_kernel.h" -#include "data_management/data/internal/numeric_table_sycl_homogen.h" -#include "src/externals/service_math.h" - -#include "src/externals/service_profiler.h" - -namespace daal -{ -namespace algorithms -{ -namespace optimization_solver -{ -namespace sgd -{ -namespace internal -{ -using namespace daal::services::internal::sycl; -using namespace daal::data_management; - -using daal::services::internal::Buffer; -using daal::data_management::internal::SyclHomogenNumericTable; - -static uint32_t getWorkgroupsCount(const uint32_t n, const uint32_t localWorkSize) -{ - DAAL_ASSERT(localWorkSize > 0); - const uint32_t elementsPerGroup = localWorkSize; - uint32_t workgroupsCount = n / elementsPerGroup; - - if (workgroupsCount * elementsPerGroup < n) - { - workgroupsCount++; - } - return workgroupsCount; -} - -template -services::Status SGDKernelOneAPI::makeStep(const uint32_t argumentSize, const Buffer & prevWorkValueBuff, - const Buffer & gradientBuff, - Buffer & workValueBuff, const algorithmFPType learningRate, - const algorithmFPType consCoeff) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(makeStep); - services::Status status; - - ExecutionContextIface & ctx = services::internal::getDefaultContext(); - ClKernelFactoryIface & factory = 
ctx.getClKernelFactory(); - - status |= buildProgram(factory); - DAAL_CHECK_STATUS_VAR(status); - - const char * const kernelName = "makeStep"; - KernelPtr kernel = factory.getKernel(kernelName, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_ASSERT(gradientBuff.size() == argumentSize); - DAAL_ASSERT(prevWorkValueBuff.size() == argumentSize); - DAAL_ASSERT(workValueBuff.size() == argumentSize); - - KernelArguments args(5, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, gradientBuff, AccessModeIds::read); - args.set(1, prevWorkValueBuff, AccessModeIds::read); - args.set(2, workValueBuff, AccessModeIds::readwrite); - args.set(3, learningRate); - args.set(4, consCoeff); - - KernelRange range(argumentSize); - ctx.run(range, kernel, args, status); - - return status; -} - -template -static services::Status sumReduction(const Buffer & reductionBuffer, const size_t nWorkGroups, algorithmFPType & result) -{ - services::Status status; - - DAAL_CHECK(reductionBuffer.size() == nWorkGroups, services::ErrorIncorrectSizeOfArray); - - auto sumReductionArrayPtr = reductionBuffer.toHost(data_management::readOnly, status); - DAAL_CHECK_STATUS_VAR(status); - - const auto * sumReductionArray = sumReductionArrayPtr.get(); - - // Final summation with CPU - for (size_t i = 0; i < nWorkGroups; i++) - { - result += sumReductionArray[i]; - } - return status; -} - -template -services::Status SGDKernelOneAPI::vectorNorm(const Buffer & x, const uint32_t n, algorithmFPType & norm) -{ - services::Status status; - - const TypeIds::Id idType = TypeIds::id(); - - ExecutionContextIface & ctx = services::internal::getDefaultContext(); - ClKernelFactoryIface & factory = ctx.getClKernelFactory(); - - status |= buildProgram(factory); - DAAL_CHECK_STATUS_VAR(status); - - const char * const kernelName = "sumSq"; - KernelPtr kernel = factory.getKernel(kernelName, status); - DAAL_CHECK_STATUS_VAR(status); - - size_t workItemsPerGroup = 256; - const size_t nWorkGroups = getWorkgroupsCount(n, 
workItemsPerGroup); - - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, workItemsPerGroup, nWorkGroups); - - KernelRange localRange(workItemsPerGroup); - KernelRange globalRange(workItemsPerGroup * nWorkGroups); - - KernelNDRange range(1); - - range.local(localRange, status); - DAAL_CHECK_STATUS_VAR(status); - range.global(globalRange, status); - DAAL_CHECK_STATUS_VAR(status); - - UniversalBuffer buffer = ctx.allocate(idType, nWorkGroups, status); - DAAL_CHECK_STATUS_VAR(status); - Buffer reductionBuffer = buffer.get(); - - DAAL_ASSERT(x.size() == n); - - KernelArguments args(3, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, x, AccessModeIds::read); - args.set(1, n); - args.set(2, reductionBuffer, AccessModeIds::write); - - { - DAAL_ITTNOTIFY_SCOPED_TASK(vectorNorm.run); - ctx.run(range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - } - - status = sumReduction(reductionBuffer, nWorkGroups, norm); - DAAL_CHECK_STATUS_VAR(status); - - norm = daal::internal::MathInst::sSqrt(norm); - - return status; -} - -template -services::Status SGDKernelOneAPI::buildProgram(ClKernelFactoryIface & factory) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(buildProgram); - services::Status status; - services::String options = getKeyFPType(); - - services::String cachekey("__daal_algorithms_optimization_solver_sgd_"); - cachekey.add(options); - options.add(" -D LOCAL_SUM_SIZE=256 "); - - factory.build(ExecutionTargetIds::device, cachekey.c_str(), clKernelSGDMiniBatch, options.c_str(), status); - DAAL_CHECK_STATUS_VAR(status); - - return status; -} - -template -services::Status SGDKernelOneAPI::compute(HostAppIface * pHost, NumericTable * inputArgument, NumericTablePtr minimum, - NumericTable * nIterations, Parameter * parameter, - NumericTable * learningRateSequence, NumericTable * batchIndices, - OptionalArgument * optionalArgument, OptionalArgument * optionalResult, - engines::BatchBase & engine) -{ - services::Status status; - - ExecutionContextIface & ctx = 
services::internal::getDefaultContext(); - - DAAL_ASSERT(inputArgument != nullptr); - DAAL_ASSERT(parameter != nullptr); - - const size_t argumentSize = inputArgument->getNumberOfRows(); - const size_t nIter = parameter->nIterations; - const size_t L = parameter->innerNIterations; - const size_t batchSize = parameter->batchSize; - - constexpr size_t maxInt32Value = static_cast(daal::services::internal::MaxVal::get()); - - WriteRows nIterationsBD(*nIterations, 0, 1); - DAAL_CHECK_BLOCK_STATUS(nIterationsBD); - int * nProceededIterations = nIterationsBD.get(); - DAAL_CHECK(nProceededIterations != nullptr, services::ErrorIncorrectInputNumericTable); - - // if nIter == 0, set result as start point, the number of executed iters to 0 - if (nIter == 0 || L == 0) - { - nProceededIterations[0] = 0; - return status; - } - - NumericTable * lastIterationInput = optionalArgument ? NumericTable::cast(optionalArgument->get(iterative_solver::lastIteration)).get() : nullptr; - NumericTable * pastWorkValueInput = optionalArgument ? NumericTable::cast(optionalArgument->get(sgd::pastWorkValue)).get() : nullptr; - - NumericTable * lastIterationResult = optionalResult ? NumericTable::cast(optionalResult->get(iterative_solver::lastIteration)).get() : nullptr; - NumericTable * pastWorkValueResult = optionalResult ? 
NumericTable::cast(optionalResult->get(sgd::pastWorkValue)).get() : nullptr; - - const double accuracyThreshold = parameter->accuracyThreshold; - - sum_of_functions::BatchPtr function = parameter->function; - const size_t nTerms = function->sumOfFunctionsParameter->numberOfTerms; - - DAAL_ASSERT(minimum == true); - DAAL_ASSERT(minimum->getNumberOfRows() == argumentSize); - - BlockDescriptor workValueBD; - DAAL_CHECK_STATUS(status, minimum->getBlockOfRows(0, argumentSize, ReadWriteMode::readWrite, workValueBD)); - Buffer workValueBuff = workValueBD.getBuffer(); - - auto workValueSNT = SyclHomogenNumericTable::create(workValueBuff, 1, argumentSize, &status); - DAAL_CHECK_STATUS_VAR(status); - - NumericTablePtr previousArgument = function->sumOfFunctionsInput->get(sum_of_functions::argument); - function->sumOfFunctionsInput->set(sum_of_functions::argument, workValueSNT); - - ReadRows learningRateBD(*learningRateSequence, 0, 1); - DAAL_CHECK_BLOCK_STATUS(learningRateBD); - const algorithmFPType * const learningRateArray = learningRateBD.get(); - DAAL_CHECK(learningRateArray != nullptr, services::ErrorIncorrectParameter); - - NumericTable * conservativeSequence = parameter->conservativeSequence.get(); - ReadRows consCoeffsBD(*conservativeSequence, 0, 1); - DAAL_CHECK_BLOCK_STATUS(consCoeffsBD); - const algorithmFPType * const consCoeffsArray = consCoeffsBD.get(); - DAAL_CHECK(consCoeffsArray != nullptr, services::ErrorIncorrectParameter); - - const size_t consCoeffsLength = conservativeSequence->getNumberOfColumns(); - const size_t learningRateLength = learningRateSequence->getNumberOfColumns(); - - const IndicesStatus indicesStatus = (batchIndices ? user : (batchSize < nTerms ? 
random : all)); - services::SharedPtr > ntBatchIndices; - - if (indicesStatus == user || indicesStatus == random) - { - // Replace by SyclNumericTable when will be RNG on GPU - ntBatchIndices = HomogenNumericTableCPU::create(batchSize, 1, &status); - DAAL_CHECK_STATUS_VAR(status); - } - - NumericTablePtr previousBatchIndices = function->sumOfFunctionsParameter->batchIndices; - function->sumOfFunctionsParameter->batchIndices = ntBatchIndices; - - const TypeIds::Id idType = TypeIds::id(); - UniversalBuffer prevWorkValueU = ctx.allocate(idType, argumentSize, status); - DAAL_CHECK_STATUS_VAR(status); - Buffer prevWorkValueBuff = prevWorkValueU.get(); - - size_t startIteration = 0, nProceededIters = 0; - if (lastIterationInput) - { - ReadRows lastIterationInputBD(lastIterationInput, 0, 1); - DAAL_CHECK_BLOCK_STATUS(lastIterationInputBD); - const int * lastIterationInputArray = lastIterationInputBD.get(); - DAAL_ASSERT(lastIterationInputArray[0] > 0); - startIteration = lastIterationInputArray[0]; - } - - if (pastWorkValueInput) - { - BlockDescriptor pastWorkValueInputBD; - DAAL_CHECK_STATUS(status, pastWorkValueInput->getBlockOfRows(0, argumentSize, ReadWriteMode::readOnly, pastWorkValueInputBD)); - - const Buffer pastWorkValueInputBuff = pastWorkValueInputBD.getBuffer(); - - ctx.copy(prevWorkValueBuff, 0, pastWorkValueInputBuff, 0, argumentSize, status); - DAAL_CHECK_STATUS(status, pastWorkValueInput->releaseBlockOfRows(pastWorkValueInputBD)); - } - else - { - ctx.fill(prevWorkValueU, 0.0, status); - DAAL_CHECK_STATUS_VAR(status); - } - - // init workValue - BlockDescriptor startValueBD; - DAAL_CHECK_STATUS(status, inputArgument->getBlockOfRows(0, argumentSize, ReadWriteMode::readOnly, startValueBD)); - const Buffer startValueBuff = startValueBD.getBuffer(); - ctx.copy(workValueBuff, 0, startValueBuff, 0, argumentSize, status); - DAAL_CHECK_STATUS_VAR(status); - DAAL_CHECK_STATUS(status, inputArgument->releaseBlockOfRows(startValueBD)); - - ReadRows 
predefinedBatchIndicesBD(batchIndices, 0, nIter); - DAAL_CHECK_BLOCK_STATUS(predefinedBatchIndicesBD); - iterative_solver::internal::RngTask rngTask(predefinedBatchIndicesBD.get(), batchSize); - rngTask.init(nTerms, engine); - - algorithmFPType learningRate = learningRateArray[0]; - algorithmFPType consCoeff = consCoeffsArray[0]; - - UniversalBuffer gradientU = ctx.allocate(idType, argumentSize, status); - DAAL_CHECK_STATUS_VAR(status); - Buffer gradientBuff = gradientU.get(); - - auto gradientSNT = SyclHomogenNumericTable::create(gradientBuff, 1, argumentSize, &status); - DAAL_CHECK_STATUS_VAR(status); - function->getResult()->set(objective_function::gradientIdx, gradientSNT); - - DAAL_CHECK(nIter <= maxInt32Value, services::ErrorIncorrectParameter); - *nProceededIterations = static_cast(nIter); - - services::internal::HostAppHelper host(pHost, 10); - DAAL_OVERFLOW_CHECK_BY_ADDING(size_t, startIteration, nIter); - for (size_t epoch = startIteration; epoch < (startIteration + nIter); epoch++) - { - if (epoch % L == 0 || epoch == startIteration) - { - learningRate = learningRateArray[(epoch / L) % learningRateLength]; - consCoeff = consCoeffsArray[(epoch / L) % consCoeffsLength]; - if (indicesStatus == user || indicesStatus == random) - { - DAAL_ITTNOTIFY_SCOPED_TASK(generateUniform); - const int * pValues = nullptr; - DAAL_CHECK_STATUS(status, rngTask.get(pValues)); - DAAL_CHECK_STATUS(status, ntBatchIndices->setArray(const_cast(pValues), ntBatchIndices->getNumberOfRows())); - } - } - - DAAL_CHECK_STATUS(status, function->computeNoThrow()); - - if (host.isCancelled(status, 1)) - { - // overflow is checked on casting nIter to int - // epoch - startIteration is always less then nIter - *nProceededIterations = static_cast(epoch - startIteration); - break; - } - - if (epoch % L == 0) - { - if (nIter > 1 && accuracyThreshold > 0) - { - algorithmFPType pointNorm = algorithmFPType(0), gradientNorm = algorithmFPType(0); - DAAL_CHECK_STATUS(status, vectorNorm(workValueBuff, 
argumentSize, pointNorm)); - DAAL_CHECK_STATUS(status, vectorNorm(gradientBuff, argumentSize, gradientNorm)); - const double gradientThreshold = accuracyThreshold * daal::internal::MathInst::sMax(1.0, pointNorm); - - if (gradientNorm < gradientThreshold) - { - // overflow is checked on casting nIter to int - // epoch - startIteration is always less then nIter - *nProceededIterations = static_cast(epoch - startIteration); - break; - } - } - - ctx.copy(prevWorkValueBuff, 0, workValueBuff, 0, argumentSize, status); - DAAL_CHECK_STATUS_VAR(status); - } - DAAL_CHECK_STATUS(status, makeStep(argumentSize, prevWorkValueBuff, gradientBuff, workValueBuff, learningRate, consCoeff)); - nProceededIters++; - } - - if (lastIterationResult) - { - WriteRows lastIterationResultBD(lastIterationResult, 0, 1); - DAAL_CHECK_BLOCK_STATUS(lastIterationResultBD); - int * lastIterationResultArray = lastIterationResultBD.get(); - lastIterationResultArray[0] = startIteration + nProceededIters; // overflow is already checked for (startIteration + nIter) - } - - if (pastWorkValueResult) - { - BlockDescriptor pastWorkValueResultBD; - DAAL_CHECK_STATUS(status, pastWorkValueResult->getBlockOfRows(0, argumentSize, ReadWriteMode::writeOnly, pastWorkValueResultBD)); - - Buffer pastWorkValueResultBuffer = pastWorkValueResultBD.getBuffer(); - - ctx.copy(pastWorkValueResultBuffer, 0, prevWorkValueBuff, 0, argumentSize, status); - DAAL_CHECK_STATUS(status, pastWorkValueResult->releaseBlockOfRows(pastWorkValueResultBD)); - } - - DAAL_CHECK_STATUS(status, minimum->releaseBlockOfRows(workValueBD)); - - function->sumOfFunctionsParameter->batchIndices = previousBatchIndices; - function->sumOfFunctionsInput->set(sum_of_functions::argument, previousArgument); - return status; -} - -} // namespace internal -} // namespace sgd -} // namespace optimization_solver -} // namespace algorithms -} // namespace daal diff --git a/cpp/daal/src/algorithms/optimization_solver/sgd/sgd_batch_container.h 
b/cpp/daal/src/algorithms/optimization_solver/sgd/sgd_batch_container.h index d4f8f1a1c30..52129436b7f 100644 --- a/cpp/daal/src/algorithms/optimization_solver/sgd/sgd_batch_container.h +++ b/cpp/daal/src/algorithms/optimization_solver/sgd/sgd_batch_container.h @@ -29,7 +29,6 @@ #include "src/algorithms/optimization_solver/sgd/sgd_dense_minibatch_kernel.h" #include "src/algorithms/optimization_solver/sgd/sgd_dense_momentum_kernel.h" #include "src/services/service_algo_utils.h" -#include "src/algorithms/optimization_solver/sgd/oneapi/sgd_dense_kernel_oneapi.h" namespace daal { @@ -44,17 +43,7 @@ namespace interface2 template BatchContainer::BatchContainer(daal::services::Environment::env * daalEnv) { - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - - if (deviceInfo.isCpu || method == defaultDense || method == momentum) - { - __DAAL_INITIALIZE_KERNELS(internal::SGDKernel, algorithmFPType, method); - } - else - { - __DAAL_INITIALIZE_KERNELS_SYCL(internal::SGDKernelOneAPI, algorithmFPType, method); - } + __DAAL_INITIALIZE_KERNELS(internal::SGDKernel, algorithmFPType, method); } template @@ -81,21 +70,9 @@ services::Status BatchContainer::compute() NumericTable * learningRateSequence = parameter->learningRateSequence.get(); NumericTable * batchIndices = parameter->batchIndices.get(); - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - - if (deviceInfo.isCpu || method == defaultDense || method == momentum) - { - __DAAL_CALL_KERNEL(env, internal::SGDKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, - daal::services::internal::hostApp(*input), inputArgument, minimum.get(), nIterations, parameter, learningRateSequence, - batchIndices, optionalArgument, optionalResult, *parameter->engine); - } - else - { - __DAAL_CALL_KERNEL_SYCL(env, internal::SGDKernelOneAPI, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, - 
daal::services::internal::hostApp(*input), inputArgument, minimum, nIterations, parameter, learningRateSequence, - batchIndices, optionalArgument, optionalResult, *parameter->engine); - } + __DAAL_CALL_KERNEL(env, internal::SGDKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, daal::services::internal::hostApp(*input), + inputArgument, minimum.get(), nIterations, parameter, learningRateSequence, batchIndices, optionalArgument, optionalResult, + *parameter->engine); } } // namespace interface2 diff --git a/cpp/daal/src/algorithms/optimization_solver/sgd/sgd_dense_default_batch_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/optimization_solver/sgd/sgd_dense_default_batch_fpt_dispatcher.cpp index efb845e96c6..ec9166cee0e 100644 --- a/cpp/daal/src/algorithms/optimization_solver/sgd/sgd_dense_default_batch_fpt_dispatcher.cpp +++ b/cpp/daal/src/algorithms/optimization_solver/sgd/sgd_dense_default_batch_fpt_dispatcher.cpp @@ -25,7 +25,7 @@ namespace daal { namespace algorithms { -__DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL(optimization_solver::sgd::BatchContainer, batch, DAAL_FPTYPE, optimization_solver::sgd::defaultDense) +__DAAL_INSTANTIATE_DISPATCH_CONTAINER(optimization_solver::sgd::BatchContainer, batch, DAAL_FPTYPE, optimization_solver::sgd::defaultDense) namespace optimization_solver { diff --git a/cpp/daal/src/algorithms/optimization_solver/sgd/sgd_dense_minibatch_batch_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/optimization_solver/sgd/sgd_dense_minibatch_batch_fpt_dispatcher.cpp index 56ebd64393a..5b1bc6fe660 100644 --- a/cpp/daal/src/algorithms/optimization_solver/sgd/sgd_dense_minibatch_batch_fpt_dispatcher.cpp +++ b/cpp/daal/src/algorithms/optimization_solver/sgd/sgd_dense_minibatch_batch_fpt_dispatcher.cpp @@ -25,7 +25,7 @@ namespace daal { namespace algorithms { -__DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL(optimization_solver::sgd::BatchContainer, batch, DAAL_FPTYPE, optimization_solver::sgd::miniBatch) 
+__DAAL_INSTANTIATE_DISPATCH_CONTAINER(optimization_solver::sgd::BatchContainer, batch, DAAL_FPTYPE, optimization_solver::sgd::miniBatch) namespace optimization_solver { diff --git a/cpp/daal/src/algorithms/optimization_solver/sgd/sgd_dense_minibatch_batch_oneapi_fpt.cpp b/cpp/daal/src/algorithms/optimization_solver/sgd/sgd_dense_minibatch_batch_oneapi_fpt.cpp deleted file mode 100644 index b32df275787..00000000000 --- a/cpp/daal/src/algorithms/optimization_solver/sgd/sgd_dense_minibatch_batch_oneapi_fpt.cpp +++ /dev/null @@ -1,40 +0,0 @@ -/* file: sgd_dense_minibatch_batch_oneapi_fpt.cpp */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -//++ -// Implementation of sgd calculation functions. 
-//-- - -#include "src/algorithms/optimization_solver/sgd/oneapi/sgd_dense_kernel_oneapi.h" -#include "src/algorithms/optimization_solver/sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i" - -namespace daal -{ -namespace algorithms -{ -namespace optimization_solver -{ -namespace sgd -{ -namespace internal -{ -template class SGDKernelOneAPI; -} // namespace internal -} // namespace sgd -} // namespace optimization_solver -} // namespace algorithms -} // namespace daal diff --git a/cpp/daal/src/algorithms/optimization_solver/sgd/sgd_dense_momentum_batch_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/optimization_solver/sgd/sgd_dense_momentum_batch_fpt_dispatcher.cpp index 9570f988a22..9ac2ea6b793 100644 --- a/cpp/daal/src/algorithms/optimization_solver/sgd/sgd_dense_momentum_batch_fpt_dispatcher.cpp +++ b/cpp/daal/src/algorithms/optimization_solver/sgd/sgd_dense_momentum_batch_fpt_dispatcher.cpp @@ -25,7 +25,7 @@ namespace daal { namespace algorithms { -__DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL(optimization_solver::sgd::BatchContainer, batch, DAAL_FPTYPE, optimization_solver::sgd::momentum) +__DAAL_INSTANTIATE_DISPATCH_CONTAINER(optimization_solver::sgd::BatchContainer, batch, DAAL_FPTYPE, optimization_solver::sgd::momentum) namespace optimization_solver { diff --git a/cpp/daal/src/algorithms/optimization_solver/sgd/sgd_types_fpt.cpp b/cpp/daal/src/algorithms/optimization_solver/sgd/sgd_types_fpt.cpp index 53eaa89d050..4ddbf7ef0fc 100644 --- a/cpp/daal/src/algorithms/optimization_solver/sgd/sgd_types_fpt.cpp +++ b/cpp/daal/src/algorithms/optimization_solver/sgd/sgd_types_fpt.cpp @@ -24,7 +24,7 @@ #include "algorithms/optimization_solver/iterative_solver/iterative_solver_types.h" #include "algorithms/optimization_solver/sgd/sgd_types.h" #include "src/services/service_data_utils.h" -#include "data_management/data/internal/numeric_table_sycl_homogen.h" +#include "data_management/data/homogen_numeric_table.h" namespace daal { @@ -80,19 +80,9 @@ services::Status 
Result::allocate(const daal::algorithms::Input * input, const d DAAL_ASSERT(miniBatch <= services::internal::MaxVal::get()) if (method == (int)miniBatch) { - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - if (!pOpt->get(pastWorkValue)) { - if (deviceInfo.isCpu) - { - pTbl = dm::HomogenNumericTable::create(1, argumentSize, dm::NumericTable::doAllocate, 0.0, &s); - } - else - { - pTbl = dmi::SyclHomogenNumericTable::create(1, argumentSize, dm::NumericTable::doAllocate, 0.0, &s); - } + pTbl = dm::HomogenNumericTable::create(1, argumentSize, dm::NumericTable::doAllocate, 0.0, &s); DAAL_CHECK_MALLOC(pTbl.get()) pOpt->set(pastWorkValue, pTbl); } diff --git a/cpp/daal/src/algorithms/pca/BUILD b/cpp/daal/src/algorithms/pca/BUILD index 15c3eb6dc7a..ceb5ef346fd 100644 --- a/cpp/daal/src/algorithms/pca/BUILD +++ b/cpp/daal/src/algorithms/pca/BUILD @@ -4,10 +4,9 @@ load("@onedal//dev/bazel:daal.bzl", "daal_module") daal_module( name = "kernel", auto = True, - opencl = True, deps = [ "@onedal//cpp/daal:core", - "@onedal//cpp/daal:sycl", + "@onedal//cpp/daal:engines", "@onedal//cpp/daal/src/algorithms/covariance:kernel", "@onedal//cpp/daal/src/algorithms/svd:kernel", "@onedal//cpp/daal/src/algorithms/normalization/zscore:kernel", diff --git a/cpp/daal/src/algorithms/pca/oneapi/cl_kernels/pca_cl_kernels.cl b/cpp/daal/src/algorithms/pca/oneapi/cl_kernels/pca_cl_kernels.cl deleted file mode 100644 index 50776173ffd..00000000000 --- a/cpp/daal/src/algorithms/pca/oneapi/cl_kernels/pca_cl_kernels.cl +++ /dev/null @@ -1,43 +0,0 @@ -/* file: pca_cl_kernels.cl */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of PCA OpenCL kernels. -//-- -*/ - -#ifndef __PCA_CL_KERNELS_CL__ -#define __PCA_CL_KERNELS_CL__ - -#include - -#define DECLARE_SOURCE(name, src) static const char * name = #src; - -DECLARE_SOURCE( - pca_cl_kernels, - - __kernel void calculateVariances(__global algorithmFPType * covariance, __global algorithmFPType * variances) { - const unsigned int tid = get_global_id(0); - const unsigned int nFeatures = get_global_size(0); - - variances[tid] = covariance[tid * nFeatures + tid]; - } - -); - -#endif diff --git a/cpp/daal/src/algorithms/pca/oneapi/pca_dense_correlation_batch_kernel_ucapi.h b/cpp/daal/src/algorithms/pca/oneapi/pca_dense_correlation_batch_kernel_ucapi.h deleted file mode 100644 index 48e2091ca64..00000000000 --- a/cpp/daal/src/algorithms/pca/oneapi/pca_dense_correlation_batch_kernel_ucapi.h +++ /dev/null @@ -1,71 +0,0 @@ -/* file: pca_dense_correlation_batch_kernel_ucapi.h */ -/******************************************************************************* -* Copyright 2019 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of PCA Batch Kernel for GPU. -//-- -*/ - -#ifndef __PCA_DENSE_CORRELATION_BATCH_KERNEL_UCAPI_H__ -#define __PCA_DENSE_CORRELATION_BATCH_KERNEL_UCAPI_H__ - -#include "src/algorithms/pca/pca_dense_correlation_base_iface.h" -#include "services/internal/sycl/types.h" -#include "services/internal/sycl/execution_context.h" -#include "algorithms/pca/pca_types.h" - -namespace daal -{ -namespace algorithms -{ -namespace pca -{ -namespace internal -{ -template -class PCACorrelationKernelBatchUCAPI : public Kernel -{ -public: - using PCACorrelationBaseIfacePtr = services::SharedPtr >; - -public: - PCACorrelationKernelBatchUCAPI(const PCACorrelationBaseIfacePtr & host_impl); - - services::Status compute(bool isCorrelation, bool isDeterministic, data_management::NumericTable & dataTable, - covariance::BatchImpl * covarianceAlg, DAAL_UINT64 resultsToCompute, data_management::NumericTable & eigenvectors, - data_management::NumericTable & eigenvalues, data_management::NumericTable & means, - data_management::NumericTable & variances); - -private: - services::Status calculateVariances(services::internal::sycl::ExecutionContextIface & context, - const services::internal::sycl::KernelPtr & calculateVariancesKernel, - data_management::NumericTable & covariance, const services::internal::Buffer & variances); - - services::Status correlationFromCovarianceTable(uint32_t nObservations, data_management::NumericTable & covariance, - const services::internal::Buffer & variances); - -private: - PCACorrelationBaseIfacePtr _host_impl; -}; - -} // namespace internal -} // namespace pca -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/pca/oneapi/pca_dense_correlation_batch_kernel_ucapi_impl.i 
b/cpp/daal/src/algorithms/pca/oneapi/pca_dense_correlation_batch_kernel_ucapi_impl.i deleted file mode 100644 index e2aef196291..00000000000 --- a/cpp/daal/src/algorithms/pca/oneapi/pca_dense_correlation_batch_kernel_ucapi_impl.i +++ /dev/null @@ -1,264 +0,0 @@ -/* file: pca_dense_correlation_batch_kernel_ucapi_impl.i */ -/******************************************************************************* -* Copyright 2019 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of PCA Batch Kernel for GPU. 
-//-- -*/ - -#ifndef __PCA_DENSE_CORRELATION_BATCH_KERNEL_UCAPI_IMPL__ -#define __PCA_DENSE_CORRELATION_BATCH_KERNEL_UCAPI_IMPL__ - -#include "src/externals/service_profiler.h" - -#include "services/env_detect.h" -#include "include/services/internal/sycl/types.h" -#include "src/algorithms/pca/oneapi/cl_kernels/pca_cl_kernels.cl" -#include "data_management/data/internal/numeric_table_sycl_homogen.h" -#include "src/sycl/blas_gpu.h" -#include "src/sycl/reducer.h" -#include "src/algorithms/covariance/oneapi/covariance_oneapi_impl.i" - -using namespace daal::services; -using namespace daal::internal; -using namespace daal::services::internal::sycl; -using namespace daal::data_management; - -namespace daal -{ -namespace algorithms -{ -namespace pca -{ -namespace internal -{ -template -PCACorrelationKernelBatchUCAPI::PCACorrelationKernelBatchUCAPI(const PCACorrelationBaseIfacePtr & host_impl) -{ - _host_impl = host_impl; -} - -template -Status PCACorrelationKernelBatchUCAPI::compute(bool isCorrelation, bool isDeterministic, NumericTable & dataTable, - covariance::BatchImpl * covarianceAlg, DAAL_UINT64 resultsToCompute, - NumericTable & eigenvectors, NumericTable & eigenvalues, NumericTable & means, - NumericTable & variances) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute); - Status st; - - auto & context = Environment::getInstance()->getDefaultExecutionContext(); - auto & kernel_factory = context.getClKernelFactory(); - - auto fptype_name = services::internal::sycl::getKeyFPType(); - auto build_options = fptype_name; - build_options.add("-cl-std=CL1.2"); - - services::String cachekey("__daal_algorithms_pca_cor_dense_batch_"); - cachekey.add(fptype_name); - - { - DAAL_ITTNOTIFY_SCOPED_TASK(compute.buildProgram); - kernel_factory.build(ExecutionTargetIds::device, cachekey.c_str(), pca_cl_kernels, build_options.c_str(), st); - DAAL_CHECK_STATUS_VAR(st); - } - - auto calculateVariancesKernel = kernel_factory.getKernel("calculateVariances", st); - DAAL_CHECK_STATUS_VAR(st); - - if 
(dataTable.getNumberOfColumns() > static_cast(services::internal::MaxVal::get())) - { - return services::Status(daal::services::ErrorCovarianceInternal); - } - if (dataTable.getNumberOfRows() > static_cast(services::internal::MaxVal::get())) - { - return services::Status(daal::services::ErrorCovarianceInternal); - } - - const uint32_t N = static_cast(dataTable.getNumberOfRows()); - const uint32_t p = static_cast(dataTable.getNumberOfColumns()); - - if (isCorrelation) - { - DAAL_ITTNOTIFY_SCOPED_TASK(compute.correlation); - - if (resultsToCompute & mean) - { - DAAL_ITTNOTIFY_SCOPED_TASK(compute.correlation.fillTable(means)); - - BlockDescriptor meansBlock; - DAAL_CHECK_STATUS_VAR(means.getBlockOfRows(0, 1, readWrite, meansBlock)); - DAAL_ASSERT_UNIVERSAL_BUFFER(UniversalBuffer(meansBlock.getBuffer()), algorithmFPType, p); - context.fill(meansBlock.getBuffer(), (algorithmFPType)0, st); - DAAL_CHECK_STATUS_VAR(st); - DAAL_CHECK_STATUS_VAR(means.releaseBlockOfRows(meansBlock)); - } - - if (resultsToCompute & variance) - { - DAAL_ITTNOTIFY_SCOPED_TASK(compute.correlation.fillTable(variances)); - - BlockDescriptor varBlock; - DAAL_CHECK_STATUS_VAR(variances.getBlockOfRows(0, 1, readWrite, varBlock)); - DAAL_ASSERT_UNIVERSAL_BUFFER(UniversalBuffer(varBlock.getBuffer()), algorithmFPType, p); - context.fill(varBlock.getBuffer(), (algorithmFPType)1, st); - DAAL_CHECK_STATUS_VAR(st); - DAAL_CHECK_STATUS_VAR(variances.releaseBlockOfRows(varBlock)); - } - - { - DAAL_ITTNOTIFY_SCOPED_TASK(compute.correlation.eigenvalues); - DAAL_CHECK_STATUS(st, _host_impl->computeCorrelationEigenvalues(dataTable, eigenvectors, eigenvalues)); - } - } - else - { - DAAL_ITTNOTIFY_SCOPED_TASK(compute.full); - DAAL_CHECK(covarianceAlg, services::ErrorNullPtr); - { - DAAL_ITTNOTIFY_SCOPED_TASK(compute.full.covariance); - DAAL_CHECK_STATUS(st, covarianceAlg->computeNoThrow()); - } - - auto pCovarianceTable = covarianceAlg->getResult()->get(covariance::covariance); - DAAL_ASSERT(pCovarianceTable); - 
NumericTable & covarianceTable = *pCovarianceTable; - - // copying variances. Means are computed inplace - // with help of setResult in BatchContainer - - if (resultsToCompute & mean) - { - auto mean_cov = covarianceAlg->getResult()->get(covariance::mean); - DAAL_ASSERT(mean_cov); - - BlockDescriptor meansBlock, covMeanBlock; - DAAL_CHECK_STATUS_VAR(means.getBlockOfRows(0, 1, readWrite, meansBlock)); - DAAL_CHECK_STATUS_VAR(mean_cov->getBlockOfRows(0, 1, readOnly, covMeanBlock)); - - DAAL_ASSERT_UNIVERSAL_BUFFER(UniversalBuffer(meansBlock.getBuffer()), algorithmFPType, p); - DAAL_ASSERT_UNIVERSAL_BUFFER(UniversalBuffer(covMeanBlock.getBuffer()), algorithmFPType, p); - context.copy(meansBlock.getBuffer(), 0, covMeanBlock.getBuffer(), 0, p, st); - DAAL_CHECK_STATUS_VAR(st); - - DAAL_CHECK_STATUS_VAR(means.releaseBlockOfRows(meansBlock)); - DAAL_CHECK_STATUS_VAR(mean_cov->releaseBlockOfRows(covMeanBlock)); - } - - if (resultsToCompute & variance) - { - BlockDescriptor varBlock; - DAAL_CHECK_STATUS_VAR(variances.getBlockOfRows(0, 1, readWrite, varBlock)); - DAAL_CHECK_STATUS(st, calculateVariances(context, calculateVariancesKernel, covarianceTable, varBlock.getBuffer())); - DAAL_CHECK_STATUS(st, correlationFromCovarianceTable(N, covarianceTable, varBlock.getBuffer())); - DAAL_CHECK_STATUS_VAR(variances.releaseBlockOfRows(varBlock)); - } - else - { - auto variancesBuffer = context.allocate(TypeIds::id(), p, st); - DAAL_CHECK_STATUS_VAR(st); - - DAAL_CHECK_STATUS( - st, calculateVariances(context, calculateVariancesKernel, covarianceTable, variancesBuffer.template get())); - - DAAL_CHECK_STATUS(st, correlationFromCovarianceTable(N, covarianceTable, variancesBuffer.template get())); - } - - { - DAAL_ITTNOTIFY_SCOPED_TASK(compute.full.computeEigenvalues); - DAAL_CHECK_STATUS(st, _host_impl->computeCorrelationEigenvalues(covarianceTable, eigenvectors, eigenvalues)); - } - } - - if (isDeterministic) - { - DAAL_ITTNOTIFY_SCOPED_TASK(compute.signFlipEigenvectors); - 
DAAL_CHECK_STATUS(st, _host_impl->signFlipEigenvectors(eigenvectors)); - } - - return st; -} - -template -services::Status PCACorrelationKernelBatchUCAPI::correlationFromCovarianceTable( - uint32_t nObservations, NumericTable & covariance, const services::internal::Buffer & variances) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.correlationFromCovarianceTable); - - if (covariance.getNumberOfRows() > static_cast(services::internal::MaxVal::get())) - { - return services::Status(daal::services::ErrorCovarianceInternal); - } - - const uint32_t nFeatures = static_cast(covariance.getNumberOfRows()); - - BlockDescriptor covBlock; - DAAL_CHECK_STATUS_VAR(covariance.getBlockOfRows(0, nFeatures, writeOnly, covBlock)); - - covariance::Parameter parameter; - parameter.outputMatrixType = covariance::correlationMatrix; - - DAAL_CHECK_STATUS_VAR(covariance::oneapi::internal::finalize(nFeatures, nObservations, covBlock.getBuffer(), - covBlock.getBuffer(), variances, ¶meter)); - - DAAL_CHECK_STATUS_VAR(covariance.releaseBlockOfRows(covBlock)); - - return services::Status(); -} - -template -services::Status PCACorrelationKernelBatchUCAPI::calculateVariances(ExecutionContextIface & context, - const KernelPtr & calculateVariancesKernel, - NumericTable & covariance, - const services::internal::Buffer & variances) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.calculateVariances); - services::Status status; - - if (covariance.getNumberOfRows() > static_cast(services::internal::MaxVal::get())) - { - return services::Status(daal::services::ErrorCovarianceInternal); - } - - uint32_t nFeatures = static_cast(covariance.getNumberOfRows()); - - BlockDescriptor covBlock; - DAAL_CHECK_STATUS_VAR(covariance.getBlockOfRows(0, nFeatures, readOnly, covBlock)); - - DAAL_ASSERT_UNIVERSAL_BUFFER(UniversalBuffer(covBlock.getBuffer()), algorithmFPType, nFeatures); - - KernelArguments args(2, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, covBlock.getBuffer(), AccessModeIds::read); - args.set(1, variances, 
AccessModeIds::write); - - KernelRange range(nFeatures); - context.run(range, calculateVariancesKernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_CHECK_STATUS_VAR(covariance.releaseBlockOfRows(covBlock)); - - return status; -} - -} // namespace internal -} // namespace pca -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/pca/oneapi/pca_dense_correlation_online_kernel_ucapi.h b/cpp/daal/src/algorithms/pca/oneapi/pca_dense_correlation_online_kernel_ucapi.h deleted file mode 100644 index e1e937c0ce7..00000000000 --- a/cpp/daal/src/algorithms/pca/oneapi/pca_dense_correlation_online_kernel_ucapi.h +++ /dev/null @@ -1,68 +0,0 @@ -/* file: pca_dense_correlation_online_kernel_ucapi.h */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of PCA Online Kernel for GPU. 
-//-- -*/ - -#ifndef __PCA_DENSE_CORRELATION_ONLINE_KERNEL_UCAPI_H__ -#define __PCA_DENSE_CORRELATION_ONLINE_KERNEL_UCAPI_H__ - -#include "src/algorithms/pca/pca_dense_correlation_base_iface.h" -#include "services/internal/sycl/types.h" -#include "services/internal/sycl/execution_context.h" -#include "algorithms/pca/pca_types.h" - -namespace daal -{ -namespace algorithms -{ -namespace pca -{ -namespace internal -{ -template -class PCACorrelationKernelOnlineUCAPI : public Kernel -{ -public: - using PCACorrelationBaseIfacePtr = services::SharedPtr >; - -public: - PCACorrelationKernelOnlineUCAPI(const PCACorrelationBaseIfacePtr & host_impl); - - services::Status compute(const data_management::NumericTablePtr & pData, PartialResult * partialResult, - const OnlineParameter * parameter); - services::Status finalize(PartialResult * partialResult, const OnlineParameter * parameter, - data_management::NumericTable & eigenvectors, data_management::NumericTable & eigenvalues); - -private: - PCACorrelationBaseIfacePtr _host_impl; - -private: - services::Status copyIfNeeded(const data_management::NumericTable * src, data_management::NumericTable * dst); - services::Status copyCovarianceResultToPartialResult(const covariance::PartialResult * covariancePres, - PartialResult * partialResult); -}; - -} // namespace internal -} // namespace pca -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/pca/oneapi/pca_dense_correlation_online_kernel_ucapi_impl.i b/cpp/daal/src/algorithms/pca/oneapi/pca_dense_correlation_online_kernel_ucapi_impl.i deleted file mode 100644 index e2ac7691c3e..00000000000 --- a/cpp/daal/src/algorithms/pca/oneapi/pca_dense_correlation_online_kernel_ucapi_impl.i +++ /dev/null @@ -1,140 +0,0 @@ -/* file: pca_dense_correlation_online_kernel_ucapi_impl.i */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 
2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of PCA Online Kernel for GPU. -//-- -*/ - -#ifndef __PCA_DENSE_CORRELATION_ONLINE_KERNEL_UCAPI_IMPL__ -#define __PCA_DENSE_CORRELATION_ONLINE_KERNEL_UCAPI_IMPL__ - -#include "src/externals/service_profiler.h" - -#include "include/services/env_detect.h" -#include "include/services/internal/sycl/types.h" -#include "src/algorithms/covariance/oneapi/covariance_oneapi_impl.i" -#include "pca_dense_correlation_online_kernel_ucapi.h" - -using namespace daal::services; -using namespace daal::internal; -using namespace daal::services::internal::sycl; -using namespace daal::data_management; - -namespace daal -{ -namespace algorithms -{ -namespace pca -{ -namespace internal -{ -template -PCACorrelationKernelOnlineUCAPI::PCACorrelationKernelOnlineUCAPI(const PCACorrelationBaseIfacePtr & host_impl) -{ - _host_impl = host_impl; -} - -template -Status PCACorrelationKernelOnlineUCAPI::compute(const data_management::NumericTablePtr & pData, - PartialResult * partialResult, - const OnlineParameter * parameter) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute); - parameter->covariance->input.set(covariance::data, pData); - parameter->covariance->parameter.outputMatrixType = covariance::correlationMatrix; - - { - DAAL_ITTNOTIFY_SCOPED_TASK(compute.covariance); - DAAL_CHECK_STATUS_VAR(parameter->covariance->computeNoThrow()); - } - 
DAAL_CHECK_STATUS_VAR(copyCovarianceResultToPartialResult(parameter->covariance->getPartialResult().get(), partialResult)); - - return Status(); -} - -template -services::Status PCACorrelationKernelOnlineUCAPI::copyCovarianceResultToPartialResult( - const covariance::PartialResult * covariancePres, PartialResult * partialResult) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.copy.to.partial.result); - DAAL_CHECK_STATUS_VAR(copyIfNeeded(covariancePres->get(covariance::sum).get(), partialResult->get(sumCorrelation).get())); - DAAL_CHECK_STATUS_VAR(copyIfNeeded(covariancePres->get(covariance::nObservations).get(), partialResult->get(nObservationsCorrelation).get())); - DAAL_CHECK_STATUS_VAR(copyIfNeeded(covariancePres->get(covariance::crossProduct).get(), partialResult->get(crossProductCorrelation).get())); - return Status(); -} - -template -services::Status PCACorrelationKernelOnlineUCAPI::copyIfNeeded(const data_management::NumericTable * src, - data_management::NumericTable * dst) -{ - if (src == dst) return services::Status(); - - DAAL_ASSERT(dst->getNumberOfRows() == src->getNumberOfRows()); - DAAL_ASSERT(dst->getNumberOfColumns() == src->getNumberOfColumns()); - - BlockDescriptor srcBlock; - BlockDescriptor dstBlock; - - { - DAAL_CHECK_STATUS_VAR(const_cast(src)->getBlockOfRows(0, src->getNumberOfRows(), readOnly, srcBlock)); - DAAL_CHECK_STATUS_VAR(dst->getBlockOfRows(0, dst->getNumberOfRows(), writeOnly, dstBlock)); - } - - const size_t nRows = dst->getNumberOfRows(); - const size_t nCols = dst->getNumberOfColumns(); - const size_t nDataElements = nRows * nCols; - - auto & context = services::internal::getDefaultContext(); - services::Status status; - DAAL_ASSERT_UNIVERSAL_BUFFER(UniversalBuffer(dstBlock.getBuffer()), algorithmFPType, nDataElements); - DAAL_ASSERT_UNIVERSAL_BUFFER(UniversalBuffer(srcBlock.getBuffer()), algorithmFPType, nDataElements); - context.copy(dstBlock.getBuffer(), 0, srcBlock.getBuffer(), 0, nDataElements, status); - 
DAAL_CHECK_STATUS_VAR(status); - - DAAL_CHECK_STATUS_VAR(const_cast(src)->releaseBlockOfRows(srcBlock)); - DAAL_CHECK_STATUS_VAR(dst->releaseBlockOfRows(dstBlock)); - - return status; -} - -template -Status PCACorrelationKernelOnlineUCAPI::finalize(PartialResult * partialResult, - const OnlineParameter * parameter, - data_management::NumericTable & eigenvectors, - data_management::NumericTable & eigenvalues) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(finalize); - { - DAAL_ITTNOTIFY_SCOPED_TASK(compute.covariance.finalize); - DAAL_CHECK_STATUS_VAR(parameter->covariance->finalizeCompute()); - } - - data_management::NumericTablePtr correlation = parameter->covariance->getResult()->get(covariance::covariance); - DAAL_ASSERT(correlation); - DAAL_CHECK_STATUS_VAR(_host_impl->computeCorrelationEigenvalues(*correlation, eigenvectors, eigenvalues)); - - return services::Status(); -} - -} // namespace internal -} // namespace pca -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/pca/pca_dense_correlation_batch_container.h b/cpp/daal/src/algorithms/pca/pca_dense_correlation_batch_container.h index 4ae9f43999c..7c09dde2ca8 100644 --- a/cpp/daal/src/algorithms/pca/pca_dense_correlation_batch_container.h +++ b/cpp/daal/src/algorithms/pca/pca_dense_correlation_batch_container.h @@ -27,8 +27,6 @@ #include "src/algorithms/kernel.h" #include "algorithms/pca/pca_batch.h" #include "src/algorithms/pca/pca_dense_correlation_batch_kernel.h" -#include "src/algorithms/pca/oneapi/pca_dense_correlation_batch_kernel_ucapi.h" -#include "services/internal/sycl/execution_context.h" using namespace daal::services::internal; @@ -43,18 +41,7 @@ namespace interface3 template BatchContainer::BatchContainer(daal::services::Environment::env * daalEnv) { - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - - if (deviceInfo.isCpu) - { - __DAAL_INITIALIZE_KERNELS(internal::PCACorrelationKernel, batch, 
algorithmFPType); - } - else - { - services::SharedPtr > hostImpl(new internal::PCACorrelationBase()); - _kernel = new internal::PCACorrelationKernelBatchUCAPI(hostImpl); - } + __DAAL_INITIALIZE_KERNELS(internal::PCACorrelationKernel, batch, algorithmFPType); } template @@ -66,9 +53,6 @@ BatchContainer::~BatchContainer() template services::Status BatchContainer::compute() { - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - Input * input = static_cast(_in); Result * result = static_cast(_res); interface3::BatchParameter * parameter = @@ -89,18 +73,9 @@ services::Status BatchContainer::compute covarianceAlgorithm->getResult()->set(covariance::mean, means); } - if (deviceInfo.isCpu) - { - __DAAL_CALL_KERNEL(env, internal::PCACorrelationKernel, __DAAL_KERNEL_ARGUMENTS(batch, algorithmFPType), compute, input->isCorrelation(), - parameter->isDeterministic, *data, covarianceAlgorithm.get(), parameter->resultsToCompute, *eigenvectors, *eigenvalues, - *means, *variances); - } - else - { - return ((internal::PCACorrelationKernelBatchUCAPI *)(_kernel)) - ->compute(input->isCorrelation(), parameter->isDeterministic, *data, covarianceAlgorithm.get(), parameter->resultsToCompute, - *eigenvectors, *eigenvalues, *means, *variances); - } + __DAAL_CALL_KERNEL(env, internal::PCACorrelationKernel, __DAAL_KERNEL_ARGUMENTS(batch, algorithmFPType), compute, input->isCorrelation(), + parameter->isDeterministic, *data, covarianceAlgorithm.get(), parameter->resultsToCompute, *eigenvectors, *eigenvalues, *means, + *variances); } } // namespace interface3 diff --git a/cpp/daal/src/algorithms/pca/pca_dense_correlation_batch_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/pca/pca_dense_correlation_batch_fpt_dispatcher.cpp index 440ce9c6db5..94570afc07c 100644 --- a/cpp/daal/src/algorithms/pca/pca_dense_correlation_batch_fpt_dispatcher.cpp +++ b/cpp/daal/src/algorithms/pca/pca_dense_correlation_batch_fpt_dispatcher.cpp @@ -27,6 +27,6 @@ 
namespace daal { namespace algorithms { -__DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL(pca::interface3::BatchContainer, batch, DAAL_FPTYPE, pca::correlationDense) +__DAAL_INSTANTIATE_DISPATCH_CONTAINER(pca::interface3::BatchContainer, batch, DAAL_FPTYPE, pca::correlationDense) } } // namespace daal diff --git a/cpp/daal/src/algorithms/pca/pca_dense_correlation_batch_kernel_ucapi_fpt.cpp b/cpp/daal/src/algorithms/pca/pca_dense_correlation_batch_kernel_ucapi_fpt.cpp deleted file mode 100644 index d21a97813fb..00000000000 --- a/cpp/daal/src/algorithms/pca/pca_dense_correlation_batch_kernel_ucapi_fpt.cpp +++ /dev/null @@ -1,39 +0,0 @@ -/* file: pca_dense_correlation_batch_kernel_ucapi_fpt.cpp */ -/******************************************************************************* -* Copyright 2019 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of PCA Batch Kernel for GPU. 
-//-- -*/ - -#include "src/algorithms/pca/oneapi/pca_dense_correlation_batch_kernel_ucapi.h" -#include "src/algorithms/pca/oneapi/pca_dense_correlation_batch_kernel_ucapi_impl.i" - -namespace daal -{ -namespace algorithms -{ -namespace pca -{ -namespace internal -{ -template class DAAL_EXPORT PCACorrelationKernelBatchUCAPI; -} // namespace internal -} // namespace pca -} // namespace algorithms -} // namespace daal diff --git a/cpp/daal/src/algorithms/pca/pca_dense_correlation_distr_step2_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/pca/pca_dense_correlation_distr_step2_fpt_dispatcher.cpp index 552309b8cc8..a9d318b2459 100644 --- a/cpp/daal/src/algorithms/pca/pca_dense_correlation_distr_step2_fpt_dispatcher.cpp +++ b/cpp/daal/src/algorithms/pca/pca_dense_correlation_distr_step2_fpt_dispatcher.cpp @@ -29,6 +29,6 @@ namespace daal { namespace algorithms { -__DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL(pca::DistributedContainer, distributed, step2Master, DAAL_FPTYPE, pca::correlationDense) +__DAAL_INSTANTIATE_DISPATCH_CONTAINER(pca::DistributedContainer, distributed, step2Master, DAAL_FPTYPE, pca::correlationDense) } } // namespace daal diff --git a/cpp/daal/src/algorithms/pca/pca_dense_correlation_online_container.h b/cpp/daal/src/algorithms/pca/pca_dense_correlation_online_container.h index 32888e3ea25..d1bbed088e4 100644 --- a/cpp/daal/src/algorithms/pca/pca_dense_correlation_online_container.h +++ b/cpp/daal/src/algorithms/pca/pca_dense_correlation_online_container.h @@ -27,7 +27,6 @@ #include "src/algorithms/kernel.h" #include "algorithms/pca/pca_online.h" #include "src/algorithms/pca/pca_dense_correlation_online_kernel.h" -#include "src/algorithms/pca/oneapi/pca_dense_correlation_online_kernel_ucapi.h" namespace daal { @@ -38,18 +37,7 @@ namespace pca template OnlineContainer::OnlineContainer(daal::services::Environment::env * daalEnv) { - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - - if 
(deviceInfo.isCpu) - { - __DAAL_INITIALIZE_KERNELS(internal::PCACorrelationKernel, online, algorithmFPType); - } - else - { - services::SharedPtr > hostImpl(new internal::PCACorrelationBase()); - _kernel = new internal::PCACorrelationKernelOnlineUCAPI(hostImpl); - } + __DAAL_INITIALIZE_KERNELS(internal::PCACorrelationKernel, online, algorithmFPType); } template @@ -68,18 +56,8 @@ services::Status OnlineContainer::comput data_management::NumericTablePtr data = input->get(pca::data); - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - - if (deviceInfo.isCpu) - { - __DAAL_CALL_KERNEL(env, internal::PCACorrelationKernel, __DAAL_KERNEL_ARGUMENTS(online, algorithmFPType), compute, data, partialResult, - parameter); - } - else - { - return ((internal::PCACorrelationKernelOnlineUCAPI *)(_kernel))->compute(data, partialResult, parameter); - } + __DAAL_CALL_KERNEL(env, internal::PCACorrelationKernel, __DAAL_KERNEL_ARGUMENTS(online, algorithmFPType), compute, data, partialResult, + parameter); } template @@ -93,19 +71,8 @@ services::Status OnlineContainer::finali data_management::NumericTablePtr eigenvalues = result->get(pca::eigenvalues); data_management::NumericTablePtr eigenvectors = result->get(pca::eigenvectors); - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - - if (deviceInfo.isCpu) - { - __DAAL_CALL_KERNEL(env, internal::PCACorrelationKernel, __DAAL_KERNEL_ARGUMENTS(online, algorithmFPType), finalize, partialResult, parameter, - *eigenvectors, *eigenvalues); - } - else - { - return ((internal::PCACorrelationKernelOnlineUCAPI *)(_kernel)) - ->finalize(partialResult, parameter, *eigenvectors, *eigenvalues); - } + __DAAL_CALL_KERNEL(env, internal::PCACorrelationKernel, __DAAL_KERNEL_ARGUMENTS(online, algorithmFPType), finalize, partialResult, parameter, + *eigenvectors, *eigenvalues); } } // namespace pca diff --git 
a/cpp/daal/src/algorithms/pca/pca_dense_correlation_online_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/pca/pca_dense_correlation_online_fpt_dispatcher.cpp index 1e03555cb21..abf5078c8e1 100644 --- a/cpp/daal/src/algorithms/pca/pca_dense_correlation_online_fpt_dispatcher.cpp +++ b/cpp/daal/src/algorithms/pca/pca_dense_correlation_online_fpt_dispatcher.cpp @@ -29,6 +29,6 @@ namespace daal { namespace algorithms { -__DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL(pca::OnlineContainer, online, DAAL_FPTYPE, pca::correlationDense) +__DAAL_INSTANTIATE_DISPATCH_CONTAINER(pca::OnlineContainer, online, DAAL_FPTYPE, pca::correlationDense) } } // namespace daal diff --git a/cpp/daal/src/algorithms/pca/pca_dense_correlation_online_kernel_ucapi_fpt.cpp b/cpp/daal/src/algorithms/pca/pca_dense_correlation_online_kernel_ucapi_fpt.cpp deleted file mode 100644 index 9c1a585bb41..00000000000 --- a/cpp/daal/src/algorithms/pca/pca_dense_correlation_online_kernel_ucapi_fpt.cpp +++ /dev/null @@ -1,39 +0,0 @@ -/* file: pca_dense_correlation_online_kernel_ucapi_fpt.cpp */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of PCA Online Kernel for GPU. 
-//-- -*/ - -#include "src/algorithms/pca/oneapi/pca_dense_correlation_online_kernel_ucapi.h" -#include "src/algorithms/pca/oneapi/pca_dense_correlation_online_kernel_ucapi_impl.i" - -namespace daal -{ -namespace algorithms -{ -namespace pca -{ -namespace internal -{ -template class PCACorrelationKernelOnlineUCAPI; -} // namespace internal -} // namespace pca -} // namespace algorithms -} // namespace daal diff --git a/cpp/daal/src/algorithms/pca/pca_partialresult_correlation.h b/cpp/daal/src/algorithms/pca/pca_partialresult_correlation.h index 9d3052175de..dfb7afeac99 100644 --- a/cpp/daal/src/algorithms/pca/pca_partialresult_correlation.h +++ b/cpp/daal/src/algorithms/pca/pca_partialresult_correlation.h @@ -25,7 +25,7 @@ #define __PCA_PARTIALRESULT_CORRELATION_ #include "algorithms/pca/pca_types.h" -#include "data_management/data/internal/numeric_table_sycl_homogen.h" +#include "data_management/data/homogen_numeric_table.h" namespace daal { @@ -33,7 +33,6 @@ namespace algorithms { namespace pca { -using daal::data_management::internal::SyclHomogenNumericTable; /** * Allocates memory to store partial results of the PCA SVD algorithm @@ -47,29 +46,14 @@ DAAL_EXPORT services::Status PartialResult::allocate(const daa { services::Status s; - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - - if (deviceInfo.isCpu) - { - set(nObservationsCorrelation, - data_management::HomogenNumericTable::create(1, 1, data_management::NumericTableIface::doAllocate, 0, &s)); - set(sumCorrelation, data_management::HomogenNumericTable::create((static_cast(input))->getNFeatures(), 1, - data_management::NumericTableIface::doAllocate, 0, &s)); - set(crossProductCorrelation, - data_management::HomogenNumericTable::create((static_cast(input))->getNFeatures(), - (static_cast(input))->getNFeatures(), - data_management::NumericTableIface::doAllocate, 0, &s)); - } - else - { - set(nObservationsCorrelation, SyclHomogenNumericTable::create(1, 
1, data_management::NumericTableIface::doAllocate, 0, &s)); - set(sumCorrelation, SyclHomogenNumericTable::create((static_cast(input))->getNFeatures(), 1, - data_management::NumericTableIface::doAllocate, 0, &s)); - set(crossProductCorrelation, SyclHomogenNumericTable::create((static_cast(input))->getNFeatures(), - (static_cast(input))->getNFeatures(), + set(nObservationsCorrelation, + data_management::HomogenNumericTable::create(1, 1, data_management::NumericTableIface::doAllocate, 0, &s)); + set(sumCorrelation, data_management::HomogenNumericTable::create((static_cast(input))->getNFeatures(), 1, data_management::NumericTableIface::doAllocate, 0, &s)); - } + set(crossProductCorrelation, + data_management::HomogenNumericTable::create((static_cast(input))->getNFeatures(), + (static_cast(input))->getNFeatures(), + data_management::NumericTableIface::doAllocate, 0, &s)); return s; }; diff --git a/cpp/daal/src/algorithms/pca/pca_result_impl_fpt.cpp b/cpp/daal/src/algorithms/pca/pca_result_impl_fpt.cpp index 627025e5af6..1f74533be0c 100644 --- a/cpp/daal/src/algorithms/pca/pca_result_impl_fpt.cpp +++ b/cpp/daal/src/algorithms/pca/pca_result_impl_fpt.cpp @@ -22,7 +22,7 @@ */ #include "src/algorithms/pca/pca_result_impl.h" -#include "data_management/data/internal/numeric_table_sycl_homogen.h" +#include "data_management/data/homogen_numeric_table.h" namespace daal { @@ -74,18 +74,7 @@ services::Status ResultImpl::allocate(const daal::algorithms::PartialResult * pa template services::Status ResultImpl::allocate(size_t nFeatures, size_t nComponents, DAAL_UINT64 resultsToCompute) { - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - - if (deviceInfo.isCpu) - { - return __allocate__impl >(nFeatures, nComponents, resultsToCompute); - } - else - { - return __allocate__impl >(nFeatures, nComponents, - resultsToCompute); - } + return __allocate__impl >(nFeatures, nComponents, resultsToCompute); } template diff --git 
a/cpp/daal/src/algorithms/pca/transform/BUILD b/cpp/daal/src/algorithms/pca/transform/BUILD index 9f64eecd83b..9810bc00230 100644 --- a/cpp/daal/src/algorithms/pca/transform/BUILD +++ b/cpp/daal/src/algorithms/pca/transform/BUILD @@ -4,9 +4,8 @@ load("@onedal//dev/bazel:daal.bzl", "daal_module") daal_module( name = "kernel", auto = True, - opencl = True, deps = [ "@onedal//cpp/daal:core", - "@onedal//cpp/daal:sycl", + "@onedal//cpp/daal:engines", ], ) diff --git a/cpp/daal/src/algorithms/pca/transform/oneapi/cl_kernels/pca_transform_cl_kernels.cl b/cpp/daal/src/algorithms/pca/transform/oneapi/cl_kernels/pca_transform_cl_kernels.cl deleted file mode 100644 index 65f600948ff..00000000000 --- a/cpp/daal/src/algorithms/pca/transform/oneapi/cl_kernels/pca_transform_cl_kernels.cl +++ /dev/null @@ -1,96 +0,0 @@ -/* file: pca_transform_cl_kernels.cl */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of PCA transform OpenCL kernels. 
-//-- -*/ - -#ifndef __PCA_TRANSFORM_CL_KERNELS_CL__ -#define __PCA_TRANSFORM_CL_KERNELS_CL__ - -#include - -#define DECLARE_SOURCE(name, src) static const char * name = #src; - -DECLARE_SOURCE( - pca_transform_cl_kernels, - - __kernel void computeInvSigmas(__global const algorithmFPType * rawVariances, __global algorithmFPType * invSigmas) { - const unsigned int tid = get_global_id(0); - const algorithmFPType epsilon = 1e-10; - - /*Case when rawVariances[tid] < 0 is handled inside compute method*/ - if (rawVariances[tid] > epsilon) - { - invSigmas[tid] = (algorithmFPType)1 / (algorithmFPType)sqrt(rawVariances[tid]); - } - else - { - invSigmas[tid] = (algorithmFPType)0; - } - } - - __kernel void normalize(__global algorithmFPType * copyBlock, __global const algorithmFPType * rawMeans, - __global const algorithmFPType * invSigmas, const char hasMeans, const char hasInvSigmas, const uint maxWorkItemsPerGroup, - const uint numFeatures) { - const unsigned int glid = get_global_id(0); - const unsigned int numWorkItemsPerGroup = get_local_size(0); - const unsigned int numVec = get_num_groups(0); - - uint numOfDataItemsProcessedByWI = numFeatures / maxWorkItemsPerGroup; - - for (uint i = 0; i < numOfDataItemsProcessedByWI + 1; i++) - { - const int dataId = glid + numVec * numWorkItemsPerGroup * i; - const int meansId = dataId % numFeatures; - if (dataId < numFeatures * numVec) - { - if (hasMeans) - { - copyBlock[dataId] = copyBlock[dataId] - rawMeans[meansId]; - } - if (hasInvSigmas) - { - copyBlock[dataId] = copyBlock[dataId] * invSigmas[meansId]; - } - } - } - } - - __kernel void whitening(__global algorithmFPType * transformedBlock, __global const algorithmFPType * invEigenValues, - const uint maxWorkItemsPerGroup, const uint numComponents) { - const int glid = get_global_id(0); - const int numWorkItemsPerGroup = get_local_size(0); - const int numVec = get_num_groups(0); - - uint numOfDataItemsProcessedByWI = numComponents / maxWorkItemsPerGroup; - for (uint i = 0; i < 
numOfDataItemsProcessedByWI + 1; i++) - { - const int dataId = glid + numVec * numWorkItemsPerGroup * i; - const int eigValId = dataId % numComponents; - if (dataId < numComponents * numVec) - { - transformedBlock[dataId] = transformedBlock[dataId] * invEigenValues[eigValId]; - } - } - } - -); - -#endif diff --git a/cpp/daal/src/algorithms/pca/transform/oneapi/pca_transform_dense_default_batch_oneapi.h b/cpp/daal/src/algorithms/pca/transform/oneapi/pca_transform_dense_default_batch_oneapi.h deleted file mode 100644 index ac44e9cb432..00000000000 --- a/cpp/daal/src/algorithms/pca/transform/oneapi/pca_transform_dense_default_batch_oneapi.h +++ /dev/null @@ -1,103 +0,0 @@ -/* file: pca_transform_dense_default_batch_oneapi.h */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -/* -//++ -// Common functions of PCA transformation on GPU -//-- -*/ - -#ifndef __PCA_TRANSFORM_DENSE_DEFAULT_BATCH_ONEAPI_H__ -#define __PCA_TRANSFORM_DENSE_DEFAULT_BATCH_ONEAPI_H__ - -#include "services/internal/sycl/types.h" -#include "services/internal/sycl/execution_context.h" -#include "algorithms/pca/pca_types.h" -#include "algorithms/pca/transform/pca_transform_types.h" -#include "src/sycl/blas_gpu.h" - -namespace daal -{ -namespace algorithms -{ -namespace pca -{ -namespace transform -{ -namespace oneapi -{ -namespace internal -{ -template -class TransformKernelOneAPI : public Kernel -{ -public: - services::Status compute(data_management::NumericTable & data, data_management::NumericTable & eigenvectors, - data_management::NumericTable * pMeans, data_management::NumericTable * pVariances, - data_management::NumericTable * pEigenvalues, data_management::NumericTable & transformedData); - - services::Status computeTransformedBlock(uint32_t numRows, uint32_t numFeatures, uint32_t numComponents, - daal::services::internal::sycl::UniversalBuffer & dataBlock, - const services::internal::Buffer & eigenvectors, - const services::internal::Buffer & resultBlock); - -private: - services::Status allocateBuffer(daal::services::internal::sycl::ExecutionContextIface & context, - daal::services::internal::sycl::UniversalBuffer & returnBuffer, uint32_t bufferSize); - - services::Status copyBuffer(daal::services::internal::sycl::ExecutionContextIface & context, - daal::services::internal::sycl::UniversalBuffer & returnBuffer, data_management::NumericTable & data, - const uint32_t nRows, const uint32_t nCols); - - services::Status buildKernel(daal::services::internal::sycl::ExecutionContextIface & context, - daal::services::internal::sycl::ClKernelFactoryIface & factory); - - services::Status checkVariances(data_management::NumericTable & pVariances, uint32_t numRows); - - services::Status 
computeInvSigmas(daal::services::internal::sycl::ExecutionContextIface & context, data_management::NumericTable * variances, - const services::internal::Buffer & invSigmas, const uint32_t numFeatures); - - services::Status normalize(daal::services::internal::sycl::ExecutionContextIface & context, - daal::services::internal::sycl::UniversalBuffer & copyBlock, - daal::services::internal::sycl::UniversalBuffer & rawMeans, - daal::services::internal::sycl::UniversalBuffer & invSigmas, bool hasMeans, bool hasInvSigmas, - const uint32_t numFeatures, const uint32_t numVectors); - - services::Status whitening(daal::services::internal::sycl::ExecutionContextIface & context, - const services::internal::Buffer & transformedBlock, - daal::services::internal::sycl::UniversalBuffer & invEigenvalues, const uint32_t numComponents, - const uint32_t numVectors); - - services::Status initBuffers(daal::services::internal::sycl::ExecutionContextIface & ctx, data_management::NumericTable & data, - const uint32_t numFeatures, const uint32_t numComponents, const uint32_t numVectors); - -private: - const uint32_t maxWorkItemsPerGroup = 256; - daal::services::internal::sycl::UniversalBuffer invSigmas; - daal::services::internal::sycl::UniversalBuffer invEigenvalues; - daal::services::internal::sycl::UniversalBuffer rawMeans; - daal::services::internal::sycl::UniversalBuffer copyBlock; -}; - -} // namespace internal -} // namespace oneapi -} // namespace transform -} // namespace pca -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/pca/transform/oneapi/pca_transform_dense_default_batch_oneapi_impl.i b/cpp/daal/src/algorithms/pca/transform/oneapi/pca_transform_dense_default_batch_oneapi_impl.i deleted file mode 100644 index c26e8fa53b6..00000000000 --- a/cpp/daal/src/algorithms/pca/transform/oneapi/pca_transform_dense_default_batch_oneapi_impl.i +++ /dev/null @@ -1,363 +0,0 @@ -/* file: pca_transform_dense_default_batch_oneapi_impl.i */ 
-/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Common functions of PCA transformation on GPU -//-- -*/ - -#ifndef __PCA_TRANSFORM_DENSE_DEFAULT_BATCH_ONEAPI_IMPL_I__ -#define __PCA_TRANSFORM_DENSE_DEFAULT_BATCH_ONEAPI_IMPL_I__ - -#include "src/externals/service_profiler.h" - -#include "src/algorithms/pca/transform/oneapi/cl_kernels/pca_transform_cl_kernels.cl" -#include "src/services/service_data_utils.h" -#include "include/services/internal/sycl/types.h" - -using namespace daal::services; -using namespace daal::services::internal::sycl; -using namespace daal::data_management; - -namespace daal -{ -namespace algorithms -{ -namespace pca -{ -namespace transform -{ -namespace oneapi -{ -namespace internal -{ -using namespace daal::services::internal::sycl; - -template -services::Status TransformKernelOneAPI::computeTransformedBlock( - const uint32_t numRows, const uint32_t nFeatures, const uint32_t numComponents, UniversalBuffer & dataBlock, - const services::internal::Buffer & eigenvectors, const services::internal::Buffer & resultBlock) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(pca.transform.compute.gemm); - - DAAL_ASSERT_UNIVERSAL_BUFFER(dataBlock, algorithmFPType, numRows * nFeatures); - DAAL_ASSERT_UNIVERSAL_BUFFER(UniversalBuffer(eigenvectors), algorithmFPType, 
numComponents * nFeatures); - DAAL_ASSERT_UNIVERSAL_BUFFER(UniversalBuffer(resultBlock), algorithmFPType, numRows * numComponents); - - return BlasGpu::xgemm(math::Layout::ColMajor, math::Transpose::Trans, math::Transpose::NoTrans, numComponents, numRows, - nFeatures, 1.0, eigenvectors, nFeatures, 0, dataBlock, nFeatures, 0, 0.0, resultBlock, numComponents, 0); -} - -template -services::Status TransformKernelOneAPI::computeInvSigmas(ExecutionContextIface & ctx, NumericTable * variances, - const services::internal::Buffer & invSigmas, - const uint32_t nFeatures) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(pca.transform.compute.computeInvSigmas); - services::Status status; - - ClKernelFactoryIface & factory = ctx.getClKernelFactory(); - DAAL_CHECK_STATUS_VAR(buildKernel(ctx, factory)); - - const char * const computeInvSigmasKernel = "computeInvSigmas"; - KernelPtr kernel = factory.getKernel(computeInvSigmasKernel, status); - DAAL_CHECK_STATUS_VAR(status); - - BlockDescriptor varBlock; - DAAL_CHECK_STATUS_VAR(variances->getBlockOfRows(0, nFeatures, readOnly, varBlock)); - - DAAL_ASSERT_UNIVERSAL_BUFFER(UniversalBuffer(varBlock.getBuffer()), algorithmFPType, nFeatures); - DAAL_ASSERT_UNIVERSAL_BUFFER(UniversalBuffer(invSigmas), algorithmFPType, nFeatures); - - KernelArguments args(2, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, varBlock.getBuffer(), AccessModeIds::read); - args.set(1, invSigmas, AccessModeIds::write); - KernelRange range(nFeatures); - ctx.run(range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_CHECK_STATUS_VAR(variances->releaseBlockOfRows(varBlock)); - - return status; -} - -template -services::Status TransformKernelOneAPI::normalize(ExecutionContextIface & ctx, UniversalBuffer & copyBlock, - UniversalBuffer & rawMeans, UniversalBuffer & invSigmas, bool hasMeans, - bool hasInvSigmas, const uint32_t nFeatures, const uint32_t nVectors) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(pca.transform.compute.normalize); - services::Status status; - - 
ClKernelFactoryIface & factory = ctx.getClKernelFactory(); - DAAL_CHECK_STATUS_VAR(buildKernel(ctx, factory)); - - const char * const normalizeKernel = "normalize"; - KernelPtr kernel = factory.getKernel(normalizeKernel, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_ASSERT_UNIVERSAL_BUFFER(copyBlock, algorithmFPType, nVectors * nFeatures); - DAAL_ASSERT_UNIVERSAL_BUFFER(rawMeans, algorithmFPType, nFeatures); - DAAL_ASSERT_UNIVERSAL_BUFFER(invSigmas, algorithmFPType, nFeatures); - - const uint32_t workItemsPerGroup = (nFeatures > maxWorkItemsPerGroup) ? maxWorkItemsPerGroup : nFeatures; - DAAL_ASSERT(workItemsPerGroup != 0); - KernelArguments args(7, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, copyBlock, AccessModeIds::readwrite); - args.set(1, rawMeans, AccessModeIds::read); - args.set(2, invSigmas, AccessModeIds::read); - args.set(3, static_cast(hasMeans)); - args.set(4, static_cast(hasInvSigmas)); - args.set(5, maxWorkItemsPerGroup); - args.set(6, nFeatures); - - KernelRange local_range(workItemsPerGroup); - KernelRange global_range(workItemsPerGroup * nVectors); - KernelNDRange range(1); - range.global(global_range, status); - DAAL_CHECK_STATUS_VAR(status); - range.local(local_range, status); - DAAL_CHECK_STATUS_VAR(status); - - ctx.run(range, kernel, args, status); - - return status; -} - -template -services::Status TransformKernelOneAPI::whitening(ExecutionContextIface & ctx, - const services::internal::Buffer & transformedBlock, - UniversalBuffer & invEigenvalues, const uint32_t numComponents, - const uint32_t numVectors) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(pca.transform.compute.whitening); - services::Status status; - - ClKernelFactoryIface & factory = ctx.getClKernelFactory(); - DAAL_CHECK_STATUS_VAR(buildKernel(ctx, factory)); - - const char * const whiteningKernel = "whitening"; - KernelPtr kernel = factory.getKernel(whiteningKernel, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_ASSERT_UNIVERSAL_BUFFER(UniversalBuffer(transformedBlock), 
algorithmFPType, numVectors * numComponents); - DAAL_ASSERT_UNIVERSAL_BUFFER(invEigenvalues, algorithmFPType, numComponents); - - const uint32_t workItemsPerGroup = (numComponents > maxWorkItemsPerGroup) ? maxWorkItemsPerGroup : numComponents; - KernelArguments args(4, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, transformedBlock, AccessModeIds::readwrite); - args.set(1, invEigenvalues, AccessModeIds::read); - args.set(2, maxWorkItemsPerGroup); - args.set(3, numComponents); - - KernelRange local_range(workItemsPerGroup); - KernelRange global_range(workItemsPerGroup * numVectors); - - KernelNDRange range(1); - range.global(global_range, status); - DAAL_CHECK_STATUS_VAR(status); - range.local(local_range, status); - DAAL_CHECK_STATUS_VAR(status); - - ctx.run(range, kernel, args, status); - - return status; -} - -template -services::Status TransformKernelOneAPI::allocateBuffer(ExecutionContextIface & ctx, UniversalBuffer & returnBuffer, - uint32_t bufferSize) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(pca.transform.compute.allocateBuffer); - services::Status status; - - const algorithmFPType zero = 0.0; - returnBuffer = ctx.allocate(TypeIds::id(), bufferSize, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_ASSERT_UNIVERSAL_BUFFER(returnBuffer, algorithmFPType, bufferSize); - ctx.fill(returnBuffer, zero, status); - DAAL_CHECK_STATUS_VAR(status); - - return status; -} - -template -services::Status TransformKernelOneAPI::copyBuffer(ExecutionContextIface & ctx, UniversalBuffer & returnBuffer, - NumericTable & data, uint32_t nRows, uint32_t nCols) - -{ - DAAL_ITTNOTIFY_SCOPED_TASK(pca.transform.compute.copyBuffer); - services::Status status; - - BlockDescriptor dataBlock; - DAAL_CHECK_STATUS(status, data.getBlockOfRows(0, nRows, ReadWriteMode::readOnly, dataBlock)); - - DAAL_ASSERT_UNIVERSAL_BUFFER(returnBuffer, algorithmFPType, nRows * nCols); - DAAL_ASSERT_UNIVERSAL_BUFFER(UniversalBuffer(dataBlock.getBuffer()), algorithmFPType, nRows * nCols); - 
ctx.copy(returnBuffer, 0, dataBlock.getBuffer(), 0, nRows * nCols, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_CHECK_STATUS(status, data.releaseBlockOfRows(dataBlock)); - - return status; -} - -template -services::Status TransformKernelOneAPI::checkVariances(NumericTable & pVariances, uint32_t numRows) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(pca.transform.compute.checkVariances); - services::Status status; - - BlockDescriptor varBlock; - DAAL_CHECK_STATUS(status, pVariances.getBlockOfRows(0, numRows, ReadWriteMode::readOnly, varBlock)); - for (uint32_t i = 0; i < numRows; i++) - { - if (varBlock.getBlockPtr()[i] < 0) - { - status |= status.add(ErrorIncorrectOptionalInput); - } - } - DAAL_CHECK_STATUS(status, pVariances.releaseBlockOfRows(varBlock)); - - return status; -} - -template -services::Status TransformKernelOneAPI::buildKernel(ExecutionContextIface & ctx, ClKernelFactoryIface & factory) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(pca.transform.compute.buildKernel); - services::Status status; - - auto fptype_name = services::internal::sycl::getKeyFPType(); - auto build_options = fptype_name; - - const services::String options = getKeyFPType(); - services::String cachekey("__daal_algorithms_pca_transform"); - cachekey.add(fptype_name); - factory.build(ExecutionTargetIds::device, cachekey.c_str(), pca_transform_cl_kernels, build_options.c_str(), status); - DAAL_CHECK_STATUS_VAR(status); - - return status; -} - -template -services::Status TransformKernelOneAPI::initBuffers(ExecutionContextIface & ctx, NumericTable & data, - const uint32_t numFeatures, const uint32_t numComponents, - const uint32_t numVectors) -{ - services::Status status; - - DAAL_CHECK_STATUS(status, allocateBuffer(ctx, invSigmas, numFeatures)); - DAAL_CHECK_STATUS(status, allocateBuffer(ctx, invEigenvalues, numComponents)); - DAAL_CHECK_STATUS(status, allocateBuffer(ctx, rawMeans, numFeatures)); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(uint32_t, numVectors, numFeatures); - copyBlock = 
ctx.allocate(TypeIds::id(), numVectors * numFeatures, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_CHECK_STATUS(status, copyBuffer(ctx, copyBlock, data, numVectors, numFeatures)); - - return status; -} - -template -services::Status TransformKernelOneAPI::compute(NumericTable & data, NumericTable & eigenvectors, NumericTable * pMeans, - NumericTable * pVariances, NumericTable * pEigenvalues, - NumericTable & transformedData) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(pca.transform.compute); - services::Status status; - ExecutionContextIface & ctx = services::internal::getDefaultContext(); - - if (data.getNumberOfRows() > static_cast(services::internal::MaxVal::get())) - { - return services::Status(daal::services::ErrorCovarianceInternal); - } - if (data.getNumberOfColumns() > static_cast(services::internal::MaxVal::get())) - { - return services::Status(daal::services::ErrorCovarianceInternal); - } - if (transformedData.getNumberOfColumns() > static_cast(services::internal::MaxVal::get())) - { - return services::Status(daal::services::ErrorCovarianceInternal); - } - - const uint32_t numVectors = static_cast(data.getNumberOfRows()); - const uint32_t numFeatures = static_cast(data.getNumberOfColumns()); - const uint32_t numComponents = static_cast(transformedData.getNumberOfColumns()); - - DAAL_CHECK_STATUS(status, initBuffers(ctx, data, numFeatures, numComponents, numVectors)); - - bool hasInvSigmas = false; - if (pVariances != nullptr) - { - hasInvSigmas = true; - DAAL_CHECK_STATUS(status, checkVariances(*pVariances, numFeatures)); - DAAL_CHECK_STATUS(status, computeInvSigmas(ctx, pVariances, invSigmas.template get(), numFeatures)); - } - - if (pEigenvalues != nullptr) - { - DAAL_CHECK_STATUS(status, computeInvSigmas(ctx, pEigenvalues, invEigenvalues.template get(), numComponents)); - } - - bool hasMeans = false; - if (pMeans != nullptr) - { - hasMeans = true; - DAAL_CHECK_STATUS(status, copyBuffer(ctx, rawMeans, *pMeans, numFeatures, 1)); - } - - bool isWhitening = 
pEigenvalues != nullptr; - bool isNormalize = pMeans != nullptr || pVariances != nullptr; - - if (isNormalize) - { - DAAL_CHECK_STATUS(status, normalize(ctx, copyBlock, rawMeans, invSigmas, hasMeans, hasInvSigmas, numFeatures, numVectors)); - } - - BlockDescriptor transformedBlock; - DAAL_CHECK_STATUS(status, transformedData.getBlockOfRows(0, transformedData.getNumberOfRows(), ReadWriteMode::readWrite, transformedBlock)); - - BlockDescriptor basis; - DAAL_CHECK_STATUS(status, eigenvectors.getBlockOfRows(0, numComponents, ReadWriteMode::readOnly, basis)); - - DAAL_CHECK_STATUS(status, - computeTransformedBlock(numVectors, numFeatures, numComponents, copyBlock, basis.getBuffer(), transformedBlock.getBuffer())); - - if (isWhitening) - { - DAAL_CHECK_STATUS(status, whitening(ctx, transformedBlock.getBuffer(), invEigenvalues, numComponents, numVectors)); - } - DAAL_CHECK_STATUS(status, transformedData.releaseBlockOfRows(transformedBlock)); - DAAL_CHECK_STATUS(status, eigenvectors.releaseBlockOfRows(basis)); - - return status; -} - -} /* namespace internal */ -} /* namespace oneapi */ -} /* namespace transform */ -} /* namespace pca */ -} /* namespace algorithms */ -} /* namespace daal */ - -#endif diff --git a/cpp/daal/src/algorithms/pca/transform/pca_transform_batch_fpt.cpp b/cpp/daal/src/algorithms/pca/transform/pca_transform_batch_fpt.cpp index 5e336fd0b1c..57c66fd4853 100644 --- a/cpp/daal/src/algorithms/pca/transform/pca_transform_batch_fpt.cpp +++ b/cpp/daal/src/algorithms/pca/transform/pca_transform_batch_fpt.cpp @@ -23,7 +23,6 @@ #include "algorithms/pca/transform/pca_transform_types.h" #include "data_management/data/homogen_numeric_table.h" -#include "data_management/data/internal/numeric_table_sycl_homogen.h" #include "src/services/daal_strings.h" using namespace daal::services; @@ -38,7 +37,6 @@ namespace transform { using namespace daal::services; using namespace daal::data_management; -using daal::data_management::internal::SyclHomogenNumericTable; template 
DAAL_EXPORT Status Result::allocate(const daal::algorithms::Input * input, const daal::algorithms::Parameter * par, const int method) @@ -56,19 +54,10 @@ DAAL_EXPORT Status Result::allocate(const daal::algorithms::Input * input, const services::Status status; - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - NumericTablePtr transformedDataNT; - if (deviceInfo.isCpu) - { - transformedDataNT = HomogenNumericTable::create(nComponents, nInputs, NumericTable::doAllocate, &status); - } - else - { - transformedDataNT = SyclHomogenNumericTable::create(nComponents, nInputs, NumericTable::doAllocate, &status); - } + transformedDataNT = HomogenNumericTable::create(nComponents, nInputs, NumericTable::doAllocate, &status); + DAAL_CHECK_STATUS_VAR(status); set(transformedData, transformedDataNT); diff --git a/cpp/daal/src/algorithms/pca/transform/pca_transform_container.h b/cpp/daal/src/algorithms/pca/transform/pca_transform_container.h index 294b72ce945..f94eba0514f 100644 --- a/cpp/daal/src/algorithms/pca/transform/pca_transform_container.h +++ b/cpp/daal/src/algorithms/pca/transform/pca_transform_container.h @@ -27,8 +27,6 @@ #define __PCA_TRANSFORM_CONTAINER_H__ #include "src/algorithms/pca/transform/pca_transform_kernel.h" -#include "services/internal/execution_context.h" -#include "src/algorithms/pca/transform/oneapi/pca_transform_dense_default_batch_oneapi.h" namespace daal { @@ -41,17 +39,7 @@ namespace transform template BatchContainer::BatchContainer(daal::services::Environment::env * daalEnv) : AnalysisContainerIface(daalEnv) { - auto & context = daal::services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - - if (deviceInfo.isCpu) - { - __DAAL_INITIALIZE_KERNELS(internal::TransformKernel, algorithmFPType, method); - } - else - { - __DAAL_INITIALIZE_KERNELS_SYCL(oneapi::internal::TransformKernelOneAPI, algorithmFPType, method); - } + 
__DAAL_INITIALIZE_KERNELS(internal::TransformKernel, algorithmFPType, method); } template @@ -73,19 +61,8 @@ services::Status BatchContainer::compute() daal::services::Environment::env & env = *_env; - auto & context = daal::services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - - if (deviceInfo.isCpu) - { - __DAAL_CALL_KERNEL(env, internal::TransformKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, *(input->get(data)), - *(input->get(eigenvectors)), pMeans, pVariances, pEigenvalues, *(result->get(transformedData))); - } - else - { - __DAAL_CALL_KERNEL_SYCL(env, oneapi::internal::TransformKernelOneAPI, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, - *(input->get(data)), *(input->get(eigenvectors)), pMeans, pVariances, pEigenvalues, *(result->get(transformedData))); - } + __DAAL_CALL_KERNEL(env, internal::TransformKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, *(input->get(data)), + *(input->get(eigenvectors)), pMeans, pVariances, pEigenvalues, *(result->get(transformedData))); } } // namespace transform diff --git a/cpp/daal/src/algorithms/pca/transform/pca_transform_dense_default_batch_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/pca/transform/pca_transform_dense_default_batch_fpt_dispatcher.cpp index 81133ae3c82..2761a7a5842 100644 --- a/cpp/daal/src/algorithms/pca/transform/pca_transform_dense_default_batch_fpt_dispatcher.cpp +++ b/cpp/daal/src/algorithms/pca/transform/pca_transform_dense_default_batch_fpt_dispatcher.cpp @@ -29,6 +29,6 @@ namespace daal { namespace algorithms { -__DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL(pca::transform::BatchContainer, batch, DAAL_FPTYPE, pca::transform::defaultDense) +__DAAL_INSTANTIATE_DISPATCH_CONTAINER(pca::transform::BatchContainer, batch, DAAL_FPTYPE, pca::transform::defaultDense) } } // namespace daal diff --git a/cpp/daal/src/algorithms/pca/transform/pca_transform_dense_default_batch_oneapi_fpt.cpp 
b/cpp/daal/src/algorithms/pca/transform/pca_transform_dense_default_batch_oneapi_fpt.cpp deleted file mode 100644 index 53e92f49116..00000000000 --- a/cpp/daal/src/algorithms/pca/transform/pca_transform_dense_default_batch_oneapi_fpt.cpp +++ /dev/null @@ -1,46 +0,0 @@ -/* file: pca_transform_dense_default_batch_oneapi_fpt.cpp */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of pca transform kernel. 
-//-- -*/ - -#include "src/algorithms/pca/transform/pca_transform_container.h" -#include "src/algorithms/pca/transform/oneapi/pca_transform_dense_default_batch_oneapi_impl.i" - -namespace daal -{ -namespace algorithms -{ -namespace pca -{ -namespace transform -{ -namespace oneapi -{ -namespace internal -{ -template class DAAL_EXPORT TransformKernelOneAPI; - -} // namespace internal -} // namespace oneapi -} // namespace transform -} // namespace pca -} // namespace algorithms -} // namespace daal diff --git a/cpp/daal/src/algorithms/svm/BUILD b/cpp/daal/src/algorithms/svm/BUILD index 307a44c195a..31bf41bc8dd 100644 --- a/cpp/daal/src/algorithms/svm/BUILD +++ b/cpp/daal/src/algorithms/svm/BUILD @@ -4,10 +4,9 @@ load("@onedal//dev/bazel:daal.bzl", "daal_module") daal_module( name = "kernel", auto = True, - opencl = True, deps = [ "@onedal//cpp/daal:core", - "@onedal//cpp/daal:sycl", + "@onedal//cpp/daal:engines", "@onedal//cpp/daal/src/algorithms/classifier:kernel", ], ) diff --git a/cpp/daal/src/algorithms/svm/oneapi/cl_kernels/svm_kernels.cl b/cpp/daal/src/algorithms/svm/oneapi/cl_kernels/svm_kernels.cl deleted file mode 100644 index 6b06094f4ea..00000000000 --- a/cpp/daal/src/algorithms/svm/oneapi/cl_kernels/svm_kernels.cl +++ /dev/null @@ -1,139 +0,0 @@ -/* file: svm_kernels.cl */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -/* -//++ -// Implementation of SVM kernels. -//-- -*/ - -#ifndef __SVM_KERNELS_CL__ -#define __SVM_KERNELS_CL__ - -#include - -#define DECLARE_SOURCE_DAAL(name, src) static const char *(name) = #src; - -DECLARE_SOURCE_DAAL( - clKernelSVM, - - __kernel void makeInversion(const __global algorithmFPType * const x, __global algorithmFPType * res) { - const uint i = get_global_id(0); - res[i] = -x[i]; - } - - __kernel void makeRange(__global uint * x) { - const uint i = get_global_id(0); - x[i] = i; - } - - __kernel void checkUpper(const __global algorithmFPType * const y, const __global algorithmFPType * const alpha, const algorithmFPType C, - __global uint * indicator) { - const uint i = get_global_id(0); - indicator[i] = (y[i] > 0 && alpha[i] < C) || (y[i] < 0 && alpha[i] > 0); - } - - __kernel void checkLower(const __global algorithmFPType * const y, const __global algorithmFPType * const alpha, const algorithmFPType C, - __global uint * indicator) { - const uint i = get_global_id(0); - indicator[i] = (y[i] > 0 && alpha[i] > 0) || (y[i] < 0 && alpha[i] < C); - } - - __kernel void checkBorder(const __global algorithmFPType * const alpha, const algorithmFPType C, __global uint * indicator) { - const uint i = get_global_id(0); - const algorithmFPType alphai = alpha[i]; - indicator[i] = 0 < alphai && alphai < C; - } - - __kernel void checkNonZeroBinary(const __global algorithmFPType * const alpha, __global uint * indicator) { - const uint i = get_global_id(0); - indicator[i] = alpha[i] != (algorithmFPType)0; - } - - __kernel void resetIndicatorWithZeros(const __global uint * const ind, __global uint * indicator) { - const uint i = get_global_id(0); - indicator[ind[i]] = 0; - } - - __kernel void copyDataByIndices(const __global algorithmFPType * const x, const __global uint * const xInd, const uint ldx, - __global algorithmFPType * newX) { - const uint index = get_global_id(1); - const 
uint jCol = get_global_id(0); - - const uint iRow = xInd[index]; - - const __global algorithmFPType * const xi = &x[iRow * ldx]; - __global algorithmFPType * newXi = &newX[index * ldx]; - - newXi[jCol] = xi[jCol]; - } - - __kernel void copyDataByIndicesInt(const __global algorithmFPType * const x, const __global int * const xInd, const uint ldx, - __global algorithmFPType * newX) { - const int index = get_global_id(1); - const int jCol = get_global_id(0); - - const int iRow = xInd[index]; - - const __global algorithmFPType * const xi = &x[iRow * ldx]; - __global algorithmFPType * newXi = &newX[index * ldx]; - - newXi[jCol] = xi[jCol]; - } - - __kernel void copyRowIndicesByIndices(const __global ulong * const rowIndex, const __global uint * const ind, __global ulong * newRowIndex, - const ulong nRows, __global ulong * dataSize) { - newRowIndex[0] = 1; - for (ulong iRow = 0; iRow < nRows; ++iRow) - { - const ulong wRow = ind[iRow]; - const ulong nNonZeroValuesInRow = rowIndex[wRow + 1] - rowIndex[wRow]; - newRowIndex[iRow + 1] = newRowIndex[iRow] + nNonZeroValuesInRow; - } - *dataSize = newRowIndex[nRows]; - } - - __kernel void copyCSRByIndices(const __global ulong * const rowIndexIn, const __global ulong * const rowIndexOut, const __global uint * const ind, - const __global algorithmFPType * const valuesIn, const __global ulong * const columnsIn, - __global algorithmFPType * valuesOut, __global ulong * columnsOut) { - const ulong iRowOut = get_global_id(0); - const ulong nNonZeroValuesInRow = rowIndexOut[iRowOut + 1] - rowIndexOut[iRowOut]; - - const ulong j = get_global_id(1); - if (j >= nNonZeroValuesInRow) - { - return; - } - - const ulong iRowIn = ind[iRowOut]; - const ulong offsetIn = rowIndexIn[iRowIn] - rowIndexIn[0]; - const ulong offsetOut = rowIndexOut[iRowOut] - rowIndexOut[0]; - - valuesOut[j + offsetOut] = valuesIn[j + offsetIn]; - columnsOut[j + offsetOut] = columnsIn[j + offsetIn]; - } - - __kernel void computeDualCoeffs(const __global algorithmFPType 
* const y, __global algorithmFPType * a) { - const uint i = get_global_id(0); - a[i] = a[i] * y[i]; - } - -); - -#undef DECLARE_SOURCE_DAAL - -#endif diff --git a/cpp/daal/src/algorithms/svm/oneapi/cl_kernels/svm_train_block_smo_oneapi.cl b/cpp/daal/src/algorithms/svm/oneapi/cl_kernels/svm_train_block_smo_oneapi.cl deleted file mode 100755 index 0cecc9b9e05..00000000000 --- a/cpp/daal/src/algorithms/svm/oneapi/cl_kernels/svm_train_block_smo_oneapi.cl +++ /dev/null @@ -1,233 +0,0 @@ -/* file: svm_train_block_smo_oneapi.cl */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of SMO algorithm for wokset block. 
-//-- -*/ - -#ifndef __SVM_TRAIN_BLOCK_SMO_ONEAPI_CL__ -#define __SVM_TRAIN_BLOCK_SMO_ONEAPI_CL__ - -#include - -#define DECLARE_SOURCE_DAAL(name, src) static const char *(name) = #src; - -DECLARE_SOURCE_DAAL( - clKernelBlockSMO, - - inline bool isUpper(const algorithmFPType alpha, const algorithmFPType y, const algorithmFPType C) { - return (y > 0 && alpha < C) || (y < 0 && alpha > 0); - } - - inline bool isLower(const algorithmFPType alpha, const algorithmFPType y, const algorithmFPType C) { - return (y > 0 && alpha > 0) || (y < 0 && alpha < C); - } - - typedef struct { - int index; - algorithmFPType value; - } KeyValue; - - void reduceArgMax(const __local algorithmFPType * values, __local KeyValue * localCache, __local KeyValue * result) { - const int localGroupId = get_sub_group_local_id(); - const int groupId = get_sub_group_id(); - const int localId = get_local_id(0); - const int groupCount = get_num_sub_groups(); - const int subGroupSize = get_sub_group_size(); - - algorithmFPType x = values[localId]; - int indX = localId; - - algorithmFPType resMax = sub_group_reduce_max(x); - int resIndex = sub_group_reduce_min(resMax == x ? indX : INT_MAX); - - if (localGroupId == 0) - { - localCache[groupId].value = resMax; - localCache[groupId].index = resIndex; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - if (groupId == 0 && localGroupId < groupCount) - { - x = localCache[localGroupId].value; - indX = localCache[localGroupId].index; - resMax = sub_group_reduce_max(x); - resIndex = sub_group_reduce_min(resMax == x ? indX : INT_MAX); - - for (int iGroup = subGroupSize; iGroup < groupCount; iGroup += subGroupSize) - { - x = localCache[iGroup + localGroupId].value; - indX = localCache[iGroup + localGroupId].index; - - const algorithmFPType innerMax = sub_group_reduce_max(x); - if (innerMax > resMax) - { - resMax = innerMax; - resIndex = sub_group_reduce_min(resMax == x ? 
indX : INT_MAX); - } - } - - if (localGroupId == 0) - { - result->value = resMax; - result->index = resIndex; - } - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - __kernel void smoKernel(const __global algorithmFPType * const y, const __global algorithmFPType * const kernelWsRows, - const __global uint * wsIndices, const uint nVectors, const __global algorithmFPType * grad, const algorithmFPType C, - const algorithmFPType eps, const algorithmFPType tau, const uint maxInnerIteration, __global algorithmFPType * alpha, - __global algorithmFPType * deltaalpha, __global algorithmFPType * resinfo) { - const uint i = get_local_id(0); - __local algorithmFPType kd[WS_SIZE]; - const uint wsIndex = wsIndices[i]; - - const algorithmFPType MIN_FLT = -FPTYPE_MAXVALUE; - - const algorithmFPType two = (algorithmFPType)2.0; - - algorithmFPType gradi = grad[wsIndex]; - algorithmFPType alphai = alpha[wsIndex]; - const algorithmFPType oldalphai = alphai; - const algorithmFPType yi = y[wsIndex]; - - __local algorithmFPType objFunc[WS_SIZE]; - - __local algorithmFPType deltaBi; - __local algorithmFPType deltaBj; - - __local KeyValue localCache[SIMD_WIDTH]; - __local KeyValue maxValInd; - - uint Bi = 0; - uint Bj = 0; - - kd[i] = kernelWsRows[i * nVectors + wsIndex]; - barrier(CLK_LOCAL_MEM_FENCE); - - __local algorithmFPType localDiff; - __local algorithmFPType localEps; - - uint iter = 0; - for (; iter < maxInnerIteration; ++iter) - { - /* m(alpha) = min(grad[i]): i belongs to I_UP (alpha) */ - objFunc[i] = isUpper(alphai, yi, C) ? -gradi : MIN_FLT; - - /* Find i index of the working set (Bi) */ - reduceArgMax(objFunc, localCache, &maxValInd); - Bi = maxValInd.index; - const algorithmFPType ma = -maxValInd.value; - - /* maxgrad(alpha) = max(grad[i]): i belongs to I_low (alpha) */ - objFunc[i] = isLower(alphai, yi, C) ? 
gradi : MIN_FLT; - - /* Find max gradinet */ - reduceArgMax(objFunc, localCache, &maxValInd); - - if (i == 0) - { - const algorithmFPType maxGrad = maxValInd.value; - - /* for condition check: m(alpha) >= maxgrad */ - localDiff = maxGrad - ma; - if (iter == 0) - { - localEps = max(eps, localDiff * (algorithmFPType)1e-1); - resinfo[1] = localDiff; - } - } - - barrier(CLK_LOCAL_MEM_FENCE); - if (localDiff < localEps) - { - break; - } - - const algorithmFPType Kii = kd[i]; - const algorithmFPType KBiBi = kd[Bi]; - const algorithmFPType KiBi = kernelWsRows[Bi * nVectors + wsIndex]; - - if (isLower(alphai, yi, C) && ma < gradi) - { - /* M(alpha) = max((b^2/a) : i belongs to I_low(alpha) and ma < grad(alpha) */ - const algorithmFPType b = ma - gradi; - const algorithmFPType a = max(Kii + KBiBi - two * KiBi, tau); - const algorithmFPType dt = b / a; - - objFunc[i] = b * dt; - } - else - { - objFunc[i] = MIN_FLT; - } - - /* Find j index of the working set (Bj) */ - reduceArgMax(objFunc, localCache, &maxValInd); - Bj = maxValInd.index; - - const algorithmFPType KiBj = kernelWsRows[Bj * nVectors + wsIndex]; - - /* Update alpha */ - if (i == Bi) - { - deltaBi = yi > 0 ? C - alphai : alphai; - } - if (i == Bj) - { - deltaBj = yi > 0 ? 
alphai : C - alphai; - const algorithmFPType b = ma - gradi; - const algorithmFPType a = max(Kii + KBiBi - two * KiBi, tau); - - const algorithmFPType dt = -b / a; - deltaBj = min(deltaBj, dt); - } - - barrier(CLK_LOCAL_MEM_FENCE); - - const algorithmFPType delta = min(deltaBi, deltaBj); - if (i == Bi) - { - alphai = alphai + yi * delta; - } - if (i == Bj) - { - alphai = alphai - yi * delta; - } - - /* Update gradient */ - gradi = gradi + delta * (KiBi - KiBj); - } - alpha[wsIndex] = alphai; - deltaalpha[i] = (alphai - oldalphai) * yi; - if (i == 0) - { - resinfo[0] = iter; - } - } - -); - -#undef DECLARE_SOURCE_DAAL - -#endif diff --git a/cpp/daal/src/algorithms/svm/oneapi/svm_helper_oneapi.h b/cpp/daal/src/algorithms/svm/oneapi/svm_helper_oneapi.h deleted file mode 100644 index 783afd83583..00000000000 --- a/cpp/daal/src/algorithms/svm/oneapi/svm_helper_oneapi.h +++ /dev/null @@ -1,465 +0,0 @@ -/* file: svm_helper_oneapi.h */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -#ifndef __SVM_HELPER_ONEAPI_H__ -#define __SVM_HELPER_ONEAPI_H__ - -#include "src/data_management/service_numeric_table.h" -#include "src/sycl/sorter.h" -#include "src/sycl/partition.h" -#include "src/externals/service_profiler.h" -#include "src/algorithms/svm/oneapi/cl_kernels/svm_kernels.cl" -#include "src/services/service_data_utils.h" - -namespace daal -{ -namespace algorithms -{ -namespace svm -{ -namespace utils -{ -namespace internal -{ -using namespace daal::services::internal; -using namespace daal::services::internal::sycl; - -template -inline const T min(const T a, const T b) -{ - return !(b < a) ? a : b; -} - -template -inline const T max(const T a, const T b) -{ - return (a < b) ? b : a; -} - -template -inline const T abs(const T & a) -{ - return a > 0 ? a : -a; -} - -inline size_t maxpow2(size_t n) -{ - if (!(n & (n - 1))) - { - return n; - } - - size_t count = 0; - while (n > 1) - { - n >>= 1; - count++; - } - return 1 << count; -} - -template -struct HelperSVM -{ - static services::Status buildProgram(ClKernelFactoryIface & factory) - { - services::String options = getKeyFPType(); - - services::String cachekey("__daal_algorithms_svm_"); - options.add(" -D LOCAL_SUM_SIZE=256 "); - cachekey.add(options); - - services::Status status; - factory.build(ExecutionTargetIds::device, cachekey.c_str(), clKernelSVM, options.c_str(), status); - return status; - } - - static services::Status makeInversion(const services::internal::Buffer & x, services::internal::Buffer & res, - const size_t n) - { - DAAL_ITTNOTIFY_SCOPED_TASK(makeInversion); - - auto & context = services::internal::getDefaultContext(); - auto & factory = context.getClKernelFactory(); - - services::Status status = buildProgram(factory); - DAAL_CHECK_STATUS_VAR(status); - - auto kernel = factory.getKernel("makeInversion", status); - DAAL_CHECK_STATUS_VAR(status); - - KernelArguments args(2, status); - 
DAAL_CHECK_STATUS_VAR(status); - - DAAL_ASSERT(x.size() == n); - DAAL_ASSERT(res.size() == n); - - args.set(0, x, AccessModeIds::read); - args.set(1, res, AccessModeIds::write); - KernelRange range(n); - - context.run(range, kernel, args, status); - return status; - } - - static services::Status makeRange(UniversalBuffer & x, const size_t n) - { - DAAL_ITTNOTIFY_SCOPED_TASK(makeRange); - - auto & context = services::internal::getDefaultContext(); - auto & factory = context.getClKernelFactory(); - - services::Status status = buildProgram(factory); - DAAL_CHECK_STATUS_VAR(status); - - auto kernel = factory.getKernel("makeRange", status); - DAAL_CHECK_STATUS_VAR(status); - - KernelArguments args(1, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, x, AccessModeIds::readwrite); - - KernelRange range(n); - - context.run(range, kernel, args, status); - return status; - } - - static services::Status argSort(const UniversalBuffer & f, UniversalBuffer & values, UniversalBuffer & valuesBuf, UniversalBuffer & indecesSort, - UniversalBuffer & indecesBuf, const size_t n) - { - services::Status status; - auto & context = services::internal::getDefaultContext(); - - context.copy(values, 0, f, 0, n, status); - DAAL_CHECK_STATUS_VAR(status); - DAAL_CHECK_STATUS(status, makeRange(indecesSort, n)); - DAAL_CHECK_STATUS(status, sort::RadixSort::sortIndices(values, indecesSort, valuesBuf, indecesBuf, n)); - return status; - } - - static services::Status copyDataByIndices(const services::internal::Buffer & x, - const services::internal::Buffer & indX, services::internal::Buffer & newX, - const size_t nWS, const size_t p) - { - DAAL_ITTNOTIFY_SCOPED_TASK(copyDataByIndices); - services::Status status; - - services::internal::sycl::ExecutionContextIface & ctx = services::internal::getDefaultContext(); - services::internal::sycl::ClKernelFactoryIface & factory = ctx.getClKernelFactory(); - - buildProgram(factory); - - const char * const kernelName = "copyDataByIndices"; - 
services::internal::sycl::KernelPtr kernel = factory.getKernel(kernelName, status); - DAAL_CHECK_STATUS_VAR(status); - - services::internal::sycl::KernelArguments args(4, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_ASSERT(indX.size() == nWS); - DAAL_ASSERT(newX.size() == nWS * p); - - args.set(0, x, services::internal::sycl::AccessModeIds::read); - args.set(1, indX, services::internal::sycl::AccessModeIds::read); - DAAL_ASSERT(p <= uint32max); - args.set(2, static_cast(p)); - args.set(3, newX, services::internal::sycl::AccessModeIds::write); - - services::internal::sycl::KernelRange range(p, nWS); - - ctx.run(range, kernel, args, status); - return status; - } - - static services::Status copyDataByIndices(const services::internal::Buffer & x, const services::internal::Buffer & indX, - services::internal::Buffer & newX, const size_t nWS, const size_t p) - { - DAAL_ITTNOTIFY_SCOPED_TASK(copyDataByIndices); - services::Status status; - - services::internal::sycl::ExecutionContextIface & ctx = services::internal::getDefaultContext(); - services::internal::sycl::ClKernelFactoryIface & factory = ctx.getClKernelFactory(); - - buildProgram(factory); - - const char * const kernelName = "copyDataByIndicesInt"; - services::internal::sycl::KernelPtr kernel = factory.getKernel(kernelName, status); - DAAL_CHECK_STATUS_VAR(status); - - services::internal::sycl::KernelArguments args(4, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_ASSERT(indX.size() == nWS); - DAAL_ASSERT(newX.size() == nWS * p); - - args.set(0, x, services::internal::sycl::AccessModeIds::read); - args.set(1, indX, services::internal::sycl::AccessModeIds::read); - DAAL_ASSERT(p <= uint32max); - args.set(2, static_cast(p)); - args.set(3, newX, services::internal::sycl::AccessModeIds::write); - services::internal::sycl::KernelRange range(p, nWS); - - ctx.run(range, kernel, args, status); - return status; - } - - static services::Status copyRowIndicesByIndices(const services::internal::Buffer & rowsIn, 
const UniversalBuffer & ind, - services::internal::Buffer & rowsOut, const size_t nWS, size_t & dataSize) - { - DAAL_ITTNOTIFY_SCOPED_TASK(copyCSRByIndices); - services::Status status; - - services::internal::sycl::ExecutionContextIface & ctx = services::internal::getDefaultContext(); - services::internal::sycl::ClKernelFactoryIface & factory = ctx.getClKernelFactory(); - - buildProgram(factory); - - const char * const kernelName = "copyRowIndicesByIndices"; - services::internal::sycl::KernelPtr kernel = factory.getKernel(kernelName, status); - DAAL_CHECK_STATUS_VAR(status); - - services::internal::sycl::KernelArguments args(5, status); - DAAL_CHECK_STATUS_VAR(status); - - auto dataSizeU = ctx.allocate(TypeIds::id(), 1, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_ASSERT(rowsOut.size() == nWS + 1); - DAAL_ASSERT(rowsIn.size() > nWS); - - args.set(0, rowsIn, services::internal::sycl::AccessModeIds::read); - args.set(1, ind, services::internal::sycl::AccessModeIds::read); - args.set(2, rowsOut, services::internal::sycl::AccessModeIds::write); - args.set(3, nWS); - args.set(4, dataSizeU); - - services::internal::sycl::KernelRange range(1); - - ctx.run(range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - - auto svValuesHosrPtr = dataSizeU.get().toHost(data_management::readOnly, status); - DAAL_CHECK_STATUS_VAR(status); - dataSize = *svValuesHosrPtr; - - return status; - } - - static services::Status copyCSRByIndices(const services::internal::Buffer & rowsIn, const services::internal::Buffer & rowsOut, - const UniversalBuffer & ind, const services::internal::Buffer & val, - const services::internal::Buffer & cols, services::internal::Buffer & valOut, - services::internal::Buffer & colsOut, const size_t nWS, const size_t p) - { - DAAL_ITTNOTIFY_SCOPED_TASK(copyCSRByIndices); - services::Status status; - - services::internal::sycl::ExecutionContextIface & ctx = services::internal::getDefaultContext(); - services::internal::sycl::ClKernelFactoryIface & 
factory = ctx.getClKernelFactory(); - - buildProgram(factory); - - const char * const kernelName = "copyCSRByIndices"; - services::internal::sycl::KernelPtr kernel = factory.getKernel(kernelName, status); - DAAL_CHECK_STATUS_VAR(status); - - services::internal::sycl::KernelArguments args(7, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_ASSERT(rowsOut.size() == nWS + 1); - DAAL_ASSERT(rowsIn.size() > nWS); - - args.set(0, rowsIn, services::internal::sycl::AccessModeIds::read); - args.set(1, rowsOut, services::internal::sycl::AccessModeIds::read); - args.set(2, ind, services::internal::sycl::AccessModeIds::read); - args.set(3, val, services::internal::sycl::AccessModeIds::read); - args.set(4, cols, services::internal::sycl::AccessModeIds::read); - args.set(5, valOut, services::internal::sycl::AccessModeIds::write); - args.set(6, colsOut, services::internal::sycl::AccessModeIds::write); - - services::internal::sycl::KernelRange range(nWS, p); - - ctx.run(range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - return status; - } - - static services::Status checkUpper(const services::internal::Buffer & y, - const services::internal::Buffer & alpha, services::internal::Buffer & indicator, - const algorithmFPType C, const size_t n) - { - DAAL_ITTNOTIFY_SCOPED_TASK(checkUpper); - - auto & context = services::internal::getDefaultContext(); - auto & factory = context.getClKernelFactory(); - - services::Status status = buildProgram(factory); - DAAL_CHECK_STATUS_VAR(status); - - auto kernel = factory.getKernel("checkUpper", status); - DAAL_CHECK_STATUS_VAR(status); - - KernelArguments args(4, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_ASSERT(y.size() == n); - DAAL_ASSERT(alpha.size() == n); - DAAL_ASSERT(indicator.size() == n); - - args.set(0, y, AccessModeIds::read); - args.set(1, alpha, AccessModeIds::read); - args.set(2, C); - args.set(3, indicator, AccessModeIds::write); - - KernelRange range(n); - - context.run(range, kernel, args, status); - return 
status; - } - - static services::Status checkLower(const services::internal::Buffer & y, - const services::internal::Buffer & alpha, services::internal::Buffer & indicator, - const algorithmFPType C, const size_t n) - { - DAAL_ITTNOTIFY_SCOPED_TASK(checkLower); - - auto & context = services::internal::getDefaultContext(); - auto & factory = context.getClKernelFactory(); - - services::Status status = buildProgram(factory); - DAAL_CHECK_STATUS_VAR(status); - - auto kernel = factory.getKernel("checkLower", status); - DAAL_CHECK_STATUS_VAR(status); - - KernelArguments args(4, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_ASSERT(y.size() == n); - DAAL_ASSERT(alpha.size() == n); - DAAL_ASSERT(indicator.size() == n); - - args.set(0, y, AccessModeIds::read); - args.set(1, alpha, AccessModeIds::read); - args.set(2, C); - args.set(3, indicator, AccessModeIds::write); - - KernelRange range(n); - - context.run(range, kernel, args, status); - return status; - } - - static services::Status checkBorder(const services::internal::Buffer & alpha, services::internal::Buffer & mask, - const algorithmFPType C, const size_t n) - { - DAAL_ITTNOTIFY_SCOPED_TASK(checkBorder); - - auto & context = services::internal::getDefaultContext(); - auto & factory = context.getClKernelFactory(); - - services::Status status = buildProgram(factory); - DAAL_CHECK_STATUS_VAR(status); - - auto kernel = factory.getKernel("checkBorder", status); - DAAL_CHECK_STATUS_VAR(status); - - KernelArguments args(3, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_ASSERT(alpha.size() == n); - DAAL_ASSERT(mask.size() == n); - - args.set(0, alpha, AccessModeIds::read); - args.set(1, C); - args.set(2, mask, AccessModeIds::write); - - KernelRange range(n); - - context.run(range, kernel, args, status); - return status; - } - - static services::Status checkNonZeroBinary(const services::internal::Buffer & alpha, services::internal::Buffer & mask, - const size_t n) - { - DAAL_ITTNOTIFY_SCOPED_TASK(checkNonZeroBinary); 
- - auto & context = services::internal::getDefaultContext(); - auto & factory = context.getClKernelFactory(); - - services::Status status = buildProgram(factory); - DAAL_CHECK_STATUS_VAR(status); - - auto kernel = factory.getKernel("checkNonZeroBinary", status); - DAAL_CHECK_STATUS_VAR(status); - - KernelArguments args(2, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_ASSERT(alpha.size() == n); - DAAL_ASSERT(mask.size() == n); - - args.set(0, alpha, AccessModeIds::read); - args.set(1, mask, AccessModeIds::write); - - KernelRange range(n); - - context.run(range, kernel, args, status); - return status; - } - - static services::Status computeDualCoeffs(const services::internal::Buffer & y, - services::internal::Buffer & alpha, const size_t n) - { - DAAL_ITTNOTIFY_SCOPED_TASK(computeDualCoeffs); - - auto & context = services::internal::getDefaultContext(); - auto & factory = context.getClKernelFactory(); - - services::Status status = buildProgram(factory); - DAAL_CHECK_STATUS_VAR(status); - - auto kernel = factory.getKernel("computeDualCoeffs", status); - DAAL_CHECK_STATUS_VAR(status); - - KernelArguments args(2, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_ASSERT(y.size() == n); - DAAL_ASSERT(alpha.size() == n); - - args.set(0, y, AccessModeIds::read); - args.set(1, alpha, AccessModeIds::readwrite); - - KernelRange range(n); - - context.run(range, kernel, args, status); - return status; - } - -private: - static constexpr size_t uint32max = static_cast(services::internal::MaxVal::get()); -}; - -} // namespace internal -} // namespace utils -} // namespace svm -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/svm/oneapi/svm_predict_kernel_oneapi.h b/cpp/daal/src/algorithms/svm/oneapi/svm_predict_kernel_oneapi.h deleted file mode 100644 index 296247af2d7..00000000000 --- a/cpp/daal/src/algorithms/svm/oneapi/svm_predict_kernel_oneapi.h +++ /dev/null @@ -1,65 +0,0 @@ -/* file: svm_predict_kernel_oneapi.h */ 
-/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Declaration of template structs that contains SVM prediction functions. -//-- -*/ - -#ifndef __SVM_PREDICT_KERNEL_ONEAPI_H__ -#define __SVM_PREDICT_KERNEL_ONEAPI_H__ - -#include "data_management/data/numeric_table.h" -#include "algorithms/model.h" -#include "algorithms/svm/svm_predict_types.h" -#include "src/algorithms/kernel.h" - -namespace daal -{ -namespace algorithms -{ -namespace svm -{ -namespace prediction -{ -namespace internal -{ -using namespace daal::data_management; - -template -struct SVMPredictImplOneAPI : public Kernel -{ - services::Status compute(const NumericTablePtr & xTable, Model * model, NumericTable & r, const svm::Parameter * par) - { - return services::ErrorMethodNotImplemented; - } -}; - -template -struct SVMPredictImplOneAPI : public Kernel -{ - services::Status compute(const NumericTablePtr & xTable, Model * model, NumericTable & r, const svm::Parameter * par); -}; - -} // namespace internal -} // namespace prediction -} // namespace svm -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/svm/oneapi/svm_predict_oneapi_impl.i b/cpp/daal/src/algorithms/svm/oneapi/svm_predict_oneapi_impl.i deleted file mode 100644 index 
e4b77fd1784..00000000000 --- a/cpp/daal/src/algorithms/svm/oneapi/svm_predict_oneapi_impl.i +++ /dev/null @@ -1,269 +0,0 @@ -/* file: svm_predict_oneapi_impl.i */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// SVM prediction algorithm implementation -//-- -*/ - -#ifndef __SVM_PREDICT_ONEAPI_IMPL_I__ -#define __SVM_PREDICT_ONEAPI_IMPL_I__ - -#include "src/sycl/blas_gpu.h" -#include "src/externals/service_profiler.h" -#include "data_management/data/internal/numeric_table_sycl_homogen.h" -#include "data_management/data/internal/numeric_table_sycl_csr.h" -#include "src/algorithms/svm/oneapi/svm_helper_oneapi.h" - -namespace daal -{ -namespace algorithms -{ -namespace svm -{ -namespace prediction -{ -namespace internal -{ -using namespace daal::internal; -using namespace daal::services::internal::sycl; -using daal::data_management::internal::SyclHomogenNumericTable; - -template -class PredictTask : public Base -{ -public: - virtual ~PredictTask() {} - - services::Status kernelCompute(const size_t startRow, const size_t nRows) - { - services::Status status; - auto subbuff = _buff.get().getSubBuffer(0, nRows * _nSV, status); - DAAL_CHECK_STATUS_VAR(status); - auto shResNT = SyclHomogenNumericTable::create(subbuff, _nSV, nRows, &status); - 
DAAL_CHECK_STATUS_VAR(status); - - auto xBlockNT = getBlockNTData(startRow, nRows, status); - DAAL_CHECK_STATUS_VAR(status); - - _shRes->set(kernel_function::values, shResNT); - _kernel->getInput()->set(kernel_function::X, xBlockNT); - _kernel->getParameter()->computationMode = kernel_function::matrixMatrix; - DAAL_CHECK(_kernel->computeNoThrow(), services::ErrorSVMPredictKernerFunctionCall); - - return status; - } - - services::internal::Buffer getBuff() const { return _buff.get(); } - -protected: - PredictTask(const size_t nMaxRowsPerBlock, const NumericTablePtr & xTable, const NumericTablePtr & svTable, - const kernel_function::KernelIfacePtr & kernel, services::Status & status) - : _xTable(xTable), _nSV(svTable->getNumberOfRows()) - { - auto & context = services::internal::getDefaultContext(); - _buff = context.allocate(TypeIds::id(), _nSV * nMaxRowsPerBlock, status); - - if (!status) - { - return; - } - - _kernel = kernel->clone(); - _shRes = kernel_function::ResultPtr(new kernel_function::Result()); - _kernel->setResult(_shRes); - _kernel->getInput()->set(kernel_function::Y, svTable); - } - - virtual NumericTablePtr getBlockNTData(const size_t startRow, const size_t nRows, services::Status & status) = 0; - -protected: - const NumericTablePtr & _xTable; - const size_t _nSV; - UniversalBuffer _buff; - kernel_function::KernelIfacePtr _kernel; - kernel_function::ResultPtr _shRes; -}; - -template -class PredictTaskDense : public PredictTask -{ -public: - using Super = PredictTask; - - virtual ~PredictTaskDense() { Super::_xTable->releaseBlockOfRows(_xBlock); } - - static services::SharedPtr > create(const size_t nRowsPerBlock, const NumericTablePtr & xTable, - const NumericTablePtr & svTable, - const kernel_function::KernelIfacePtr & kernel, - services::Status * stat = nullptr) - { - DAAL_DEFAULT_CREATE_TEMPLATE_IMPL_EX(PredictTaskDense, algorithmFPType, nRowsPerBlock, xTable, svTable, kernel); - } - -protected: - PredictTaskDense(const size_t nRowsPerBlock, const 
NumericTablePtr & xTable, const NumericTablePtr & svTable, - const kernel_function::KernelIfacePtr & kernel, services::Status & status) - : Super(nRowsPerBlock, xTable, svTable, kernel, status) - {} - - NumericTablePtr getBlockNTData(const size_t startRow, const size_t nRows, services::Status & status) override - { - Super::_xTable->releaseBlockOfRows(_xBlock); - status |= Super::_xTable->getBlockOfRows(startRow, nRows, ReadWriteMode::readOnly, _xBlock); - const services::internal::Buffer xBuf = _xBlock.getBuffer(); - - NumericTablePtr xBlockNT = SyclHomogenNumericTable::create(xBuf, Super::_xTable->getNumberOfColumns(), nRows, &status); - if (!xBlockNT) - { - status |= services::Status(services::ErrorMemoryAllocationFailed); - } - return xBlockNT; - } - -private: - BlockDescriptor _xBlock; -}; - -template -class PredictTaskCSR : public PredictTask -{ -public: - using Super = PredictTask; - - virtual ~PredictTaskCSR() {} - - static services::SharedPtr > create(const size_t nRowsPerBlock, const NumericTablePtr & xTable, - const NumericTablePtr & svTable, - const kernel_function::KernelIfacePtr & kernel, - services::Status * stat = nullptr) - { - DAAL_DEFAULT_CREATE_TEMPLATE_IMPL_EX(PredictTaskCSR, algorithmFPType, nRowsPerBlock, xTable, svTable, kernel); - } - -protected: - PredictTaskCSR(const size_t nRowsPerBlock, const NumericTablePtr & xTable, const NumericTablePtr & svTable, - const kernel_function::KernelIfacePtr & kernel, services::Status & status) - : Super(nRowsPerBlock, xTable, svTable, kernel, status) - {} - - NumericTablePtr getBlockNTData(const size_t startRow, const size_t nRows, services::Status & status) override - { - auto csrIface = services::dynamicPointerCast(Super::_xTable); - status |= csrIface->releaseSparseBlock(_xBlock); - status |= csrIface->getSparseBlock(startRow, nRows, readOnly, _xBlock); - - auto xValuesBuff = _xBlock.getBlockValuesBuffer(); - auto xColIndicesBuff = _xBlock.getBlockColumnIndicesBuffer(); - auto xRowOffsetsBuff = 
_xBlock.getBlockRowIndicesBuffer(); - - NumericTablePtr xBlockNT = SyclCSRNumericTable::create(xValuesBuff, xColIndicesBuff, xRowOffsetsBuff, Super::_xTable->getNumberOfColumns(), - nRows, CSRNumericTableIface::oneBased, &status); - if (!xBlockNT) - { - status |= services::Status(services::ErrorMemoryAllocationFailed); - } - return xBlockNT; - } - -private: - CSRBlockDescriptor _xBlock; -}; - -template -services::Status SVMPredictImplOneAPI::compute(const NumericTablePtr & xTable, Model * model, NumericTable & result, - const svm::Parameter * par) -{ - services::Status status; - auto & context = services::internal::getDefaultContext(); - - const size_t nVectors = xTable->getNumberOfRows(); - const size_t nFeatures = xTable->getNumberOfColumns(); - - DAAL_ASSERT(result.getNumberOfRows() == nVectors) - DAAL_ASSERT(result.getNumberOfColumns() == 1) - - BlockDescriptor resultBlock; - DAAL_CHECK_STATUS(status, result.getBlockOfRows(0, nVectors, ReadWriteMode::writeOnly, resultBlock)); - auto distanceBuff = resultBlock.getBuffer(); - - auto svCoeffTable = model->getClassificationCoefficients(); - const size_t nSV = svCoeffTable->getNumberOfRows(); - - if (nSV == 0) - { - context.fill(distanceBuff, 0.0, status); - return status; - } - - BlockDescriptor svCoeffBlock; - DAAL_CHECK_STATUS(status, svCoeffTable->getBlockOfRows(0, nSV, ReadWriteMode::readOnly, svCoeffBlock)); - auto svCoeffBuff = svCoeffBlock.getBuffer(); - - const algorithmFPType bias(model->getBias()); - context.fill(distanceBuff, double(bias), status); - DAAL_CHECK_STATUS_VAR(status); - - auto svTable = model->getSupportVectors(); - - const size_t nRowsPerBlock = xTable->getDataLayout() == NumericTableIface::csrArray ? 
nVectors : 1024; - const size_t nBlocks = nVectors / nRowsPerBlock + !!(nVectors % nRowsPerBlock); - - kernel_function::ResultPtr shRes(new kernel_function::Result()); - DAAL_CHECK_MALLOC(shRes) - - services::SharedPtr > predictTask; - if (xTable->getDataLayout() == NumericTableIface::csrArray) - { - predictTask = PredictTaskCSR::create(nRowsPerBlock, xTable, svTable, par->kernel); - } - else - { - predictTask = PredictTaskDense::create(nRowsPerBlock, xTable, svTable, par->kernel); - } - - for (size_t iBlock = 0; iBlock < nBlocks; ++iBlock) - { - const size_t startRow = iBlock * nRowsPerBlock; - const size_t nRowsPerBlockReal = (iBlock != nBlocks - 1) ? nRowsPerBlock : nVectors - iBlock * nRowsPerBlock; - - DAAL_CHECK_STATUS_VAR(status); - - DAAL_CHECK_STATUS(predictTask->kernelCompute(startRow, nRowsPerBlockReal), services::ErrorSVMPredictKernerFunctionCall); - const auto kernelResBuff = predictTask->getBuff(); - - { - DAAL_ITTNOTIFY_SCOPED_TASK(gemm); - DAAL_CHECK_STATUS(status, BlasGpu::xgemm(math::Layout::RowMajor, math::Transpose::NoTrans, math::Transpose::NoTrans, - nRowsPerBlockReal, 1, nSV, algorithmFPType(1.0), kernelResBuff, nSV, 0, - svCoeffBuff, 1, 0, algorithmFPType(1.0), distanceBuff, 1, startRow)); - } - } - DAAL_CHECK_STATUS(status, result.releaseBlockOfRows(resultBlock)); - DAAL_CHECK_STATUS(status, svCoeffTable->releaseBlockOfRows(svCoeffBlock)); - - return status; -} - -} // namespace internal -} // namespace prediction -} // namespace svm -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/svm/oneapi/svm_train_cache_oneapi.h b/cpp/daal/src/algorithms/svm/oneapi/svm_train_cache_oneapi.h deleted file mode 100644 index 5a3b8bd31f6..00000000000 --- a/cpp/daal/src/algorithms/svm/oneapi/svm_train_cache_oneapi.h +++ /dev/null @@ -1,349 +0,0 @@ -/* file: svm_train_cache_oneapi.h */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* 
Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// SVM cache structure implementation -//-- -*/ - -#ifndef __SVM_TRAIN_CACHE_ONEAPI_H__ -#define __SVM_TRAIN_CACHE_ONEAPI_H__ - -#include "src/services/service_utils.h" -#include "src/externals/service_memory.h" -#include "src/data_management/service_micro_table.h" -#include "src/data_management/service_numeric_table.h" -#include "data_management/data/internal/numeric_table_sycl_homogen.h" -#include "data_management/data/internal/numeric_table_sycl_csr.h" -#include "src/algorithms/svm/oneapi/svm_helper_oneapi.h" - -namespace daal -{ -namespace algorithms -{ -namespace svm -{ -namespace training -{ -namespace internal -{ -using namespace daal::data_management; -using namespace daal::services::internal::sycl; -using daal::data_management::internal::SyclHomogenNumericTable; - -template -class SubDataTaskBase -{ -public: - DAAL_NEW_DELETE(); - virtual ~SubDataTaskBase() {} - - virtual services::Status copyDataByIndices(const services::internal::Buffer & wsIndices, const size_t nSubsetVectors, - const NumericTablePtr & xTable) = 0; - - NumericTablePtr getTableData() const { return _dataTable; } - -protected: - SubDataTaskBase(const size_t nMaxSubsetVectors, const size_t dataSize, services::Status & status) : _nMaxSubsetVectors(nMaxSubsetVectors) - { - auto & context = services::internal::getDefaultContext(); - this->_data = 
context.allocate(TypeIds::id(), dataSize, status); - } - - SubDataTaskBase(const size_t nMaxSubsetVectors) : _nMaxSubsetVectors(nMaxSubsetVectors) {} - -protected: - size_t _nMaxSubsetVectors; - UniversalBuffer _data; - NumericTablePtr _dataTable; -}; - -template -class SubDataTaskDense : public SubDataTaskBase -{ -public: - using super = SubDataTaskBase; - using Helper = utils::internal::HelperSVM; - - static services::SharedPtr > create(const size_t nFeatures, const size_t nMaxSubsetVectors, - services::Status * stat = nullptr) - { - DAAL_DEFAULT_CREATE_TEMPLATE_IMPL_EX(SubDataTaskDense, algorithmFPType, nFeatures, nMaxSubsetVectors); - } - - services::Status copyDataByIndices(const services::internal::Buffer & wsIndices, const size_t nSubsetVectors, - const NumericTablePtr & xTable) override - { - services::Status status; - BlockDescriptor xBlock; - - DAAL_CHECK_STATUS(status, xTable->getBlockOfRows(0, xTable->getNumberOfRows(), ReadWriteMode::readOnly, xBlock)); - const services::internal::Buffer & xBuff = xBlock.getBuffer(); - - const size_t blockSize = nSubsetVectors; - services::internal::Buffer wsIndicesReal = wsIndices; - auto xBlockBuff = this->_data.template get(); - - DAAL_CHECK_STATUS(status, Helper::copyDataByIndices(xBuff, wsIndicesReal, xBlockBuff, blockSize, xTable->getNumberOfColumns())); - DAAL_CHECK_STATUS(status, xTable->releaseBlockOfRows(xBlock)); - return status; - } - -protected: - SubDataTaskDense(const size_t nFeatures, const size_t nMaxSubsetVectors, services::Status & status) - : super(nMaxSubsetVectors, nFeatures * nMaxSubsetVectors, status) - { - auto xBlockBuff = this->_data.template get(); - this->_dataTable = SyclHomogenNumericTable::create(xBlockBuff, nFeatures, nMaxSubsetVectors, &status); - } -}; - -template -class SubDataTaskCSR : public SubDataTaskBase -{ -public: - using super = SubDataTaskBase; - using Helper = utils::internal::HelperSVM; - - static services::SharedPtr > create(const NumericTablePtr & xTable, const size_t 
nMaxSubsetVectors, - services::Status * stat = nullptr) - { - DAAL_DEFAULT_CREATE_TEMPLATE_IMPL_EX(SubDataTaskCSR, algorithmFPType, xTable, nMaxSubsetVectors); - } - - services::Status copyDataByIndices(const services::internal::Buffer & wsIndices, const size_t nSubsetVectors, - const NumericTablePtr & xTable) override - { - services::Status status; - - CSRBlockDescriptor blockCSR; - CSRNumericTableIface * csrIface = dynamic_cast(const_cast(xTable.get())); - DAAL_CHECK(csrIface, services::ErrorEmptyCSRNumericTable); - - DAAL_CHECK_STATUS(status, csrIface->getSparseBlock(0, xTable->getNumberOfRows(), readOnly, blockCSR)); - - const auto xValuesBuff = blockCSR.getBlockValuesBuffer(); - const auto xColumnIndicesBuff = blockCSR.getBlockColumnIndicesBuffer(); - const auto xRowIndicesBuff = blockCSR.getBlockRowIndicesBuffer(); - - services::internal::Buffer wsIndicesReal = wsIndices; - auto xBlockBuff = this->_data.template get(); - - size_t dataSizeOut = 0; - DAAL_CHECK_STATUS(status, Helper::copyRowIndicesByIndices(xRowIndicesBuff, wsIndicesReal, _rowOffsets, nSubsetVectors, dataSizeOut)); - DAAL_CHECK_STATUS(status, Helper::copyCSRByIndices(xRowIndicesBuff, _rowOffsets, wsIndicesReal, xValuesBuff, xColumnIndicesBuff, xBlockBuff, - _colIndices, nSubsetVectors, xTable->getNumberOfColumns())); - DAAL_CHECK_STATUS(status, csrIface->releaseSparseBlock(blockCSR)); - return status; - } - -protected: - SubDataTaskCSR(const NumericTablePtr & xTable, const size_t nMaxSubsetVectors, services::Status & status) : super(nMaxSubsetVectors) - { - const size_t p = xTable->getNumberOfColumns(); - const size_t nRows = xTable->getNumberOfRows(); - CSRNumericTableIface * const csrIface = dynamic_cast(const_cast(xTable.get())); - const size_t maxDataSize = csrIface->getDataSize(); - - auto & context = services::internal::getDefaultContext(); - this->_data = context.allocate(TypeIds::id(), maxDataSize, status); - - UniversalBuffer colIndices = context.allocate(TypeIds::id(), maxDataSize, 
status); - UniversalBuffer rowOffsets = context.allocate(TypeIds::id(), nMaxSubsetVectors + 1, status); - _colIndices = colIndices.template get(); - _rowOffsets = rowOffsets.template get(); - - auto data = this->_data.template get(); - this->_dataTable = SyclCSRNumericTable::create(data, _colIndices, _rowOffsets, p, nMaxSubsetVectors, - CSRNumericTableIface::oneBased, &status); - } - -private: - services::internal::Buffer _colIndices; - services::internal::Buffer _rowOffsets; -}; - -/** - * Types of caches for kernel function values - */ -enum SVMCacheOneAPIType -{ - noCache, /*!< No storage for caching kernel function values is provided */ - simpleCache, /*!< Storage for caching ALL kernel function values is provided */ - lruCache /*!< Storage for caching PART of kernel function values is provided; - LRU algorithm is used to exclude values from cache */ -}; - -/** - * Common interface for cache that stores kernel function values - */ -template -class SVMCacheOneAPIIface -{ -public: - virtual ~SVMCacheOneAPIIface() {} - - virtual services::Status compute(const NumericTablePtr & xTable, const services::internal::Buffer & wsIndices, const size_t p) = 0; - - virtual const services::internal::Buffer & getRowsBlock() const = 0; - virtual services::Status copyLastToFirst() = 0; - -protected: - SVMCacheOneAPIIface(const size_t blockSize, const size_t lineSize, const kernel_function::KernelIfacePtr & kernel) - : _lineSize(lineSize), _blockSize(blockSize), _kernel(kernel) - {} - - const size_t _lineSize; /*!< Number of elements in the cache line */ - const size_t _blockSize; /*!< Number of cache lines */ - const kernel_function::KernelIfacePtr _kernel; /*!< Kernel function */ -}; - -template -class SVMCacheOneAPI -{}; - -template -using SVMCacheOneAPIPtr = services::SharedPtr >; - -/** - * No cache: kernel function values are not cached - */ -template -class SVMCacheOneAPI : public SVMCacheOneAPIIface -{ - using Helper = utils::internal::HelperSVM; - using super = 
SVMCacheOneAPIIface; - using thisType = SVMCacheOneAPI; - using super::_kernel; - using super::_lineSize; - using super::_blockSize; - using SubDataTaskBasePtr = services::SharedPtr >; - -public: - ~SVMCacheOneAPI() {} - - DAAL_NEW_DELETE(); - - static SVMCacheOneAPIPtr create(const size_t cacheSize, const size_t blockSize, const size_t lineSize, - const NumericTablePtr & xTable, const kernel_function::KernelIfacePtr & kernel, - services::Status & status) - { - status.clear(); - services::SharedPtr res = services::SharedPtr(new thisType(blockSize, lineSize, xTable, kernel)); - if (!res) - { - status.add(ErrorMemoryAllocationFailed); - } - else - { - status = res->init(cacheSize, xTable); - if (!status) - { - res.reset(); - } - } - return SVMCacheOneAPIPtr(res); - } - - const services::internal::Buffer & getRowsBlock() const override { return _cacheBuff; } - - services::Status compute(const NumericTablePtr & xTable, const services::internal::Buffer & wsIndices, const size_t p) override - { - DAAL_ITTNOTIFY_SCOPED_TASK(cacheCompute); - - services::Status status; - const size_t nWorkElements = wsIndices.size(); - - DAAL_CHECK_STATUS(status, _blockTask->copyDataByIndices(wsIndices, nWorkElements, xTable)); - - DAAL_CHECK_STATUS(status, _kernel->computeNoThrow()); - return status; - } - - services::Status copyLastToFirst() override - { - _nSelectRows = _blockSize / 2; - _ifComputeSubKernel = true; - services::Status status; - - auto & context = services::internal::getDefaultContext(); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, _nSelectRows, _lineSize); - context.copy(_cache, 0, _cache, _nSelectRows * _lineSize, _nSelectRows * _lineSize, status); - return status; - } - -protected: - SVMCacheOneAPI(const size_t blockSize, const size_t lineSize, const NumericTablePtr & xTable, const kernel_function::KernelIfacePtr & kernel) - : super(blockSize, lineSize, kernel), _nSelectRows(0), _ifComputeSubKernel(false) - {} - - services::Status init(const size_t cacheSize, const 
NumericTablePtr & xTable) - { - services::Status status; - auto & context = services::internal::getDefaultContext(); - - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, _lineSize, _blockSize); - _cache = context.allocate(TypeIds::id(), _lineSize * _blockSize, status); - DAAL_CHECK_STATUS_VAR(status); - - _cacheBuff = _cache.get(); - auto cacheTable = SyclHomogenNumericTable::create(_cacheBuff, _lineSize, _blockSize, &status); - - const size_t p = xTable->getNumberOfColumns(); - - DAAL_CHECK_STATUS_VAR(status); - - if (xTable->getDataLayout() == NumericTableIface::csrArray) - { - _blockTask = SubDataTaskCSR::create(xTable, _blockSize, &status); - } - else - { - _blockTask = SubDataTaskDense::create(p, _blockSize, &status); - } - - DAAL_CHECK_STATUS_VAR(status); - _kernel->getParameter()->computationMode = kernel_function::matrixMatrix; - _kernel->getInput()->set(kernel_function::X, _blockTask->getTableData()); - _kernel->getInput()->set(kernel_function::Y, xTable); - - kernel_function::ResultPtr shRes(new kernel_function::Result()); - shRes->set(kernel_function::values, cacheTable); - _kernel->setResult(shRes); - - return status; - } - -protected: - size_t _nSelectRows; - bool _ifComputeSubKernel; - UniversalBuffer _cache; - - SubDataTaskBasePtr _blockTask; - - services::internal::Buffer _xBlockBuff; - services::internal::Buffer _cacheBuff; -}; - -} // namespace internal -} // namespace training -} // namespace svm -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/svm/oneapi/svm_train_result_oneapi.h b/cpp/daal/src/algorithms/svm/oneapi/svm_train_result_oneapi.h deleted file mode 100644 index 8938bef9e92..00000000000 --- a/cpp/daal/src/algorithms/svm/oneapi/svm_train_result_oneapi.h +++ /dev/null @@ -1,306 +0,0 @@ -/* file: svm_train_result_oneapi.h */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 
(the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// SVM save result structure implementation -//-- -*/ - -#ifndef __SVM_TRAIN_RESULT_ONEAPI_H__ -#define __SVM_TRAIN_RESULT_ONEAPI_H__ - -#include "src/services/service_utils.h" -#include "src/algorithms/svm/oneapi/svm_helper_oneapi.h" -#include "src/sycl/reducer.h" - -namespace daal -{ -namespace algorithms -{ -namespace svm -{ -namespace training -{ -namespace internal -{ -using namespace daal::services::internal::sycl::math; - -template -class SaveResultModel -{ - using Helper = utils::internal::HelperSVM; - -public: - SaveResultModel(services::internal::Buffer & alphaBuff, const services::internal::Buffer & fBuff, - const services::internal::Buffer & yBuff, const algorithmFPType C, const size_t nVectors) - : _yBuff(yBuff), _coeffBuff(alphaBuff), _fBuff(fBuff), _C(C), _nVectors(nVectors) - {} - - services::Status init() - { - services::Status status; - auto & context = services::internal::getDefaultContext(); - _tmpValues = context.allocate(TypeIds::id(), _nVectors, status); - DAAL_CHECK_STATUS_VAR(status); - _mask = context.allocate(TypeIds::id(), _nVectors, status); - DAAL_CHECK_STATUS_VAR(status); - - return status; - } - - services::Status setResultsToModel(const NumericTablePtr & xTable, Model & model) - { - DAAL_ITTNOTIFY_SCOPED_TASK(setResultsToModel); - - services::Status status; - - /* Calculate bias and write it into model */ - algorithmFPType bias; - 
DAAL_CHECK_STATUS(status, calculateBias(_C, bias)); - model.setBias(double(bias)); - - DAAL_CHECK_STATUS(status, Helper::computeDualCoeffs(_yBuff, _coeffBuff, _nVectors)); - - model.setNFeatures(xTable->getNumberOfColumns()); - - size_t nSV; - DAAL_CHECK_STATUS(status, setSVCoefficients(nSV, model)); - DAAL_CHECK_STATUS(status, setSVIndices(nSV, model)); - - if (xTable->getDataLayout() == NumericTableIface::csrArray) - { - DAAL_CHECK_STATUS(status, setSVCSR(model, xTable, nSV)); - } - else - { - DAAL_CHECK_STATUS(status, setSVDense(model, xTable, nSV)); - } - - return status; - } - -protected: - services::Status setSVCoefficients(size_t & nSV, Model & model) const - { - services::Status status; - - auto & context = services::internal::getDefaultContext(); - - auto tmpValuesBuff = _tmpValues.get(); - auto maskBuff = _mask.get(); - - DAAL_CHECK_STATUS(status, Helper::checkNonZeroBinary(_coeffBuff, maskBuff, _nVectors)); - nSV = 0; - DAAL_CHECK_STATUS(status, Partition::flagged(maskBuff, _coeffBuff, tmpValuesBuff, _nVectors, nSV)); - - NumericTablePtr svCoeffTable = model.getClassificationCoefficients(); - DAAL_CHECK_STATUS(status, svCoeffTable->resize(nSV)); - if (nSV == 0) return status; - - BlockDescriptor svCoeffBlock; - DAAL_CHECK_STATUS(status, svCoeffTable->getBlockOfRows(0, nSV, ReadWriteMode::writeOnly, svCoeffBlock)); - auto svCoeffBuff = svCoeffBlock.getBuffer(); - context.copy(svCoeffBuff, 0, tmpValuesBuff, 0, nSV, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_CHECK_STATUS(status, svCoeffTable->releaseBlockOfRows(svCoeffBlock)); - return status; - } - - services::Status setSVIndices(size_t nSV, Model & model) const - { - auto & context = services::internal::getDefaultContext(); - - NumericTablePtr svIndicesTable = model.getSupportIndices(); - services::Status status; - DAAL_CHECK_STATUS(status, svIndicesTable->resize(nSV)); - if (nSV == 0) return status; - - BlockDescriptor svIndicesBlock; - DAAL_CHECK_STATUS(status, svIndicesTable->getBlockOfRows(0, 
nSV, ReadWriteMode::writeOnly, svIndicesBlock)); - - auto svIndices = svIndicesBlock.getBuffer(); - auto buffIndex = context.allocate(TypeIds::id(), nSV, status); - DAAL_CHECK_STATUS_VAR(status); - auto rangeIndex = context.allocate(TypeIds::id(), _nVectors, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_CHECK_STATUS(status, Helper::makeRange(rangeIndex, _nVectors)); - - size_t nSVCheck = 0; - DAAL_CHECK_STATUS(status, Partition::flagged(_mask, rangeIndex, buffIndex, _nVectors, nSVCheck)); - DAAL_ASSERT(nSVCheck == nSV); - - context.copy(svIndices, 0, buffIndex, 0, nSV, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_CHECK_STATUS(status, svIndicesTable->releaseBlockOfRows(svIndicesBlock)); - return status; - } - - services::Status setSVDense(Model & model, const NumericTablePtr & xTable, size_t nSV) const - { - services::Status status; - - const size_t nFeatures = xTable->getNumberOfColumns(); - - NumericTablePtr svTable = model.getSupportVectors(); - DAAL_CHECK_STATUS(status, svTable->resize(nSV)); - if (nSV == 0) return status; - - BlockDescriptor svBlock; - DAAL_CHECK_STATUS(status, svTable->getBlockOfRows(0, nSV, ReadWriteMode::writeOnly, svBlock)); - auto svBuff = svBlock.getBuffer(); - - NumericTablePtr svIndicesTable = model.getSupportIndices(); - BlockDescriptor svIndicesBlock; - DAAL_CHECK_STATUS(status, svIndicesTable->getBlockOfRows(0, nSV, ReadWriteMode::readOnly, svIndicesBlock)); - auto svIndicesBuff = svIndicesBlock.getBuffer(); - - BlockDescriptor xBlock; - DAAL_CHECK_STATUS(status, xTable->getBlockOfRows(0, _nVectors, ReadWriteMode::readOnly, xBlock)); - auto xBuff = xBlock.getBuffer(); - - DAAL_CHECK_STATUS(status, Helper::copyDataByIndices(xBuff, svIndicesBuff, svBuff, nSV, nFeatures)); - - DAAL_CHECK_STATUS(status, svTable->releaseBlockOfRows(svBlock)); - DAAL_CHECK_STATUS(status, svIndicesTable->releaseBlockOfRows(svIndicesBlock)); - - return status; - } - - services::Status setSVCSR(Model & model, const NumericTablePtr & xTable, size_t 
nSV) const - { - services::Status status; - - auto & context = services::internal::getDefaultContext(); - UniversalBuffer rowOffsets = context.allocate(TypeIds::id(), nSV + 1, status); - DAAL_CHECK_STATUS_VAR(status); - - NumericTablePtr svIndicesTable = model.getSupportIndices(); - BlockDescriptor svIndicesBlock; - DAAL_CHECK_STATUS(status, svIndicesTable->getBlockOfRows(0, nSV, ReadWriteMode::readOnly, svIndicesBlock)); - auto svIndicesBuff = svIndicesBlock.getBuffer(); - - auto svRowOffsetsBuff = rowOffsets.template get(); - - CSRBlockDescriptor blockCSR; - CSRNumericTableIface * const csrIface = dynamic_cast(xTable.get()); - DAAL_CHECK(csrIface, services::ErrorEmptyCSRNumericTable); - - DAAL_CHECK_STATUS(status, csrIface->getSparseBlock(0, xTable->getNumberOfRows(), readOnly, blockCSR)); - const auto xRowOffsetsBuff = blockCSR.getBlockRowIndicesBuffer(); - - size_t svDataSize = 0; - DAAL_CHECK_STATUS(status, Helper::copyRowIndicesByIndices(xRowOffsetsBuff, svIndicesBuff, svRowOffsetsBuff, nSV, svDataSize)); - - UniversalBuffer values = context.allocate(TypeIds::id(), svDataSize, status); - DAAL_CHECK_STATUS_VAR(status); - UniversalBuffer colIndices = context.allocate(TypeIds::id(), svDataSize, status); - DAAL_CHECK_STATUS_VAR(status); - - const auto xValuesBuff = blockCSR.getBlockValuesBuffer(); - const auto xColIndicesBuff = blockCSR.getBlockColumnIndicesBuffer(); - - auto svValuesBuff = values.template get(); - auto svColIndicesBuff = colIndices.template get(); - - DAAL_CHECK_STATUS(status, Helper::copyCSRByIndices(xRowOffsetsBuff, svRowOffsetsBuff, svIndicesBuff, xValuesBuff, xColIndicesBuff, - svValuesBuff, svColIndicesBuff, nSV, xTable->getNumberOfColumns())); - - DAAL_CHECK_STATUS(status, svIndicesTable->releaseBlockOfRows(svIndicesBlock)); - DAAL_CHECK_STATUS(status, csrIface->releaseSparseBlock(blockCSR)); - - /* Allocate memory for storing support vectors and coefficients */ - SyclCSRNumericTablePtr svTable = 
services::staticPointerCast(model.getSupportVectors()); - DAAL_CHECK_STATUS(status, svTable->resize(nSV)); - svTable->setArrays(svValuesBuff, svColIndicesBuff, svRowOffsetsBuff); - - return status; - } - - services::Status calculateBias(const algorithmFPType C, algorithmFPType & bias) const - { - services::Status status; - - auto tmpValuesBuff = _tmpValues.get(); - auto maskBuff = _mask.get(); - - /* free SV: (0 < alpha < C)*/ - DAAL_CHECK_STATUS(status, Helper::checkBorder(_coeffBuff, maskBuff, C, _nVectors)); - size_t nFree = 0; - DAAL_CHECK_STATUS(status, Partition::flagged(maskBuff, _fBuff, tmpValuesBuff, _nVectors, nFree)); - - if (nFree > 0) - { - auto reduceRes = Reducer::reduce(Reducer::BinaryOp::SUM, Layout::RowMajor, tmpValuesBuff, 1, nFree, status); - DAAL_CHECK_STATUS_VAR(status); - UniversalBuffer sumU = reduceRes.reduceRes; - auto sumHost = sumU.get().toHost(data_management::readOnly, status); - DAAL_CHECK_STATUS_VAR(status); - bias = -*sumHost / algorithmFPType(nFree); - } - else - { - algorithmFPType ub = -MaxVal::get(); - algorithmFPType lb = MaxVal::get(); - { - DAAL_CHECK_STATUS(status, Helper::checkUpper(_yBuff, _coeffBuff, maskBuff, C, _nVectors)); - size_t nUpper = 0; - DAAL_CHECK_STATUS(status, Partition::flagged(maskBuff, _fBuff, tmpValuesBuff, _nVectors, nUpper)); - auto resultOp = Reducer::reduce(Reducer::BinaryOp::MIN, Layout::RowMajor, tmpValuesBuff, 1, nUpper, status); - DAAL_CHECK_STATUS_VAR(status); - UniversalBuffer minBuff = resultOp.reduceRes; - auto minHost = minBuff.get().toHost(data_management::readOnly, status); - DAAL_CHECK_STATUS_VAR(status); - ub = *minHost; - } - { - DAAL_CHECK_STATUS(status, Helper::checkLower(_yBuff, _coeffBuff, maskBuff, C, _nVectors)); - size_t nLower = 0; - DAAL_CHECK_STATUS(status, Partition::flagged(maskBuff, _fBuff, tmpValuesBuff, _nVectors, nLower)); - auto resultOp = Reducer::reduce(Reducer::BinaryOp::MAX, Layout::RowMajor, tmpValuesBuff, 1, nLower, status); - DAAL_CHECK_STATUS_VAR(status); - 
UniversalBuffer maxBuff = resultOp.reduceRes; - auto maxHost = maxBuff.get().toHost(data_management::readOnly, status); - DAAL_CHECK_STATUS_VAR(status); - lb = *maxHost; - } - - bias = -0.5 * (ub + lb); - } - return status; - } - -private: - services::internal::Buffer _yBuff; - services::internal::Buffer _fBuff; - services::internal::Buffer _coeffBuff; - UniversalBuffer _tmpValues; - UniversalBuffer _mask; - const algorithmFPType _C; - const size_t _nVectors; -}; - -} // namespace internal -} // namespace training -} // namespace svm -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/svm/oneapi/svm_train_thunder_kernel_oneapi.h b/cpp/daal/src/algorithms/svm/oneapi/svm_train_thunder_kernel_oneapi.h deleted file mode 100644 index 8492dc986fe..00000000000 --- a/cpp/daal/src/algorithms/svm/oneapi/svm_train_thunder_kernel_oneapi.h +++ /dev/null @@ -1,93 +0,0 @@ -/* file: svm_train_thunder_kernel_oneapi.h */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -//++ -// Declaration of template structs that calculate SVM Training functions. 
-//-- - -#ifndef __SVM_TRAIN_THUNDER_KERNEL_ONEAPI_H__ -#define __SVM_TRAIN_THUNDER_KERNEL_ONEAPI_H__ - -#include "services/env_detect.h" -#include "data_management/data/numeric_table.h" -#include "algorithms/svm/svm_train_types.h" -#include "src/algorithms/kernel.h" -#include "src/algorithms/svm/oneapi/svm_helper_oneapi.h" -#include "src/services/service_data_utils.h" -#include "src/algorithms/svm/svm_train_kernel.h" - -namespace daal -{ -namespace algorithms -{ -namespace svm -{ -namespace training -{ -namespace internal -{ -using namespace daal::data_management; -using namespace daal::services; - -template -class SVMTrainOneAPI : public Kernel -{ -public: - services::Status compute(const NumericTablePtr & xTable, NumericTable & yTable, daal::algorithms::Model * r, const KernelParameter & par) - { - return services::ErrorMethodNotImplemented; - } -}; - -template -class SVMTrainOneAPI : public Kernel -{ - using Helper = utils::internal::HelperSVM; - -public: - services::Status compute(const NumericTablePtr & xTable, NumericTable & yTable, daal::algorithms::Model * r, const KernelParameter & par); - -protected: - services::Status updateGrad(const services::internal::Buffer & kernelWS, - const services::internal::Buffer & deltaalpha, services::internal::Buffer & grad, - const size_t nVectors, const size_t nWS); - services::Status smoKernel(const services::internal::Buffer & y, - const services::internal::Buffer & kernelWsRows, - const services::internal::Buffer & wsIndices, const size_t ldK, - const services::internal::Buffer & f, const algorithmFPType C, const algorithmFPType eps, - const algorithmFPType tau, const size_t maxInnerIteration, services::internal::Buffer & alpha, - services::internal::Buffer & deltaalpha, services::internal::Buffer & resinfo, - const size_t nWS); - - bool checkStopCondition(const algorithmFPType diff, const algorithmFPType diffPrev, const algorithmFPType eps, size_t & sameLocalDiff); - -private: - // One of the conditions for stopping 
is diff stays unchanged. nNoChanges - number of repetitions - static constexpr size_t nNoChanges = 5; - // The maximum numbers of iteration of the subtask is number of observation in WS x cInnerIterations. It's enough to find minimum for subtask. - static constexpr size_t cInnerIterations = 1000; - - static constexpr size_t uint32max = static_cast(services::internal::MaxVal::get()); -}; - -} // namespace internal -} // namespace training -} // namespace svm -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/svm/oneapi/svm_train_thunder_oneapi_impl.i b/cpp/daal/src/algorithms/svm/oneapi/svm_train_thunder_oneapi_impl.i deleted file mode 100644 index b36875b3fc4..00000000000 --- a/cpp/daal/src/algorithms/svm/oneapi/svm_train_thunder_oneapi_impl.i +++ /dev/null @@ -1,274 +0,0 @@ -/* file: svm_train_thunder_oneapi_impl.i */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// SVM training algorithm implementation thunder method -//-- -*/ -/* -// DESCRIPTION -// -// Definition of the functions for training with SVM 2-class classifier. -// -// REFERENCES -// - -// 1. Zeyi Wen, Jiashuai Shi, Bingsheng He -// ThunderSVM: A Fast SVM Library on GPUs and CPUs, -// Journal of Machine Learning Research, 19, 1-5 (2018) -// 2. 
Rong-En Fan, Pai-Hsuen Chen, Chih-Jen Lin, -// Working Set Selection Using Second Order Information -// for Training Support Vector Machines, -// Journal of Machine Learning Research 6 (2005), pp. 1889___1918 -// 3. Bernard E. boser, Isabelle M. Guyon, Vladimir N. Vapnik, -// A Training Algorithm for Optimal Margin Classifiers. -// 4. Thorsten Joachims, Making Large-Scale SVM Learning Practical, -// Advances in Kernel Methods - Support Vector Learning -*/ - -#ifndef __SVM_TRAIN_THUNDER_ONEAPI_IMPL_I__ -#define __SVM_TRAIN_THUNDER_ONEAPI_IMPL_I__ - -#include "src/services/service_utils.h" -#include "src/services/service_data_utils.h" -#include "src/sycl/blas_gpu.h" -#include "src/externals/service_memory.h" -#include "src/externals/service_profiler.h" -#include "src/externals/service_service.h" -#include "src/algorithms/svm/oneapi/cl_kernels/svm_train_block_smo_oneapi.cl" - -#include "src/algorithms/svm/oneapi/svm_train_cache_oneapi.h" -#include "src/algorithms/svm/oneapi/svm_train_workset_oneapi.h" -#include "src/algorithms/svm/oneapi/svm_train_result_oneapi.h" - -namespace daal -{ -namespace algorithms -{ -namespace svm -{ -namespace training -{ -namespace internal -{ -using namespace daal::internal; -using namespace daal::services::internal; -using namespace daal::services::internal::sycl; - -template -services::Status SVMTrainOneAPI::updateGrad(const services::internal::Buffer & kernelWS, - const services::internal::Buffer & deltaalpha, - services::internal::Buffer & grad, const size_t nVectors, - const size_t nWS) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(updateGrad); - return BlasGpu::xgemm(math::Layout::RowMajor, math::Transpose::Trans, math::Transpose::NoTrans, nVectors, 1, nWS, - algorithmFPType(1), kernelWS, nVectors, 0, deltaalpha, 1, 0, algorithmFPType(1), grad, 1, 0); -} - -template -services::Status SVMTrainOneAPI::smoKernel( - const services::internal::Buffer & y, const services::internal::Buffer & kernelWsRows, - const services::internal::Buffer & wsIndices, 
const size_t ldK, const services::internal::Buffer & f, - const algorithmFPType C, const algorithmFPType eps, const algorithmFPType tau, const size_t maxInnerIteration, - services::internal::Buffer & alpha, services::internal::Buffer & deltaalpha, - services::internal::Buffer & resinfo, const size_t nWS) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(smoKernel); - - auto & context = services::internal::getDefaultContext(); - auto & factory = context.getClKernelFactory(); - - services::String build_options = getKeyFPType(); - - services::String cachekey("__daal_algorithms_svm_smo_block_"); - build_options.add(" -D WS_SIZE="); - char bufferString[DAAL_MAX_STRING_SIZE] = { 0 }; - DAAL_ASSERT(nWS <= static_cast(services::internal::MaxVal::get())); - services::daal_int_to_string(bufferString, DAAL_MAX_STRING_SIZE, static_cast(nWS)); - build_options.add(bufferString); - build_options.add(" -D SIMD_WIDTH=64 "); - cachekey.add(build_options); - - services::Status status; - factory.build(ExecutionTargetIds::device, cachekey.c_str(), clKernelBlockSMO, build_options.c_str(), status); - - DAAL_CHECK_STATUS_VAR(status); - - auto kernel = factory.getKernel("smoKernel", status); - DAAL_CHECK_STATUS_VAR(status); - - KernelArguments args(12, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_ASSERT(wsIndices.size() == nWS); - DAAL_ASSERT(deltaalpha.size() == nWS); - DAAL_ASSERT(resinfo.size() == 2); - DAAL_ASSERT(f.size() == y.size()); - DAAL_ASSERT(f.size() == alpha.size()); - - args.set(0, y, AccessModeIds::read); - args.set(1, kernelWsRows, AccessModeIds::read); - args.set(2, wsIndices, AccessModeIds::read); - DAAL_ASSERT(ldK <= uint32max); - args.set(3, static_cast(ldK)); - args.set(4, f, AccessModeIds::read); - args.set(5, C); - args.set(6, eps); - args.set(7, tau); - DAAL_ASSERT(maxInnerIteration <= uint32max); - args.set(8, static_cast(maxInnerIteration)); - args.set(9, alpha, AccessModeIds::readwrite); - args.set(10, deltaalpha, AccessModeIds::readwrite); - args.set(11, resinfo, 
AccessModeIds::readwrite); - - KernelRange localRange(nWS); - KernelRange globalRange(nWS); - - KernelNDRange range(1); - range.global(globalRange, status); - DAAL_CHECK_STATUS_VAR(status); - range.local(localRange, status); - DAAL_CHECK_STATUS_VAR(status); - - context.run(range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - - return status; -} - -template -bool SVMTrainOneAPI::checkStopCondition(const algorithmFPType diff, const algorithmFPType diffPrev, - const algorithmFPType eps, size_t & sameLocalDiff) -{ - sameLocalDiff = utils::internal::abs(diff - diffPrev) < eps * 1e-2 ? sameLocalDiff + 1 : 0; - - if (sameLocalDiff > nNoChanges || diff < eps) - { - return true; - } - return false; -} - -template -services::Status SVMTrainOneAPI::compute(const NumericTablePtr & xTable, NumericTable & yTable, daal::algorithms::Model * r, - const KernelParameter & svmPar) -{ - services::Status status; - - auto & context = services::internal::getDefaultContext(); - const auto idType = TypeIds::id(); - - const algorithmFPType C(svmPar.C); - const algorithmFPType eps(svmPar.accuracyThreshold); - const algorithmFPType tau(svmPar.tau); - const size_t maxIterations(svmPar.maxIterations); - const size_t cacheSize(svmPar.cacheSize); - kernel_function::KernelIfacePtr kernel = svmPar.kernel->clone(); - - const size_t nVectors = xTable->getNumberOfRows(); - const size_t nFeatures = xTable->getNumberOfColumns(); - // ai = 0 - auto alphaU = context.allocate(idType, nVectors, status); - context.fill(alphaU, 0.0, status); - DAAL_CHECK_STATUS_VAR(status); - auto alphaBuff = alphaU.template get(); - - BlockDescriptor yBD; - DAAL_CHECK_STATUS(status, yTable.getBlockOfRows(0, nVectors, ReadWriteMode::readOnly, yBD)); - auto yBuff = yBD.getBuffer(); - - // gradi = -yi - auto gradU = context.allocate(idType, nVectors, status); - DAAL_CHECK_STATUS_VAR(status); - auto gradBuff = gradU.template get(); - - DAAL_CHECK_STATUS(status, Helper::makeInversion(yBuff, gradBuff, nVectors)); - - 
TaskWorkingSet workSet(nVectors); - - DAAL_CHECK_STATUS(status, workSet.init()); - - const size_t nWS = workSet.getSize(); - - const size_t innerMaxIterations(nWS * cInnerIterations); - - auto deltaalphaU = context.allocate(idType, nWS, status); - DAAL_CHECK_STATUS_VAR(status); - auto deltaalphaBuff = deltaalphaU.template get(); - - auto resinfoU = context.allocate(idType, 2, status); - DAAL_CHECK_STATUS_VAR(status); - auto resinfoBuff = resinfoU.template get(); - - algorithmFPType diff = algorithmFPType(0); - algorithmFPType diffPrev = algorithmFPType(0); - - size_t sameLocalDiff = 0; - - // TODO: support caching for thunder method - SVMCacheOneAPIPtr cachePtr = SVMCacheOneAPI::create(cacheSize, nWS, nVectors, xTable, kernel, status); - DAAL_CHECK_STATUS_VAR(status); - - size_t iter = 0; - for (; iter < maxIterations; iter++) - { - if (iter != 0) - { - DAAL_CHECK_STATUS(status, workSet.copyLastToFirst()); - DAAL_CHECK_STATUS(status, cachePtr->copyLastToFirst()); - } - - DAAL_CHECK_STATUS(status, workSet.selectWS(yBuff, alphaBuff, gradBuff, C)); - - const services::internal::Buffer & wsIndices = workSet.getWSIndices(); - DAAL_CHECK_STATUS(status, cachePtr->compute(xTable, wsIndices, nFeatures)); - - const services::internal::Buffer & kernelWS = cachePtr->getRowsBlock(); - - DAAL_CHECK_STATUS(status, smoKernel(yBuff, kernelWS, wsIndices, nVectors, gradBuff, C, eps, tau, innerMaxIterations, alphaBuff, - deltaalphaBuff, resinfoBuff, nWS)); - - { - auto resinfoHostPtr = resinfoBuff.toHost(ReadWriteMode::readOnly, status); - DAAL_CHECK_STATUS_VAR(status); - auto resinfoHost = resinfoHostPtr.get(); - diff = resinfoHost[1]; - } - - DAAL_CHECK_STATUS(status, updateGrad(kernelWS, deltaalphaBuff, gradBuff, nVectors, nWS)); - - if (checkStopCondition(diff, diffPrev, eps, sameLocalDiff)) break; - diffPrev = diff; - } - SaveResultModel result(alphaBuff, gradBuff, yBuff, C, nVectors); - - DAAL_CHECK_STATUS(status, result.init()); - DAAL_CHECK_STATUS(status, 
result.setResultsToModel(xTable, *static_cast(r))); - DAAL_CHECK_STATUS(status, yTable.releaseBlockOfRows(yBD)); - - return status; -} - -} // namespace internal -} // namespace training -} // namespace svm -} // namespace algorithms -} // namespace daal - -#endif diff --git a/cpp/daal/src/algorithms/svm/oneapi/svm_train_workset_oneapi.h b/cpp/daal/src/algorithms/svm/oneapi/svm_train_workset_oneapi.h deleted file mode 100644 index 140760e7521..00000000000 --- a/cpp/daal/src/algorithms/svm/oneapi/svm_train_workset_oneapi.h +++ /dev/null @@ -1,228 +0,0 @@ -/* file: svm_train_workset_oneapi.h */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -/* -//++ -// SVM workset structure implementation -//-- -*/ - -#ifndef __SVM_TRAIN_WORKSET_ONEAPI_H__ -#define __SVM_TRAIN_WORKSET_ONEAPI_H__ - -#include "src/services/service_utils.h" -#include "src/algorithms/svm/oneapi/svm_helper_oneapi.h" - -namespace daal -{ -namespace algorithms -{ -namespace svm -{ -namespace training -{ -namespace internal -{ -using namespace daal::services::internal; - -template -struct TaskWorkingSet -{ - using Helper = utils::internal::HelperSVM; - - TaskWorkingSet(const size_t nVectors) : _nVectors(nVectors) {} - - services::Status init() - { - services::Status status; - auto & context = services::internal::getDefaultContext(); - - _sortedFIndices = context.allocate(TypeIds::id(), _nVectors, status); - DAAL_CHECK_STATUS_VAR(status); - - _indicator = context.allocate(TypeIds::id(), _nVectors, status); - context.fill(_indicator, 0, status); - DAAL_CHECK_STATUS_VAR(status); - - auto & deviceInfo = context.getInfoDevice(); - - const size_t maxWS = deviceInfo.maxWorkGroupSize; - - _nWS = utils::internal::min(utils::internal::maxpow2(_nVectors), utils::internal::maxpow2(maxWS)); - _nSelected = 0; - - _valuesSort = context.allocate(TypeIds::id(), _nVectors, status); - DAAL_CHECK_STATUS_VAR(status); - - _valuesSortBuff = context.allocate(TypeIds::id(), _nVectors, status); - DAAL_CHECK_STATUS_VAR(status); - - _buffIndices = context.allocate(TypeIds::id(), _nVectors, status); - DAAL_CHECK_STATUS_VAR(status); - - _wsIndices = context.allocate(TypeIds::id(), _nWS, status); - return status; - } - - size_t getSize() const { return _nWS; } - - services::Status copyLastToFirst() - { - const size_t q = _nWS / 2; - services::Status status; - auto & context = services::internal::getDefaultContext(); - context.copy(_wsIndices, 0, _wsIndices, q, _nWS - q, status); - _nSelected = q; - return status; - } - - services::Status selectWS(const services::internal::Buffer & yBuff, 
- const services::internal::Buffer & alphaBuff, - const services::internal::Buffer & fBuff, const algorithmFPType C) - { - DAAL_ITTNOTIFY_SCOPED_TASK(selectWS); - services::Status status; - auto & context = services::internal::getDefaultContext(); - - auto wsIndicesBuff = _wsIndices.get(); - auto indicatorBuff = _indicator.get(); - - DAAL_CHECK_STATUS(status, Helper::argSort(fBuff, _valuesSort, _valuesSortBuff, _sortedFIndices, _buffIndices, _nVectors)); - - { - const size_t nNeedSelect = (_nWS - _nSelected) / 2; - - DAAL_CHECK_STATUS(status, Helper::checkUpper(yBuff, alphaBuff, indicatorBuff, C, _nVectors)); - - /* Reset indicator for busy Indices */ - if (_nSelected > 0) - { - DAAL_CHECK_STATUS(status, resetIndicatorWithZeros(wsIndicesBuff, indicatorBuff, _nSelected)); - } - - size_t nUpperSelect = 0; - DAAL_CHECK_STATUS(status, Partition::flaggedIndex(indicatorBuff, _sortedFIndices, _buffIndices, _nVectors, nUpperSelect)); - - const size_t nCopy = utils::internal::min(nUpperSelect, nNeedSelect); - - context.copy(_wsIndices, _nSelected, _buffIndices, 0, nCopy, status); - DAAL_CHECK_STATUS_VAR(status); - - _nSelected += nCopy; - } - - { - const size_t nNeedSelect = _nWS - _nSelected; - - DAAL_CHECK_STATUS(status, Helper::checkLower(yBuff, alphaBuff, indicatorBuff, C, _nVectors)); - - /* Reset indicator for busy Indices */ - if (_nSelected > 0) - { - DAAL_CHECK_STATUS(status, resetIndicatorWithZeros(wsIndicesBuff, indicatorBuff, _nSelected)); - } - - size_t nLowerSelect = 0; - DAAL_CHECK_STATUS(status, Partition::flaggedIndex(indicatorBuff, _sortedFIndices, _buffIndices, _nVectors, nLowerSelect)); - - const size_t nCopy = utils::internal::min(nLowerSelect, nNeedSelect); - - /* Copy latest nCopy elements */ - context.copy(_wsIndices, _nSelected, _buffIndices, nLowerSelect - nCopy, nCopy, status); - DAAL_CHECK_STATUS_VAR(status); - _nSelected += nCopy; - } - - if (_nSelected < _nWS) - { - const size_t nNeedSelect = _nWS - _nSelected; - - DAAL_CHECK_STATUS(status, 
Helper::checkUpper(yBuff, alphaBuff, indicatorBuff, C, _nVectors)); - - /* Reset indicator for busy Indices */ - if (_nSelected > 0) - { - DAAL_CHECK_STATUS(status, resetIndicatorWithZeros(wsIndicesBuff, indicatorBuff, _nSelected)); - } - - size_t nUpperSelect = 0; - DAAL_CHECK_STATUS(status, Partition::flaggedIndex(indicatorBuff, _sortedFIndices, _buffIndices, _nVectors, nUpperSelect)); - - const size_t nCopy = utils::internal::min(nUpperSelect, nNeedSelect); - - context.copy(_wsIndices, _nSelected, _buffIndices, 0, nCopy, status); - DAAL_CHECK_STATUS_VAR(status); - _nSelected += nCopy; - } - - DAAL_ASSERT(_nSelected == _nWS); - - _nSelected = 0; - return status; - } - - const services::internal::Buffer & getWSIndices() const { return _wsIndices.get(); } - - services::Status resetIndicatorWithZeros(const services::internal::Buffer & idx, services::internal::Buffer & indicator, - const size_t n) - { - DAAL_ITTNOTIFY_SCOPED_TASK(resetIndicatorWithZeros); - - auto & context = services::internal::getDefaultContext(); - auto & factory = context.getClKernelFactory(); - - services::Status status = Helper::buildProgram(factory); - DAAL_CHECK_STATUS_VAR(status); - - auto kernel = factory.getKernel("resetIndicatorWithZeros", status); - DAAL_CHECK_STATUS_VAR(status); - - KernelArguments args(2, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_ASSERT(idx.size() == _nWS); - DAAL_ASSERT(indicator.size() == _nVectors); - - args.set(0, idx, AccessModeIds::read); - args.set(1, indicator, AccessModeIds::write); - - KernelRange range(n); - - context.run(range, kernel, args, status); - return status; - } - -private: - size_t _nSelected; - size_t _nVectors; - size_t _nWS; - - UniversalBuffer _sortedFIndices; - UniversalBuffer _indicator; - UniversalBuffer _wsIndices; - UniversalBuffer _buffIndices; - UniversalBuffer _valuesSort; - UniversalBuffer _valuesSortBuff; -}; - -} // namespace internal -} // namespace training -} // namespace svm -} // namespace algorithms -} // namespace 
daal - -#endif diff --git a/cpp/daal/src/algorithms/svm/svm_model_fpt.cpp b/cpp/daal/src/algorithms/svm/svm_model_fpt.cpp index 22e7f4fc496..20231244259 100644 --- a/cpp/daal/src/algorithms/svm/svm_model_fpt.cpp +++ b/cpp/daal/src/algorithms/svm/svm_model_fpt.cpp @@ -22,8 +22,8 @@ */ #include "algorithms/svm/svm_model.h" -#include "data_management/data/internal/numeric_table_sycl_homogen.h" -#include "data_management/data/internal/numeric_table_sycl_csr.h" +#include "data_management/data/homogen_numeric_table.h" +#include "data_management/data/csr_numeric_table.h" namespace daal { @@ -44,39 +44,17 @@ services::SharedPtr Model::create(size_t nColumns, data_management::Numer template Model::Model(modelFPType dummy, size_t nColumns, data_management::NumericTableIface::StorageLayout layout, services::Status & st) : _bias(0.0) { - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - - if (!deviceInfo.isCpu) + if (layout == dm::NumericTableIface::csrArray) { - if (layout == dm::NumericTableIface::csrArray) - { - _SV = dmi::SyclCSRNumericTable::create(services::internal::Buffer(), services::internal::Buffer(), - services::internal::Buffer(), nColumns, size_t(0), - dm::CSRNumericTable::oneBased, &st); - } - else - { - _SV = dmi::SyclHomogenNumericTable::create(nColumns, 0, dm::NumericTable::doNotAllocate, &st); - } - _SVCoeff = dmi::SyclHomogenNumericTable::create(1, 0, dm::NumericTable::doNotAllocate, &st); - if (!st) return; - _SVIndices = dmi::SyclHomogenNumericTable::create(1, 0, dm::NumericTable::doNotAllocate, &st); + _SV = dm::CSRNumericTable::create(NULL, NULL, NULL, nColumns, 0, dm::CSRNumericTable::oneBased, &st); } else { - if (layout == dm::NumericTableIface::csrArray) - { - _SV = dm::CSRNumericTable::create(NULL, NULL, NULL, nColumns, 0, dm::CSRNumericTable::oneBased, &st); - } - else - { - _SV = dm::HomogenNumericTable::create(NULL, nColumns, 0, &st); - } - _SVCoeff = dm::HomogenNumericTable::create(NULL, 
1, 0, &st); - if (!st) return; - _SVIndices = dm::HomogenNumericTable::create(NULL, 1, 0, &st); + _SV = dm::HomogenNumericTable::create(NULL, nColumns, 0, &st); } + _SVCoeff = dm::HomogenNumericTable::create(NULL, 1, 0, &st); + if (!st) return; + _SVIndices = dm::HomogenNumericTable::create(NULL, 1, 0, &st); return; } diff --git a/cpp/daal/src/algorithms/svm/svm_predict_batch_container.h b/cpp/daal/src/algorithms/svm/svm_predict_batch_container.h index 9c7f067b552..d8897a3c680 100644 --- a/cpp/daal/src/algorithms/svm/svm_predict_batch_container.h +++ b/cpp/daal/src/algorithms/svm/svm_predict_batch_container.h @@ -24,7 +24,6 @@ #include "algorithms/svm/svm_predict.h" #include "src/algorithms/svm/svm_predict_kernel.h" #include "algorithms/classifier/classifier_predict_types.h" -#include "src/algorithms/svm/oneapi/svm_predict_kernel_oneapi.h" namespace daal { @@ -42,16 +41,7 @@ namespace interface2 template BatchContainer::BatchContainer(daal::services::Environment::env * daalEnv) { - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - if (deviceInfo.isCpu) - { - __DAAL_INITIALIZE_KERNELS(internal::SVMPredictImpl, method, algorithmFPType); - } - else - { - __DAAL_INITIALIZE_KERNELS_SYCL(internal::SVMPredictImplOneAPI, method, algorithmFPType); - } + __DAAL_INITIALIZE_KERNELS(internal::SVMPredictImpl, method, algorithmFPType); } template @@ -73,16 +63,7 @@ services::Status BatchContainer::compute() services::Environment::env & env = *_env; - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - if (deviceInfo.isCpu) - { - __DAAL_CALL_KERNEL(env, internal::SVMPredictImpl, __DAAL_KERNEL_ARGUMENTS(method, algorithmFPType), compute, a, m, *r, par); - } - else - { - __DAAL_CALL_KERNEL_SYCL(env, internal::SVMPredictImplOneAPI, __DAAL_KERNEL_ARGUMENTS(method, algorithmFPType), compute, a, m, *r, par); - } + __DAAL_CALL_KERNEL(env, internal::SVMPredictImpl, 
__DAAL_KERNEL_ARGUMENTS(method, algorithmFPType), compute, a, m, *r, par); } } // namespace interface2 } // namespace prediction diff --git a/cpp/daal/src/algorithms/svm/svm_predict_dense_default_batch_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/svm/svm_predict_dense_default_batch_fpt_dispatcher.cpp index a92283aed78..f27be69b003 100644 --- a/cpp/daal/src/algorithms/svm/svm_predict_dense_default_batch_fpt_dispatcher.cpp +++ b/cpp/daal/src/algorithms/svm/svm_predict_dense_default_batch_fpt_dispatcher.cpp @@ -27,6 +27,6 @@ namespace daal { namespace algorithms { -__DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL(svm::prediction::BatchContainer, batch, DAAL_FPTYPE, svm::prediction::defaultDense) +__DAAL_INSTANTIATE_DISPATCH_CONTAINER(svm::prediction::BatchContainer, batch, DAAL_FPTYPE, svm::prediction::defaultDense) } // namespace algorithms } // namespace daal diff --git a/cpp/daal/src/algorithms/svm/svm_predict_dense_default_batch_oneapi_fpt.cpp b/cpp/daal/src/algorithms/svm/svm_predict_dense_default_batch_oneapi_fpt.cpp deleted file mode 100644 index 5e983673f05..00000000000 --- a/cpp/daal/src/algorithms/svm/svm_predict_dense_default_batch_oneapi_fpt.cpp +++ /dev/null @@ -1,43 +0,0 @@ -/* file: svm_predict_dense_default_batch_oneapi_fpt.cpp */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -/* -//++ -// Implementation of SVM Fast prediction algorithm. -//-- -*/ - -#include "src/algorithms/svm/oneapi/svm_predict_kernel_oneapi.h" -#include "src/algorithms/svm/oneapi/svm_predict_oneapi_impl.i" - -namespace daal -{ -namespace algorithms -{ -namespace svm -{ -namespace prediction -{ -namespace internal -{ -template struct DAAL_EXPORT SVMPredictImplOneAPI; - -} // namespace internal -} // namespace prediction -} // namespace svm -} // namespace algorithms -} // namespace daal diff --git a/cpp/daal/src/algorithms/svm/svm_train_batch_container.h b/cpp/daal/src/algorithms/svm/svm_train_batch_container.h index 7825cb7698a..993f40344f1 100644 --- a/cpp/daal/src/algorithms/svm/svm_train_batch_container.h +++ b/cpp/daal/src/algorithms/svm/svm_train_batch_container.h @@ -27,7 +27,6 @@ #include "src/algorithms/svm/svm_train_boser_kernel.h" #include "algorithms/classifier/classifier_training_types.h" #include "src/algorithms/svm/svm_train_thunder_kernel.h" -#include "src/algorithms/svm/oneapi/svm_train_thunder_kernel_oneapi.h" namespace daal { @@ -48,16 +47,7 @@ using namespace daal::data_management; template BatchContainer::BatchContainer(daal::services::Environment::env * daalEnv) { - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - if (method == thunder && !deviceInfo.isCpu) - { - __DAAL_INITIALIZE_KERNELS_SYCL(internal::SVMTrainOneAPI, algorithmFPType, method); - } - else - { - __DAAL_INITIALIZE_KERNELS(internal::SVMTrainImpl, method, algorithmFPType); - } + __DAAL_INITIALIZE_KERNELS(internal::SVMTrainImpl, method, algorithmFPType); } template @@ -92,16 +82,7 @@ services::Status BatchContainer::compute() daal::services::Environment::env & env = *_env; - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - if (method == thunder && !deviceInfo.isCpu) - { - 
__DAAL_CALL_KERNEL_SYCL(env, internal::SVMTrainOneAPI, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, x, *y, r, kernelPar); - } - else - { - __DAAL_CALL_KERNEL(env, internal::SVMTrainImpl, __DAAL_KERNEL_ARGUMENTS(method, algorithmFPType), compute, x, weights, *y, r, kernelPar); - } + __DAAL_CALL_KERNEL(env, internal::SVMTrainImpl, __DAAL_KERNEL_ARGUMENTS(method, algorithmFPType), compute, x, weights, *y, r, kernelPar); } } // namespace interface2 @@ -116,16 +97,7 @@ using namespace daal::data_management; template BatchContainer::BatchContainer(daal::services::Environment::env * daalEnv) { - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - if (method == thunder && !deviceInfo.isCpu) - { - __DAAL_INITIALIZE_KERNELS_SYCL(internal::SVMTrainOneAPI, algorithmFPType, method); - } - else - { - __DAAL_INITIALIZE_KERNELS(internal::SVMTrainImpl, method, algorithmFPType); - } + __DAAL_INITIALIZE_KERNELS(internal::SVMTrainImpl, method, algorithmFPType); } template @@ -150,16 +122,7 @@ services::Status BatchContainer::compute() daal::services::Environment::env & env = *_env; - auto & context = services::internal::getDefaultContext(); - auto & deviceInfo = context.getInfoDevice(); - if (method == thunder && !deviceInfo.isCpu) - { - __DAAL_CALL_KERNEL_SYCL(env, internal::SVMTrainOneAPI, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, x, *y, r, kernelPar); - } - else - { - __DAAL_CALL_KERNEL(env, internal::SVMTrainImpl, __DAAL_KERNEL_ARGUMENTS(method, algorithmFPType), compute, x, weights, *y, r, kernelPar); - } + __DAAL_CALL_KERNEL(env, internal::SVMTrainImpl, __DAAL_KERNEL_ARGUMENTS(method, algorithmFPType), compute, x, weights, *y, r, kernelPar); } } // namespace internal } // namespace training diff --git a/cpp/daal/src/algorithms/svm/svm_train_thunder_batch_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/svm/svm_train_thunder_batch_fpt_dispatcher.cpp index f807d322102..522c1730cd0 100755 --- 
a/cpp/daal/src/algorithms/svm/svm_train_thunder_batch_fpt_dispatcher.cpp +++ b/cpp/daal/src/algorithms/svm/svm_train_thunder_batch_fpt_dispatcher.cpp @@ -27,7 +27,7 @@ namespace daal { namespace algorithms { -__DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL_SAFE(svm::training::BatchContainer, batch, DAAL_FPTYPE, svm::training::thunder) -__DAAL_INSTANTIATE_DISPATCH_CONTAINER_SYCL_SAFE(svm::training::internal::BatchContainer, batch, DAAL_FPTYPE, svm::training::thunder) +__DAAL_INSTANTIATE_DISPATCH_CONTAINER_SAFE(svm::training::BatchContainer, batch, DAAL_FPTYPE, svm::training::thunder) +__DAAL_INSTANTIATE_DISPATCH_CONTAINER_SAFE(svm::training::internal::BatchContainer, batch, DAAL_FPTYPE, svm::training::thunder) } // namespace algorithms } // namespace daal diff --git a/cpp/daal/src/algorithms/svm/svm_train_thunder_batch_oneapi_fpt.cpp b/cpp/daal/src/algorithms/svm/svm_train_thunder_batch_oneapi_fpt.cpp deleted file mode 100644 index b7433eeb3b2..00000000000 --- a/cpp/daal/src/algorithms/svm/svm_train_thunder_batch_oneapi_fpt.cpp +++ /dev/null @@ -1,43 +0,0 @@ -/* file: svm_train_thunder_batch_oneapi_fpt.cpp */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of SVM thunder training algorithm. 
-//-- -*/ - -#include "src/algorithms/svm/oneapi/svm_train_thunder_kernel_oneapi.h" -#include "src/algorithms/svm/oneapi/svm_train_thunder_oneapi_impl.i" - -namespace daal -{ -namespace algorithms -{ -namespace svm -{ -namespace training -{ -namespace internal -{ -template struct DAAL_EXPORT SVMTrainOneAPI; - -} // namespace internal -} // namespace training -} // namespace svm -} // namespace algorithms -} // namespace daal diff --git a/cpp/daal/src/data_management/daal_factory_impl.cpp b/cpp/daal/src/data_management/daal_factory_impl.cpp index 8649c67a949..0fd954d2be3 100755 --- a/cpp/daal/src/data_management/daal_factory_impl.cpp +++ b/cpp/daal/src/data_management/daal_factory_impl.cpp @@ -33,8 +33,6 @@ #include "data_management/data/data_collection.h" #include "src/services/serialization_utils.h" -#include "data_management/data/internal/numeric_table_sycl_homogen.h" - #include "data_management/data/memory_block.h" namespace daal @@ -139,8 +137,6 @@ Factory::Factory() : _impl(nullptr) __DAAL_REGISTER_TEMPLATED_OBJECT(Creator, HomogenNumericTable, ); __DAAL_REGISTER_TEMPLATED_OBJECT(Creator, Matrix, ); - __DAAL_REGISTER_TEMPLATED_OBJECT(Creator, internal::SyclHomogenNumericTable, ); - __DAAL_REGISTER_TEMPLATED_OBJECT(Creator, PackedSymmetricMatrix, NumericTableIface::upperPackedSymmetricMatrix, ); __DAAL_REGISTER_TEMPLATED_OBJECT(Creator, PackedSymmetricMatrix, NumericTableIface::lowerPackedSymmetricMatrix, ); __DAAL_REGISTER_TEMPLATED_OBJECT(Creator, PackedTriangularMatrix, NumericTableIface::upperPackedTriangularMatrix, ); diff --git a/cpp/daal/src/data_management/numeric_table.cpp b/cpp/daal/src/data_management/numeric_table.cpp index 0bb323cfcdf..92851dbff62 100644 --- a/cpp/daal/src/data_management/numeric_table.cpp +++ b/cpp/daal/src/data_management/numeric_table.cpp @@ -18,10 +18,6 @@ #include "algorithms/algorithm_types.h" #include "data_management/data/numeric_table.h" -#include "data_management/data/internal/numeric_table_sycl_homogen.h" -#include 
"data_management/data/internal/numeric_table_sycl_soa.h" -#include "data_management/data/internal/numeric_table_sycl_csr.h" - #include "data_management/data/homogen_numeric_table.h" #include "data_management/data/merged_numeric_table.h" #include "data_management/data/row_merged_numeric_table.h" @@ -299,37 +295,6 @@ void MergedNumericTable::freeDataMemoryImpl() } // namespace data_management } // namespace daal -namespace daal -{ -namespace data_management -{ -namespace internal -{ -namespace interface1 -{ -IMPLEMENT_SERIALIZABLE_TAG(SyclSOANumericTable, SERIALIZATION_SYCL_SOA_NT_ID) -IMPLEMENT_SERIALIZABLE_TAG(SyclCSRNumericTable, SERIALIZATION_SYCL_CSR_NT_ID) - -#define DAAL_INSTANTIATE_SER_TAG_SYCL(T) IMPLEMENT_SERIALIZABLE_TAG1T(SyclHomogenNumericTable, T, SERIALIZATION_SYCL_HOMOGEN_NT_ID) - -DAAL_INSTANTIATE_SER_TAG_SYCL(float) -DAAL_INSTANTIATE_SER_TAG_SYCL(double) -DAAL_INSTANTIATE_SER_TAG_SYCL(int) -DAAL_INSTANTIATE_SER_TAG_SYCL(unsigned int) -DAAL_INSTANTIATE_SER_TAG_SYCL(DAAL_INT64) -DAAL_INSTANTIATE_SER_TAG_SYCL(DAAL_UINT64) -DAAL_INSTANTIATE_SER_TAG_SYCL(char) -DAAL_INSTANTIATE_SER_TAG_SYCL(unsigned char) -DAAL_INSTANTIATE_SER_TAG_SYCL(short) -DAAL_INSTANTIATE_SER_TAG_SYCL(unsigned short) -DAAL_INSTANTIATE_SER_TAG_SYCL(unsigned long) -DAAL_INSTANTIATE_SER_TAG_SYCL(long) - -} // namespace interface1 -} // namespace internal -} // namespace data_management -} // namespace daal - namespace daal { namespace internal diff --git a/cpp/daal/src/services/env_detect.cpp b/cpp/daal/src/services/env_detect.cpp index 286416ed571..8929f15cc3b 100644 --- a/cpp/daal/src/services/env_detect.cpp +++ b/cpp/daal/src/services/env_detect.cpp @@ -129,7 +129,6 @@ DAAL_EXPORT daal::services::Environment::Environment() : _schedulerHandle {}, _g { _env.cpuid_init_flag = false; _env.cpuid = -1; - this->setDefaultExecutionContext(internal::CpuExecutionContext()); } DAAL_EXPORT daal::services::Environment::Environment(const Environment & e) : 
daal::services::Environment::Environment() {} diff --git a/cpp/daal/src/services/execution_context.cpp b/cpp/daal/src/services/execution_context.cpp deleted file mode 100644 index 460b32ef286..00000000000 --- a/cpp/daal/src/services/execution_context.cpp +++ /dev/null @@ -1,37 +0,0 @@ -/* file: execution_context.cpp */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#include "services/internal/execution_context.h" -#include "services/env_detect.h" - -namespace daal -{ -namespace services -{ -namespace internal -{ -namespace interface1 -{ -sycl::ExecutionContextIface & getDefaultContext() -{ - return services::Environment::getInstance()->getDefaultExecutionContext(); -} - -} // namespace interface1 -} // namespace internal -} // namespace services -} // namespace daal diff --git a/cpp/daal/src/services/types_utils.cpp b/cpp/daal/src/services/types_utils.cpp deleted file mode 100644 index 487c458f302..00000000000 --- a/cpp/daal/src/services/types_utils.cpp +++ /dev/null @@ -1,55 +0,0 @@ -/* file: types_utils.cpp */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#include "services/internal/sycl/types_utils.h" - -namespace daal -{ -namespace services -{ -namespace internal -{ -namespace sycl -{ -namespace interface1 -{ -struct TypeToStringConverter -{ - services::String result; - - template - void operator()(Typelist, Status & status) - { - result = daal::services::internal::sycl::getKeyFPType(); - } -}; - -services::String getKeyFPType(TypeId typeId) -{ - Status status; - - TypeToStringConverter converter; - TypeDispatcher::dispatch(typeId, converter, status); - - return converter.result; -} - -} // namespace interface1 -} // namespace sycl -} // namespace internal -} // namespace services -} // namespace daal diff --git a/cpp/daal/src/sycl/blas_gpu.cpp b/cpp/daal/src/sycl/blas_gpu.cpp deleted file mode 100644 index d65565edd72..00000000000 --- a/cpp/daal/src/sycl/blas_gpu.cpp +++ /dev/null @@ -1,160 +0,0 @@ -/* file: blas_gpu.cpp */ -/******************************************************************************* -* Copyright 2015 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#include "services/internal/sycl/math/reference_gemm.h" -#include "services/internal/sycl/math/reference_axpy.h" -#include "src/sycl/blas_gpu.h" -#include "src/sycl/cl_kernels/kernel_blas.cl" - -namespace daal -{ -namespace services -{ -namespace internal -{ -namespace sycl -{ -namespace math -{ -namespace interface1 -{ -template -services::Status ReferenceGemm::operator()(const Transpose transa, const Transpose transb, const size_t m, const size_t n, - const size_t k, const algorithmFPType alpha, - const services::internal::Buffer & a_buffer, const size_t lda, - const size_t offsetA, const services::internal::Buffer & b_buffer, - const size_t ldb, const size_t offsetB, const algorithmFPType beta, - services::internal::Buffer & c_buffer, const size_t ldc, - const size_t offsetC) -{ - services::Status status; - - ExecutionContextIface & ctx = services::internal::getDefaultContext(); - ClKernelFactoryIface & factory = ctx.getClKernelFactory(); - services::String options = getKeyFPType(); - services::String cacheKey = "__daal_gemm_"; - cacheKey.add(options); - - factory.build(ExecutionTargetIds::device, cacheKey.c_str(), clKernelGemm, options.c_str(), status); - DAAL_CHECK_STATUS_VAR(status); - - const char * const kernelName = beta != algorithmFPType(0) ? 
"blas_sgemm_small" : "blas_sgemm_without_sum"; - - KernelPtr kernelGemm = factory.getKernel(kernelName, status); - DAAL_CHECK_STATUS_VAR(status); - - KernelArguments args(15, status); - DAAL_CHECK_STATUS_VAR(status); - - const uint32_t one = uint32_t(1); - - args.set(0, (uint32_t)k); - args.set(1, alpha); - args.set(2, a_buffer); - args.set(5, (uint32_t)offsetA); - args.set(6, b_buffer); - args.set(9, (uint32_t)offsetB); - args.set(10, beta); - args.set(11, c_buffer, AccessModeIds::write); - args.set(12, one); - args.set(13, (uint32_t)ldc); - args.set(14, (uint32_t)offsetC); - - if (transa == Transpose::NoTrans && transb == Transpose::NoTrans) - { - args.set(3, one); - args.set(4, (uint32_t)lda); - args.set(7, one); - args.set(8, (uint32_t)ldb); - } - else if (transa == Transpose::Trans && transb == Transpose::NoTrans) - { - args.set(3, (uint32_t)lda); - args.set(4, one); - args.set(7, one); - args.set(8, (uint32_t)ldb); - } - else if (transa == Transpose::NoTrans && transb == Transpose::Trans) - { - args.set(3, one); - args.set(4, (uint32_t)lda); - args.set(7, (uint32_t)ldb); - args.set(8, one); - } - else - { - args.set(3, one); - args.set(4, (uint32_t)lda); - args.set(7, one); - args.set(8, (uint32_t)ldb); - } - - KernelRange range(m, n); - - ctx.run(range, kernelGemm, args, status); - - return status; -} - -template class ReferenceGemm; -template class ReferenceGemm; - -template -services::Status ReferenceAxpy::operator()(const int n, const algorithmFPType a, - const services::internal::Buffer & x_buffer, const int incx, - services::internal::Buffer & y_buffer, const int incy) -{ - services::Status status; - - ExecutionContextIface & ctx = services::internal::getDefaultContext(); - ClKernelFactoryIface & factory = ctx.getClKernelFactory(); - services::String options = getKeyFPType(); - services::String cacheKey = "__daal_axpy_"; - cacheKey.add(options); - - factory.build(ExecutionTargetIds::device, cacheKey.c_str(), clKernelAxpy, options.c_str(), status); - 
DAAL_CHECK_STATUS_VAR(status); - - KernelPtr blas_axpy = factory.getKernel("blas_axpy", status); - DAAL_CHECK_STATUS_VAR(status); - - KernelArguments args(5, status); - DAAL_CHECK_STATUS_VAR(status); - - args.set(0, a); - args.set(1, x_buffer, AccessModeId::read); - args.set(2, incx); - args.set(3, y_buffer, AccessModeId::readwrite); - args.set(4, incy); - - KernelRange range(n); - - ctx.run(range, blas_axpy, args, status); - DAAL_CHECK_STATUS_VAR(status); - - return status; -} - -template class ReferenceAxpy; -template class ReferenceAxpy; - -} // namespace interface1 -} // namespace math -} // namespace sycl -} // namespace internal -} // namespace services -} // namespace daal diff --git a/cpp/daal/src/sycl/blas_gpu.h b/cpp/daal/src/sycl/blas_gpu.h deleted file mode 100644 index d9f954c19df..00000000000 --- a/cpp/daal/src/sycl/blas_gpu.h +++ /dev/null @@ -1,105 +0,0 @@ -/* file: blas_gpu.h */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Template wrappers for common GPU blas functions. 
-//-- -*/ - -#ifndef __SERVICE_ONEAPI_BLAS_GPU_H__ -#define __SERVICE_ONEAPI_BLAS_GPU_H__ - -#include "services/internal/sycl/execution_context.h" -#include "services/internal/sycl/types_utils.h" -#include "src/sycl/math_service_types.h" -#include "services/internal/buffer.h" -#include "services/internal/execution_context.h" -#include "services/internal/sycl/math/types.h" - -namespace daal -{ -namespace services -{ -namespace internal -{ -namespace sycl -{ -template -struct BlasGpu -{ - static services::Status xgemm(const math::Layout layout, const math::Transpose transa, const math::Transpose transb, const uint32_t m, - const uint32_t n, const uint32_t k, const algorithmFPType alpha, const UniversalBuffer a_buffer, const uint32_t lda, - const uint32_t offsetA, const UniversalBuffer b_buffer, const uint32_t ldb, const uint32_t offsetB, - const algorithmFPType beta, UniversalBuffer c_buffer, const uint32_t ldc, const uint32_t offsetC) - { - services::Status status; - - ExecutionContextIface & ctx = services::internal::getDefaultContext(); - - if (layout == math::Layout::ColMajor) - { - ctx.gemm(transa, transb, m, n, k, alpha, a_buffer, lda, offsetA, b_buffer, ldb, offsetB, beta, c_buffer, ldc, offsetC, status); - } - else - { - ctx.gemm(transb, transa, n, m, k, alpha, b_buffer, ldb, offsetB, a_buffer, lda, offsetA, beta, c_buffer, ldc, offsetC, status); - } - - return status; - } - - static services::Status xsyrk(const math::Layout layout, const math::UpLo upper_lower, const math::Transpose trans, const uint32_t n, - const uint32_t k, const algorithmFPType alpha, const UniversalBuffer a_buffer, const uint32_t lda, - const uint32_t offsetA, const algorithmFPType beta, UniversalBuffer c_buffer, const uint32_t ldc, - const uint32_t offsetC) - { - services::Status status; - - ExecutionContextIface & ctx = services::internal::getDefaultContext(); - - if (layout == math::Layout::ColMajor) - { - ctx.syrk(upper_lower, trans, n, k, alpha, a_buffer, lda, offsetA, beta, 
c_buffer, ldc, offsetC, status); - } - else - { - ctx.syrk(upper_lower, trans, k, n, alpha, a_buffer, lda, offsetA, beta, c_buffer, ldc, offsetC, status); - } - - return status; - } - - static services::Status xaxpy(const uint32_t n, const double a, const UniversalBuffer x_buffer, const int incx, UniversalBuffer y_buffer, - const int incy) - { - services::Status status; - - ExecutionContextIface & ctx = services::internal::getDefaultContext(); - - ctx.axpy(n, a, x_buffer, incx, y_buffer, incy, status); - - return status; - } -}; - -} // namespace sycl -} // namespace internal -} // namespace services -} // namespace daal - -#endif diff --git a/cpp/daal/src/sycl/cl_kernels/kernel_blas.cl b/cpp/daal/src/sycl/cl_kernels/kernel_blas.cl deleted file mode 100644 index 86e40ee40e1..00000000000 --- a/cpp/daal/src/sycl/cl_kernels/kernel_blas.cl +++ /dev/null @@ -1,123 +0,0 @@ -/* file: kernel_blas.cl */ -/******************************************************************************* -* Copyright 2019 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -#ifndef __BLAS_KERNELS_CL__ -#define __BLAS_KERNELS_CL__ - -#include - -#define DECLARE_SOURCE(name, src) static const char * name = #src; - -DECLARE_SOURCE( - clKernelGemm, - - __kernel void blas_sgemm_small(const uint k, const algorithmFPType alpha, const __global algorithmFPType * a, const uint lda_col, - const uint lda_row, const uint offsetA, const __global algorithmFPType * b, const uint ldb_col, const uint ldb_row, - const uint offsetB, const algorithmFPType beta, __global algorithmFPType * c, const uint ldc_col, const ldc_row, - const uint offsetC) { - const uint rows = get_global_id(0); - const uint cols = get_global_id(1); - - algorithmFPType sum = (algorithmFPType)0; - for (uint i = 0; i < k; i++) - { - sum += a[i * lda_row + rows * lda_col + offsetA] * b[cols * ldb_row + i * ldb_col + offsetB]; - } - - c[rows * ldc_col + cols * ldc_row + offsetC] = alpha * sum + beta * c[rows * ldc_col + cols * ldc_row + offsetC]; - } - - __kernel void blas_sgemm_without_sum(const uint k, const algorithmFPType alpha, const __global algorithmFPType * a, const uint lda_col, - const uint lda_row, const uint offsetA, const __global algorithmFPType * b, const uint ldb_col, - const uint ldb_row, const uint offsetB, const algorithmFPType beta, __global algorithmFPType * c, - const uint ldc_col, const uint ldc_row, const uint offsetC) { - const uint rows = get_global_id(0); - const uint cols = get_global_id(1); - - algorithmFPType sum = (algorithmFPType)0; - for (uint i = 0; i < k; i++) - { - sum += a[i * lda_row + rows * lda_col + offsetA] * b[cols * ldb_row + i * ldb_col + offsetB]; - } - - c[rows * ldc_col + cols * ldc_row + offsetC] = alpha * sum; - } - - __kernel void blas_sgemv_small(const uint k, const algorithmFPType alpha, const __global algorithmFPType * a, const uint lda_col, - const uint lda_row, const __global algorithmFPType * x, const algorithmFPType beta, __global 
algorithmFPType * y) { - const uint row = get_global_id(0); - - algorithmFPType sum = (algorithmFPType)0; - for (uint i = 0; i < k; i++) - { - sum += a[i * lda_row + row * lda_col] * x[i]; - } - - y[row] = alpha * sum + beta * y[row]; - } - - __kernel void blas_sgemm(const uint k, const algorithmFPType alpha, const __global algorithmFPType * a, const uint lda_col, const uint lda_row, - const __global algorithmFPType * b, const uint ldb_col, const uint ldb_row, const algorithmFPType beta, - __global algorithmFPType * c, const uint ldc_col, const uint ldc_row) { - const size_t BLOCK_SIZE = 4; - - const int row = get_local_id(0); - const int col = get_local_id(1); - - const int globalRow = BLOCK_SIZE * get_group_id(0) + row; - const int globalCol = BLOCK_SIZE * get_group_id(1) + col; - - __local algorithmFPType Asub[BLOCK_SIZE][BLOCK_SIZE]; - __local algorithmFPType Bsub[BLOCK_SIZE][BLOCK_SIZE]; - - algorithmFPType sum = (algorithmFPType)0; - - const int numTiles = k / BLOCK_SIZE; - for (int t = 0; t < numTiles; t++) - { - const int tiledRow = BLOCK_SIZE * t + row; - const int tiledCol = BLOCK_SIZE * t + col; - - Asub[col][row] = a[globalRow * lda_col + tiledCol * lda_row]; - Bsub[col][row] = b[tiledRow * ldb_col + globalCol * ldb_row]; - - barrier(CLK_LOCAL_MEM_FENCE); - for (int i = 0; i < BLOCK_SIZE; i++) - { - sum += Asub[i][row] * Bsub[col][i]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - c[globalRow * ldc_col + globalCol * ldc_row] = alpha * sum + beta * c[globalRow * ldc_col + globalCol * ldc_row]; - } - -); - -DECLARE_SOURCE( - clKernelAxpy, - - __kernel void blas_axpy(const algorithmFPType a, const __global algorithmFPType * x, const int incx, __global algorithmFPType * y, - const int incy) { - const int i_x = (get_global_id(0)) * incx; - const int i_y = (get_global_id(0)) * incy; - y[i_y] += x[i_x] * a; - } - -); - -#endif // __BLAS_KERNELS_CL__ diff --git a/cpp/daal/src/sycl/cl_kernels/kernel_sparse_blas.cl 
b/cpp/daal/src/sycl/cl_kernels/kernel_sparse_blas.cl deleted file mode 100755 index 94fcfb13602..00000000000 --- a/cpp/daal/src/sycl/cl_kernels/kernel_sparse_blas.cl +++ /dev/null @@ -1,93 +0,0 @@ -/* file: kernel_sparse_blas.cl */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#ifndef __SPARSE_BLAS_KERNELS_CL__ -#define __SPARSE_BLAS_KERNELS_CL__ - -#include - -#define DECLARE_SOURCE(name, src) static const char * name = #src; - -DECLARE_SOURCE( - clKernelSpGemm, - - algorithmFPType dot_product(__global const algorithmFPType * const aValues, __global const algorithmFPType * const bValues, - __global const ulong * const aCols, __global const ulong * const bCols, ulong aRowCur, ulong aRowEnd, ulong bRowCur, - ulong bRowEnd) { - algorithmFPType localSum = (algorithmFPType)0; - - while ((aRowCur < aRowEnd) && (bRowCur < bRowEnd)) - { - const ulong aCurIdx = aCols[aRowCur]; - const ulong bCurIdx = bCols[bRowCur]; - - if (aCurIdx == bCurIdx) - { - localSum += aValues[aRowCur] * bValues[bRowCur]; - aRowCur++; - bRowCur++; - } - else if (aCurIdx < bCurIdx) - { - aRowCur++; - } - else - { - bRowCur++; - } - } - return localSum; - } - - __kernel void spmm_kernel_without_sum(const algorithmFPType alpha, __global const algorithmFPType * const aValues, - __global const ulong * 
const aCols, __global const ulong * const aRowInd, - __global const algorithmFPType * const bValues, __global const ulong * const bCols, - __global const ulong * const bRowInd, __global algorithmFPType * c, const ulong ldC, const ulong offsetC, - const algorithmFPType beta) { - const ulong i = get_global_id(0); - const ulong j = get_global_id(1); - - const ulong aRowCur = aRowInd[i] - 1; - const ulong aRowEnd = aRowInd[i + 1] - 1; - - const ulong bRowCur = bRowInd[j] - 1; - const ulong bRowEnd = bRowInd[j + 1] - 1; - - const algorithmFPType dotProduct = dot_product(aValues, bValues, aCols, bCols, aRowCur, aRowEnd, bRowCur, bRowEnd); - c[i * ldC + j + offsetC] = alpha * dotProduct; - } - - __kernel void spmm_kernel(const algorithmFPType alpha, __global const algorithmFPType * const aValues, __global const ulong * const aCols, - __global const ulong * const aRowInd, __global const algorithmFPType * const bValues, - __global const ulong * const bCols, __global const ulong * const bRowInd, __global algorithmFPType * c, const ulong ldC, - const ulong offsetC, const algorithmFPType beta) { - const ulong i = get_global_id(0); - const ulong j = get_global_id(1); - - const ulong aRowCur = aRowInd[i] - 1; - const ulong aRowEnd = aRowInd[i + 1] - 1; - - const ulong bRowCur = bRowInd[j] - 1; - const ulong bRowEnd = bRowInd[j + 1] - 1; - - const algorithmFPType dotProduct = dot_product(aValues, bValues, aCols, bCols, aRowCur, aRowEnd, bRowCur, bRowEnd); - c[i * ldC + j + offsetC] = alpha * dotProduct + beta * c[i * ldC + j + offsetC]; - } - -); - -#endif // __SPARSE_BLAS_KERNELS_CL__ diff --git a/cpp/daal/src/sycl/cl_kernels/math.cl b/cpp/daal/src/sycl/cl_kernels/math.cl deleted file mode 100644 index d02f3b56a23..00000000000 --- a/cpp/daal/src/sycl/cl_kernels/math.cl +++ /dev/null @@ -1,39 +0,0 @@ -/* file: math.cl */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, 
Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of math kernels. -//-- -*/ - -#ifndef __MATH_CL__ -#define __MATH_CL__ - -#define DECLARE_SOURCE(name, src) static const char * name = #src; - -DECLARE_SOURCE( - clKernelMath, - - __kernel void vLog(const __global algorithmFPType * const x, __global algorithmFPType * result) { - const uint i = get_global_id(0); - result[i] = log(x[i]); - } - -); - -#endif diff --git a/cpp/daal/src/sycl/cl_kernels/op_reducer.cl b/cpp/daal/src/sycl/cl_kernels/op_reducer.cl deleted file mode 100644 index bd20db7bbd9..00000000000 --- a/cpp/daal/src/sycl/cl_kernels/op_reducer.cl +++ /dev/null @@ -1,164 +0,0 @@ -/* file: op_reduce.cl */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -/* -//++ -// Implementation of reduction kernels. -//-- -*/ - -#ifndef __OP_REDUCER_CL__ -#define __OP_REDUCER_CL__ - -#define DECLARE_SOURCE(name, src) static const char * name = #src; - -DECLARE_SOURCE( - op_reduce, - - inline algorithmFPType pow2(const algorithmFPType x) { return x * x; } - - inline algorithmFPType none(const algorithmFPType x) { return x; } - - inline algorithmFPType sum(const algorithmFPType x, const algorithmFPType y) { return x + y; } - - __kernel void reduceSinglepass(uint vectorsAreRows, __global algorithmFPType * vectors, uint nVectors, uint vectorSize, - __global algorithmFPType * reduces) { - const uint local_size = get_local_size(0); - - __local algorithmFPType partialReduces[LOCAL_BUFFER_SIZE]; - - uint globalDim = 1; - uint localDim = nVectors; - - if (vectorsAreRows != 0) - { - globalDim = vectorSize; - localDim = 1; - } - - uint itemId = get_local_id(0); - uint groupId = get_global_id(1); - - algorithmFPType el = vectors[groupId * globalDim + itemId * localDim]; - partialReduces[itemId] = INIT_VALUE; - - for (uint i = itemId; i < vectorSize; i += local_size) - { - el = vectors[groupId * globalDim + i * localDim]; - partialReduces[itemId] = BINARY_OP(partialReduces[itemId], UNARY_OP(el)); - } - - barrier(CLK_LOCAL_MEM_FENCE); - - for (uint stride = local_size / 2; stride > 1; stride /= 2) - { - if (stride > itemId) - { - partialReduces[itemId] = BINARY_OP(partialReduces[itemId], partialReduces[itemId + stride]); - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (itemId == 0) - { - reduces[groupId] = BINARY_OP(partialReduces[itemId], partialReduces[itemId + 1]); - } - } - - void reduceReduceColmajor(__global const algorithmFPType * vectors, const uint nVectors, const uint vectorSize, - __global algorithmFPType * mergedReduce, const uint rowPartIndex, const uint rowParts, const uint colPartIndex, - const uint colParts, const uint tid, const uint tnum) { 
- const uint colOffset = colPartIndex * tnum; - const uint x = tid + colOffset; - - if (x < nVectors) - { - uint rowPartSize = (vectorSize + rowParts - 1) / rowParts; - const uint rowOffset = rowPartSize * rowPartIndex; - - if (rowPartSize + rowOffset > vectorSize) - { - rowPartSize = vectorSize - rowOffset; - } - - algorithmFPType partialRes = INIT_VALUE; - - for (int row = 0; row < rowPartSize; row++) - { - const uint y = (row + rowOffset) * nVectors; - const algorithmFPType el = vectors[y + x]; - - partialRes = BINARY_OP(partialRes, UNARY_OP(el)); - } - - mergedReduce[x * rowParts + rowPartIndex] = partialRes; - } - } - - __kernel void reduceStepColmajor(__global const algorithmFPType * vectors, const uint nVectors, const uint vectorSize, - __global algorithmFPType * mergedReduce) { - const uint tid = get_local_id(0); - const uint tnum = get_local_size(0); - const uint gid = get_group_id(0); - const uint gnum = get_num_groups(0); - - const uint colParts = (nVectors + tnum - 1) / tnum; - const uint rowParts = gnum / colParts; - - const uint rowPartIndex = gid / colParts; - const uint colPartIndex = gid - rowPartIndex * colParts; - - reduceReduceColmajor(vectors, nVectors, vectorSize, mergedReduce, rowPartIndex, rowParts, colPartIndex, colParts, tid, tnum); - } - - __kernel void reduceFinalStepRowmajor(__global const algorithmFPType * mergedReduce, uint nVectors, uint vectorSize, - __global algorithmFPType * reduces) { - const uint local_size = get_local_size(0); - - __local algorithmFPType partialReduces[LOCAL_BUFFER_SIZE]; - - uint globalDim = vectorSize; - uint localDim = 1; - uint itemId = get_local_id(0); - uint groupId = get_group_id(0); - - partialReduces[itemId] = INIT_VALUE; - for (uint i = itemId; i < vectorSize; i += local_size) - { - partialReduces[itemId] = BINARY_OP(partialReduces[itemId], mergedReduce[groupId * globalDim + i * localDim]); - } - - barrier(CLK_LOCAL_MEM_FENCE); - - for (uint stride = local_size / 2; stride > 1; stride >>= 2) - { - if 
(stride > itemId) - { - partialReduces[itemId] = BINARY_OP(partialReduces[itemId], partialReduces[itemId + stride]); - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (itemId == 0) - { - reduces[groupId] = BINARY_OP(partialReduces[itemId], partialReduces[itemId + 1]); - } - } - -); - -#endif diff --git a/cpp/daal/src/sycl/cl_kernels/partition.cl b/cpp/daal/src/sycl/cl_kernels/partition.cl deleted file mode 100755 index 5f0f8801cc2..00000000000 --- a/cpp/daal/src/sycl/cl_kernels/partition.cl +++ /dev/null @@ -1,175 +0,0 @@ -/* file: partition.cl */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of partion kernels. 
-//-- -*/ - -#ifndef __PARTITION_CL__ -#define __PARTITION_CL__ - -#define DECLARE_SOURCE(name, src) static const char * name = #src; - -DECLARE_SOURCE( - kernelsPartition, - - __kernel void scan(const __global int * mask, __global int * partialSums, int nElems) { - const int nGroups = get_num_groups(0); - const int nSubGroups = get_num_sub_groups(); - const int nTotalSubGroups = nSubGroups * nGroups; - const int nElementsForSubgroup = nElems / nTotalSubGroups + !!(nElems % nTotalSubGroups); - const int localSize = get_sub_group_size(); - - const int id = get_local_id(0); - const int localId = get_sub_group_local_id(); - const int subGroupId = get_sub_group_id(); - const int groupId = get_group_id(0) * nSubGroups + subGroupId; - - int iStart = groupId * nElementsForSubgroup; - int iEnd = min((groupId + 1) * nElementsForSubgroup, nElems); - - int sum = 0; - - for (int i = iStart + localId; i < iEnd; i += localSize) - { - const int value = mask[i]; - sum += sub_group_reduce_add(value); - } - - if (localId == 0) - { - partialSums[groupId] = sum; - } - } - - __kernel void scanIndex(const __global int * mask, const __global int * indices, __global int * partialSums, int nElems) { - const int nGroups = get_num_groups(0); - const int nSubGroups = get_num_sub_groups(); - const int nTotalSubGroups = nSubGroups * nGroups; - const int nElementsForSubgroup = nElems / nTotalSubGroups + !!(nElems % nTotalSubGroups); - const int localSize = get_sub_group_size(); - - const int id = get_local_id(0); - const int localId = get_sub_group_local_id(); - const int subGroupId = get_sub_group_id(); - const int groupId = get_group_id(0) * nSubGroups + subGroupId; - - int iStart = groupId * nElementsForSubgroup; - int iEnd = min((groupId + 1) * nElementsForSubgroup, nElems); - - int sum = 0; - - for (int i = iStart + localId; i < iEnd; i += localSize) - { - const int value = mask[indices[i]]; - sum += sub_group_reduce_add(value); - } - - if (localId == 0) - { - partialSums[groupId] = sum; - 
} - } - - __kernel void sumScan(const __global int * partialSums, __global int * partialPrefixSums, __global int * totalSum, int nSubgroupSums) { - if (get_sub_group_id() > 0) return; - - const int localSize = get_sub_group_size(); - const int localId = get_sub_group_local_id(); - - int sum = 0; - - for (int i = localId; i < nSubgroupSums; i += localSize) - { - int value = partialSums[i]; - int boundary = sub_group_scan_exclusive_add(value); - partialPrefixSums[i] = sum + boundary; - sum += sub_group_reduce_add(value); - } - - if (localId == 0) - { - totalSum[0] = sum; - partialPrefixSums[nSubgroupSums] = sum; - } - } - - __kernel void reorder(const __global int * mask, const __global algorithmFPType * data, __global algorithmFPType * outData, - const __global int * partialPrefixSums, int nElems) { - const int nGroups = get_num_groups(0); - const int nSubGroups = get_num_sub_groups(); - const int nTotalSubGroups = nSubGroups * nGroups; - const int nElementsForSubgroup = nElems / nTotalSubGroups + !!(nElems % nTotalSubGroups); - const int localSize = get_sub_group_size(); - - const int id = get_local_id(0); - const int localId = get_sub_group_local_id(); - const int subGroupId = get_sub_group_id(); - const int groupId = get_group_id(0) * nSubGroups + subGroupId; - - int iStart = groupId * nElementsForSubgroup; - int iEnd = min((groupId + 1) * nElementsForSubgroup, nElems); - - int groupOffset = partialPrefixSums[groupId]; - int totalOffset = nElems - partialPrefixSums[nTotalSubGroups]; - - int sum = 0; - - for (int i = iStart + localId; i < iEnd; i += localSize) - { - const int part = mask[i]; - const int boundary = groupOffset + sum + sub_group_scan_exclusive_add(part); - if (part) outData[boundary] = data[i]; - sum += sub_group_reduce_add(part); - } - } - - __kernel void reorderIndex(const __global int * mask, const __global int * indices, __global int * outData, - const __global int * partialPrefixSums, int nElems) { - const int nGroups = get_num_groups(0); - 
const int nSubGroups = get_num_sub_groups(); - const int nTotalSubGroups = nSubGroups * nGroups; - const int nElementsForSubgroup = nElems / nTotalSubGroups + !!(nElems % nTotalSubGroups); - const int localSize = get_sub_group_size(); - - const int localId = get_sub_group_local_id(); - const int subGroupId = get_sub_group_id(); - const int groupId = get_group_id(0) * nSubGroups + subGroupId; - - int iStart = groupId * nElementsForSubgroup; - int iEnd = min((groupId + 1) * nElementsForSubgroup, nElems); - - int groupOffset = partialPrefixSums[groupId]; - int totalOffset = nElems - partialPrefixSums[nTotalSubGroups]; - - int sum = 0; - - for (int i = iStart + localId; i < iEnd; i += localSize) - { - const int indexi = indices[i]; - const int part = mask[indexi]; - const int boundary = groupOffset + sum + sub_group_scan_exclusive_add(part); - if (part) outData[boundary] = indexi; - sum += sub_group_reduce_add(part); - } - } - -); - -#endif diff --git a/cpp/daal/src/sycl/cl_kernels/radix_sort.cl b/cpp/daal/src/sycl/cl_kernels/radix_sort.cl deleted file mode 100644 index c0ade905326..00000000000 --- a/cpp/daal/src/sycl/cl_kernels/radix_sort.cl +++ /dev/null @@ -1,282 +0,0 @@ -/* file: radix_sort.cl */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -/* -//++ -// Implementation of radix sort kernels. -//-- -*/ - -#ifndef __RADIX_SORT_CL__ -#define __RADIX_SORT_CL__ - -#define DECLARE_SOURCE(name, src) static const char * name = #src; - -DECLARE_SOURCE( - radix_sort_simd, - - void swap(__global sortedType ** input, __global sortedType ** output) { - __global sortedType * tmp = *input; - *input = *output; - *output = tmp; - } - - uint __attribute__((overloadable)) invBits(uint x) { - return x ^ (-(x >> 31) | 0x80000000u); - // return x ^ 0x80000000u; - } - - ulong __attribute__((overloadable)) invBits(ulong x) { - return x ^ (-(x >> 63) | 0x8000000000000000ul); - // return x ^ 0x8000000000000000u; - } - - __kernel void radix_sort_group(__global sortedType * labels, __global int * sorted, __global int * radixbuf, unsigned int N, - unsigned int BlockOffset) { - const unsigned int global_id = get_global_id(0); - const unsigned int local_id = get_local_id(1); - // Code is written for a single subgroup. 
It's necessary to adjust the local range if idle subgoups are presented - if (get_sub_group_id() > 0) return; - const unsigned int local_size = get_sub_group_size(); - unsigned int group_aligned_size = N - N % local_size; - unsigned int rem = N - group_aligned_size; - // radixBuf should be big enough to accumulate radix_range elements - const unsigned int radix_range = 256; - const unsigned int byte_range = 8; - - const unsigned int radix_count = sizeof(sortedType); - __global sortedType * input = &labels[global_id * BlockOffset]; - __global sortedType * output = &sorted[global_id * BlockOffset]; - __global int * counters = &radixbuf[global_id * radix_range]; - // Radix sort - for (unsigned int i = 0; i < radix_count; i++) - { - __global unsigned char * cinput = (__global unsigned char *)input; - for (unsigned int j = local_id; j < radix_range; j++) counters[j] = 0; - // Count elements in sub group to write once per value - for (unsigned int j = local_id; j < group_aligned_size + local_size; j += local_size) - { - bool exists = j < group_aligned_size || local_id < rem; - unsigned char c = exists ? cinput[j * radix_count + i] : 0; - int entry = -1; - for (unsigned int k = 0; k < local_size; k++) - { - bool correct = j < group_aligned_size || k < rem; - unsigned int done = sub_group_broadcast(correct ? 0 : 1, k); - if (done) break; - unsigned char value = sub_group_broadcast(c, k); - if (entry < 0 && value == c) entry = k; - unsigned int count = sub_group_reduce_add(exists && value == c ? 
1 : 0); - if (entry == local_id && entry == k) - { - counters[value] += count; - } - } - sub_group_barrier(CLK_GLOBAL_MEM_FENCE); - } - // Parallel scan on counters to generate offsets in place - unsigned int offset = 0; - for (unsigned int j = local_id; j < radix_range; j += local_size) - { - unsigned int value = counters[j]; - unsigned int boundary = sub_group_scan_exclusive_add(value); - counters[j] = offset + boundary; - unsigned int partial_offset = sub_group_reduce_add(value); - offset += partial_offset; - } - sub_group_barrier(CLK_GLOBAL_MEM_FENCE); - for (unsigned int j = local_id; j < group_aligned_size + local_size; j += local_size) - { - bool exists = j < group_aligned_size || local_id < rem; - unsigned char c = exists ? cinput[j * radix_count + i] : 0; - unsigned int local_offset = 0; - unsigned int done = 0; - int entry = -1; - - for (unsigned int k = 0; k < local_size; k++) - { - bool correct = j < group_aligned_size || k < rem; - unsigned int done = sub_group_broadcast(correct ? 0 : 1, k); - if (done) - { - break; - } - unsigned int skip = sub_group_broadcast(entry < 0 ? 0 : 1, k); - if (skip) continue; - unsigned char value = sub_group_broadcast(c, k); - if (entry < 0 && value == c) entry = k; - unsigned int offset = sub_group_scan_exclusive_add(value == c && exists ? 1 : 0); - if (value == c) - { - local_offset = offset + counters[value]; - } - unsigned int count = sub_group_reduce_add(value == c && exists ? 
1 : 0); - if (local_id == k && entry == k) - { - counters[value] += count; - } - } - sub_group_barrier(CLK_GLOBAL_MEM_FENCE); - if (exists) output[local_offset] = input[j]; - } - swap(&input, &output); - } - for (unsigned int i = local_id; i < N; i += local_size) output[i] = input[i]; - } - - __kernel void radixScan(const __global radixIntType * values, __global int * partialHists, unsigned int nRows, unsigned int bitOffset) { - const unsigned int RADIX_BITS = 4; - - const unsigned int n_groups = get_num_groups(0); - const unsigned int n_sub_groups = get_num_sub_groups(); - const unsigned int n_total_sub_groups = n_sub_groups * n_groups; - const unsigned int nElementsForSubgroup = nRows / n_total_sub_groups + !!(nRows % n_total_sub_groups); - const unsigned int local_size = get_sub_group_size(); - - const unsigned int id = get_local_id(0); - const unsigned int local_id = get_sub_group_local_id(); - const unsigned int sub_group_id = get_sub_group_id(); - const unsigned int group_id = get_group_id(0) * n_sub_groups + sub_group_id; - - unsigned int iStart = group_id * nElementsForSubgroup; - unsigned int iEnd = (group_id + 1) * nElementsForSubgroup; - - if (iEnd > nRows) - { - iEnd = nRows; - } - - unsigned int offset[(unsigned int)1 << RADIX_BITS]; - const unsigned int radix_range = (unsigned int)1 << RADIX_BITS; - const unsigned int radix_range_1 = radix_range - 1; - for (unsigned int i = 0; i < radix_range; i++) - { - offset[i] = 0; - } - - for (unsigned int i = iStart + local_id; i < iEnd; i += local_size) - { - radixIntType data_bits = ((invBits(values[i]) >> bitOffset) & radix_range_1); - for (unsigned int j = 0; j < radix_range; j++) - { - unsigned int value = data_bits == j; - unsigned int partial_offset = sub_group_reduce_add(value); - offset[j] += partial_offset; - } - } - - if (local_id == 0) - { - for (unsigned int j = 0; j < radix_range; j++) - { - partialHists[group_id * radix_range + j] = offset[j]; - } - } - } - - __kernel void radixHistScan(const 
__global int * partialHists, __global int * partialPrefixHists, unsigned int nSubgroupSums) { - const unsigned int RADIX_BITS = 4; - - if (get_sub_group_id() > 0) return; - - const unsigned int local_size = get_sub_group_size(); - const unsigned int local_id = get_sub_group_local_id(); - - unsigned int offset[(unsigned int)1 << RADIX_BITS]; - const unsigned int radix_range = (unsigned int)1 << RADIX_BITS; - for (unsigned int i = 0; i < radix_range; i++) - { - offset[i] = 0; - } - - for (unsigned int i = local_id; i < nSubgroupSums; i += local_size) - { - for (unsigned int j = 0; j < radix_range; j++) - { - unsigned int value = partialHists[i * radix_range + j]; - unsigned int boundary = sub_group_scan_exclusive_add(value); - partialPrefixHists[i * radix_range + j] = offset[j] + boundary; - unsigned int partial_offset = sub_group_reduce_add(value); - offset[j] += partial_offset; - } - } - - if (local_id == 0) - { - unsigned int totalSum = 0; - for (unsigned int j = 0; j < radix_range; j++) - { - partialPrefixHists[nSubgroupSums * radix_range + j] = totalSum; - totalSum += offset[j]; - } - } - } - - __kernel void radixReorder(const __global radixIntType * valuesSrc, const __global int * indicesSrc, const __global int * partialPrefixHists, - __global radixIntType * valuesDst, __global int * indicesDst, unsigned int nRows, unsigned int bitOffset) { - const unsigned int RADIX_BITS = 4; - - const unsigned int n_groups = get_num_groups(0); - const unsigned int n_sub_groups = get_num_sub_groups(); - const unsigned int n_total_sub_groups = n_sub_groups * n_groups; - const unsigned int nElementsForSubgroup = nRows / n_total_sub_groups + !!(nRows % n_total_sub_groups); - const unsigned int local_size = get_sub_group_size(); - - const unsigned int id = get_local_id(0); - const unsigned int local_id = get_sub_group_local_id(); - const unsigned int sub_group_id = get_sub_group_id(); - const unsigned int group_id = get_group_id(0) * n_sub_groups + sub_group_id; - - unsigned int 
iStart = group_id * nElementsForSubgroup; - unsigned int iEnd = (group_id + 1) * nElementsForSubgroup; - - if (iEnd > nRows) - { - iEnd = nRows; - } - - unsigned int offset[(unsigned int)1 << RADIX_BITS]; - - const unsigned int radix_range = (unsigned int)1 << RADIX_BITS; - const unsigned int radix_range_1 = radix_range - 1; - - for (unsigned int i = 0; i < radix_range; i++) - { - offset[i] = partialPrefixHists[group_id * radix_range + i] + partialPrefixHists[n_total_sub_groups * radix_range + i]; - } - - for (unsigned int i = iStart + local_id; i < iEnd; i += local_size) - { - radixIntType data_value = valuesSrc[i]; - radixIntType data_bits = ((invBits(data_value) >> bitOffset) & radix_range_1); - unsigned int pos_new = 0; - for (unsigned int j = 0; j < radix_range; j++) - { - unsigned int value = data_bits == j; - unsigned int boundary = sub_group_scan_exclusive_add(value); - pos_new |= value * (offset[j] + boundary); - unsigned int partial_offset = sub_group_reduce_add(value); - offset[j] = offset[j] + partial_offset; - } - valuesDst[pos_new] = data_value; - indicesDst[pos_new] = indicesSrc[i]; - } - } - -); - -#endif diff --git a/cpp/daal/src/sycl/cl_kernels/select_indexed.cl b/cpp/daal/src/sycl/cl_kernels/select_indexed.cl deleted file mode 100644 index f92d0006fa3..00000000000 --- a/cpp/daal/src/sycl/cl_kernels/select_indexed.cl +++ /dev/null @@ -1,241 +0,0 @@ -/* file: select_indexed.cl */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of quick select kernels. -//-- -*/ - -#ifndef __SELECT_INDEXED_CL__ -#define __SELECT_INDEXED_CL__ - -#define DECLARE_SOURCE(name, src) static const char * name = #src; - -DECLARE_SOURCE( - quick_select_simd, - - algorithmFPType get_rnd(__global const algorithmFPType * rnd_seq, int rnd_eriod, int * count) { - algorithmFPType ret = rnd_seq[(*count)++]; - if (*count >= rnd_eriod) - { - *count = 0; - } - return ret; - } - - void partition_by_values(__global algorithmFPType * values, __global int * indices, int partition_start, int partition_end, int local_id, - int local_size, algorithmFPType pivot, int * split_index_ptr, int * great_total_ptr) { - int full_size = partition_end - partition_start; - int last_group_size = full_size % local_size; - int full_group_size = full_size - last_group_size; - - for (int i = partition_start + local_id; i < partition_end; i += local_size) - { - sub_group_barrier(CLK_GLOBAL_MEM_FENCE); - algorithmFPType cur_value = values[i]; - int cur_index = indices[i]; - unsigned char is_small = cur_value < pivot ? 1 : 0; - unsigned char num_of_great = sub_group_reduce_add(cur_value > pivot ? 1 : 0); - unsigned char num_of_small = sub_group_reduce_add(is_small); - int min_ind = sub_group_reduce_min(i); - if (num_of_small > 0) - { - unsigned char pos_in_group_small = sub_group_scan_exclusive_add(is_small); - unsigned char pos_in_group_great = sub_group_scan_exclusive_add(is_small > 0 ? 0 : 1); - int cur_size = i > full_group_size - 1 ? 
last_group_size : local_size; - if (is_small) - { - algorithmFPType value_to_move = values[partition_start + *split_index_ptr + pos_in_group_small]; - int index_to_move = indices[partition_start + *split_index_ptr + pos_in_group_small]; - - values[partition_start + *split_index_ptr + pos_in_group_small] = cur_value; - indices[partition_start + *split_index_ptr + pos_in_group_small] = cur_index; - if (partition_start + *split_index_ptr + pos_in_group_small < min_ind) - { - values[min_ind + cur_size - 1 - pos_in_group_small] = value_to_move; - indices[min_ind + cur_size - 1 - pos_in_group_small] = index_to_move; - } - } - else - { - values[min_ind + cur_size - 1 - pos_in_group_great] = cur_value; - indices[min_ind + cur_size - 1 - pos_in_group_great] = cur_index; - } - } - sub_group_barrier(CLK_GLOBAL_MEM_FENCE); - *split_index_ptr += num_of_small; - *great_total_ptr += num_of_great; - } - *split_index_ptr = -sub_group_reduce_min(-(*split_index_ptr)); - *great_total_ptr = -sub_group_reduce_min(-(*great_total_ptr)); - } - - __kernel void quick_select_group(__global algorithmFPType * in_values, __global int * in_indices, __global algorithmFPType * out_values, - __global int * out_indices, __global const algorithmFPType * rnd_seq, int RndPeriod, int N, int NLast, int K, - int BlockOffset) { - const int row_id = get_global_id(0) * get_num_sub_groups() + get_sub_group_id(); - const int local_id = get_local_id(1); - const int local_size = get_sub_group_size(); - const int row_number = get_global_size(0); - - if (row_id >= row_number) - { - return; - } - - N = (row_id == get_global_size(0) - 1) ? 
NLast : N; - - const int offset_in = row_id * BlockOffset; - const int offset_out = row_id * K; - int partition_start = 0; - int partition_end = N; - int rnd_count = 0; - - __global algorithmFPType * values = &in_values[offset_in]; - __global int * indices = &in_indices[offset_in]; - - for (int i = partition_start + local_id; i < partition_end; i += local_size) - { - indices[i] = i; - } - - int iteration_count = 0; - while (1) - { - iteration_count++; - int split_index = 0; - const algorithmFPType rnd = get_rnd(rnd_seq, RndPeriod, &rnd_count); - int pos = (int)(rnd * (partition_end - partition_start - 1)); - pos = pos < 0 ? 0 : pos; - const algorithmFPType pivot = values[partition_start + pos]; - int num_of_great = 0; - partition_by_values(values, indices, partition_start, partition_end, local_id, local_size, pivot, &split_index, &num_of_great); - - if ((partition_start + split_index) == K || (!split_index && !num_of_great)) - { - break; - } - if (partition_start + split_index > K) - { - partition_end = partition_start + split_index; - } - if (partition_start + split_index < K) - { - partition_start += split_index; - } - if (iteration_count > N) - { - break; - } - } - for (int i = local_id; i < K; i += local_size) - { - out_values[offset_out + i] = values[i]; - out_indices[offset_out + i] = indices[i]; - } - } - -); - -DECLARE_SOURCE( - direct_select_simd, __kernel void direct_select_group(__global const algorithmFPType * values_in, __global algorithmFPType * values_out, - __global int * indices_out, int N, int NL, int BlockOffset, algorithmFPType FPMax) { - const int local_size = get_sub_group_size(); - const int sub_group_num = get_num_sub_groups(); - const int M = get_global_size(0); - const int global_id = get_global_id(0) * sub_group_num + get_sub_group_id(); - - if (global_id >= M) - { - return; - } - - const int local_id = get_sub_group_local_id(); - - const __global algorithmFPType * finput = &values_in[global_id * BlockOffset]; - - if (global_id == 
get_global_size(0) - 1) - { - N = NL; - } - - const int array_size = __K__; - int indices[array_size]; - for (int j = 0; j < array_size; j++) - { - indices[j] = -1; - } - - algorithmFPType values[array_size]; - for (int j = 0; j < array_size; j++) - { - values[j] = FPMax; - } - - for (int i = local_id; i < N; i += local_size) - { - algorithmFPType value = finput[i]; - int index = i; - int pos = -1; - - for (int j = array_size - 1; j > -1; j--) - { - bool do_shift = values[j] > value; - pos = do_shift ? j : pos; - if (j < array_size - 1) - { - values[j + 1] = do_shift ? values[j] : values[j + 1]; - indices[j + 1] = do_shift ? indices[j] : indices[j + 1]; - } - } - if (pos != -1) - { - values[pos] = value; - indices[pos] = index; - } - } - sub_group_barrier(CLK_GLOBAL_MEM_FENCE); - int bias = 0; - algorithmFPType final_values[array_size]; - int final_indices[array_size]; - for (int i = 0; i < array_size; i++) - { - algorithmFPType min_val = sub_group_reduce_min(values[bias]); - bool present = (min_val == values[bias]); - int pos = sub_group_scan_exclusive_add(present ? 1 : 0); - bool owner = present && pos == 0; - final_indices[i] = -sub_group_reduce_min(owner ? -indices[bias] : 1); - final_values[i] = min_val; - bias += owner ? 
1 : 0; - } - - __global int * local_ind_out = &indices_out[global_id * __K__]; - __global algorithmFPType * local_val_out = &values_out[global_id * __K__]; - - for (int i = local_id; i < array_size; i += local_size) - { - local_ind_out[i] = final_indices[i]; - } - - for (int i = local_id; i < array_size; i += local_size) - { - local_val_out[i] = final_values[i]; - } - }); - -#endif diff --git a/cpp/daal/src/sycl/cl_kernels/sum_reducer.cl b/cpp/daal/src/sycl/cl_kernels/sum_reducer.cl deleted file mode 100644 index 8fe8d9a5704..00000000000 --- a/cpp/daal/src/sycl/cl_kernels/sum_reducer.cl +++ /dev/null @@ -1,199 +0,0 @@ -/* file: sum_reducer.cl */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Implementation of sum reduction kernels. 
-//-- -*/ - -#ifndef __SUM_REDUCER_CL__ -#define __SUM_REDUCER_CL__ - -#define DECLARE_SOURCE(name, src) static const char * name = #src; - -DECLARE_SOURCE( - sum_reducer, - - __kernel void sum_singlepass(uint vectorsAreRows, __global algorithmFPType * vectors, uint nVectors, uint vectorSize, - __global algorithmFPType * sums, __global algorithmFPType * sq_sums) { - const uint local_size = get_local_size(0); - - __local algorithmFPType partial_sums[LOCAL_BUFFER_SIZE]; - __local algorithmFPType partial_sq_sums[LOCAL_BUFFER_SIZE]; - - uint globalDim = 1; - uint localDim = nVectors; - - if (vectorsAreRows != 0) - { - globalDim = vectorSize; - localDim = 1; - } - - uint itemId = get_local_id(0); - uint groupId = get_global_id(1); - - algorithmFPType el = vectors[groupId * globalDim + itemId * localDim]; - partial_sums[itemId] = 0; - partial_sq_sums[itemId] = 0; - - for (uint i = itemId; i < vectorSize; i += local_size) - { - el = vectors[groupId * globalDim + i * localDim]; - partial_sums[itemId] += el; - partial_sq_sums[itemId] += el * el; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - for (uint stride = local_size / 2; stride > 1; stride /= 2) - { - if (stride > itemId) - { - partial_sums[itemId] += partial_sums[itemId + stride]; - partial_sq_sums[itemId] += partial_sq_sums[itemId + stride]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (itemId == 0) - { - sums[groupId] = partial_sums[itemId] + partial_sums[itemId + 1]; - sq_sums[groupId] = partial_sq_sums[itemId] + partial_sq_sums[itemId + 1]; - } - } - - __kernel void sum_singlesubgroup(__global algorithmFPType * vectors, uint nVectors, uint vectorSize, __global algorithmFPType * sums, - __global algorithmFPType * sq_sums) { - const uint localId = get_local_id(1); - const uint localSize = get_local_size(1); - const uint groupId = get_global_id(0); - const uint offset = groupId * vectorSize; - if (get_sub_group_id() > 0) return; - - algorithmFPType partial_sums = 0; - algorithmFPType partial_sq_sums = 0; - - for (uint i 
= localId; i < vectorSize; i += localSize) - { - algorithmFPType el = vectors[offset + i]; - partial_sums += el; - partial_sq_sums += el * el; - } - - partial_sums = sub_group_reduce_add(partial_sums); - partial_sq_sums = sub_group_reduce_add(partial_sq_sums); - - if (localId == 0) - { - sums[groupId] = partial_sums; - sq_sums[groupId] = partial_sq_sums; - } - } - - void __sum_reduce_colmajor(__global const algorithmFPType * vectors, const uint nVectors, const uint vectorSize, - __global algorithmFPType * mergedSums, __global algorithmFPType * mergedSqSums, const uint rowPartIndex, - const uint rowParts, const uint colPartIndex, const uint colParts, const uint tid, const uint tnum) { - const uint colOffset = colPartIndex * tnum; - const uint x = tid + colOffset; - - if (x < nVectors) - { - uint rowPartSize = (vectorSize + rowParts - 1) / rowParts; - const uint rowOffset = rowPartSize * rowPartIndex; - - if (rowPartSize + rowOffset > vectorSize) - { - rowPartSize = vectorSize - rowOffset; - } - - algorithmFPType partialSums = 0.0; - algorithmFPType partialSqSums = 0.0; - - for (int row = 0; row < rowPartSize; row++) - { - const uint y = (row + rowOffset) * nVectors; - const algorithmFPType el = vectors[y + x]; - - partialSums += el; - partialSqSums += el * el; - } - - mergedSums[x * rowParts + rowPartIndex] = partialSums; - mergedSqSums[x * rowParts + rowPartIndex] = partialSqSums; - } - } - - __kernel void sum_step_colmajor(__global const algorithmFPType * vectors, const uint nVectors, const uint vectorSize, - __global algorithmFPType * mergedSums, __global algorithmFPType * mergedSqSums) { - const uint tid = get_local_id(0); - const uint tnum = get_local_size(0); - const uint gid = get_group_id(0); - const uint gnum = get_num_groups(0); - - const uint colParts = (nVectors + tnum - 1) / tnum; - const uint rowParts = gnum / colParts; - - const uint rowPartIndex = gid / colParts; - const uint colPartIndex = gid - rowPartIndex * colParts; - - 
__sum_reduce_colmajor(vectors, nVectors, vectorSize, mergedSums, mergedSqSums, rowPartIndex, rowParts, colPartIndex, colParts, tid, tnum); - } - - __kernel void sum_final_step_rowmajor(__global const algorithmFPType * mergedSums, __global const algorithmFPType * mergedSqSums, uint nVectors, - uint vectorSize, __global algorithmFPType * sums, __global algorithmFPType * sqSums) { - const uint local_size = get_local_size(0); - - __local algorithmFPType partial_sums[LOCAL_BUFFER_SIZE]; - __local algorithmFPType partial_sq_sums[LOCAL_BUFFER_SIZE]; - - uint globalDim = vectorSize; - uint localDim = 1; - uint itemId = get_local_id(0); - uint groupId = get_group_id(0); - - partial_sums[itemId] = 0; - partial_sq_sums[itemId] = 0; - for (uint i = itemId; i < vectorSize; i += local_size) - { - partial_sums[itemId] += mergedSums[groupId * globalDim + i * localDim]; - partial_sq_sums[itemId] += mergedSqSums[groupId * globalDim + i * localDim]; - } - - barrier(CLK_LOCAL_MEM_FENCE); - - for (uint stride = local_size / 2; stride > 1; stride /= 2) - { - if (stride > itemId) - { - partial_sums[itemId] += partial_sums[itemId + stride]; - partial_sq_sums[itemId] += partial_sq_sums[itemId + stride]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if (itemId == 0) - { - sums[groupId] = partial_sums[itemId] + partial_sums[itemId + 1]; - sqSums[groupId] = partial_sq_sums[itemId] + partial_sq_sums[itemId + 1]; - } - } - -); - -#endif diff --git a/cpp/daal/src/sycl/gpu_support_checker.cpp b/cpp/daal/src/sycl/gpu_support_checker.cpp deleted file mode 100644 index aa5cefe3dfe..00000000000 --- a/cpp/daal/src/sycl/gpu_support_checker.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/** file gpu_support_checker.cpp */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#include "services/internal/gpu_support_checker.h" -#include "services/env_detect.h" - -namespace daal -{ -namespace services -{ -namespace internal -{ -GpuSupportChecker & GpuSupportChecker::GetInstance() -{ - static GpuSupportChecker instance; - return instance; -} - -DAAL_EXPORT bool isImplementedForDevice(const services::internal::sycl::InfoDevice & deviceInfo, algorithms::AlgorithmContainerIface * iface) -{ - bool ret = true; - if (!deviceInfo.isCpu) - { - ret = GpuSupportChecker::GetInstance().check(iface); - } - return ret; -} - -} //namespace internal -} //namespace services -} //namespace daal diff --git a/cpp/daal/src/sycl/lapack_gpu.cpp b/cpp/daal/src/sycl/lapack_gpu.cpp deleted file mode 100644 index fafc9a5a13c..00000000000 --- a/cpp/daal/src/sycl/lapack_gpu.cpp +++ /dev/null @@ -1,99 +0,0 @@ -/* file: lapack_gpu.cpp */ -/******************************************************************************* -* Copyright 2015 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#include "services/internal/sycl/math/reference_lapack.h" -#include "src/externals/service_lapack.h" -#include "services/error_handling.h" -#include "src/sycl/blas_gpu.h" -#include "src/sycl/cl_kernels/kernel_blas.cl" - -namespace daal -{ -namespace services -{ -namespace internal -{ -namespace sycl -{ -namespace math -{ -namespace interface1 -{ -using namespace daal::internal; - -template -services::Status ReferencePotrf::operator()(const math::UpLo uplo, const size_t n, - services::internal::Buffer & a_buffer, const size_t lda) -{ - services::Status status; - - char up = uplo == math::UpLo::Upper ? 'U' : 'L'; - - DAAL_INT info; - - DAAL_INT nInt = static_cast(n); - DAAL_INT ldaInt = static_cast(lda); - - services::SharedPtr aPtr = a_buffer.toHost(data_management::ReadWriteMode::readWrite, status); - DAAL_CHECK_STATUS_VAR(status); - - LapackAutoDispatch::xpotrf(&up, &nInt, aPtr.get(), &ldaInt, &info); - - DAAL_CHECK(info == 0, services::ErrorID::ErrorNormEqSystemSolutionFailed); - return status; -} - -template -services::Status ReferencePotrs::operator()(const math::UpLo uplo, const size_t n, const size_t ny, - services::internal::Buffer & a_buffer, const size_t lda, - services::internal::Buffer & b_buffer, const size_t ldb) -{ - services::Status status; - - char up = uplo == math::UpLo::Upper ? 
'U' : 'L'; - - DAAL_INT info; - - DAAL_INT nInt = static_cast(n); - DAAL_INT nyInt = static_cast(ny); - DAAL_INT ldaInt = static_cast(lda); - DAAL_INT ldbInt = static_cast(ldb); - - services::SharedPtr aPtr = a_buffer.toHost(data_management::ReadWriteMode::readWrite, status); - DAAL_CHECK_STATUS_VAR(status); - - services::SharedPtr bPtr = b_buffer.toHost(data_management::ReadWriteMode::readWrite, status); - DAAL_CHECK_STATUS_VAR(status); - - LapackAutoDispatch::xpotrs(&up, &nInt, &nyInt, aPtr.get(), &ldaInt, bPtr.get(), &ldbInt, &info); - - DAAL_CHECK(info == 0, services::ErrorID::ErrorNormEqSystemSolutionFailed); - return status; -} - -template class ReferencePotrf; -template class ReferencePotrf; - -template class ReferencePotrs; -template class ReferencePotrs; - -} // namespace interface1 -} // namespace math -} // namespace sycl -} // namespace internal -} // namespace services -} // namespace daal diff --git a/cpp/daal/src/sycl/lapack_gpu.h b/cpp/daal/src/sycl/lapack_gpu.h deleted file mode 100644 index 78dabf59ba6..00000000000 --- a/cpp/daal/src/sycl/lapack_gpu.h +++ /dev/null @@ -1,74 +0,0 @@ -/* file: lapack_gpu.h */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Template wrappers for common GPU lapack functions. 
-//-- -*/ - -#ifndef __SERVICE_ONEAPI_LAPACK_GPU_H__ -#define __SERVICE_ONEAPI_LAPACK_GPU_H__ - -#include "services/internal/sycl/execution_context.h" -#include "services/internal/sycl/types_utils.h" -#include "src/sycl/math_service_types.h" -#include "services/internal/buffer.h" -#include "services/internal/execution_context.h" -#include "services/internal/sycl/math/types.h" - -namespace daal -{ -namespace services -{ -namespace internal -{ -namespace sycl -{ -template -struct LapackGpu -{ - static services::Status xpotrf(const math::UpLo uplo, const uint32_t n, UniversalBuffer a_buffer, const uint32_t lda) - { - services::Status status; - - ExecutionContextIface & ctx = services::internal::getDefaultContext(); - - ctx.potrf(uplo, n, a_buffer, lda, status); - - return status; - } - - static services::Status xpotrs(const math::UpLo uplo, const uint32_t n, const uint32_t ny, UniversalBuffer a_buffer, const uint32_t lda, - UniversalBuffer b_buffer, const uint32_t ldb) - { - services::Status status; - - ExecutionContextIface & ctx = services::internal::getDefaultContext(); - - ctx.potrs(uplo, n, ny, a_buffer, lda, b_buffer, ldb, status); - - return status; - } -}; - -} // namespace sycl -} // namespace internal -} // namespace services -} // namespace daal - -#endif diff --git a/cpp/daal/src/sycl/math_service_types.h b/cpp/daal/src/sycl/math_service_types.h deleted file mode 100644 index 6ed5629c280..00000000000 --- a/cpp/daal/src/sycl/math_service_types.h +++ /dev/null @@ -1,82 +0,0 @@ -/* file: math_service_types.h */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#ifndef __MATH_SERVICE_TYPES_H__ -#define __MATH_SERVICE_TYPES_H__ - -#include "services/internal/sycl/types.h" -#include "services/internal/execution_context.h" -#include "services/internal/sycl/math/types.h" -#include "services/internal/sycl/execution_context.h" -#include "services/internal/buffer.h" -#include "services/error_handling.h" -#include "src/sycl/cl_kernels/math.cl" - -namespace daal -{ -namespace services -{ -namespace internal -{ -namespace sycl -{ -namespace math -{ -template -static algorithmFPType expThreshold() -{ - return IsSameType::value ? 
-650.0 : -75.0f; -} - -template -static services::Status vLog(const services::internal::Buffer & x, services::internal::Buffer & result, - const uint32_t n) -{ - services::Status status; - - services::internal::sycl::ExecutionContextIface & ctx = services::internal::getDefaultContext(); - services::internal::sycl::ClKernelFactoryIface & factory = ctx.getClKernelFactory(); - - const services::String options = services::internal::sycl::getKeyFPType(); - services::String cachekey("__daal_algorithms_math_"); - cachekey.add(options); - factory.build(services::internal::sycl::ExecutionTargetIds::device, cachekey.c_str(), clKernelMath, options.c_str(), status); - DAAL_CHECK_STATUS_VAR(status); - - const char * const kernelName = "vLog"; - services::internal::sycl::KernelPtr kernel = factory.getKernel(kernelName, status); - DAAL_CHECK_STATUS_VAR(status); - - services::internal::sycl::KernelArguments args(2, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, x, services::internal::sycl::AccessModeIds::read); - args.set(1, result, services::internal::sycl::AccessModeIds::write); - - services::internal::sycl::KernelRange range(n); - - ctx.run(range, kernel, args, status); - - return status; -} - -} // namespace math -} // namespace sycl -} // namespace internal -} // namespace services -} // namespace daal - -#endif diff --git a/cpp/daal/src/sycl/partition.cpp b/cpp/daal/src/sycl/partition.cpp deleted file mode 100644 index 35bcde70e20..00000000000 --- a/cpp/daal/src/sycl/partition.cpp +++ /dev/null @@ -1,290 +0,0 @@ -/* file: partition.cpp */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#include "src/sycl/partition.h" -#include "services/internal/execution_context.h" -#include "src/externals/service_profiler.h" -#include "src/sycl/cl_kernels/partition.cl" - -namespace daal -{ -namespace services -{ -namespace internal -{ -namespace sycl -{ -services::Status Partition::buildProgram(ClKernelFactoryIface & factory, const TypeId & vectorTypeId) -{ - services::String fptype_name = getKeyFPType(vectorTypeId); - auto build_options = fptype_name; - services::String cachekey("__daal_oneapi_internal_partition_"); - cachekey.add(build_options); - - services::Status status; - factory.build(ExecutionTargetIds::device, cachekey.c_str(), kernelsPartition, build_options.c_str(), status); - DAAL_CHECK_STATUS_VAR(status); - - return status; -} - -services::Status Partition::scan(ClKernelFactoryIface & factory, UniversalBuffer & mask, UniversalBuffer & partialSums, size_t nElems, - size_t localSize, size_t nLocalSums) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(partition.scan); - - services::Status status; - - auto & context = services::internal::getDefaultContext(); - auto kernel = factory.getKernel("scan", status); - DAAL_CHECK_STATUS_VAR(status); - - KernelArguments args(3, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, mask, AccessModeIds::read); - args.set(1, partialSums, AccessModeIds::write); - args.set(2, (int)nElems); - - KernelRange local_range(localSize); - KernelRange global_range(localSize * nLocalSums); - - KernelNDRange range(1); - range.global(global_range, 
status); - DAAL_CHECK_STATUS_VAR(status); - range.local(local_range, status); - DAAL_CHECK_STATUS_VAR(status); - - context.run(range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - - return status; -} - -services::Status Partition::scanIndex(ClKernelFactoryIface & factory, UniversalBuffer & mask, UniversalBuffer & data, UniversalBuffer & partialSums, - size_t nElems, size_t localSize, size_t nLocalSums) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(partition.scanIndex); - - services::Status status; - - auto & context = services::internal::getDefaultContext(); - auto kernel = factory.getKernel("scanIndex", status); - DAAL_CHECK_STATUS_VAR(status); - - KernelArguments args(4, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, mask, AccessModeIds::read); - args.set(1, data, AccessModeIds::read); - args.set(2, partialSums, AccessModeIds::write); - args.set(3, (int)nElems); - - KernelRange local_range(localSize); - KernelRange global_range(localSize * nLocalSums); - - KernelNDRange range(1); - range.global(global_range, status); - DAAL_CHECK_STATUS_VAR(status); - range.local(local_range, status); - DAAL_CHECK_STATUS_VAR(status); - - context.run(range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - - return status; -} - -services::Status Partition::sumScan(ClKernelFactoryIface & factory, UniversalBuffer & partialSums, UniversalBuffer & partialPrefixSums, - UniversalBuffer & totalSum, size_t localSize, size_t nSubgroupSums) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(partition.sumScan); - services::Status status; - - auto & context = services::internal::getDefaultContext(); - auto kernel = factory.getKernel("sumScan", status); - DAAL_CHECK_STATUS_VAR(status); - - KernelArguments args(4, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, partialSums, AccessModeIds::read); - args.set(1, partialPrefixSums, AccessModeIds::write); - args.set(2, totalSum, AccessModeIds::write); - args.set(3, (int)nSubgroupSums); - - KernelRange local_range(localSize); - KernelRange 
global_range(localSize); - - KernelNDRange range(1); - range.global(global_range, status); - DAAL_CHECK_STATUS_VAR(status); - range.local(local_range, status); - DAAL_CHECK_STATUS_VAR(status); - - context.run(range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - - return status; -} - -services::Status Partition::reorder(ClKernelFactoryIface & factory, UniversalBuffer & mask, UniversalBuffer & data, UniversalBuffer & outData, - UniversalBuffer & partialPrefixSums, size_t nElems, size_t localSize, size_t nLocalSums) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(partition.reorder); - - services::Status status; - - auto & context = services::internal::getDefaultContext(); - auto kernel = factory.getKernel("reorder", status); - DAAL_CHECK_STATUS_VAR(status); - - KernelArguments args(5, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, mask, AccessModeIds::read); - args.set(1, data, AccessModeIds::read); - args.set(2, outData, AccessModeIds::write); - args.set(3, partialPrefixSums, AccessModeIds::read); - args.set(4, (int)nElems); - - KernelRange local_range(localSize); - KernelRange global_range(localSize * nLocalSums); - - KernelNDRange range(1); - range.global(global_range, status); - DAAL_CHECK_STATUS_VAR(status); - range.local(local_range, status); - DAAL_CHECK_STATUS_VAR(status); - - context.run(range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - - return status; -} - -services::Status Partition::reorderIndex(ClKernelFactoryIface & factory, UniversalBuffer & mask, UniversalBuffer & data, UniversalBuffer & outData, - UniversalBuffer & partialPrefixSums, size_t nElems, size_t localSize, size_t nLocalSums) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(partition.reorderIndex); - - services::Status status; - - auto & context = services::internal::getDefaultContext(); - auto kernel = factory.getKernel("reorderIndex", status); - DAAL_CHECK_STATUS_VAR(status); - - KernelArguments args(5, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, mask, AccessModeIds::read); 
- args.set(1, data, AccessModeIds::read); - args.set(2, outData, AccessModeIds::write); - args.set(3, partialPrefixSums, AccessModeIds::read); - args.set(4, (int)nElems); - - KernelRange local_range(localSize); - KernelRange global_range(localSize * nLocalSums); - - KernelNDRange range(1); - range.global(global_range, status); - DAAL_CHECK_STATUS_VAR(status); - range.local(local_range, status); - DAAL_CHECK_STATUS_VAR(status); - - context.run(range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - - return status; -} - -services::Status Partition::flagged(UniversalBuffer mask, UniversalBuffer data, UniversalBuffer outData, const size_t nElems, size_t & nSelect) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(flagged); - - services::Status status; - auto & context = services::internal::getDefaultContext(); - auto & factory = context.getClKernelFactory(); - - buildProgram(factory, data.type()); - - const uint32_t subSize = _preferableSubGroup; - const uint32_t localSize = _preferableSubGroup; - const uint32_t nLocalSums = _maxLocalSums * localSize < nElems ? 
_maxLocalSums : (nElems / localSize) + !!(nElems % localSize); - const uint32_t nSubgroupSums = nLocalSums * (localSize / subSize); - - auto partialSums = context.allocate(TypeIds::id(), nSubgroupSums + 1, status); - auto partialPrefixSums = context.allocate(TypeIds::id(), nSubgroupSums + 1, status); - auto totalSum = context.allocate(TypeIds::id(), 1, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_CHECK_STATUS_VAR(scan(factory, mask, partialSums, nElems, localSize, nLocalSums)); - DAAL_CHECK_STATUS_VAR(sumScan(factory, partialSums, partialPrefixSums, totalSum, localSize, nSubgroupSums)); - DAAL_CHECK_STATUS_VAR(reorder(factory, mask, data, outData, partialPrefixSums, nElems, localSize, nLocalSums)); - - { - auto totalSumHost = totalSum.template get().toHost(data_management::ReadWriteMode::readOnly, status); - DAAL_CHECK_STATUS_VAR(status); - const int * totalSumHostPtr = totalSumHost.get(); - if (!totalSumHostPtr) - { - return services::Status(services::ErrorNullPtr); - } - nSelect = totalSumHostPtr[0]; - } - - return status; -} - -services::Status Partition::flaggedIndex(UniversalBuffer mask, UniversalBuffer data, UniversalBuffer outData, const size_t nElems, size_t & nSelect) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(flaggedIndex); - - services::Status status; - auto & context = services::internal::getDefaultContext(); - auto & factory = context.getClKernelFactory(); - - buildProgram(factory, data.type()); - - const uint32_t subSize = _preferableSubGroup; - const uint32_t localSize = _preferableSubGroup; - const uint32_t nLocalSums = _maxLocalSums * localSize < nElems ? 
_maxLocalSums : (nElems / localSize) + !!(nElems % localSize); - const uint32_t nSubgroupSums = nLocalSums * (localSize / subSize); - - auto partialSums = context.allocate(TypeIds::id(), nSubgroupSums + 1, status); - auto partialPrefixSums = context.allocate(TypeIds::id(), nSubgroupSums + 1, status); - auto totalSum = context.allocate(TypeIds::id(), 1, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_CHECK_STATUS_VAR(scanIndex(factory, mask, data, partialSums, nElems, localSize, nLocalSums)); - DAAL_CHECK_STATUS_VAR(sumScan(factory, partialSums, partialPrefixSums, totalSum, localSize, nSubgroupSums)); - DAAL_CHECK_STATUS_VAR(reorderIndex(factory, mask, data, outData, partialPrefixSums, nElems, localSize, nLocalSums)); - - { - auto totalSumHost = totalSum.template get().toHost(data_management::ReadWriteMode::readOnly, status); - DAAL_CHECK_STATUS_VAR(status); - const int * totalSumHostPtr = totalSumHost.get(); - if (!totalSumHostPtr) - { - return services::Status(services::ErrorNullPtr); - } - - nSelect = totalSumHostPtr[0]; - } - - return status; -} - -} // namespace sycl -} // namespace internal -} // namespace services -} // namespace daal diff --git a/cpp/daal/src/sycl/partition.h b/cpp/daal/src/sycl/partition.h deleted file mode 100755 index 26d1e2093ea..00000000000 --- a/cpp/daal/src/sycl/partition.h +++ /dev/null @@ -1,72 +0,0 @@ -/* file: partition.h */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#ifndef __PARTITION_H__ -#define __PARTITION_H__ - -#include "src/sycl/math_service_types.h" -#include "services/internal/buffer.h" -#include "services/internal/sycl/types_utils.h" -#include "services/internal/sycl/execution_context.h" - -namespace daal -{ -namespace services -{ -namespace internal -{ -namespace sycl -{ -class Partition -{ -public: - Partition() = delete; - - static services::Status flaggedIndex(UniversalBuffer mask, UniversalBuffer data, UniversalBuffer outData, const size_t nElems, size_t & nSelect); - static services::Status flagged(UniversalBuffer mask, UniversalBuffer data, UniversalBuffer outData, const size_t nElems, size_t & nSelect); - -protected: - static services::Status reorderIndex(ClKernelFactoryIface & kernelFactory, UniversalBuffer & mask, UniversalBuffer & data, - UniversalBuffer & outData, UniversalBuffer & partialPrefixSums, size_t nElems, size_t localSize, - size_t nLocalSums); - - static services::Status reorder(ClKernelFactoryIface & kernelFactory, UniversalBuffer & mask, UniversalBuffer & data, UniversalBuffer & outData, - UniversalBuffer & partialPrefixSums, size_t nElems, size_t localSize, size_t nLocalSums); - - static services::Status scanIndex(ClKernelFactoryIface & factory, UniversalBuffer & mask, UniversalBuffer & data, UniversalBuffer & partialSums, - size_t nElems, size_t localSize, size_t nLocalSums); - - static services::Status sumScan(ClKernelFactoryIface & kernelFactory, UniversalBuffer & partialSums, UniversalBuffer & partialPrefixSums, - UniversalBuffer & totalSum, size_t localSize, size_t nSubgroupSums); - - static services::Status scan(ClKernelFactoryIface & kernelFactory, UniversalBuffer & mask, UniversalBuffer & partialSums, size_t nElems, - size_t localSize, size_t nLocalSums); - -private: - static services::Status 
buildProgram(ClKernelFactoryIface & factory, const TypeId & vectorTypeId); - -private: - static const uint32_t _preferableSubGroup = 16; // preferable maximal sub-group size - static const uint32_t _maxLocalSums = 256; -}; - -} // namespace sycl -} // namespace internal -} // namespace services -} // namespace daal - -#endif diff --git a/cpp/daal/src/sycl/reducer.cpp b/cpp/daal/src/sycl/reducer.cpp deleted file mode 100644 index 09bc4acb071..00000000000 --- a/cpp/daal/src/sycl/reducer.cpp +++ /dev/null @@ -1,265 +0,0 @@ -/* file: reducer.cpp */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -#include "src/sycl/reducer.h" -#include "services/internal/execution_context.h" -#include "src/externals/service_profiler.h" -#include "src/sycl/cl_kernels/op_reducer.cl" -#include "services/daal_defines.h" - -namespace daal -{ -namespace services -{ -namespace internal -{ -namespace sycl -{ -namespace math -{ -services::Status Reducer::buildProgram(ClKernelFactoryIface & kernelFactory, const BinaryOp op, const TypeId & vectorTypeId) -{ - services::String fptype_name = getKeyFPType(vectorTypeId); - auto build_options = fptype_name; - build_options.add(" -cl-std=CL1.2 -D LOCAL_BUFFER_SIZE=256"); - - if (op == BinaryOp::MIN) - { - build_options.add(" -D UNARY_OP=none -D BINARY_OP=min -D INIT_VALUE=FLT_MAX"); - } - else if (op == BinaryOp::MAX) - { - build_options.add(" -D UNARY_OP=none -D BINARY_OP=max -D INIT_VALUE=-FLT_MAX"); - } - else if (op == BinaryOp::SUM) - { - build_options.add(" -D UNARY_OP=none -D BINARY_OP=sum -D INIT_VALUE=0.0"); - } - else if (op == BinaryOp::SUM_OF_SQUARES) - { - build_options.add(" -D UNARY_OP=pow2 -D BINARY_OP=sum -D INIT_VALUE=0.0"); - } - - services::String cachekey("__daal_oneapi_internal_math_reducer_"); - cachekey.add(build_options); - - services::Status status; - kernelFactory.build(ExecutionTargetIds::device, cachekey.c_str(), op_reduce, build_options.c_str(), status); - return status; -} - -services::Status Reducer::singlepass(ExecutionContextIface & context, ClKernelFactoryIface & kernelFactory, Layout vectorsLayout, - const UniversalBuffer & vectors, uint32_t nVectors, uint32_t vectorSize, uint32_t workItemsPerGroup, - UniversalBuffer & reduceRes) -{ - services::Status status; - auto reduce_kernel = kernelFactory.getKernel("reduceSinglepass", status); - DAAL_CHECK_STATUS_VAR(status); - - // no need to check overflow for nVectors * vectorSize due to we already have buffer vectors of such size - if (vectors.type() == TypeIds::id()) - { - 
DAAL_ASSERT_UNIVERSAL_BUFFER(vectors, float, nVectors * vectorSize); - DAAL_ASSERT_UNIVERSAL_BUFFER(reduceRes, float, nVectors); - } - else - { - DAAL_ASSERT_UNIVERSAL_BUFFER(vectors, double, nVectors * vectorSize); - DAAL_ASSERT_UNIVERSAL_BUFFER(reduceRes, double, nVectors); - } - - KernelRange localRange(workItemsPerGroup, 1); - KernelRange globalRange(workItemsPerGroup, nVectors); - - KernelNDRange range(2); - range.global(globalRange, status); - DAAL_CHECK_STATUS_VAR(status); - range.local(localRange, status); - DAAL_CHECK_STATUS_VAR(status); - - KernelArguments args(5, status); - DAAL_CHECK_STATUS_VAR(status); - uint32_t vectorsAreRows = vectorsLayout == Layout::RowMajor ? 1 : 0; - args.set(0, vectorsAreRows); - args.set(1, vectors, AccessModeIds::read); - args.set(2, nVectors); - args.set(3, vectorSize); - args.set(4, reduceRes, AccessModeIds::write); - - context.run(range, reduce_kernel, args, status); - return status; -} - -services::Status Reducer::runStepColmajor(ExecutionContextIface & context, ClKernelFactoryIface & kernelFactory, const UniversalBuffer & vectors, - uint32_t nVectors, uint32_t vectorSize, uint32_t numWorkItems, uint32_t numWorkGroups, - uint32_t numDivisionsByRow, Reducer::Result & stepResult) -{ - services::Status status; - auto reduce_kernel = kernelFactory.getKernel("reduceStepColmajor", status); - DAAL_CHECK_STATUS_VAR(status); - - // no need to check overflow for nVectors * vectorSize due to we already have buffer vectors of such size - if (vectors.type() == TypeIds::id()) - { - DAAL_ASSERT_UNIVERSAL_BUFFER(vectors, float, nVectors * vectorSize); - DAAL_ASSERT_UNIVERSAL_BUFFER(stepResult.reduceRes, float, nVectors * numDivisionsByRow); - } - else - { - DAAL_ASSERT_UNIVERSAL_BUFFER(vectors, double, nVectors * vectorSize); - DAAL_ASSERT_UNIVERSAL_BUFFER(stepResult.reduceRes, double, nVectors * numDivisionsByRow); - } - - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, numWorkGroups, numWorkItems); - - KernelRange 
localRange(numWorkItems); - KernelRange globalRange(numWorkGroups * numWorkItems); - - KernelNDRange range(1); - range.global(globalRange, status); - DAAL_CHECK_STATUS_VAR(status); - range.local(localRange, status); - DAAL_CHECK_STATUS_VAR(status); - - KernelArguments args(4, status); - DAAL_CHECK_STATUS_VAR(status); - - args.set(0, vectors, AccessModeIds::read); - args.set(1, nVectors); - args.set(2, vectorSize); - args.set(3, stepResult.reduceRes, AccessModeIds::write); - - context.run(range, reduce_kernel, args, status); - return status; -} - -services::Status Reducer::runFinalStepRowmajor(ExecutionContextIface & context, ClKernelFactoryIface & kernelFactory, Reducer::Result & stepResult, - uint32_t nVectors, uint32_t vectorSize, uint32_t workItemsPerGroup, Reducer::Result & result) -{ - services::Status status; - auto reduce_kernel = kernelFactory.getKernel("reduceFinalStepRowmajor", status); - DAAL_CHECK_STATUS_VAR(status); - - // no need to check overflow for nVectors * vectorSize due to we already have buffer vectors of such size - if (result.reduceRes.type() == TypeIds::id()) - { - DAAL_ASSERT_UNIVERSAL_BUFFER(stepResult.reduceRes, float, nVectors * vectorSize); - DAAL_ASSERT_UNIVERSAL_BUFFER(result.reduceRes, float, nVectors); - } - else - { - DAAL_ASSERT_UNIVERSAL_BUFFER(stepResult.reduceRes, double, nVectors * vectorSize); - DAAL_ASSERT_UNIVERSAL_BUFFER(result.reduceRes, double, nVectors); - } - - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, workItemsPerGroup, nVectors); - - KernelRange localRange(workItemsPerGroup); - KernelRange globalRange(workItemsPerGroup * nVectors); - - KernelNDRange range(1); - range.global(globalRange, status); - DAAL_CHECK_STATUS_VAR(status); - - range.local(localRange, status); - DAAL_CHECK_STATUS_VAR(status); - - KernelArguments args(4, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, stepResult.reduceRes, AccessModeIds::read); - args.set(1, nVectors); - args.set(2, vectorSize); - args.set(3, result.reduceRes, 
AccessModeIds::write); - - context.run(range, reduce_kernel, args, status); - return status; -} - -Reducer::Result Reducer::reduce(const BinaryOp op, Layout vectorsLayout, const UniversalBuffer & vectors, uint32_t nVectors, uint32_t vectorSize, - services::Status & status) -{ - auto & context = services::internal::getDefaultContext(); - Result result(context, nVectors, vectors.type(), status); - DAAL_CHECK_STATUS_RETURN_IF_FAIL(status, Reducer::Result()); - return Reducer::reduce(op, vectorsLayout, vectors, result.reduceRes, nVectors, vectorSize, status); -} - -Reducer::Result Reducer::reduce(const BinaryOp op, Layout vectorsLayout, const UniversalBuffer & vectors, UniversalBuffer & resReduce, - uint32_t nVectors, uint32_t vectorSize, services::Status & status) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(Reducer); - auto & context = services::internal::getDefaultContext(); - - Result result(context, resReduce, nVectors, vectors.type(), status); - DAAL_CHECK_STATUS_RETURN_IF_FAIL(status, Reducer::Result()); - - DAAL_ASSERT(vectors.type() == TypeIds::id() || vectors.type() == TypeIds::id()); - - auto & kernelFactory = context.getClKernelFactory(); - - status |= buildProgram(kernelFactory, op, vectors.type()); - DAAL_CHECK_STATUS_RETURN_IF_FAIL(status, result); - - const uint32_t maxWorkItemsPerGroup = 256; - const uint32_t maxNumSubSlices = 9; - - if (vectorsLayout == Layout::RowMajor) - { - status |= singlepass(context, kernelFactory, vectorsLayout, vectors, nVectors, vectorSize, maxWorkItemsPerGroup, resReduce); - } - else - { - const uint32_t numDivisionsByCol = (nVectors + maxWorkItemsPerGroup - 1) / maxWorkItemsPerGroup; - uint32_t numDivisionsByRow = 9; - if (vectorSize < 5000) - numDivisionsByRow = 1; - else if (vectorSize < 10000) - numDivisionsByRow = 3; - else if (vectorSize < 20000) - numDivisionsByRow = 6; - - const uint32_t workItemsPerGroup = (maxWorkItemsPerGroup < nVectors) ? 
maxWorkItemsPerGroup : nVectors; - - if (numDivisionsByRow > 1) - { - // no need to check overflow for numDivisionsByRow * nVectors due to numDivisionsByRow less than vectorSize, - // and input vectors buffer has size of vectorSize * numDivisionsByRow - Result stepResult(context, numDivisionsByRow * nVectors, vectors.type(), status); - DAAL_CHECK_STATUS_RETURN_IF_FAIL(status, result); - - status |= runStepColmajor(context, kernelFactory, vectors, nVectors, vectorSize, workItemsPerGroup, numDivisionsByCol * numDivisionsByRow, - numDivisionsByRow, stepResult); - DAAL_CHECK_STATUS_RETURN_IF_FAIL(status, result); - - const uint32_t stepWorkItems = maxNumSubSlices / 2; //need to be power of two - status |= runFinalStepRowmajor(context, kernelFactory, stepResult, nVectors, numDivisionsByRow, stepWorkItems, result); - } - else - { - status |= runStepColmajor(context, kernelFactory, vectors, nVectors, vectorSize, workItemsPerGroup, numDivisionsByCol, numDivisionsByRow, - result); - } - } - - return result; -} - -} // namespace math -} // namespace sycl -} // namespace internal -} // namespace services -} // namespace daal diff --git a/cpp/daal/src/sycl/reducer.h b/cpp/daal/src/sycl/reducer.h deleted file mode 100644 index 403a42d721e..00000000000 --- a/cpp/daal/src/sycl/reducer.h +++ /dev/null @@ -1,112 +0,0 @@ -/* file: reducer.h */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#ifndef __REDUCER_H__ -#define __REDUCER_H__ - -#include "src/sycl/math_service_types.h" -#include "services/internal/buffer.h" -#include "src/sycl/cl_kernels/sum_reducer.cl" -#include "services/internal/sycl/types_utils.h" -#include "services/internal/sycl/execution_context.h" - -namespace daal -{ -namespace services -{ -namespace internal -{ -namespace sycl -{ -namespace math -{ -class SumReducer -{ -public: - SumReducer() = delete; - - struct Result - { - UniversalBuffer sum; - UniversalBuffer sumOfSquares; - - Result() {} - - Result(ExecutionContextIface & context, uint32_t nVectors, TypeId type, services::Status & status) - : sum(context.allocate(type, nVectors, status)), sumOfSquares(context.allocate(type, nVectors, status)) - {} - }; - -public: - static Result sum(Layout vectorsLayout, const UniversalBuffer & vectors, uint32_t nVectors, uint32_t vectorSize, services::Status & status); - static Result sum(Layout vectorsLayout, const UniversalBuffer & vectors, uint32_t nVectors, uint32_t vectorSize, Result & result, - services::Status & status); -}; - -class Reducer -{ -public: - Reducer() = delete; - - enum class BinaryOp - { - MIN, - MAX, - SUM, - SUM_OF_SQUARES - }; - - struct Result - { - UniversalBuffer reduceRes; - - Result() {} - - Result(ExecutionContextIface & context, uint32_t nVectors, TypeId type, services::Status & status) - : reduceRes(context.allocate(type, nVectors, status)) - {} - - Result(ExecutionContextIface & context, UniversalBuffer & resReduce, uint32_t nVectors, TypeId type, services::Status & status) - : reduceRes(resReduce) - {} - }; - -public: - static Result reduce(const BinaryOp op, Layout vectorsLayout, const UniversalBuffer & vectors, uint32_t nVectors, uint32_t vectorSize, - services::Status & status); - static Result reduce(const BinaryOp 
op, Layout vectorsLayout, const UniversalBuffer & vectors, UniversalBuffer & resReduce, uint32_t nVectors, - uint32_t vectorSize, services::Status & status); - -private: - static services::Status buildProgram(ClKernelFactoryIface & kernelFactory, const BinaryOp op, const TypeId & vectorType); - static services::Status singlepass(ExecutionContextIface & context, ClKernelFactoryIface & kernelFactory, Layout vectorsLayout, - const UniversalBuffer & vectors, uint32_t nVectors, uint32_t vectorSize, uint32_t workItemsPerGroup, - UniversalBuffer & result); - static services::Status runStepColmajor(ExecutionContextIface & context, ClKernelFactoryIface & kernelFactory, const UniversalBuffer & vectors, - uint32_t nVectors, uint32_t vectorSize, uint32_t numWorkItems, uint32_t numWorkGroups, - uint32_t numDivisionsByRow, Reducer::Result & stepResult); - static services::Status runFinalStepRowmajor(ExecutionContextIface & context, ClKernelFactoryIface & kernelFactory, Reducer::Result & stepResult, - uint32_t nVectors, uint32_t vectorSize, uint32_t workItemsPerGroup, Reducer::Result & result); -}; - -} // namespace math -} // namespace sycl -} // namespace internal -} // namespace services -} // namespace daal - -#endif diff --git a/cpp/daal/src/sycl/select_indexed.cpp b/cpp/daal/src/sycl/select_indexed.cpp deleted file mode 100644 index f396c8905bf..00000000000 --- a/cpp/daal/src/sycl/select_indexed.cpp +++ /dev/null @@ -1,363 +0,0 @@ -/* file: select_indexed.cpp */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#include "services/daal_defines.h" -#include "src/sycl/select_indexed.h" -#include "src/sycl/cl_kernels/select_indexed.cl" -#include "services/internal/execution_context.h" -#include "src/externals/service_rng.h" -#include "src/algorithms/engines/engine_batch_impl.h" -#include "services/daal_string.h" -#include "src/services/service_data_utils.h" -#include "src/externals/service_profiler.h" - -using namespace daal::data_management; -using namespace daal::services::internal; - -constexpr uint32_t maxInt32AsUint32T = static_cast(daal::services::internal::MaxVal::get()); - -namespace daal -{ -namespace services -{ -namespace internal -{ -namespace sycl -{ -namespace selection -{ -services::Status runQuickSelectSimd(ExecutionContextIface & context, ClKernelFactoryIface & kernelFactory, const UniversalBuffer & dataVectors, - const UniversalBuffer & indexVectors, const UniversalBuffer & rndSeq, uint32_t nRndSeq, uint32_t nK, - uint32_t nVectors, uint32_t vectorSize, uint32_t lastVectorSize, uint32_t vectorOffset, - QuickSelectIndexed::Result & result) -{ - services::Status status; - - DAAL_ASSERT(nRndSeq <= maxInt32AsUint32T && nRndSeq > 0); - DAAL_ASSERT(vectorSize <= maxInt32AsUint32T); - DAAL_ASSERT(vectorOffset <= maxInt32AsUint32T && vectorOffset >= vectorSize); - DAAL_ASSERT(lastVectorSize <= vectorSize); - DAAL_ASSERT(nK <= lastVectorSize); - - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(int32_t, vectorOffset, (nVectors - 1)); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(int32_t, 
nVectors, nK); - DAAL_OVERFLOW_CHECK_BY_ADDING(int32_t, vectorOffset * (nVectors - 1), lastVectorSize); - - if (dataVectors.type() == TypeIds::float32) - { - DAAL_ASSERT_UNIVERSAL_BUFFER(dataVectors, float, vectorOffset *(nVectors - 1) + lastVectorSize); - DAAL_ASSERT_UNIVERSAL_BUFFER(result.values, float, nVectors * nK); - } - else if (dataVectors.type() == TypeIds::float64) - { - DAAL_ASSERT_UNIVERSAL_BUFFER(dataVectors, double, vectorOffset *(nVectors - 1) + lastVectorSize); - DAAL_ASSERT_UNIVERSAL_BUFFER(result.values, double, nVectors * nK); - } - else - { - return services::Status(ErrorDataTypeNotSupported); - } - DAAL_ASSERT_UNIVERSAL_BUFFER(indexVectors, int, vectorOffset *(nVectors - 1) + lastVectorSize); - DAAL_ASSERT_UNIVERSAL_BUFFER(result.indices, int, nVectors * nK); - - auto func_kernel = kernelFactory.getKernel("quick_select_group", status); - DAAL_CHECK_STATUS_VAR(status); - - const uint32_t maxWorkItemsPerGroup = 16; - KernelRange localRange(1, maxWorkItemsPerGroup); - KernelRange globalRange(nVectors, maxWorkItemsPerGroup); - - KernelNDRange range(2); - range.global(globalRange, status); - DAAL_CHECK_STATUS_VAR(status); - range.local(localRange, status); - DAAL_CHECK_STATUS_VAR(status); - - KernelArguments args(10, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, dataVectors, AccessModeIds::read); - args.set(1, indexVectors, AccessModeIds::read); - args.set(2, result.values, AccessModeIds::write); - args.set(3, result.indices, AccessModeIds::write); - args.set(4, rndSeq, AccessModeIds::read); - args.set(5, static_cast(nRndSeq)); - args.set(6, static_cast(vectorSize)); - args.set(7, static_cast(lastVectorSize)); - args.set(8, static_cast(nK)); - args.set(9, static_cast(vectorOffset)); - - context.run(range, func_kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - return status; -} - -services::Status runDirectSelectSimd(ExecutionContextIface & context, ClKernelFactoryIface & kernelFactory, const UniversalBuffer & dataVectors, - 
uint32_t nK, uint32_t nVectors, uint32_t vectorSize, uint32_t lastVectorSize, uint32_t vectorOffset, - QuickSelectIndexed::Result & result) -{ - services::Status status; - DAAL_ASSERT(vectorSize <= maxInt32AsUint32T); - DAAL_ASSERT(vectorOffset <= maxInt32AsUint32T && vectorOffset >= vectorSize); - DAAL_ASSERT(lastVectorSize <= vectorSize); - DAAL_ASSERT(nK <= lastVectorSize); - - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(int32_t, vectorOffset, (nVectors - 1)); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(int32_t, nVectors, nK); - DAAL_OVERFLOW_CHECK_BY_ADDING(int32_t, vectorOffset * (nVectors - 1), lastVectorSize); - - if (dataVectors.type() == TypeIds::float32) - { - DAAL_ASSERT_UNIVERSAL_BUFFER(dataVectors, float, vectorOffset *(nVectors - 1) + lastVectorSize); - DAAL_ASSERT_UNIVERSAL_BUFFER(result.values, float, nVectors * nK); - } - else if (dataVectors.type() == TypeIds::float64) - { - DAAL_ASSERT_UNIVERSAL_BUFFER(dataVectors, double, vectorOffset *(nVectors - 1) + lastVectorSize); - DAAL_ASSERT_UNIVERSAL_BUFFER(result.values, double, nVectors * nK); - } - else - { - return services::Status(ErrorDataTypeNotSupported); - } - DAAL_ASSERT_UNIVERSAL_BUFFER(result.indices, int, nVectors * nK); - - auto func_kernel = kernelFactory.getKernel("direct_select_group", status); - DAAL_CHECK_STATUS_VAR(status); - - const uint32_t maxWorkItemsPerGroup = 16; - KernelRange localRange(1, maxWorkItemsPerGroup); - KernelRange globalRange(nVectors, maxWorkItemsPerGroup); - - KernelNDRange range(2); - range.global(globalRange, status); - DAAL_CHECK_STATUS_VAR(status); - range.local(localRange, status); - DAAL_CHECK_STATUS_VAR(status); - - KernelArguments args(7, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, dataVectors, AccessModeIds::read); - args.set(1, result.values, AccessModeIds::write); - args.set(2, result.indices, AccessModeIds::write); - args.set(3, static_cast(vectorSize)); - args.set(4, static_cast(lastVectorSize)); - args.set(5, static_cast(vectorOffset)); - if 
(dataVectors.type() == TypeIds::float32) - { - args.set(6, FLT_MAX); - } - else - { - args.set(6, DBL_MAX); - } - - context.run(range, func_kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - return status; -} - -services::Status SelectIndexed::convertIndicesToLabels(const UniversalBuffer & indices, const UniversalBuffer & labels, uint32_t nVectors, - uint32_t vectorSize, uint32_t vectorOffset) -{ - Status status; - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(int32_t, vectorOffset, (nVectors - 1)); - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(int32_t, nVectors, vectorSize); - DAAL_OVERFLOW_CHECK_BY_ADDING(int32_t, vectorOffset * (nVectors - 1), vectorSize); - - DAAL_ASSERT_UNIVERSAL_BUFFER(indices, int, nVectors * vectorSize); - DAAL_ASSERT_UNIVERSAL_BUFFER(labels, int, vectorOffset *(nVectors - 1) + vectorSize); - - auto index2labels = labels.template get().toHost(ReadWriteMode::readOnly, status); - DAAL_CHECK_STATUS_VAR(status); - auto index2labelsPtr = index2labels.get(); - auto outIndex = indices.template get().toHost(ReadWriteMode::readWrite, status); - DAAL_CHECK_STATUS_VAR(status); - auto outIndexPtr = outIndex.get(); - for (size_t vec = 0; vec < nVectors; vec++) - { - for (size_t k = 0; k < vectorSize; k++) - { - int index = outIndexPtr[vec * vectorSize + k]; - outIndexPtr[vec * vectorSize + k] = index2labelsPtr[vec * vectorOffset + index]; - } - } - return status; -} - -services::Status QuickSelectIndexed::buildProgram(ClKernelFactoryIface & kernelFactory, const TypeId & vectorTypeId) -{ - services::Status status; - services::String fptypeName = getKeyFPType(vectorTypeId); - auto buildOptions = fptypeName; - buildOptions.add("-cl-std=CL1.2 "); - - services::String cachekey("__daal_oneapi_internal_qselect_indexed_"); - cachekey.add(fptypeName); - cachekey.add(buildOptions); - kernelFactory.build(ExecutionTargetIds::device, cachekey.c_str(), quick_select_simd, buildOptions.c_str(), status); - DAAL_CHECK_STATUS_VAR(status); - return status; -} - 
-SelectIndexed::Result & QuickSelectIndexed::selectIndices(const UniversalBuffer & dataVectors, const UniversalBuffer & tempIndices, - const UniversalBuffer & rndSeq, uint32_t nRndSeq, uint32_t nK, uint32_t nVectors, - uint32_t vectorSize, uint32_t lastVectorSize, uint32_t vectorOffset, Result & result, - services::Status & status) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(QuickSelectIndexed.select); - - auto & context = services::internal::getDefaultContext(); - auto & kernelFactory = context.getClKernelFactory(); - - status |= buildProgram(kernelFactory, dataVectors.type()); - if (!status.ok()) - { - return result; - } - status |= runQuickSelectSimd(context, kernelFactory, dataVectors, tempIndices, rndSeq, nRndSeq, nK, nVectors, vectorSize, lastVectorSize, - vectorOffset, result); - return result; -} - -Status QuickSelectIndexed::adjustIndexBuffer(uint32_t number, uint32_t size) -{ - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(uint32_t, size, number); - uint32_t newSize = size * number; - Status status; - if (_indexSize < newSize) - { - auto & context = Environment::getInstance()->getDefaultExecutionContext(); - _indices = context.allocate(TypeIds::id(), newSize, status); - DAAL_CHECK_STATUS_VAR(status); - _indexSize = newSize; - } - return status; -} - -Status QuickSelectIndexed::init(Params & par) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(compute.RNG); - Status status; - _nRndSeq = (par.dataSize > _maxSeqLength || par.dataSize < 2) ? 
_maxSeqLength : par.dataSize; - auto engineImpl = dynamic_cast(&(*par.engine)); - if (!engineImpl) - { - return Status(ErrorIncorrectEngineParameter); - } - size_t numbers[_maxSeqLength]; - daal::internal::RNGsInst rng; - rng.uniform(_nRndSeq, &numbers[0], engineImpl->getState(), 0, (size_t)(_nRndSeq - 1)); - float values[_maxSeqLength]; - for (uint32_t i = 0; i < _nRndSeq; i++) - { - values[i] = static_cast(numbers[i]) / (_nRndSeq - 1); - } - auto & context = Environment::getInstance()->getDefaultExecutionContext(); - _rndSeq = context.allocate(par.type, _nRndSeq, status); - DAAL_CHECK_STATUS_VAR(status); - context.copy(_rndSeq, 0, (void *)&values[0], _nRndSeq, 0, _nRndSeq, status); - DAAL_CHECK_STATUS_VAR(status); - return status; -} - -SelectIndexed * QuickSelectIndexed::create(Params & par, services::Status & status) -{ - QuickSelectIndexed * ret = new QuickSelectIndexed(); - if (!ret) - { - status |= Status(ErrorMemoryAllocationFailed); - return nullptr; - } - status |= ret->init(par); - if (!status.ok()) - { - delete ret; - return nullptr; - } - return ret; -} - -services::Status DirectSelectIndexed::buildProgram(ClKernelFactoryIface & kernelFactory, const TypeId & vectorTypeId, uint32_t nK) -{ - services::Status status; - services::String fptypeName = getKeyFPType(vectorTypeId); - auto buildOptions = fptypeName; - buildOptions.add("-cl-std=CL1.2 -D __K__="); - char buffer[DAAL_MAX_STRING_SIZE]; - daal::services::daal_int_to_string(buffer, DAAL_MAX_STRING_SIZE, nK); - buildOptions.add(buffer); - - services::String cachekey("__daal_oneapi_internal_dselect_indexed_"); - cachekey.add(fptypeName); - cachekey.add(buildOptions); - kernelFactory.build(ExecutionTargetIds::device, cachekey.c_str(), direct_select_simd, buildOptions.c_str(), status); - DAAL_CHECK_STATUS_VAR(status); - return status; -} - -SelectIndexed::Result & DirectSelectIndexed::selectIndices(const UniversalBuffer & dataVectors, uint32_t nK, uint32_t nVectors, uint32_t vectorSize, - uint32_t 
lastVectorSize, uint32_t vectorOffset, Result & result, services::Status & status) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(QuickSelectIndexed.select); - - auto & context = services::internal::getDefaultContext(); - auto & kernelFactory = context.getClKernelFactory(); - - status |= buildProgram(kernelFactory, dataVectors.type(), nK); - if (!status.ok()) - { - return result; - } - status |= runDirectSelectSimd(context, kernelFactory, dataVectors, nK, nVectors, vectorSize, lastVectorSize, vectorOffset, result); - return result; -} - -SelectIndexed * DirectSelectIndexed::create(Params & par, services::Status & status) -{ - DirectSelectIndexed * ret = new DirectSelectIndexed(par.nK); - if (!ret) - { - status |= ErrorMemoryAllocationFailed; - return nullptr; - } - return ret; -} - -SelectIndexedFactory::SelectIndexedFactory() -{ - _entries << makeEntry(); - _entries << makeEntry(); -} - -SelectIndexed * SelectIndexedFactory::create(int nK, SelectIndexed::Params & par, services::Status & status) -{ - for (size_t i = 0; i < _entries.size(); i++) - { - if (_entries[i].inRange(nK)) - { - return _entries[i].createMethod(par, status); - } - } - status |= ErrorMethodNotImplemented; - return nullptr; -} - -} // namespace selection -} // namespace sycl -} // namespace internal -} // namespace services -} // namespace daal diff --git a/cpp/daal/src/sycl/select_indexed.h b/cpp/daal/src/sycl/select_indexed.h deleted file mode 100755 index 2194a745722..00000000000 --- a/cpp/daal/src/sycl/select_indexed.h +++ /dev/null @@ -1,161 +0,0 @@ -/* file: select_indexed.h */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#ifndef __SELECT_INDEXED_H__ -#define __SELECT_INDEXED_H__ - -#include "algorithms/engines/engine.h" -#include "services/internal/buffer.h" -#include "services/daal_defines.h" -#include "services/internal/error_handling_helpers.h" -#include "services/internal/sycl/types_utils.h" -#include "services/internal/sycl/execution_context.h" - -namespace daal -{ -namespace services -{ -namespace internal -{ -namespace sycl -{ -namespace selection -{ -class SelectIndexed -{ -public: - struct Result - { - UniversalBuffer values; - UniversalBuffer indices; - - Result() {} - Result(ExecutionContextIface & context, uint32_t nK, uint32_t nVectors, TypeId valueType, services::Status & status) - : values(context.allocate(valueType, nVectors * nK, status)), indices(context.allocate(TypeIds::id(), nVectors * nK, status)) - {} - }; - struct Params - { - Params(uint32_t nK, TypeId fptype, uint32_t size, daal::algorithms::engines::EnginePtr eng) - : nK(nK), type(fptype), dataSize(size), engine(eng) - {} - uint32_t nK; - TypeId type; - uint32_t dataSize; - daal::algorithms::engines::EnginePtr engine; - }; - -public: - virtual ~SelectIndexed() {} - virtual Result & selectIndices(const UniversalBuffer & dataVectors, uint32_t nK, uint32_t nVectors, uint32_t vectorSize, uint32_t lastVectorSize, - uint32_t vectorOffset, Result & result, services::Status & status) = 0; - static services::Status convertIndicesToLabels(const UniversalBuffer & indices, const UniversalBuffer & labels, uint32_t nVectors, - 
uint32_t vectorSize, uint32_t vectorOffset); - services::Status selectNearestDistancesAndLabels(const UniversalBuffer & distances, const UniversalBuffer & dataLabels, uint32_t nK, - uint32_t nVectors, uint32_t vectorSize, uint32_t vectorOffset, uint32_t labelOffset, - Result & result) - { - services::Status status; - selectIndices(distances, nK, nVectors, vectorSize, vectorSize, vectorOffset, result, status); - DAAL_CHECK_STATUS_VAR(status); - DAAL_CHECK_STATUS_VAR(convertIndicesToLabels(result.indices, dataLabels, nVectors, nK, labelOffset)); - return services::Status(); - } -}; - -class SelectIndexedFactory -{ -public: - SelectIndexedFactory(); - SelectIndexed * create(int nK, SelectIndexed::Params & par, services::Status & st); - -private: - typedef SelectIndexed * (*CreateFuncType)(SelectIndexed::Params & par, services::Status & st); - struct Entry - { - int minK; - int maxK; - CreateFuncType createMethod; - bool inRange(int nK) const { return nK >= minK && nK <= maxK; } - }; - template - Entry makeEntry() - { - Entry e; - e.minK = T::minK; - e.maxK = T::maxK; - e.createMethod = T::create; - return e; - } - daal::services::Collection _entries; -}; - -class QuickSelectIndexed : public SelectIndexed -{ -public: - static const int minK = 33; - static const int maxK = INT_MAX; - static SelectIndexed * create(Params & par, daal::services::Status & st); - virtual Result & selectIndices(const UniversalBuffer & dataVectors, uint32_t nK, uint32_t nVectors, uint32_t vectorSize, uint32_t lastVectorSize, - uint32_t vectorOffset, Result & result, services::Status & status) - { - status |= adjustIndexBuffer(nVectors, vectorSize); - DAAL_CHECK_STATUS_RETURN_IF_FAIL(status, result); - return selectIndices(dataVectors, _indices, _rndSeq, _nRndSeq, nK, nVectors, vectorSize, lastVectorSize, vectorOffset, result, status); - } - -private: - QuickSelectIndexed() {} - services::Status adjustIndexBuffer(uint32_t number, uint32_t size); - static Result & selectIndices(const 
UniversalBuffer & dataVectors, const UniversalBuffer & tempIndices, const UniversalBuffer & rndSeq, - uint32_t nRndSeq, uint32_t nK, uint32_t nVectors, uint32_t vectorSize, uint32_t lastVectorSize, - uint32_t vectorOffset, Result & result, services::Status & status); - daal::services::Status init(Params & par); - static services::Status buildProgram(ClKernelFactoryIface & kernelFactory, const TypeId & vectorTypeId); - -private: - static const uint32_t _maxSeqLength = 1024; - UniversalBuffer _indices; - uint32_t _indexSize = 0; - UniversalBuffer _rndSeq; - uint32_t _nRndSeq = 0; -}; - -class DirectSelectIndexed : public SelectIndexed -{ -public: - static const int minK = 1; - static const int maxK = 32; - static SelectIndexed * create(Params & par, daal::services::Status & st); - virtual Result & selectIndices(const UniversalBuffer & dataVectors, uint32_t nK, uint32_t nVectors, uint32_t vectorSize, uint32_t lastVectorSize, - uint32_t vectorOffset, Result & result, services::Status & status); - -private: - static services::Status buildProgram(ClKernelFactoryIface & kernelFactory, const TypeId & vectorTypeId, uint32_t nK); - -private: - DirectSelectIndexed(uint32_t nK) : _nK(nK) {} - uint32_t _nK; -}; - -} // namespace selection -} // namespace sycl -} // namespace internal -} // namespace services -} // namespace daal - -#endif diff --git a/cpp/daal/src/sycl/sorter.cpp b/cpp/daal/src/sycl/sorter.cpp deleted file mode 100644 index 58752c66720..00000000000 --- a/cpp/daal/src/sycl/sorter.cpp +++ /dev/null @@ -1,287 +0,0 @@ -/* file: sorter.cpp */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#include "src/sycl/sorter.h" -#include "services/internal/execution_context.h" -#include "src/externals/service_profiler.h" - -namespace daal -{ -namespace services -{ -namespace internal -{ -namespace sycl -{ -namespace sort -{ -services::String GetIntegerTypeForFPType(const TypeId & vectorTypeId) -{ - if (vectorTypeId == TypeIds::Id::float32) - { - return " -D radixIntType=uint "; - } - else - { - return " -D radixIntType=ulong "; - } -} - -services::Status buildProgram(ClKernelFactoryIface & kernelFactory, const TypeId & vectorTypeId) -{ - services::Status status; - - services::String fptype_name = getKeyFPType(vectorTypeId); - // add type from name at the end - auto radixtype_name = GetIntegerTypeForFPType(vectorTypeId); - auto build_options = fptype_name + radixtype_name; - build_options.add("-cl-std=CL1.2 -D sortedType=int"); - - services::String cachekey("__daal_oneapi_internal_sort_radix_sort__"); - cachekey.add(build_options); - kernelFactory.build(ExecutionTargetIds::device, cachekey.c_str(), radix_sort_simd, build_options.c_str(), status); - return status; -} - -static services::Status runRadixSortSimd(ExecutionContextIface & context, ClKernelFactoryIface & kernelFactory, const UniversalBuffer & input, - const UniversalBuffer & output, const UniversalBuffer & buffer, uint32_t nVectors, uint32_t vectorSize, - uint32_t vectorOffset) -{ - services::Status status; - auto sum_kernel = kernelFactory.getKernel("radix_sort_group", status); - 
DAAL_CHECK_STATUS_VAR(status); - - const uint32_t maxWorkItemsPerGroup = 32; - KernelRange localRange(1, maxWorkItemsPerGroup); - KernelRange globalRange(nVectors, maxWorkItemsPerGroup); - - KernelNDRange range(2); - range.global(globalRange, status); - DAAL_CHECK_STATUS_VAR(status); - range.local(localRange, status); - DAAL_CHECK_STATUS_VAR(status); - - DAAL_ASSERT_UNIVERSAL_BUFFER(input, int, nVectors); - DAAL_ASSERT_UNIVERSAL_BUFFER(output, int, nVectors); - DAAL_ASSERT_UNIVERSAL_BUFFER(buffer, int, nVectors); - - KernelArguments args(5, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, input, AccessModeIds::read); - args.set(1, output, AccessModeIds::write); - args.set(2, buffer, AccessModeIds::read); - args.set(3, vectorSize); - args.set(4, vectorOffset); - - context.run(range, sum_kernel, args, status); - - return status; -} - -services::Status RadixSort::sort(const UniversalBuffer & input, const UniversalBuffer & output, const UniversalBuffer & buffer, uint32_t nVectors, - uint32_t vectorSize, uint32_t vectorOffset) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(RadixSort.sort); - - auto & context = services::internal::getDefaultContext(); - auto & kernelFactory = context.getClKernelFactory(); - - services::Status status = buildProgram(kernelFactory, input.type()); - DAAL_CHECK_STATUS_VAR(status); - - status |= runRadixSortSimd(context, kernelFactory, input, output, buffer, nVectors, vectorSize, vectorOffset); - return status; -} - -services::Status RadixSort::sortIndices(UniversalBuffer & values, UniversalBuffer & indices, UniversalBuffer & valuesOut, - UniversalBuffer & indicesOut, uint32_t nRows) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(RadixSort.sortIndices); - services::Status status; - - auto & context = services::internal::getDefaultContext(); - auto & kernelFactory = context.getClKernelFactory(); - - DAAL_CHECK_STATUS_VAR(buildProgram(kernelFactory, values.type())); - - const uint32_t sizeFPtype = values.type() == TypeIds::Id::float32 ? 
4 : values.type() == TypeIds::Id::float64 ? 8 : 0; - - const uint32_t radixBits = 4; - const uint32_t subSize = _preferableSubGroup; - const uint32_t localSize = _preferableSubGroup; - const uint32_t nLocalHists = 1024 * localSize < nRows ? 1024 : (nRows / localSize) + !!(nRows % localSize); - const uint32_t nSubgroupHists = nLocalHists * (localSize / subSize); - - auto partialHists = context.allocate(TypeIds::id(), (nSubgroupHists + 1) << _radixBits, status); - DAAL_CHECK_STATUS_VAR(status); - auto partialPrefixHists = context.allocate(TypeIds::id(), (nSubgroupHists + 1) << _radixBits, status); - DAAL_CHECK_STATUS_VAR(status); - - uint32_t rev = 0; - - for (uint32_t bitOffset = 0; bitOffset < 8 * sizeFPtype; bitOffset += radixBits, rev ^= 1) - { - if (!rev) - { - DAAL_CHECK_STATUS_VAR(radixScan(values, partialHists, nRows, bitOffset, localSize, nLocalHists)); - DAAL_CHECK_STATUS_VAR(radixHistScan(values, partialHists, partialPrefixHists, localSize, nSubgroupHists)); - DAAL_CHECK_STATUS_VAR(radixReorder(values, indices, partialPrefixHists, valuesOut, indicesOut, nRows, bitOffset, localSize, nLocalHists)); - } - else - { - DAAL_CHECK_STATUS_VAR(radixScan(valuesOut, partialHists, nRows, bitOffset, localSize, nLocalHists)); - DAAL_CHECK_STATUS_VAR(radixHistScan(values, partialHists, partialPrefixHists, localSize, nSubgroupHists)); - DAAL_CHECK_STATUS_VAR(radixReorder(valuesOut, indicesOut, partialPrefixHists, values, indices, nRows, bitOffset, localSize, nLocalHists)); - } - } - - DAAL_ASSERT(rev == 0); // if not, we need to swap values/indices and - // valuesOut/indices_bufus); - return status; -} - -services::Status RadixSort::radixScan(UniversalBuffer & values, UniversalBuffer & partialHists, uint32_t nRows, uint32_t bitOffset, - uint32_t localSize, uint32_t nLocalHists) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(RadixSort.radixScan); - - services::Status status; - - auto & context = services::internal::getDefaultContext(); - auto & factory = context.getClKernelFactory(); - 
DAAL_CHECK_STATUS_VAR(buildProgram(factory, values.type())); - - auto kernel = factory.getKernel("radixScan", status); - DAAL_CHECK_STATUS_VAR(status); - - { - DAAL_ASSERT_UNIVERSAL_BUFFER(partialHists, int, (nLocalHists + 1) << _radixBits); - - KernelArguments args(4, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, values, AccessModeIds::read); - args.set(1, partialHists, AccessModeIds::write); - args.set(2, nRows); - args.set(3, bitOffset); - - KernelRange local_range(localSize); - KernelRange global_range(localSize * nLocalHists); - - KernelNDRange range(1); - range.global(global_range, status); - DAAL_CHECK_STATUS_VAR(status); - range.local(local_range, status); - DAAL_CHECK_STATUS_VAR(status); - - context.run(range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -services::Status RadixSort::radixHistScan(UniversalBuffer & values, UniversalBuffer & partialHists, UniversalBuffer & partialPrefixHists, - uint32_t localSize, uint32_t nSubgroupHists) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(RadixSort.radixHistScan); - - services::Status status; - - auto & context = services::internal::getDefaultContext(); - auto & factory = context.getClKernelFactory(); - - DAAL_CHECK_STATUS_VAR(buildProgram(factory, values.type())); - auto kernel = factory.getKernel("radixHistScan", status); - DAAL_CHECK_STATUS_VAR(status); - - { - DAAL_ASSERT_UNIVERSAL_BUFFER(partialHists, int, (nSubgroupHists + 1) << _radixBits); - DAAL_ASSERT_UNIVERSAL_BUFFER(partialPrefixHists, int, (nSubgroupHists + 1) << _radixBits); - - KernelArguments args(3, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, partialHists, AccessModeIds::read); - args.set(1, partialPrefixHists, AccessModeIds::write); - args.set(2, nSubgroupHists); - - KernelRange local_range(localSize); - KernelRange global_range(localSize); - - KernelNDRange range(1); - range.global(global_range, status); - DAAL_CHECK_STATUS_VAR(status); - range.local(local_range, status); - 
DAAL_CHECK_STATUS_VAR(status); - - context.run(range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -services::Status RadixSort::radixReorder(UniversalBuffer & valuesSrc, UniversalBuffer & indicesSrc, UniversalBuffer & partialPrefixHists, - UniversalBuffer & valuesDst, UniversalBuffer & indicesDst, uint32_t nRows, uint32_t bitOffset, - uint32_t localSize, uint32_t nLocalHists) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(RadixSort.radixReorder); - - services::Status status; - - auto & context = services::internal::getDefaultContext(); - auto & factory = context.getClKernelFactory(); - - DAAL_CHECK_STATUS_VAR(buildProgram(factory, valuesSrc.type())); - auto kernel = factory.getKernel("radixReorder", status); - DAAL_CHECK_STATUS_VAR(status); - - { - DAAL_ASSERT_UNIVERSAL_BUFFER2(indicesSrc, int, uint32_t, nRows); - DAAL_ASSERT_UNIVERSAL_BUFFER(partialPrefixHists, int, (nLocalHists + 1) << _radixBits); - DAAL_ASSERT_UNIVERSAL_BUFFER2(indicesDst, int, uint32_t, nRows); - - KernelArguments args(7, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, valuesSrc, AccessModeIds::read); - args.set(1, indicesSrc, AccessModeIds::read); - args.set(2, partialPrefixHists, AccessModeIds::read); - args.set(3, valuesDst, AccessModeIds::write); - args.set(4, indicesDst, AccessModeIds::write); - args.set(5, nRows); - args.set(6, bitOffset); - - KernelRange local_range(localSize); - KernelRange global_range(localSize * nLocalHists); - - KernelNDRange range(1); - range.global(global_range, status); - DAAL_CHECK_STATUS_VAR(status); - range.local(local_range, status); - DAAL_CHECK_STATUS_VAR(status); - - context.run(range, kernel, args, status); - DAAL_CHECK_STATUS_VAR(status); - } - - return status; -} - -} // namespace sort -} // namespace sycl -} // namespace internal -} // namespace services -} // namespace daal diff --git a/cpp/daal/src/sycl/sorter.h b/cpp/daal/src/sycl/sorter.h deleted file mode 100644 index 6b9f901ab1f..00000000000 --- 
a/cpp/daal/src/sycl/sorter.h +++ /dev/null @@ -1,69 +0,0 @@ -/* file: sorter.h */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#ifndef __SORTER_H__ -#define __SORTER_H__ - -#include "src/sycl/math_service_types.h" -#include "services/internal/buffer.h" -#include "src/sycl/cl_kernels/radix_sort.cl" -#include "services/internal/sycl/types_utils.h" -#include "services/internal/sycl/execution_context.h" - -namespace daal -{ -namespace services -{ -namespace internal -{ -namespace sycl -{ -namespace sort -{ -class RadixSort -{ -public: - RadixSort() = delete; - - static services::Status sort(const UniversalBuffer & input, const UniversalBuffer & output, const UniversalBuffer & buffer, uint32_t nVectors, - uint32_t vectorSize, uint32_t vectorOffset); - - static services::Status sortIndices(UniversalBuffer & values, UniversalBuffer & indices, UniversalBuffer & valuesOut, - UniversalBuffer & indicesOut, uint32_t nRows); - - static services::Status radixScan(UniversalBuffer & values, UniversalBuffer & partialHists, uint32_t nRows, uint32_t bitOffset, - uint32_t localSize, uint32_t nLocalHists); - - static services::Status radixHistScan(UniversalBuffer & values, UniversalBuffer & partialHists, UniversalBuffer & partialPrefixHists, - uint32_t localSize, uint32_t 
nSubgroupHists); - - static services::Status radixReorder(UniversalBuffer & valuesSrc, UniversalBuffer & indicesSrc, UniversalBuffer & partialPrefixHists, - UniversalBuffer & valuesDst, UniversalBuffer & indicesDst, uint32_t nRows, uint32_t bitOffset, - uint32_t localSize, uint32_t nLocalHists); - -private: - static const uint32_t _preferableSubGroup = 16; // preferable maximal sub-group size - static const uint32_t _radixBits = 4; // number of bits used for a single pass of radix sort -}; - -} // namespace sort -} // namespace sycl -} // namespace internal -} // namespace services -} // namespace daal - -#endif diff --git a/cpp/daal/src/sycl/spblas_gpu.cpp b/cpp/daal/src/sycl/spblas_gpu.cpp deleted file mode 100755 index cdfcdf3508d..00000000000 --- a/cpp/daal/src/sycl/spblas_gpu.cpp +++ /dev/null @@ -1,91 +0,0 @@ -/* file: spblas_gpu.cpp */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -#include "src/sycl/spblas_gpu.h" -#include "src/sycl/cl_kernels/kernel_sparse_blas.cl" - -namespace daal -{ -namespace services -{ -namespace internal -{ -namespace sycl -{ -namespace math -{ -template -services::Status SpBlasGpu::xgemm(const Transpose transa, const Transpose transb, const size_t m, const size_t n, const size_t k, - const algorithmFPType alpha, const services::internal::Buffer & a_buffer, - const services::internal::Buffer & aColsBuff, - const services::internal::Buffer & aRowIndBuff, - const services::internal::Buffer & b_buffer, - const services::internal::Buffer & bColsBuff, - const services::internal::Buffer & bRowIndBuff, const algorithmFPType beta, - services::internal::Buffer & c_buffer, const size_t ldc, const size_t offsetC) -{ - services::Status status; - - auto & context = services::internal::getDefaultContext(); - auto & factory = context.getClKernelFactory(); - services::String options = getKeyFPType(); - services::String cacheKey = "__daal_services_math_spmm_"; - cacheKey.add(options); - - factory.build(ExecutionTargetIds::device, cacheKey.c_str(), clKernelSpGemm, options.c_str(), status); - DAAL_CHECK_STATUS_VAR(status); - const char * const kernelName = beta != algorithmFPType(0) ? 
"spmm_kernel" : "spmm_kernel_without_sum"; - KernelPtr kernelSpGemm = factory.getKernel(kernelName, status); - DAAL_CHECK_STATUS_VAR(status); - - const size_t one = size_t(1); - - if (transa == Transpose::Trans && transb == Transpose::NoTrans) - { - KernelArguments args(11, status); - DAAL_CHECK_STATUS_VAR(status); - - args.set(0, alpha); - args.set(1, a_buffer); - args.set(2, aColsBuff); - args.set(3, aRowIndBuff); - args.set(4, b_buffer); - args.set(5, bColsBuff); - args.set(6, bRowIndBuff); - args.set(7, c_buffer); - args.set(8, ldc); - args.set(9, offsetC); - args.set(10, beta); - KernelRange range(m, n); - context.run(range, kernelSpGemm, args, status); - } - else - { - return services::ErrorMethodNotImplemented; - } - - return status; -} - -template class SpBlasGpu; -template class SpBlasGpu; - -} // namespace math -} // namespace sycl -} // namespace internal -} // namespace services -} // namespace daal diff --git a/cpp/daal/src/sycl/spblas_gpu.h b/cpp/daal/src/sycl/spblas_gpu.h deleted file mode 100755 index 71b2eceaea8..00000000000 --- a/cpp/daal/src/sycl/spblas_gpu.h +++ /dev/null @@ -1,61 +0,0 @@ -/* file: spblas_gpu.h */ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -/* -//++ -// Template wrappers for common GPU blas functions. 
-//-- -*/ - -#ifndef __SERVICE_ONEAPI_SPBLAS_GPU_H__ -#define __SERVICE_ONEAPI_SPBLAS_GPU_H__ - -#include "services/internal/sycl/execution_context.h" -#include "services/internal/sycl/types_utils.h" -#include "src/sycl/math_service_types.h" -#include "services/internal/buffer.h" -#include "services/internal/execution_context.h" -#include "services/internal/sycl/math/types.h" - -namespace daal -{ -namespace services -{ -namespace internal -{ -namespace sycl -{ -namespace math -{ -template -struct DAAL_EXPORT SpBlasGpu -{ - static services::Status xgemm(const Transpose transa, const Transpose transb, const size_t m, const size_t n, const size_t k, - const algorithmFPType alpha, const services::internal::Buffer & a_buffer, - const services::internal::Buffer & ja_buffer, const services::internal::Buffer & ia_buffer, - const services::internal::Buffer & b_buffer, const services::internal::Buffer & jb_buffer, - const services::internal::Buffer & ib_buffer, const algorithmFPType beta, - services::internal::Buffer & c_buffer, const size_t ldc, const size_t offsetC); -}; - -} // namespace math -} // namespace sycl -} // namespace internal -} // namespace services -} // namespace daal - -#endif diff --git a/cpp/daal/src/sycl/sum_reducer.cpp b/cpp/daal/src/sycl/sum_reducer.cpp deleted file mode 100644 index a9c9af4e01f..00000000000 --- a/cpp/daal/src/sycl/sum_reducer.cpp +++ /dev/null @@ -1,257 +0,0 @@ -/* file: sum_reducer.cpp */ -/******************************************************************************* -* Copyright 2014 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#include "src/sycl/reducer.h" -#include "services/internal/execution_context.h" -#include "src/externals/service_profiler.h" -#include "services/daal_defines.h" - -namespace daal -{ -namespace services -{ -namespace internal -{ -namespace sycl -{ -namespace math -{ -services::Status buildProgram(ClKernelFactoryIface & kernelFactory, const TypeId & vectorTypeId) -{ - services::Status status; - - services::String fptype_name = getKeyFPType(vectorTypeId); - auto build_options = fptype_name; - build_options.add("-cl-std=CL1.2 -D LOCAL_BUFFER_SIZE=256"); - - services::String cachekey("__daal_oneapi_internal_math_sum_reducer_"); - cachekey.add(build_options); - kernelFactory.build(ExecutionTargetIds::device, cachekey.c_str(), sum_reducer, build_options.c_str(), status); - - return status; -} - -services::Status sum_singlepass(ExecutionContextIface & context, ClKernelFactoryIface & kernelFactory, const UniversalBuffer & vectors, - uint32_t nVectors, uint32_t vectorSize, SumReducer::Result & result) -{ - services::Status status; - - auto sum_kernel = kernelFactory.getKernel("sum_singlesubgroup", status); - DAAL_CHECK_STATUS_VAR(status); - - // no need to check overflow for nVectors * vectorSize due to we already have buffer vectors of such size - if (vectors.type() == TypeIds::id()) - { - DAAL_ASSERT_UNIVERSAL_BUFFER(vectors, float, nVectors * vectorSize); - DAAL_ASSERT_UNIVERSAL_BUFFER(result.sum, float, nVectors); - DAAL_ASSERT_UNIVERSAL_BUFFER(result.sumOfSquares, float, 
nVectors); - } - else - { - DAAL_ASSERT_UNIVERSAL_BUFFER(vectors, double, nVectors * vectorSize); - DAAL_ASSERT_UNIVERSAL_BUFFER(result.sum, double, nVectors); - DAAL_ASSERT_UNIVERSAL_BUFFER(result.sumOfSquares, double, nVectors); - } - - const uint32_t maxWorkItemsPerSubGroup = 32; - - KernelRange localRange(1, maxWorkItemsPerSubGroup); - KernelRange globalRange(nVectors, maxWorkItemsPerSubGroup); - - KernelNDRange range(2); - range.global(globalRange, status); - DAAL_CHECK_STATUS_VAR(status); - range.local(localRange, status); - DAAL_CHECK_STATUS_VAR(status); - - KernelArguments args(5, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, vectors, AccessModeIds::read); - args.set(1, nVectors); - args.set(2, vectorSize); - args.set(3, result.sum, AccessModeIds::write); - args.set(4, result.sumOfSquares, AccessModeIds::write); - - context.run(range, sum_kernel, args, status); - return status; -} - -services::Status runStepColmajor(ExecutionContextIface & context, ClKernelFactoryIface & kernelFactory, const UniversalBuffer & vectors, - uint32_t nVectors, uint32_t vectorSize, uint32_t numWorkItems, uint32_t numWorkGroups, uint32_t numDivisionsByRow, - SumReducer::Result & stepResult) -{ - services::Status status; - - auto sum_kernel = kernelFactory.getKernel("sum_step_colmajor", status); - DAAL_CHECK_STATUS_VAR(status); - - // no need to check overflow for nVectors * vectorSize due to we already have buffer vectors of such size - if (vectors.type() == TypeIds::id()) - { - DAAL_ASSERT_UNIVERSAL_BUFFER(vectors, float, nVectors * vectorSize); - DAAL_ASSERT_UNIVERSAL_BUFFER(stepResult.sum, float, nVectors * numDivisionsByRow); - DAAL_ASSERT_UNIVERSAL_BUFFER(stepResult.sumOfSquares, float, nVectors * numDivisionsByRow); - } - else - { - DAAL_ASSERT_UNIVERSAL_BUFFER(vectors, double, nVectors * vectorSize); - DAAL_ASSERT_UNIVERSAL_BUFFER(stepResult.sum, double, nVectors * numDivisionsByRow); - DAAL_ASSERT_UNIVERSAL_BUFFER(stepResult.sumOfSquares, double, nVectors * 
numDivisionsByRow); - } - - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, numWorkGroups, numWorkItems); - - KernelRange localRange(numWorkItems); - KernelRange globalRange(numWorkGroups * numWorkItems); - - KernelNDRange range(1); - range.global(globalRange, status); - DAAL_CHECK_STATUS_VAR(status); - range.local(localRange, status); - DAAL_CHECK_STATUS_VAR(status); - - KernelArguments args(5, status); - DAAL_CHECK_STATUS_VAR(status); - - args.set(0, vectors, AccessModeIds::read); - args.set(1, nVectors); - args.set(2, vectorSize); - args.set(3, stepResult.sum, AccessModeIds::write); - args.set(4, stepResult.sumOfSquares, AccessModeIds::write); - - context.run(range, sum_kernel, args, status); - - return status; -} - -services::Status runFinalStepRowmajor(ExecutionContextIface & context, ClKernelFactoryIface & kernelFactory, SumReducer::Result & stepResult, - uint32_t nVectors, uint32_t vectorSize, uint32_t workItemsPerGroup, SumReducer::Result & result) -{ - services::Status status; - - auto sum_kernel = kernelFactory.getKernel("sum_final_step_rowmajor", status); - DAAL_CHECK_STATUS_VAR(status); - - if (result.sum.type() == TypeIds::id()) - { - DAAL_ASSERT_UNIVERSAL_BUFFER(stepResult.sum, float, nVectors * vectorSize); - DAAL_ASSERT_UNIVERSAL_BUFFER(stepResult.sumOfSquares, float, nVectors * vectorSize); - DAAL_ASSERT_UNIVERSAL_BUFFER(result.sum, float, nVectors); - DAAL_ASSERT_UNIVERSAL_BUFFER(result.sumOfSquares, float, nVectors); - } - else - { - DAAL_ASSERT_UNIVERSAL_BUFFER(stepResult.sum, double, nVectors * vectorSize); - DAAL_ASSERT_UNIVERSAL_BUFFER(stepResult.sumOfSquares, double, nVectors * vectorSize); - DAAL_ASSERT_UNIVERSAL_BUFFER(result.sum, double, nVectors); - DAAL_ASSERT_UNIVERSAL_BUFFER(result.sumOfSquares, double, nVectors); - } - - DAAL_OVERFLOW_CHECK_BY_MULTIPLICATION(size_t, workItemsPerGroup, nVectors); - - KernelRange localRange(workItemsPerGroup); - KernelRange globalRange(workItemsPerGroup * nVectors); - - KernelNDRange range(1); - 
range.global(globalRange, status); - DAAL_CHECK_STATUS_VAR(status); - range.local(localRange, status); - DAAL_CHECK_STATUS_VAR(status); - - KernelArguments args(6, status); - DAAL_CHECK_STATUS_VAR(status); - args.set(0, stepResult.sum, AccessModeIds::read); - args.set(1, stepResult.sumOfSquares, AccessModeIds::read); - args.set(2, nVectors); - args.set(3, vectorSize); - args.set(4, result.sum, AccessModeIds::write); - args.set(5, result.sumOfSquares, AccessModeIds::write); - - context.run(range, sum_kernel, args, status); - - return status; -} -SumReducer::Result SumReducer::sum(Layout vectorsLayout, const UniversalBuffer & vectors, uint32_t nVectors, uint32_t vectorSize, - services::Status & status) -{ - auto & context = services::internal::getDefaultContext(); - Result result(context, nVectors, vectors.type(), status); - DAAL_CHECK_STATUS_RETURN_IF_FAIL(status, result); - return sum(vectorsLayout, vectors, nVectors, vectorSize, result, status); -} -SumReducer::Result SumReducer::sum(Layout vectorsLayout, const UniversalBuffer & vectors, uint32_t nVectors, uint32_t vectorSize, Result & result, - services::Status & status) -{ - DAAL_ITTNOTIFY_SCOPED_TASK(SumReducer.sum); - - auto & context = services::internal::getDefaultContext(); - auto & kernelFactory = context.getClKernelFactory(); - - status |= buildProgram(kernelFactory, vectors.type()); - DAAL_CHECK_STATUS_RETURN_IF_FAIL(status, SumReducer::Result()); - - DAAL_ASSERT(vectors.type() == TypeIds::id() || vectors.type() == TypeIds::id()); - - const uint32_t maxWorkItemsPerGroup = 256; - const uint32_t maxNumSubSlices = 9; - - if (vectorsLayout == Layout::RowMajor) - { - status |= sum_singlepass(context, kernelFactory, vectors, nVectors, vectorSize, result); - } - else - { - const uint32_t numDivisionsByCol = (nVectors + maxWorkItemsPerGroup - 1) / maxWorkItemsPerGroup; - uint32_t numDivisionsByRow = 9; - if (vectorSize < 5000) - numDivisionsByRow = 1; - else if (vectorSize < 10000) - numDivisionsByRow = 3; - 
else if (vectorSize < 20000) - numDivisionsByRow = 6; - - const uint32_t workItemsPerGroup = (maxWorkItemsPerGroup < nVectors) ? maxWorkItemsPerGroup : nVectors; - - if (numDivisionsByRow > 1) - { - // no need to check overflow for numDivisionsByRow * nVectors due to numDivisionsByRow less than vectorSize, - // and input vectors buffer has size of vectorSize * numDivisionsByRow - Result stepResult(context, numDivisionsByRow * nVectors, vectors.type(), status); - DAAL_CHECK_STATUS_RETURN_IF_FAIL(status, result); - - status |= runStepColmajor(context, kernelFactory, vectors, nVectors, vectorSize, workItemsPerGroup, numDivisionsByCol * numDivisionsByRow, - numDivisionsByRow, stepResult); - DAAL_CHECK_STATUS_RETURN_IF_FAIL(status, result); - - const uint32_t stepWorkItems = maxNumSubSlices / 2; //need to be power of two - status |= runFinalStepRowmajor(context, kernelFactory, stepResult, nVectors, numDivisionsByRow, stepWorkItems, result); - } - else - { - status |= runStepColmajor(context, kernelFactory, vectors, nVectors, vectorSize, workItemsPerGroup, numDivisionsByCol, numDivisionsByRow, - result); - } - } - - return result; -} - -} // namespace math -} // namespace sycl -} // namespace internal -} // namespace services -} // namespace daal diff --git a/cpp/oneapi/dal/algo/basic_statistics/backend/basic_statistics_interop.hpp b/cpp/oneapi/dal/algo/basic_statistics/backend/basic_statistics_interop.hpp index 6ea3eebe7fc..640321d4a15 100644 --- a/cpp/oneapi/dal/algo/basic_statistics/backend/basic_statistics_interop.hpp +++ b/cpp/oneapi/dal/algo/basic_statistics/backend/basic_statistics_interop.hpp @@ -17,7 +17,6 @@ #pragma once #include "oneapi/dal/algo/basic_statistics/common.hpp" -#include "oneapi/dal/backend/interop/common_dpc.hpp" #include "oneapi/dal/backend/interop/table_conversion.hpp" #include diff --git a/cpp/oneapi/dal/algo/kmeans/backend/gpu/kernels_csr_impl.hpp b/cpp/oneapi/dal/algo/kmeans/backend/gpu/kernels_csr_impl.hpp index 478bf9de85d..dba39bb9d01 
100644 --- a/cpp/oneapi/dal/algo/kmeans/backend/gpu/kernels_csr_impl.hpp +++ b/cpp/oneapi/dal/algo/kmeans/backend/gpu/kernels_csr_impl.hpp @@ -15,7 +15,6 @@ *******************************************************************************/ #include "oneapi/dal/backend/primitives/reduction.hpp" -#include "oneapi/dal/backend/interop/common_dpc.hpp" #include "oneapi/dal/backend/interop/error_converter.hpp" #include "oneapi/dal/backend/interop/table_conversion.hpp" #include "oneapi/dal/backend/primitives/ndarray.hpp" diff --git a/cpp/oneapi/dal/algo/kmeans/backend/gpu/train_kernel_lloyd_dense_dpc.cpp b/cpp/oneapi/dal/algo/kmeans/backend/gpu/train_kernel_lloyd_dense_dpc.cpp index a8c02f27318..a51de745fab 100644 --- a/cpp/oneapi/dal/algo/kmeans/backend/gpu/train_kernel_lloyd_dense_dpc.cpp +++ b/cpp/oneapi/dal/algo/kmeans/backend/gpu/train_kernel_lloyd_dense_dpc.cpp @@ -26,7 +26,6 @@ #include "oneapi/dal/backend/primitives/ndarray.hpp" #include "oneapi/dal/table/row_accessor.hpp" #include "oneapi/dal/backend/transfer.hpp" -#include "oneapi/dal/backend/interop/common_dpc.hpp" #include "oneapi/dal/backend/interop/error_converter.hpp" #include "oneapi/dal/backend/interop/table_conversion.hpp" diff --git a/cpp/oneapi/dal/algo/kmeans/detail/train_init_centroids.hpp b/cpp/oneapi/dal/algo/kmeans/detail/train_init_centroids.hpp index 544e24546f8..935c95107f2 100644 --- a/cpp/oneapi/dal/algo/kmeans/detail/train_init_centroids.hpp +++ b/cpp/oneapi/dal/algo/kmeans/detail/train_init_centroids.hpp @@ -15,7 +15,7 @@ *******************************************************************************/ #include -#include "oneapi/dal/backend/interop/common_dpc.hpp" +#include "oneapi/dal/backend/interop/common.hpp" #include "oneapi/dal/backend/interop/error_converter.hpp" #include "oneapi/dal/backend/interop/table_conversion.hpp" #include "oneapi/dal/backend/transfer.hpp" diff --git a/cpp/oneapi/dal/algo/kmeans_init/backend/gpu/compute_kernel_dense_dpc.cpp 
b/cpp/oneapi/dal/algo/kmeans_init/backend/gpu/compute_kernel_dense_dpc.cpp index 724f776e7a1..8ee39450fa4 100644 --- a/cpp/oneapi/dal/algo/kmeans_init/backend/gpu/compute_kernel_dense_dpc.cpp +++ b/cpp/oneapi/dal/algo/kmeans_init/backend/gpu/compute_kernel_dense_dpc.cpp @@ -17,7 +17,6 @@ #include "oneapi/dal/algo/kmeans_init/backend/gpu/compute_kernel.hpp" #include "oneapi/dal/algo/kmeans_init/backend/gpu/compute_kernel_distr.hpp" #include "oneapi/dal/algo/kmeans_init/backend/gpu/compute_kernels_impl.hpp" -#include "oneapi/dal/backend/interop/common_dpc.hpp" #include "oneapi/dal/backend/interop/error_converter.hpp" #include "oneapi/dal/backend/primitives/utils.hpp" #include "oneapi/dal/detail/error_messages.hpp" diff --git a/cpp/oneapi/dal/algo/kmeans_init/backend/gpu/compute_kernel_distr_random_dense_dpc.cpp b/cpp/oneapi/dal/algo/kmeans_init/backend/gpu/compute_kernel_distr_random_dense_dpc.cpp index 1dc6eb5f7ef..642e937b562 100644 --- a/cpp/oneapi/dal/algo/kmeans_init/backend/gpu/compute_kernel_distr_random_dense_dpc.cpp +++ b/cpp/oneapi/dal/algo/kmeans_init/backend/gpu/compute_kernel_distr_random_dense_dpc.cpp @@ -17,7 +17,6 @@ #include "oneapi/dal/algo/kmeans_init/backend/gpu/compute_kernel.hpp" #include "oneapi/dal/algo/kmeans_init/backend/gpu/compute_kernel_distr.hpp" #include "oneapi/dal/algo/kmeans_init/backend/gpu/compute_kernels_impl.hpp" -#include "oneapi/dal/backend/interop/common_dpc.hpp" #include "oneapi/dal/backend/interop/error_converter.hpp" #include "oneapi/dal/backend/primitives/ndarray.hpp" #include "oneapi/dal/backend/primitives/utils.hpp" diff --git a/cpp/oneapi/dal/algo/kmeans_init/backend/gpu/compute_kernel_sparse_dpc.cpp b/cpp/oneapi/dal/algo/kmeans_init/backend/gpu/compute_kernel_sparse_dpc.cpp index 2be9dbb54c0..d5b3f574bf2 100644 --- a/cpp/oneapi/dal/algo/kmeans_init/backend/gpu/compute_kernel_sparse_dpc.cpp +++ b/cpp/oneapi/dal/algo/kmeans_init/backend/gpu/compute_kernel_sparse_dpc.cpp @@ -17,7 +17,6 @@ #include 
"oneapi/dal/algo/kmeans_init/backend/gpu/compute_kernel.hpp" #include "oneapi/dal/algo/kmeans_init/backend/gpu/compute_kernel_distr.hpp" #include "oneapi/dal/algo/kmeans_init/backend/gpu/compute_kernels_impl.hpp" -#include "oneapi/dal/backend/interop/common_dpc.hpp" #include "oneapi/dal/backend/interop/error_converter.hpp" #include "oneapi/dal/backend/primitives/utils.hpp" #include "oneapi/dal/detail/error_messages.hpp" diff --git a/cpp/oneapi/dal/algo/knn/backend/gpu/infer_kernel_brute_force_dpc.cpp b/cpp/oneapi/dal/algo/knn/backend/gpu/infer_kernel_brute_force_dpc.cpp index f805d8322c3..5abc74d1dd1 100644 --- a/cpp/oneapi/dal/algo/knn/backend/gpu/infer_kernel_brute_force_dpc.cpp +++ b/cpp/oneapi/dal/algo/knn/backend/gpu/infer_kernel_brute_force_dpc.cpp @@ -14,7 +14,6 @@ * limitations under the License. *******************************************************************************/ -#include "oneapi/dal/backend/interop/common_dpc.hpp" #include "oneapi/dal/backend/interop/error_converter.hpp" #include "oneapi/dal/backend/interop/table_conversion.hpp" diff --git a/cpp/oneapi/dal/algo/knn/backend/gpu/infer_kernel_impl.hpp b/cpp/oneapi/dal/algo/knn/backend/gpu/infer_kernel_impl.hpp index bd58dda9424..e42cb78e47e 100644 --- a/cpp/oneapi/dal/algo/knn/backend/gpu/infer_kernel_impl.hpp +++ b/cpp/oneapi/dal/algo/knn/backend/gpu/infer_kernel_impl.hpp @@ -14,7 +14,6 @@ * limitations under the License. 
*******************************************************************************/ -#include "oneapi/dal/backend/interop/common_dpc.hpp" #include "oneapi/dal/backend/interop/error_converter.hpp" #include "oneapi/dal/backend/interop/table_conversion.hpp" diff --git a/cpp/oneapi/dal/algo/knn/backend/gpu/infer_kernel_impl_dpc.hpp b/cpp/oneapi/dal/algo/knn/backend/gpu/infer_kernel_impl_dpc.hpp index 1ba10bf8737..87106996152 100644 --- a/cpp/oneapi/dal/algo/knn/backend/gpu/infer_kernel_impl_dpc.hpp +++ b/cpp/oneapi/dal/algo/knn/backend/gpu/infer_kernel_impl_dpc.hpp @@ -14,7 +14,6 @@ * limitations under the License. *******************************************************************************/ -#include "oneapi/dal/backend/interop/common_dpc.hpp" #include "oneapi/dal/backend/interop/error_converter.hpp" #include "oneapi/dal/backend/interop/table_conversion.hpp" diff --git a/cpp/oneapi/dal/algo/knn/backend/gpu/infer_kernel_impl_dpc_distr.hpp b/cpp/oneapi/dal/algo/knn/backend/gpu/infer_kernel_impl_dpc_distr.hpp index daf3caa9187..6046155c2a4 100644 --- a/cpp/oneapi/dal/algo/knn/backend/gpu/infer_kernel_impl_dpc_distr.hpp +++ b/cpp/oneapi/dal/algo/knn/backend/gpu/infer_kernel_impl_dpc_distr.hpp @@ -14,7 +14,6 @@ * limitations under the License. 
*******************************************************************************/ -#include "oneapi/dal/backend/interop/common_dpc.hpp" #include "oneapi/dal/backend/interop/error_converter.hpp" #include "oneapi/dal/backend/interop/table_conversion.hpp" diff --git a/cpp/oneapi/dal/algo/knn/backend/gpu/infer_kernel_kd_tree_dpc.cpp b/cpp/oneapi/dal/algo/knn/backend/gpu/infer_kernel_kd_tree_dpc.cpp index 6a8cb50a170..2297925bbe5 100644 --- a/cpp/oneapi/dal/algo/knn/backend/gpu/infer_kernel_kd_tree_dpc.cpp +++ b/cpp/oneapi/dal/algo/knn/backend/gpu/infer_kernel_kd_tree_dpc.cpp @@ -18,7 +18,6 @@ #include "oneapi/dal/algo/knn/backend/gpu/infer_kernel.hpp" #include "oneapi/dal/backend/interop/common.hpp" -#include "oneapi/dal/backend/interop/common_dpc.hpp" #include "oneapi/dal/backend/interop/error_converter.hpp" #include "oneapi/dal/detail/common.hpp" diff --git a/cpp/oneapi/dal/algo/knn/backend/gpu/train_kernel_kd_tree_dpc.cpp b/cpp/oneapi/dal/algo/knn/backend/gpu/train_kernel_kd_tree_dpc.cpp index 2c432909939..040600b1c02 100644 --- a/cpp/oneapi/dal/algo/knn/backend/gpu/train_kernel_kd_tree_dpc.cpp +++ b/cpp/oneapi/dal/algo/knn/backend/gpu/train_kernel_kd_tree_dpc.cpp @@ -16,7 +16,6 @@ #include "oneapi/dal/algo/knn/backend/gpu/train_kernel.hpp" #include "oneapi/dal/backend/dispatcher.hpp" -#include "oneapi/dal/backend/interop/common_dpc.hpp" #include "oneapi/dal/backend/interop/error_converter.hpp" namespace oneapi::dal::knn::backend { diff --git a/cpp/oneapi/dal/algo/knn/backend/model_conversion.hpp b/cpp/oneapi/dal/algo/knn/backend/model_conversion.hpp index 37ae9333e2d..8678d0f93ed 100755 --- a/cpp/oneapi/dal/algo/knn/backend/model_conversion.hpp +++ b/cpp/oneapi/dal/algo/knn/backend/model_conversion.hpp @@ -20,7 +20,7 @@ #include "oneapi/dal/algo/knn/backend/model_impl.hpp" #include "oneapi/dal/backend/interop/table_conversion.hpp" -#include +#include "src/algorithms/k_nearest_neighbors/bf_knn_classification_model_impl.h" namespace oneapi::dal::knn::backend { diff 
--git a/cpp/oneapi/dal/algo/linear_regression/backend/gpu/infer_kernel_norm_eq_dpc.cpp b/cpp/oneapi/dal/algo/linear_regression/backend/gpu/infer_kernel_norm_eq_dpc.cpp index 0132f154b43..ae81aedecd7 100644 --- a/cpp/oneapi/dal/algo/linear_regression/backend/gpu/infer_kernel_norm_eq_dpc.cpp +++ b/cpp/oneapi/dal/algo/linear_regression/backend/gpu/infer_kernel_norm_eq_dpc.cpp @@ -17,7 +17,6 @@ #include "oneapi/dal/detail/profiler.hpp" #include "oneapi/dal/backend/interop/common.hpp" -#include "oneapi/dal/backend/interop/common_dpc.hpp" #include "oneapi/dal/backend/interop/error_converter.hpp" #include "oneapi/dal/backend/interop/table_conversion.hpp" diff --git a/cpp/oneapi/dal/algo/logistic_regression/backend/gpu/infer_kernel_dense_batch_dpc.cpp b/cpp/oneapi/dal/algo/logistic_regression/backend/gpu/infer_kernel_dense_batch_dpc.cpp index f3ccfa3da8f..01fcad9095d 100644 --- a/cpp/oneapi/dal/algo/logistic_regression/backend/gpu/infer_kernel_dense_batch_dpc.cpp +++ b/cpp/oneapi/dal/algo/logistic_regression/backend/gpu/infer_kernel_dense_batch_dpc.cpp @@ -17,7 +17,6 @@ #include "oneapi/dal/detail/profiler.hpp" #include "oneapi/dal/backend/interop/common.hpp" -#include "oneapi/dal/backend/interop/common_dpc.hpp" #include "oneapi/dal/backend/interop/error_converter.hpp" #include "oneapi/dal/backend/interop/table_conversion.hpp" diff --git a/cpp/oneapi/dal/algo/logistic_regression/backend/gpu/infer_kernel_sparse_dpc.cpp b/cpp/oneapi/dal/algo/logistic_regression/backend/gpu/infer_kernel_sparse_dpc.cpp index 51fab5a66e1..c4e18f63bbf 100644 --- a/cpp/oneapi/dal/algo/logistic_regression/backend/gpu/infer_kernel_sparse_dpc.cpp +++ b/cpp/oneapi/dal/algo/logistic_regression/backend/gpu/infer_kernel_sparse_dpc.cpp @@ -21,7 +21,6 @@ #include "oneapi/dal/backend/dispatcher.hpp" #include "oneapi/dal/backend/interop/common.hpp" -#include "oneapi/dal/backend/interop/common_dpc.hpp" #include "oneapi/dal/backend/interop/error_converter.hpp" #include 
"oneapi/dal/backend/interop/table_conversion.hpp" #include "oneapi/dal/backend/primitives/blas.hpp" diff --git a/cpp/oneapi/dal/backend/interop/common_dpc.cpp b/cpp/oneapi/dal/backend/interop/common_dpc.cpp deleted file mode 100644 index e76b6567882..00000000000 --- a/cpp/oneapi/dal/backend/interop/common_dpc.cpp +++ /dev/null @@ -1,74 +0,0 @@ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#include "oneapi/dal/backend/interop/common_dpc.hpp" - -#include -#include -#include -#include -#include - -namespace oneapi::dal::backend::interop { - -using daal_sycl_ex_ctx_t = daal::services::internal::SyclExecutionContext; - -class execution_context_cache { -public: - static execution_context_cache& get_instance() { - static execution_context_cache cache; - return cache; - } - - ~execution_context_cache() { - // We do not delete map entries intentionally as deallocation order of global object is not - // defined. The desctructors of objects stored by DAAL `SyclExecutionContext` likely require - // access to dynamic libraries, which may be unloaded before the call to this desctructor. - // The workaround leads to memory leak, however does not affect user application as always - // happens at the end. This will be no longer required once kernels are migrated to DPC++. 
- } - - daal_sycl_ex_ctx_t lookup(const sycl::queue& queue) { - const std::size_t hash = std::hash{}(queue); - const auto it = map_.find(hash); - if (it == map_.end()) { - const auto ctx = new daal_sycl_ex_ctx_t{ queue }; - map_.emplace(hash, ctx); - return *ctx; - } - return *it->second; - } - - void cleanup() { - map_.clear(); - } - -private: - execution_context_cache() = default; - std::unordered_map map_; -}; - -execution_context_guard::execution_context_guard(const sycl::queue& queue) { - auto ctx = execution_context_cache::get_instance().lookup(queue); - daal::services::Environment::getInstance()->setDefaultExecutionContext(ctx); -} - -execution_context_guard::~execution_context_guard() { - daal::services::Environment::getInstance()->setDefaultExecutionContext( - daal::services::internal::CpuExecutionContext()); -} - -} // namespace oneapi::dal::backend::interop diff --git a/cpp/oneapi/dal/backend/interop/common_dpc.hpp b/cpp/oneapi/dal/backend/interop/common_dpc.hpp deleted file mode 100644 index e4db7a8dd19..00000000000 --- a/cpp/oneapi/dal/backend/interop/common_dpc.hpp +++ /dev/null @@ -1,35 +0,0 @@ -/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-*******************************************************************************/ - -#pragma once - -#include "oneapi/dal/backend/interop/common.hpp" - -#ifdef ONEDAL_DATA_PARALLEL - -namespace oneapi::dal::backend::interop { - -struct [[deprecated]] execution_context_guard { - explicit execution_context_guard(const sycl::queue &queue); - - ~execution_context_guard(); - - execution_context_guard(const execution_context_guard &) = delete; -}; - -} // namespace oneapi::dal::backend::interop - -#endif diff --git a/cpp/oneapi/dal/backend/interop/table_conversion.hpp b/cpp/oneapi/dal/backend/interop/table_conversion.hpp index d68f1f179c6..00186b3cf57 100644 --- a/cpp/oneapi/dal/backend/interop/table_conversion.hpp +++ b/cpp/oneapi/dal/backend/interop/table_conversion.hpp @@ -16,16 +16,12 @@ #pragma once -#ifdef ONEDAL_DATA_PARALLEL -#include -#endif - #include #include "daal/src/data_management/service_numeric_table.h" #include "oneapi/dal/backend/memory.hpp" #include "oneapi/dal/table/detail/table_builder.hpp" -#include "oneapi/dal/table/backend/interop/sycl_table_adapter.hpp" +#include "oneapi/dal/table/row_accessor.hpp" #include "oneapi/dal/table/backend/interop/host_homogen_table_adapter.hpp" #include "oneapi/dal/table/backend/interop/host_soa_table_adapter.hpp" #include "oneapi/dal/table/backend/interop/host_csr_table_adapter.hpp" @@ -267,48 +263,4 @@ inline table convert_from_daal_table(const daal::data_management::NumericTablePt } } -#ifdef ONEDAL_DATA_PARALLEL -inline daal::data_management::NumericTablePtr convert_to_daal_table(const sycl::queue& queue, - const table& table) { - if (!table.has_data()) { - return daal::data_management::NumericTablePtr{}; - } - return interop::sycl_table_adapter::create(queue, table); -} - -template -inline daal::data_management::NumericTablePtr convert_to_daal_table(const sycl::queue& queue, - const array& data, - std::int64_t row_count, - std::int64_t column_count) { - using daal::services::Status; - using 
daal::services::SharedPtr; - using daal::services::internal::Buffer; - using daal::data_management::internal::SyclHomogenNumericTable; - using dal::detail::integral_cast; - - ONEDAL_ASSERT(data.get_count() == row_count * column_count); - ONEDAL_ASSERT(data.has_mutable_data()); - ONEDAL_ASSERT(is_same_context(queue, data)); - - const SharedPtr data_shared{ data.get_mutable_data(), daal_object_owner{ data } }; - - Status status; - const Buffer buffer{ data_shared, - integral_cast(data.get_count()), - queue, - status }; - status_to_exception(status); - - const auto table = - SyclHomogenNumericTable::create(buffer, - integral_cast(column_count), - integral_cast(row_count), - &status); - status_to_exception(status); - - return table; -} -#endif - } // namespace oneapi::dal::backend::interop diff --git a/cpp/oneapi/dal/backend/primitives/rng/BUILD b/cpp/oneapi/dal/backend/primitives/rng/BUILD index 8e272b90751..ef49abf0bde 100755 --- a/cpp/oneapi/dal/backend/primitives/rng/BUILD +++ b/cpp/oneapi/dal/backend/primitives/rng/BUILD @@ -11,7 +11,7 @@ dal_module( "@onedal//cpp/oneapi/dal/backend/primitives:common", ], extra_deps = [ - "@onedal//cpp/daal:sycl", + "@onedal//cpp/daal:engines", ], ) diff --git a/cpp/oneapi/dal/backend/primitives/selection/BUILD b/cpp/oneapi/dal/backend/primitives/selection/BUILD index 5e3f16ce88e..8cd925b2b57 100755 --- a/cpp/oneapi/dal/backend/primitives/selection/BUILD +++ b/cpp/oneapi/dal/backend/primitives/selection/BUILD @@ -12,9 +12,6 @@ dal_module( "@onedal//cpp/oneapi/dal/backend/primitives:heap", "@onedal//cpp/oneapi/dal/backend/primitives:rng", ], - extra_deps = [ - "@onedal//cpp/daal:sycl", - ], ) dal_test_suite( diff --git a/cpp/oneapi/dal/table/backend/interop/sycl_table_adapter.hpp b/cpp/oneapi/dal/table/backend/interop/sycl_table_adapter.hpp deleted file mode 100644 index 07843da30cd..00000000000 --- a/cpp/oneapi/dal/table/backend/interop/sycl_table_adapter.hpp +++ /dev/null @@ -1,137 +0,0 @@ 
-/******************************************************************************* -* Copyright 2020 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#pragma once - -#include - -#include "oneapi/dal/table/homogen.hpp" -#include "oneapi/dal/table/row_accessor.hpp" -#include "oneapi/dal/table/column_accessor.hpp" -#include "oneapi/dal/backend/interop/error_converter.hpp" -#include "oneapi/dal/backend/interop/daal_object_owner.hpp" -#include "oneapi/dal/table/backend/interop/block_info.hpp" - -namespace oneapi::dal::backend::interop { - -#ifdef ONEDAL_DATA_PARALLEL -// This class shall be used only to represent immutable data on DAAL side. Any -// attempts to change the data inside objects of that class lead to exception. 
-class sycl_table_adapter : public daal::data_management::NumericTable { - using base = daal::data_management::NumericTable; - using status_t = daal::services::Status; - using rw_mode_t = daal::data_management::ReadWriteMode; - using ptr_t = daal::services::SharedPtr; - - template - using block_desc_t = daal::data_management::BlockDescriptor; - - template - using daal_buffer_t = daal::services::internal::Buffer; - - template - using daal_buffer_and_status_t = std::tuple, status_t>; - -public: - static ptr_t create(const sycl::queue& q, const table& table); - -private: - explicit sycl_table_adapter(const sycl::queue& q, const table& table, status_t& stat); - - status_t getBlockOfRows(std::size_t vector_idx, - std::size_t vector_num, - rw_mode_t rwflag, - block_desc_t& block) override; - - status_t getBlockOfRows(std::size_t vector_idx, - std::size_t vector_num, - rw_mode_t rwflag, - block_desc_t& block) override; - - status_t getBlockOfRows(std::size_t vector_idx, - std::size_t vector_num, - rw_mode_t rwflag, - block_desc_t& block) override; - - status_t getBlockOfColumnValues(std::size_t feature_idx, - std::size_t vector_idx, - std::size_t value_num, - rw_mode_t rwflag, - block_desc_t& block) override; - - status_t getBlockOfColumnValues(std::size_t feature_idx, - std::size_t vector_idx, - std::size_t value_num, - rw_mode_t rwflag, - block_desc_t& block) override; - - status_t getBlockOfColumnValues(std::size_t feature_idx, - std::size_t vector_idx, - std::size_t value_num, - rw_mode_t rwflag, - block_desc_t& block) override; - - status_t releaseBlockOfRows(block_desc_t& block) override; - status_t releaseBlockOfRows(block_desc_t& block) override; - status_t releaseBlockOfRows(block_desc_t& block) override; - - status_t releaseBlockOfColumnValues(block_desc_t& block) override; - status_t releaseBlockOfColumnValues(block_desc_t& block) override; - status_t releaseBlockOfColumnValues(block_desc_t& block) override; - - status_t assign(float value) override; - status_t 
assign(double value) override; - status_t assign(int value) override; - - int getSerializationTag() const override; - status_t serializeImpl(daal::data_management::InputDataArchive* arch) override; - status_t deserializeImpl(const daal::data_management::OutputDataArchive* arch) override; - - status_t allocateDataMemoryImpl(daal::MemType) override; - status_t setNumberOfColumnsImpl(std::size_t) override; - void freeDataMemoryImpl() override; - - template - status_t read_rows_impl(std::size_t vector_idx, - std::size_t vector_num, - rw_mode_t rwflag, - block_desc_t& block); - - template - status_t read_column_values_impl(std::size_t feature_idx, - std::size_t vector_idx, - std::size_t value_num, - rw_mode_t rwflag, - block_desc_t& block); - - bool check_row_indexes_in_range(const block_info& info) const; - bool check_column_index_in_range(const block_info& info) const; - - template - daal_buffer_and_status_t convert_to_daal_buffer(const array& ary) const; - - template - daal_buffer_and_status_t pull_rows_buffer(const block_info& info); - - template - daal_buffer_and_status_t pull_columns_buffer(const block_info& info); - - sycl::queue queue_; - table original_table_; -}; -#endif - -} // namespace oneapi::dal::backend::interop diff --git a/cpp/oneapi/dal/table/backend/interop/sycl_table_adapter_dpc.cpp b/cpp/oneapi/dal/table/backend/interop/sycl_table_adapter_dpc.cpp deleted file mode 100644 index 967d776d458..00000000000 --- a/cpp/oneapi/dal/table/backend/interop/sycl_table_adapter_dpc.cpp +++ /dev/null @@ -1,313 +0,0 @@ -/******************************************************************************* -* Copyright 2021 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#include "oneapi/dal/table/backend/interop/sycl_table_adapter.hpp" - -namespace oneapi::dal::backend::interop { - -namespace daal_dm = daal::data_management; - -template -static daal::services::Status convert_exception_to_status(Body&& body) { - try { - return body(); - } - catch (const bad_alloc&) { - return daal::services::ErrorMemoryAllocationFailed; - } - catch (const out_of_range&) { - return daal::services::ErrorIncorrectDataRange; - } - catch (...) { - return daal::services::UnknownError; - } -} - -static daal_dm::features::FeatureType get_daal_feature_type(feature_type t) { - switch (t) { - case feature_type::nominal: return daal_dm::features::DAAL_CATEGORICAL; - case feature_type::ordinal: return daal_dm::features::DAAL_ORDINAL; - case feature_type::interval: return daal_dm::features::DAAL_CONTINUOUS; - case feature_type::ratio: return daal_dm::features::DAAL_CONTINUOUS; - default: throw dal::internal_error(detail::error_messages::unsupported_feature_type()); - } -} - -static void convert_feature_information_to_daal(const table_metadata& src, - daal_dm::NumericTableDictionary& dst) { - ONEDAL_ASSERT(std::size_t(src.get_feature_count()) == dst.getNumberOfFeatures()); - for (std::int64_t i = 0; i < src.get_feature_count(); i++) { - auto& daal_feature = dst[i]; - daal_feature.featureType = get_daal_feature_type(src.get_feature_type(i)); - } -} - -auto sycl_table_adapter::create(const sycl::queue& q, const table& table) -> ptr_t { - status_t internal_stat; - auto 
result = ptr_t{ new sycl_table_adapter(q, table, internal_stat) }; - status_to_exception(internal_stat); - return result; -} - -sycl_table_adapter::sycl_table_adapter(const sycl::queue& q, const table& table, status_t& status) - : base(table.get_column_count(), table.get_row_count(), daal_dm::DictionaryIface::equal), - queue_(q), - original_table_(table) { - if (!status.ok()) { - return; - } - else if (!table.has_data()) { - status.add(daal::services::ErrorIncorrectParameter); - return; - } - - this->_memStatus = daal_dm::NumericTableIface::userAllocated; - this->_layout = daal_dm::NumericTableIface::aos; - - convert_feature_information_to_daal(original_table_.get_metadata(), - *this->getDictionarySharedPtr()); -} - -auto sycl_table_adapter::getBlockOfRows(std::size_t vector_idx, - std::size_t vector_num, - rw_mode_t rwflag, - block_desc_t& block) -> status_t { - return convert_exception_to_status([&]() { - return read_rows_impl(vector_idx, vector_num, rwflag, block); - }); -} - -auto sycl_table_adapter::getBlockOfRows(std::size_t vector_idx, - std::size_t vector_num, - rw_mode_t rwflag, - block_desc_t& block) -> status_t { - return convert_exception_to_status([&]() { - return read_rows_impl(vector_idx, vector_num, rwflag, block); - }); -} - -auto sycl_table_adapter::getBlockOfRows(std::size_t vector_idx, - std::size_t vector_num, - rw_mode_t rwflag, - block_desc_t& block) -> status_t { - return convert_exception_to_status([&]() { - return read_rows_impl(vector_idx, vector_num, rwflag, block); - }); -} - -auto sycl_table_adapter::getBlockOfColumnValues(std::size_t feature_idx, - std::size_t vector_idx, - std::size_t value_num, - rw_mode_t rwflag, - block_desc_t& block) -> status_t { - return convert_exception_to_status([&]() { - return read_column_values_impl(feature_idx, vector_idx, value_num, rwflag, block); - }); -} - -auto sycl_table_adapter::getBlockOfColumnValues(std::size_t feature_idx, - std::size_t vector_idx, - std::size_t value_num, - rw_mode_t rwflag, - 
block_desc_t& block) -> status_t { - return convert_exception_to_status([&]() { - return read_column_values_impl(feature_idx, vector_idx, value_num, rwflag, block); - }); -} - -auto sycl_table_adapter::getBlockOfColumnValues(std::size_t feature_idx, - std::size_t vector_idx, - std::size_t value_num, - rw_mode_t rwflag, - block_desc_t& block) -> status_t { - return convert_exception_to_status([&]() { - return read_column_values_impl(feature_idx, vector_idx, value_num, rwflag, block); - }); -} - -auto sycl_table_adapter::releaseBlockOfRows(block_desc_t& block) -> status_t { - block.reset(); - return status_t(); -} - -auto sycl_table_adapter::releaseBlockOfRows(block_desc_t& block) -> status_t { - block.reset(); - return status_t(); -} - -auto sycl_table_adapter::releaseBlockOfRows(block_desc_t& block) -> status_t { - block.reset(); - return status_t(); -} - -auto sycl_table_adapter::releaseBlockOfColumnValues(block_desc_t& block) -> status_t { - block.reset(); - return status_t(); -} - -auto sycl_table_adapter::releaseBlockOfColumnValues(block_desc_t& block) -> status_t { - block.reset(); - return status_t(); -} - -auto sycl_table_adapter::releaseBlockOfColumnValues(block_desc_t& block) -> status_t { - block.reset(); - return status_t(); -} - -auto sycl_table_adapter::assign(float) -> status_t { - return daal::services::ErrorMethodNotImplemented; -} - -auto sycl_table_adapter::assign(double) -> status_t { - return daal::services::ErrorMethodNotImplemented; -} - -auto sycl_table_adapter::assign(int) -> status_t { - return daal::services::ErrorMethodNotImplemented; -} - -auto sycl_table_adapter::allocateDataMemoryImpl(daal::MemType) -> status_t { - return daal::services::ErrorMethodNotImplemented; -} - -auto sycl_table_adapter::setNumberOfColumnsImpl(std::size_t) -> status_t { - return daal::services::ErrorMethodNotImplemented; -} - -int sycl_table_adapter::getSerializationTag() const { - ONEDAL_ASSERT(!"sycl_table_adapter: getSerializationTag() is not implemented"); - 
return -1; -} - -auto sycl_table_adapter::serializeImpl(daal_dm::InputDataArchive* arch) -> status_t { - return daal::services::ErrorMethodNotImplemented; -} - -auto sycl_table_adapter::deserializeImpl(const daal_dm::OutputDataArchive* arch) -> status_t { - return daal::services::ErrorMethodNotImplemented; -} - -void sycl_table_adapter::freeDataMemoryImpl() { - base::freeDataMemoryImpl(); - original_table_ = homogen_table{}; -} - -template -auto sycl_table_adapter::read_rows_impl(std::size_t vector_idx, - std::size_t vector_num, - rw_mode_t rwflag, - block_desc_t& block) -> status_t { - if (rwflag != daal_dm::readOnly) { - ONEDAL_ASSERT(!"Data is accessible in read-only mode by design"); - return daal::services::ErrorMethodNotImplemented; - } - - const block_info info{ block, vector_idx, vector_num }; - if (!check_row_indexes_in_range(info)) { - return daal::services::ErrorIncorrectIndex; - } - - const auto [buffer, status] = pull_rows_buffer(info); - if (status.ok()) { - block.setDetails(0, vector_idx, rwflag); - block.setBuffer(buffer, info.row_count, original_table_.get_column_count()); - } - - return status; -} - -template -auto sycl_table_adapter::read_column_values_impl(std::size_t feature_idx, - std::size_t vector_idx, - std::size_t value_num, - rw_mode_t rwflag, - block_desc_t& block) -> status_t { - if (rwflag != daal_dm::readOnly) { - ONEDAL_ASSERT(!"Data is accessible in read-only mode by design"); - return daal::services::ErrorMethodNotImplemented; - } - - const block_info info{ block, vector_idx, value_num, feature_idx }; - if (!check_row_indexes_in_range(info) || !check_column_index_in_range(info)) { - return daal::services::ErrorIncorrectIndex; - } - - const auto [buffer, status] = pull_columns_buffer(info); - if (status.ok()) { - block.setDetails(feature_idx, vector_idx, rwflag); - block.setBuffer(buffer, info.row_count, original_table_.get_column_count()); - } - - return status; -} - -template -auto sycl_table_adapter::convert_to_daal_buffer(const 
array& ary) const - -> daal_buffer_and_status_t { - using daal::services::SharedPtr; - - status_t status; - ONEDAL_ASSERT(ary.get_data() != nullptr); - - // `const_cast` is safe assuming read-only access to the table on DAAL side and - // correct `rwflag` passed to `getBlockOfRows` or `getBlockOfColumnValues`. - SharedPtr ary_data_shared(const_cast(ary.get_data()), - daal_object_owner{ ary }); - - const auto buffer = - daal_buffer_t{ std::move(ary_data_shared), - dal::detail::integral_cast(ary.get_count()), - queue_, - status }; - return { buffer, status }; -} - -bool sycl_table_adapter::check_row_indexes_in_range(const block_info& info) const { - const std::int64_t row_count = original_table_.get_row_count(); - return info.row_begin_index < row_count && info.row_end_index <= row_count; -} - -bool sycl_table_adapter::check_column_index_in_range(const block_info& info) const { - const std::int64_t column_count = original_table_.get_column_count(); - return info.single_column_requested && info.column_index < column_count; -} - -constexpr inline sycl::usm::alloc get_accessor_alloc_kind() { - // We always request device-allocated data assuming adapter is used within - // DAAL kernels, which rely on device USM. 
- return sycl::usm::alloc::device; -} - -template -auto sycl_table_adapter::pull_rows_buffer(const block_info& info) - -> daal_buffer_and_status_t { - const auto values = // - row_accessor{ original_table_ } // - .pull(queue_, info.get_row_range(), get_accessor_alloc_kind()); - return convert_to_daal_buffer(values); -} - -template -auto sycl_table_adapter::pull_columns_buffer(const block_info& info) - -> daal_buffer_and_status_t { - const auto values = // - column_accessor{ original_table_ } // - .pull(queue_, info.column_index, info.get_row_range(), get_accessor_alloc_kind()); - return convert_to_daal_buffer(values); -} - -} // namespace oneapi::dal::backend::interop diff --git a/dev/bazel/daal.bzl b/dev/bazel/daal.bzl index ea97af7712e..150c424d70a 100644 --- a/dev/bazel/daal.bzl +++ b/dev/bazel/daal.bzl @@ -30,12 +30,10 @@ load("@onedal//dev/bazel/config:config.bzl", def daal_module(name, features=[], lib_tag="daal", hdrs=[], srcs=[], auto=False, - opencl=False, local_defines=[], **kwargs): + local_defines=[], **kwargs): if auto: auto_hdrs = native.glob(["**/*.h", "**/*.i"]) auto_srcs = native.glob(["**/*.cpp"]) - if opencl: - auto_hdrs += native.glob(["**/*.cl"]) else: auto_hdrs = [] auto_srcs = [] diff --git a/dev/bazel/dal.bzl b/dev/bazel/dal.bzl index 11e78334575..05f357ed6f9 100644 --- a/dev/bazel/dal.bzl +++ b/dev/bazel/dal.bzl @@ -500,7 +500,6 @@ def _dal_module(name, lib_tag="dal", is_dpc=False, features=[], "avx512": [ "__CPU_TAG__=__CPU_TAG_AVX512__" ], }, local_defines = local_defines + ([ - "DAAL_SYCL_INTERFACE", "ONEDAL_DATA_PARALLEL" ] if is_dpc else []) + select({ "@config//:test_fp64_disabled": [ diff --git a/dev/bazel/deps/micromkldpc.tpl.BUILD b/dev/bazel/deps/micromkldpc.tpl.BUILD index 7c485effcf1..844e1150264 100644 --- a/dev/bazel/deps/micromkldpc.tpl.BUILD +++ b/dev/bazel/deps/micromkldpc.tpl.BUILD @@ -13,5 +13,6 @@ cc_library( ], deps = [ ":headers", + "@opencl//:opencl_binary", ], ) diff --git 
a/examples/daal/cpp/source/optimization_solvers/custom_obj_func.h b/examples/daal/cpp/source/optimization_solvers/custom_obj_func.h index f95bb464f19..df5fb03910d 100644 --- a/examples/daal/cpp/source/optimization_solvers/custom_obj_func.h +++ b/examples/daal/cpp/source/optimization_solvers/custom_obj_func.h @@ -24,6 +24,7 @@ #include #include +#include #include "daal.h" diff --git a/makefile b/makefile index 96081ea0354..9cfd463f8bb 100644 --- a/makefile +++ b/makefile @@ -666,7 +666,6 @@ $(ONEAPI.objs_a.dpc): $(ONEAPI.dispatcher_cpu) $(ONEAPI.tmpdir_a.dpc)/inc_a_fold $(ONEAPI.objs_a.dpc): COPT += $(-fPIC) $(-cxx17) $(-DEBC) $(-EHsc) $(pedantic.opts.dpcpp) \ -DDAAL_NOTHROW_EXCEPTIONS \ -DDAAL_HIDE_DEPRECATED \ - -DDAAL_SYCL_INTERFACE \ -DONEDAL_DATA_PARALLEL \ -D__TBB_NO_IMPLICIT_LINKAGE \ -D_ENABLE_ATOMIC_ALIGNMENT_FIX \ @@ -693,7 +692,6 @@ $(ONEAPI.objs_y.dpc): $(ONEAPI.dispatcher_cpu) $(ONEAPI.tmpdir_y.dpc)/inc_y_fold $(ONEAPI.objs_y.dpc): COPT += $(-fPIC) $(-cxx17) $(-DEBC) $(-EHsc) $(pedantic.opts.dpcpp) \ -DDAAL_NOTHROW_EXCEPTIONS \ -DDAAL_HIDE_DEPRECATED \ - -DDAAL_SYCL_INTERFACE \ -DONEDAL_DATA_PARALLEL \ -D_ENABLE_ATOMIC_ALIGNMENT_FIX \ $(if $(CHECK_DLL_SIG),-DDAAL_CHECK_DLL_SIG) \