diff --git a/.ci/pipeline/ci.yml b/.ci/pipeline/ci.yml index d9937918d1e..609256539c4 100755 --- a/.ci/pipeline/ci.yml +++ b/.ci/pipeline/ci.yml @@ -122,7 +122,6 @@ jobs: # displayName: 'Uploading on fail' # condition: failed() # continueOnError: true - # - job: 'LinuxMakeDPCPP' # timeoutInMinutes: 0 # variables: @@ -395,8 +394,8 @@ jobs: .ci/scripts/build.sh --compiler clang --target daal --optimizations "sse2 avx2" --conda-env ci-env displayName: 'make daal' - script: | - .ci/scripts/build.sh --compiler clang --target oneapi_c --optimizations "sse2 avx2" - displayName: 'make oneapi_c' + .ci/scripts/build.sh --compiler clang --target onedal_c --optimizations "sse2 avx2" + displayName: 'make onedal_c' - task: PublishPipelineArtifact@1 inputs: artifactName: '$(platform.type) build' @@ -471,4 +470,4 @@ jobs: # targetPath: '$(Build.Repository.LocalPath)/$(release.dir)' # displayName: 'Uploading on fail' # condition: failed() -# continueOnError: true +# continueOnError: true \ No newline at end of file diff --git a/cpp/daal/include/algorithms/algorithm_container_base_batch.h b/cpp/daal/include/algorithms/algorithm_container_base_batch.h index 2f2da045e30..2efe6fdf942 100644 --- a/cpp/daal/include/algorithms/algorithm_container_base_batch.h +++ b/cpp/daal/include/algorithms/algorithm_container_base_batch.h @@ -139,7 +139,7 @@ class AlgorithmContainerImpl : public AlgorithmContainer * * \tparam mode Computation mode of the algorithm, \ref ComputeMode * \tparam sse2Container Implementation for Intel(R) Streaming SIMD Extensions 2 (Intel(R) SSE2) - * \tparam sse42Container Implementation for Intel(R) Streaming SIMD Extensions 42 (Intel(R) SSE42) + * \tparam sse42Container Implementation for Intel(R) Streaming SIMD Extensions 4.2 (Intel(R) SSE4.2) * \tparam avx2Container Implementation for Intel(R) Advanced Vector Extensions 2 (Intel(R) AVX2) * \tparam avx512Container Implementation for Intel(R) Xeon(R) processors based on Intel AVX-512 */ diff --git a/cpp/daal/include/algorithms/algorithm_container_base_common.h b/cpp/daal/include/algorithms/algorithm_container_base_common.h index 0b51f65a48d..6d0c946fead 100644 --- a/cpp/daal/include/algorithms/algorithm_container_base_common.h +++ b/cpp/daal/include/algorithms/algorithm_container_base_common.h @@ -49,7 +49,7 @@ namespace interface1 * * \tparam mode Computation mode of the algorithm, \ref ComputeMode * \tparam sse2Container Implementation for Intel(R) Streaming SIMD Extensions 2 (Intel(R) SSE2) - * \tparam sse42Container Implementation for Intel(R) Streaming SIMD Extensions 42 (Intel(R) SSE42) + * \tparam sse42Container Implementation for Intel(R) Streaming SIMD Extensions 4.2 (Intel(R) SSE4.2) * \tparam avx2Container Implementation for Intel(R) Advanced Vector Extensions 2 (Intel(R) AVX2) * \tparam avx512Container Implementation for Intel(R) Xeon(R) processors based on Intel AVX-512 */ diff --git a/cpp/daal/include/services/daal_defines.h b/cpp/daal/include/services/daal_defines.h index 83d41d0eec4..6d43749d7c8 100644 --- a/cpp/daal/include/services/daal_defines.h +++ b/cpp/daal/include/services/daal_defines.h @@ -130,7 +130,7 @@ namespace daal { /** * -* Computation modes of Intel(R) oneDAL algorithms +* Computation modes of Intel(R) oneAPI Data Analytics Library (oneDAL) algorithms */ enum ComputeMode { diff --git a/cpp/daal/include/services/daal_string.h b/cpp/daal/include/services/daal_string.h index 802b4f5194a..4342becfc54 100644 --- a/cpp/daal/include/services/daal_string.h +++ b/cpp/daal/include/services/daal_string.h @@ -17,7 +17,7 @@ /* //++ -// Intel(R) oneDAL string class. +// Intel(R) oneAPI Data Analytics Library (oneDAL) string class. //-- */ diff --git a/cpp/daal/include/services/error_handling.h b/cpp/daal/include/services/error_handling.h index b247c4f7cd1..58bc78a1f40 100644 --- a/cpp/daal/include/services/error_handling.h +++ b/cpp/daal/include/services/error_handling.h @@ -17,7 +17,7 @@ /* //++ -// Handling errors in Intel(R) oneDAL. +// Handling errors in Intel(R) oneAPI Data Analytics Library (oneDAL). //-- */ diff --git a/cpp/daal/include/services/error_id.h b/cpp/daal/include/services/error_id.h index 14ebe48c144..a4cb6913f78 100644 --- a/cpp/daal/include/services/error_id.h +++ b/cpp/daal/include/services/error_id.h @@ -17,7 +17,7 @@ /* //++ -// Data types for error handling in Intel(R) oneDAL. +// Data types for error handling in Intel(R) oneAPI Data Analytics Library (oneDAL). //-- */ diff --git a/cpp/daal/include/services/error_indexes.h b/cpp/daal/include/services/error_indexes.h index 6a8daf6b254..8d5ca7c79e7 100644 --- a/cpp/daal/include/services/error_indexes.h +++ b/cpp/daal/include/services/error_indexes.h @@ -17,7 +17,7 @@ /* //++ -// Details of errors in Intel(R) oneDAL. +// Details of errors in Intel(R) oneAPI Data Analytics Library (oneDAL). //-- */ diff --git a/cpp/daal/include/services/internal/execution_context.h b/cpp/daal/include/services/internal/execution_context.h index 0c70942f63f..ee2bb3bb395 100644 --- a/cpp/daal/include/services/internal/execution_context.h +++ b/cpp/daal/include/services/internal/execution_context.h @@ -58,7 +58,7 @@ class ExecutionContext : public Base // This branch is needed to avoid problems with deleting SYCL entities // after SYCL RT static objects are already released. // This is caused by "C++ static initialization order fiasco" problem between - // oneDAL static Environment object and internal static contexts of SYCL RT. + // Intel(R) oneAPI Data Analytics Library (oneDAL) static Environment object and internal static contexts of SYCL RT. // Here we solve this temporary with a small memory leak. // TODO: remove this after complete transition to DPC++ kernels. if (needEmptyDeleter) diff --git a/cpp/daal/include/services/internal/sycl/math/mkl_blas.h b/cpp/daal/include/services/internal/sycl/math/mkl_blas.h index a5df69fc2b3..73b2797b143 100644 --- a/cpp/daal/include/services/internal/sycl/math/mkl_blas.h +++ b/cpp/daal/include/services/internal/sycl/math/mkl_blas.h @@ -46,7 +46,7 @@ namespace interface1 /** * - * \brief Adapter for MKL GEMM routine + * \brief Adapter for Intel(R) MKL GEMM routine */ template struct MKLGemm @@ -112,7 +112,7 @@ struct MKLGemm /** * - * \brief Adapter for MKL SYRK routine + * \brief Adapter for Intel(R) MKL SYRK routine */ template struct MKLSyrk @@ -171,7 +171,7 @@ struct MKLSyrk /** * - * \brief Adapter for MKL AXPY routine + * \brief Adapter for Intel(R) MKL AXPY routine */ template struct MKLAxpy diff --git a/cpp/daal/include/services/internal/sycl/math/mkl_dal_utils.h b/cpp/daal/include/services/internal/sycl/math/mkl_dal_utils.h index c6a3567e873..0c39f4a6ab2 100644 --- a/cpp/daal/include/services/internal/sycl/math/mkl_dal_utils.h +++ b/cpp/daal/include/services/internal/sycl/math/mkl_dal_utils.h @@ -17,7 +17,7 @@ /* //++ -// Utility functions for DAL wrappers over MKL routines. +// Utility functions for DAL wrappers over Intel(R) MKL routines. //-- */ diff --git a/cpp/daal/include/services/internal/sycl/math/mkl_lapack.h b/cpp/daal/include/services/internal/sycl/math/mkl_lapack.h index 492ff84b5f3..32a2f65bf61 100644 --- a/cpp/daal/include/services/internal/sycl/math/mkl_lapack.h +++ b/cpp/daal/include/services/internal/sycl/math/mkl_lapack.h @@ -45,7 +45,7 @@ namespace interface1 /** * - * \brief Adapter for MKL POTRF routine + * \brief Adapter for Intel(R) MKL POTRF routine */ template struct MKLPotrf @@ -98,7 +98,7 @@ struct MKLPotrf /** * sycl::event compute_probabilities(sycl::queue& q, const ndview& parameters, const ndview& data, - ndview& predictions, + ndview& probabilities, bool fit_intercept = true, const event_vector& deps = {}); template sycl::event compute_logloss(sycl::queue& q, - const ndview& parameters, - const ndview& data, - const ndview& labels, - ndview& out, - Float L1 = Float(0), - Float L2 = Float(0), - bool fit_intercept = true, - const event_vector& deps = {}); - -template -sycl::event compute_logloss(sycl::queue& q, - const ndview& parameters, - const ndview& data, const ndview& labels, const ndview& probabilities, ndview& out, - Float L1 = Float(0), - Float L2 = Float(0), bool fit_intercept = true, const event_vector& deps = {}); template sycl::event compute_logloss_with_der(sycl::queue& q, - const ndview& parameters, const ndview& data, const ndview& labels, const ndview& probabilities, ndview& out, ndview& out_derivative, - Float L1 = Float(0), - Float L2 = Float(0), bool fit_intercept = true, const event_vector& deps = {}); template sycl::event compute_derivative(sycl::queue& q, - const ndview& parameters, const ndview& data, const ndview& labels, const ndview& probabilities, ndview& out_derivative, - Float L1 = Float(0), - Float L2 = Float(0), bool fit_intercept = true, const event_vector& deps = {}); +template +sycl::event add_regularization_loss(sycl::queue& q, + const ndview& parameters, + ndview& out, + Float L1 = Float(0), + Float L2 = Float(0), + bool fit_intercept = true, + const event_vector& deps = {}); + +template +sycl::event add_regularization_gradient_loss(sycl::queue& q, + const ndview& parameters, + ndview& out, + ndview& out_derivative, + Float L1 = Float(0), + Float L2 = Float(0), + bool fit_intercept = true, + const event_vector& deps = {}); + +template +sycl::event add_regularization_gradient(sycl::queue& q, + const ndview& parameters, + ndview& out_derivative, + Float L1 = Float(0), + Float L2 = Float(0), + bool fit_intercept = true, + const event_vector& deps = {}); + template sycl::event compute_hessian(sycl::queue& q, - const ndview& parameters, const ndview& data, const ndview& labels, const ndview& probabilities, @@ -95,19 +104,17 @@ sycl::event compute_raw_hessian(sycl::queue& q, const event_vector& deps = {}); template -class logloss_hessian_product { +class LogLossHessianProduct : public BaseMatrixOperator { public: - logloss_hessian_product(sycl::queue& q, - const ndview& data, - const Float L2 = Float(0), - const bool fit_intercept = true); - - sycl::event set_raw_hessian(const ndview& raw_hessian, const event_vector& deps = {}); - - ndview& get_raw_hessian(); + LogLossHessianProduct(sycl::queue& q, + const table& data, + Float L2 = Float(0), + bool fit_intercept = true, + std::int64_t bsz = -1); sycl::event operator()(const ndview& vec, ndview& out, - const event_vector& deps = {}); + const event_vector& deps) final; + ndview& get_raw_hessian(); private: sycl::event compute_with_fit_intercept(const ndview& vec, @@ -118,13 +125,48 @@ class logloss_hessian_product { const event_vector& deps); sycl::queue q_; + const table data_; + Float L2_; + bool fit_intercept_; ndarray raw_hessian_; - const ndview data_; ndarray buffer_; - const Float L2_; - const bool fit_intercept_; const std::int64_t n_; const std::int64_t p_; + const std::int64_t bsz_; +}; + +template +class LogLossFunction : public BaseFunction { +public: + LogLossFunction(sycl::queue queue, + const table& data, + ndview& labels, + Float L2 = 0.0, + bool fit_intercept = true, + std::int64_t bsz = -1); + Float get_value() final; + ndview& get_gradient() final; + BaseMatrixOperator& get_hessian_product() final; + + event_vector update_x(const ndview& x, + bool need_hessp = false, + const event_vector& deps = {}) final; + +private: + sycl::queue q_; + const table data_; + ndview labels_; + const std::int64_t n_; + const std::int64_t p_; + Float L2_; + bool fit_intercept_; + const std::int64_t bsz_; + ndarray probabilities_; + ndarray gradient_; + ndarray buffer_; + LogLossHessianProduct hessp_; + const std::int64_t dimension_; + Float value_; }; } // namespace oneapi::dal::backend::primitives diff --git a/cpp/oneapi/dal/backend/primitives/objective_function/logloss_dpc.cpp b/cpp/oneapi/dal/backend/primitives/objective_function/logloss_dpc.cpp index 417c21b1bbb..ef73aa0a107 100644 --- a/cpp/oneapi/dal/backend/primitives/objective_function/logloss_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/objective_function/logloss_dpc.cpp @@ -17,42 +17,40 @@ #include "oneapi/dal/backend/primitives/objective_function/logloss.hpp" #include "oneapi/dal/backend/primitives/blas/gemv.hpp" #include "oneapi/dal/backend/primitives/element_wise.hpp" +#include "oneapi/dal/detail/profiler.hpp" namespace oneapi::dal::backend::primitives { +namespace pr = dal::backend::primitives; + template sycl::event compute_probabilities(sycl::queue& q, const ndview& parameters, const ndview& data, ndview& probabilities, - const bool fit_intercept, + bool fit_intercept, const event_vector& deps) { const std::int64_t n = data.get_dimension(0); const std::int64_t p = data.get_dimension(1); + ONEDAL_ASSERT(data.has_data()); ONEDAL_ASSERT(parameters.has_data()); ONEDAL_ASSERT(probabilities.has_mutable_data()); - ONEDAL_ASSERT(parameters.get_dimension(0) == p + 1); + ONEDAL_ASSERT(parameters.get_dimension(0) == fit_intercept ? p + 1 : p); ONEDAL_ASSERT(probabilities.get_dimension(0) == n); auto fill_event = fill(q, probabilities, Float(1), {}); using oneapi::dal::backend::operator+; - auto param_arr = ndarray::wrap(parameters.get_data(), 1); - Float w0 = fit_intercept ? param_arr.slice(0, 1).to_host(q, deps).at(0) : 0; // Poor perfomance + Float w0 = fit_intercept ? parameters.get_slice(0, 1).at_device(q, 0l) : 0; // Poor perfomance + ndview param_suf = fit_intercept ? parameters.get_slice(1, p + 1) : parameters; - auto event = gemv(q, - data, - parameters.get_slice(1, parameters.get_dimension(0)), - probabilities, - Float(1), - w0, - { fill_event }); + auto event = gemv(q, data, param_suf, probabilities, Float(1), w0, { fill_event }); auto* const prob_ptr = probabilities.get_mutable_data(); const Float bottom = sizeof(Float) == 4 ? 1e-7 : 1e-15; const Float top = Float(1.0) - bottom; - // Log Loss is undefined fot p = 0 and p = 1 so probabilities are clipped into [eps, 1 - eps] + // Log Loss is undefined for p = 0 and p = 1 so probabilities are clipped into [eps, 1 - eps] return q.submit([&](sycl::handler& cgh) { cgh.depends_on(event); @@ -71,23 +69,14 @@ sycl::event compute_probabilities(sycl::queue& q, template sycl::event compute_logloss(sycl::queue& q, - const ndview& parameters, - const ndview& data, const ndview& labels, const ndview& probabilities, ndview& out, - const Float L1, - const Float L2, - const bool fit_intercept, + bool fit_intercept, const event_vector& deps) { - const std::int64_t n = data.get_dimension(0); - const std::int64_t p = data.get_dimension(1); - ONEDAL_ASSERT(parameters.get_dimension(0) == p + 1); - ONEDAL_ASSERT(labels.get_dimension(0) == n); + const std::int64_t n = labels.get_dimension(0); ONEDAL_ASSERT(probabilities.get_dimension(0) == n); ONEDAL_ASSERT(labels.has_data()); - ONEDAL_ASSERT(parameters.has_data()); - ONEDAL_ASSERT(data.has_data()); ONEDAL_ASSERT(probabilities.has_data()); const auto* const labels_ptr = labels.get_data(); @@ -110,95 +99,29 @@ sycl::event compute_logloss(sycl::queue& q, sum += -label * sycl::log(prob) - (1 - label) * sycl::log(1 - prob); }); }); - - auto [out_reg, out_reg_e] = ndarray::zeros(q, { 1 }, sycl::usm::alloc::device); - auto* const reg_ptr = out_reg.get_mutable_data(); - const event_vector vector_out_reg = { out_reg_e }; - - const auto* const param_ptr = parameters.get_data(); - - if (L1 > 0 || L2 > 0) { - auto reg_event = q.submit([&](sycl::handler& cgh) { - cgh.depends_on(vector_out_reg); - const auto range = make_range_1d(p); - auto sum_reduction = sycl::reduction(reg_ptr, sycl::plus<>()); - cgh.parallel_for(range, sum_reduction, [=](sycl::id<1> idx, auto& sum) { - const Float param = param_ptr[idx + 1]; - sum += L1 * sycl::abs(param) + L2 * param * param; - }); - }); - auto final_event = q.submit([&](sycl::handler& cgh) { - cgh.depends_on({ reg_event, loss_event }); - cgh.single_task([=] { - out_ptr[0] += reg_ptr[0]; - }); - }); - return final_event; - } return loss_event; } -template -sycl::event compute_logloss(sycl::queue& q, - const ndview& parameters, - const ndview& data, - const ndview& labels, - ndview& out, - const Float L1, - const Float L2, - const bool fit_intercept, - const event_vector& deps) { - const std::int64_t n = data.get_dimension(0); - const std::int64_t p = data.get_dimension(1); - ONEDAL_ASSERT(parameters.get_dimension(0) == p + 1); - ONEDAL_ASSERT(labels.get_dimension(0) == n); - ONEDAL_ASSERT(labels.has_data()); - ONEDAL_ASSERT(parameters.has_data()); - ONEDAL_ASSERT(data.has_data()); - - // out should be filled with zero - - auto probabilities = ndarray::empty(q, { n }, sycl::usm::alloc::device); - auto prediction_event = - compute_probabilities(q, parameters, data, probabilities, fit_intercept, deps); - - return compute_logloss(q, - parameters, - data, - labels, - probabilities, - out, - L1, - L2, - fit_intercept, - { prediction_event }); -} - template sycl::event compute_logloss_with_der(sycl::queue& q, - const ndview& parameters, const ndview& data, const ndview& labels, const ndview& probabilities, ndview& out, ndview& out_derivative, - const Float L1, - const Float L2, - const bool fit_intercept, + bool fit_intercept, const event_vector& deps) { // out, out_derivative should be filled with zeros const std::int64_t n = data.get_dimension(0); const std::int64_t p = data.get_dimension(1); - ONEDAL_ASSERT(parameters.get_dimension(0) == p + 1); ONEDAL_ASSERT(labels.get_dimension(0) == n); ONEDAL_ASSERT(probabilities.get_dimension(0) == n); - ONEDAL_ASSERT(out.get_count() == 1); - ONEDAL_ASSERT(out_derivative.get_count() == p + 1); + ONEDAL_ASSERT(out.get_dimension(0) == 1); + ONEDAL_ASSERT(out_derivative.get_dimension(0) == fit_intercept ? p + 1 : p); ONEDAL_ASSERT(labels.has_data()); - ONEDAL_ASSERT(parameters.has_data()); ONEDAL_ASSERT(data.has_data()); ONEDAL_ASSERT(probabilities.has_data()); ONEDAL_ASSERT(out.has_mutable_data()); @@ -210,7 +133,6 @@ sycl::event compute_logloss_with_der(sycl::queue& q, auto* const der_obj_ptr = derivative_object.get_mutable_data(); const auto* const proba_ptr = probabilities.get_data(); const auto* const labels_ptr = labels.get_data(); - const auto* const param_ptr = parameters.get_data(); auto* const out_ptr = out.get_mutable_data(); auto* const out_derivative_ptr = out_derivative.get_mutable_data(); @@ -255,61 +177,30 @@ sycl::event compute_logloss_with_der(sycl::queue& q, }); }); } - auto out_der_suffix = out_derivative.get_slice(1, p + 1); - auto der_event = gemv(q, data.t(), derivative_object, out_der_suffix, { loss_event }); - if (L1 == 0 && L2 == 0) { - return der_event; - } - auto [reg_val, reg_val_e] = ndarray::zeros(q, { 1 }, sycl::usm::alloc::device); - - const event_vector reg_deps = { reg_val_e, der_event }; - auto* const reg_ptr = reg_val.get_mutable_data(); - - auto reg_event = q.submit([&](sycl::handler& cgh) { - cgh.depends_on(reg_deps); - const auto range = make_range_1d(p); - auto sum_reduction = sycl::reduction(reg_ptr, sycl::plus<>()); - cgh.parallel_for(range, sum_reduction, [=](sycl::id<1> idx, auto& sum) { - const Float param = param_ptr[idx + 1]; - sum += L1 * sycl::abs(param) + L2 * param * param; - out_derivative_ptr[idx + 1] += L2 * 2 * param; - }); - }); - - auto final_event = q.submit([&](sycl::handler& cgh) { - cgh.depends_on({ reg_event, loss_event, derw0_event }); - cgh.single_task([=] { - out_ptr[0] += reg_ptr[0]; - }); - }); + auto out_der_suffix = fit_intercept ? out_derivative.get_slice(1, p + 1) : out_derivative; - return final_event; + return gemv(q, data.t(), derivative_object, out_der_suffix, { loss_event, derw0_event }); } template sycl::event compute_derivative(sycl::queue& q, - const ndview& parameters, const ndview& data, const ndview& labels, const ndview& probabilities, ndview& out_derivative, - const Float L1, - const Float L2, - const bool fit_intercept, + bool fit_intercept, const event_vector& deps) { // out_derivative should be filled with zeros const std::int64_t n = data.get_dimension(0); const std::int64_t p = data.get_dimension(1); - ONEDAL_ASSERT(parameters.get_dimension(0) == p + 1); ONEDAL_ASSERT(labels.get_dimension(0) == n); ONEDAL_ASSERT(probabilities.get_dimension(0) == n); - ONEDAL_ASSERT(out_derivative.get_count() == p + 1); + ONEDAL_ASSERT(out_derivative.get_dimension(0) == fit_intercept ? p + 1 : p); ONEDAL_ASSERT(labels.has_data()); - ONEDAL_ASSERT(parameters.has_data()); ONEDAL_ASSERT(data.has_data()); ONEDAL_ASSERT(probabilities.has_data()); ONEDAL_ASSERT(out_derivative.has_mutable_data()); @@ -320,7 +211,6 @@ sycl::event compute_derivative(sycl::queue& q, auto* const der_obj_ptr = derivative_object.get_mutable_data(); const auto* const proba_ptr = probabilities.get_data(); const auto* const labels_ptr = labels.get_data(); - const auto* const param_ptr = parameters.get_data(); auto* const out_derivative_ptr = out_derivative.get_mutable_data(); auto loss_event = q.submit([&](sycl::handler& cgh) { @@ -356,49 +246,129 @@ sycl::event compute_derivative(sycl::queue& q, } }); - auto out_der_suffix = out_derivative.get_slice(1, p + 1); + auto out_der_suffix = fit_intercept ? out_derivative.get_slice(1, p + 1) : out_derivative; auto der_event = gemv(q, data.t(), derivative_object, out_der_suffix, { loss_event }); - if (L1 == 0 && L2 == 0) { - return der_event; - } + return der_event; +} +template +sycl::event add_regularization_loss(sycl::queue& q, + const ndview& parameters, + ndview& out, + Float L1, + Float L2, + bool fit_intercept, + const event_vector& deps) { + using dal::backend::operator+; + auto [out_reg, out_reg_e] = ndarray::zeros(q, { 1 }, sycl::usm::alloc::device); + auto* const reg_ptr = out_reg.get_mutable_data(); + auto* const out_ptr = out.get_mutable_data(); + const auto* const param_ptr = parameters.get_data(); + auto new_deps = deps + out_reg_e; + const std::int64_t p = + fit_intercept ? parameters.get_dimension(0) - 1 : parameters.get_dimension(0); auto reg_event = q.submit([&](sycl::handler& cgh) { - using oneapi::dal::backend::operator+; - cgh.depends_on({ der_event }); + cgh.depends_on(new_deps); const auto range = make_range_1d(p); - cgh.parallel_for(range, [=](sycl::id<1> idx) { - const Float param = param_ptr[idx + 1]; - out_derivative_ptr[idx + 1] += L2 * 2 * param; + auto sum_reduction = sycl::reduction(reg_ptr, sycl::plus<>()); + const std::int64_t st_id = fit_intercept; + cgh.parallel_for(range, sum_reduction, [=](sycl::id<1> idx, auto& sum) { + const Float param = param_ptr[idx + st_id]; + sum += L1 * sycl::abs(param) + L2 * param * param; + }); + }); + return q.submit([&](sycl::handler& cgh) { + cgh.depends_on({ reg_event }); + cgh.single_task([=] { + *out_ptr += *reg_ptr; }); }); +} + +template +sycl::event add_regularization_gradient_loss(sycl::queue& q, + const ndview& parameters, + ndview& out, + ndview& out_derivative, + Float L1, + Float L2, + bool fit_intercept, + const event_vector& deps) { + using dal::backend::operator+; + auto [reg_val, reg_val_e] = ndarray::zeros(q, { 1 }, sycl::usm::alloc::device); - return reg_event; + const std::int64_t p = + fit_intercept ? parameters.get_dimension(0) - 1 : parameters.get_dimension(0); + + const auto* const param_ptr = parameters.get_data(); + auto* const reg_ptr = reg_val.get_mutable_data(); + auto* const out_ptr = out.get_mutable_data(); + auto* const grad_ptr = out_derivative.get_mutable_data(); + auto new_deps = deps + reg_val_e; + auto reg_event = q.submit([&](sycl::handler& cgh) { + cgh.depends_on(new_deps); + const auto range = make_range_1d(p); + auto sum_reduction = sycl::reduction(reg_ptr, sycl::plus<>()); + std::int64_t st_id = fit_intercept; + cgh.parallel_for(range, sum_reduction, [=](sycl::id<1> idx, auto& sum) { + const Float param = param_ptr[idx + st_id]; + sum += L1 * sycl::abs(param) + L2 * param * param; + grad_ptr[idx + st_id] += L2 * 2 * param; + }); + }); + + return q.submit([&](sycl::handler& cgh) { + cgh.depends_on({ reg_event }); + cgh.single_task([=] { + *out_ptr += *reg_ptr; + }); + }); +} + +template +sycl::event add_regularization_gradient(sycl::queue& q, + const ndview& parameters, + ndview& out_derivative, + Float L1, + Float L2, + bool fit_intercept, + const event_vector& deps) { + auto* const grad_ptr = out_derivative.get_mutable_data(); + const auto* const param_ptr = parameters.get_data(); + const std::int64_t p = + fit_intercept ? parameters.get_dimension(0) - 1 : parameters.get_dimension(0); + return q.submit([&](sycl::handler& cgh) { + cgh.depends_on(deps); + const auto range = make_range_1d(p); + std::int64_t st_id = fit_intercept; + cgh.parallel_for(range, [=](sycl::id<1> idx) { + const Float param = param_ptr[idx + st_id]; + grad_ptr[idx + st_id] += L2 * 2 * param; + }); + }); } template sycl::event compute_hessian(sycl::queue& q, - const ndview& parameters, const ndview& data, const ndview& labels, const ndview& probabilities, ndview& out_hessian, const Float L1, const Float L2, - const bool fit_intercept, + bool fit_intercept, const event_vector& deps) { const int64_t n = data.get_dimension(0); const int64_t p = data.get_dimension(1); - ONEDAL_ASSERT(parameters.get_dimension(0) == p + 1); ONEDAL_ASSERT(labels.get_dimension(0) == n); ONEDAL_ASSERT(probabilities.get_dimension(0) == n); ONEDAL_ASSERT(out_hessian.get_dimension(0) == (p + 1)); ONEDAL_ASSERT(out_hessian.get_dimension(1) == (p + 1)); ONEDAL_ASSERT(labels.has_data()); - ONEDAL_ASSERT(parameters.has_data()); ONEDAL_ASSERT(data.has_data()); ONEDAL_ASSERT(probabilities.has_data()); ONEDAL_ASSERT(out_hessian.has_mutable_data()); @@ -473,46 +443,44 @@ sycl::event compute_raw_hessian(sycl::queue& q, ONEDAL_ASSERT(out_hessian.get_dimension(0) == n); ONEDAL_ASSERT(probabilities.has_data()); ONEDAL_ASSERT(out_hessian.has_mutable_data()); - - const auto kernel = [=](const Float& val, Float*) -> Float { + const auto kernel = [=](const Float val, Float) -> Float { constexpr Float one(1); return val * (one - val); }; + return element_wise(q, kernel, probabilities, Float(0), out_hessian, deps); +} - return element_wise(q, kernel, probabilities, nullptr, out_hessian, deps); +std::int64_t get_block_size(std::int64_t n, std::int64_t p) { + constexpr std::int64_t max_alloc_size = 1 << 21; + return p > max_alloc_size ? 512 : max_alloc_size / p; } template -logloss_hessian_product::logloss_hessian_product(sycl::queue& q, - const ndview& data, - const Float L2, - const bool fit_intercept) +LogLossHessianProduct::LogLossHessianProduct(sycl::queue& q, + const table& data, + Float L2, + bool fit_intercept, + std::int64_t bsz) : q_(q), data_(data), - L2_{ L2 }, - fit_intercept_{ fit_intercept }, - n_{ data.get_dimension(0) }, - p_{ data.get_dimension(1) } { - raw_hessian_ = ndarray::empty(q_, { n_ }); - buffer_ = ndarray::empty(q_, { n_ }); + L2_(L2), + fit_intercept_(fit_intercept), + n_(data.get_row_count()), + p_(data.get_column_count()), + bsz_(bsz == -1 ? get_block_size(n_, p_) : bsz) { + raw_hessian_ = ndarray::empty(q_, { n_ }, sycl::usm::alloc::device); + buffer_ = ndarray::empty(q_, { n_ }, sycl::usm::alloc::device); } template -sycl::event logloss_hessian_product::set_raw_hessian(const ndview& raw_hessian, - const event_vector& deps) { - ONEDAL_ASSERT(raw_hessian.get_dimension(0) == n_); - return copy(q_, raw_hessian_, raw_hessian, deps); -} - -template -ndview& logloss_hessian_product::get_raw_hessian() { +ndview& LogLossHessianProduct::get_raw_hessian() { return raw_hessian_; } template -sycl::event logloss_hessian_product::compute_with_fit_intercept(const ndview& vec, - ndview& out, - const event_vector& deps) { +sycl::event LogLossHessianProduct::compute_with_fit_intercept(const ndview& vec, + ndview& out, + const event_vector& deps) { auto* const buffer_ptr = buffer_.get_mutable_data(); const auto* const hess_ptr = raw_hessian_.get_data(); auto* const out_ptr = out.get_mutable_data(); @@ -526,7 +494,14 @@ sycl::event logloss_hessian_product::compute_with_fit_intercept(const ndv sycl::event fill_out_event = fill(q_, out, Float(0), deps); Float v0 = vec.at_device(q_, 0, deps); - sycl::event event_xv = gemv(q_, data_, vec_suf, buffer_, Float(1), v0, { fill_buffer_event }); + + // TODO: Add batch matrix-vector multiplication + auto data_nd = table2ndarray(q_, data_, sycl::usm::alloc::device); + + sycl::event event_xv = gemv(q_, data_nd, vec_suf, buffer_, Float(1), v0, { fill_buffer_event }); + event_xv.wait_and_throw(); // Without this line gemv does not work correctly + + auto tmp_host = buffer_.to_host(q_); sycl::event event_dxv = q_.submit([&](sycl::handler& cgh) { cgh.depends_on({ event_xv, fill_out_event }); @@ -537,10 +512,12 @@ sycl::event logloss_hessian_product::compute_with_fit_intercept(const ndv sum_v0 += buffer_ptr[idx]; }); }); - auto event_xtdxv = - gemv(q_, data_.t(), buffer_, out_suf, Float(1), Float(0), { event_dxv, fill_out_event }); - const Float regularization_factor = L2_ * 2; + sycl::event event_xtdxv = + gemv(q_, data_nd.t(), buffer_, out_suf, Float(1), Float(0), { event_dxv, fill_out_event }); + event_xtdxv.wait_and_throw(); // Without this line gemv does not work correctly + + const Float regularization_factor = L2_; const auto kernel_regularization = [=](const Float a, const Float param) { return a + param * regularization_factor; @@ -552,16 +529,19 @@ sycl::event logloss_hessian_product::compute_with_fit_intercept(const ndv } template -sycl::event logloss_hessian_product::compute_without_fit_intercept( - const ndview& vec, - ndview& out, - const event_vector& deps) { +sycl::event LogLossHessianProduct::compute_without_fit_intercept(const ndview& vec, + ndview& out, + const event_vector& deps) { ONEDAL_ASSERT(vec.get_dimension(0) == p_); ONEDAL_ASSERT(out.get_dimension(0) == p_); sycl::event fill_out_event = fill(q_, out, Float(0), deps); - auto event_xv = gemv(q_, data_, vec, buffer_, Float(1), Float(0), deps); + // TODO: Add batch matrix-vector multiplication + auto data_nd = table2ndarray(q_, data_, sycl::usm::alloc::device); + + sycl::event event_xv = gemv(q_, data_nd, vec, buffer_, Float(1), Float(0), deps); + event_xv.wait_and_throw(); // Without this line gemv does not work correctly auto& buf_ndview = static_cast&>(buffer_); auto& hess_ndview = static_cast&>(raw_hessian_); @@ -569,10 +549,11 @@ sycl::event logloss_hessian_product::compute_without_fit_intercept( auto event_dxv = element_wise(q_, kernel_mul, buf_ndview, hess_ndview, buf_ndview, { event_xv }); - auto event_xtdxv = - gemv(q_, data_.t(), buffer_, out, Float(1), Float(0), { event_dxv, fill_out_event }); + sycl::event event_xtdxv = + gemv(q_, data_nd.t(), buffer_, out, Float(1), Float(0), { event_dxv, fill_out_event }); + event_xtdxv.wait_and_throw(); // Without this line gemv does not work correctly - const Float regularization_factor = L2_ * 2; + const Float regularization_factor = L2_; const auto kernel_regularization = [=](const Float a, const Float param) { return a + param * regularization_factor; @@ -585,9 +566,9 @@ sycl::event logloss_hessian_product::compute_without_fit_intercept( } template -sycl::event logloss_hessian_product::operator()(const ndview& vec, - ndview& out, - const event_vector& deps) { +sycl::event LogLossHessianProduct::operator()(const ndview& vec, + ndview& out, + const event_vector& deps) { if (fit_intercept_) { return compute_with_fit_intercept(vec, out, deps); } @@ -596,68 +577,196 @@ sycl::event logloss_hessian_product::operator()(const ndview& v } } +template +LogLossFunction::LogLossFunction(sycl::queue q, + const table& data, + ndview& labels, + Float L2, + bool fit_intercept, + std::int64_t bsz) + : q_(q), + data_(data), + labels_(labels), + n_(data.get_row_count()), + p_(data.get_column_count()), + L2_(L2), + fit_intercept_(fit_intercept), + bsz_(bsz == -1 ? get_block_size(n_, p_) : bsz), + hessp_(q, data, L2, fit_intercept, bsz_), + dimension_(fit_intercept ? p_ + 1 : p_) { + ONEDAL_ASSERT(labels.get_dimension(0) == n_); + probabilities_ = ndarray::empty(q_, { n_ }, sycl::usm::alloc::device); + gradient_ = ndarray::empty(q_, { dimension_ }, sycl::usm::alloc::device); + buffer_ = ndarray::empty(q_, { p_ + 2 }, sycl::usm::alloc::device); +} + +template +event_vector LogLossFunction::update_x(const ndview& x, + bool need_hessp, + const event_vector& deps) { + using dal::backend::operator+; + value_ = 0; + auto fill_event = fill(q_, gradient_, Float(0), deps); + const uniform_blocking blocking(n_, bsz_); + + event_vector last_iter_e = { fill_event }; + + ndview grad_ndview = gradient_; + ndview grad_batch = buffer_.slice(1, dimension_); + ndview loss_batch = buffer_.slice(0, 1); + + ndview raw_hessian = hessp_.get_raw_hessian(); + + for (std::int64_t b = 0; b < blocking.get_block_count(); ++b) { + const auto first = blocking.get_block_start_index(b); + const auto last = blocking.get_block_end_index(b); + const std::int64_t cursize = last - first; + + const auto data_rows = + row_accessor(data_).pull(q_, { first, last }, sycl::usm::alloc::device); + const auto data_batch = ndarray::wrap(data_rows, { cursize, p_ }); + const auto labels_batch = labels_.get_slice(first, first + cursize); + auto prob_batch = probabilities_.slice(first, cursize); + sycl::event prob_e = + compute_probabilities(q_, x, data_batch, prob_batch, fit_intercept_, last_iter_e); + + constexpr Float zero(0); + + auto fill_buffer_e = fill(q_, buffer_, zero, last_iter_e); + + sycl::event compute_e = compute_logloss_with_der(q_, + data_batch, + labels_batch, + prob_batch, + loss_batch, + grad_batch, + fit_intercept_, + { fill_buffer_e, prob_e }); + + sycl::event update_grad_e = + element_wise(q_, sycl::plus<>(), grad_ndview, grad_batch, grad_ndview, { compute_e }); + + value_ += loss_batch.at_device(q_, 0, { compute_e }); + + last_iter_e = { update_grad_e }; + + if (need_hessp) { + auto raw_hessian_batch = raw_hessian.get_slice(first, first + cursize); + auto hess_e = compute_raw_hessian(q_, prob_batch, raw_hessian_batch, { prob_e }); + last_iter_e = last_iter_e + hess_e; + } + + // TODO: Delete this wait_and_throw + // ensure that while event is running in the background data is not overwritten + wait_or_pass(last_iter_e).wait_and_throw(); + } + + if (L2_ > 0) { + auto fill_loss_e = fill(q_, loss_batch, Float(0), { last_iter_e }); + auto loss_ptr = loss_batch.get_mutable_data(); + auto grad_ptr = gradient_.get_mutable_data(); + auto w_ptr = x.get_data(); + Float regularization_factor = L2_; + + auto regularization_e = q_.submit([&](sycl::handler& cgh) { + cgh.depends_on(last_iter_e + fill_loss_e); + const auto range = make_range_1d(p_); + const std::int64_t st_id = fit_intercept_; + auto sum_reduction = sycl::reduction(loss_ptr, sycl::plus<>()); + cgh.parallel_for(range, sum_reduction, [=](sycl::id<1> idx, auto& sum_v0) { + const Float param = w_ptr[st_id + idx]; + grad_ptr[st_id + idx] += regularization_factor * param; + sum_v0 += regularization_factor * param * param / 2; + }); + }); + + value_ += loss_batch.at_device(q_, 0, { regularization_e }); + + last_iter_e = { regularization_e }; + } + + return last_iter_e; +} + +template +Float LogLossFunction::get_value() { + return value_; +} +template +ndview& LogLossFunction::get_gradient() { + return gradient_; +} + +template +BaseMatrixOperator& LogLossFunction::get_hessian_product() { + return hessp_; +} + #define INSTANTIATE(F) \ template sycl::event compute_probabilities(sycl::queue&, \ const ndview&, \ const ndview&, \ ndview&, \ - const bool, \ + bool, \ const event_vector&); \ template sycl::event compute_logloss(sycl::queue&, \ - const ndview&, \ - const ndview&, \ - const ndview&, \ - ndview&, \ - const F, \ - const F, \ - const bool, \ - const event_vector&); \ - template sycl::event compute_logloss(sycl::queue&, \ - const ndview&, \ - const ndview&, \ const ndview&, \ const ndview&, \ ndview&, \ - const F, \ - const F, \ - const bool, \ + bool, \ const event_vector&); \ template sycl::event compute_logloss_with_der(sycl::queue&, \ - const ndview&, \ const ndview&, \ const ndview&, \ const ndview&, \ ndview&, \ ndview&, \ - const F, \ - const F, \ - const bool, \ + bool, \ const event_vector&); \ template sycl::event compute_derivative(sycl::queue&, \ - const ndview&, \ const ndview&, \ const ndview&, \ const ndview&, \ ndview&, \ - const F, \ - const F, \ - const bool, \ + bool, \ const event_vector&); \ + template sycl::event add_regularization_loss(sycl::queue&, \ + const ndview&, \ + ndview&, \ + F, \ + F, \ + bool, \ + const event_vector&); \ + template sycl::event add_regularization_gradient_loss(sycl::queue&, \ + const ndview&, \ + ndview&, \ + ndview&, \ + F, \ + F, \ + bool, \ + const event_vector&); \ + template sycl::event add_regularization_gradient(sycl::queue&, \ + const ndview&, \ + ndview&, \ + F, \ + F, \ + bool, \ + const event_vector&); \ template sycl::event compute_hessian(sycl::queue&, \ - const ndview&, \ const ndview&, \ const ndview&, \ const ndview&, \ ndview&, \ const F, \ const F, \ - const bool, \ + bool, \ const event_vector&); \ template sycl::event compute_raw_hessian(sycl::queue&, \ const ndview&, \ ndview&, \ const event_vector&); \ - template class logloss_hessian_product; + template class LogLossHessianProduct; \ + template class LogLossFunction; INSTANTIATE(float); INSTANTIATE(double); diff --git a/cpp/oneapi/dal/backend/primitives/objective_function/test/logloss_dpc.cpp b/cpp/oneapi/dal/backend/primitives/objective_function/test/logloss_dpc.cpp index d61e9382578..b983e109232 100644 --- a/cpp/oneapi/dal/backend/primitives/objective_function/test/logloss_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/objective_function/test/logloss_dpc.cpp @@ -20,11 +20,14 @@ #include "oneapi/dal/test/engine/common.hpp" #include "oneapi/dal/test/engine/fixtures.hpp" #include "oneapi/dal/table/row_accessor.hpp" +#include "oneapi/dal/detail/debug.hpp" #include "oneapi/dal/backend/primitives/rng/rng_engine.hpp" namespace oneapi::dal::backend::primitives::test { +using oneapi::dal::detail::operator<<; + namespace te = dal::test::engine; template @@ -74,13 +77,18 @@ class logloss_test : public te::float_algo_fixture { } } - void run_test(const float_t L1 = 0, const float_t L2 = 0, bool fit_intercept = true) { + void run_test(const float_t L1 = 0, + const float_t L2 = 0, + bool fit_intercept = true, + bool batch_test = false) { auto data_array = row_accessor{ this->data_ }.pull(this->get_queue()); auto data_host = ndarray::wrap(data_array.get_data(), { n_, p_ }); + std::int64_t dim = fit_intercept ? this->p_ + 1 : this->p_; + auto param_array = row_accessor{ this->params_ }.pull(this->get_queue()); - auto params_host = ndarray::wrap(param_array.get_data(), { p_ + 1 }); - test_input(data_host, params_host, this->labels_, L1, L2, fit_intercept); + auto params_host = ndarray::wrap(param_array.get_data(), { dim }); + test_input(data_host, params_host, this->labels_, L1, L2, fit_intercept, batch_test); SUCCEED(); } @@ -98,8 +106,18 @@ class logloss_test : public te::float_algo_fixture { constexpr float_t cur_param[p + 1] = { -0.2, 0.1, -1, 0.4 }; auto data_host = ndarray::wrap(data, { n, p }); + + this->data_ = homogen_table::wrap(data_host.get_data(), n, p); + auto labels_host = ndarray::wrap(labels, n); - auto params_host = ndarray::wrap(cur_param, p + 1); + + ndarray params_host; + if (fit_intercept) { + params_host = ndarray::wrap(cur_param, p + 1); + } + else { + params_host = ndarray::wrap(cur_param + 1, p); + } test_input(data_host, params_host, labels_host, L1, L2, fit_intercept); @@ -111,12 +129,13 @@ class logloss_test : public te::float_algo_fixture { const ndarray& labels_host, const float_t L1, const float_t L2, - bool fit_intercept) { + bool fit_intercept, + bool batch_test = false) { constexpr float_t rtol = sizeof(float_t) > 4 ? 1e-6 : 1e-4; - constexpr float_t atol = sizeof(float_t) > 4 ? 1e-6 : 1; - constexpr float_t atol2 = sizeof(float_t) > 4 ? 1e-6 : 1e-4; + constexpr float_t atol = sizeof(float_t) > 4 ? 1e-6 : 1e-1; const std::int64_t n = data_host.get_dimension(0); const std::int64_t p = data_host.get_dimension(1); + const std::int64_t dim = params_host.get_dimension(0); auto data_gpu = data_host.to_device(this->get_queue()); auto labels_gpu = labels_host.to_device(this->get_queue()); @@ -132,8 +151,8 @@ class logloss_test : public te::float_algo_fixture { fit_intercept, {}); p_event.wait_and_throw(); - auto predictions_host = out_predictions.to_host(this->get_queue(), {}); + const float_t logloss = test_predictions_and_logloss(data_host, params_host, labels_host, @@ -147,57 +166,75 @@ class logloss_test : public te::float_algo_fixture { auto [out_logloss, out_e] = ndarray::zeros(this->get_queue(), { 1 }, sycl::usm::alloc::device); sycl::event logloss_event = compute_logloss(this->get_queue(), - params_gpu, - data_gpu, labels_gpu, + out_predictions, out_logloss, - L1, - L2, fit_intercept, { out_e }); - logloss_event.wait_and_throw(); + sycl::event logloss_reg_event = add_regularization_loss(this->get_queue(), + params_gpu, + out_logloss, + L1, + L2, + fit_intercept, + { logloss_event }); + logloss_reg_event.wait_and_throw(); const float_t val_logloss1 = out_logloss.to_host(this->get_queue(), {}).at(0); + check_val(val_logloss1, logloss, rtol, atol); + auto fill_event = fill(this->get_queue(), out_logloss, float_t(0), {}); auto [out_derivative, out_der_e] = - ndarray::zeros(this->get_queue(), { p + 1 }, sycl::usm::alloc::device); + ndarray::zeros(this->get_queue(), { dim }, sycl::usm::alloc::device); auto logloss_event_der = compute_logloss_with_der(this->get_queue(), - params_gpu, data_gpu, labels_gpu, out_predictions, out_logloss, out_derivative, - L1, - L2, fit_intercept, { fill_event, out_der_e }); - logloss_event_der.wait_and_throw(); + auto regul_logloss_and_der_event = add_regularization_gradient_loss(this->get_queue(), + params_gpu, + out_logloss, + out_derivative, + L1, + L2, + fit_intercept, + { logloss_event_der }); + regul_logloss_and_der_event.wait_and_throw(); auto out_derivative_host = out_derivative.to_host(this->get_queue()); + const float_t val_logloss2 = out_logloss.to_host(this->get_queue(), {}).at(0); + check_val(val_logloss2, logloss, rtol, atol); auto [out_derivative2, out_der_e2] = - ndarray::zeros(this->get_queue(), { p + 1 }, sycl::usm::alloc::device); + ndarray::zeros(this->get_queue(), { dim }, sycl::usm::alloc::device); auto der_event = compute_derivative(this->get_queue(), - params_gpu, data_gpu, labels_gpu, out_predictions, out_derivative2, - L1, - L2, fit_intercept, { out_der_e2 }); - der_event.wait_and_throw(); + auto der_reg_event = add_regularization_gradient(this->get_queue(), + params_gpu, + out_derivative2, + L1, + L2, + fit_intercept, + { der_event }); + + der_reg_event.wait_and_throw(); auto out_derivative_host2 = out_derivative2.to_host(this->get_queue()); - for (auto i = 0; i <= p; ++i) { + + for (std::int64_t i = 0; i < dim; ++i) { REQUIRE(abs(out_derivative_host.at(i) - out_derivative_host2.at(i)) < atol); } auto [out_hessian, out_hess_e] = ndarray::zeros(this->get_queue(), { p + 1, p + 1 }, sycl::usm::alloc::device); auto hess_event = compute_hessian(this->get_queue(), - params_gpu, data_gpu, labels_gpu, out_predictions, @@ -209,16 +246,6 @@ class logloss_test : public te::float_algo_fixture { auto hessian_host = out_hessian.to_host(this->get_queue(), { hess_event }); - auto out_raw_hessian = - ndarray::empty(this->get_queue(), { n }, sycl::usm::alloc::device); - - auto hessp = logloss_hessian_product(this->get_queue(), data_gpu, L2, fit_intercept); - - auto raw_hess_event = - compute_raw_hessian(this->get_queue(), out_predictions, hessp.get_raw_hessian(), {}); - - raw_hess_event.wait_and_throw(); - test_formula_derivative(data_host, predictions_host, params_host, @@ -228,26 +255,43 @@ class logloss_test : public te::float_algo_fixture { L2, fit_intercept, rtol, - atol2); + atol); + test_formula_hessian(data_host, predictions_host, hessian_host, L2, fit_intercept, rtol, - atol2); - test_derivative_and_hessian(data_gpu, - labels_gpu, - out_derivative_host, - hessian_host, - params_host, - L1, - L2, - fit_intercept, - rtol, - atol); - - test_hessian_product(hessian_host, hessp, fit_intercept, L2, rtol, atol); + atol); + + if (L1 == 0) { + std::int64_t bsz = -1; + if (batch_test) { + bsz = GENERATE(4, 8, 16, 20, 37, 512); + } + + // LogLossFunction has different regularization so we need to multiply it by 2 to allign with other implementations + auto functor = LogLossFunction(this->get_queue(), + data_, + labels_gpu, + L2 * 2, + fit_intercept, + bsz); + auto set_point_event = functor.update_x(params_gpu, true, {}); + wait_or_pass(set_point_event).wait_and_throw(); + + check_val(logloss, functor.get_value(), rtol, atol); + auto grad_func = functor.get_gradient(); + auto grad_func_host = grad_func.to_host(this->get_queue()); + + std::int64_t dim = fit_intercept ? p + 1 : p; + for (std::int64_t i = 0; i < dim; ++i) { + check_val(out_derivative_host.at(i), grad_func_host.at(i), rtol, atol); + } + BaseMatrixOperator& hessp = functor.get_hessian_product(); + test_hessian_product(hessian_host, hessp, fit_intercept, L2, rtol, atol); + } } float_t test_predictions_and_logloss(const ndview& data_host, @@ -261,12 +305,13 @@ class logloss_test : public te::float_algo_fixture { const float_t atol = 1e-3) { const std::int64_t n = data_host.get_dimension(0); const std::int64_t p = data_host.get_dimension(1); - + const std::int64_t start_ind = fit_intercept ? 1 : 0; float_t logloss = 0; for (std::int64_t i = 0; i < n; ++i) { float_t pred = 0; + for (std::int64_t j = 0; j < p; ++j) { - pred += params_host.at(j + 1) * data_host.at(i, j); + pred += params_host.at(j + start_ind) * data_host.at(i, j); } if (fit_intercept) { pred += params_host.at(0); @@ -277,9 +322,10 @@ class logloss_test : public te::float_algo_fixture { float_t out_val = probabilities.at(i); REQUIRE(abs(out_val - prob) < atol); } - for (std::int64_t i = 1; i < p + 1; ++i) { - logloss += L1 * abs(params_host.at(i)); - logloss += L2 * params_host.at(i) * params_host.at(i); + for (std::int64_t i = 0; i < p; ++i) { + float_t param = params_host.at(i + start_ind); + logloss += L1 * abs(param); + logloss += L2 * param * param; } return logloss; } @@ -320,21 +366,23 @@ class logloss_test : public te::float_algo_fixture { float_t L2, bool fit_intercept) { const std::int64_t n = data.get_dimension(0); - const std::int64_t p = data.get_dimension(1); - const std::int64_t start_ind = (fit_intercept ? 0 : 1); - for (std::int64_t j = start_ind; j <= p; ++j) { + const std::int64_t dim = params.get_dimension(0); + for (std::int64_t j = 0; j < dim; ++j) { double val = 0; for (std::int64_t i = 0; i < n; ++i) { - double x1 = j > 0 ? data.at(i, j - 1) : 1; + double x1; + if (fit_intercept) { + x1 = j > 0 ? data.at(i, j - 1) : 1; + } + else { + x1 = data.at(i, j); + } double prob = probabilities.at(i); val += (prob - labels.at(i)) * x1; } - val += j > 0 ? L2 * 2 * params.at(j) : 0; + val += (!fit_intercept || 0 < j) ? L2 * 2 * params.at(j) : 0; out_der.at(j) = val; } - if (!fit_intercept) { - out_der.at(0) = 0; - } } void naive_hessian(const ndview& data_host, @@ -378,9 +426,9 @@ class logloss_test : public te::float_algo_fixture { bool fit_intercept, const float_t rtol = 1e-3, const float_t atol = 1e-3) { - const std::int64_t p = data.get_dimension(1); + const std::int64_t dim = params.get_dimension(0); auto out_derivative = - ndarray::empty(this->get_queue(), { p + 1 }, sycl::usm::alloc::host); + ndarray::empty(this->get_queue(), { dim }, sycl::usm::alloc::host); naive_derivative(data, probabilities, @@ -391,7 +439,7 @@ class logloss_test : public te::float_algo_fixture { L2, fit_intercept); - for (std::int64_t i = 0; i < p + 1; ++i) { + for (std::int64_t i = 0; i < dim; ++i) { check_val(out_derivative.at(i), derivative.at(i), rtol, atol); } } @@ -416,149 +464,31 @@ class logloss_test : public te::float_algo_fixture { } } - void test_derivative_and_hessian(const ndview& data, - const ndview& labels, - const ndview& derivative, - const ndview& hessian, - const ndview& params_host, - const float_t L1, - const float_t L2, - bool fit_intercept, - const float_t rtol = 1e-3, - const float_t atol = 1e-3) { - const std::int64_t n = data.get_dimension(0); - const std::int64_t p = data.get_dimension(1); - constexpr std::int64_t max_n = 2000; - constexpr float_t step = sizeof(float_t) > 4 ? 1e-4 : 1e-3; - - const auto data_host = data.to_host(this->get_queue()); - const auto labels_host = labels.to_host(this->get_queue()); - - std::array cur_param; - for (std::int64_t i = 0; i < p + 1; ++i) { - cur_param[i] = params_host.at(i); - } - - auto out_logloss = - ndarray::empty(this->get_queue(), { 1 }, sycl::usm::alloc::device); - auto out_predictions = - ndarray::empty(this->get_queue(), { n }, sycl::usm::alloc::device); - auto out_derivative_up = - ndarray::empty(this->get_queue(), { p + 1 }, sycl::usm::alloc::device); - auto out_derivative_down = - ndarray::empty(this->get_queue(), { p + 1 }, sycl::usm::alloc::device); - - std::int64_t start_ind = fit_intercept ? 0 : 1; - - for (std::int64_t i = start_ind; i < p + 1; ++i) { - auto fill_event_1 = fill(this->get_queue(), out_logloss, float_t(0), {}); - auto fill_event_2 = fill(this->get_queue(), out_derivative_up, float_t(0), {}); - auto fill_event_3 = - fill(this->get_queue(), out_derivative_down, float_t(0), {}); - - cur_param[i] = params_host.at(i) + step; - auto params_host_up = ndarray::wrap(cur_param.begin(), p + 1); - auto params_gpu_up = params_host_up.to_device(this->get_queue()); - - // Compute logloss and derivative with params [w0, w1, ... w_i + eps, ...., w_p] - - sycl::event pred_up_event = compute_probabilities(this->get_queue(), - params_gpu_up, - data, - out_predictions, - fit_intercept, - {}); - sycl::event der_event_up = - compute_logloss_with_der(this->get_queue(), - params_gpu_up, - data, - labels, - out_predictions, - out_logloss, - out_derivative_up, - L1, - L2, - fit_intercept, - { fill_event_1, fill_event_2, pred_up_event }); - der_event_up.wait_and_throw(); - double logloss_up = - naive_logloss(data_host, params_host_up, labels_host, L1, L2, fit_intercept); - auto der_up_host = out_derivative_up.to_host(this->get_queue(), {}); - - cur_param[i] = params_host.at(i) - step; - - auto params_host_down = ndarray::wrap(cur_param.begin(), p + 1); - auto params_gpu_down = params_host_down.to_device(this->get_queue()); - auto fill_event_4 = fill(this->get_queue(), out_logloss, float_t(0), {}); - - // Compute logloss and derivative with params [w0, w1, ... w_i - eps, ...., w_p] - - sycl::event pred_down_event = compute_probabilities(this->get_queue(), - params_gpu_down, - data, - out_predictions, - fit_intercept, - {}); - sycl::event der_event_down = - compute_logloss_with_der(this->get_queue(), - params_gpu_down, - data, - labels, - out_predictions, - out_logloss, - out_derivative_down, - L1, - L2, - fit_intercept, - { fill_event_3, fill_event_4, pred_down_event }); - der_event_down.wait_and_throw(); - - double logloss_down = - naive_logloss(data_host, params_host_down, labels_host, L1, L2, fit_intercept); - auto der_down_host = out_derivative_down.to_host(this->get_queue(), {}); - // Check condition: (logloss(w_i + eps) - logloss(w_i - eps)) / 2eps ~ d logloss / dw_i - if (L1 == 0) { - check_val(derivative.at(i), (logloss_up - logloss_down) / (2 * step), rtol, atol); - } - if (sizeof(float_t) > 4) { - for (std::int64_t j = 0; j < p + 1; ++j) { - // Check condition (d logloss(w_i + eps) / d w_j - d logloss(w_i - eps) / d w_j) / 2eps ~ h_i,j - // due to lack of precision this condition is not checked for 32-bit floating point numbers - check_val(hessian.at(i, j), - (der_up_host.at(j) - der_down_host.at(j)) / (2 * step), - rtol, - atol); - } - } - cur_param[i] += step; - } - } - void test_hessian_product(const ndview& hessian_host, - logloss_hessian_product& hessp, + BaseMatrixOperator& hessp, bool fit_intercept, double L2, const float_t rtol = 1e-3, const float_t atol = 1e-3, std::int32_t num_checks = 5) { const std::int64_t p = hessian_host.get_dimension(0) - 1; - const std::int64_t k = fit_intercept ? p + 1 : p; + const std::int64_t dim = fit_intercept ? p + 1 : p; primitives::rng rn_gen; auto vec_host = - ndarray::empty(this->get_queue(), { k }, sycl::usm::alloc::host); + ndarray::empty(this->get_queue(), { dim }, sycl::usm::alloc::host); for (std::int32_t ij = 0; ij < num_checks; ++ij) { - primitives::engine eng(2007 + k * num_checks + ij); - rn_gen.uniform(k, vec_host.get_mutable_data(), eng.get_state(), -1.0, 1.0); + primitives::engine eng(2007 + dim * num_checks + ij); + rn_gen.uniform(dim, vec_host.get_mutable_data(), eng.get_state(), -1.0, 1.0); auto vec_gpu = vec_host.to_device(this->get_queue()); auto out_vector = - ndarray::empty(this->get_queue(), { k }, sycl::usm::alloc::device); + ndarray::empty(this->get_queue(), { dim }, sycl::usm::alloc::device); hessp(vec_gpu, out_vector, {}).wait_and_throw(); auto out_vector_host = out_vector.to_host(this->get_queue()); - const std::int64_t st = fit_intercept ? 0 : 1; + for (std::int64_t i = st; i < p + 1; ++i) { float_t correct = 0; for (std::int64_t j = st; j < p + 1; ++j) { @@ -606,6 +536,30 @@ TEMPLATE_TEST_M(logloss_test, "test random input - double without L1", "[logloss this->run_test(0.0, 1.3); } +TEMPLATE_TEST_M(logloss_test, + "test random input - double without L1 - no fit intercept", + "[logloss]", + double) { + SKIP_IF(this->not_float64_friendly()); + SKIP_IF(this->get_policy().is_cpu()); + this->generate_input(); + this->run_test(0.0, 1.3, false); +} + +TEMPLATE_TEST_M(logloss_test, "batch test - double", "[logloss]", double) { + SKIP_IF(this->not_float64_friendly()); + SKIP_IF(this->get_policy().is_cpu()); + this->generate_input(); + this->run_test(0.0, 1.3, true, true); +} + +TEMPLATE_TEST_M(logloss_test, "batch test - double - no fit intercept", "[logloss]", double) { + SKIP_IF(this->not_float64_friendly()); + SKIP_IF(this->get_policy().is_cpu()); + this->generate_input(); + this->run_test(0.0, 1.3, false, true); +} + TEMPLATE_TEST_M(logloss_test, "test random input - double with L1", "[logloss]", double) { SKIP_IF(this->not_float64_friendly()); SKIP_IF(this->get_policy().is_cpu()); @@ -613,10 +567,26 @@ TEMPLATE_TEST_M(logloss_test, "test random input - double with L1", "[logloss]", this->run_test(0.4, 1.3); } +TEMPLATE_TEST_M(logloss_test, + "test random input - double with L1 -- no fit intercept", + "[logloss]", + double) { + SKIP_IF(this->not_float64_friendly()); + SKIP_IF(this->get_policy().is_cpu()); + this->generate_input(); + this->run_test(0.4, 1.3, false); +} + TEMPLATE_TEST_M(logloss_test, "test random input - float", "[logloss]", float) { SKIP_IF(this->get_policy().is_cpu()); this->generate_input(); this->run_test(0.4, 1.3); } +TEMPLATE_TEST_M(logloss_test, "test random input - float - no fit intercept", "[logloss]", float) { + SKIP_IF(this->get_policy().is_cpu()); + this->generate_input(); + this->run_test(0.4, 1.3, false); +} + } // namespace oneapi::dal::backend::primitives::test diff --git a/cpp/oneapi/dal/backend/primitives/objective_function/test/logloss_perf_dpc.cpp b/cpp/oneapi/dal/backend/primitives/objective_function/test/logloss_perf_dpc.cpp index c7f1e954daf..cdb3b7ddd5c 100644 --- a/cpp/oneapi/dal/backend/primitives/objective_function/test/logloss_perf_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/objective_function/test/logloss_perf_dpc.cpp @@ -75,8 +75,12 @@ class logloss_perf_test : public te::float_algo_fixture { auto out_predictions = ndarray::empty(this->get_queue(), { n_ }, sycl::usm::alloc::device); - auto p_event = - compute_probabilities(this->get_queue(), params_gpu, data_gpu, out_predictions, {}); + auto p_event = compute_probabilities(this->get_queue(), + params_gpu, + data_gpu, + out_predictions, + true, + {}); p_event.wait_and_throw(); auto out_logloss = @@ -84,24 +88,29 @@ class logloss_perf_test : public te::float_algo_fixture { auto out_derivative = ndarray::empty(this->get_queue(), { p_ + 1 }, sycl::usm::alloc::device); - BENCHMARK("Derivative computation") { auto fill_event1 = fill(this->get_queue(), out_logloss, float_t(0), {}); auto fill_event2 = fill(this->get_queue(), out_derivative, float_t(0), {}); auto logloss_event_der = compute_logloss_with_der(this->get_queue(), - params_gpu, data_gpu, labels_gpu, out_predictions, out_logloss, out_derivative, - L1, - L2, + true, { fill_event1, fill_event2 }); - logloss_event_der.wait_and_throw(); + auto logloss_event_reg = add_regularization_gradient_loss(this->get_queue(), + params_gpu, + out_logloss, + out_derivative, + L1, + L2, + true, + { logloss_event_der }); + + logloss_event_reg.wait_and_throw(); }; - auto out_hessian = ndarray::empty(this->get_queue(), { p_ + 1, p_ + 1 }, sycl::usm::alloc::device); @@ -109,13 +118,13 @@ class logloss_perf_test : public te::float_algo_fixture { BENCHMARK("Hessian computation") { auto fill_event = fill(this->get_queue(), out_hessian, float_t(0), {}); auto hess_event = compute_hessian(this->get_queue(), - params_gpu, data_gpu, labels_gpu, out_predictions, out_hessian, L1, L2, + /*fit_intercept=*/true, { fill_event }); hess_event.wait_and_throw(); }; diff --git a/cpp/oneapi/dal/backend/primitives/optimizers.hpp b/cpp/oneapi/dal/backend/primitives/optimizers.hpp index 0463f5ce4a7..570a6e41326 100644 --- a/cpp/oneapi/dal/backend/primitives/optimizers.hpp +++ b/cpp/oneapi/dal/backend/primitives/optimizers.hpp @@ -17,3 +17,5 @@ #pragma once #include "oneapi/dal/backend/primitives/newton_cg/cg_solver.hpp" +#include "oneapi/dal/backend/primitives/newton_cg/newton_cg.hpp" +#include "oneapi/dal/backend/primitives/newton_cg/line_search.hpp" diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/cg_solver_dpc.cpp b/cpp/oneapi/dal/backend/primitives/optimizers/cg_solver_dpc.cpp index a2409c42e33..c9e996709e1 100644 --- a/cpp/oneapi/dal/backend/primitives/optimizers/cg_solver_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/optimizers/cg_solver_dpc.cpp @@ -76,13 +76,14 @@ sycl::event cg_solve(sycl::queue& queue, atol = std::max(atol, min_eps); Float threshold = std::max(tol * b_norm, atol); - const auto init_conj_kernel = [=](const Float residual_val, Float*) -> Float { + const auto init_conj_kernel = [=](const Float residual_val, Float) -> Float { return -residual_val; }; + auto compute_conj_event = element_wise(queue, init_conj_kernel, residual, - nullptr, + Float(0), conj_vector, { compute_r0_event }); // p0 = -r0 + 0 * p auto conj_host = conj_vector.to_host(queue, {}); diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/newton_cg_dpc.cpp b/cpp/oneapi/dal/backend/primitives/optimizers/newton_cg_dpc.cpp index 6f486cf0008..4e4c94c6ca5 100644 --- a/cpp/oneapi/dal/backend/primitives/optimizers/newton_cg_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/optimizers/newton_cg_dpc.cpp @@ -34,7 +34,7 @@ sycl::event newton_cg(sycl::queue& queue, const event_vector& deps) { std::int64_t n = x.get_dimension(0); - const auto kernel_minus = [=](const Float& val, Float*) -> Float { + const auto kernel_minus = [=](const Float val, Float) -> Float { return -val; }; auto buffer = ndarray::empty(queue, { 4 * n + 1 }, sycl::usm::alloc::device); @@ -63,7 +63,7 @@ sycl::event newton_cg(sycl::queue& queue, Float tol_k = std::min(sqrt(grad_norm), 0.5); auto prepare_grad_event = - element_wise(queue, kernel_minus, gradient, nullptr, gradient, update_event_vec); + element_wise(queue, kernel_minus, gradient, Float(0), gradient, update_event_vec); auto copy_event = copy(queue, direction, gradient, { prepare_grad_event }); diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp b/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp index a86a2636790..89501ce53ef 100644 --- a/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp @@ -94,7 +94,7 @@ class newton_cg_test : public te::float_algo_fixture { for (std::int64_t i = 0; i < n_; ++i) { val_gth -= b_host.at(i) * x_host.at(i); } - check_val(val_gth, val, float_t(1e-5), float_t(1e-5)); + check_val(val_gth, val, float_t(5e-5), float_t(5e-5)); } } diff --git a/cpp/oneapi/dal/detail/error_messages.hpp b/cpp/oneapi/dal/detail/error_messages.hpp index aa8bfb6377c..462e146e477 100644 --- a/cpp/oneapi/dal/detail/error_messages.hpp +++ b/cpp/oneapi/dal/detail/error_messages.hpp @@ -51,7 +51,7 @@ namespace v1 { // - If a new mathematical operator introduced, stick to LaTeX // conventions as well. // -// - For oneDAL specific terms use the following shortening: +// - For Intel(R) oneAPI Data Analytics Library (oneDAL) specific terms use the following shortening: // // row count rc // column count cc diff --git a/cpp/oneapi/dal/detail/mpi/communicator.hpp b/cpp/oneapi/dal/detail/mpi/communicator.hpp index 9f14be97a0e..cb406b289a6 100644 --- a/cpp/oneapi/dal/detail/mpi/communicator.hpp +++ b/cpp/oneapi/dal/detail/mpi/communicator.hpp @@ -229,7 +229,7 @@ class mpi_communicator_impl : public via_host_interface_selector