From fc97a570d31567b847f5d0742870f24ad74e0560 Mon Sep 17 00:00:00 2001 From: jackzipu <74961298+jackzipu@users.noreply.github.com> Date: Thu, 20 Jan 2022 14:28:16 +0800 Subject: [PATCH] Add the test for the cache configuration (#730) * Add information to check the cache sdk version and the working sdk version * Set the log level to info for test * Add environment variable for error injection test * format the code for lint check * Add information to show the ENGION OPTIONS set * Make it recompile when cache version mismatch * Remote the environment for error simulator to make it set by environment variable * Add error injector for temporarily test * Fix the problem when there's not the error injection config file * Move the error injector to the odla_CreateComputation to make sure it will be called for each inference request * Add try catch to computation create and init * make the error handling code at one place * Use config file setting for batch per step when it's inconsistent with ODLA set * set the log level when error injector used * pass the error code to caller * Make sdk version mismatch cache recompile when error injector set * Make the popart log level take POPART_LOG_LEVEL firstly * change format to pass lint Co-authored-by: tianboh <54729592+tianboh@users.noreply.github.com> --- ODLA/platforms/odla_popart/odla_compute.cc | 31 ++- ODLA/platforms/odla_popart/odla_popart.cc | 276 +++++++++----------- ODLA/platforms/odla_popart/popart_config.cc | 38 +++ ODLA/platforms/odla_popart/popart_config.h | 6 +- 4 files changed, 185 insertions(+), 166 deletions(-) diff --git a/ODLA/platforms/odla_popart/odla_compute.cc b/ODLA/platforms/odla_popart/odla_compute.cc index 2ab4db99e..6bb0f921c 100644 --- a/ODLA/platforms/odla_popart/odla_compute.cc +++ b/ODLA/platforms/odla_popart/odla_compute.cc @@ -63,15 +63,16 @@ odla_status odla_SetComputationItem(odla_computation comp, odla_item_type type, comp->opts.cache_dir = (reinterpret_cast(value)); break; case 1001: // load cache directly, need set path of cache file + { popart::logging::info("set load_or_save_cache"); PopartConfig::instance()->set_load_or_save_cache(true); PopartConfig::instance()->set_cache_path( (std::string) reinterpret_cast(value)); - break; + } break; case 1002: setenv("POPART_LOG_LEVEL", "INFO", 1); default: - std::cerr << "Unsupported property type: " << type << std::endl; + popart::logging::err("Unsupported property type: {}", type); return ODLA_UNSUPPORTED_DATATYPE; } return ODLA_SUCCESS; @@ -96,7 +97,7 @@ odla_status odla_CreateExecutable(odla_executable* executable, // create executable if (ret != ODLA_SUCCESS) { popart::logging::err("Failed to init computation when compiling."); - return ODLA_FAILURE; + return ret; } _odla_computation::instance()->compile_and_export(); } @@ -117,6 +118,28 @@ odla_status odla_LoadExecutable(const odla_char* file_name, odla_device device, } odla_status odla_CreateComputation(odla_computation* comp) { + auto injector = PopartConfig::instance()->temp_get_error_inject_env(); + if (injector.empty()) { + popart::logging::warn("NO VALUE SET for error injector"); + if (nullptr == getenv("POPART_LOG_LEVEL")) { // if POPART_LOG_LEVEL not set + popart::logging::warn("popart logging level set to warn."); + popart::logging::setLogLevel(popart::logging::Module::popart, + popart::logging::Level::Warn); + } + } else { + setenv("POPLAR_ENGINE_OPTIONS", injector.c_str(), 1); + if (nullptr == getenv("POPART_LOG_LEVEL")) { // if POPART_LOG_LEVEL not set + popart::logging::warn("popart logging level set to info."); + popart::logging::setLogLevel(popart::logging::Module::popart, + popart::logging::Level::Info); + } + } + if (nullptr != getenv("POPLAR_ENGINE_OPTIONS")) + popart::logging::info("The env POPLAR_ENGINE_OPTIONS value is: {}", + getenv("POPLAR_ENGINE_OPTIONS")); + else + popart::logging::info("The env POPLAR_ENGINE_OPTIONS value is not set"); + static void* custom_op_handle = nullptr; *comp = _odla_computation::instance(); popart::logging::info("computation created"); @@ -159,7 +182,7 @@ odla_status odla_CreateContext(odla_context* context) { if (status != ODLA_SUCCESS && _odla_computation::instance()->session == nullptr) { popart::logging::err("init computation item in CreateContext failed."); - return ODLA_FAILURE; + return status; } *context = new _odla_pipeline_context(_odla_computation::instance()); return ODLA_SUCCESS; diff --git a/ODLA/platforms/odla_popart/odla_popart.cc b/ODLA/platforms/odla_popart/odla_popart.cc index 6c3a728dd..f2f2cec6c 100644 --- a/ODLA/platforms/odla_popart/odla_popart.cc +++ b/ODLA/platforms/odla_popart/odla_popart.cc @@ -36,62 +36,83 @@ _odla_computation* _odla_computation::instance_ = nullptr; std::mutex _odla_computation::comp_mutex_; +#define POPLAR_TRY try { +#define POPLAR_CATCH \ + } \ + catch (poplar::application_runtime_error & e) { \ + popart::logging::err( \ + "Poplar exception application_runtime_error caught:"); \ + RETURN_ERROR(ODLA_INTERNAL_LOGIC_ERR) \ + } \ + catch (poplar::recoverable_runtime_error & e) { \ + popart::logging::err("Poplar recoverable_runtime_error exception caught"); \ + auto action = e.getRecoveryAction(); \ + popart::logging::err("need to take action:{}", action); \ + if (action == poplar::RecoveryAction::IPU_RESET) { \ + RETURN_ERROR(ODLA_RECOVERABLE_ERR) \ + } else if (action == poplar::RecoveryAction::PARTITION_RESET) { \ + RETURN_ERROR(ODLA_PARTITION_RESET) \ + } else if (action == poplar::RecoveryAction::FULL_RESET) { \ + RETURN_ERROR(ODLA_FULL_RESET) \ + } \ + } \ + catch (poplar::unrecoverable_runtime_error & e) { \ + popart::logging::err( \ + "Poplar unrecoverable_runtime_error exception caught"); \ + RETURN_ERROR(ODLA_UNRECOVERABLE_ERR) \ + } \ + catch (poplar::unknown_runtime_error & e) { \ + popart::logging::err("Poplar unknown runtime exception caught"); \ + RETURN_ERROR(ODLA_UNRECOVERABLE_ERR) \ + } \ + catch (std::exception & e) { \ + popart::logging::err("std::exception gotten: {}", e.what()); \ + RETURN_ERROR(ODLA_UNRECOVERABLE_ERR) \ + } \ + catch (...) { \ + popart::logging::err("Poplar unknown exception caught"); \ + RETURN_ERROR(ODLA_UNRECOVERABLE_ERR) \ + } + +#define RETURN_ERROR(ERR_CODE) (QManager::instance()->set_status(ERR_CODE)); + void compute_loop(odla_computation comp) { // setup the stepio with allbacks popart::StepIOCallback stepio(input_callback, input_complete_callback, output_callback, output_complete_callback); int i = 0; - try { - while (!comp->is_done()) { - auto start = std::chrono::steady_clock::now(); - popart::logging::info("This is the {} time for the inference", i++); - if (i == INT_MAX) i = 0; - comp->session->run(stepio); - auto end = std::chrono::steady_clock::now(); - std::chrono::duration elapsed_seconds = end - start; - popart::logging::warn( - "[ {} ] ONE_STEP takes {} s. Check whether more inference tasks " - "wating.", - i, elapsed_seconds.count()); - // Make wait on CPU if there's not inference task - start = std::chrono::steady_clock::now(); - while (!comp->is_done() && QManager::instance()->getQ()->size() == 0) - std::this_thread::sleep_for(std::chrono::milliseconds(1)); - end = std::chrono::steady_clock::now(); - std::chrono::duration elapsed_ms = end - start; - popart::logging::warn("Found new tasks in {} ms.", elapsed_ms.count()); - } - } catch (poplar::application_runtime_error& e) { - popart::logging::err("Poplar exception application_runtime_error caught:"); - QManager::instance()->set_status(ODLA_INTERNAL_LOGIC_ERR); - } catch (poplar::recoverable_runtime_error& e) { - popart::logging::err("Poplar recoverable_runtime_error exception caught"); - auto action = e.getRecoveryAction(); - popart::logging::err("need to take action:{}", action); - if (action == poplar::RecoveryAction::IPU_RESET) { - QManager::instance()->set_status(ODLA_RECOVERABLE_ERR); - } else if (action == poplar::RecoveryAction::PARTITION_RESET) { - QManager::instance()->set_status(ODLA_PARTITION_RESET); - } else if (action == poplar::RecoveryAction::FULL_RESET) { - QManager::instance()->set_status(ODLA_FULL_RESET); - } - } catch (poplar::unrecoverable_runtime_error& e) { - popart::logging::err("Poplar unrecoverable_runtime_error exception caught"); - QManager::instance()->set_status(ODLA_UNRECOVERABLE_ERR); - } catch (poplar::unknown_runtime_error& e) { - popart::logging::err("Poplar unknown runtime exception caught"); - QManager::instance()->set_status(ODLA_UNRECOVERABLE_ERR); - } catch (...) { - popart::logging::err("Poplar unknown exception caught"); - QManager::instance()->set_status(ODLA_UNRECOVERABLE_ERR); + POPLAR_TRY + while (!comp->is_done()) { + auto start = std::chrono::steady_clock::now(); + popart::logging::info("This is the {} time for the inference", i++); + if (i == INT_MAX) i = 0; + comp->session->run(stepio); + auto end = std::chrono::steady_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + popart::logging::warn( + "[ {} ] ONE_STEP takes {} s. Check whether more inference tasks " + "wating.", + i, elapsed_seconds.count()); + // Make wait on CPU if there's not inference task + start = std::chrono::steady_clock::now(); + while (!comp->is_done() && QManager::instance()->getQ()->size() == 0) + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + end = std::chrono::steady_clock::now(); + std::chrono::duration elapsed_ms = end - start; + popart::logging::warn("Found new tasks in {} ms.", elapsed_ms.count()); } + POPLAR_CATCH popart::logging::info("The pipeline loop finished"); comp->thread_done(); } +#undef RETURN_ERROR +#define RETURN_ERROR(ERR_CODE) return ERR_CODE; + odla_status _odla_computation::compile_and_export() { odla_status ret_value = ODLA_SUCCESS; + POPLAR_TRY popart::logging::warn("Start compile and export"); const std::string& cache_file_name = PopartConfig::instance()->get_cache_path(); @@ -135,11 +156,17 @@ odla_status _odla_computation::compile_and_export() { } // add sdk_version in the file content std::string version_string(popart::core::packageHash()); - popart::logging::info("the popart version is: {}", version_string); + popart::logging::info("the popart package hash is: {}", version_string); if (config_string.find("sdk_version") == std::string::npos) { std::string item_string = "\n\"sdk_version\":\"" + version_string + "\","; config_string.insert(1, item_string); } + // change all true to false + std::string src_replace("true"); + std::string dest_replace("false"); + std::string::size_type pos = 0; + while ((pos = config_string.find(src_replace)) != std::string::npos) + config_string.replace(pos, src_replace.length(), dest_replace); popart::logging::info("the config_string with sdk_version is: {}", config_string); // added the sdk_version information to the file content @@ -147,19 +174,11 @@ odla_status _odla_computation::compile_and_export() { cache_fs.write((char*)&config_size, sizeof(config_size)); cache_fs.write(config_string.c_str(), config_string.size()); - try { - _odla_computation::instance()->session->compileAndExport(cache_fs.flush()); - } catch (std::exception& e) { - popart::logging::err("compileAndExport Falied: {}", e.what()); - ret_value = ODLA_FAILURE; - } catch (...) { - popart::logging::err("compileAndExport Falied"); - ret_value = ODLA_FAILURE; - } + _odla_computation::instance()->session->compileAndExport(cache_fs.flush()); cache_fs.flush(); cache_fs.close(); config_fs.close(); - + POPLAR_CATCH return ret_value; } @@ -167,6 +186,7 @@ odla_status _odla_computation::init(bool is_compile) { if (!session) { std::lock_guard guard(init_mutex_); if (!session) { + POPLAR_TRY odla_status status = set_opts(); if (status != ODLA_SUCCESS) { popart::logging::err("set computation option failed"); @@ -195,16 +215,7 @@ odla_status _odla_computation::init(bool is_compile) { // Create and config SessionOptions set_session_opts(); if (use_pipeline()) { - try { - builder = popart::Builder::createFromOnnxModel(set_pipeline_stage()); - } catch (std::exception& e) { - popart::logging::err("create builder from onnx model failed:{}", - e.what()); - return ODLA_FAILURE; - } catch (...) { - popart::logging::err("create builder from onnx model failed."); - return ODLA_FAILURE; - } + builder = popart::Builder::createFromOnnxModel(set_pipeline_stage()); } auto proto = builder->getModelProto(); // So, the init must be called at // odla_ExecuteCompute @@ -222,78 +233,47 @@ odla_status _odla_computation::init(bool is_compile) { } std::unique_ptr new_session; - try { - // Create InferenceSession - new_session = std::move(popart::InferenceSession::createFromOnnxModel( - proto, data_flow, device, popart::InputShapeInfo(), session_opts_)); - } catch (std::exception& e) { - popart::logging::err("Session::createFromOnnxModel failed:{}", - e.what()); - return ODLA_FAILURE; - } catch (...) { - popart::logging::err("Session::createFromOnnxModel failed"); - return ODLA_FAILURE; - } + // Create InferenceSession + new_session = std::move(popart::InferenceSession::createFromOnnxModel( + proto, data_flow, device, popart::InputShapeInfo(), session_opts_)); if (!is_compile) { if (PopartConfig::instance()->load_or_save_cache()) { popart::logging::info("Load cachefile from existing stream"); std::string version_string(popart::core::packageHash()); if (!PopartConfig::instance()->sdk_version_match(version_string)) { - popart::logging::err("The sdk version of cache does not match {}", - version_string); - return ODLA_FAILURE; - } - auto cache_fs = PopartConfig::instance()->get_cache_fs(); - if (cache_fs->is_open()) { - try { - cache_fs->seekg(0, std::ios::beg); - int config_len = 0; - cache_fs->read((char*)&config_len, sizeof(config_len)); - cache_fs->seekg(config_len + sizeof(config_len), std::ios::beg); - new_session->loadExecutableFromStream(*(cache_fs.get())); - } catch (std::exception& e) { - popart::logging::err("bad cache file, will compile the graph:{}", - e.what()); - return ODLA_FAILURE; - } catch (...) { - popart::logging::err("bad cache file, will compile the graph"); + popart::logging::err( + "The sdk version of cache does not match popart hash {}", + version_string); + auto injector = + PopartConfig::instance()->temp_get_error_inject_env(); + if (injector.empty()) return ODLA_FAILURE; + else + popart::logging::warn("Error injector set, will compile"); + } else { + auto cache_fs = PopartConfig::instance()->get_cache_fs(); + if (cache_fs->is_open()) { + try { + cache_fs->seekg(0, std::ios::beg); + int config_len = 0; + cache_fs->read((char*)&config_len, sizeof(config_len)); + cache_fs->seekg(config_len + sizeof(config_len), std::ios::beg); + new_session->loadExecutableFromStream(*(cache_fs.get())); + } catch (std::exception& e) { + popart::logging::err("bad cache file: {}", e.what()); + return ODLA_FAILURE; + } catch (...) { + popart::logging::err("bad cache file"); + return ODLA_FAILURE; + } } } } - try { - new_session->prepareDevice(); - new_session->setRandomSeed(0); // Init seed - new_session->weightsFromHost(); // Copy weights from host to IPU - } catch (poplar::application_runtime_error& e) { - popart::logging::err( - "Poplar exception application_runtime_error caught:{}", e.what()); - return ODLA_INTERNAL_LOGIC_ERR; - } catch (poplar::recoverable_runtime_error& e) { - popart::logging::err( - "Poplar recoverable_runtime_error exception caught"); - auto action = e.getRecoveryAction(); - popart::logging::err("need to take action:{}", action); - if (action == poplar::RecoveryAction::IPU_RESET) { - return ODLA_RECOVERABLE_ERR; - } else if (action == poplar::RecoveryAction::PARTITION_RESET) { - return ODLA_PARTITION_RESET; - } else if (action == poplar::RecoveryAction::FULL_RESET) { - return ODLA_FULL_RESET; - } - } catch (poplar::unrecoverable_runtime_error& e) { - popart::logging::err( - "Poplar unrecoverable_runtime_error exception caught"); - return ODLA_UNRECOVERABLE_ERR; - } catch (poplar::unknown_runtime_error& e) { - popart::logging::err("Poplar unknown runtime exception caught"); - return ODLA_UNRECOVERABLE_ERR; - } catch (...) { - popart::logging::err("Poplar unknown exception caught"); - return ODLA_UNRECOVERABLE_ERR; - } + new_session->prepareDevice(); + new_session->setRandomSeed(0); // Init seed + new_session->weightsFromHost(); // Copy weights from host to IPU // If in parallel mode, start the thread ExecutionMode mode = PopartConfig::instance()->execution_mode(); if (PIPELINE == mode || PARALLEL == mode) { @@ -308,6 +288,7 @@ odla_status _odla_computation::init(bool is_compile) { session = std::move(new_session); // set session after all initialization done. + POPLAR_CATCH } } return ODLA_SUCCESS; @@ -327,11 +308,11 @@ odla_status _odla_computation::set_opts() { return ODLA_FAILURE; } if (opts.batches_per_step != PopartConfig::instance()->batches_per_step()) { - popart::logging::err( + popart::logging::warn( "batches per step in pipeline configuration:" + std::to_string(PopartConfig::instance()->batches_per_step()) + - " must same with options: " + std::to_string(opts.batches_per_step)); - return ODLA_FAILURE; + " not same with options: " + std::to_string(opts.batches_per_step)); + opts.batches_per_step = PopartConfig::instance()->batches_per_step(); } } return ODLA_SUCCESS; @@ -521,37 +502,14 @@ odla_status Sequence::compute(odla_computation comp, odla_context context, auto start = std::chrono::steady_clock::now(); popart::StepIO stepio(inputs, outputs); // Run on ipu - try { - comp->session->run(stepio); - auto end = std::chrono::steady_clock::now(); - std::chrono::duration elapsed_seconds = end - start; - popart::logging::info("[ {} ] [Sequence::compute] takes {} s.", i++, - elapsed_seconds.count()); - popart::logging::info("<<< Sequence::compute() with ctx: {}", context); - } catch (poplar::application_runtime_error& e) { - popart::logging::err("Poplar exception application_runtime_error caught:"); - return ODLA_INTERNAL_LOGIC_ERR; - } catch (poplar::recoverable_runtime_error& e) { - popart::logging::err("Poplar recoverable_runtime_error exception caught"); - auto action = e.getRecoveryAction(); - popart::logging::err("need to take action:{}", action); - if (action == poplar::RecoveryAction::IPU_RESET) { - return ODLA_RECOVERABLE_ERR; - } else if (action == poplar::RecoveryAction::PARTITION_RESET) { - return ODLA_PARTITION_RESET; - } else if (action == poplar::RecoveryAction::FULL_RESET) { - return ODLA_FULL_RESET; - } - } catch (poplar::unrecoverable_runtime_error& e) { - popart::logging::err("Poplar unrecoverable_runtime_error exception caught"); - return ODLA_UNRECOVERABLE_ERR; - } catch (poplar::unknown_runtime_error& e) { - popart::logging::err("Poplar unknown runtime exception caught."); - return ODLA_UNRECOVERABLE_ERR; - } catch (...) { - popart::logging::err("Poplar unknown exception caught"); - return ODLA_UNRECOVERABLE_ERR; - } + POPLAR_TRY + comp->session->run(stepio); + auto end = std::chrono::steady_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + popart::logging::info("[ {} ] [Sequence::compute] takes {} s.", i++, + elapsed_seconds.count()); + popart::logging::info("<<< Sequence::compute() with ctx: {}", context); + POPLAR_CATCH return ODLA_SUCCESS; } diff --git a/ODLA/platforms/odla_popart/popart_config.cc b/ODLA/platforms/odla_popart/popart_config.cc index 4258a3efe..cab192c08 100644 --- a/ODLA/platforms/odla_popart/popart_config.cc +++ b/ODLA/platforms/odla_popart/popart_config.cc @@ -191,6 +191,37 @@ odla_status PopartConfig::load_from_string(const std::string& config_string) { return ODLA_SUCCESS; } +std::string PopartConfig::temp_get_error_inject_env( + const std::string& temp_config_path) { + using json = nlohmann::json; + std::ifstream ifs(temp_config_path, std::ios_base::in); + popart::logging::warn("get injector info from config file: {}", + temp_config_path); + if (!ifs.good()) { + popart::logging::warn("config file {} not found, no injector for this run", + temp_config_path); + return std::string(""); + } + try { + json jf = json::parse(ifs); + if (jf.contains("POPLAR_ENGINE_OPTIONS")) { + auto poplar_engine_options = + jf["POPLAR_ENGINE_OPTIONS"].get(); + popart::logging::info( + "Read the POPLAR_ENGINE_OPTIONS from file:{} with value: {}.", + temp_config_path, poplar_engine_options); + return poplar_engine_options; + } + } catch (std::exception& e) { + popart::logging::err("parse config falied:{}", e.what()); + return std::string(""); + } catch (...) { + popart::logging::err("parse config falied"); + return std::string(""); + } + return std::string(""); +} + odla_status PopartConfig::load_from_file(const std::string& file_path) { if (inited_) { return ODLA_SUCCESS; @@ -295,3 +326,10 @@ odla_status PopartConfig::extract_config_from_cache() { } return ODLA_SUCCESS; } + +bool PopartConfig::sdk_version_match(std::string& sdk_version) { + popart::logging::warn( + "sdk version in the cache file is {}, and from sdk is {}", sdk_version_, + sdk_version); + return (sdk_version_.compare(sdk_version) == 0); +} diff --git a/ODLA/platforms/odla_popart/popart_config.h b/ODLA/platforms/odla_popart/popart_config.h index 54e55fd71..a3d8c858f 100644 --- a/ODLA/platforms/odla_popart/popart_config.h +++ b/ODLA/platforms/odla_popart/popart_config.h @@ -153,15 +153,15 @@ class PopartConfig { cache_path_ = catch_path; } - bool sdk_version_match(std::string& sdk_version) { - return (sdk_version_.compare(sdk_version) == 0); - } + bool sdk_version_match(std::string& sdk_version); void parse_from_json(const json&); odla_status load_from_string(const std::string& config_string); odla_status load_config(const char* file_path); bool get_pipeline_setting(const std::string& node_name, int64_t& ipu_idx, int64_t& pipeline_stage); odla_status extract_config_from_cache(); + std::string temp_get_error_inject_env( + const std::string& temp_config_path = "/tmp/temp_error_injector.json"); private: void set_pipeline_setting(const std::string& name_pattern, int ipu_idx,