From ed67bcb24f2d6ac0072cae72620b2bd971741b98 Mon Sep 17 00:00:00 2001 From: Chen Xi Date: Thu, 25 Jul 2024 11:45:18 +0000 Subject: [PATCH] [SYCL] fix multi-gpu issue on sycl (#8554) --------- Signed-off-by: Chen Xi Co-authored-by: Meng, Hengyu --- docs/backend/SYCL.md | 36 ++++------ ggml/src/ggml-sycl/common.hpp | 2 +- ggml/src/ggml-sycl/dpct/helper.hpp | 101 +++++++++++++++++++++++++---- src/llama.cpp | 4 +- 4 files changed, 102 insertions(+), 41 deletions(-) diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md index 885983e92277e..d36ac0a158dd4 100644 --- a/docs/backend/SYCL.md +++ b/docs/backend/SYCL.md @@ -293,31 +293,26 @@ Similar to the native `sycl-ls`, available SYCL devices can be queried as follow ```sh ./build/bin/llama-ls-sycl-device ``` -A example of such log in a system with 1 *intel CPU* and 1 *intel GPU* can look like the following: +This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following: ``` -found 6 SYCL devices: +found 2 SYCL devices: + | | | |Compute |Max compute|Max work|Max sub| | |ID| Device Type| Name|capability|units |group |group |Global mem size| |--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------| | 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136| | 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216| -| 2| [opencl:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 3.0| 512| 1024| 32| 16225243136| -| 3| [opencl:gpu:1]| Intel(R) UHD Graphics 770| 3.0| 32| 512| 32| 53651849216| -| 4| [opencl:cpu:0]| 13th Gen Intel(R) Core(TM) i7-13700K| 3.0| 24| 8192| 64| 67064815616| -| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616| ``` -| Attribute | Note | -|------------------------|-------------------------------------------------------------| -| compute capability 1.3 | Level-zero driver/runtime, recommended | -| compute capability 3.0 | OpenCL driver/runtime, slower than level-zero in most cases | 4. Launch inference There are two device selection modes: - Single device: Use one device target specified by the user. -- Multiple devices: Automatically select the devices with the same largest Max compute-units. +- Multiple devices: Automatically choose the devices with the same backend. + +In two device selection modes, the default SYCL backend is level_zero, you can choose other backend supported by SYCL by setting environment variable ONEAPI_DEVICE_SELECTOR. | Device selection | Parameter | |------------------|----------------------------------------| @@ -474,33 +469,26 @@ Similar to the native `sycl-ls`, available SYCL devices can be queried as follow build\bin\ls-sycl-device.exe ``` -The output of this command in a system with 1 *intel CPU* and 1 *intel GPU* would look like the following: +This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following: ``` -found 6 SYCL devices: +found 2 SYCL devices: | | | |Compute |Max compute|Max work|Max sub| | |ID| Device Type| Name|capability|units |group |group |Global mem size| |--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------| | 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136| | 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216| -| 2| [opencl:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 3.0| 512| 1024| 32| 16225243136| -| 3| [opencl:gpu:1]| Intel(R) UHD Graphics 770| 3.0| 32| 512| 32| 53651849216| -| 4| [opencl:cpu:0]| 13th Gen Intel(R) Core(TM) i7-13700K| 3.0| 24| 8192| 64| 67064815616| -| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616| ``` -| Attribute | Note | -|------------------------|-----------------------------------------------------------| -| compute capability 1.3 | Level-zero running time, recommended | -| compute capability 3.0 | OpenCL running time, slower than level-zero in most cases | - 4. Launch inference There are two device selection modes: -- Single device: Use one device assigned by user. -- Multiple devices: Automatically choose the devices with the same biggest Max compute units. +- Single device: Use one device assigned by user. Default device id is 0. +- Multiple devices: Automatically choose the devices with the same backend. + +In two device selection modes, the default SYCL backend is level_zero, you can choose other backend supported by SYCL by setting environment variable ONEAPI_DEVICE_SELECTOR. | Device selection | Parameter | |------------------|----------------------------------------| diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index 68d41411b5ece..397bd98dd22a9 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -267,7 +267,7 @@ struct ggml_backend_sycl_context { queue_ptr stream(int device, int stream) { if (qptrs[device][stream] == nullptr) { - qptrs[device][stream] = &(dpct::get_current_device().default_queue()); + qptrs[device][stream] = &(dpct::get_device(device).default_queue()); } return qptrs[device][stream]; } diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp index 31df1cb9e2cf4..4aaa76bfbbc73 100644 --- a/ggml/src/ggml-sycl/dpct/helper.hpp +++ b/ggml/src/ggml-sycl/dpct/helper.hpp @@ -588,7 +588,7 @@ namespace dpct out = prop; } - /// dpct device extension + /// dpct device extension class device_ext : public sycl::device { typedef std::mutex mutex_type; @@ -697,7 +697,7 @@ namespace dpct std::unique_lock lock(m_mutex); lock.unlock(); for (auto &q : _queues) { - q.wait_and_throw(); + q.wait_and_throw(); } // Guard the destruct of current_queues to make sure the ref count is // safe. @@ -734,7 +734,12 @@ namespace dpct void destroy_queue(sycl::queue queue) { std::lock_guard lock(m_mutex); - _queues.clear(); + _queues.erase(std::remove_if(_queues.begin(), _queues.end(), + [=](const sycl::queue &q) -> bool + { + return q == queue; + }), + _queues.end()); } void set_saved_queue(sycl::queue q) { std::lock_guard lock(m_mutex); @@ -764,13 +769,13 @@ namespace dpct if (enable_exception_handler) { eh = exception_handler; } - auto q = sycl::queue(*this, eh, - sycl::property_list( + _queues.push_back(sycl::queue( + *this, eh, + sycl::property_list( #ifdef DPCT_PROFILING_ENABLED - sycl::property::queue::enable_profiling(), + sycl::property::queue::enable_profiling(), #endif - properties...)); - _queues.push_back(q); + properties...))); return _queues.back(); } @@ -783,8 +788,8 @@ namespace dpct if (enable_exception_handler) { eh = exception_handler; } - _queues.push_back( - sycl::queue(device, eh, + _queues.push_back(sycl::queue( + device, eh, sycl::property_list( #ifdef DPCT_PROFILING_ENABLED sycl::property::queue::enable_profiling(), @@ -855,15 +860,75 @@ namespace dpct unsigned int get_device_id(const sycl::device &dev) { unsigned int id = 0; - for (auto dev_item : _devs) + for (auto &dev_item : _devs) { if (*dev_item == dev) { - break; + return id; } id++; } - return id; + return -1; + } + + inline std::string get_preferred_gpu_platform_name() { + std::string result; + + std::string filter = "level-zero"; + char* env = getenv("ONEAPI_DEVICE_SELECTOR"); + if (env) { + if (std::strstr(env, "level_zero")) { + filter = "level-zero"; + } + else if (std::strstr(env, "opencl")) { + filter = "opencl"; + } + else if (std::strstr(env, "cuda")) { + filter = "cuda"; + } + else if (std::strstr(env, "hip")) { + filter = "hip"; + } + else { + throw std::runtime_error("invalid device filter: " + std::string(env)); + } + } + + auto plaform_list = sycl::platform::get_platforms(); + + for (const auto& platform : plaform_list) { + auto devices = platform.get_devices(); + auto gpu_dev = std::find_if(devices.begin(), devices.end(), [](const sycl::device& d) { + return d.is_gpu(); + }); + + if (gpu_dev == devices.end()) { + // cout << "platform [" << platform_name + // << "] does not contain GPU devices, skipping\n"; + continue; + } + + auto platform_name = platform.get_info(); + std::string platform_name_low_case; + platform_name_low_case.resize(platform_name.size()); + + std::transform( + platform_name.begin(), platform_name.end(), platform_name_low_case.begin(), ::tolower); + + if (platform_name_low_case.find(filter) == std::string::npos) { + // cout << "platform [" << platform_name + // << "] does not match with requested " + // << filter << ", skipping\n"; + continue; + } + + result = platform_name; + } + + if (result.empty()) + throw std::runtime_error("can not find preferred GPU platform"); + + return result; } template @@ -930,10 +995,15 @@ namespace dpct // Keep track of the number of devices per backend std::map DeviceNums; std::map> backend_devices; + auto preferred_platform_name = get_preferred_gpu_platform_name(); while (!Platforms.empty()) { auto Platform = Platforms.back(); Platforms.pop_back(); + auto platform_name = Platform.get_info(); + if (platform_name.compare(preferred_platform_name) != 0) { + continue; + } auto devices = Platform.get_devices(); std::string backend_type = get_device_backend_and_type(devices[0]); for (const auto &device : devices) { @@ -1989,6 +2059,11 @@ namespace dpct return dev_mgr::instance().current_device(); } + static inline device_ext &get_device(unsigned int id) + { + return dev_mgr::instance().get_device(id); + } + static inline sycl::queue &get_in_order_queue() { return dev_mgr::instance().current_device().in_order_queue(); diff --git a/src/llama.cpp b/src/llama.cpp index 80235ae19b270..972f870b072b8 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -16643,9 +16643,7 @@ struct llama_context * llama_new_context_with_model( for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) { ggml_backend_t backend = ggml_backend_sycl_init(i); if (backend == nullptr) { - int id_list[GGML_SYCL_MAX_DEVICES]; - ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES); - LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, id_list[i], i); + LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d for No.%d backend\n", __func__, i, i); llama_free(ctx); return nullptr; }