From cd296feac3e4e0bb9424a92c3753f12081b82416 Mon Sep 17 00:00:00 2001
From: Chen Xi <xi2chen@intel.com>
Date: Wed, 17 Jul 2024 06:38:36 +0000
Subject: [PATCH 01/11] fix multi-gpu issue on sycl

Signed-off-by: Chen Xi <xi2chen@intel.com>
---
 ggml/src/ggml-sycl/dpct/helper.hpp | 152 +++++++++++++++++++++--------
 src/llama.cpp                      |   4 +-
 2 files changed, 113 insertions(+), 43 deletions(-)

diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp
index 31df1cb9e2cf4..fef5e2d8d8df2 100644
--- a/ggml/src/ggml-sycl/dpct/helper.hpp
+++ b/ggml/src/ggml-sycl/dpct/helper.hpp
@@ -255,7 +255,7 @@ namespace dpct
         void set_pitch(size_t pitch) { _pitch = pitch; }
 
         size_t get_x() { return _x; }
-        void set_x(size_t x) { _x = x; }
+        void set_x(size_t x) { _x = x; };
 
         size_t get_y() { return _y; }
         void set_y(size_t y) { _y = y; }
@@ -588,7 +588,7 @@ namespace dpct
         out = prop;
     }
 
-   /// dpct device extension
+    /// dpct device extension
     class device_ext : public sycl::device {
       typedef std::mutex mutex_type;
 
@@ -687,119 +687,128 @@ namespace dpct
         init_queues();
       }
 
-      sycl::queue &in_order_queue() { return _q_in_order; }
+      sycl::queue &in_order_queue() { return *_q_in_order; }
 
-      sycl::queue &out_of_order_queue() { return _q_out_of_order; }
+      sycl::queue &out_of_order_queue() { return *_q_out_of_order; }
 
       sycl::queue &default_queue() { return in_order_queue(); }
 
       void queues_wait_and_throw() {
         std::unique_lock<mutex_type> lock(m_mutex);
         lock.unlock();
-        for (auto &q : _queues) {
-          q.wait_and_throw();
+        for (const auto &q : _queues) {
+            q->wait_and_throw();
         }
         // Guard the destruct of current_queues to make sure the ref count is
         // safe.
         lock.lock();
       }
 
-      sycl::queue create_queue(bool enable_exception_handler = false) {
+      sycl::queue *create_queue(bool enable_exception_handler = false) {
         return create_in_order_queue(enable_exception_handler);
       }
 
-      sycl::queue create_queue(sycl::device device,
+      sycl::queue *create_queue(sycl::device device,
                                bool enable_exception_handler = false) {
         return create_in_order_queue(device, enable_exception_handler);
       }
 
-      sycl::queue create_in_order_queue(bool enable_exception_handler = false) {
+      sycl::queue *create_in_order_queue(bool enable_exception_handler = false) {
         std::lock_guard<mutex_type> lock(m_mutex);
         return create_queue_impl(enable_exception_handler,
                                  sycl::property::queue::in_order());
       }
 
-      sycl::queue create_in_order_queue(sycl::device device,
+      sycl::queue *create_in_order_queue(sycl::device device,
                                         bool enable_exception_handler = false) {
         std::lock_guard<mutex_type> lock(m_mutex);
         return create_queue_impl(device, enable_exception_handler,
                                  sycl::property::queue::in_order());
       }
 
-      sycl::queue create_out_of_order_queue(
+      sycl::queue *create_out_of_order_queue(
           bool enable_exception_handler = false) {
         std::lock_guard<mutex_type> lock(m_mutex);
         return create_queue_impl(enable_exception_handler);
       }
 
-      void destroy_queue(sycl::queue queue) {
+      void destroy_queue(sycl::queue *&queue) {
         std::lock_guard<mutex_type> lock(m_mutex);
-        _queues.clear();
+        _queues.erase(std::remove_if(_queues.begin(), _queues.end(),
+                                    [=](const std::shared_ptr<sycl::queue> &q) -> bool
+                                    {
+                                        return q.get() == queue;
+                                    }),
+                    _queues.end());
+        queue = nullptr;
       }
-      void set_saved_queue(sycl::queue q) {
+      void set_saved_queue(sycl::queue *q) {
         std::lock_guard<mutex_type> lock(m_mutex);
         _saved_queue = q;
       }
-      sycl::queue get_saved_queue() const {
+      sycl::queue *get_saved_queue() const {
         std::lock_guard<mutex_type> lock(m_mutex);
         return _saved_queue;
       }
 
      private:
-      void clear_queues() { _queues.clear(); }
+      void clear_queues() {
+        _queues.clear();
+        _q_in_order = _q_out_of_order = _saved_queue = nullptr;
+      }
 
       void init_queues() {
         _q_in_order =
             create_queue_impl(true, sycl::property::queue::in_order());
         _q_out_of_order = create_queue_impl(true);
-        _saved_queue = default_queue();
+        _saved_queue = &default_queue();
       }
 
       /// Caller should acquire resource \p m_mutex before calling this
       /// function.
       template <class... Properties>
-      sycl::queue create_queue_impl(bool enable_exception_handler,
+      sycl::queue *create_queue_impl(bool enable_exception_handler,
                                     Properties... properties) {
         sycl::async_handler eh = {};
         if (enable_exception_handler) {
           eh = exception_handler;
         }
-        auto q = sycl::queue(*this, eh,
-                             sycl::property_list(
+        _queues.push_back(std::make_shared<sycl::queue>(
+            *this, eh,
+            sycl::property_list(
 #ifdef DPCT_PROFILING_ENABLED
-                                 sycl::property::queue::enable_profiling(),
+                sycl::property::queue::enable_profiling(),
 #endif
-                                 properties...));
-        _queues.push_back(q);
+                properties...)));
 
-        return _queues.back();
+        return _queues.back().get();
       }
 
       template <class... Properties>
-      sycl::queue create_queue_impl(sycl::device device,
+      sycl::queue *create_queue_impl(sycl::device device,
                                     bool enable_exception_handler,
                                     Properties... properties) {
         sycl::async_handler eh = {};
         if (enable_exception_handler) {
           eh = exception_handler;
         }
-        _queues.push_back(
-            sycl::queue(device, eh,
+        _queues.push_back(std::make_shared<sycl::queue>(
+            device, eh,
                         sycl::property_list(
 #ifdef DPCT_PROFILING_ENABLED
                             sycl::property::queue::enable_profiling(),
 #endif
                             properties...)));
 
-        return _queues.back();
+        return _queues.back().get();
       }
 
       void get_version(int &major, int &minor) const {
         detail::get_version(*this, major, minor);
       }
-      sycl::queue _q_in_order, _q_out_of_order;
-      sycl::queue _saved_queue;
-      std::vector<sycl::queue> _queues;
+      sycl::queue *_q_in_order, *_q_out_of_order;
+      sycl::queue *_saved_queue;
+      std::vector<std::shared_ptr<sycl::queue>> _queues;
       mutable mutex_type m_mutex;
     };
 
@@ -855,15 +864,69 @@ namespace dpct
         unsigned int get_device_id(const sycl::device &dev)
         {
             unsigned int id = 0;
-            for (auto dev_item : _devs)
+            for (auto &dev_item : _devs)
             {
                 if (*dev_item == dev)
                 {
-                    break;
+                    return id;
                 }
                 id++;
             }
-            return id;
+            return -1;
+        }
+
+        inline std::string get_preferred_gpu_platform_name() {
+            std::string result;
+        
+            std::string filter = "level-zero";
+            char* env = getenv("ONEAPI_DEVICE_SELECTOR");
+            if (env) {
+                if (std::strstr(env, "level_zero")) {
+                    filter = "level-zero";
+                }
+                else if (std::strstr(env, "opencl")) {
+                    filter = "opencl";
+                }
+                else {
+                    throw std::runtime_error("invalid device filter: " + std::string(env));
+                }
+            }
+        
+            auto plaform_list = sycl::platform::get_platforms();
+        
+            for (const auto& platform : plaform_list) {
+                auto devices = platform.get_devices();
+                auto gpu_dev = std::find_if(devices.begin(), devices.end(), [](const sycl::device& d) {
+                    return d.is_gpu();
+                });
+        
+                if (gpu_dev == devices.end()) {
+                    // cout << "platform [" << platform_name
+                    //      << "] does not contain GPU devices, skipping\n";
+                    continue;
+                }
+        
+                auto platform_name = platform.get_info<sycl::info::platform::name>();
+                std::string platform_name_low_case;
+                platform_name_low_case.resize(platform_name.size());
+        
+                std::transform(
+                    platform_name.begin(), platform_name.end(), platform_name_low_case.begin(), ::tolower);
+        
+                if (platform_name_low_case.find(filter) == std::string::npos) {
+                    // cout << "platform [" << platform_name
+                    //      << "] does not match with requested "
+                    //      << filter << ", skipping\n";
+                    continue;
+                }
+        
+                result = platform_name;
+            }
+        
+            if (result.empty())
+                throw std::runtime_error("can not find preferred GPU platform");
+        
+            return result;
         }
 
         template <class DeviceSelector>
@@ -915,6 +978,7 @@ namespace dpct
         static bool compare_backend(std::string &backend1, std::string &backend2) {
             return convert_backend_index(backend1) < convert_backend_index(backend2);
         }
+
         dev_mgr()
         {
             sycl::device default_device =
@@ -928,12 +992,17 @@ namespace dpct
 
             auto Platforms = sycl::platform::get_platforms();
             // Keep track of the number of devices per backend
-            std::map<sycl::backend, size_t> DeviceNums;
+	    std::map<sycl::backend, size_t> DeviceNums;
             std::map<std::string, std::vector<sycl::device>> backend_devices;
+            auto preferred_platform_name = get_preferred_gpu_platform_name();
 
             while (!Platforms.empty()) {
                 auto Platform = Platforms.back();
                 Platforms.pop_back();
+		auto platform_name = Platform.get_info<sycl::info::platform::name>();
+		if (platform_name.compare(preferred_platform_name) != 0) {
+		    continue;
+		}
                 auto devices = Platform.get_devices();
                 std::string backend_type = get_device_backend_and_type(devices[0]);
                 for (const auto &device : devices) {
@@ -945,6 +1014,7 @@ namespace dpct
             for(auto it = backend_devices.begin(); it != backend_devices.end(); ++it) {
                 keys.push_back(it->first);
             }
+
             std::sort(keys.begin(), keys.end(), compare_backend);
 
             for (auto &key : keys) {
@@ -967,7 +1037,9 @@ namespace dpct
                     _cpu_device = _devs.size() - 1;
                 }
             }
-        }
+
+	}
+
         void check_id(unsigned int id) const
         {
             if (id >= _devs.size())
@@ -1056,7 +1128,7 @@ namespace dpct
 #error "Only support Windows and Linux."
 #endif
                 next_free = mapped_address_space;
-            }
+            };
 
         public:
             using buffer_id_t = int;
@@ -1077,7 +1149,7 @@ namespace dpct
 #else
 #error "Only support Windows and Linux."
 #endif
-            }
+            };
 
             mem_mgr(const mem_mgr &) = delete;
             mem_mgr &operator=(const mem_mgr &) = delete;
@@ -2426,7 +2498,6 @@ namespace dpct
                                            b, ldb, beta, c, ldc, batch_size);
             break;
         }
-#endif
         case detail::get_type_combination_id(
             library_data_t::real_int8, library_data_t::real_int8,
             library_data_t::real_int32, library_data_t::real_int32):
@@ -2459,6 +2530,7 @@ namespace dpct
                 batch_size);
             break;
         }
+#endif
         case detail::get_type_combination_id(
             library_data_t::real_half, library_data_t::real_half,
             library_data_t::real_half, library_data_t::real_float):
@@ -2595,7 +2667,6 @@ namespace dpct
                                            stride_c, batch_size);
             break;
         }
-#endif
         case detail::get_type_combination_id(
             library_data_t::real_int8, library_data_t::real_int8,
             library_data_t::real_int32, library_data_t::real_int32):
@@ -2624,6 +2695,7 @@ namespace dpct
                 beta, c, ldc, stride_c, batch_size);
             break;
         }
+#endif
         case detail::get_type_combination_id(
             library_data_t::real_half, library_data_t::real_half,
             library_data_t::real_half, library_data_t::real_float):
diff --git a/src/llama.cpp b/src/llama.cpp
index 400a4232beeb0..55ef90572e58b 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -19154,9 +19154,7 @@ struct llama_context * llama_new_context_with_model(
             for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
                 ggml_backend_t backend = ggml_backend_sycl_init(i);
                 if (backend == nullptr) {
-                    int id_list[GGML_SYCL_MAX_DEVICES];
-                    ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
-                    LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, id_list[i], i);
+                    LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d for No.%d backend\n", __func__, i, i);
                     llama_free(ctx);
                     return nullptr;
                 }

From 6b4f7b2ac1ff4f77985430618c41e6e24ba929dd Mon Sep 17 00:00:00 2001
From: Chen Xi <xi2chen@intel.com>
Date: Thu, 18 Jul 2024 07:13:01 +0000
Subject: [PATCH 02/11] fix some typo

Signed-off-by: Chen Xi <xi2chen@intel.com>
---
 ggml/src/ggml-sycl/dpct/helper.hpp | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp
index fef5e2d8d8df2..55f003a351d6b 100644
--- a/ggml/src/ggml-sycl/dpct/helper.hpp
+++ b/ggml/src/ggml-sycl/dpct/helper.hpp
@@ -255,7 +255,7 @@ namespace dpct
         void set_pitch(size_t pitch) { _pitch = pitch; }
 
         size_t get_x() { return _x; }
-        void set_x(size_t x) { _x = x; };
+        void set_x(size_t x) { _x = x; }
 
         size_t get_y() { return _y; }
         void set_y(size_t y) { _y = y; }
@@ -978,7 +978,6 @@ namespace dpct
         static bool compare_backend(std::string &backend1, std::string &backend2) {
             return convert_backend_index(backend1) < convert_backend_index(backend2);
         }
-
         dev_mgr()
         {
             sycl::device default_device =
@@ -992,17 +991,17 @@ namespace dpct
 
             auto Platforms = sycl::platform::get_platforms();
             // Keep track of the number of devices per backend
-	    std::map<sycl::backend, size_t> DeviceNums;
+            std::map<sycl::backend, size_t> DeviceNums;
             std::map<std::string, std::vector<sycl::device>> backend_devices;
             auto preferred_platform_name = get_preferred_gpu_platform_name();
 
             while (!Platforms.empty()) {
                 auto Platform = Platforms.back();
                 Platforms.pop_back();
-		auto platform_name = Platform.get_info<sycl::info::platform::name>();
-		if (platform_name.compare(preferred_platform_name) != 0) {
-		    continue;
-		}
+                auto platform_name = Platform.get_info<sycl::info::platform::name>();
+                if (platform_name.compare(preferred_platform_name) != 0) {
+                    continue;
+                }
                 auto devices = Platform.get_devices();
                 std::string backend_type = get_device_backend_and_type(devices[0]);
                 for (const auto &device : devices) {
@@ -1016,7 +1015,6 @@ namespace dpct
             }
 
             std::sort(keys.begin(), keys.end(), compare_backend);
-
             for (auto &key : keys) {
                 std::vector<sycl::device> devs = backend_devices[key];
                 std::sort(devs.begin(), devs.end(), compare_dev);

From d096de2e90b22c002b5d4296bc64750687eceb5e Mon Sep 17 00:00:00 2001
From: Chen Xi <xi2chen@intel.com>
Date: Thu, 18 Jul 2024 07:16:17 +0000
Subject: [PATCH 03/11] remove unnecessary whitespace

Signed-off-by: Chen Xi <xi2chen@intel.com>
---
 ggml/src/ggml-sycl/dpct/helper.hpp | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp
index 55f003a351d6b..09c0ef8d0ca9d 100644
--- a/ggml/src/ggml-sycl/dpct/helper.hpp
+++ b/ggml/src/ggml-sycl/dpct/helper.hpp
@@ -1013,8 +1013,8 @@ namespace dpct
             for(auto it = backend_devices.begin(); it != backend_devices.end(); ++it) {
                 keys.push_back(it->first);
             }
-
             std::sort(keys.begin(), keys.end(), compare_backend);
+
             for (auto &key : keys) {
                 std::vector<sycl::device> devs = backend_devices[key];
                 std::sort(devs.begin(), devs.end(), compare_dev);
@@ -1035,9 +1035,7 @@ namespace dpct
                     _cpu_device = _devs.size() - 1;
                 }
             }
-
 	}
-
         void check_id(unsigned int id) const
         {
             if (id >= _devs.size())
@@ -1126,7 +1124,7 @@ namespace dpct
 #error "Only support Windows and Linux."
 #endif
                 next_free = mapped_address_space;
-            };
+            }
 
         public:
             using buffer_id_t = int;
@@ -1147,7 +1145,7 @@ namespace dpct
 #else
 #error "Only support Windows and Linux."
 #endif
-            };
+            }
 
             mem_mgr(const mem_mgr &) = delete;
             mem_mgr &operator=(const mem_mgr &) = delete;

From bd71cdac0f53357d6f6562fb0a54be81afb91afd Mon Sep 17 00:00:00 2001
From: Chen Xi <xi2chen@intel.com>
Date: Thu, 18 Jul 2024 07:19:24 +0000
Subject: [PATCH 04/11] file format issue

Signed-off-by: Chen Xi <xi2chen@intel.com>
---
 ggml/src/ggml-sycl/dpct/helper.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp
index 09c0ef8d0ca9d..40ac8efc32a1d 100644
--- a/ggml/src/ggml-sycl/dpct/helper.hpp
+++ b/ggml/src/ggml-sycl/dpct/helper.hpp
@@ -1035,7 +1035,7 @@ namespace dpct
                     _cpu_device = _devs.size() - 1;
                 }
             }
-	}
+        }
         void check_id(unsigned int id) const
         {
             if (id >= _devs.size())

From 6160a76efbf5b0c2ddb30a5a237a976bac5a1c21 Mon Sep 17 00:00:00 2001
From: Chen Xi <xi2chen@intel.com>
Date: Thu, 18 Jul 2024 08:07:05 +0000
Subject: [PATCH 05/11] sycl::queue can directly use as shared_ptr

Signed-off-by: Chen Xi <xi2chen@intel.com>
---
 ggml/src/ggml-sycl/dpct/helper.hpp | 54 ++++++++++++++----------------
 1 file changed, 25 insertions(+), 29 deletions(-)

diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp
index 40ac8efc32a1d..0f18cff3268a6 100644
--- a/ggml/src/ggml-sycl/dpct/helper.hpp
+++ b/ggml/src/ggml-sycl/dpct/helper.hpp
@@ -687,93 +687,89 @@ namespace dpct
         init_queues();
       }
 
-      sycl::queue &in_order_queue() { return *_q_in_order; }
+      sycl::queue &in_order_queue() { return _q_in_order; }
 
-      sycl::queue &out_of_order_queue() { return *_q_out_of_order; }
+      sycl::queue &out_of_order_queue() { return _q_out_of_order; }
 
       sycl::queue &default_queue() { return in_order_queue(); }
 
       void queues_wait_and_throw() {
         std::unique_lock<mutex_type> lock(m_mutex);
         lock.unlock();
-        for (const auto &q : _queues) {
-            q->wait_and_throw();
+        for (auto &q : _queues) {
+            q.wait_and_throw();
         }
         // Guard the destruct of current_queues to make sure the ref count is
         // safe.
         lock.lock();
       }
 
-      sycl::queue *create_queue(bool enable_exception_handler = false) {
+      sycl::queue create_queue(bool enable_exception_handler = false) {
         return create_in_order_queue(enable_exception_handler);
       }
 
-      sycl::queue *create_queue(sycl::device device,
+      sycl::queue create_queue(sycl::device device,
                                bool enable_exception_handler = false) {
         return create_in_order_queue(device, enable_exception_handler);
       }
 
-      sycl::queue *create_in_order_queue(bool enable_exception_handler = false) {
+      sycl::queue create_in_order_queue(bool enable_exception_handler = false) {
         std::lock_guard<mutex_type> lock(m_mutex);
         return create_queue_impl(enable_exception_handler,
                                  sycl::property::queue::in_order());
       }
 
-      sycl::queue *create_in_order_queue(sycl::device device,
+      sycl::queue create_in_order_queue(sycl::device device,
                                         bool enable_exception_handler = false) {
         std::lock_guard<mutex_type> lock(m_mutex);
         return create_queue_impl(device, enable_exception_handler,
                                  sycl::property::queue::in_order());
       }
 
-      sycl::queue *create_out_of_order_queue(
+      sycl::queue create_out_of_order_queue(
           bool enable_exception_handler = false) {
         std::lock_guard<mutex_type> lock(m_mutex);
         return create_queue_impl(enable_exception_handler);
       }
 
-      void destroy_queue(sycl::queue *&queue) {
+      void destroy_queue(sycl::queue queue) {
         std::lock_guard<mutex_type> lock(m_mutex);
         _queues.erase(std::remove_if(_queues.begin(), _queues.end(),
-                                    [=](const std::shared_ptr<sycl::queue> &q) -> bool
+                                    [=](const sycl::queue &q) -> bool
                                     {
-                                        return q.get() == queue;
+                                        return q == queue;
                                     }),
                     _queues.end());
-        queue = nullptr;
       }
-      void set_saved_queue(sycl::queue *q) {
+      void set_saved_queue(sycl::queue q) {
         std::lock_guard<mutex_type> lock(m_mutex);
         _saved_queue = q;
       }
-      sycl::queue *get_saved_queue() const {
+      sycl::queue get_saved_queue() const {
         std::lock_guard<mutex_type> lock(m_mutex);
         return _saved_queue;
       }
 
      private:
-      void clear_queues() {
-        _queues.clear();
-        _q_in_order = _q_out_of_order = _saved_queue = nullptr;
-      }
+      void clear_queues() { _queues.clear(); }
 
       void init_queues() {
         _q_in_order =
             create_queue_impl(true, sycl::property::queue::in_order());
         _q_out_of_order = create_queue_impl(true);
-        _saved_queue = &default_queue();
+        _saved_queue = default_queue();
       }
 
       /// Caller should acquire resource \p m_mutex before calling this
       /// function.
       template <class... Properties>
-      sycl::queue *create_queue_impl(bool enable_exception_handler,
+      sycl::queue create_queue_impl(bool enable_exception_handler,
                                     Properties... properties) {
         sycl::async_handler eh = {};
         if (enable_exception_handler) {
           eh = exception_handler;
         }
-        _queues.push_back(std::make_shared<sycl::queue>(
+        _queues.push_back(sycl::queue(
             *this, eh,
             sycl::property_list(
 #ifdef DPCT_PROFILING_ENABLED
@@ -781,18 +777,18 @@ namespace dpct
 #endif
                 properties...)));
 
-        return _queues.back().get();
+        return _queues.back();
       }
 
       template <class... Properties>
-      sycl::queue *create_queue_impl(sycl::device device,
+      sycl::queue create_queue_impl(sycl::device device,
                                     bool enable_exception_handler,
                                     Properties... properties) {
         sycl::async_handler eh = {};
         if (enable_exception_handler) {
           eh = exception_handler;
         }
-        _queues.push_back(std::make_shared<sycl::queue>(
+        _queues.push_back(sycl::queue(
             device, eh,
                         sycl::property_list(
 #ifdef DPCT_PROFILING_ENABLED
@@ -800,15 +796,15 @@ namespace dpct
 #endif
                             properties...)));
 
-        return _queues.back().get();
+        return _queues.back();
       }
 
       void get_version(int &major, int &minor) const {
         detail::get_version(*this, major, minor);
       }
-      sycl::queue *_q_in_order, *_q_out_of_order;
-      sycl::queue *_saved_queue;
-      std::vector<std::shared_ptr<sycl::queue>> _queues;
+      sycl::queue _q_in_order, _q_out_of_order;
+      sycl::queue _saved_queue;
+      std::vector<sycl::queue> _queues;
       mutable mutex_type m_mutex;
     };
 

From e4b86a12952bc5acec54a4d7a8194191f9384ef9 Mon Sep 17 00:00:00 2001
From: Chen Xi <xi2.chen@intel.com>
Date: Tue, 23 Jul 2024 06:50:13 +0000
Subject: [PATCH 06/11] fix the perf issue of multi-device

Signed-off-by: Chen Xi <xi2.chen@intel.com>
---
 ggml/src/ggml-sycl/common.hpp      |  2 +-
 ggml/src/ggml-sycl/dpct/helper.hpp | 15 +++++++++++++--
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp
index 68d41411b5ece..397bd98dd22a9 100644
--- a/ggml/src/ggml-sycl/common.hpp
+++ b/ggml/src/ggml-sycl/common.hpp
@@ -267,7 +267,7 @@ struct ggml_backend_sycl_context {
 
     queue_ptr stream(int device, int stream) {
         if (qptrs[device][stream] == nullptr) {
-            qptrs[device][stream] = &(dpct::get_current_device().default_queue());
+            qptrs[device][stream] = &(dpct::get_device(device).default_queue());
         }
         return qptrs[device][stream];
     }
diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp
index 0f18cff3268a6..a313ca6e5d7b1 100644
--- a/ggml/src/ggml-sycl/dpct/helper.hpp
+++ b/ggml/src/ggml-sycl/dpct/helper.hpp
@@ -883,6 +883,12 @@ namespace dpct
                 else if (std::strstr(env, "opencl")) {
                     filter = "opencl";
                 }
+                else if (std::strstr(env, "cuda")) {
+                    filter = "cuda";
+                }
+                else if (std::strstr(env, "hip")) {
+                    filter = "hip";
+                }
                 else {
                     throw std::runtime_error("invalid device filter: " + std::string(env));
                 }
@@ -2053,6 +2059,11 @@ namespace dpct
         return dev_mgr::instance().current_device();
     }
 
+    static inline device_ext &get_device(unsigned int id)
+    {
+        return dev_mgr::instance().get_device(id);
+    }
+
     static inline sycl::queue &get_in_order_queue()
     {
         return dev_mgr::instance().current_device().in_order_queue();
@@ -2490,6 +2501,7 @@ namespace dpct
                                            b, ldb, beta, c, ldc, batch_size);
             break;
         }
+#endif
         case detail::get_type_combination_id(
             library_data_t::real_int8, library_data_t::real_int8,
             library_data_t::real_int32, library_data_t::real_int32):
@@ -2522,7 +2534,6 @@ namespace dpct
                 batch_size);
             break;
         }
-#endif
         case detail::get_type_combination_id(
             library_data_t::real_half, library_data_t::real_half,
             library_data_t::real_half, library_data_t::real_float):
@@ -2669,6 +2680,7 @@ namespace dpct
                                                   beta, c, ldc, stride_c, batch_size);
             break;
         }
+#endif
         case detail::get_type_combination_id(
             library_data_t::real_int8, library_data_t::real_int8,
             library_data_t::real_float, library_data_t::real_float):
@@ -2687,7 +2699,6 @@ namespace dpct
                 beta, c, ldc, stride_c, batch_size);
             break;
         }
-#endif
         case detail::get_type_combination_id(
             library_data_t::real_half, library_data_t::real_half,
             library_data_t::real_half, library_data_t::real_float):

From 22c72c5a0f5d80dbf10640adf94f2c6edba0773e Mon Sep 17 00:00:00 2001
From: Chen Xi <xi2.chen@intel.com>
Date: Wed, 24 Jul 2024 04:05:08 +0000
Subject: [PATCH 07/11] fix intel mkl

Signed-off-by: Chen Xi <xi2.chen@intel.com>
---
 ggml/src/ggml-sycl/dpct/helper.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp
index a313ca6e5d7b1..8774eff006251 100644
--- a/ggml/src/ggml-sycl/dpct/helper.hpp
+++ b/ggml/src/ggml-sycl/dpct/helper.hpp
@@ -2670,6 +2670,7 @@ namespace dpct
                                            stride_c, batch_size);
             break;
         }
+#endif
         case detail::get_type_combination_id(
             library_data_t::real_int8, library_data_t::real_int8,
             library_data_t::real_int32, library_data_t::real_int32):
@@ -2680,7 +2681,6 @@ namespace dpct
                                                   beta, c, ldc, stride_c, batch_size);
             break;
         }
-#endif
         case detail::get_type_combination_id(
             library_data_t::real_int8, library_data_t::real_int8,
             library_data_t::real_float, library_data_t::real_float):

From f3db6d7553f5a34e8f83490cda5003e757d66c1d Mon Sep 17 00:00:00 2001
From: "Meng, Hengyu" <hengyu.meng@intel.com>
Date: Wed, 24 Jul 2024 05:38:30 +0000
Subject: [PATCH 08/11] fix format

---
 ggml/src/ggml-sycl/dpct/helper.hpp | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp
index 8774eff006251..4aaa76bfbbc73 100644
--- a/ggml/src/ggml-sycl/dpct/helper.hpp
+++ b/ggml/src/ggml-sycl/dpct/helper.hpp
@@ -873,7 +873,7 @@ namespace dpct
 
         inline std::string get_preferred_gpu_platform_name() {
             std::string result;
-        
+
             std::string filter = "level-zero";
             char* env = getenv("ONEAPI_DEVICE_SELECTOR");
             if (env) {
@@ -893,41 +893,41 @@ namespace dpct
                     throw std::runtime_error("invalid device filter: " + std::string(env));
                 }
             }
-        
+
             auto plaform_list = sycl::platform::get_platforms();
-        
+
             for (const auto& platform : plaform_list) {
                 auto devices = platform.get_devices();
                 auto gpu_dev = std::find_if(devices.begin(), devices.end(), [](const sycl::device& d) {
                     return d.is_gpu();
                 });
-        
+
                 if (gpu_dev == devices.end()) {
                     // cout << "platform [" << platform_name
                     //      << "] does not contain GPU devices, skipping\n";
                     continue;
                 }
-        
+
                 auto platform_name = platform.get_info<sycl::info::platform::name>();
                 std::string platform_name_low_case;
                 platform_name_low_case.resize(platform_name.size());
-        
+
                 std::transform(
                     platform_name.begin(), platform_name.end(), platform_name_low_case.begin(), ::tolower);
-        
+
                 if (platform_name_low_case.find(filter) == std::string::npos) {
                     // cout << "platform [" << platform_name
                     //      << "] does not match with requested "
                     //      << filter << ", skipping\n";
                     continue;
                 }
-        
+
                 result = platform_name;
             }
-        
+
             if (result.empty())
                 throw std::runtime_error("can not find preferred GPU platform");
-        
+
             return result;
         }
 

From fcce8732a1c65cf1b31f0a7a5afe194f1f96a4e2 Mon Sep 17 00:00:00 2001
From: Chen Xi <xi2.chen@intel.com>
Date: Thu, 25 Jul 2024 08:29:06 +0000
Subject: [PATCH 09/11] add doc for sycl multi-card

Signed-off-by: Chen Xi <xi2.chen@intel.com>
---
 docs/backend/SYCL.md | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md
index 885983e92277e..87d1bf8324fc5 100644
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -471,10 +471,10 @@ On the oneAPI command line window, run the following and step into the llama.cpp
 Similar to the native `sycl-ls`, available SYCL devices can be queried as follow:
 
 ```
-build\bin\ls-sycl-device.exe
+./build/bin/llama-ls-sycl-device
 ```
 
-The output of this command in a system with 1 *intel CPU* and 1 *intel GPU* would look like the following:
+This command will only display the selected backend that is supported by SYCL. The default  backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following:
 ```
 found 6 SYCL devices:
 |  |                  |                                             |Compute   |Max compute|Max work|Max sub|               |
@@ -482,25 +482,18 @@ found 6 SYCL devices:
 |--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|
 | 0|[level_zero:gpu:0]|               Intel(R) Arc(TM) A770 Graphics|       1.3|        512|    1024|     32|    16225243136|
 | 1|[level_zero:gpu:1]|                    Intel(R) UHD Graphics 770|       1.3|         32|     512|     32|    53651849216|
-| 2|    [opencl:gpu:0]|               Intel(R) Arc(TM) A770 Graphics|       3.0|        512|    1024|     32|    16225243136|
-| 3|    [opencl:gpu:1]|                    Intel(R) UHD Graphics 770|       3.0|         32|     512|     32|    53651849216|
-| 4|    [opencl:cpu:0]|         13th Gen Intel(R) Core(TM) i7-13700K|       3.0|         24|    8192|     64|    67064815616|
-| 5|    [opencl:acc:0]|               Intel(R) FPGA Emulation Device|       1.2|         24|67108864|     64|    67064815616|
 
 ```
 
-| Attribute              | Note                                                      |
-|------------------------|-----------------------------------------------------------|
-| compute capability 1.3 | Level-zero running time, recommended                      |
-| compute capability 3.0 | OpenCL running time, slower than level-zero in most cases |
-
 
 4. Launch inference
 
 There are two device selection modes:
 
-- Single device: Use one device assigned by user.
-- Multiple devices: Automatically choose the devices with the same biggest Max compute units.
+- Single device: Use one device assigned by user. Default device id is 0.
+- Multiple devices: Automatically choose the devices with the same backend.
+
+In two device selection modes, the default SYCL backend is level_zero, you can choose other backend supported by SYCL by setting environment variable ONEAPI_DEVICE_SELECTOR.
 
 | Device selection | Parameter                              |
 |------------------|----------------------------------------|

From 8fe8086363e867ddced21ffc4e2cc60e254951d1 Mon Sep 17 00:00:00 2001
From: Chen Xi <xi2.chen@intel.com>
Date: Thu, 25 Jul 2024 08:42:03 +0000
Subject: [PATCH 10/11] add linux part change on doc

Signed-off-by: Chen Xi <xi2.chen@intel.com>
---
 docs/backend/SYCL.md | 21 ++++++++-------------
 1 file changed, 8 insertions(+), 13 deletions(-)

diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md
index 87d1bf8324fc5..99890a4cce338 100644
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -293,31 +293,26 @@ Similar to the native `sycl-ls`, available SYCL devices can be queried as follow
 ```sh
 ./build/bin/llama-ls-sycl-device
 ```
-A example of such log in a system with 1 *intel CPU* and 1 *intel GPU* can look like the following:
+This command will only display the selected backend that is supported by SYCL. The default  backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following:
 ```
-found 6 SYCL devices:
+found 2 SYCL devices:
+
 |  |                  |                                             |Compute   |Max compute|Max work|Max sub|               |
 |ID|       Device Type|                                         Name|capability|units      |group   |group  |Global mem size|
 |--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|
 | 0|[level_zero:gpu:0]|               Intel(R) Arc(TM) A770 Graphics|       1.3|        512|    1024|     32|    16225243136|
 | 1|[level_zero:gpu:1]|                    Intel(R) UHD Graphics 770|       1.3|         32|     512|     32|    53651849216|
-| 2|    [opencl:gpu:0]|               Intel(R) Arc(TM) A770 Graphics|       3.0|        512|    1024|     32|    16225243136|
-| 3|    [opencl:gpu:1]|                    Intel(R) UHD Graphics 770|       3.0|         32|     512|     32|    53651849216|
-| 4|    [opencl:cpu:0]|         13th Gen Intel(R) Core(TM) i7-13700K|       3.0|         24|    8192|     64|    67064815616|
-| 5|    [opencl:acc:0]|               Intel(R) FPGA Emulation Device|       1.2|         24|67108864|     64|    67064815616|
 ```
 
-| Attribute              | Note                                                        |
-|------------------------|-------------------------------------------------------------|
-| compute capability 1.3 | Level-zero driver/runtime, recommended                      |
-| compute capability 3.0 | OpenCL driver/runtime, slower than level-zero in most cases |
 
 4. Launch inference
 
 There are two device selection modes:
 
 - Single device: Use one device target specified by the user.
-- Multiple devices: Automatically select the devices with the same largest Max compute-units.
+- Multiple devices: Automatically choose the devices with the same backend.
+
+In two device selection modes, the default SYCL backend is level_zero, you can choose other backend supported by SYCL by setting environment variable ONEAPI_DEVICE_SELECTOR.
 
 | Device selection | Parameter                              |
 |------------------|----------------------------------------|
@@ -471,12 +466,12 @@ On the oneAPI command line window, run the following and step into the llama.cpp
 Similar to the native `sycl-ls`, available SYCL devices can be queried as follow:
 
 ```
-./build/bin/llama-ls-sycl-device
+build\bin\ls-sycl-device.exe
 ```
 
 This command will only display the selected backend that is supported by SYCL. The default  backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following:
 ```
-found 6 SYCL devices:
+found 2 SYCL devices:
 |  |                  |                                             |Compute   |Max compute|Max work|Max sub|               |
 |ID|       Device Type|                                         Name|capability|units      |group   |group  |Global mem size|
 |--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|

From fc766843258c6bc01e76de9ef178d1c16de07a7d Mon Sep 17 00:00:00 2001
From: Chen Xi <xi2.chen@intel.com>
Date: Thu, 25 Jul 2024 08:45:00 +0000
Subject: [PATCH 11/11] fix typo

Signed-off-by: Chen Xi <xi2.chen@intel.com>
---
 docs/backend/SYCL.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md
index 99890a4cce338..d36ac0a158dd4 100644
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -293,7 +293,7 @@ Similar to the native `sycl-ls`, available SYCL devices can be queried as follow
 ```sh
 ./build/bin/llama-ls-sycl-device
 ```
-This command will only display the selected backend that is supported by SYCL. The default  backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following:
+This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following:
 ```
 found 2 SYCL devices:
 
@@ -469,7 +469,7 @@ Similar to the native `sycl-ls`, available SYCL devices can be queried as follow
 build\bin\ls-sycl-device.exe
 ```
 
-This command will only display the selected backend that is supported by SYCL. The default  backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following:
+This command will only display the selected backend that is supported by SYCL. The default backend is level_zero. For example, in a system with 2 *intel GPU* it would look like the following:
 ```
 found 2 SYCL devices:
 |  |                  |                                             |Compute   |Max compute|Max work|Max sub|               |