diff --git a/Makefile b/Makefile index 36c7be21561b..4226c5d7fab5 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ DETECT_LIBS?=true # llama.cpp versions GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be -CPPLLAMA_VERSION?=dafae66cc242eb766797194d3c85c5e502625623 +CPPLLAMA_VERSION?=08ea539df211e46bb4d0dd275e541cb591d5ebc8 # whisper.cpp version WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp diff --git a/backend/cpp/llama/grpc-server.cpp b/backend/cpp/llama/grpc-server.cpp index ea5c4e34a465..98dd8fde1c74 100644 --- a/backend/cpp/llama/grpc-server.cpp +++ b/backend/cpp/llama/grpc-server.cpp @@ -681,7 +681,6 @@ struct llama_server_context slot->sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat); slot->sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau); slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta); - slot->sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl); slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep); slot->sparams.seed = json_value(data, "seed", default_sparams.seed); slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar); @@ -1213,13 +1212,12 @@ struct llama_server_context {"mirostat", slot.sparams.mirostat}, {"mirostat_tau", slot.sparams.mirostat_tau}, {"mirostat_eta", slot.sparams.mirostat_eta}, - {"penalize_nl", slot.sparams.penalize_nl}, {"stop", slot.params.antiprompt}, {"n_predict", slot.params.n_predict}, {"n_keep", params.n_keep}, {"ignore_eos", slot.sparams.ignore_eos}, {"stream", slot.params.stream}, - // {"logit_bias", slot.sparams.logit_bias}, + // {"logit_bias", slot.sparams.logit_bias}, {"n_probs", slot.sparams.n_probs}, {"min_keep", slot.sparams.min_keep}, {"grammar", slot.sparams.grammar}, @@ -2112,7 +2110,6 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama // slot->sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat); // slot->sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau); // slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta); - // slot->sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl); // slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep); // slot->params.seed = json_value(data, "seed", default_params.seed); // slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar); @@ -2135,7 +2132,6 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama data["mirostat"] = predict->mirostat(); data["mirostat_tau"] = predict->mirostattau(); data["mirostat_eta"] = predict->mirostateta(); - data["penalize_nl"] = predict->penalizenl(); data["n_keep"] = predict->nkeep(); data["seed"] = predict->seed(); data["grammar"] = predict->grammar(); @@ -2181,7 +2177,6 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama // llama.params.sparams.mirostat = predict->mirostat(); // llama.params.sparams.mirostat_tau = predict->mirostattau(); // llama.params.sparams.mirostat_eta = predict->mirostateta(); -// llama.params.sparams.penalize_nl = predict->penalizenl(); // llama.params.n_keep = predict->nkeep(); // llama.params.seed = predict->seed(); // llama.params.sparams.grammar = predict->grammar(); @@ -2228,6 +2223,35 @@ json parse_options(bool streaming, const backend::PredictOptions* 
predict, llama // } // } +const std::vector<ggml_type> kv_cache_types = { + GGML_TYPE_F32, + GGML_TYPE_F16, + GGML_TYPE_BF16, + GGML_TYPE_Q8_0, + GGML_TYPE_Q4_0, + GGML_TYPE_Q4_1, + GGML_TYPE_IQ4_NL, + GGML_TYPE_Q5_0, + GGML_TYPE_Q5_1, +}; + +static ggml_type kv_cache_type_from_str(const std::string & s) { + for (const auto & type : kv_cache_types) { + if (ggml_type_name(type) == s) { + return type; + } + } + throw std::runtime_error("Unsupported cache type: " + s); +} + +static std::string get_all_kv_cache_types() { + std::ostringstream msg; + for (const auto & type : kv_cache_types) { + msg << ggml_type_name(type) << (&type == &kv_cache_types.back() ? "" : ", "); + } + return msg.str(); +} + static void params_parse(const backend::ModelOptions* request, common_params & params) { @@ -2242,10 +2266,10 @@ static void params_parse(const backend::ModelOptions* request, // params.model_alias ?? params.model_alias = request->modelfile(); if (!request->cachetypekey().empty()) { - params.cache_type_k = request->cachetypekey(); + params.cache_type_k = kv_cache_type_from_str(request->cachetypekey()); } if (!request->cachetypevalue().empty()) { - params.cache_type_v = request->cachetypevalue(); + params.cache_type_v = kv_cache_type_from_str(request->cachetypevalue()); } params.n_ctx = request->contextsize(); //params.memory_f16 = request->f16memory(); diff --git a/backend/python/autogptq/requirements-intel.txt b/backend/python/autogptq/requirements-intel.txt index d5e0173ea407..cec8bff477f5 100644 --- a/backend/python/autogptq/requirements-intel.txt +++ b/backend/python/autogptq/requirements-intel.txt @@ -2,4 +2,4 @@ intel-extension-for-pytorch torch optimum[openvino] -setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406 \ No newline at end of file +setuptools \ No newline at end of file diff --git a/backend/python/bark/requirements-intel.txt b/backend/python/bark/requirements-intel.txt index c0e4dcaa5b99..1f043bbfdd02 100644 --- a/backend/python/bark/requirements-intel.txt +++ b/backend/python/bark/requirements-intel.txt @@ -3,6 +3,6 @@ intel-extension-for-pytorch torch torchaudio optimum[openvino] -setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406 +setuptools transformers accelerate \ No newline at end of file diff --git a/backend/python/common/libbackend.sh b/backend/python/common/libbackend.sh index 934b1fd37b14..6013cf765d4c 100644 --- a/backend/python/common/libbackend.sh +++ b/backend/python/common/libbackend.sh @@ -17,6 +17,9 @@ # LIMIT_TARGETS="cublas12" # source $(dirname $0)/../common/libbackend.sh # + +PYTHON_VERSION="3.10" + function init() { # Name of the backend (directory name) BACKEND_NAME=${PWD##*/} @@ -88,7 +91,7 @@ function getBuildProfile() { # always result in an activated virtual environment function ensureVenv() { if [ !
-d "${EDIR}/venv" ]; then - uv venv ${EDIR}/venv + uv venv --python ${PYTHON_VERSION} ${EDIR}/venv echo "virtualenv created" fi diff --git a/backend/python/coqui/requirements-intel.txt b/backend/python/coqui/requirements-intel.txt index de3b4ee4b38c..7ed2fb4265e9 100644 --- a/backend/python/coqui/requirements-intel.txt +++ b/backend/python/coqui/requirements-intel.txt @@ -3,7 +3,7 @@ intel-extension-for-pytorch torch torchaudio optimum[openvino] -setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406 +setuptools transformers accelerate coqui-tts \ No newline at end of file diff --git a/backend/python/diffusers/requirements-intel.txt b/backend/python/diffusers/requirements-intel.txt index 566278a88a23..bd6632bfe812 100644 --- a/backend/python/diffusers/requirements-intel.txt +++ b/backend/python/diffusers/requirements-intel.txt @@ -3,7 +3,7 @@ intel-extension-for-pytorch torch torchvision optimum[openvino] -setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406 +setuptools diffusers opencv-python transformers diff --git a/backend/python/openvoice/requirements.txt b/backend/python/openvoice/requirements.txt index 6806d3e1c4a8..e6a1e5a5d612 100644 --- a/backend/python/openvoice/requirements.txt +++ b/backend/python/openvoice/requirements.txt @@ -18,3 +18,4 @@ jieba==0.42.1 gradio==3.48.0 langid==1.1.6 llvmlite==0.43.0 +setuptools \ No newline at end of file diff --git a/backend/python/parler-tts/requirements-intel.txt b/backend/python/parler-tts/requirements-intel.txt index c0e4dcaa5b99..bcb8900e9bac 100644 --- a/backend/python/parler-tts/requirements-intel.txt +++ b/backend/python/parler-tts/requirements-intel.txt @@ -3,6 +3,5 @@ intel-extension-for-pytorch torch torchaudio optimum[openvino] -setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406 transformers accelerate \ No newline at end of file diff --git a/backend/python/parler-tts/requirements.txt b/backend/python/parler-tts/requirements.txt index 75ea8a5915fd..faf4ea3d1f1b 100644 --- a/backend/python/parler-tts/requirements.txt +++ b/backend/python/parler-tts/requirements.txt @@ -1,3 +1,4 @@ grpcio==1.68.1 certifi llvmlite==0.43.0 +setuptools \ No newline at end of file diff --git a/backend/python/rerankers/requirements-intel.txt b/backend/python/rerankers/requirements-intel.txt index e6bb4cc70fea..a3cc600c9105 100644 --- a/backend/python/rerankers/requirements-intel.txt +++ b/backend/python/rerankers/requirements-intel.txt @@ -5,4 +5,4 @@ accelerate torch rerankers[transformers] optimum[openvino] -setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406 \ No newline at end of file +setuptools \ No newline at end of file diff --git a/backend/python/sentencetransformers/requirements-intel.txt b/backend/python/sentencetransformers/requirements-intel.txt index 56e1744633f9..23e0d5f205c2 100644 --- a/backend/python/sentencetransformers/requirements-intel.txt +++ b/backend/python/sentencetransformers/requirements-intel.txt @@ -2,7 +2,7 @@ intel-extension-for-pytorch torch optimum[openvino] -setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406 +setuptools accelerate sentence-transformers==3.3.1 transformers \ No newline at end of file diff --git a/backend/python/transformers-musicgen/requirements-intel.txt b/backend/python/transformers-musicgen/requirements-intel.txt index 608d693939f9..bb191163271f 100644 --- a/backend/python/transformers-musicgen/requirements-intel.txt +++ b/backend/python/transformers-musicgen/requirements-intel.txt @@ -4,4 +4,4 @@ transformers accelerate torch 
optimum[openvino] -setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406 \ No newline at end of file +setuptools \ No newline at end of file diff --git a/backend/python/transformers/requirements.txt b/backend/python/transformers/requirements.txt index b556b9f1c140..d981fd99588e 100644 --- a/backend/python/transformers/requirements.txt +++ b/backend/python/transformers/requirements.txt @@ -1,4 +1,4 @@ grpcio==1.68.1 protobuf certifi -setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406 \ No newline at end of file +setuptools \ No newline at end of file diff --git a/backend/python/vall-e-x/requirements-intel.txt b/backend/python/vall-e-x/requirements-intel.txt index adbabeac74f2..284e7131cd26 100644 --- a/backend/python/vall-e-x/requirements-intel.txt +++ b/backend/python/vall-e-x/requirements-intel.txt @@ -3,5 +3,4 @@ intel-extension-for-pytorch accelerate torch torchaudio -optimum[openvino] -setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406 \ No newline at end of file +optimum[openvino] \ No newline at end of file diff --git a/backend/python/vall-e-x/requirements.txt b/backend/python/vall-e-x/requirements.txt index 8e4eabf1b483..d981fd99588e 100644 --- a/backend/python/vall-e-x/requirements.txt +++ b/backend/python/vall-e-x/requirements.txt @@ -1,3 +1,4 @@ grpcio==1.68.1 protobuf -certifi \ No newline at end of file +certifi +setuptools \ No newline at end of file diff --git a/backend/python/vllm/requirements-intel.txt b/backend/python/vllm/requirements-intel.txt index 9544336884ea..36326f95b5e1 100644 --- a/backend/python/vllm/requirements-intel.txt +++ b/backend/python/vllm/requirements-intel.txt @@ -4,5 +4,5 @@ accelerate torch transformers optimum[openvino] -setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406 +setuptools bitsandbytes \ No newline at end of file diff --git a/core/http/app_test.go b/core/http/app_test.go index 34ebacf74408..7c57ba21a701 100644 --- a/core/http/app_test.go +++ b/core/http/app_test.go @@ -704,7 +704,7 @@ var _ = Describe("API test", func() { Expect(err).ToNot(HaveOccurred(), fmt.Sprint(resp)) Expect(resp.StatusCode).To(Equal(200), fmt.Sprint(string(dat))) - Expect(resp.Header.Get("Content-Type")).To(Equal("audio/x-wav")) + Expect(resp.Header.Get("Content-Type")).To(Or(Equal("audio/x-wav"), Equal("audio/vnd.wave"))) }) It("installs and is capable to generate images", Label("stablediffusion"), func() { if runtime.GOOS != "linux" { diff --git a/gallery/index.yaml b/gallery/index.yaml index 37664dd821ab..99c0e9a31a97 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -47,6 +47,36 @@ - filename: Llama-3.3-70B-Instruct.Q4_K_M.gguf sha256: 4f3b04ecae278bdb0fd545b47c210bc5edf823e5ebf7d41e0b526c81d54b1ff3 uri: huggingface://MaziyarPanahi/Llama-3.3-70B-Instruct-GGUF/Llama-3.3-70B-Instruct.Q4_K_M.gguf +- !!merge <<: *llama33 + name: "l3.3-70b-euryale-v2.3" + icon: https://huggingface.co/Sao10K/L3.3-70B-Euryale-v2.3/resolve/main/Eury.png + urls: + - https://huggingface.co/Sao10K/L3.3-70B-Euryale-v2.3 + - https://huggingface.co/bartowski/L3.3-70B-Euryale-v2.3-GGUF + description: | + A direct replacement / successor to Euryale v2.2, not Hanami-x1, though it is slightly better than them in my opinion. 
+ overrides: + parameters: + model: L3.3-70B-Euryale-v2.3-Q4_K_M.gguf + files: + - filename: L3.3-70B-Euryale-v2.3-Q4_K_M.gguf + sha256: 4e78bb0e65886bfcff89b829f6d38aa6f6846988bb8291857e387e3f60b3217b + uri: huggingface://bartowski/L3.3-70B-Euryale-v2.3-GGUF/L3.3-70B-Euryale-v2.3-Q4_K_M.gguf +- !!merge <<: *llama33 + name: "l3.3-ms-evayale-70b" + icon: https://cdn-uploads.huggingface.co/production/uploads/64545af5ec40bbbd01242ca6/HFCaVzRpiE05Y46p41qRy.webp + urls: + - https://huggingface.co/Steelskull/L3.3-MS-Evayale-70B + - https://huggingface.co/bartowski/L3.3-MS-Evayale-70B-GGUF + description: | + This model was created as I liked the storytelling of EVA but the prose and details of scenes from EURYALE, my goal is to merge the robust storytelling of both models while attempting to maintain the positives of both models. + overrides: + parameters: + model: L3.3-MS-Evayale-70B-Q4_K_M.gguf + files: + - filename: L3.3-MS-Evayale-70B-Q4_K_M.gguf + sha256: f941d88870fec8343946517a1802d159d23f3971eeea50b6cf12295330bd29cc + uri: huggingface://bartowski/L3.3-MS-Evayale-70B-GGUF/L3.3-MS-Evayale-70B-Q4_K_M.gguf - &rwkv url: "github:mudler/LocalAI/gallery/rwkv.yaml@master" name: "rwkv-6-world-7b" @@ -428,7 +458,6 @@ urls: - https://huggingface.co/hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF overrides: - embeddings: true parameters: model: llama-3.2-1b-instruct-q4_k_m.gguf files: @@ -777,6 +806,20 @@ - filename: Llama-SmolTalk-3.2-1B-Instruct.Q4_K_M.gguf sha256: 03d8d05e3821f4caa65defa82baaff658484d4405b66546431528153ceef4d9e uri: huggingface://mradermacher/Llama-SmolTalk-3.2-1B-Instruct-GGUF/Llama-SmolTalk-3.2-1B-Instruct.Q4_K_M.gguf +- !!merge <<: *llama32 + name: "fusechat-llama-3.2-3b-instruct" + urls: + - https://huggingface.co/FuseAI/FuseChat-Llama-3.2-3B-Instruct + - https://huggingface.co/bartowski/FuseChat-Llama-3.2-3B-Instruct-GGUF + description: | + We present FuseChat-3.0, a series of models crafted to enhance performance by integrating the strengths of multiple source LLMs into more compact target LLMs. To achieve this fusion, we utilized four powerful source LLMs: Gemma-2-27B-It, Mistral-Large-Instruct-2407, Qwen-2.5-72B-Instruct, and Llama-3.1-70B-Instruct. For the target LLMs, we employed three widely-used smaller models—Llama-3.1-8B-Instruct, Gemma-2-9B-It, and Qwen-2.5-7B-Instruct—along with two even more compact models—Llama-3.2-3B-Instruct and Llama-3.2-1B-Instruct. The implicit model fusion process involves a two-stage training pipeline comprising Supervised Fine-Tuning (SFT) to mitigate distribution discrepancies between target and source LLMs, and Direct Preference Optimization (DPO) for learning preferences from multiple source LLMs. The resulting FuseChat-3.0 models demonstrated substantial improvements in tasks related to general conversation, instruction following, mathematics, and coding. Notably, when Llama-3.1-8B-Instruct served as the target LLM, our fusion approach achieved an average improvement of 6.8 points across 14 benchmarks. Moreover, it showed significant improvements of 37.1 and 30.1 points on instruction-following test sets AlpacaEval-2 and Arena-Hard respectively. We have released the FuseChat-3.0 models on Huggingface, stay tuned for the forthcoming dataset and code. 
+ overrides: + parameters: + model: FuseChat-Llama-3.2-3B-Instruct-Q4_K_M.gguf + files: + - filename: FuseChat-Llama-3.2-3B-Instruct-Q4_K_M.gguf + sha256: a4f0e9a905b74886b79b72622c06a3219d6812818a564a53c39fc49032d7f842 + uri: huggingface://bartowski/FuseChat-Llama-3.2-3B-Instruct-GGUF/FuseChat-Llama-3.2-3B-Instruct-Q4_K_M.gguf - &qwen25 ## Qwen2.5 name: "qwen2.5-14b-instruct" @@ -1952,6 +1995,71 @@ - filename: Sailor2-20B-Chat-Q4_K_M.gguf sha256: 0cf8fcd367accee19702ef15ee964bddd5035bde034afddd838f818e7655534a uri: huggingface://bartowski/Sailor2-20B-Chat-GGUF/Sailor2-20B-Chat-Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "72b-qwen2.5-kunou-v1" + icon: https://huggingface.co/Sao10K/72B-Qwen2.5-Kunou-v1/resolve/main/knn.png + urls: + - https://huggingface.co/Sao10K/72B-Qwen2.5-Kunou-v1 + - https://huggingface.co/bartowski/72B-Qwen2.5-Kunou-v1-GGUF + description: | + I do not really have anything planned for this model other than it being a generalist, and Roleplay Model? It was just something made and planned in minutes. + Same with the 14 and 32B version. + Kunou's the name of an OC I worked on for a couple of years, for a... fanfic. mmm... + + A kind-of successor to L3-70B-Euryale-v2.2 in all but name? I'm keeping Stheno/Euryale lineage to Llama series for now. + I had a version made on top of Nemotron, a supposed Euryale 2.4 but that flopped hard, it was not my cup of tea. + This version is basically a better, more cleaned up Dataset used on Euryale and Stheno. + overrides: + parameters: + model: 72B-Qwen2.5-Kunou-v1-Q4_K_M.gguf + files: + - filename: 72B-Qwen2.5-Kunou-v1-Q4_K_M.gguf + sha256: 91907f29746625a62885793475956220b81d8a5a34b53686a1acd1d03fd403ea + uri: huggingface://bartowski/72B-Qwen2.5-Kunou-v1-GGUF/72B-Qwen2.5-Kunou-v1-Q4_K_M.gguf +- !!merge <<: *qwen25 + icon: https://i.imgur.com/OxX2Usi.png + name: "evathene-v1.3" + urls: + - https://huggingface.co/sophosympatheia/Evathene-v1.3 + - https://huggingface.co/bartowski/Evathene-v1.3-GGUF + description: | + This 72B parameter model is a merge of sophosympatheia/Evathene-v1.1 and sophosympatheia/Evathene-v1.2. See the merge recipe below for details. + overrides: + parameters: + model: Evathene-v1.3-Q4_K_M.gguf + files: + - filename: Evathene-v1.3-Q4_K_M.gguf + sha256: 0f54909b3ddca514994ee16417da8750f56e7bd59581b46ac47625c230e29d1f + uri: huggingface://bartowski/Evathene-v1.3-GGUF/Evathene-v1.3-Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "fusechat-qwen-2.5-7b-instruct" + icon: https://huggingface.co/FuseAI/FuseChat-Qwen-2.5-7B-Instruct/resolve/main/FuseChat-3.0.png + urls: + - https://huggingface.co/FuseAI/FuseChat-Qwen-2.5-7B-Instruct + - https://huggingface.co/bartowski/FuseChat-Qwen-2.5-7B-Instruct-GGUF + description: | + We present FuseChat-3.0, a series of models crafted to enhance performance by integrating the strengths of multiple source LLMs into more compact target LLMs. To achieve this fusion, we utilized four powerful source LLMs: Gemma-2-27B-It, Mistral-Large-Instruct-2407, Qwen-2.5-72B-Instruct, and Llama-3.1-70B-Instruct. For the target LLMs, we employed three widely-used smaller models—Llama-3.1-8B-Instruct, Gemma-2-9B-It, and Qwen-2.5-7B-Instruct—along with two even more compact models—Llama-3.2-3B-Instruct and Llama-3.2-1B-Instruct. The implicit model fusion process involves a two-stage training pipeline comprising Supervised Fine-Tuning (SFT) to mitigate distribution discrepancies between target and source LLMs, and Direct Preference Optimization (DPO) for learning preferences from multiple source LLMs. 
The resulting FuseChat-3.0 models demonstrated substantial improvements in tasks related to general conversation, instruction following, mathematics, and coding. Notably, when Llama-3.1-8B-Instruct served as the target LLM, our fusion approach achieved an average improvement of 6.8 points across 14 benchmarks. Moreover, it showed significant improvements of 37.1 and 30.1 points on instruction-following test sets AlpacaEval-2 and Arena-Hard respectively. We have released the FuseChat-3.0 models on Huggingface, stay tuned for the forthcoming dataset and code. + overrides: + parameters: + model: FuseChat-Qwen-2.5-7B-Instruct-Q4_K_M.gguf + files: + - filename: FuseChat-Qwen-2.5-7B-Instruct-Q4_K_M.gguf + sha256: 8cd8c317769f03125ac753c836ac92c5a76ee0b35502811d0e65bcbb8df9d55c + uri: huggingface://bartowski/FuseChat-Qwen-2.5-7B-Instruct-GGUF/FuseChat-Qwen-2.5-7B-Instruct-Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "neumind-math-7b-instruct" + urls: + - https://huggingface.co/prithivMLmods/Neumind-Math-7B-Instruct + - https://huggingface.co/QuantFactory/Neumind-Math-7B-Instruct-GGUF + description: | + The Neumind-Math-7B-Instruct is a fine-tuned model based on Qwen2.5-7B-Instruct, optimized for mathematical reasoning, step-by-step problem-solving, and instruction-based tasks in the mathematics domain. The model is designed for applications requiring structured reasoning, numerical computations, and mathematical proof generation. + overrides: + parameters: + model: Neumind-Math-7B-Instruct.Q4_K_M.gguf + files: + - filename: Neumind-Math-7B-Instruct.Q4_K_M.gguf + sha256: 3250abadeae4234e06dfaf7cf86fe871fe021e6c2dfcb4542c2a4f412d71e28c + uri: huggingface://QuantFactory/Neumind-Math-7B-Instruct-GGUF/Neumind-Math-7B-Instruct.Q4_K_M.gguf - &archfunct license: apache-2.0 tags: @@ -2005,6 +2113,20 @@ - filename: Arch-Function-3B.Q4_K_M.gguf sha256: 9945cb8d070498d163e5df90c1987f591d35e4fd2222a6c51bcfff848c4b573b uri: huggingface://mradermacher/Arch-Function-3B-GGUF/Arch-Function-3B.Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "qwen2-7b-multilingual-rp" + urls: + - https://huggingface.co/maywell/Qwen2-7B-Multilingual-RP + - https://huggingface.co/QuantFactory/Qwen2-7B-Multilingual-RP-GGUF + description: | + Multilingual Qwen2-7B model trained on Roleplaying. + overrides: + parameters: + model: Qwen2-7B-Multilingual-RP.Q4_K_M.gguf + files: + - filename: Qwen2-7B-Multilingual-RP.Q4_K_M.gguf + sha256: 31756c58fd135f2deb59b2d9b142f39134dc8d1a6eaa02f388dda7491fc95ccc + uri: huggingface://QuantFactory/Qwen2-7B-Multilingual-RP-GGUF/Qwen2-7B-Multilingual-RP.Q4_K_M.gguf - &smollm ## SmolLM url: "github:mudler/LocalAI/gallery/chatml.yaml@master" @@ -3008,6 +3130,22 @@ - filename: hermes-3-llama-3.1-8b-lorablated.Q4_K_M.gguf sha256: 8cff9d399a0583616fe1f290da6daa091ab5c5493d0e173a8fffb45202d79417 uri: huggingface://mlabonne/Hermes-3-Llama-3.1-8B-lorablated-GGUF/hermes-3-llama-3.1-8b-lorablated.Q4_K_M.gguf +- !!merge <<: *llama32 + name: "hermes-3-llama-3.2-3b" + icon: https://cdn-uploads.huggingface.co/production/uploads/6317aade83d8d2fd903192d9/-kj_KflXsdpcZoTQsvx7W.jpeg + urls: + - https://huggingface.co/NousResearch/Hermes-3-Llama-3.2-3B + - https://huggingface.co/bartowski/Hermes-3-Llama-3.2-3B-GGUF + description: | + Hermes 3 3B is a small but mighty new addition to the Hermes series of LLMs by Nous Research, and is Nous's first fine-tune in this parameter class. 
+ Hermes 3 is a generalist language model with many improvements over Hermes 2, including advanced agentic capabilities, much better roleplaying, reasoning, multi-turn conversation, long context coherence, and improvements across the board. + overrides: + parameters: + model: Hermes-3-Llama-3.2-3B-Q4_K_M.gguf + files: + - filename: Hermes-3-Llama-3.2-3B-Q4_K_M.gguf + sha256: 2e220a14ba4328fee38cf36c2c068261560f999fadb5725ce5c6d977cb5126b5 + uri: huggingface://bartowski/Hermes-3-Llama-3.2-3B-GGUF/Hermes-3-Llama-3.2-3B-Q4_K_M.gguf - !!merge <<: *llama31 name: "doctoraifinetune-3.1-8b-i1" urls: @@ -3761,6 +3899,35 @@ - filename: B-NIMITA-L3-8B-v0.02.Q4_K_M.gguf sha256: 625a54848dcd3f23bc06b639a7dfecae14142b5d177dd45acfe7724816bab4cd uri: huggingface://QuantFactory/B-NIMITA-L3-8B-v0.02-GGUF/B-NIMITA-L3-8B-v0.02.Q4_K_M.gguf +- !!merge <<: *llama31 + name: "deepthought-8b-llama-v0.01-alpha" + urls: + - https://huggingface.co/ruliad/deepthought-8b-llama-v0.01-alpha + - https://huggingface.co/bartowski/deepthought-8b-llama-v0.01-alpha-GGUF + description: | + Deepthought-8B is a small and capable reasoning model built on LLaMA-3.1 8B, designed to make AI reasoning more transparent and controllable. Despite its relatively small size, it achieves sophisticated reasoning capabilities that rival much larger models. + overrides: + parameters: + model: deepthought-8b-llama-v0.01-alpha-Q4_K_M.gguf + files: + - filename: deepthought-8b-llama-v0.01-alpha-Q4_K_M.gguf + sha256: 33195ba7b898ef8b2997d095e8be42adf1d0e1f6e8291cf07e026fc8e45903fd + uri: huggingface://bartowski/deepthought-8b-llama-v0.01-alpha-GGUF/deepthought-8b-llama-v0.01-alpha-Q4_K_M.gguf +- !!merge <<: *llama31 + name: "fusechat-llama-3.1-8b-instruct" + icon: https://huggingface.co/FuseAI/FuseChat-Llama-3.1-8B-Instruct/resolve/main/FuseChat-3.0.png + urls: + - https://huggingface.co/FuseAI/FuseChat-Llama-3.1-8B-Instruct + - https://huggingface.co/bartowski/FuseChat-Llama-3.1-8B-Instruct-GGUF + description: | + We present FuseChat-3.0, a series of models crafted to enhance performance by integrating the strengths of multiple source LLMs into more compact target LLMs. To achieve this fusion, we utilized four powerful source LLMs: Gemma-2-27B-It, Mistral-Large-Instruct-2407, Qwen-2.5-72B-Instruct, and Llama-3.1-70B-Instruct. For the target LLMs, we employed three widely-used smaller models—Llama-3.1-8B-Instruct, Gemma-2-9B-It, and Qwen-2.5-7B-Instruct—along with two even more compact models—Llama-3.2-3B-Instruct and Llama-3.2-1B-Instruct. The implicit model fusion process involves a two-stage training pipeline comprising Supervised Fine-Tuning (SFT) to mitigate distribution discrepancies between target and source LLMs, and Direct Preference Optimization (DPO) for learning preferences from multiple source LLMs. The resulting FuseChat-3.0 models demonstrated substantial improvements in tasks related to general conversation, instruction following, mathematics, and coding. Notably, when Llama-3.1-8B-Instruct served as the target LLM, our fusion approach achieved an average improvement of 6.8 points across 14 benchmarks. Moreover, it showed significant improvements of 37.1 and 30.1 points on instruction-following test sets AlpacaEval-2 and Arena-Hard respectively. We have released the FuseChat-3.0 models on Huggingface, stay tuned for the forthcoming dataset and code.
+ overrides: + parameters: + model: FuseChat-Llama-3.1-8B-Instruct-Q4_K_M.gguf + files: + - filename: FuseChat-Llama-3.1-8B-Instruct-Q4_K_M.gguf + sha256: fe58c8c9b695e36e6b0ee5e4d81ff71ea0a4f1a11fa7bb16e8d6f1b35a58dff6 + uri: huggingface://bartowski/FuseChat-Llama-3.1-8B-Instruct-GGUF/FuseChat-Llama-3.1-8B-Instruct-Q4_K_M.gguf - &deepseek ## Deepseek url: "github:mudler/LocalAI/gallery/deepseek.yaml@master" @@ -4146,6 +4313,20 @@ - filename: Marco-o1.Q4_K_M.gguf sha256: 54dd9554cb54609bf0bf4b367dfba192fc982a2fc6b87a0f56fba5ea82762d0d uri: huggingface://QuantFactory/Marco-o1-GGUF/Marco-o1.Q4_K_M.gguf +- !!merge <<: *qwen2 + name: "marco-o1-uncensored" + urls: + - https://huggingface.co/thirdeyeai/marco-o1-uncensored + - https://huggingface.co/QuantFactory/marco-o1-uncensored-GGUF + description: | + Uncensored version of marco-o1 + overrides: + parameters: + model: marco-o1-uncensored.Q4_K_M.gguf + files: + - filename: marco-o1-uncensored.Q4_K_M.gguf + sha256: ad0440270a7254098f90779744d3e5b34fe49b7baf97c819909ba9c5648cc0d9 + uri: huggingface://QuantFactory/marco-o1-uncensored-GGUF/marco-o1-uncensored.Q4_K_M.gguf - &mistral03 ## START Mistral url: "github:mudler/LocalAI/gallery/mistral-0.3.yaml@master" @@ -4595,6 +4776,48 @@ - filename: MN-Chunky-Lotus-12B.Q4_K_M.gguf sha256: 363defe0a769fdb715dab75517966a0a80bcdd981a610d4c759099b6c8ff143a uri: huggingface://QuantFactory/MN-Chunky-Lotus-12B-GGUF/MN-Chunky-Lotus-12B.Q4_K_M.gguf +- !!merge <<: *mistral03 + url: "github:mudler/LocalAI/gallery/chatml.yaml@master" + name: "chronos-gold-12b-1.0" + icon: https://cdn-uploads.huggingface.co/production/uploads/630417380907b9a115c6aa9f/3hc8zt8fzKdO3qHK1p1mW.webp + urls: + - https://huggingface.co/elinas/Chronos-Gold-12B-1.0 + - https://huggingface.co/mradermacher/Chronos-Gold-12B-1.0-GGUF + description: | + Chronos Gold 12B 1.0 is a very unique model that applies to domain areas such as general chatbot functionality, roleplay, and storywriting. The model has been observed to write up to 2250 tokens in a single sequence. The model was trained at a sequence length of 16384 (16k) and will still retain the apparent 128k context length from Mistral-Nemo, though it deteriorates over time like regular Nemo does based on the RULER Test + + As a result, it is recommended to keep your sequence length max at 16384, or you will experience performance degradation. + + The base model is mistralai/Mistral-Nemo-Base-2407 which was heavily modified to produce a more coherent model, comparable to much larger models. + + Chronos Gold 12B-1.0 re-creates the uniqueness of the original Chronos with significantly enhanced prompt adherence (following), coherence, a modern dataset, as well as supporting a majority of "character card" formats in applications like SillyTavern. + + It went through an iterative and objective merge process as my previous models and was further finetuned on a dataset curated for it. + + The specifics of the model will not be disclosed at the time due to dataset ownership.
+ overrides: + parameters: + model: Chronos-Gold-12B-1.0.Q4_K_M.gguf + files: + - filename: Chronos-Gold-12B-1.0.Q4_K_M.gguf + sha256: d75a6ed28781f0ea6fa6e58c0b25dfecdd160d4cab64aaf511ea156e99a1e1f3 + uri: huggingface://mradermacher/Chronos-Gold-12B-1.0-GGUF/Chronos-Gold-12B-1.0.Q4_K_M.gguf +- !!merge <<: *mistral03 + name: "naturallm-7b-instruct" + url: "github:mudler/LocalAI/gallery/chatml.yaml@master" + urls: + - https://huggingface.co/qingy2024/NaturalLM-7B-Instruct + - https://huggingface.co/bartowski/NaturalLM-7B-Instruct-GGUF + description: | + This Mistral 7B fine-tune is trained (for 150 steps) to talk like a human, not a "helpful assistant"! + It's also very beta right now. The dataset (qingy2024/Natural-Text-ShareGPT) can definitely be improved. + overrides: + parameters: + model: NaturalLM-7B-Instruct-Q4_K_M.gguf + files: + - filename: NaturalLM-7B-Instruct-Q4_K_M.gguf + sha256: 15b2f34116f690fea35790a9392b8a2190fe25827e370d426e88a2a543f4dcee + uri: huggingface://bartowski/NaturalLM-7B-Instruct-GGUF/NaturalLM-7B-Instruct-Q4_K_M.gguf - &mudler ### START mudler's LocalAI specific-models url: "github:mudler/LocalAI/gallery/mudler.yaml@master" @@ -5383,6 +5606,21 @@ - filename: BgGPT-Gemma-2-2.6B-IT-v1.0.Q4_K_M.gguf sha256: 1e92fe80ccad80e97076ee26b002c2280f075dfe2507d534b46a4391a077f319 uri: huggingface://QuantFactory/BgGPT-Gemma-2-2.6B-IT-v1.0-GGUF/BgGPT-Gemma-2-2.6B-IT-v1.0.Q4_K_M.gguf +- !!merge <<: *gemma + name: "fusechat-gemma-2-9b-instruct" + icon: "https://huggingface.co/FuseAI/FuseChat-Gemma-2-9B-Instruct/resolve/main/FuseChat-3.0.png" + urls: + - https://huggingface.co/FuseAI/FuseChat-Gemma-2-9B-Instruct + - https://huggingface.co/bartowski/FuseChat-Gemma-2-9B-Instruct-GGUF + description: | + We present FuseChat-3.0, a series of models crafted to enhance performance by integrating the strengths of multiple source LLMs into more compact target LLMs. To achieve this fusion, we utilized four powerful source LLMs: Gemma-2-27B-It, Mistral-Large-Instruct-2407, Qwen-2.5-72B-Instruct, and Llama-3.1-70B-Instruct. For the target LLMs, we employed three widely-used smaller models—Llama-3.1-8B-Instruct, Gemma-2-9B-It, and Qwen-2.5-7B-Instruct—along with two even more compact models—Llama-3.2-3B-Instruct and Llama-3.2-1B-Instruct. The implicit model fusion process involves a two-stage training pipeline comprising Supervised Fine-Tuning (SFT) to mitigate distribution discrepancies between target and source LLMs, and Direct Preference Optimization (DPO) for learning preferences from multiple source LLMs. The resulting FuseChat-3.0 models demonstrated substantial improvements in tasks related to general conversation, instruction following, mathematics, and coding. Notably, when Llama-3.1-8B-Instruct served as the target LLM, our fusion approach achieved an average improvement of 6.8 points across 14 benchmarks. Moreover, it showed significant improvements of 37.1 and 30.1 points on instruction-following test sets AlpacaEval-2 and Arena-Hard respectively. We have released the FuseChat-3.0 models on Huggingface, stay tuned for the forthcoming dataset and code. 
+ overrides: + parameters: + model: FuseChat-Gemma-2-9B-Instruct-Q4_K_M.gguf + files: + - filename: FuseChat-Gemma-2-9B-Instruct-Q4_K_M.gguf + sha256: f5aef201be68f344bebff3433af87aac6428fd227adfd7e468c8bfbcf9660ece + uri: huggingface://bartowski/FuseChat-Gemma-2-9B-Instruct-GGUF/FuseChat-Gemma-2-9B-Instruct-Q4_K_M.gguf - &llama3 url: "github:mudler/LocalAI/gallery/llama3-instruct.yaml@master" icon: https://cdn-uploads.huggingface.co/production/uploads/642cc1c253e76b4c2286c58e/aJJxKus1wP5N-euvHEUq7.png @@ -9513,6 +9751,10 @@ llama3.2 embeddings model. Using as drop-in replacement for bert-embeddings tags: - embeddings + overrides: + embeddings: true + parameters: + model: llama-3.2-1b-instruct-q4_k_m.gguf ## Stable Diffusion - url: github:mudler/LocalAI/gallery/stablediffusion.yaml@master license: "BSD-3"
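Note on the KV-cache change in backend/cpp/llama/grpc-server.cpp above: the gRPC fields cachetypekey and cachetypevalue used to be copied into params.cache_type_k / params.cache_type_v as raw strings; they are now mapped through kv_cache_type_from_str, which accepts only the types listed in the new kv_cache_types table and throws "Unsupported cache type: ..." for anything else, with get_all_kv_cache_types available to list the valid names in error messages. A minimal, self-contained sketch of that lookup pattern follows; it uses a stand-in enum and name table instead of the real ggml_type / ggml_type_name from ggml, so the names in it are illustrative only and not the backend's actual API:

// Sketch only: stand-ins for ggml_type / ggml_type_name so the example compiles on its own.
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

enum class cache_type { F16, Q8_0, Q4_0 };          // stand-in for ggml_type

static const char * cache_type_name(cache_type t) { // stand-in for ggml_type_name
    switch (t) {
        case cache_type::F16:  return "f16";
        case cache_type::Q8_0: return "q8_0";
        case cache_type::Q4_0: return "q4_0";
    }
    return "?";
}

// Whitelist of supported KV-cache types, mirroring kv_cache_types in the diff above.
static const std::vector<cache_type> kv_cache_types = {
    cache_type::F16,
    cache_type::Q8_0,
    cache_type::Q4_0,
};

// Map a user-supplied string (e.g. the cachetypekey gRPC field) to a concrete type,
// rejecting anything outside the whitelist -- the same pattern as kv_cache_type_from_str.
static cache_type kv_cache_type_from_str(const std::string & s) {
    for (const auto & type : kv_cache_types) {
        if (cache_type_name(type) == s) {
            return type;
        }
    }
    throw std::runtime_error("Unsupported cache type: " + s);
}

int main() {
    const cache_type k = kv_cache_type_from_str("q8_0");   // analogue of params.cache_type_k
    std::cout << "cache_type_k = " << cache_type_name(k) << "\n";
    try {
        kv_cache_type_from_str("q2_k");                     // not whitelisted -> throws
    } catch (const std::exception & e) {
        std::cout << e.what() << "\n";
    }
    return 0;
}

In params_parse this means a model config with an unrecognized cache_type_k or cache_type_v now fails at load time with a clear error instead of passing an arbitrary string down to llama.cpp, which matches the CPPLLAMA_VERSION bump at the top of the diff, where params.cache_type_k / cache_type_v are assigned ggml_type values rather than strings.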