diff --git a/Makefile b/Makefile index 36c7be21561b..4226c5d7fab5 100644 --- a/Makefile +++ b/Makefile @@ -8,7 +8,7 @@ DETECT_LIBS?=true # llama.cpp versions GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be -CPPLLAMA_VERSION?=dafae66cc242eb766797194d3c85c5e502625623 +CPPLLAMA_VERSION?=08ea539df211e46bb4d0dd275e541cb591d5ebc8 # whisper.cpp version WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp diff --git a/backend/cpp/llama/grpc-server.cpp b/backend/cpp/llama/grpc-server.cpp index ea5c4e34a465..98dd8fde1c74 100644 --- a/backend/cpp/llama/grpc-server.cpp +++ b/backend/cpp/llama/grpc-server.cpp @@ -681,7 +681,6 @@ struct llama_server_context slot->sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat); slot->sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau); slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta); - slot->sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl); slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep); slot->sparams.seed = json_value(data, "seed", default_sparams.seed); slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar); @@ -1213,13 +1212,12 @@ struct llama_server_context {"mirostat", slot.sparams.mirostat}, {"mirostat_tau", slot.sparams.mirostat_tau}, {"mirostat_eta", slot.sparams.mirostat_eta}, - {"penalize_nl", slot.sparams.penalize_nl}, {"stop", slot.params.antiprompt}, {"n_predict", slot.params.n_predict}, {"n_keep", params.n_keep}, {"ignore_eos", slot.sparams.ignore_eos}, {"stream", slot.params.stream}, - // {"logit_bias", slot.sparams.logit_bias}, + // {"logit_bias", slot.sparams.logit_bias}, {"n_probs", slot.sparams.n_probs}, {"min_keep", slot.sparams.min_keep}, {"grammar", slot.sparams.grammar}, @@ -2112,7 +2110,6 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama // slot->sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat); // slot->sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau); // slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta); - // slot->sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl); // slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep); // slot->params.seed = json_value(data, "seed", default_params.seed); // slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar); @@ -2135,7 +2132,6 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama data["mirostat"] = predict->mirostat(); data["mirostat_tau"] = predict->mirostattau(); data["mirostat_eta"] = predict->mirostateta(); - data["penalize_nl"] = predict->penalizenl(); data["n_keep"] = predict->nkeep(); data["seed"] = predict->seed(); data["grammar"] = predict->grammar(); @@ -2181,7 +2177,6 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama // llama.params.sparams.mirostat = predict->mirostat(); // llama.params.sparams.mirostat_tau = predict->mirostattau(); // llama.params.sparams.mirostat_eta = predict->mirostateta(); -// llama.params.sparams.penalize_nl = predict->penalizenl(); // llama.params.n_keep = predict->nkeep(); // llama.params.seed = predict->seed(); // llama.params.sparams.grammar = predict->grammar(); @@ -2228,6 +2223,35 @@ json parse_options(bool streaming, const backend::PredictOptions* 
predict, llama // } // } +const std::vector<ggml_type> kv_cache_types = { + GGML_TYPE_F32, + GGML_TYPE_F16, + GGML_TYPE_BF16, + GGML_TYPE_Q8_0, + GGML_TYPE_Q4_0, + GGML_TYPE_Q4_1, + GGML_TYPE_IQ4_NL, + GGML_TYPE_Q5_0, + GGML_TYPE_Q5_1, +}; + +static ggml_type kv_cache_type_from_str(const std::string & s) { + for (const auto & type : kv_cache_types) { + if (ggml_type_name(type) == s) { + return type; + } + } + throw std::runtime_error("Unsupported cache type: " + s); +} + +static std::string get_all_kv_cache_types() { + std::ostringstream msg; + for (const auto & type : kv_cache_types) { + msg << ggml_type_name(type) << (&type == &kv_cache_types.back() ? "" : ", "); + } + return msg.str(); +} + static void params_parse(const backend::ModelOptions* request, common_params & params) { @@ -2242,10 +2266,10 @@ static void params_parse(const backend::ModelOptions* request, // params.model_alias ?? params.model_alias = request->modelfile(); if (!request->cachetypekey().empty()) { - params.cache_type_k = request->cachetypekey(); + params.cache_type_k = kv_cache_type_from_str(request->cachetypekey()); } if (!request->cachetypevalue().empty()) { - params.cache_type_v = request->cachetypevalue(); + params.cache_type_v = kv_cache_type_from_str(request->cachetypevalue()); } params.n_ctx = request->contextsize(); //params.memory_f16 = request->f16memory(); diff --git a/backend/python/autogptq/requirements-intel.txt b/backend/python/autogptq/requirements-intel.txt index d5e0173ea407..cec8bff477f5 100644 --- a/backend/python/autogptq/requirements-intel.txt +++ b/backend/python/autogptq/requirements-intel.txt @@ -2,4 +2,4 @@ intel-extension-for-pytorch torch optimum[openvino] -setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406 \ No newline at end of file +setuptools \ No newline at end of file diff --git a/backend/python/bark/requirements-intel.txt b/backend/python/bark/requirements-intel.txt index c0e4dcaa5b99..1f043bbfdd02 100644 --- a/backend/python/bark/requirements-intel.txt +++ b/backend/python/bark/requirements-intel.txt @@ -3,6 +3,6 @@ intel-extension-for-pytorch torch torchaudio optimum[openvino] -setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406 +setuptools transformers accelerate \ No newline at end of file diff --git a/backend/python/common/libbackend.sh b/backend/python/common/libbackend.sh index 934b1fd37b14..6013cf765d4c 100644 --- a/backend/python/common/libbackend.sh +++ b/backend/python/common/libbackend.sh @@ -17,6 +17,9 @@ # LIMIT_TARGETS="cublas12" # source $(dirname $0)/../common/libbackend.sh # + +PYTHON_VERSION="3.10" + function init() { # Name of the backend (directory name) BACKEND_NAME=${PWD##*/} @@ -88,7 +91,7 @@ function getBuildProfile() { # always result in an activated virtual environment function ensureVenv() { if [ !
-d "${EDIR}/venv" ]; then - uv venv ${EDIR}/venv + uv venv --python ${PYTHON_VERSION} ${EDIR}/venv echo "virtualenv created" fi diff --git a/backend/python/coqui/requirements-intel.txt b/backend/python/coqui/requirements-intel.txt index de3b4ee4b38c..7ed2fb4265e9 100644 --- a/backend/python/coqui/requirements-intel.txt +++ b/backend/python/coqui/requirements-intel.txt @@ -3,7 +3,7 @@ intel-extension-for-pytorch torch torchaudio optimum[openvino] -setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406 +setuptools transformers accelerate coqui-tts \ No newline at end of file diff --git a/backend/python/diffusers/requirements-intel.txt b/backend/python/diffusers/requirements-intel.txt index 566278a88a23..bd6632bfe812 100644 --- a/backend/python/diffusers/requirements-intel.txt +++ b/backend/python/diffusers/requirements-intel.txt @@ -3,7 +3,7 @@ intel-extension-for-pytorch torch torchvision optimum[openvino] -setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406 +setuptools diffusers opencv-python transformers diff --git a/backend/python/openvoice/requirements.txt b/backend/python/openvoice/requirements.txt index 6806d3e1c4a8..e6a1e5a5d612 100644 --- a/backend/python/openvoice/requirements.txt +++ b/backend/python/openvoice/requirements.txt @@ -18,3 +18,4 @@ jieba==0.42.1 gradio==3.48.0 langid==1.1.6 llvmlite==0.43.0 +setuptools \ No newline at end of file diff --git a/backend/python/parler-tts/requirements-intel.txt b/backend/python/parler-tts/requirements-intel.txt index c0e4dcaa5b99..bcb8900e9bac 100644 --- a/backend/python/parler-tts/requirements-intel.txt +++ b/backend/python/parler-tts/requirements-intel.txt @@ -3,6 +3,5 @@ intel-extension-for-pytorch torch torchaudio optimum[openvino] -setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406 transformers accelerate \ No newline at end of file diff --git a/backend/python/parler-tts/requirements.txt b/backend/python/parler-tts/requirements.txt index 75ea8a5915fd..faf4ea3d1f1b 100644 --- a/backend/python/parler-tts/requirements.txt +++ b/backend/python/parler-tts/requirements.txt @@ -1,3 +1,4 @@ grpcio==1.68.1 certifi llvmlite==0.43.0 +setuptools \ No newline at end of file diff --git a/backend/python/rerankers/requirements-intel.txt b/backend/python/rerankers/requirements-intel.txt index e6bb4cc70fea..a3cc600c9105 100644 --- a/backend/python/rerankers/requirements-intel.txt +++ b/backend/python/rerankers/requirements-intel.txt @@ -5,4 +5,4 @@ accelerate torch rerankers[transformers] optimum[openvino] -setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406 \ No newline at end of file +setuptools \ No newline at end of file diff --git a/backend/python/sentencetransformers/requirements-intel.txt b/backend/python/sentencetransformers/requirements-intel.txt index 56e1744633f9..23e0d5f205c2 100644 --- a/backend/python/sentencetransformers/requirements-intel.txt +++ b/backend/python/sentencetransformers/requirements-intel.txt @@ -2,7 +2,7 @@ intel-extension-for-pytorch torch optimum[openvino] -setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406 +setuptools accelerate sentence-transformers==3.3.1 transformers \ No newline at end of file diff --git a/backend/python/transformers-musicgen/requirements-intel.txt b/backend/python/transformers-musicgen/requirements-intel.txt index 608d693939f9..bb191163271f 100644 --- a/backend/python/transformers-musicgen/requirements-intel.txt +++ b/backend/python/transformers-musicgen/requirements-intel.txt @@ -4,4 +4,4 @@ transformers accelerate torch 
optimum[openvino] -setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406 \ No newline at end of file +setuptools \ No newline at end of file diff --git a/backend/python/transformers/requirements.txt b/backend/python/transformers/requirements.txt index b556b9f1c140..d981fd99588e 100644 --- a/backend/python/transformers/requirements.txt +++ b/backend/python/transformers/requirements.txt @@ -1,4 +1,4 @@ grpcio==1.68.1 protobuf certifi -setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406 \ No newline at end of file +setuptools \ No newline at end of file diff --git a/backend/python/vall-e-x/requirements-intel.txt b/backend/python/vall-e-x/requirements-intel.txt index adbabeac74f2..284e7131cd26 100644 --- a/backend/python/vall-e-x/requirements-intel.txt +++ b/backend/python/vall-e-x/requirements-intel.txt @@ -3,5 +3,4 @@ intel-extension-for-pytorch accelerate torch torchaudio -optimum[openvino] -setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406 \ No newline at end of file +optimum[openvino] \ No newline at end of file diff --git a/backend/python/vall-e-x/requirements.txt b/backend/python/vall-e-x/requirements.txt index 8e4eabf1b483..d981fd99588e 100644 --- a/backend/python/vall-e-x/requirements.txt +++ b/backend/python/vall-e-x/requirements.txt @@ -1,3 +1,4 @@ grpcio==1.68.1 protobuf -certifi \ No newline at end of file +certifi +setuptools \ No newline at end of file diff --git a/backend/python/vllm/requirements-intel.txt b/backend/python/vllm/requirements-intel.txt index 9544336884ea..36326f95b5e1 100644 --- a/backend/python/vllm/requirements-intel.txt +++ b/backend/python/vllm/requirements-intel.txt @@ -4,5 +4,5 @@ accelerate torch transformers optimum[openvino] -setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406 +setuptools bitsandbytes \ No newline at end of file diff --git a/core/http/app_test.go b/core/http/app_test.go index 34ebacf74408..7c57ba21a701 100644 --- a/core/http/app_test.go +++ b/core/http/app_test.go @@ -704,7 +704,7 @@ var _ = Describe("API test", func() { Expect(err).ToNot(HaveOccurred(), fmt.Sprint(resp)) Expect(resp.StatusCode).To(Equal(200), fmt.Sprint(string(dat))) - Expect(resp.Header.Get("Content-Type")).To(Equal("audio/x-wav")) + Expect(resp.Header.Get("Content-Type")).To(Or(Equal("audio/x-wav"), Equal("audio/vnd.wave"))) }) It("installs and is capable to generate images", Label("stablediffusion"), func() { if runtime.GOOS != "linux" { diff --git a/gallery/index.yaml b/gallery/index.yaml index 37664dd821ab..99c0e9a31a97 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -47,6 +47,36 @@ - filename: Llama-3.3-70B-Instruct.Q4_K_M.gguf sha256: 4f3b04ecae278bdb0fd545b47c210bc5edf823e5ebf7d41e0b526c81d54b1ff3 uri: huggingface://MaziyarPanahi/Llama-3.3-70B-Instruct-GGUF/Llama-3.3-70B-Instruct.Q4_K_M.gguf +- !!merge <<: *llama33 + name: "l3.3-70b-euryale-v2.3" + icon: https://huggingface.co/Sao10K/L3.3-70B-Euryale-v2.3/resolve/main/Eury.png + urls: + - https://huggingface.co/Sao10K/L3.3-70B-Euryale-v2.3 + - https://huggingface.co/bartowski/L3.3-70B-Euryale-v2.3-GGUF + description: | + A direct replacement / successor to Euryale v2.2, not Hanami-x1, though it is slightly better than them in my opinion. 
+ overrides: + parameters: + model: L3.3-70B-Euryale-v2.3-Q4_K_M.gguf + files: + - filename: L3.3-70B-Euryale-v2.3-Q4_K_M.gguf + sha256: 4e78bb0e65886bfcff89b829f6d38aa6f6846988bb8291857e387e3f60b3217b + uri: huggingface://bartowski/L3.3-70B-Euryale-v2.3-GGUF/L3.3-70B-Euryale-v2.3-Q4_K_M.gguf +- !!merge <<: *llama33 + name: "l3.3-ms-evayale-70b" + icon: https://cdn-uploads.huggingface.co/production/uploads/64545af5ec40bbbd01242ca6/HFCaVzRpiE05Y46p41qRy.webp + urls: + - https://huggingface.co/Steelskull/L3.3-MS-Evayale-70B + - https://huggingface.co/bartowski/L3.3-MS-Evayale-70B-GGUF + description: | + This model was created as I liked the storytelling of EVA but the prose and details of scenes from EURYALE, my goal is to merge the robust storytelling of both models while attempting to maintain the positives of both models. + overrides: + parameters: + model: L3.3-MS-Evayale-70B-Q4_K_M.gguf + files: + - filename: L3.3-MS-Evayale-70B-Q4_K_M.gguf + sha256: f941d88870fec8343946517a1802d159d23f3971eeea50b6cf12295330bd29cc + uri: huggingface://bartowski/L3.3-MS-Evayale-70B-GGUF/L3.3-MS-Evayale-70B-Q4_K_M.gguf - &rwkv url: "github:mudler/LocalAI/gallery/rwkv.yaml@master" name: "rwkv-6-world-7b" @@ -428,7 +458,6 @@ urls: - https://huggingface.co/hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF overrides: - embeddings: true parameters: model: llama-3.2-1b-instruct-q4_k_m.gguf files: @@ -777,6 +806,20 @@ - filename: Llama-SmolTalk-3.2-1B-Instruct.Q4_K_M.gguf sha256: 03d8d05e3821f4caa65defa82baaff658484d4405b66546431528153ceef4d9e uri: huggingface://mradermacher/Llama-SmolTalk-3.2-1B-Instruct-GGUF/Llama-SmolTalk-3.2-1B-Instruct.Q4_K_M.gguf +- !!merge <<: *llama32 + name: "fusechat-llama-3.2-3b-instruct" + urls: + - https://huggingface.co/FuseAI/FuseChat-Llama-3.2-3B-Instruct + - https://huggingface.co/bartowski/FuseChat-Llama-3.2-3B-Instruct-GGUF + description: | + We present FuseChat-3.0, a series of models crafted to enhance performance by integrating the strengths of multiple source LLMs into more compact target LLMs. To achieve this fusion, we utilized four powerful source LLMs: Gemma-2-27B-It, Mistral-Large-Instruct-2407, Qwen-2.5-72B-Instruct, and Llama-3.1-70B-Instruct. For the target LLMs, we employed three widely-used smaller models—Llama-3.1-8B-Instruct, Gemma-2-9B-It, and Qwen-2.5-7B-Instruct—along with two even more compact models—Llama-3.2-3B-Instruct and Llama-3.2-1B-Instruct. The implicit model fusion process involves a two-stage training pipeline comprising Supervised Fine-Tuning (SFT) to mitigate distribution discrepancies between target and source LLMs, and Direct Preference Optimization (DPO) for learning preferences from multiple source LLMs. The resulting FuseChat-3.0 models demonstrated substantial improvements in tasks related to general conversation, instruction following, mathematics, and coding. Notably, when Llama-3.1-8B-Instruct served as the target LLM, our fusion approach achieved an average improvement of 6.8 points across 14 benchmarks. Moreover, it showed significant improvements of 37.1 and 30.1 points on instruction-following test sets AlpacaEval-2 and Arena-Hard respectively. We have released the FuseChat-3.0 models on Huggingface, stay tuned for the forthcoming dataset and code. 
+ overrides: + parameters: + model: FuseChat-Llama-3.2-3B-Instruct-Q4_K_M.gguf + files: + - filename: FuseChat-Llama-3.2-3B-Instruct-Q4_K_M.gguf + sha256: a4f0e9a905b74886b79b72622c06a3219d6812818a564a53c39fc49032d7f842 + uri: huggingface://bartowski/FuseChat-Llama-3.2-3B-Instruct-GGUF/FuseChat-Llama-3.2-3B-Instruct-Q4_K_M.gguf - &qwen25 ## Qwen2.5 name: "qwen2.5-14b-instruct" @@ -1952,6 +1995,71 @@ - filename: Sailor2-20B-Chat-Q4_K_M.gguf sha256: 0cf8fcd367accee19702ef15ee964bddd5035bde034afddd838f818e7655534a uri: huggingface://bartowski/Sailor2-20B-Chat-GGUF/Sailor2-20B-Chat-Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "72b-qwen2.5-kunou-v1" + icon: https://huggingface.co/Sao10K/72B-Qwen2.5-Kunou-v1/resolve/main/knn.png + urls: + - https://huggingface.co/Sao10K/72B-Qwen2.5-Kunou-v1 + - https://huggingface.co/bartowski/72B-Qwen2.5-Kunou-v1-GGUF + description: | + I do not really have anything planned for this model other than it being a generalist, and Roleplay Model? It was just something made and planned in minutes. + Same with the 14 and 32B version. + Kunou's the name of an OC I worked on for a couple of years, for a... fanfic. mmm... + + A kind-of successor to L3-70B-Euryale-v2.2 in all but name? I'm keeping Stheno/Euryale lineage to Llama series for now. + I had a version made on top of Nemotron, a supposed Euryale 2.4 but that flopped hard, it was not my cup of tea. + This version is basically a better, more cleaned up Dataset used on Euryale and Stheno. + overrides: + parameters: + model: 72B-Qwen2.5-Kunou-v1-Q4_K_M.gguf + files: + - filename: 72B-Qwen2.5-Kunou-v1-Q4_K_M.gguf + sha256: 91907f29746625a62885793475956220b81d8a5a34b53686a1acd1d03fd403ea + uri: huggingface://bartowski/72B-Qwen2.5-Kunou-v1-GGUF/72B-Qwen2.5-Kunou-v1-Q4_K_M.gguf +- !!merge <<: *qwen25 + icon: https://i.imgur.com/OxX2Usi.png + name: "evathene-v1.3" + urls: + - https://huggingface.co/sophosympatheia/Evathene-v1.3 + - https://huggingface.co/bartowski/Evathene-v1.3-GGUF + description: | + This 72B parameter model is a merge of sophosympatheia/Evathene-v1.1 and sophosympatheia/Evathene-v1.2. See the merge recipe below for details. + overrides: + parameters: + model: Evathene-v1.3-Q4_K_M.gguf + files: + - filename: Evathene-v1.3-Q4_K_M.gguf + sha256: 0f54909b3ddca514994ee16417da8750f56e7bd59581b46ac47625c230e29d1f + uri: huggingface://bartowski/Evathene-v1.3-GGUF/Evathene-v1.3-Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "fusechat-qwen-2.5-7b-instruct" + icon: https://huggingface.co/FuseAI/FuseChat-Qwen-2.5-7B-Instruct/resolve/main/FuseChat-3.0.png + urls: + - https://huggingface.co/FuseAI/FuseChat-Qwen-2.5-7B-Instruct + - https://huggingface.co/bartowski/FuseChat-Qwen-2.5-7B-Instruct-GGUF + description: | + We present FuseChat-3.0, a series of models crafted to enhance performance by integrating the strengths of multiple source LLMs into more compact target LLMs. To achieve this fusion, we utilized four powerful source LLMs: Gemma-2-27B-It, Mistral-Large-Instruct-2407, Qwen-2.5-72B-Instruct, and Llama-3.1-70B-Instruct. For the target LLMs, we employed three widely-used smaller models—Llama-3.1-8B-Instruct, Gemma-2-9B-It, and Qwen-2.5-7B-Instruct—along with two even more compact models—Llama-3.2-3B-Instruct and Llama-3.2-1B-Instruct. The implicit model fusion process involves a two-stage training pipeline comprising Supervised Fine-Tuning (SFT) to mitigate distribution discrepancies between target and source LLMs, and Direct Preference Optimization (DPO) for learning preferences from multiple source LLMs. 
The resulting FuseChat-3.0 models demonstrated substantial improvements in tasks related to general conversation, instruction following, mathematics, and coding. Notably, when Llama-3.1-8B-Instruct served as the target LLM, our fusion approach achieved an average improvement of 6.8 points across 14 benchmarks. Moreover, it showed significant improvements of 37.1 and 30.1 points on instruction-following test sets AlpacaEval-2 and Arena-Hard respectively. We have released the FuseChat-3.0 models on Huggingface, stay tuned for the forthcoming dataset and code. + overrides: + parameters: + model: FuseChat-Qwen-2.5-7B-Instruct-Q4_K_M.gguf + files: + - filename: FuseChat-Qwen-2.5-7B-Instruct-Q4_K_M.gguf + sha256: 8cd8c317769f03125ac753c836ac92c5a76ee0b35502811d0e65bcbb8df9d55c + uri: huggingface://bartowski/FuseChat-Qwen-2.5-7B-Instruct-GGUF/FuseChat-Qwen-2.5-7B-Instruct-Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "neumind-math-7b-instruct" + urls: + - https://huggingface.co/prithivMLmods/Neumind-Math-7B-Instruct + - https://huggingface.co/QuantFactory/Neumind-Math-7B-Instruct-GGUF + description: | + The Neumind-Math-7B-Instruct is a fine-tuned model based on Qwen2.5-7B-Instruct, optimized for mathematical reasoning, step-by-step problem-solving, and instruction-based tasks in the mathematics domain. The model is designed for applications requiring structured reasoning, numerical computations, and mathematical proof generation. + overrides: + parameters: + model: Neumind-Math-7B-Instruct.Q4_K_M.gguf + files: + - filename: Neumind-Math-7B-Instruct.Q4_K_M.gguf + sha256: 3250abadeae4234e06dfaf7cf86fe871fe021e6c2dfcb4542c2a4f412d71e28c + uri: huggingface://QuantFactory/Neumind-Math-7B-Instruct-GGUF/Neumind-Math-7B-Instruct.Q4_K_M.gguf - &archfunct license: apache-2.0 tags: @@ -2005,6 +2113,20 @@ - filename: Arch-Function-3B.Q4_K_M.gguf sha256: 9945cb8d070498d163e5df90c1987f591d35e4fd2222a6c51bcfff848c4b573b uri: huggingface://mradermacher/Arch-Function-3B-GGUF/Arch-Function-3B.Q4_K_M.gguf +- !!merge <<: *qwen25 + name: "qwen2-7b-multilingual-rp" + urls: + - https://huggingface.co/maywell/Qwen2-7B-Multilingual-RP + - https://huggingface.co/QuantFactory/Qwen2-7B-Multilingual-RP-GGUF + description: | + Multilingual Qwen2-7B model trained on Roleplaying. + overrides: + parameters: + model: Qwen2-7B-Multilingual-RP.Q4_K_M.gguf + files: + - filename: Qwen2-7B-Multilingual-RP.Q4_K_M.gguf + sha256: 31756c58fd135f2deb59b2d9b142f39134dc8d1a6eaa02f388dda7491fc95ccc + uri: huggingface://QuantFactory/Qwen2-7B-Multilingual-RP-GGUF/Qwen2-7B-Multilingual-RP.Q4_K_M.gguf - &smollm ## SmolLM url: "github:mudler/LocalAI/gallery/chatml.yaml@master" @@ -3008,6 +3130,22 @@ - filename: hermes-3-llama-3.1-8b-lorablated.Q4_K_M.gguf sha256: 8cff9d399a0583616fe1f290da6daa091ab5c5493d0e173a8fffb45202d79417 uri: huggingface://mlabonne/Hermes-3-Llama-3.1-8B-lorablated-GGUF/hermes-3-llama-3.1-8b-lorablated.Q4_K_M.gguf +- !!merge <<: *llama32 + name: "hermes-3-llama-3.2-3b" + icon: https://cdn-uploads.huggingface.co/production/uploads/6317aade83d8d2fd903192d9/-kj_KflXsdpcZoTQsvx7W.jpeg + urls: + - https://huggingface.co/NousResearch/Hermes-3-Llama-3.2-3B + - https://huggingface.co/bartowski/Hermes-3-Llama-3.2-3B-GGUF + description: | + Hermes 3 3B is a small but mighty new addition to the Hermes series of LLMs by Nous Research, and is Nous's first fine-tune in this parameter class. 
+ Hermes 3 is a generalist language model with many improvements over Hermes 2, including advanced agentic capabilities, much better roleplaying, reasoning, multi-turn conversation, long context coherence, and improvements across the board. + overrides: + parameters: + model: Hermes-3-Llama-3.2-3B-Q4_K_M.gguf + files: + - filename: Hermes-3-Llama-3.2-3B-Q4_K_M.gguf + sha256: 2e220a14ba4328fee38cf36c2c068261560f999fadb5725ce5c6d977cb5126b5 + uri: huggingface://bartowski/Hermes-3-Llama-3.2-3B-GGUF/Hermes-3-Llama-3.2-3B-Q4_K_M.gguf - !!merge <<: *llama31 name: "doctoraifinetune-3.1-8b-i1" urls: @@ -3761,6 +3899,35 @@ - filename: B-NIMITA-L3-8B-v0.02.Q4_K_M.gguf sha256: 625a54848dcd3f23bc06b639a7dfecae14142b5d177dd45acfe7724816bab4cd uri: huggingface://QuantFactory/B-NIMITA-L3-8B-v0.02-GGUF/B-NIMITA-L3-8B-v0.02.Q4_K_M.gguf +- !!merge <<: *llama31 + name: "deepthought-8b-llama-v0.01-alpha" + urls: + - https://huggingface.co/ruliad/deepthought-8b-llama-v0.01-alpha + - https://huggingface.co/bartowski/deepthought-8b-llama-v0.01-alpha-GGUF + description: | + Deepthought-8B is a small and capable reasoning model built on LLaMA-3.1 8B, designed to make AI reasoning more transparent and controllable. Despite its relatively small size, it achieves sophisticated reasoning capabilities that rival much larger models. + overrides: + parameters: + model: deepthought-8b-llama-v0.01-alpha-Q4_K_M.gguf + files: + - filename: deepthought-8b-llama-v0.01-alpha-Q4_K_M.gguf + sha256: 33195ba7b898ef8b2997d095e8be42adf1d0e1f6e8291cf07e026fc8e45903fd + uri: huggingface://bartowski/deepthought-8b-llama-v0.01-alpha-GGUF/deepthought-8b-llama-v0.01-alpha-Q4_K_M.gguf +- !!merge <<: *llama31 + name: "fusechat-llama-3.1-8b-instruct" + icon: https://huggingface.co/FuseAI/FuseChat-Llama-3.1-8B-Instruct/resolve/main/FuseChat-3.0.png + urls: + - https://huggingface.co/FuseAI/FuseChat-Llama-3.1-8B-Instruct + - https://huggingface.co/bartowski/FuseChat-Llama-3.1-8B-Instruct-GGUF + description: | + We present FuseChat-3.0, a series of models crafted to enhance performance by integrating the strengths of multiple source LLMs into more compact target LLMs. To achieve this fusion, we utilized four powerful source LLMs: Gemma-2-27B-It, Mistral-Large-Instruct-2407, Qwen-2.5-72B-Instruct, and Llama-3.1-70B-Instruct. For the target LLMs, we employed three widely-used smaller models—Llama-3.1-8B-Instruct, Gemma-2-9B-It, and Qwen-2.5-7B-Instruct—along with two even more compact models—Llama-3.2-3B-Instruct and Llama-3.2-1B-Instruct. The implicit model fusion process involves a two-stage training pipeline comprising Supervised Fine-Tuning (SFT) to mitigate distribution discrepancies between target and source LLMs, and Direct Preference Optimization (DPO) for learning preferences from multiple source LLMs. The resulting FuseChat-3.0 models demonstrated substantial improvements in tasks related to general conversation, instruction following, mathematics, and coding. Notably, when Llama-3.1-8B-Instruct served as the target LLM, our fusion approach achieved an average improvement of 6.8 points across 14 benchmarks. Moreover, it showed significant improvements of 37.1 and 30.1 points on instruction-following test sets AlpacaEval-2 and Arena-Hard respectively. We have released the FuseChat-3.0 models on Huggingface, stay tuned for the forthcoming dataset and code.
+ overrides: + parameters: + model: FuseChat-Llama-3.1-8B-Instruct-Q4_K_M.gguf + files: + - filename: FuseChat-Llama-3.1-8B-Instruct-Q4_K_M.gguf + sha256: fe58c8c9b695e36e6b0ee5e4d81ff71ea0a4f1a11fa7bb16e8d6f1b35a58dff6 + uri: huggingface://bartowski/FuseChat-Llama-3.1-8B-Instruct-GGUF/FuseChat-Llama-3.1-8B-Instruct-Q4_K_M.gguf - &deepseek ## Deepseek url: "github:mudler/LocalAI/gallery/deepseek.yaml@master" @@ -4146,6 +4313,20 @@ - filename: Marco-o1.Q4_K_M.gguf sha256: 54dd9554cb54609bf0bf4b367dfba192fc982a2fc6b87a0f56fba5ea82762d0d uri: huggingface://QuantFactory/Marco-o1-GGUF/Marco-o1.Q4_K_M.gguf +- !!merge <<: *qwen2 + name: "marco-o1-uncensored" + urls: + - https://huggingface.co/thirdeyeai/marco-o1-uncensored + - https://huggingface.co/QuantFactory/marco-o1-uncensored-GGUF + description: | + Uncensored version of marco-o1 + overrides: + parameters: + model: marco-o1-uncensored.Q4_K_M.gguf + files: + - filename: marco-o1-uncensored.Q4_K_M.gguf + sha256: ad0440270a7254098f90779744d3e5b34fe49b7baf97c819909ba9c5648cc0d9 + uri: huggingface://QuantFactory/marco-o1-uncensored-GGUF/marco-o1-uncensored.Q4_K_M.gguf - &mistral03 ## START Mistral url: "github:mudler/LocalAI/gallery/mistral-0.3.yaml@master" @@ -4595,6 +4776,48 @@ - filename: MN-Chunky-Lotus-12B.Q4_K_M.gguf sha256: 363defe0a769fdb715dab75517966a0a80bcdd981a610d4c759099b6c8ff143a uri: huggingface://QuantFactory/MN-Chunky-Lotus-12B-GGUF/MN-Chunky-Lotus-12B.Q4_K_M.gguf +- !!merge <<: *mistral03 + url: "github:mudler/LocalAI/gallery/chatml.yaml@master" + name: "chronos-gold-12b-1.0" + icon: https://cdn-uploads.huggingface.co/production/uploads/630417380907b9a115c6aa9f/3hc8zt8fzKdO3qHK1p1mW.webp + urls: + - https://huggingface.co/elinas/Chronos-Gold-12B-1.0 + - https://huggingface.co/mradermacher/Chronos-Gold-12B-1.0-GGUF + description: | + Chronos Gold 12B 1.0 is a very unique model that applies to domain areas such as general chatbot functionality, roleplay, and storywriting. The model has been observed to write up to 2250 tokens in a single sequence. The model was trained at a sequence length of 16384 (16k) and will still retain the apparent 128k context length from Mistral-Nemo, though it deteriorates over time like regular Nemo does based on the RULER Test + + As a result, it is recommended to keep your sequence length max at 16384, or you will experience performance degradation. + + The base model is mistralai/Mistral-Nemo-Base-2407 which was heavily modified to produce a more coherent model, comparable to much larger models. + + Chronos Gold 12B-1.0 re-creates the uniqueness of the original Chronos with significantly enhanced prompt adherence (following), coherence, a modern dataset, as well as supporting a majority of "character card" formats in applications like SillyTavern. + + It went through an iterative and objective merge process as my previous models and was further finetuned on a dataset curated for it. + + The specifics of the model will not be disclosed at the time due to dataset ownership.
+ overrides: + parameters: + model: Chronos-Gold-12B-1.0.Q4_K_M.gguf + files: + - filename: Chronos-Gold-12B-1.0.Q4_K_M.gguf + sha256: d75a6ed28781f0ea6fa6e58c0b25dfecdd160d4cab64aaf511ea156e99a1e1f3 + uri: huggingface://mradermacher/Chronos-Gold-12B-1.0-GGUF/Chronos-Gold-12B-1.0.Q4_K_M.gguf +- !!merge <<: *mistral03 + name: "naturallm-7b-instruct" + url: "github:mudler/LocalAI/gallery/chatml.yaml@master" + urls: + - https://huggingface.co/qingy2024/NaturalLM-7B-Instruct + - https://huggingface.co/bartowski/NaturalLM-7B-Instruct-GGUF + description: | + This Mistral 7B fine-tune is trained (for 150 steps) to talk like a human, not a "helpful assistant"! + It's also very beta right now. The dataset (qingy2024/Natural-Text-ShareGPT) can definitely be improved. + overrides: + parameters: + model: NaturalLM-7B-Instruct-Q4_K_M.gguf + files: + - filename: NaturalLM-7B-Instruct-Q4_K_M.gguf + sha256: 15b2f34116f690fea35790a9392b8a2190fe25827e370d426e88a2a543f4dcee + uri: huggingface://bartowski/NaturalLM-7B-Instruct-GGUF/NaturalLM-7B-Instruct-Q4_K_M.gguf - &mudler ### START mudler's LocalAI specific-models url: "github:mudler/LocalAI/gallery/mudler.yaml@master" @@ -5383,6 +5606,21 @@ - filename: BgGPT-Gemma-2-2.6B-IT-v1.0.Q4_K_M.gguf sha256: 1e92fe80ccad80e97076ee26b002c2280f075dfe2507d534b46a4391a077f319 uri: huggingface://QuantFactory/BgGPT-Gemma-2-2.6B-IT-v1.0-GGUF/BgGPT-Gemma-2-2.6B-IT-v1.0.Q4_K_M.gguf +- !!merge <<: *gemma + name: "fusechat-gemma-2-9b-instruct" + icon: "https://huggingface.co/FuseAI/FuseChat-Gemma-2-9B-Instruct/resolve/main/FuseChat-3.0.png" + urls: + - https://huggingface.co/FuseAI/FuseChat-Gemma-2-9B-Instruct + - https://huggingface.co/bartowski/FuseChat-Gemma-2-9B-Instruct-GGUF + description: | + We present FuseChat-3.0, a series of models crafted to enhance performance by integrating the strengths of multiple source LLMs into more compact target LLMs. To achieve this fusion, we utilized four powerful source LLMs: Gemma-2-27B-It, Mistral-Large-Instruct-2407, Qwen-2.5-72B-Instruct, and Llama-3.1-70B-Instruct. For the target LLMs, we employed three widely-used smaller models—Llama-3.1-8B-Instruct, Gemma-2-9B-It, and Qwen-2.5-7B-Instruct—along with two even more compact models—Llama-3.2-3B-Instruct and Llama-3.2-1B-Instruct. The implicit model fusion process involves a two-stage training pipeline comprising Supervised Fine-Tuning (SFT) to mitigate distribution discrepancies between target and source LLMs, and Direct Preference Optimization (DPO) for learning preferences from multiple source LLMs. The resulting FuseChat-3.0 models demonstrated substantial improvements in tasks related to general conversation, instruction following, mathematics, and coding. Notably, when Llama-3.1-8B-Instruct served as the target LLM, our fusion approach achieved an average improvement of 6.8 points across 14 benchmarks. Moreover, it showed significant improvements of 37.1 and 30.1 points on instruction-following test sets AlpacaEval-2 and Arena-Hard respectively. We have released the FuseChat-3.0 models on Huggingface, stay tuned for the forthcoming dataset and code. 
+ overrides: + parameters: + model: FuseChat-Gemma-2-9B-Instruct-Q4_K_M.gguf + files: + - filename: FuseChat-Gemma-2-9B-Instruct-Q4_K_M.gguf + sha256: f5aef201be68f344bebff3433af87aac6428fd227adfd7e468c8bfbcf9660ece + uri: huggingface://bartowski/FuseChat-Gemma-2-9B-Instruct-GGUF/FuseChat-Gemma-2-9B-Instruct-Q4_K_M.gguf - &llama3 url: "github:mudler/LocalAI/gallery/llama3-instruct.yaml@master" icon: https://cdn-uploads.huggingface.co/production/uploads/642cc1c253e76b4c2286c58e/aJJxKus1wP5N-euvHEUq7.png @@ -9513,6 +9751,10 @@ llama3.2 embeddings model. Using as drop-in replacement for bert-embeddings tags: - embeddings + overrides: + embeddings: true + parameters: + model: llama-3.2-1b-instruct-q4_k_m.gguf ## Stable Diffusion - url: github:mudler/LocalAI/gallery/stablediffusion.yaml@master license: "BSD-3"
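Note on the KV-cache change in backend/cpp/llama/grpc-server.cpp above: the gRPC fields cachetypekey and cachetypevalue used to be copied into params.cache_type_k / params.cache_type_v as raw strings; they are now mapped through kv_cache_type_from_str, which accepts only the types listed in the new kv_cache_types table and throws "Unsupported cache type: ..." for anything else, with get_all_kv_cache_types available to list the valid names in error messages. A minimal, self-contained sketch of that lookup pattern follows; it uses a stand-in enum and name table instead of the real ggml_type / ggml_type_name from ggml, so the names in it are illustrative only and not the backend's actual API:

// Sketch only: stand-ins for ggml_type / ggml_type_name so the example compiles on its own.
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

enum class cache_type { F16, Q8_0, Q4_0 };          // stand-in for ggml_type

static const char * cache_type_name(cache_type t) { // stand-in for ggml_type_name
    switch (t) {
        case cache_type::F16:  return "f16";
        case cache_type::Q8_0: return "q8_0";
        case cache_type::Q4_0: return "q4_0";
    }
    return "?";
}

// Whitelist of supported KV-cache types, mirroring kv_cache_types in the diff above.
static const std::vector<cache_type> kv_cache_types = {
    cache_type::F16,
    cache_type::Q8_0,
    cache_type::Q4_0,
};

// Map a user-supplied string (e.g. the cachetypekey gRPC field) to a concrete type,
// rejecting anything outside the whitelist -- the same pattern as kv_cache_type_from_str.
static cache_type kv_cache_type_from_str(const std::string & s) {
    for (const auto & type : kv_cache_types) {
        if (cache_type_name(type) == s) {
            return type;
        }
    }
    throw std::runtime_error("Unsupported cache type: " + s);
}

int main() {
    const cache_type k = kv_cache_type_from_str("q8_0");   // analogue of params.cache_type_k
    std::cout << "cache_type_k = " << cache_type_name(k) << "\n";
    try {
        kv_cache_type_from_str("q2_k");                     // not whitelisted -> throws
    } catch (const std::exception & e) {
        std::cout << e.what() << "\n";
    }
    return 0;
}

In params_parse this means a model config with an unrecognized cache_type_k or cache_type_v now fails at load time with a clear error instead of passing an arbitrary string down to llama.cpp, which matches the CPPLLAMA_VERSION bump at the top of the diff, where params.cache_type_k / cache_type_v are assigned ggml_type values rather than strings.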