Skip to content

Commit

Permalink
chore(deps): update llama.cpp (#3497)
Browse files Browse the repository at this point in the history
* Apply llava patch

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
  • Loading branch information
mudler authored Sep 12, 2024
1 parent e35d816 commit d51444d
Show file tree
Hide file tree
Showing 5 changed files with 70 additions and 86 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ DETECT_LIBS?=true
# llama.cpp versions
GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
CPPLLAMA_VERSION?=815b1fb20a53e439882171757825bacb1350de04
CPPLLAMA_VERSION?=e6b7801bd189d102d901d3e72035611a25456ef1

# go-rwkv version
RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp
Expand Down
107 changes: 49 additions & 58 deletions backend/cpp/llama/grpc-server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,10 @@
#include "common.h"
#include "json.hpp"
#include "llama.h"
#include "grammar-parser.h"
#include "backend.pb.h"
#include "backend.grpc.pb.h"
#include "utils.hpp"

#include "sampling.h"
// include std::regex
#include <cstddef>
#include <thread>
Expand Down Expand Up @@ -203,8 +202,8 @@ struct llama_client_slot
std::string stopping_word;

// sampling
struct llama_sampling_params sparams;
llama_sampling_context *ctx_sampling = nullptr;
struct gpt_sampler_params sparams;
gpt_sampler *ctx_sampling = nullptr;

int32_t ga_i = 0; // group-attention state
int32_t ga_n = 1; // group-attention factor
Expand Down Expand Up @@ -619,7 +618,7 @@ struct llama_server_context

bool launch_slot_with_data(llama_client_slot* &slot, json data) {
slot_params default_params;
llama_sampling_params default_sparams;
gpt_sampler_params default_sparams;

slot->params.stream = json_value(data, "stream", false);
slot->params.cache_prompt = json_value(data, "cache_prompt", false);
Expand All @@ -628,7 +627,7 @@ struct llama_server_context
slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
slot->sparams.min_p = json_value(data, "min_p", default_sparams.min_p);
slot->sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z);
slot->sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p);
slot->sparams.typ_p = json_value(data, "typical_p", default_sparams.typ_p);
slot->sparams.temp = json_value(data, "temperature", default_sparams.temp);
slot->sparams.dynatemp_range = json_value(data, "dynatemp_range", default_sparams.dynatemp_range);
slot->sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent);
Expand All @@ -641,7 +640,7 @@ struct llama_server_context
slot->sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta);
slot->sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl);
slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep);
slot->params.seed = json_value(data, "seed", default_params.seed);
slot->sparams.seed = json_value(data, "seed", default_sparams.seed);
slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
slot->sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
Expand All @@ -665,6 +664,7 @@ struct llama_server_context
slot->params.input_prefix = "";
}


if (data.count("input_suffix") != 0)
{
slot->params.input_suffix = data["input_suffix"];
Expand All @@ -683,6 +683,10 @@ struct llama_server_context
slot->prompt = "";
}

if (json_value(data, "ignore_eos", false)) {
slot->sparams.logit_bias.push_back({llama_token_eos(model), -INFINITY});
}
/*
slot->sparams.penalty_prompt_tokens.clear();
slot->sparams.use_penalty_prompt_tokens = false;
const auto &penalty_prompt = data.find("penalty_prompt");
Expand Down Expand Up @@ -718,14 +722,10 @@ struct llama_server_context
slot->sparams.use_penalty_prompt_tokens = true;
}
}
*/

slot->sparams.logit_bias.clear();

if (json_value(data, "ignore_eos", false))
{
slot->sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
}

const auto &logit_bias = data.find("logit_bias");
if (logit_bias != data.end() && logit_bias->is_array())
{
Expand Down Expand Up @@ -753,21 +753,21 @@ struct llama_server_context
llama_token tok = el[0].get<llama_token>();
if (tok >= 0 && tok < n_vocab)
{
slot->sparams.logit_bias[tok] = bias;
slot->sparams.logit_bias.push_back({tok, bias});
}
}
else if (el[0].is_string())
{
auto toks = llama_tokenize(model, el[0].get<std::string>(), false);
for (auto tok : toks)
{
slot->sparams.logit_bias[tok] = bias;
slot->sparams.logit_bias.push_back({tok, bias});
}
}
}
}
}

slot->params.antiprompt.clear();

const auto &stop = data.find("stop");
Expand All @@ -781,24 +781,22 @@ struct llama_server_context
}
}
}

const auto &samplers_sequence = data.find("samplers");
if (samplers_sequence != data.end() && samplers_sequence->is_array())
{

const auto & samplers = data.find("samplers");
if (samplers != data.end() && samplers->is_array()) {
std::vector<std::string> sampler_names;
for (const auto &sampler_name : *samplers_sequence)
{
if (sampler_name.is_string())
{
sampler_names.emplace_back(sampler_name);
for (const auto & name : *samplers) {
if (name.is_string()) {
sampler_names.emplace_back(name);
}
}
}
slot->sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, false);
slot->sparams.samplers = gpt_sampler_types_from_names(sampler_names, false);
}
else
{
slot->sparams.samplers_sequence = default_sparams.samplers_sequence;
slot->sparams.samplers = default_sparams.samplers;
}


if (multimodal)
{
Expand Down Expand Up @@ -875,10 +873,10 @@ struct llama_server_context

if (slot->ctx_sampling != nullptr)
{
llama_sampling_free(slot->ctx_sampling);
gpt_sampler_free(slot->ctx_sampling);
}
slot->ctx_sampling = llama_sampling_init(slot->sparams);
llama_set_rng_seed(ctx, slot->params.seed);
slot->ctx_sampling = gpt_sampler_init(model, slot->sparams);
//llama_set_rng_seed(ctx, slot->params.seed);
slot->command = LOAD_PROMPT;

all_slots_are_idle = false;
Expand All @@ -888,7 +886,7 @@ struct llama_server_context
{"task_id", slot->task_id},
});

LOG_TEE("sampling: \n%s\n", llama_sampling_print(slot->sparams).c_str());
// LOG_TEE("sampling: \n%s\n", llama_sampling_print(slot->sparams).c_str());

return true;
}
Expand Down Expand Up @@ -1006,11 +1004,13 @@ struct llama_server_context
slot.generated_text += token_str;
slot.has_next_token = true;

/*
if (slot.ctx_sampling->params.use_penalty_prompt_tokens && result.tok != -1)
{
// we can change penalty_prompt_tokens because it is always created from scratch each request
slot.ctx_sampling->params.penalty_prompt_tokens.push_back(result.tok);
}
*/

// check if there is incomplete UTF-8 character at the end
bool incomplete = false;
Expand Down Expand Up @@ -1144,13 +1144,11 @@ struct llama_server_context

json get_formated_generation(llama_client_slot &slot)
{
const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model));
const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() &&
eos_bias->second < 0.0f && std::isinf(eos_bias->second);
std::vector<std::string> samplers_sequence;
for (const auto &sampler_type : slot.sparams.samplers_sequence)
std::vector<std::string> samplers;
samplers.reserve(slot.sparams.samplers.size());
for (const auto & sampler : slot.sparams.samplers)
{
samplers_sequence.emplace_back(llama_sampling_type_to_str(sampler_type));
samplers.emplace_back(gpt_sampler_type_to_str(sampler));
}

return json {
Expand All @@ -1165,27 +1163,25 @@ struct llama_server_context
{"top_p", slot.sparams.top_p},
{"min_p", slot.sparams.min_p},
{"tfs_z", slot.sparams.tfs_z},
{"typical_p", slot.sparams.typical_p},
{"typical_p", slot.sparams.typ_p},
{"repeat_last_n", slot.sparams.penalty_last_n},
{"repeat_penalty", slot.sparams.penalty_repeat},
{"presence_penalty", slot.sparams.penalty_present},
{"frequency_penalty", slot.sparams.penalty_freq},
{"penalty_prompt_tokens", slot.sparams.penalty_prompt_tokens},
{"use_penalty_prompt_tokens", slot.sparams.use_penalty_prompt_tokens},
{"mirostat", slot.sparams.mirostat},
{"mirostat_tau", slot.sparams.mirostat_tau},
{"mirostat_eta", slot.sparams.mirostat_eta},
{"penalize_nl", slot.sparams.penalize_nl},
{"stop", slot.params.antiprompt},
{"n_predict", slot.params.n_predict},
{"n_keep", params.n_keep},
{"ignore_eos", ignore_eos},
{"ignore_eos", slot.sparams.ignore_eos},
{"stream", slot.params.stream},
{"logit_bias", slot.sparams.logit_bias},
// {"logit_bias", slot.sparams.logit_bias},
{"n_probs", slot.sparams.n_probs},
{"min_keep", slot.sparams.min_keep},
{"grammar", slot.sparams.grammar},
{"samplers", samplers_sequence}
{"samplers", samplers}
};
}

Expand Down Expand Up @@ -1714,7 +1710,7 @@ struct llama_server_context

if (!slot.params.cache_prompt)
{
llama_sampling_reset(slot.ctx_sampling);
gpt_sampler_reset(slot.ctx_sampling);

slot.n_past = 0;
slot.n_past_se = 0;
Expand All @@ -1726,7 +1722,7 @@ struct llama_server_context
// push the prompt into the sampling context (do not apply grammar)
for (auto &token : prompt_tokens)
{
llama_sampling_accept(slot.ctx_sampling, ctx, token, false);
gpt_sampler_accept(slot.ctx_sampling, token, false);
}

slot.n_past = common_part(slot.cache_tokens, prompt_tokens);
Expand Down Expand Up @@ -1934,9 +1930,9 @@ struct llama_server_context
}

completion_token_output result;
const llama_token id = llama_sampling_sample(slot.ctx_sampling, ctx, NULL, slot.i_batch - i);
const llama_token id = gpt_sampler_sample(slot.ctx_sampling, ctx, slot.i_batch - i);

llama_sampling_accept(slot.ctx_sampling, ctx, id, true);
gpt_sampler_accept(slot.ctx_sampling, id, true);

slot.n_decoded += 1;
if (slot.n_decoded == 1)
Expand All @@ -1946,19 +1942,14 @@ struct llama_server_context
metrics.on_prompt_eval(slot);
}

llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false };
result.tok = id;
const auto * cur_p = gpt_sampler_get_candidates(slot.ctx_sampling);

const int32_t n_probs = slot.sparams.n_probs;
if (slot.sparams.temp <= 0 && n_probs > 0)
{
// for llama_sample_token_greedy we need to sort candidates
llama_sample_softmax(ctx, &cur_p);
}

for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i)
{
result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p});
for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) {
result.probs.push_back({
cur_p->data[i].id,
i >= cur_p->size ? 0.0f : cur_p->data[i].p,
});
}

if (!process_token(result, slot))
Expand Down
13 changes: 13 additions & 0 deletions backend/cpp/llama/patches/01-llava.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 342042ff..224db9b5 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -2419,7 +2419,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
int* patches_data = (int*)malloc(ggml_nbytes(patches));
for (int i = 0; i < num_patches; i++) {
- patches_data[i] = i + 1;
+ patches_data[i] = i;
}
ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
free(patches_data);
7 changes: 7 additions & 0 deletions backend/cpp/llama/prepare.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
#!/bin/bash

## Patches
## Apply patches from the `patches` directory
# Apply every file found in the `patches` directory to the llama.cpp checkout.
# A glob is used instead of parsing `ls` output so that file names containing
# whitespace or glob characters are passed through intact, and every expansion
# is quoted for the same reason.
for patch_path in patches/*; do
    # With no matching files the glob stays literal; skip it so an empty
    # `patches` directory is a no-op rather than a failed redirect.
    [ -e "$patch_path" ] || continue
    # Keep the bare file name in `patch` so the log message is unchanged.
    patch=${patch_path##*/}
    echo "Applying patch $patch"
    patch -d llama.cpp/ -p1 < "$patch_path"
done

cp -r CMakeLists.txt llama.cpp/examples/grpc-server/
cp -r grpc-server.cpp llama.cpp/examples/grpc-server/
cp -rfv json.hpp llama.cpp/examples/grpc-server/
Expand Down
27 changes: 0 additions & 27 deletions backend/cpp/llama/utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -480,31 +480,4 @@ static inline std::vector<uint8_t> base64_decode(const std::string & encoded_str
}

return ret;
}

//
// random string / id
//

/// Builds a 32-character random string drawn from the characters
/// 0-9, A-Z and a-z.
///
/// A fresh std::mt19937 is seeded from std::random_device on every call.
///
/// @return a std::string of exactly 32 alphanumeric characters.
static std::string random_string()
{
    static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");

    std::random_device rd;
    std::mt19937 generator(rd());

    // Draw indices through a distribution rather than `generator() % size`:
    // the 32-bit generator range is not a multiple of the alphabet size, so
    // the modulo form slightly favours the first few characters.
    std::uniform_int_distribution<std::string::size_type> pick(0, str.size() - 1);

    std::string result(32, ' ');

    for (char & ch : result) {
        ch = str[pick(generator)];
    }

    return result;
}

// Produces a chat-completion id: the literal prefix "chatcmpl-" followed by
// whatever random_string() returns.
static std::string gen_chatcmplid()
{
    std::string id("chatcmpl-");
    id += random_string();
    return id;
}

0 comments on commit d51444d

Please sign in to comment.