sync : llama.cpp #981

Merged: 9 commits, merged on Oct 3, 2024
examples/gpt-2/main-backend.cpp (3 changes: 2 additions & 1 deletion)

@@ -178,6 +178,8 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
         return false;
     }
 
+    ggml_log_set(ggml_log_callback_default, nullptr);
+
     auto & ctx = model.ctx_w;
 
     // create the ggml context
@@ -210,7 +212,6 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
 #ifdef GGML_USE_METAL
     if (n_gpu_layers > 0) {
         fprintf(stderr, "%s: using Metal backend\n", __func__);
-        ggml_backend_metal_log_set_callback(ggml_log_callback_default, nullptr);
         model.backend = ggml_backend_metal_init();
         if (!model.backend) {
             fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
examples/gpt-2/main-batched.cpp (3 changes: 2 additions & 1 deletion)

@@ -268,6 +268,8 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
         printf("%s: backend buffer size = %6.2f MB\n", __func__, buffer_size/(1024.0*1024.0));
     }
 
+    ggml_log_set(ggml_log_callback_default, nullptr);
+
     // create the ggml context
     {
         size_t n_tensors = 2 + 6 + 12*model.hparams.n_layer;
@@ -298,7 +300,6 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
 #ifdef GGML_USE_METAL
     if (n_gpu_layers > 0) {
         fprintf(stderr, "%s: using Metal backend\n", __func__);
-        ggml_backend_metal_log_set_callback(ggml_log_callback_default, nullptr);
         model.backend = ggml_backend_metal_init();
         if (!model.backend) {
             fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
examples/gpt-2/main-sched.cpp (3 changes: 2 additions & 1 deletion)

@@ -108,6 +108,8 @@ struct gpt2_model {
 void init_backends(gpt2_model & model, const gpt_params & params) {
     ggml_backend_t gpu_backend = NULL;
 
+    ggml_log_set(ggml_log_callback_default, nullptr);
+
     // initialize the backends
 #ifdef GGML_USE_CUDA
     if (params.n_gpu_layers > 0) {
@@ -122,7 +124,6 @@ void init_backends(gpt2_model & model, const gpt_params & params) {
 #ifdef GGML_USE_METAL
     if (params.n_gpu_layers > 0) {
         fprintf(stderr, "%s: using Metal backend\n", __func__);
-        ggml_backend_metal_log_set_callback(ggml_log_callback_default, nullptr);
         gpu_backend = ggml_backend_metal_init();
         if (!gpu_backend) {
             fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
examples/mnist/mnist-common.h (11 changes: 6 additions & 5 deletions)

@@ -57,17 +57,18 @@ struct mnist_model {
     ggml_backend_buffer_t buf_compute = nullptr;
 
     mnist_model(const std::string & backend_name) {
-        const size_t backend_index = ggml_backend_reg_find_by_name(backend_name.c_str());
-        if (backend_index == SIZE_MAX) {
+        const ggml_backend_dev_t dev = ggml_backend_dev_by_name(backend_name.c_str());
+        if (dev == nullptr) {
             fprintf(stderr, "%s: ERROR: backend %s not found, available:\n", __func__, backend_name.c_str());
-            for (size_t i = 0; i < ggml_backend_reg_get_count(); ++i) {
-                fprintf(stderr, "  - %s\n", ggml_backend_reg_get_name(i));
+            for (size_t i = 0; i < ggml_backend_reg_count(); ++i) {
+                fprintf(stderr, "  - %s\n", ggml_backend_reg_name(ggml_backend_reg_get(i)));
             }
             exit(1);
         }
 
         fprintf(stderr, "%s: using %s backend\n", __func__, backend_name.c_str());
-        backend = ggml_backend_reg_init_backend(backend_index, nullptr);
+
+        ggml_backend_t backend = ggml_backend_dev_init(dev, NULL);
         if (ggml_backend_is_cpu(backend)) {
             const int ncores_logical = std::thread::hardware_concurrency();
             ggml_backend_cpu_set_n_threads(backend, std::min(ncores_logical, (ncores_logical + 4)/2));
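
The mnist change above is the clearest picture of the new lookup flow: the old registry index (ggml_backend_reg_find_by_name returning SIZE_MAX on failure) gives way to opaque device handles. Below is a condensed sketch of the pattern, assuming the device API from this sync and that "CPU" is a registered device name; the helper name backend_from_name is ours, for illustration only.

```cpp
#include "ggml-backend.h"

#include <cstdio>
#include <cstdlib>

// Resolve a device by name, then create a backend instance from it.
static ggml_backend_t backend_from_name(const char * name) {
    ggml_backend_dev_t dev = ggml_backend_dev_by_name(name);
    if (dev == nullptr) {
        fprintf(stderr, "device '%s' not found\n", name);
        exit(1);
    }
    // NULL requests the backend's default initialization parameters.
    return ggml_backend_dev_init(dev, NULL);
}

int main(void) {
    ggml_backend_t backend = backend_from_name("CPU"); // assumed device name
    // ... allocate buffers and run graphs here ...
    ggml_backend_free(backend);
    return 0;
}
```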
examples/simple/simple-backend.cpp (3 changes: 2 additions & 1 deletion)

@@ -43,6 +43,8 @@ struct simple_model {
 
 // initialize the tensors of the model in this case two matrices 2x2
 void load_model(simple_model & model, float * a, float * b, int rows_A, int cols_A, int rows_B, int cols_B) {
+    ggml_log_set(ggml_log_callback_default, nullptr);
+
     // initialize the backend
 #ifdef GGML_USE_CUDA
     fprintf(stderr, "%s: using CUDA backend\n", __func__);
@@ -54,7 +56,6 @@ void load_model(simple_model & model, float * a, float * b, int rows_A, int cols_A, int rows_B, int cols_B) {
 
 #ifdef GGML_USE_METAL
     fprintf(stderr, "%s: using Metal backend\n", __func__);
-    ggml_backend_metal_log_set_callback(ggml_log_callback_default, nullptr);
     model.backend = ggml_backend_metal_init();
     if (!model.backend) {
         fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
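
The recurring change in the gpt-2 and simple examples above replaces per-backend log hooks such as ggml_backend_metal_log_set_callback with one global ggml_log_set. Here is a minimal sketch of routing ggml log output through a custom handler; my_log_handler and its level filter are illustrative assumptions, only ggml_log_set and the ggml_log_callback signature come from this sync.

```cpp
#include "ggml.h"

#include <cstdio>

// Hypothetical handler: forward only warnings and errors to stderr.
// ggml_log_callback takes (level, text, user_data).
static void my_log_handler(enum ggml_log_level level, const char * text, void * user_data) {
    (void) user_data;
    if (level == GGML_LOG_LEVEL_WARN || level == GGML_LOG_LEVEL_ERROR) {
        fputs(text, stderr);
    }
}

int main(void) {
    // One call now configures logging for all backends (CPU, CUDA, Metal, ...).
    ggml_log_set(my_log_handler, /*user_data=*/nullptr);
    // ... build contexts, load models, run graphs ...
    return 0;
}
```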
include/ggml-backend.h (202 changes: 142 additions & 60 deletions)

(Large diff not rendered by default.)
include/ggml-blas.h (6 changes: 3 additions & 3 deletions)

@@ -9,13 +9,13 @@ extern "C" {
 #endif
 
 // backend API
-GGML_API GGML_CALL ggml_backend_t ggml_backend_blas_init(void);
+GGML_API ggml_backend_t ggml_backend_blas_init(void);
 
-GGML_API GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend);
+GGML_API bool ggml_backend_is_blas(ggml_backend_t backend);
 
 // number of threads used for conversion to float
 // for openblas and blis, this will also set the number of threads used for blas operations
-GGML_API GGML_CALL void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
+GGML_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
 
 
 #ifdef __cplusplus
include/ggml-cann.h (29 changes: 9 additions & 20 deletions)

@@ -44,7 +44,7 @@ extern "C" {
  * @param device The index of the device to initialize.
  * @return A pointer to the initialized backend instance, or nullptr on failure.
  */
-GGML_API GGML_CALL ggml_backend_t ggml_backend_cann_init(int32_t device);
+GGML_API ggml_backend_t ggml_backend_cann_init(int32_t device);
 
 /**
  * @brief Checks if a given backend is a CANN backend.
@@ -55,7 +55,7 @@ GGML_API GGML_CALL ggml_backend_t ggml_backend_cann_init(int32_t device);
  * @param backend The backend instance to check.
  * @return True if the backend is a CANN backend, false otherwise.
  */
-GGML_API GGML_CALL bool ggml_backend_is_cann(ggml_backend_t backend);
+GGML_API bool ggml_backend_is_cann(ggml_backend_t backend);
 
 /**
  * @brief Retrieves the CANN buffer type for a specified device.
@@ -67,7 +67,7 @@ GGML_API GGML_CALL bool ggml_backend_is_cann(ggml_backend_t backend);
  * @return A pointer to the buffer type interface for the specified device, or
  *         nullptr if the device index is out of range.
  */
-GGML_API GGML_CALL ggml_backend_buffer_type_t
+GGML_API ggml_backend_buffer_type_t
 ggml_backend_cann_buffer_type(int32_t device);
 
 /**
@@ -78,14 +78,14 @@ ggml_backend_cann_buffer_type(int32_t device);
  *
  * @return The number of CANN devices available.
  */
-GGML_API GGML_CALL int32_t ggml_backend_cann_get_device_count(void);
+GGML_API int32_t ggml_backend_cann_get_device_count(void);
 
 /**
  * @brief pinned host buffer for use with the CPU backend for faster copies between CPU and NPU.
  *
  * @return A pointer to the host buffer type interface.
  */
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
+GGML_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
 
 /**
  * @brief Retrieves the description of a specific CANN device.
@@ -97,7 +97,7 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
  * @param description Pointer to a buffer where the description will be written.
  * @param description_size Size of the description buffer.
  */
-GGML_API GGML_CALL void ggml_backend_cann_get_device_description(
+GGML_API void ggml_backend_cann_get_device_description(
     int32_t device, char* description, size_t description_size);
 
 /**
@@ -112,20 +112,9 @@ GGML_API GGML_CALL void ggml_backend_cann_get_device_description(
  * @param total Pointer to a variable where the total memory size will be
  *              stored.
  */
-GGML_API GGML_CALL void ggml_backend_cann_get_device_memory(int32_t device,
-                                                            size_t* free,
-                                                            size_t* total);
-
-/**
- * @brief Set the logging callback for GGML.
- *
- * This function sets the logging callback and user data for logging.
- *
- * @param log_callback The logging callback to set.
- * @param user_data User data to pass to the logging callback.
- */
-GGML_API void ggml_backend_cann_log_set_callback(ggml_log_callback log_callback,
-                                                 void* user_data);
+GGML_API void ggml_backend_cann_get_device_memory(int32_t device,
+                                                  size_t* free,
+                                                  size_t* total);
 
 #ifdef __cplusplus
 }
include/ggml-cuda.h (32 changes: 16 additions & 16 deletions)

@@ -3,6 +3,10 @@
 #include "ggml.h"
 #include "ggml-backend.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #ifdef GGML_USE_HIPBLAS
 #define GGML_CUDA_NAME "ROCm"
 #define GGML_CUBLAS_NAME "hipBLAS"
@@ -13,35 +17,31 @@
 #define GGML_CUDA_NAME "CUDA"
 #define GGML_CUBLAS_NAME "cuBLAS"
 #endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
 
 #define GGML_CUDA_MAX_DEVICES 16
 
 // backend API
-GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device);
+GGML_API ggml_backend_t ggml_backend_cuda_init(int device);
 
-GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend);
+GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);
 
 // device buffer
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
+GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
 
 // split tensor buffer that splits matrices by rows across multiple devices
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
+GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
 
 // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
+GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
 
-GGML_API GGML_CALL int  ggml_backend_cuda_get_device_count(void);
-GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
-GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
+GGML_API int  ggml_backend_cuda_get_device_count(void);
+GGML_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
+GGML_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
 
-GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
-GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);
+GGML_API bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
+GGML_API void ggml_backend_cuda_unregister_host_buffer(void * buffer);
 
-GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data);
+GGML_API ggml_backend_reg_t ggml_backend_cuda_reg(void);
 
 #ifdef __cplusplus
 }
 #endif
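
ggml_backend_cuda_reg is the new piece here: it exposes the CUDA backend through the generic registry that include/ggml-backend.h now defines. As a rough sketch of walking that registry, using only the enumeration calls that already appear in the mnist change above:

```cpp
#include "ggml-backend.h"

#include <cstdio>

int main(void) {
    // Walk the global registry; an entry exists for each compiled-in backend
    // (e.g. the one returned by ggml_backend_cuda_reg on CUDA builds).
    for (size_t i = 0; i < ggml_backend_reg_count(); ++i) {
        ggml_backend_reg_t reg = ggml_backend_reg_get(i);
        printf("registered backend %zu: %s\n", i, ggml_backend_reg_name(reg));
    }
    return 0;
}
```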
include/ggml-metal.h (8 changes: 4 additions & 4 deletions)

@@ -1,3 +1,5 @@
+// Note: this description is outdated
+//
 // An interface allowing to compute ggml_cgraph with Metal
 //
 // This is a fully functional interface that extends ggml with GPU support for Apple devices.
@@ -37,17 +39,15 @@ extern "C" {
 // user-code should use only these functions
 //
 
-GGML_API void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
-
 GGML_API ggml_backend_t ggml_backend_metal_init(void);
 
 GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
 
-GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
+GGML_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
 
 GGML_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);
 
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
+GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
 
 // helper to check if the device supports a specific family
 // ideally, the user code should be doing these checks
include/ggml-rpc.h (10 changes: 5 additions & 5 deletions)

@@ -10,14 +10,14 @@ extern "C" {
 #define GGML_RPC_MAX_SERVERS 16
 
 // backend API
-GGML_API GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
-GGML_API GGML_CALL bool ggml_backend_is_rpc(ggml_backend_t backend);
+GGML_API ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
+GGML_API bool ggml_backend_is_rpc(ggml_backend_t backend);
 
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);
+GGML_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);
 
-GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
+GGML_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
 
-GGML_API GGML_CALL void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
+GGML_API void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
 
 #ifdef __cplusplus
 }
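
Beyond the GGML_CALL cleanup, the RPC declarations are unchanged. As a hedged sketch of how they compose, the following serves a local CPU backend to remote ggml_backend_rpc_init clients; the endpoint string and the advertised memory figures are made-up values for illustration:

```cpp
#include "ggml-backend.h"
#include "ggml-rpc.h"

int main(void) {
    ggml_backend_t backend = ggml_backend_cpu_init();

    // Advertise 8 GiB to clients; a real server would query actual memory.
    const size_t free_mem  = 8ull << 30;
    const size_t total_mem = 8ull << 30;

    // Runs the server loop, computing graphs for remote clients that
    // connect via ggml_backend_rpc_init("host:50052").
    start_rpc_server(backend, "0.0.0.0:50052", free_mem, total_mem);

    ggml_backend_free(backend);
    return 0;
}
```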
include/ggml-sycl.h (16 changes: 8 additions & 8 deletions)

@@ -23,20 +23,20 @@ GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
 GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
 
 // split tensor buffer that splits matrices by rows across multiple devices
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
+GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
 
 // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
 GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
 
-GGML_API void ggml_backend_sycl_print_sycl_devices(void);
-GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len);
-GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description, size_t description_size);
-GGML_API GGML_CALL int  ggml_backend_sycl_get_device_count();
-GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
+GGML_API void ggml_backend_sycl_print_sycl_devices(void);
+GGML_API void ggml_sycl_get_gpu_list(int *id_list, int max_len);
+GGML_API void ggml_sycl_get_device_description(int device, char *description, size_t description_size);
+GGML_API int  ggml_backend_sycl_get_device_count();
+GGML_API void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
 
 // SYCL doesn't support registering host memory, keep here for reference
-// GGML_API GGML_CALL bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
-// GGML_API GGML_CALL void ggml_backend_sycl_unregister_host_buffer(void * buffer);
+// GGML_API bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
+// GGML_API void ggml_backend_sycl_unregister_host_buffer(void * buffer);
 #ifdef __cplusplus
 }
 #endif
include/ggml-vulkan.h (14 changes: 7 additions & 7 deletions)

@@ -13,16 +13,16 @@ extern "C" {
 GGML_API void ggml_vk_instance_init(void);
 
 // backend API
-GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num);
+GGML_API ggml_backend_t ggml_backend_vk_init(size_t dev_num);
 
-GGML_API GGML_CALL bool ggml_backend_is_vk(ggml_backend_t backend);
-GGML_API GGML_CALL int  ggml_backend_vk_get_device_count(void);
-GGML_API GGML_CALL void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size);
-GGML_API GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total);
+GGML_API bool ggml_backend_is_vk(ggml_backend_t backend);
+GGML_API int  ggml_backend_vk_get_device_count(void);
+GGML_API void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size);
+GGML_API void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total);
 
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
+GGML_API ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
 // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
+GGML_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
 
 #ifdef __cplusplus
 }
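
The Vulkan header gets the same GGML_CALL cleanup with no other signature changes. A small sketch of the device-query functions declared above, assuming a build with the Vulkan backend enabled:

```cpp
#include "ggml-vulkan.h"

#include <cstdio>

int main(void) {
    const int n_dev = ggml_backend_vk_get_device_count();
    for (int i = 0; i < n_dev; ++i) {
        char desc[256];
        ggml_backend_vk_get_device_description(i, desc, sizeof(desc));

        size_t free = 0, total = 0;
        ggml_backend_vk_get_device_memory(i, &free, &total);

        // Shift by 20 converts bytes to MiB.
        printf("vk device %d: %s, %zu of %zu MiB free\n",
               i, desc, free >> 20, total >> 20);
    }
    return 0;
}
```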