
Commit 31b1a7c

llama : refactor model loader with backend registry
1 parent 958367b commit 31b1a7c

18 files changed: +1772 -1973 lines

examples/llama-bench/llama-bench.cpp

Lines changed: 22 additions & 110 deletions
@@ -21,12 +21,6 @@
 #include "ggml.h"
 #include "llama.h"
 #include "common.h"
-#include "ggml-cuda.h"
-#include "ggml-sycl.h"
-
-#ifdef GGML_USE_CANN
-#include "ggml-cann.h"
-#endif
 
 #ifdef _WIN32
 #define WIN32_LEAN_AND_MEAN
@@ -82,95 +76,27 @@ static T stdev(const std::vector<T> & v) {
 }
 
 static std::string get_cpu_info() {
-    std::string id;
-#ifdef __linux__
-    FILE * f = fopen("/proc/cpuinfo", "r");
-    if (f) {
-        char buf[1024];
-        while (fgets(buf, sizeof(buf), f)) {
-            if (strncmp(buf, "model name", 10) == 0) {
-                char * p = strchr(buf, ':');
-                if (p) {
-                    p++;
-                    while (std::isspace(*p)) {
-                        p++;
-                    }
-                    while (std::isspace(p[strlen(p) - 1])) {
-                        p[strlen(p) - 1] = '\0';
-                    }
-                    id = p;
-                    break;
-                }
-            }
+    std::vector<std::string> gpu_list;
+    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
+        auto * dev = ggml_backend_dev_get(i);
+        auto dev_type = ggml_backend_dev_type(dev);
+        if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU || dev_type == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
+            gpu_list.push_back(ggml_backend_dev_description(dev));
         }
-        fclose(f);
-    }
-#elif defined(_WIN32)
-    HKEY hKey;
-    if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
-                    TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
-                    0,
-                    KEY_READ,
-                    &hKey) != ERROR_SUCCESS) {
-        // fail to open registry key
-        return "";
     }
-    char cpu_brand[256];
-    DWORD cpu_brand_size = sizeof(cpu_brand);
-    if (RegQueryValueExA(hKey,
-                        TEXT("ProcessorNameString"),
-                        NULL,
-                        NULL,
-                        (LPBYTE)cpu_brand,
-                        &cpu_brand_size) == ERROR_SUCCESS) {
-        id.assign(cpu_brand, cpu_brand_size);
-        if (id.find('\0') != std::string::npos) {
-            id.resize(id.find('\0'));
-        }
-    }
-    RegCloseKey(hKey);
-#endif
-    // TODO: other platforms
-    return id;
+    return join(gpu_list, ", ");
 }
 
 static std::string get_gpu_info() {
-    std::string id;
-#ifdef GGML_USE_CUDA
-    int count = ggml_backend_cuda_get_device_count();
-    for (int i = 0; i < count; i++) {
-        char buf[128];
-        ggml_backend_cuda_get_device_description(i, buf, sizeof(buf));
-        id += buf;
-        if (i < count - 1) {
-            id += "/";
+    std::vector<std::string> gpu_list;
+    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
+        auto * dev = ggml_backend_dev_get(i);
+        auto dev_type = ggml_backend_dev_type(dev);
+        if (dev_type == GGML_BACKEND_DEVICE_TYPE_GPU) {
+            gpu_list.push_back(ggml_backend_dev_description(dev));
         }
     }
-#endif
-#ifdef GGML_USE_SYCL
-    int count = ggml_backend_sycl_get_device_count();
-    for (int i = 0; i < count; i++) {
-        char buf[128];
-        ggml_backend_sycl_get_device_description(i, buf, sizeof(buf));
-        id += buf;
-        if (i < count - 1) {
-            id += "/";
-        }
-    }
-#endif
-#ifdef GGML_USE_CANN
-    uint32_t count = ggml_backend_cann_get_device_count();
-    for (uint32_t i = 0; i < count; i++) {
-        char buf[128];
-        ggml_backend_cann_get_device_description(i, buf, sizeof(buf));
-        id += buf;
-        if (i < count - 1) {
-            id += "/";
-        }
-    }
-#endif
-    // TODO: other backends
-    return id;
+    return join(gpu_list, ", ");
 }
 
 // command line params
@@ -938,29 +864,15 @@ struct test {
     }
 
     static std::string get_backend() {
-        if (cuda) {
-            return GGML_CUDA_NAME;
-        }
-        if (vulkan) {
-            return "Vulkan";
-        }
-        if (kompute) {
-            return "Kompute";
-        }
-        if (metal) {
-            return "Metal";
-        }
-        if (sycl) {
-            return GGML_SYCL_NAME;
-        }
-        if (gpu_blas) {
-            return "GPU BLAS";
-        }
-        if (blas) {
-            return "BLAS";
+        std::vector<std::string> backends;
+        for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
+            auto * reg = ggml_backend_reg_get(i);
+            std::string name = ggml_backend_reg_name(reg);
+            if (name != "CPU") {
+                backends.push_back(ggml_backend_reg_name(reg));
+            }
         }
-
-        return "CPU";
+        return backends.empty() ? "CPU" : join(backends, ",");
     }
 
     static const std::vector<std::string> & get_fields() {
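Illustrative usage (not part of the commit): the same registry calls used by the new get_cpu_info()/get_gpu_info()/get_backend() above can enumerate every registered device. A minimal sketch, assuming the program is linked against this revision of ggml:

// Sketch: list each registered device with its type and description.
#include <cstdio>
#include "ggml-backend.h"

int main() {
    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        const char * type = "?";
        switch (ggml_backend_dev_type(dev)) {
            case GGML_BACKEND_DEVICE_TYPE_CPU:   type = "CPU";   break;
            case GGML_BACKEND_DEVICE_TYPE_GPU:   type = "GPU";   break;
            case GGML_BACKEND_DEVICE_TYPE_ACCEL: type = "ACCEL"; break;
        }
        printf("%-5s %s\n", type, ggml_backend_dev_description(dev));
    }
    return 0;
}

Backends that register no devices contribute nothing to the loop, which is why the new get_backend() falls back to "CPU" when the registry contains nothing besides the CPU backend.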

ggml/include/ggml-backend.h

Lines changed: 7 additions & 6 deletions
@@ -114,11 +114,12 @@ extern "C" {
     //
 
     enum ggml_backend_dev_type {
+        // CPU device using system memory
         GGML_BACKEND_DEVICE_TYPE_CPU,
+        // GPU device using dedicated memory
         GGML_BACKEND_DEVICE_TYPE_GPU,
-        // devices with full capabilities (excludes backends such as BLAS that only support matrix multiplication)
-        GGML_BACKEND_DEVICE_TYPE_CPU_FULL,
-        GGML_BACKEND_DEVICE_TYPE_GPU_FULL
+        // accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
+        GGML_BACKEND_DEVICE_TYPE_ACCEL
     };
 
     // functionality supported by the device
@@ -169,8 +170,8 @@ extern "C" {
 
 
     // Functions that may be obtained using ggml_backend_reg_get_proc_address
-    typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(const float *);
-    typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t, int);
+    typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split);
+    typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
 
     //
     // Backend registry
@@ -192,7 +193,7 @@ extern "C" {
     GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params);
     // = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params)
     GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params);
-    // = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU_FULL) OR ggml_backend_dev_by_type(CPU_FULL), NULL)
+    // = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL)
     GGML_API ggml_backend_t ggml_backend_init_best(void);
 
     //
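As a usage sketch (not shown in the diff), the ggml_backend_set_n_threads_t typedef is intended to be resolved at runtime through ggml_backend_reg_get_proc_address. The helper below is hypothetical and assumes ggml_backend_get_device() and ggml_backend_dev_backend_reg() are available in this revision of ggml-backend.h:

// Sketch: look up the optional "ggml_backend_set_n_threads" entry point for a
// backend's registry and call it only if the backend exports it.
#include "ggml-backend.h"

static void try_set_n_threads(ggml_backend_t backend, int n_threads) {
    ggml_backend_dev_t dev = ggml_backend_get_device(backend);
    ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr;
    if (reg == nullptr) {
        return;
    }
    auto set_n_threads = (ggml_backend_set_n_threads_t)
        ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
    if (set_n_threads != nullptr) {
        set_n_threads(backend, n_threads);
    }
}

A backend obtained from ggml_backend_init_best() could then be tuned with try_set_n_threads(backend, n) before computing a graph; backends without that entry point are simply left alone.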

ggml/include/ggml-cuda.h

Lines changed: 1 addition & 1 deletion
@@ -28,7 +28,7 @@ GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);
 GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
 
 // split tensor buffer that splits matrices by rows across multiple devices
-GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
+GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split);
 
 // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
 GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
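A hedged example of the updated call (not from the commit; the split fractions are illustrative): the split buffer type now also takes the index of the main device.

// Sketch: request a row-split buffer type across two CUDA devices, with device 0
// acting as the main device; remaining split entries stay zero.
#include "ggml-cuda.h"

ggml_backend_buffer_type_t make_split_buft() {
    static const float tensor_split[GGML_CUDA_MAX_DEVICES] = {0.5f, 0.5f};
    return ggml_backend_cuda_split_buffer_type(/*main_device=*/0, tensor_split);
}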

ggml/src/ggml-amx.cpp

Lines changed: 0 additions & 17 deletions
@@ -16,12 +16,6 @@
 #if defined(__AMX_INT8__)
 
 // AMX buffer interface
-static const char * ggml_backend_amx_buffer_get_name(ggml_backend_buffer_t buffer) {
-    return "AMX";
-
-    GGML_UNUSED(buffer);
-}
-
 static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     free(buffer->context);
 }
@@ -72,7 +66,6 @@ static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t
 }
 
 static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
-    /* .get_name = */ ggml_backend_amx_buffer_get_name,
     /* .free_buffer = */ ggml_backend_amx_buffer_free_buffer,
     /* .get_base = */ ggml_backend_amx_buffer_get_base,
     /* .init_tensor = */ NULL, // no initialization required
@@ -149,12 +142,6 @@ static void ggml_backend_amx_free(ggml_backend_t backend) {
     delete backend;
 }
 
-static ggml_backend_buffer_type_t ggml_backend_amx_get_default_buffer_type(ggml_backend_t backend) {
-    return ggml_backend_amx_buffer_type();
-
-    GGML_UNUSED(backend);
-}
-
 static enum ggml_status ggml_backend_amx_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend->context;
 
@@ -187,7 +174,6 @@ static enum ggml_status ggml_backend_amx_graph_compute(ggml_backend_t backend, s
 static struct ggml_backend_i ggml_backend_amx_i = {
     /* .get_name = */ ggml_backend_amx_name,
     /* .free = */ ggml_backend_amx_free,
-    /* .get_default_buffer_type = */ ggml_backend_amx_get_default_buffer_type,
     /* .set_tensor_async = */ NULL,
     /* .get_tensor_async = */ NULL,
     /* .cpy_tensor_async = */ NULL,
@@ -197,9 +183,6 @@ static struct ggml_backend_i ggml_backend_amx_i = {
     /* .graph_plan_update = */ NULL,
     /* .graph_plan_compute = */ NULL,
     /* .graph_compute = */ ggml_backend_amx_graph_compute,
-    /* .supports_op = */ NULL,
-    /* .supports_buft = */ NULL,
-    /* .offload_op = */ NULL,
     /* .event_record = */ NULL,
     /* .event_wait = */ NULL,
 };

ggml/src/ggml-backend-impl.h

Lines changed: 4 additions & 15 deletions
@@ -22,7 +22,7 @@ extern "C" {
         size_t (*get_max_size) (ggml_backend_buffer_type_t buft);
         // (optional) data size needed to allocate the tensor, including padding (defaults to ggml_nbytes)
         size_t (*get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
-        // (optional) check if tensor data is in host memory (defaults to false)
+        // (optional) check if tensor data is in host memory and uses standard ggml tensor layout (defaults to false)
        bool (*is_host) (ggml_backend_buffer_type_t buft);
     };
 
@@ -37,7 +37,6 @@ extern "C" {
     //
 
     struct ggml_backend_buffer_i {
-        const char * (*get_name) (ggml_backend_buffer_t buffer);
         // (optional) free the buffer
         void (*free_buffer) (ggml_backend_buffer_t buffer);
         // base address of the buffer
@@ -88,19 +87,16 @@ extern "C" {
 
         void (*free)(ggml_backend_t backend);
 
-        // Will be moved to the device interface
-        // buffer allocation
-        ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);
-
         // (optional) asynchronous tensor data access
         void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
         void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
         bool (*cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
 
-        // (optional) complete all pending operations
+        // (optional) complete all pending operations (required if the backend supports async operations)
         void (*synchronize)(ggml_backend_t backend);
 
-        // (optional) compute graph with a plan (not used currently)
+        // (optional) graph plans
+        // compute graph with a plan (not used currently)
         ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
         void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
         // update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
@@ -111,13 +107,6 @@ extern "C" {
         // compute graph (always async if supported by the backend)
         enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
 
-        // IMPORTANT: these functions have been moved to the device interface and will be removed from the backend interface
-        // new backends should implement the device interface instead
-        // These functions are being moved to the device interface
-        bool (*supports_op) (ggml_backend_t backend, const struct ggml_tensor * op);
-        bool (*supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
-        bool (*offload_op) (ggml_backend_t backend, const struct ggml_tensor * op);
-
         // (optional) event synchronization
         // record an event on this stream
         void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event);
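With supports_op/supports_buft/offload_op removed from ggml_backend_i, callers are expected to query the device instead. A minimal sketch, assuming the public device-interface wrappers ggml_backend_dev_supports_op() and ggml_backend_dev_supports_buft() declared in ggml-backend.h are available in this revision:

// Sketch: capability checks now go through the device object rather than the
// backend instance.
#include "ggml-backend.h"

static bool device_can_handle(ggml_backend_dev_t dev,
                              const struct ggml_tensor * op,
                              ggml_backend_buffer_type_t buft) {
    return ggml_backend_dev_supports_op(dev, op) &&
           ggml_backend_dev_supports_buft(dev, buft);
}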
