
Commit e44f640

Merge pull request ggml-org#3 from hodlen/fix/gpu-dependency
support powerinfer without GPU
2 parents e4b798a + 182316e commit e44f640

File tree

1 file changed (+15, -2 lines)


llama.cpp

Lines changed: 15 additions & 2 deletions
@@ -2754,8 +2754,11 @@ struct llama_augmentation_model_loader {
         // 1. gpu_idx;
         // 2. gpu_bucket;
         // 3. transformed ffn_down;
-        const int64_t ggml_aux_tensor_size = 4 * (100 * 100 + 5120*40*4 * ggml_tensor_overhead() + (int64_t)13824*5120*40*4);
-        printf("%ld\n", ggml_aux_tensor_size);
+        // const int64_t ggml_aux_tensor_size = 4 * (100 * 100 + 5120*40*4 * ggml_tensor_overhead() + (int64_t)13824*5120*40*4);
+        int model_layer = model->layers.size();
+        int ffn_dim = model->layers[0].ffn_up->ne[1];
+        const int64_t ggml_aux_tensor_size = 4 * (100 * 100 + model_layer*ffn_dim*sizeof(float) * ggml_tensor_overhead() );
+        printf("augmentation buffer: %ld\n", ggml_aux_tensor_size);
         struct ggml_init_params params = {
             /*.mem_size   =*/ ggml_aux_tensor_size,
             /*.mem_buffer =*/ nullptr,
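This first hunk replaces an augmentation-buffer size that was hard-coded for one model shape (hidden size 5120, 40 layers, FFN size 13824, i.e. LLaMA-13B-class constants) with a size derived from the loaded model; the dropped third term alone reserved 4 × 13824 × 5120 × 40 × 4 ≈ 45 GB regardless of which model was actually loaded. Below is a minimal sketch of the new arithmetic; the concrete values for the model dimensions and for `ggml_tensor_overhead()` are assumed stand-ins, not values taken from the patch:

```c
#include <stdint.h>
#include <stdio.h>

int main(void) {
    // Stand-ins for model->layers.size() and model->layers[0].ffn_up->ne[1];
    // 40 and 13824 are the LLaMA-13B-class values the old formula baked in.
    const int    model_layer = 40;
    const int    ffn_dim     = 13824;
    const size_t overhead    = 384;   // assumed stand-in for ggml_tensor_overhead()

    // Same expression as the patched line: a small scratch term plus one
    // float-sized slot per (layer x FFN row), scaled by the per-tensor overhead.
    const int64_t ggml_aux_tensor_size =
        4 * (100 * 100 + (int64_t) model_layer * ffn_dim * sizeof(float) * overhead);

    printf("augmentation buffer: %lld bytes\n", (long long) ggml_aux_tensor_size);
    return 0;
}
```

With these stand-in numbers the buffer comes out around 3.4 GB and scales with the model's layer count and FFN width, instead of being fixed at the 13B-class size.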
@@ -3966,12 +3969,14 @@ static struct ggml_tensor * llm_build_ffn_sparse(
     third = cur;
     struct ggml_tensor * tmp = ggml_mul_mat_idx(ctx, up, cur, idx, gpu_index);
     cb(tmp, "ffn_up_sparse", il);
+#ifdef GGML_USE_CUBLAS
     struct ggml_tensor * tmp2 = ggml_mul_mat_special(ctx, up_gpu, cur, idx, gpu_bucket, up);
     if (tmp2 != NULL) {
         ggml_cuda_assign_buffers_no_alloc(tmp2);
         cb(tmp2, "ffn_up_sparse_gpu", il);
     }
     tmp = ggml_add(ctx, tmp, tmp2);
+#endif


     if (up_b) {
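Each of the matmul hunks wraps the GPU half of the hybrid computation in `#ifdef GGML_USE_CUBLAS`, the macro ggml defines for cuBLAS builds: the sparse CPU product (`ggml_mul_mat_idx`) is always built, while the GPU-side product (`ggml_mul_mat_special`) and the add that merges the two halves compile only when CUDA support is present. A toy sketch of the pattern, compilable with or without `-DGGML_USE_CUBLAS` (the arithmetic is invented; only the guard structure mirrors the patch):

```c
#include <stdio.h>

// Toy stand-ins for the split matmul: each returns its share of the result.
static float cpu_partial(float x) { return 2.0f * x; }   // CPU-resident rows
#ifdef GGML_USE_CUBLAS
static float gpu_partial(float x) { return 3.0f * x; }   // GPU-resident rows
#endif

int main(void) {
    float x   = 1.0f;
    float out = cpu_partial(x);       // always computed
#ifdef GGML_USE_CUBLAS
    out += gpu_partial(x);            // merged only in cuBLAS builds
#endif
    printf("result: %f\n", out);
    return 0;
}
```

The next hunk applies the identical guard to the gate projection; in both cases a CPU-only build never references `up_gpu`/`gate_gpu` or the `ggml_cuda_*` helpers, which is what removes the hard GPU dependency.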
@@ -3985,12 +3990,14 @@ static struct ggml_tensor * llm_build_ffn_sparse(
     third = cur;
     cur = ggml_mul_mat_idx(ctx, gate, cur, idx, gpu_index);
     cb(cur, "ffn_gate", il);
+#ifdef GGML_USE_CUBLAS
     tmp2 = ggml_mul_mat_special(ctx, gate_gpu, third, idx, gpu_bucket, gate);
     if (tmp2 != NULL) {
         ggml_cuda_assign_buffers_no_alloc(tmp2);
         cb(tmp2, "ffn_up_sparse_gpu", il);
     }
     cur = ggml_add(ctx, cur, tmp2);
+#endif

     if (gate_b) {
         cur = ggml_add(ctx, cur, gate_b);
@@ -4017,14 +4024,20 @@ static struct ggml_tensor * llm_build_ffn_sparse(
     }

     third = cur;
+#ifdef GGML_USE_CUBLAS
     cur = ggml_axpy(ctx, down_gpu, cur, idx, gpu_bucket);
     if (cur != NULL) {
         ggml_cuda_assign_buffers_no_alloc(cur);
         cb(cur, "ffn_down", il);
     }
+#endif
     tmp = ggml_axpy(ctx, down_t, third, idx, gpu_index);
     cb(tmp, "ffn_down_gpu", il);
+#ifdef GGML_USE_CUBLAS
     cur = ggml_add(ctx, cur, tmp);
+#else
+    cur = tmp;
+#endif

     if (down_b) {
         cur = ggml_add(ctx, cur, down_b);
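The last hunk handles the down projection, which is built from two `ggml_axpy` partial sums: one over GPU-resident neurons (`down_gpu`, `gpu_bucket`) and one over the remaining neurons (`down_t`, `gpu_index`). With `GGML_USE_CUBLAS` the two partial sums are added; without it there is no GPU half, so the CPU axpy is the whole result (`cur = tmp;`). A toy, compilable illustration of why the partition adds back to the full product; all names and data are invented for illustration, and in a CPU-only run the partition mask is assumed to leave every neuron on the CPU:

```c
#include <stdio.h>

// Toy axpy split: the down projection accumulates one scaled row per active
// neuron, and the GPU/CPU masks just partition the neurons, so the two
// partial sums add back to the full product.
enum { N = 4, D = 2 };

static void axpy_partial(const float W[N][D], const float act[N],
                         const int mask[N], int want, float out[D]) {
    for (int i = 0; i < N; i++) {
        if (mask[i] != want) continue;          // only this partition's neurons
        for (int j = 0; j < D; j++) out[j] += act[i] * W[i][j];
    }
}

int main(void) {
    const float W[N][D] = {{1, 2}, {3, 4}, {5, 6}, {7, 8}};
    const float act[N]  = {1, 0, 2, 1};         // sparse activations
#ifdef GGML_USE_CUBLAS
    const int on_gpu[N] = {0, 1, 1, 0};         // some neurons offloaded
#else
    const int on_gpu[N] = {0, 0, 0, 0};         // no GPU: all neurons CPU-resident
#endif

    float out[D] = {0, 0};
#ifdef GGML_USE_CUBLAS
    axpy_partial(W, act, on_gpu, 1, out);       // GPU share (cf. down_gpu/gpu_bucket)
#endif
    axpy_partial(W, act, on_gpu, 0, out);       // CPU share (cf. down_t/gpu_index)
    printf("down: %f %f\n", out[0], out[1]);    // same totals in either build
    return 0;
}
```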
