@@ -2754,8 +2754,11 @@ struct llama_augmentation_model_loader {
         // 1. gpu_idx;
         // 2. gpu_bucket;
         // 3. transformed ffn_down;
-        const int64_t ggml_aux_tensor_size = 4 * (100 * 100 + 5120*40*4 * ggml_tensor_overhead() + (int64_t)13824*5120*40*4);
-        printf("%ld\n", ggml_aux_tensor_size);
+        // const int64_t ggml_aux_tensor_size = 4 * (100 * 100 + 5120*40*4 * ggml_tensor_overhead() + (int64_t)13824*5120*40*4);
+        int model_layer = model->layers.size();
+        int ffn_dim = model->layers[0].ffn_up->ne[1];
+        const int64_t ggml_aux_tensor_size = 4 * (100 * 100 + model_layer*ffn_dim*sizeof(float) * ggml_tensor_overhead());
+        printf("augmentation buffer: %ld\n", ggml_aux_tensor_size);
         struct ggml_init_params params = {
             /*.mem_size   =*/ ggml_aux_tensor_size,
             /*.mem_buffer =*/ nullptr,
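
For context on the hunk above: the scratch buffer for the augmentation tensors (gpu_idx, gpu_bucket, the transformed ffn_down) is now sized from the loaded model's layer count and FFN width instead of the hard-coded 13824 x 5120 x 40 shape. Below is a rough, self-contained sketch of the same arithmetic, with the old dimensions plugged back in as example values; it is not part of the PR, and ggml_tensor_overhead() is ggml's fixed per-tensor bookkeeping cost.

    #include <cinttypes>
    #include <cstdio>
    #include "ggml.h"   // ggml_tensor_overhead()

    int main() {
        const int64_t model_layer = 40;      // stands in for model->layers.size()
        const int64_t ffn_dim     = 13824;   // stands in for model->layers[0].ffn_up->ne[1]
        const int64_t aux_size    = 4 * (100 * 100
            + model_layer * ffn_dim * (int64_t) sizeof(float) * (int64_t) ggml_tensor_overhead());
        printf("augmentation buffer: %" PRId64 " bytes\n", aux_size);
        return 0;
    }
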
@@ -3966,12 +3969,14 @@ static struct ggml_tensor * llm_build_ffn_sparse(
     third = cur;
     struct ggml_tensor * tmp = ggml_mul_mat_idx(ctx, up, cur, idx, gpu_index);
     cb(tmp, "ffn_up_sparse", il);
+#ifdef GGML_USE_CUBLAS
     struct ggml_tensor * tmp2 = ggml_mul_mat_special(ctx, up_gpu, cur, idx, gpu_bucket, up);
     if (tmp2 != NULL) {
         ggml_cuda_assign_buffers_no_alloc(tmp2);
         cb(tmp2, "ffn_up_sparse_gpu", il);
     }
     tmp = ggml_add(ctx, tmp, tmp2);
+#endif


     if (up_b) {
@@ -3985,12 +3990,14 @@ static struct ggml_tensor * llm_build_ffn_sparse(
     third = cur;
     cur = ggml_mul_mat_idx(ctx, gate, cur, idx, gpu_index);
     cb(cur, "ffn_gate", il);
+#ifdef GGML_USE_CUBLAS
     tmp2 = ggml_mul_mat_special(ctx, gate_gpu, third, idx, gpu_bucket, gate);
     if (tmp2 != NULL) {
         ggml_cuda_assign_buffers_no_alloc(tmp2);
         cb(tmp2, "ffn_up_sparse_gpu", il);
     }
     cur = ggml_add(ctx, cur, tmp2);
+#endif

     if (gate_b) {
         cur = ggml_add(ctx, cur, gate_b);
@@ -4017,14 +4024,20 @@ static struct ggml_tensor * llm_build_ffn_sparse(
     }

     third = cur;
+#ifdef GGML_USE_CUBLAS
     cur = ggml_axpy(ctx, down_gpu, cur, idx, gpu_bucket);
     if (cur != NULL) {
         ggml_cuda_assign_buffers_no_alloc(cur);
         cb(cur, "ffn_down", il);
     }
+#endif
     tmp = ggml_axpy(ctx, down_t, third, idx, gpu_index);
     cb(tmp, "ffn_down_gpu", il);
+#ifdef GGML_USE_CUBLAS
     cur = ggml_add(ctx, cur, tmp);
+#else
+    cur = tmp;
+#endif

     if (down_b) {
         cur = ggml_add(ctx, cur, down_b);
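
A note on this last hunk, as I read it (the comments below are an interpretation, not documented semantics): in a CPU-only build there is no GPU partial result for the down projection, so the previously unconditional cur = ggml_add(ctx, cur, tmp); had nothing valid to add, and the new #else branch simply takes the CPU-side axpy result. A minimal annotated sketch of the combine step, reusing the names from the hunk (ctx, down_gpu, down_t, idx, gpu_bucket, gpu_index, cb and il all come from the surrounding llm_build_ffn_sparse code):

    // Sketch of how the down-projection output is assembled after this change.
    struct ggml_tensor * third = cur;            // keep the pre-projection activations
    #ifdef GGML_USE_CUBLAS
    // GPU side: sparse axpy over the offloaded rows (selected via gpu_bucket).
    cur = ggml_axpy(ctx, down_gpu, cur, idx, gpu_bucket);
    if (cur != NULL) {
        ggml_cuda_assign_buffers_no_alloc(cur);
        cb(cur, "ffn_down", il);
    }
    #endif
    // CPU side: sparse axpy over the rows kept in host memory (selected via gpu_index).
    struct ggml_tensor * tmp = ggml_axpy(ctx, down_t, third, idx, gpu_index);
    cb(tmp, "ffn_down_gpu", il);
    #ifdef GGML_USE_CUBLAS
    cur = ggml_add(ctx, cur, tmp);               // merge GPU and CPU partial sums
    #else
    cur = tmp;                                   // CPU-only build: the CPU result is the output
    #endif
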