@@ -6624,8 +6624,10 @@ inline void ggml_cuda_op_clamp(
6624
6624
GGML_ASSERT(src0->type == GGML_TYPE_F32);
6625
6625
GGML_ASSERT( dst->type == GGML_TYPE_F32);
6626
6626
6627
- const float min = ((float *) dst->op_params)[0];
6628
- const float max = ((float *) dst->op_params)[1];
6627
+ float min;
6628
+ float max;
6629
+ memcpy(&min, dst->op_params, sizeof(float));
6630
+ memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
6629
6631
6630
6632
clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream);
6631
6633
CUDA_CHECK(cudaGetLastError());
@@ -7149,6 +7151,30 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
7149
7151
ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
7150
7152
}
7151
7153
7154
+ __global__ void k_compute_batched_ptrs(
7155
+ const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
7156
+ void ** ptrs,
7157
+ int ne12, int ne13,
7158
+ int ne23,
7159
+ int nb02, int nb03,
7160
+ int nb12, int nb13,
7161
+ int nb2, int nb3,
7162
+ int r2, int r3) {
7163
+ int i13 = blockIdx.x * blockDim.x + threadIdx.x;
7164
+ int i12 = blockIdx.y * blockDim.y + threadIdx.y;
7165
+
7166
+ if (i13 >= ne13 || i12 >= ne12) {
7167
+ return;
7168
+ }
7169
+
7170
+ int i03 = i13 / r3;
7171
+ int i02 = i12 / r2;
7172
+
7173
+ ptrs[0*ne23 + i12 + i13*ne12] = (char *) src0_as_f16 + i02*nb02 + i03*nb03;
7174
+ ptrs[1*ne23 + i12 + i13*ne12] = (char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
7175
+ ptrs[2*ne23 + i12 + i13*ne12] = (char *) dst_f16 + i12* nb2/2 + i13* nb3/2;
7176
+ }
7177
+
7152
7178
static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
7153
7179
GGML_ASSERT(!ggml_is_transposed(src0));
7154
7180
GGML_ASSERT(!ggml_is_transposed(src1));
@@ -7250,49 +7276,35 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
7250
7276
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
7251
7277
} else {
7252
7278
// use cublasGemmBatchedEx
7253
- // TODO: https://github.com/ggerganov/llama.cpp/pull/3749#discussion_r1369997000
7254
7279
const int ne23 = ne12*ne13;
7255
7280
7256
- // TODO: avoid this alloc
7257
- void ** ptrs = (void **) malloc(3*ne23*sizeof(void *));
7258
-
7259
- for (int i13 = 0; i13 < ne13; ++i13) {
7260
- for (int i12 = 0; i12 < ne12; ++i12) {
7261
- int i03 = i13 / r3;
7262
- int i02 = i12 / r2;
7263
-
7264
- ptrs[0*ne23 + i12 + i13*ne12] = (char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3];
7265
- ptrs[1*ne23 + i12 + i13*ne12] = (char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2;
7266
- ptrs[2*ne23 + i12 + i13*ne12] = (char *) dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2;
7267
- }
7268
- }
7269
-
7270
- // allocate device memory for pointers
7271
7281
void ** ptrs_as = nullptr;
7272
- CUDA_CHECK(cudaMalloc(&ptrs_as, 3*ne23*sizeof(void *)));
7273
-
7274
- // TODO: this does not work for some reason -- not sure why?
7275
- //size_t ptrs_s = 0;
7276
- //ptrs_as = (void **) ggml_cuda_pool_malloc(3*ne23*sizeof(void *), &ptrs_s);
7277
-
7278
- // copy pointers to device
7279
- CUDA_CHECK(cudaMemcpy(ptrs_as, ptrs, 3*ne23*sizeof(void *), cudaMemcpyHostToDevice));
7280
-
7281
- free(ptrs);
7282
+ size_t ptrs_s = 0;
7283
+ ptrs_as = (void **) ggml_cuda_pool_malloc(3*ne23*sizeof(void *), &ptrs_s);
7284
+
7285
+ dim3 block_dims(ne13, ne12);
7286
+ k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
7287
+ src0_as_f16, src1_as_f16, dst_f16,
7288
+ ptrs_as,
7289
+ ne12, ne13,
7290
+ ne23,
7291
+ nb02, nb03,
7292
+ nb12, nb13,
7293
+ dst->nb[2], dst->nb[3],
7294
+ r2, r3);
7295
+ CUDA_CHECK(cudaGetLastError());
7282
7296
7283
7297
CUBLAS_CHECK(
7284
7298
cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
7285
7299
ne01, ne11, ne10,
7286
- &alpha_f16, (const void **) (ptrs_as + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
7287
- (const void **) (ptrs_as + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
7288
- &beta_f16, ( void **) (ptrs_as + 2*ne23), CUDA_R_16F, ne01,
7300
+ &alpha_f16, (const void * const *) (ptrs_as + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
7301
+ (const void * const *) (ptrs_as + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
7302
+ &beta_f16, ( void ** ) (ptrs_as + 2*ne23), CUDA_R_16F, ne01,
7289
7303
ne23,
7290
7304
CUBLAS_COMPUTE_16F,
7291
7305
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
7292
7306
7293
- // free device memory for pointers
7294
- CUDA_CHECK(cudaFree(ptrs_as));
7295
- //ggml_cuda_pool_free(ptrs_as, ptrs_s);
7307
+ ggml_cuda_pool_free(ptrs_as, ptrs_s);
7296
7308
}
7297
7309
#endif
7298
7310
0 commit comments