@@ -7576,7 +7576,7 @@ static void ggml_cuda_op_mul_mat_cublas(
 
     const int compute_capability = g_device_caps[id].cc;
 
-    if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
+    if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
         // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
         cuda_pool_alloc<half> src0_as_f16;
         if (src0->type != GGML_TYPE_F16) {
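This first change drops the dst->op_params[0] == GGML_PREC_DEFAULT condition, so matmuls that request FP32 precision also take the cuBLAS path; the precision is then dispatched on in the next hunk. As a hedged caller-side sketch, assuming ggml's public ggml_mul_mat_set_prec() helper (which records the requested enum ggml_prec value in the result tensor's op_params[0]):

// Hedged caller-side sketch (not part of this diff), assuming ggml's public API:
// ggml_mul_mat_set_prec() stores the requested precision in op_params[0] of the
// mul_mat result, which the CUDA backend reads in the hunk below.
#include "ggml.h"

static struct ggml_tensor * mul_mat_with_f32_prec(struct ggml_context * ctx,
                                                  struct ggml_tensor  * a,
                                                  struct ggml_tensor  * b) {
    struct ggml_tensor * dst = ggml_mul_mat(ctx, a, b);
    ggml_mul_mat_set_prec(dst, GGML_PREC_F32);   // default is GGML_PREC_DEFAULT (fp16 accumulation on CUDA)
    return dst;
}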
@@ -7597,23 +7597,44 @@ static void ggml_cuda_op_mul_mat_cublas(
             to_fp16_cuda(src1_ddf_i, src1_as_f16.get(), ne, stream);
         }
         const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16.get();
-        cuda_pool_alloc<half> dst_f16(row_diff*src1_ncols);
 
-        const half alpha_f16 = 1.0f;
-        const half beta_f16  = 0.0f;
-
-        CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream));
-        CUBLAS_CHECK(
-            cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
-                    row_diff, src1_ncols, ne10,
-                    &alpha_f16, src0_ptr,      CUDA_R_16F, ne00,
-                                src1_ptr,      CUDA_R_16F, ne10,
-                    &beta_f16,  dst_f16.get(), CUDA_R_16F, ldc,
-                    CUBLAS_COMPUTE_16F,
-                    CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-
-        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
-        to_fp32_cuda(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
+        switch (dst->op_params[0]) {
+            case GGML_PREC_DEFAULT:
+                {
+                    cuda_pool_alloc<half> dst_f16(row_diff*src1_ncols);
+
+                    const half alpha_f16 = 1.0f;
+                    const half beta_f16  = 0.0f;
+
+                    CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream));
+                    CUBLAS_CHECK(
+                        cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
+                                row_diff, src1_ncols, ne10,
+                                &alpha_f16, src0_ptr,      CUDA_R_16F, ne00,
+                                            src1_ptr,      CUDA_R_16F, ne10,
+                                &beta_f16,  dst_f16.get(), CUDA_R_16F, ldc,
+                                CUBLAS_COMPUTE_16F,
+                                CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+
+                    const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
+                    to_fp32_cuda(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
+                } break;
+            case GGML_PREC_F32:
+                {
+                    const float alpha_f32 = 1.0f;
+                    const float beta_f32  = 0.0f;
+
+                    CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream));
+                    CUBLAS_CHECK(
+                        cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
+                                row_diff, src1_ncols, ne10,
+                                &alpha_f32, src0_ptr, CUDA_R_16F, ne00,
+                                            src1_ptr, CUDA_R_16F, ne10,
+                                &beta_f32,  dst_dd_i, CUDA_R_32F, ldc,
+                                CUBLAS_COMPUTE_32F,
+                                CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+                } break;
+        }
     }
     else {
         cuda_pool_alloc<float> src0_ddq_as_f32;
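For reference, a minimal standalone sketch (not taken from the repository) of the cublasGemmEx configuration used by the new GGML_PREC_F32 branch: FP16 inputs, FP32 compute type, and the FP32 result written directly into the output buffer, which is why that branch needs no to_fp32_cuda conversion afterwards. The matrix sizes, fill values, and error-check macros here are illustrative assumptions.

// Standalone sketch of FP16 x FP16 -> FP32 GEMM with FP32 accumulation,
// mirroring the GGML_PREC_F32 case above. Column-major, C = A^T * B.
#include <cstdio>
#include <vector>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cublas_v2.h>

#define CUDA_CHECK(x)   do { cudaError_t    e = (x); if (e != cudaSuccess)           { printf("CUDA error %d at %s:%d\n",   (int) e, __FILE__, __LINE__); return 1; } } while (0)
#define CUBLAS_CHECK(x) do { cublasStatus_t s = (x); if (s != CUBLAS_STATUS_SUCCESS) { printf("cuBLAS error %d at %s:%d\n", (int) s, __FILE__, __LINE__); return 1; } } while (0)

int main() {
    const int m = 64, n = 32, k = 128;  // C is m x n, A is stored k x m and used transposed

    std::vector<half>  a_h(k*m), b_h(k*n);
    std::vector<float> c_h(m*n, 0.0f);
    for (size_t i = 0; i < a_h.size(); ++i) a_h[i] = __float2half(0.01f*(i % 7));
    for (size_t i = 0; i < b_h.size(); ++i) b_h[i] = __float2half(0.02f*(i % 5));

    half  * a_d; half * b_d; float * c_d;
    CUDA_CHECK(cudaMalloc((void **) &a_d, a_h.size()*sizeof(half)));
    CUDA_CHECK(cudaMalloc((void **) &b_d, b_h.size()*sizeof(half)));
    CUDA_CHECK(cudaMalloc((void **) &c_d, c_h.size()*sizeof(float)));
    CUDA_CHECK(cudaMemcpy(a_d, a_h.data(), a_h.size()*sizeof(half), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(b_d, b_h.data(), b_h.size()*sizeof(half), cudaMemcpyHostToDevice));

    cublasHandle_t handle;
    CUBLAS_CHECK(cublasCreate(&handle));

    // FP32 alpha/beta, FP32 compute type, FP32 output buffer - no fp16 staging of the result.
    const float alpha = 1.0f;
    const float beta  = 0.0f;
    CUBLAS_CHECK(
        cublasGemmEx(handle, CUBLAS_OP_T, CUBLAS_OP_N,
                m, n, k,
                &alpha, a_d, CUDA_R_16F, k,
                        b_d, CUDA_R_16F, k,
                &beta,  c_d, CUDA_R_32F, m,
                CUBLAS_COMPUTE_32F,
                CUBLAS_GEMM_DEFAULT_TENSOR_OP));

    CUDA_CHECK(cudaMemcpy(c_h.data(), c_d, c_h.size()*sizeof(float), cudaMemcpyDeviceToHost));
    printf("C[0] = %f\n", c_h[0]);

    CUBLAS_CHECK(cublasDestroy(handle));
    CUDA_CHECK(cudaFree(a_d)); CUDA_CHECK(cudaFree(b_d)); CUDA_CHECK(cudaFree(c_d));
    return 0;
}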
@@ -9228,6 +9249,20 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
 }
 
 void ggml_cuda_free_data(struct ggml_tensor * tensor) {
+    // print current mem usage using cudaMemGetInfo
+    // TODO: this is a hack - need better solution
+    {
+        size_t free;
+        size_t total;
+        CUDA_CHECK(cudaMemGetInfo(&free, &total));
+
+        static size_t used = 0;
+        if (used < total - free) {
+            printf("CUDA: used %zu MB, free %zu MB\n", (total - free)/1024/1024, free/1024/1024);
+            used = total - free;
+        }
+    }
+
     if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT)) {
         return;
     }
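The block added to ggml_cuda_free_data only prints when used device memory (total - free) exceeds the largest value seen so far, i.e. it reports a high-water mark rather than logging on every call. A standalone sketch of the same idea, with illustrative allocation sizes and helper names that are not from the repository:

// Sketch of high-water-mark memory reporting via cudaMemGetInfo():
// print only when (total - free) grows past the maximum seen so far.
#include <cstdio>
#include <cuda_runtime.h>

static void report_cuda_mem_high_water_mark(void) {
    size_t free_mem  = 0;
    size_t total_mem = 0;
    if (cudaMemGetInfo(&free_mem, &total_mem) != cudaSuccess) {
        return;
    }

    static size_t used_max = 0;
    const size_t used = total_mem - free_mem;
    if (used > used_max) {
        printf("CUDA: used %zu MB, free %zu MB\n", used/1024/1024, free_mem/1024/1024);
        used_max = used;
    }
}

int main() {
    void * bufs[4] = {NULL, NULL, NULL, NULL};
    for (int i = 0; i < 4; ++i) {
        cudaMalloc(&bufs[i], 64*1024*1024);   // 64 MB per step, purely illustrative
        report_cuda_mem_high_water_mark();    // prints only when usage reaches a new maximum
    }
    for (int i = 0; i < 4; ++i) {
        cudaFree(bufs[i]);
        report_cuda_mem_high_water_mark();    // no output here: usage only shrinks
    }
    return 0;
}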