+#include <cstddef>
#include <cstdint>
#include <stdint.h>
#include <stdio.h>
@@ -253,6 +254,7 @@ static void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
    dequantize_block_q8_0<<<nb, 1, 0, stream>>>(vx, y);
}

+// TODO: optimize
static __global__ void convert_fp16_to_fp32(const void * vx, float * y) {
    const half * x = (const half *) vx;
@@ -345,26 +347,31 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
    CUDA_CHECK(cudaFree(ptr));
}

+#define GGML_CUDA_MAX_STREAMS 8
+#define GGML_CUDA_MAX_EVENTS 64
static cublasHandle_t g_cublasH = nullptr;
-static cudaStream_t g_cudaStream = nullptr;
-static cudaStream_t g_cudaStream2 = nullptr;
-static cudaEvent_t g_cudaEvent = nullptr;
+static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_STREAMS] = { nullptr };
+static cudaStream_t g_cudaStreams2[GGML_CUDA_MAX_STREAMS] = { nullptr };
+static cudaEvent_t g_cudaEvents[GGML_CUDA_MAX_EVENTS] = { nullptr };

void ggml_init_cublas() {
    if (g_cublasH == nullptr) {
-        // create cublas handle, bind a stream
-        CUBLAS_CHECK(cublasCreate(&g_cublasH));
-        CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStream, cudaStreamNonBlocking));
-        CUBLAS_CHECK(cublasSetStream(g_cublasH, g_cudaStream));
-        // enable tensor cores
-        CUBLAS_CHECK(cublasSetMathMode(g_cublasH, CUBLAS_TENSOR_OP_MATH));
+        // create streams
+        for (int i = 0; i < GGML_CUDA_MAX_STREAMS; ++i) {
+            CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams[i], cudaStreamNonBlocking));
+            CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams2[i], cudaStreamNonBlocking));
+        }
+        // create events
+        for (int i = 0; i < GGML_CUDA_MAX_EVENTS; ++i) {
+            CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvents[i], cudaEventDisableTiming));
+        }

-        // create additional stream and event for synchronization
-        CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStream2, cudaStreamNonBlocking));
-        CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvent, cudaEventDisableTiming));
+        // create cublas handle
+        CUBLAS_CHECK(cublasCreate(&g_cublasH));
+        CUBLAS_CHECK(cublasSetMathMode(g_cublasH, CUBLAS_TF32_TENSOR_OP_MATH));

        // configure logging to stdout
-        // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, NULL));
+        // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
    }
}
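For reference, every mul_mat path in the hunks below picks its stream (and, in the quantized path, a second stream and an event) by flattening the (i03, i02) slice index and taking it modulo the pool size, so slices are spread round-robin over the GGML_CUDA_MAX_STREAMS streams created above. Written as a standalone helper the indexing would look like the sketch below; ggml_cuda_stream_for is an illustrative name, the patch simply inlines this.

    // Illustrative only (not part of the patch): round-robin stream selection
    // for the (i03, i02) slice of a 4D tensor with ne02 slices per outer index.
    static cudaStream_t ggml_cuda_stream_for(int64_t i03, int64_t i02, int64_t ne02) {
        const int i = i03*ne02 + i02;                    // flat slice index, same as in the loops below
        return g_cudaStreams[i % GGML_CUDA_MAX_STREAMS]; // slices GGML_CUDA_MAX_STREAMS apart share a stream
    }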
@@ -433,39 +440,141 @@ static void ggml_cuda_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    const int x_ne = ne01 * ne00;
    const int y_ne = ne11 * ne10;
    const int d_ne = ne11 * ne01;
+    const int n_mm = ne03 * ne02;

    size_t x_size, y_size, d_size;
-    float * d_X = (float *) ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
-    float * d_Y = (float *) ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
-    float * d_D = (float *) ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
+    float * d_X = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * x_ne, &x_size);
+    float * d_Y = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * y_ne, &y_size);
+    float * d_D = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * d_ne, &d_size);

    for (int64_t i03 = 0; i03 < ne03; i03++) {
        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            int i = i03*ne02 + i02;
+            cudaStream_t cudaStream = g_cudaStreams[i % GGML_CUDA_MAX_STREAMS];
+
+            float * c_X = d_X + i * x_ne;
+            float * c_Y = d_Y + i * y_ne;
+            float * c_D = d_D + i * d_ne;
+
            // copy data to device
-            CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_X, src0, i03, i02, g_cudaStream));
-            CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Y, src1, i03, i02, g_cudaStream));
+            CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_X, src0, i03, i02, cudaStream));
+            CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Y, src1, i03, i02, cudaStream));

            // compute
+            CUBLAS_CHECK(cublasSetStream(g_cublasH, cudaStream));
            CUBLAS_CHECK(
                cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
                        ne01, ne11, ne10,
-                        &alpha, d_X, ne00,
-                                d_Y, ne10,
-                        &beta,  d_D, ne01));
+                        &alpha, c_X, ne00,
+                                c_Y, ne10,
+                        &beta,  c_D, ne01));
+
+            // copy dst to host
+            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            CUDA_CHECK(cudaMemcpyAsync(d, c_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream));
+        }
+    }
+
+    CUDA_CHECK(cudaDeviceSynchronize());
+    ggml_cuda_pool_free(d_X, x_size);
+    ggml_cuda_pool_free(d_Y, y_size);
+    ggml_cuda_pool_free(d_D, d_size);
+}
+
+static void ggml_cuda_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t /* wsize */) {
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+
+    const int nb10 = src1->nb[0];
+    const int nb11 = src1->nb[1];
+    const int nb12 = src1->nb[2];
+    const int nb13 = src1->nb[3];
+
+    const int nb2 = dst->nb[2];
+    const int nb3 = dst->nb[3];
+
+    const float alpha = 1.0f;
+    const float beta = 0.0f;
+    const int x_ne = ne01 * ne00;
+    const int y_ne = ne11 * ne10;
+    const int d_ne = ne11 * ne01;
+    const int n_mm = ne03 * ne02;
+
+    size_t x_size, y_size, d_size;
+    half  * d_X = (half  *) ggml_cuda_pool_malloc(n_mm * sizeof(half)  * x_ne, &x_size);
+    half  * d_Y = (half  *) ggml_cuda_pool_malloc(n_mm * sizeof(half)  * y_ne, &y_size);
+    float * d_D = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * d_ne, &d_size);
+
+    bool src1_cont_rows = nb10 == sizeof(float);
+    bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);
+
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            int i = i03*ne02 + i02;
+            cudaStream_t cudaStream = g_cudaStreams[i % GGML_CUDA_MAX_STREAMS];
+
+            half  * c_X = d_X + i * x_ne;
+            half  * c_Y = d_Y + i * y_ne;
+            float * c_D = d_D + i * d_ne;
+
+            // copy src0 to device
+            CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_X, src0, i03, i02, cudaStream));
+
+            // convert src1 to fp16
+            // TODO: use multiple threads
+            ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i03 * ne02 + i02);
+            char * src1i = (char *) src1->data + i03*nb13 + i02*nb12;
+            if (src1_cont_rows) {
+                if (src1_cont_cols) {
+                    ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
+                }
+                else {
+                    for (int64_t i01 = 0; i01 < ne11; i01++) {
+                        ggml_fp32_to_fp16_row((float *) (src1i + i01*nb11), tmp + i01*ne10, ne10);
+                    }
+                }
+            }
+            else {
+                for (int64_t i01 = 0; i01 < ne11; i01++) {
+                    for (int64_t i00 = 0; i00 < ne10; i00++) {
+                        // very slow due to no inlining
+                        tmp[i01*ne10 + i00] = ggml_fp32_to_fp16(*(float *) (src1i + i01*nb11 + i00*nb10));
+                    }
+                }
+            }
+
+            // copy src1 to device
+            CUDA_CHECK(cudaMemcpyAsync(c_Y, tmp, sizeof(half) * y_ne, cudaMemcpyHostToDevice, cudaStream));
+
+            // compute
+            CUBLAS_CHECK(cublasSetStream(g_cublasH, cudaStream));
+            CUBLAS_CHECK(
+                cublasGemmEx(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
+                        ne01, ne11, ne10,
+                        &alpha, c_X, CUDA_R_16F, ne00,
+                                c_Y, CUDA_R_16F, ne10,
+                        &beta,  c_D, CUDA_R_32F, ne01,
+                        CUBLAS_COMPUTE_32F_FAST_16F,
+                        CUBLAS_GEMM_DEFAULT));

-            // copy data to host
+            // copy dst to host
            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
-            CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream));
+            CUDA_CHECK(cudaMemcpyAsync(d, c_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream));
        }
    }

-    CUDA_CHECK(cudaStreamSynchronize(g_cudaStream));
+    CUDA_CHECK(cudaDeviceSynchronize());
    ggml_cuda_pool_free(d_X, x_size);
    ggml_cuda_pool_free(d_Y, y_size);
    ggml_cuda_pool_free(d_D, d_size);
}

-static void ggml_cuda_mul_mat_q(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    const int64_t ne00 = src0->ne[0];
    const int64_t ne01 = src0->ne[1];
    const int64_t ne02 = src0->ne[2];
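A note on the GEMM calls above and below: ggml tensors are row-major while cuBLAS is column-major. src0 (ne01 rows of ne00 values) viewed column-major with lda = ne00 is an ne00 x ne01 matrix, so CUBLAS_OP_T turns it back into the ne01 x ne00 operand; src1 is passed untransposed, and the column-major ne01 x ne11 result with ldc = ne01 is exactly dst in row-major order. Each call therefore computes dst = src1 * src0^T, which is ggml's mul_mat convention. The standalone toy program below (not part of the patch; it only assumes the CUDA toolkit and cuBLAS, error checking omitted) verifies that mapping on a 2x3 src0 and a 1x3 src1:

    #include <cstdio>
    #include <cublas_v2.h>
    #include <cuda_runtime.h>

    int main() {
        // src0 is 2x3 (ne01 = 2 rows, ne00 = 3 cols), src1 is 1x3 (ne11 = 1, ne10 = 3),
        // so dst should be 1x2 with dst[j] = dot(src1 row 0, src0 row j).
        const int ne00 = 3, ne01 = 2, ne10 = 3, ne11 = 1;
        const float X[ne01*ne00] = { 1, 2, 3,    // src0 row 0
                                     4, 5, 6 };  // src0 row 1
        const float Y[ne11*ne10] = { 1, 1, 1 };  // src1 row 0
        float D[ne11*ne01] = { 0 };

        float *d_X, *d_Y, *d_D;
        cudaMalloc(&d_X, sizeof(X));
        cudaMalloc(&d_Y, sizeof(Y));
        cudaMalloc(&d_D, sizeof(D));
        cudaMemcpy(d_X, X, sizeof(X), cudaMemcpyHostToDevice);
        cudaMemcpy(d_Y, Y, sizeof(Y), cudaMemcpyHostToDevice);

        cublasHandle_t h;
        cublasCreate(&h);
        const float alpha = 1.0f, beta = 0.0f;
        // same call shape as ggml_cuda_mul_mat_f32: dst = src1 * src0^T
        cublasSgemm(h, CUBLAS_OP_T, CUBLAS_OP_N,
                ne01, ne11, ne10,
                &alpha, d_X, ne00,
                        d_Y, ne10,
                &beta,  d_D, ne01);
        cudaMemcpy(D, d_D, sizeof(D), cudaMemcpyDeviceToHost);
        printf("dst = [%g, %g], expected [6, 15]\n", D[0], D[1]);

        cublasDestroy(h);
        cudaFree(d_X); cudaFree(d_Y); cudaFree(d_D);
        return 0;
    }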
@@ -483,46 +592,58 @@ static void ggml_cuda_mul_mat_q(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    const int x_ne = ne01 * ne00;
    const int y_ne = ne11 * ne10;
    const int d_ne = ne11 * ne01;
+    const int n_mm = ne03 * ne02;
+    const size_t q_sz = ggml_type_size(type) * x_ne / ggml_blck_size(type);

    size_t x_size, y_size, d_size, q_size;
-    float * d_X = (float *) ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
-    float * d_Y = (float *) ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
-    float * d_D = (float *) ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
-    void  * d_Q = (void  *) ggml_cuda_pool_malloc(ggml_type_size(type) * x_ne / ggml_blck_size(type), &q_size);
+    float * d_X = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * x_ne, &x_size);
+    float * d_Y = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * y_ne, &y_size);
+    float * d_D = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * d_ne, &d_size);
+    char  * d_Q = (char  *) ggml_cuda_pool_malloc(n_mm * q_sz, &q_size);

    const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(type);
-    GGML_ASSERT(to_fp32_cuda != NULL);
+    GGML_ASSERT(to_fp32_cuda != nullptr);

    for (int64_t i03 = 0; i03 < ne03; i03++) {
        for (int64_t i02 = 0; i02 < ne02; i02++) {
-            // copy and convert to fp32 on device
-            CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Q, src0, i03, i02, g_cudaStream2));
-
-            to_fp32_cuda(d_Q, d_X, x_ne, g_cudaStream2);
+            int i = i03*ne02 + i02;
+            cudaStream_t cudaStream  = g_cudaStreams[i % GGML_CUDA_MAX_STREAMS];
+            cudaStream_t cudaStream2 = g_cudaStreams2[i % GGML_CUDA_MAX_STREAMS];
+            cudaEvent_t  cudaEvent   = g_cudaEvents[i % GGML_CUDA_MAX_EVENTS];
+
+            float * c_X = d_X + i * x_ne;
+            float * c_Y = d_Y + i * y_ne;
+            float * c_D = d_D + i * d_ne;
+            char  * c_Q = d_Q + i * q_sz;
+
+            // copy src0 and convert to fp32 on device
+            CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Q, src0, i03, i02, cudaStream2));
+            to_fp32_cuda(c_Q, c_X, x_ne, cudaStream2);
            CUDA_CHECK(cudaGetLastError());
-            CUDA_CHECK(cudaEventRecord(g_cudaEvent, g_cudaStream2));
+            CUDA_CHECK(cudaEventRecord(cudaEvent, cudaStream2));

-            // copy data to device
-            CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Y, src1, i03, i02, g_cudaStream));
+            // copy src1 to device
+            CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Y, src1, i03, i02, cudaStream));

            // wait for conversion
-            CUDA_CHECK(cudaStreamWaitEvent(g_cudaStream, g_cudaEvent, 0));
+            CUDA_CHECK(cudaStreamWaitEvent(cudaStream, cudaEvent, 0));

            // compute
+            CUBLAS_CHECK(cublasSetStream(g_cublasH, cudaStream));
            CUBLAS_CHECK(
                cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
                        ne01, ne11, ne10,
-                        &alpha, d_X, ne00,
-                                d_Y, ne10,
-                        &beta,  d_D, ne01));
+                        &alpha, c_X, ne00,
+                                c_Y, ne10,
+                        &beta,  c_D, ne01));

-            // copy data to host
+            // copy dst to host
            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
-            CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream));
+            CUDA_CHECK(cudaMemcpyAsync(d, c_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream));
        }
    }

-    CUDA_CHECK(cudaStreamSynchronize(g_cudaStream));
+    CUDA_CHECK(cudaDeviceSynchronize());
    ggml_cuda_pool_free(d_X, x_size);
    ggml_cuda_pool_free(d_Y, y_size);
    ggml_cuda_pool_free(d_D, d_size);
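The quantized path above dequantizes src0 on cudaStream2 while the src1 upload proceeds on cudaStream, then uses an event so the GEMM on cudaStream only starts once the fp32 weights are ready. Stripped of the ggml specifics, the ordering pattern is the one in this small standalone program (illustrative only; the toy kernels and names are placeholders, not ggml code, and error checking is omitted):

    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void produce(float * x, int n) {                  // stands in for the dequantize kernel
        int i = blockIdx.x*blockDim.x + threadIdx.x;
        if (i < n) x[i] = 2.0f;
    }

    __global__ void consume(const float * x, float * y, int n) { // stands in for the GEMM
        int i = blockIdx.x*blockDim.x + threadIdx.x;
        if (i < n) y[i] = x[i] + 1.0f;
    }

    int main() {
        const int n = 1024;
        float *d_x, *d_y;
        cudaMalloc(&d_x, n*sizeof(float));
        cudaMalloc(&d_y, n*sizeof(float));

        cudaStream_t stream, stream2;
        cudaEvent_t  event;
        cudaStreamCreateWithFlags(&stream,  cudaStreamNonBlocking);
        cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking);
        cudaEventCreateWithFlags(&event, cudaEventDisableTiming);

        produce<<<(n + 255)/256, 256, 0, stream2>>>(d_x, n);     // producer work on stream2
        cudaEventRecord(event, stream2);                         // mark the point consumers must wait for

        cudaStreamWaitEvent(stream, event, 0);                   // stream blocks until the event completes
        consume<<<(n + 255)/256, 256, 0, stream>>>(d_x, d_y, n); // safe: sees the produced values

        cudaDeviceSynchronize();
        float y0;
        cudaMemcpy(&y0, d_y, sizeof(float), cudaMemcpyDeviceToHost);
        printf("y[0] = %g, expected 3\n", y0);

        cudaEventDestroy(event);
        cudaStreamDestroy(stream);
        cudaStreamDestroy(stream2);
        cudaFree(d_x); cudaFree(d_y);
        return 0;
    }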
@@ -547,18 +668,48 @@ bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
    return false;
}

-void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(ggml_cuda_can_mul_mat(src0, src1, dst));
+bool ggml_cuda_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * /* dst */) {
+    size_t src0_sz = ggml_nbytes(src0);
+    size_t src1_sz = ggml_nbytes(src1);

-    const ggml_type type = src0->type;
+    // mul_mat_q: src0 is converted to fp32 on device
+    size_t mul_mat_q_transfer = src0_sz + src1_sz;
+
+    // mul_mat_f16: src1 is converted to fp16 on cpu
+    size_t mul_mat_f16_transfer = src0_sz + sizeof(half) * ggml_nelements(src1);
+
+    // choose the smaller one to transfer to the device
+    // TODO: this is not always the best choice due to the overhead of converting to fp16
+    return mul_mat_f16_transfer < mul_mat_q_transfer;
+}
+
+void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t wsize) {
+    GGML_ASSERT(ggml_cuda_can_mul_mat(src0, src1, dst));

-    if (type == GGML_TYPE_F32) {
+    if (src0->type == GGML_TYPE_F32) {
        ggml_cuda_mul_mat_f32(src0, src1, dst);
    }
-    else if (type == GGML_TYPE_F16 || ggml_is_quantized(type)) {
-        ggml_cuda_mul_mat_q(src0, src1, dst);
+    else if (src0->type == GGML_TYPE_F16) {
+        if (ggml_cuda_mul_mat_use_f16(src0, src1, dst)) {
+            ggml_cuda_mul_mat_f16(src0, src1, dst, wdata, wsize);
+        }
+        else {
+            ggml_cuda_mul_mat_q_f32(src0, src1, dst);
+        }
+    }
+    else if (ggml_is_quantized(src0->type)) {
+        ggml_cuda_mul_mat_q_f32(src0, src1, dst);
    }
    else {
        GGML_ASSERT(false);
    }
}
+
+size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+    if (ggml_cuda_mul_mat_use_f16(src0, src1, dst)) {
+        return ggml_nelements(src1) * sizeof(ggml_fp16_t);
+    }
+    else {
+        return 0;
+    }
+}
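ggml_cuda_mul_mat now needs a host scratch buffer for the f16 path (to hold the fp16-converted src1), sized by the new ggml_cuda_mul_mat_get_wsize. A minimal sketch of the intended calling pattern follows; it is illustrative only — the actual caller in ggml is expected to manage a reusable work buffer itself rather than allocating one per call:

    // Illustrative only (not part of the patch): how the new entry points fit together.
    static void mul_mat_on_gpu_example(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
        GGML_ASSERT(ggml_cuda_can_mul_mat(src0, src1, dst));

        const size_t wsize = ggml_cuda_mul_mat_get_wsize(src0, src1, dst);
        void * wdata = wsize > 0 ? malloc(wsize) : nullptr;   // host scratch for the fp16-converted src1

        ggml_cuda_mul_mat(src0, src1, dst, wdata, wsize);     // only the f16 path touches wdata

        free(wdata);
    }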