@@ -512,6 +512,14 @@ static size_t g_scratch_offset = 0;
 
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
 
+[[noreturn]]
+static __device__ void bad_arch() {
+    printf("ERROR: ggml-cuda was compiled without support for the current GPU architecture.\n");
+    __trap();
+
+    (void) bad_arch; // suppress unused function warning
+}
+
 static __device__ __forceinline__ float warp_reduce_sum(float x) {
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
@@ -1972,8 +1980,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
     // second part effectively subtracts 8 from each quant value
     return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2010,8 +2017,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
     // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
     return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2046,8 +2052,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
     // second part effectively subtracts 16 from each quant value
     return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2092,8 +2097,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
     return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
 
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2114,8 +2118,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_imp
 
     return d8_0*d8_1 * sumi;
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2145,8 +2148,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
     // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
     return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2181,8 +2183,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
 
     return dm2f.x*sumf_d - dm2f.y*sumf_m;
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2219,8 +2220,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
 
     return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2260,8 +2260,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
 
     return d3 * sumf;
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2286,8 +2285,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
 
     return d3*d8 * sumi;
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2320,8 +2318,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
     return dm4f.x*sumf_d - dm4f.y*sumf_m;
 
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2354,8 +2351,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
     return dm4f.x*sumf_d - dm4f.y*sumf_m;
 
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2395,8 +2391,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
     return dm5f.x*sumf_d - dm5f.y*sumf_m;
 
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2429,8 +2424,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
     return dm4f.x*sumf_d - dm4f.y*sumf_m;
 
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2460,8 +2454,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
 
     return d*sumf;
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2492,8 +2485,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
     return d6 * sumf_d;
 
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -3359,8 +3351,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
     return dall * sumf_d - dmin * sumf_m;
 
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 
 #endif
@@ -3543,8 +3534,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
     return d * sumf_d;
 
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 
 #endif
@@ -3954,7 +3944,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q4_0_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4023,7 +4013,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q4_1_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4090,7 +4080,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q5_0_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4157,7 +4147,7 @@ mul_mat_q5_1(
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q5_1_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4224,7 +4214,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q8_0_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4291,7 +4281,7 @@ mul_mat_q2_K(
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q2_K_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4360,7 +4350,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q3_K_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4429,7 +4419,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q4_K_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4496,7 +4486,7 @@ mul_mat_q5_K(
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q5_K_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4565,7 +4555,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q6_K_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -6825,6 +6815,7 @@ static void ggml_cuda_op_get_rows(
             break;
         default:
             // TODO: k-quants
+            fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
             GGML_ASSERT(false);
             break;
     }
@@ -8782,8 +8773,6 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
     // TODO: mmq/mmv support
 #endif
 
-    GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
-
     const int64_t nb11 = src1->nb[1];
     const int64_t nb1 = dst->nb[1];
 
@@ -8812,13 +8801,21 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
     ggml_tensor src1_row = *src1;
     ggml_tensor dst_row = *dst;
 
+    src1_row.backend = GGML_BACKEND_GPU;
+    dst_row.backend = GGML_BACKEND_GPU;
+
     src1_row.extra = &src1_row_extra;
     dst_row.extra = &dst_row_extra;
 
-    char * src1_original = (char *) src1_extra->data_device[g_main_device];
-    char * dst_original = (char *) dst_extra->data_device[g_main_device];
+    char * src1_original = src1->backend == GGML_BACKEND_CPU ?
+        (char *) src1->data : (char *) src1_extra->data_device[g_main_device];
+    char * dst_original = dst->backend == GGML_BACKEND_CPU ?
+        (char *) dst->data : (char *) dst_extra->data_device[g_main_device];
 
     if (src1->ne[1] == 1) {
+        GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
+        GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
+
         for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
             // int32_t row_id;
             // CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
@@ -8846,6 +8843,11 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
         src1_row_extra.data_device[g_main_device] = src1_contiguous;
         dst_row_extra.data_device[g_main_device] = dst_contiguous;
 
+        const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_CPU ?
+            cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
+        const cudaMemcpyKind dst_kind = dst->backend == GGML_BACKEND_CPU ?
+            cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
+
         for (int32_t row_id = 0; row_id < n_as; ++row_id) {
             const struct ggml_tensor * src0_row = dst->src[row_id + 2];
 
@@ -8860,7 +8862,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
                 GGML_ASSERT(row_id >= 0 && row_id < n_as);
 
                 CUDA_CHECK(cudaMemcpyAsync(src1_contiguous + num_src1_rows*nb11, src1_original + i01*nb11,
-                    nb11, cudaMemcpyDeviceToDevice, stream));
+                    nb11, src1_kind, stream));
                 num_src1_rows++;
             }
 
@@ -8892,14 +8894,18 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
                 GGML_ASSERT(row_id >= 0 && row_id < n_as);
 
                 CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous + num_src1_rows*nb1,
-                    nb1, cudaMemcpyDeviceToDevice, stream));
+                    nb1, dst_kind, stream));
                 num_src1_rows++;
             }
         }
 
         ggml_cuda_pool_free(src1_contiguous, as_src1);
         ggml_cuda_pool_free(dst_contiguous, as_dst);
     }
+
+    if (dst->backend == GGML_BACKEND_CPU) {
+        CUDA_CHECK(cudaStreamSynchronize(stream));
+    }
 }
 
 static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -9297,7 +9303,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
         || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
 
-    if (!any_on_device && tensor->op != GGML_OP_MUL_MAT) {
+    if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
        return false;
     }
 
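A minimal standalone sketch of the pattern the diff applies, for reference only: on GPUs below the DP4A minimum, the otherwise-unreachable branch now calls the [[noreturn]] trap instead of assert(false) plus a dummy return. The MIN_CC_DP4A value and example_dot() below are illustrative assumptions, not code from ggml-cuda.cu.

// Sketch only: MIN_CC_DP4A's value and example_dot() are assumed for illustration.
#include <cstdio>

#define MIN_CC_DP4A 610

[[noreturn]] static __device__ void bad_arch() {
    printf("ERROR: ggml-cuda was compiled without support for the current GPU architecture.\n");
    __trap();        // abort the kernel on an unsupported GPU
    (void) bad_arch; // suppress the unused-function warning, as in the commit
}

static __device__ __forceinline__ float example_dot(int a, int b) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A
    return (float) __dp4a(a, b, 0); // fast path: 4-way int8 dot product with accumulate
#else
    bad_arch(); // [[noreturn]], so no "return 0.0f;" placeholder is needed to satisfy the compiler
#endif
}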