Commit 16582cd

Merge remote-tracking branch 'origin/master' into sl/ggml-backend-int
2 parents: cd4167b + 8fe03ff

File tree: 5 files changed (+64, -56 lines)

README.md (2 additions, 0 deletions)

```diff
@@ -982,6 +982,8 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /
 - There are no strict rules for the code style, but try to follow the patterns in the code (indentation, spaces, etc.). Vertical alignment makes things more readable and easier to batch edit
 - Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a`
 - See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
+- Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
+- Matrix multiplication is unconventional: [`z = ggml_mul_mat(ctx, x, y)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means `zT = x @ yT`

 ### Docs
```
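
The new `ggml_mul_mat` bullet is easier to digest with a concrete shape check. The following is a minimal host-side sketch, not part of the commit; the shapes are arbitrary and only the public ggml API (`ggml_init`, `ggml_new_tensor_2d`, `ggml_mul_mat`) is assumed:

```cuda
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // ne[0] = columns, ne[1] = rows (row-major), per the README note.
    // x is 3 rows x 4 columns -> ne = [4, 3]; y is 2 rows x 4 columns -> ne = [4, 2]
    struct ggml_tensor * x = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 3);
    struct ggml_tensor * y = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 2);

    // zT = x @ yT: the result has x's rows as its columns and y's rows as its rows.
    struct ggml_tensor * z = ggml_mul_mat(ctx, x, y);
    GGML_ASSERT(z->ne[0] == 3 && z->ne[1] == 2);

    ggml_free(ctx);
    return 0;
}
```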

common/common.cpp (1 addition, 1 deletion)

```diff
@@ -920,7 +920,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf(" -m FNAME, --model FNAME\n");
     printf(" model path (default: %s)\n", params.model.c_str());
     printf(" -md FNAME, --model-draft FNAME\n");
-    printf(" draft model for speculative decoding (default: %s)\n", params.model.c_str());
+    printf(" draft model for speculative decoding\n");
     printf(" -ld LOGDIR, --logdir LOGDIR\n");
     printf(" path under which to save YAML logs (no logging if unset)\n");
     printf(" --override-kv KEY=TYPE:VALUE\n");
```
ggml-cuda.cu (59 additions, 53 deletions)

```diff
@@ -512,6 +512,14 @@ static size_t g_scratch_offset = 0;

 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};

+[[noreturn]]
+static __device__ void bad_arch() {
+    printf("ERROR: ggml-cuda was compiled without support for the current GPU architecture.\n");
+    __trap();
+
+    (void) bad_arch; // suppress unused function warning
+}
+
 static __device__ __forceinline__ float warp_reduce_sum(float x) {
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
```
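
Since `bad_arch()` is `[[noreturn]]` (it traps), the call sites below can drop the old `return 0.0f; // only to satisfy the compiler` lines. A minimal sketch of the guard pattern, not part of the commit: the function and kernel names are hypothetical, and it assumes `MIN_CC_DP4A` is 610 (the first architecture with the `__dp4a` intrinsic):

```cuda
#include <cstdio>

// Same shape as the helper added above; traps instead of returning.
[[noreturn]] static __device__ void bad_arch_sketch() {
    printf("ERROR: compiled without support for the current GPU architecture.\n");
    __trap();

    (void) bad_arch_sketch; // suppress unused function warning
}

// Hypothetical vec-dot that needs the DP4A instruction (sm_61+).
static __device__ __forceinline__ float vec_dot_sketch(int a, int b) {
#if __CUDA_ARCH__ >= 610 // MIN_CC_DP4A
    return (float) __dp4a(a, b, 0); // packed 4x int8 dot product with accumulate
#else
    bad_arch_sketch(); // [[noreturn]]: no dummy `return 0.0f;` needed
#endif
}

__global__ void vec_dot_demo(const int * a, const int * b, float * out) {
    out[threadIdx.x] = vec_dot_sketch(a[threadIdx.x], b[threadIdx.x]);
}
```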

```diff
@@ -1972,8 +1980,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
     // second part effectively subtracts 8 from each quant value
     return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
```

```diff
@@ -2010,8 +2017,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
     // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
     return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
```

```diff
@@ -2046,8 +2052,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
     // second part effectively subtracts 16 from each quant value
     return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
```

```diff
@@ -2092,8 +2097,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
     return sumi*d5d8 + m5s8 / (QI5_1 / vdr);

 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
```

```diff
@@ -2114,8 +2118,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_imp

     return d8_0*d8_1 * sumi;
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
```

```diff
@@ -2145,8 +2148,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
     // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
     return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
```

```diff
@@ -2181,8 +2183,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(

     return dm2f.x*sumf_d - dm2f.y*sumf_m;
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
```

```diff
@@ -2219,8 +2220,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(

     return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
```

```diff
@@ -2260,8 +2260,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(

     return d3 * sumf;
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
```

```diff
@@ -2286,8 +2285,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(

     return d3*d8 * sumi;
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
```

```diff
@@ -2320,8 +2318,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
     return dm4f.x*sumf_d - dm4f.y*sumf_m;

 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
```

```diff
@@ -2354,8 +2351,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
     return dm4f.x*sumf_d - dm4f.y*sumf_m;

 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
```

```diff
@@ -2395,8 +2391,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
     return dm5f.x*sumf_d - dm5f.y*sumf_m;

 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
```

```diff
@@ -2429,8 +2424,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
     return dm4f.x*sumf_d - dm4f.y*sumf_m;

 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
```

```diff
@@ -2460,8 +2454,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(

     return d*sumf;
 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
```

```diff
@@ -2492,8 +2485,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
     return d6 * sumf_d;

 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
```

```diff
@@ -3359,8 +3351,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
     return dall * sumf_d - dmin * sumf_m;

 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A

 #endif
```

```diff
@@ -3543,8 +3534,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
     return d * sumf_d;

 #else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A

 #endif
```

```diff
@@ -3954,7 +3944,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q4_0_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
```

```diff
@@ -4023,7 +4013,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q4_1_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
```

```diff
@@ -4090,7 +4080,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q5_0_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
```

```diff
@@ -4157,7 +4147,7 @@ mul_mat_q5_1(
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q5_1_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
```

```diff
@@ -4224,7 +4214,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q8_0_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
```

```diff
@@ -4291,7 +4281,7 @@ mul_mat_q2_K(
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q2_K_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
```

```diff
@@ -4360,7 +4350,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q3_K_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
```

```diff
@@ -4429,7 +4419,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q4_K_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
```

```diff
@@ -4496,7 +4486,7 @@ mul_mat_q5_K(
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q5_K_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
```

```diff
@@ -4565,7 +4555,7 @@ template <bool need_check> static __global__ void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q6_K_q8_1_mul_mat;
-    assert(false);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
```

```diff
@@ -6825,6 +6815,7 @@ static void ggml_cuda_op_get_rows(
             break;
         default:
             // TODO: k-quants
+            fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
             GGML_ASSERT(false);
             break;
     }
```

```diff
@@ -8782,8 +8773,6 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
     // TODO: mmq/mmv support
 #endif

-    GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
-
     const int64_t nb11 = src1->nb[1];
     const int64_t nb1 = dst->nb[1];
```

```diff
@@ -8812,13 +8801,21 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
     ggml_tensor src1_row = *src1;
     ggml_tensor dst_row = *dst;

+    src1_row.backend = GGML_BACKEND_GPU;
+    dst_row.backend = GGML_BACKEND_GPU;
+
     src1_row.extra = &src1_row_extra;
     dst_row.extra = &dst_row_extra;

-    char * src1_original = (char *) src1_extra->data_device[g_main_device];
-    char * dst_original = (char *) dst_extra->data_device[g_main_device];
+    char * src1_original = src1->backend == GGML_BACKEND_CPU ?
+        (char *) src1->data : (char *) src1_extra->data_device[g_main_device];
+    char * dst_original = dst->backend == GGML_BACKEND_CPU ?
+        (char *) dst->data : (char *) dst_extra->data_device[g_main_device];

     if (src1->ne[1] == 1) {
+        GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
+        GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
+
         for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
             //int32_t row_id;
             //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
```

```diff
@@ -8846,6 +8843,11 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
         src1_row_extra.data_device[g_main_device] = src1_contiguous;
         dst_row_extra.data_device[g_main_device] = dst_contiguous;

+        const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_CPU ?
+            cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
+        const cudaMemcpyKind dst_kind = dst->backend == GGML_BACKEND_CPU ?
+            cudaMemcpyDeviceToHost : cudaMemcpyDeviceToDevice;
+
         for (int32_t row_id = 0; row_id < n_as; ++row_id) {
             const struct ggml_tensor * src0_row = dst->src[row_id + 2];
```

```diff
@@ -8860,7 +8862,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
                 GGML_ASSERT(row_id >= 0 && row_id < n_as);

                 CUDA_CHECK(cudaMemcpyAsync(src1_contiguous + num_src1_rows*nb11, src1_original + i01*nb11,
-                                           nb11, cudaMemcpyDeviceToDevice, stream));
+                                           nb11, src1_kind, stream));
                 num_src1_rows++;
             }
```

```diff
@@ -8892,14 +8894,18 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
                 GGML_ASSERT(row_id >= 0 && row_id < n_as);

                 CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous + num_src1_rows*nb1,
-                                           nb1, cudaMemcpyDeviceToDevice, stream));
+                                           nb1, dst_kind, stream));
                 num_src1_rows++;
             }
         }

         ggml_cuda_pool_free(src1_contiguous, as_src1);
         ggml_cuda_pool_free(dst_contiguous, as_dst);
     }
+
+    if (dst->backend == GGML_BACKEND_CPU) {
+        CUDA_CHECK(cudaStreamSynchronize(stream));
+    }
 }

 static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
```
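
For context on the two hunks above, not part of the commit: when a tensor is `GGML_BACKEND_CPU` its rows live in host memory, so the gather/scatter copies must switch from device-to-device to host-to-device and device-to-host, and a result landing in (pageable) host memory is only safe to read after the stream is synchronized. A self-contained sketch of that pattern, with hypothetical names and sizes:

```cuda
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

// Minimal stand-in for ggml-cuda's CUDA_CHECK macro.
#define CHECK(call)                                                        \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err_)); \
            exit(1);                                                       \
        }                                                                  \
    } while (0)

int main(void) {
    const size_t nb = 256;     // bytes per row (hypothetical)
    char host_row[256] = {0};  // plays the role of a GGML_BACKEND_CPU tensor

    char * dev_buf = NULL;
    CHECK(cudaMalloc((void **) &dev_buf, nb));

    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // Direction depends on where the tensor's data lives, as in the hunks above.
    const bool src_on_cpu = true;
    const bool dst_on_cpu = true;
    const cudaMemcpyKind src_kind = src_on_cpu ? cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
    const cudaMemcpyKind dst_kind = dst_on_cpu ? cudaMemcpyDeviceToHost : cudaMemcpyDeviceToDevice;

    CHECK(cudaMemcpyAsync(dev_buf, host_row, nb, src_kind, stream)); // gather rows
    CHECK(cudaMemcpyAsync(host_row, dev_buf, nb, dst_kind, stream)); // scatter result

    // Async copies into pageable host memory may still be in flight here,
    // hence the cudaStreamSynchronize when the result is a CPU tensor.
    if (dst_on_cpu) {
        CHECK(cudaStreamSynchronize(stream));
    }

    CHECK(cudaStreamDestroy(stream));
    CHECK(cudaFree(dev_buf));
    return 0;
}
```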

```diff
@@ -9297,7 +9303,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
         || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
         || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);

-    if (!any_on_device && tensor->op != GGML_OP_MUL_MAT) {
+    if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
         return false;
     }
```