
Commit 665018c

CLBlast: Add broadcast support for matrix multiplication (#3402)
Broadcast src0 into src1 across dimensions 2 and 3 when needed. This is required for models that use GQA (grouped-query attention).
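
For context, the new loops iterate over src1's slices in dimensions 2 and 3 and map each slice back to a src0 slice by integer division, so one src0 slice can be reused by several src1 slices. The standalone sketch below is illustrative only: the shapes are made up, but the r2/r3 and i02/i03 arithmetic mirrors the patch. It prints which src0 slice each src1 slice is multiplied with:

#include <stdint.h>
#include <stdio.h>

int main(void) {
    // Made-up GQA-like shapes: src1 has 4 x 2 slices, src0 only 2 x 1.
    const int64_t ne02 = 2, ne03 = 1;   // src0 dims 2 and 3
    const int64_t ne12 = 4, ne13 = 2;   // src1 dims 2 and 3

    // Broadcast ratios, as in the patch: src1 must be a whole multiple of src0.
    const int64_t r2 = ne12 / ne02;
    const int64_t r3 = ne13 / ne03;

    for (int64_t i13 = 0; i13 < ne13; i13++) {
        for (int64_t i12 = 0; i12 < ne12; i12++) {
            // Each src1 slice (i12, i13) reuses src0 slice (i12 / r2, i13 / r3).
            printf("src1(%lld,%lld) x src0(%lld,%lld) -> dst(%lld,%lld)\n",
                   (long long) i12, (long long) i13,
                   (long long) (i12 / r2), (long long) (i13 / r3),
                   (long long) i12, (long long) i13);
        }
    }
    return 0;
}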
1 parent 29a404a commit 665018c


2 files changed, +67 -28 lines changed

ggml-opencl.cpp

Lines changed: 67 additions & 23 deletions
@@ -1476,10 +1476,15 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
 
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
 
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];
 
+    const int64_t r2 = ne12 / ne02;
+    const int64_t r3 = ne13 / ne03;
+
     const float alpha = 1.0f;
     const float beta = 0.0f;
     const int x_ne = ne01 * ne00;
@@ -1498,13 +1503,22 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
     cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
 
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
+    int64_t pi02 = -1;
+    int64_t pi03 = -1;
+
+    for (int64_t i13 = 0; i13 < ne13; i13++) {
+        int64_t i03 = i13 / r3;
+
+        for (int64_t i12 = 0; i12 < ne12; i12++) {
+            int64_t i02 = i12 / r2;
+
             // copy data to device
-            if (src0->backend != GGML_BACKEND_GPU) {
+            if (src0->backend != GGML_BACKEND_GPU && (i02 != pi02 || i03 != pi03)) {
                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
+                pi02 = i02;
+                pi03 = i03;
             }
-            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
+            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
 
             CL_CHECK(clFinish(queue));
 
@@ -1525,7 +1539,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
             }
 
             // copy dst to host
-            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
             CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
         }
     }
@@ -1547,6 +1561,8 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
 
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
 
     const int nb10 = src1->nb[0];
     const int nb11 = src1->nb[1];
@@ -1556,6 +1572,9 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];
 
+    const int64_t r2 = ne12 / ne02;
+    const int64_t r3 = ne13 / ne03;
+
     const ggml_fp16_t alpha = ggml_fp32_to_fp16(1.0f);
     const ggml_fp16_t beta = ggml_fp32_to_fp16(0.0f);
     const int x_ne = ne01 * ne00;
@@ -1577,32 +1596,41 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     bool src1_cont_rows = nb10 == sizeof(float);
     bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);
 
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
+    int64_t pi02 = -1;
+    int64_t pi03 = -1;
+
+    for (int64_t i13 = 0; i13 < ne13; i13++) {
+        int64_t i03 = i13 / r3;
+
+        for (int64_t i12 = 0; i12 < ne12; i12++) {
+            int64_t i02 = i12 / r2;
+
             // copy src0 to device
-            if (src0->backend != GGML_BACKEND_GPU) {
+            if (src0->backend != GGML_BACKEND_GPU && (i02 != pi02 || i03 != pi03)) {
                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
+                pi02 = i02;
+                pi03 = i03;
             }
 
             // convert src1 to fp16
             // TODO: use multiple threads
-            ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i03 * ne02 + i02);
-            char * src1i = (char *) src1->data + i03*nb13 + i02*nb12;
+            ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i13 * ne12 + i12);
+            char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
             if (src1_cont_rows) {
                 if (src1_cont_cols) {
                     ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
                 }
                 else {
-                    for (int64_t i01 = 0; i01 < ne11; i01++) {
-                        ggml_fp32_to_fp16_row((float *) (src1i + i01*nb11), tmp + i01*ne10, ne10);
+                    for (int64_t i11 = 0; i11 < ne11; i11++) {
+                        ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
                     }
                 }
             }
             else {
-                for (int64_t i01 = 0; i01 < ne11; i01++) {
-                    for (int64_t i00 = 0; i00 < ne10; i00++) {
+                for (int64_t i11 = 0; i11 < ne11; i11++) {
+                    for (int64_t i10 = 0; i10 < ne10; i10++) {
                         // very slow due to no inlining
-                        tmp[i01*ne10 + i00] = ggml_fp32_to_fp16(*(float *) (src1i + i01*nb11 + i00*nb10));
+                        tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10));
                     }
                 }
             }
@@ -1631,7 +1659,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
             // copy dst to host, then convert to float
             CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
 
-            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
 
             ggml_fp16_to_fp32_row(tmp, d, d_ne);
         }
@@ -1652,12 +1680,17 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
 
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
 
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];
     const ggml_type type = src0->type;
     const bool mul_mat_vec = ne11 == 1;
 
+    const int64_t r2 = ne12 / ne02;
+    const int64_t r3 = ne13 / ne03;
+
     const float alpha = 1.0f;
     const float beta = 0.0f;
     const int x_ne = ne01 * ne00;
@@ -1690,12 +1723,23 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     size_t ev_idx = 0;
     std::vector<cl_event> events;
 
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
+    int64_t pi02 = -1;
+    int64_t pi03 = -1;
+
+    for (int64_t i13 = 0; i13 < ne13; i13++) {
+        int64_t i03 = i13 / r3;
+
+        for (int64_t i12 = 0; i12 < ne12; i12++) {
+            int64_t i02 = i12 / r2;
+
             // copy src0 to device if necessary
             if (src0->backend == GGML_BACKEND_CPU) {
-                events.emplace_back();
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
+                if (i02 != pi02 || i03 != pi03) {
+                    events.emplace_back();
+                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
+                    pi02 = i02;
+                    pi03 = i03;
+                }
             } else if (src0->backend == GGML_BACKEND_GPU) {
                 d_Q = (cl_mem) src0->extra;
             } else {
@@ -1704,7 +1748,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
                 // copy src1 to device
                 events.emplace_back();
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, events.data() + ev_idx++));
+                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
 
                 // compute
                 const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
@@ -1725,7 +1769,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
                 CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
 
                 // copy src1 to device
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
+                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
 
                 events.emplace_back();
 
@@ -1749,7 +1793,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             }
 
             // copy dst to host
-            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
             CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
             for (auto *event : events) {
                 clReleaseEvent(event);
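
One more note on the hunks above: the pi02/pi03 bookkeeping means src0 is only copied to the device again when the mapped (i02, i03) slice actually changes between iterations. The standalone sketch below (made-up head counts, not taken from the patch) counts how many uploads the check lets through:

#include <stdint.h>
#include <stdio.h>

// Hypothetical GQA-like shapes: src1 has 32 slices in dim 2, src0 (K/V) has 8.
int main(void) {
    const int64_t ne02 = 8,  ne03 = 1;   // src0 slices
    const int64_t ne12 = 32, ne13 = 1;   // src1 slices
    const int64_t r2 = ne12 / ne02, r3 = ne13 / ne03;

    int64_t uploads = 0, pi02 = -1, pi03 = -1;
    for (int64_t i13 = 0; i13 < ne13; i13++) {
        for (int64_t i12 = 0; i12 < ne12; i12++) {
            const int64_t i03 = i13 / r3, i02 = i12 / r2;
            if (i02 != pi02 || i03 != pi03) {   // same check as the patch
                uploads++;                      // the real code copies the src0 slice here
                pi02 = i02; pi03 = i03;
            }
        }
    }
    // Prints 8 uploads for 32 iterations: one per distinct src0 slice.
    printf("src0 uploads: %lld of %lld iterations\n",
           (long long) uploads, (long long) (ne12 * ne13));
    return 0;
}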

ggml.c

Lines changed: 0 additions & 5 deletions
@@ -11621,11 +11621,6 @@ static void ggml_compute_forward_mul_mat(
 
 #if defined(GGML_USE_CLBLAST)
     if (ggml_cl_can_mul_mat(src0, src1, dst)) {
-        // TODO: handle case when src0 is broadcast-able into src1 across 2nd,3rd dimension
-        // ref: https://github.com/ggerganov/ggml/pull/224
-        GGML_ASSERT(ne02 == ne12);
-        GGML_ASSERT(ne03 == ne13);
-
         if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
             ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
         }
