
Commit e2583cb

CLBlast: Fix handling of on-device tensor data
Fix uploading tensor data to device, including 3D, 4D, and non-contiguous tensors. Use correct offsets into data that is already in VRAM. Correct handling of OpenCL events when multiple commands are queued.
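
The subtlest part of this change is the non-contiguous upload path, where each row is sent with its own non-blocking write but the caller still receives a single cl_event that signals completion of the whole transfer. The sketch below is a minimal, self-contained illustration of that event-chaining pattern, not the ggml-opencl.cpp code itself; the function name upload_rows and its parameters (n_rows, row_bytes, src_stride) are invented for the example.

// Illustrative sketch of chaining per-row writes behind one output event.
#include <CL/cl.h>
#include <vector>

static cl_int upload_rows(cl_command_queue queue, cl_mem dst, size_t dst_offset,
                          const char * src, size_t n_rows, size_t row_bytes,
                          size_t src_stride, cl_event * ev) {
    std::vector<cl_event> events;                 // events of all writes except the last
    if (ev && n_rows > 1) events.reserve(n_rows - 1);

    for (size_t i = 0; i < n_rows; i++) {
        if (ev && i > 0) {
            events.push_back(*ev);                // keep the previous write's event
        }
        // only the final write waits on the earlier ones, so *ev ends up
        // signalling completion of the entire upload
        cl_uint nevents = (i == n_rows - 1) ? (cl_uint) events.size() : 0u;
        cl_int err = clEnqueueWriteBuffer(queue, dst, CL_FALSE,
                                          dst_offset + i * row_bytes, row_bytes,
                                          src + i * src_stride,
                                          nevents, nevents ? events.data() : NULL, ev);
        if (err != CL_SUCCESS) {
            for (cl_event e : events) clReleaseEvent(e);
            return err;
        }
    }
    for (cl_event e : events) clReleaseEvent(e);  // the caller only keeps *ev
    return CL_SUCCESS;
}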
1 parent e8b8d32 commit e2583cb

File tree: 1 file changed, +54 -32 lines

ggml-opencl.cpp

Lines changed: 54 additions & 32 deletions
@@ -202,14 +202,14 @@ inline void get_scale_min_k4(int j, const __global uint8_t *q, uint8_t *d, uint8

 __kernel void dequantize_block_q2_K(__global const struct block_q2_K *x, __global float *yy)
 {
-    const int i = get_group_id(0);
+    const int i = get_group_id(0) + get_global_offset(0);
     const int tid = get_local_id(0);
     const int n = tid / 32;
     const int l = tid - 32 * n;
     const int is = 8 * n + l / 16;

     const uint8_t q = x[i].qs[32 * n + l];
-    __global float *y = yy + i * QK_K + 128 * n;
+    __global float *y = yy + get_group_id(0) * QK_K + 128 * n;

     const float dall = vload_half(0, &x[i].d);
     const float dmin = vload_half(0, &x[i].dmin);
@@ -223,7 +223,7 @@ __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __globa
 __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __global float *yy)
 {
     int r = get_local_id(0) / 4;
-    int i = get_group_id(0);
+    int i = get_group_id(0) + get_global_offset(0);
     int tid = r / 2;
     int is0 = r % 2;
     int l0 = 16 * is0 + 4 * (get_local_id(0) % 4);
@@ -241,7 +241,7 @@ __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __globa
     float d_all = vload_half(0, &x[i].d);
     float dl = d_all * (us - 32);

-    __global float *y = yy + i * QK_K + 128 * n + 32 * j;
+    __global float *y = yy + get_group_id(0) * QK_K + 128 * n + 32 * j;
     const __global uint8_t *q = x[i].qs + 32 * n;
     const __global uint8_t *hm = x[i].hmask;

@@ -251,14 +251,14 @@ __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __globa

 __kernel void dequantize_block_q4_K(__global const struct block_q4_K *x, __global float *yy)
 {
-    const int i = get_group_id(0);
+    const int i = get_group_id(0) + get_global_offset(0);
     const int tid = get_local_id(0);
     const int il = tid / 8;
     const int ir = tid % 8;
     const int is = 2 * il;
     const int n = 4;

-    __global float *y = yy + i * QK_K + 64 * il + n * ir;
+    __global float *y = yy + get_group_id(0) * QK_K + 64 * il + n * ir;

     const float dall = vload_half(0, &x[i].d);
     const float dmin = vload_half(0, &x[i].dmin);
@@ -281,13 +281,13 @@ __kernel void dequantize_block_q4_K(__global const struct block_q4_K *x, __globa

 __kernel void dequantize_block_q5_K(__global const struct block_q5_K *x, __global float *yy)
 {
-    const int i = get_group_id(0);
+    const int i = get_group_id(0) + get_global_offset(0);
     const int tid = get_local_id(0);
     const int il = tid / 16;
     const int ir = tid % 16;
     const int is = 2 * il;

-    __global float *y = yy + i * QK_K + 64 * il + 2 * ir;
+    __global float *y = yy + get_group_id(0) * QK_K + 64 * il + 2 * ir;

     const float dall = vload_half(0, &x[i].d);
     const float dmin = vload_half(0, &x[i].dmin);
@@ -313,13 +313,13 @@ __kernel void dequantize_block_q5_K(__global const struct block_q5_K *x, __globa

 __kernel void dequantize_block_q6_K(__global const struct block_q6_K *x, __global float *yy)
 {
-    const int i = get_group_id(0);
+    const int i = get_group_id(0) + get_global_offset(0);
     const int tid = get_local_id(0);
     const int ip = tid / 32;
     const int il = tid - 32 * ip;
     const int is = 8 * ip + il / 16;

-    __global float *y = yy + i * QK_K + 128 * ip + il;
+    __global float *y = yy + get_group_id(0) * QK_K + 128 * ip + il;

     const float d = vload_half(0, &x[i].d);

@@ -730,7 +730,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {
     const uint qk = QUANT_K;
     const uint qr = QUANT_R;

-    const int ib = i/qk; // block index
+    const int ib = i/qk + get_global_offset(0); // block index
     const int iqs = (i%qk)/qr; // quant index
     const int iybs = i - i%qk; // y block start index
     const int y_offset = qr == 1 ? 1 : qk/2;
@@ -1349,30 +1349,42 @@ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t o
     const enum ggml_type type = src->type;
     const size_t ts = ggml_type_size(type);
     const size_t bs = ggml_blck_size(type);
+    const uint64_t row_size = ts*ne0/bs;

-    const void * x = (const void *) ((const char *) src->data + i2*nb2 + i3*nb3);
-    if (nb0 == ts && nb1 == ts*ne0/bs) {
-        err = clEnqueueWriteBuffer(queue, dst, CL_FALSE, offset, ne1*nb1, x, 0, NULL, ev);
-        return err;
+    const char * x = (const char *) src->data + i2*nb2 + i3*nb3;
+    if (nb0 == ts && nb1 == row_size) {
+        return clEnqueueWriteBuffer(queue, dst, CL_FALSE, offset, ne1*row_size, x, 0, NULL, ev);
     }
     if (nb0 == ts) {
         const size_t buffer_origin[3] = { offset, 0, 0 };
         const size_t host_origin[3] = { 0, 0, 0 };
-        const size_t region[3] = { ts*ne0/bs, ne1, 1 };
-        err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, ts*ne0/bs, 0, nb1, 0, x, 0, NULL, ev);
-        return err;
+        const size_t region[3] = { row_size, ne1, 1 };
+        return clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, row_size, 0, nb1, 0, x, 0, NULL, ev);
     }
+    std::vector<cl_event> events;
+    if (ev && ne1>1) events.reserve(ne1-1);
     for (uint64_t i1 = 0; i1 < ne1; i1++) {
         // pretend the row is a matrix with cols=1
-        const size_t buffer_origin[3] = { offset, i1, 0 };
+        const size_t buffer_origin[3] = { offset + i1*row_size, 0, 0 };
         const size_t host_origin[3] = { 0, 0, 0 };
-        const size_t region[3] = { ts/bs, ne0, 1 };
-        err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, 0, 0, nb0, 0, ((const char *)x) + i1*nb0, 0, NULL, ev);
+        const size_t region[3] = { ts, ne0/bs, 1 };
+        // if an event is requested, make the last write wait for all previous writes to complete
+        if (ev && i1) {
+            events.push_back(*ev);
+        }
+        cl_uint nevents = i1 == ne1-1 ? events.size() : 0U;
+        err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, ts, 0, nb0, 0, x + i1*nb1, nevents, nevents ? events.data() : nullptr, ev);
         if (err != CL_SUCCESS) {
-            break;
+            for (auto event : events) {
+                clReleaseEvent(event);
+            }
+            return err;
         }
     }
-    return err;
+    for (auto event : events) {
+        CL_CHECK(clReleaseEvent(event));
+    }
+    return CL_SUCCESS;
 }

 static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -1503,6 +1515,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
     cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);

+    size_t x_offset = 0;
     int64_t pi02 = -1;
     int64_t pi03 = -1;

@@ -1513,7 +1526,9 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
             int64_t i02 = i12 / r2;

             // copy data to device
-            if (src0->backend != GGML_BACKEND_GPU && (i02 != pi02 || i03 != pi03)) {
+            if (src0->backend == GGML_BACKEND_GPU) {
+                x_offset = (i03 * ne02 + i02) * x_ne;
+            } else if (i02 != pi02 || i03 != pi03) {
                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
                 pi02 = i02;
                 pi03 = i03;
@@ -1528,7 +1543,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
                 clblast::Transpose::kYes, clblast::Transpose::kNo,
                 ne01, ne11, ne10,
                 alpha,
-                d_X, 0, ne00,
+                d_X, x_offset, ne00,
                 d_Y, 0, ne10,
                 beta,
                 d_D, 0, ne01,
@@ -1596,6 +1611,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     bool src1_cont_rows = nb10 == sizeof(float);
     bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);

+    size_t x_offset = 0;
     int64_t pi02 = -1;
     int64_t pi03 = -1;

@@ -1606,7 +1622,9 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
             int64_t i02 = i12 / r2;

             // copy src0 to device
-            if (src0->backend != GGML_BACKEND_GPU && (i02 != pi02 || i03 != pi03)) {
+            if (src0->backend == GGML_BACKEND_GPU) {
+                x_offset = (i03 * ne02 + i02) * x_ne;
+            } else if (i02 != pi02 || i03 != pi03) {
                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
                 pi02 = i02;
                 pi03 = i03;
@@ -1646,7 +1664,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
                 clblast::Transpose::kYes, clblast::Transpose::kNo,
                 ne01, ne11, ne10,
                 alpha,
-                d_X, 0, ne00,
+                d_X, x_offset, ne00,
                 d_Y, 0, ne10,
                 beta,
                 d_D, 0, ne01,
@@ -1696,7 +1714,8 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     const int x_ne = ne01 * ne00;
     const int y_ne = ne11 * ne10;
     const int d_ne = ne11 * ne01;
-    const size_t q_sz = ggml_type_size(type) * x_ne / ggml_blck_size(type);
+    const int x_bps = x_ne / ggml_blck_size(type); // blocks per 2D slice
+    const size_t q_sz = ggml_type_size(type) * x_bps;

     size_t x_size;
     size_t y_size;
@@ -1764,9 +1783,10 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             } else { // general dequantization kernel + CLBlast matrix matrix multiplication
                 // convert src0 to fp32 on device
                 const size_t global = x_ne / global_denom;
+                const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
                 CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
                 CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
-                CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
+                CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, offset > 0 ? &offset : NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));

                 // copy src1 to device
                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
@@ -1888,17 +1908,19 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
     const int64_t ne3 = tensor->ne[3];

     const ggml_type type = tensor->type;
-    const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);
+    const size_t s_sz = ggml_type_size(type) * (size_t) (ne0 * ne1 / ggml_blck_size(type));
+    const size_t q_sz = s_sz * (size_t) (ne2 * ne3);

     size_t q_size;
     cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size);

     tensor->data = data;
     // copy tensor to device
+    size_t offset = 0;
     for (int64_t i3 = 0; i3 < ne3; i3++) {
         for (int64_t i2 = 0; i2 < ne2; i2++) {
-            int i = i3*ne2 + i2;
-            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, dst, i*ne0*ne1, tensor, i3, i2, NULL));
+            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, dst, offset, tensor, i3, i2, NULL));
+            offset += s_sz;
         }
     }
