Skip to content

Commit f58ebcb

Browse files
committed
CLBlast: Fix uploading tensor data to device
ggml_cl_transform_tensor:
* Fix offset of 3D and 4D tensors.

ggml_cl_h2d_tensor_2d:
* Fix parameters for non-contiguous data.
* Correct handling of OpenCL events when multiple commands are queued.
1 parent e78f0b0 commit f58ebcb

File tree

1 file changed

+29
-15
lines changed

1 file changed

+29
-15
lines changed

ggml-opencl.cpp

Lines changed: 29 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1349,30 +1349,42 @@ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t o
13491349
const enum ggml_type type = src->type;
13501350
const size_t ts = ggml_type_size(type);
13511351
const size_t bs = ggml_blck_size(type);
1352+
const uint64_t row_size = ts*ne0/bs;
13521353

1353-
const void * x = (const void *) ((const char *) src->data + i2*nb2 + i3*nb3);
1354-
if (nb0 == ts && nb1 == ts*ne0/bs) {
1355-
err = clEnqueueWriteBuffer(queue, dst, CL_FALSE, offset, ne1*nb1, x, 0, NULL, ev);
1356-
return err;
1354+
const char * x = (const char *) src->data + i2*nb2 + i3*nb3;
1355+
if (nb0 == ts && nb1 == row_size) {
1356+
return clEnqueueWriteBuffer(queue, dst, CL_FALSE, offset, ne1*row_size, x, 0, NULL, ev);
13571357
}
13581358
if (nb0 == ts) {
13591359
const size_t buffer_origin[3] = { offset, 0, 0 };
13601360
const size_t host_origin[3] = { 0, 0, 0 };
1361-
const size_t region[3] = { ts*ne0/bs, ne1, 1 };
1362-
err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, ts*ne0/bs, 0, nb1, 0, x, 0, NULL, ev);
1363-
return err;
1361+
const size_t region[3] = { row_size, ne1, 1 };
1362+
return clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, row_size, 0, nb1, 0, x, 0, NULL, ev);
13641363
}
1364+
std::vector<cl_event> events;
1365+
if (ev && ne1>1) events.reserve(ne1-1);
13651366
for (uint64_t i1 = 0; i1 < ne1; i1++) {
13661367
// pretend the row is a matrix with cols=1
1367-
const size_t buffer_origin[3] = { offset, i1, 0 };
1368+
const size_t buffer_origin[3] = { offset + i1*row_size, 0, 0 };
13681369
const size_t host_origin[3] = { 0, 0, 0 };
1369-
const size_t region[3] = { ts/bs, ne0, 1 };
1370-
err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, 0, 0, nb0, 0, ((const char *)x) + i1*nb0, 0, NULL, ev);
1370+
const size_t region[3] = { ts, ne0/bs, 1 };
1371+
// if an event is requested, make the last write wait for all previous writes to complete
1372+
if (ev && i1) {
1373+
events.push_back(*ev);
1374+
}
1375+
cl_uint nevents = i1 == ne1-1 ? events.size() : 0U;
1376+
err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, ts, 0, nb0, 0, x + i1*nb1, nevents, nevents ? events.data() : nullptr, ev);
13711377
if (err != CL_SUCCESS) {
1372-
break;
1378+
for (auto event : events) {
1379+
clReleaseEvent(event);
1380+
}
1381+
return err;
13731382
}
13741383
}
1375-
return err;
1384+
for (auto event : events) {
1385+
CL_CHECK(clReleaseEvent(event));
1386+
}
1387+
return CL_SUCCESS;
13761388
}
13771389

13781390
static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -1888,17 +1900,19 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
18881900
const int64_t ne3 = tensor->ne[3];
18891901

18901902
const ggml_type type = tensor->type;
1891-
const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);
1903+
const size_t s_sz = ggml_type_size(type) * (size_t) (ne0 * ne1 / ggml_blck_size(type));
1904+
const size_t q_sz = s_sz * (size_t) (ne2 * ne3);
18921905

18931906
size_t q_size;
18941907
cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size);
18951908

18961909
tensor->data = data;
18971910
// copy tensor to device
1911+
size_t offset = 0;
18981912
for (int64_t i3 = 0; i3 < ne3; i3++) {
18991913
for (int64_t i2 = 0; i2 < ne2; i2++) {
1900-
int i = i3*ne2 + i2;
1901-
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, dst, i*ne0*ne1, tensor, i3, i2, NULL));
1914+
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, dst, offset, tensor, i3, i2, NULL));
1915+
offset += s_sz;
19021916
}
19031917
}
19041918

0 commit comments

Comments
 (0)