@@ -1349,30 +1349,42 @@ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t o
     const enum ggml_type type = src->type;
     const size_t ts = ggml_type_size(type);
     const size_t bs = ggml_blck_size(type);
+    const uint64_t row_size = ts*ne0/bs;

-    const void * x = (const void *) ((const char *) src->data + i2*nb2 + i3*nb3);
-    if (nb0 == ts && nb1 == ts*ne0/bs) {
-        err = clEnqueueWriteBuffer(queue, dst, CL_FALSE, offset, ne1*nb1, x, 0, NULL, ev);
-        return err;
+    const char * x = (const char *) src->data + i2*nb2 + i3*nb3;
+    if (nb0 == ts && nb1 == row_size) {
+        return clEnqueueWriteBuffer(queue, dst, CL_FALSE, offset, ne1*row_size, x, 0, NULL, ev);
     }
     if (nb0 == ts) {
         const size_t buffer_origin[3] = { offset, 0, 0 };
         const size_t host_origin[3] = { 0, 0, 0 };
-        const size_t region[3] = { ts*ne0/bs, ne1, 1 };
-        err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, ts*ne0/bs, 0, nb1, 0, x, 0, NULL, ev);
-        return err;
+        const size_t region[3] = { row_size, ne1, 1 };
+        return clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, row_size, 0, nb1, 0, x, 0, NULL, ev);
     }
+    std::vector<cl_event> events;
+    if (ev && ne1>1) events.reserve(ne1-1);
     for (uint64_t i1 = 0; i1 < ne1; i1++) {
         // pretend the row is a matrix with cols=1
-        const size_t buffer_origin[3] = { offset, i1, 0 };
+        const size_t buffer_origin[3] = { offset + i1*row_size, 0, 0 };
         const size_t host_origin[3] = { 0, 0, 0 };
-        const size_t region[3] = { ts/bs, ne0, 1 };
-        err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, 0, 0, nb0, 0, ((const char *)x) + i1*nb0, 0, NULL, ev);
+        const size_t region[3] = { ts, ne0/bs, 1 };
+        // if an event is requested, make the last write wait for all previous writes to complete
+        if (ev && i1) {
+            events.push_back(*ev);
+        }
+        cl_uint nevents = i1 == ne1-1 ? events.size() : 0U;
+        err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, ts, 0, nb0, 0, x + i1*nb1, nevents, nevents ? events.data() : nullptr, ev);
         if (err != CL_SUCCESS) {
-            break;
+            for (auto event : events) {
+                clReleaseEvent(event);
+            }
+            return err;
         }
     }
-    return err;
+    for (auto event : events) {
+        CL_CHECK(clReleaseEvent(event));
+    }
+    return CL_SUCCESS;
 }

 static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
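Note on the first hunk: the old fallback region `{ ts/bs, ne0, 1 }` describes a row per element, and the integer division `ts/bs` truncates for block-quantized types (to zero whenever a block's byte size is smaller than its element count), so the rect write covered the wrong number of bytes. The patch works in whole blocks instead, introducing `row_size = ts*ne0/bs` for the contiguous path, the per-row destination offset, and the new region `{ ts, ne0/bs, 1 }`; it also chains the per-row write events so that the caller's `ev` only completes once the last row's write, which waits on all earlier ones, completes. The following is a minimal standalone sketch of the byte math, not part of the commit, assuming ggml's Q4_0 geometry of 32 elements per 18-byte block:

```cpp
#include <cstdio>
#include <cstddef>

int main() {
    // assumed from ggml's Q4_0 layout: 32 elements packed into an 18-byte block
    const size_t ts  = 18;    // ggml_type_size(GGML_TYPE_Q4_0), bytes per block
    const size_t bs  = 32;    // ggml_blck_size(GGML_TYPE_Q4_0), elements per block
    const size_t ne0 = 4096;  // row length in elements (a multiple of bs)

    // row_size as introduced by the patch: bytes occupied by one row
    const size_t row_size = ts*ne0/bs;           // 18*4096/32 = 2304 bytes

    // old per-row region { ts/bs, ne0, 1 }: ts/bs truncates to 0 when ts < bs,
    // so the rect write would describe a zero-byte-wide row
    const size_t old_row_bytes = (ts/bs) * ne0;  // 0 * 4096 = 0

    // new per-row region { ts, ne0/bs, 1 }: ne0/bs blocks of ts bytes = whole row
    const size_t new_row_bytes = ts * (ne0/bs);  // 18 * 128 = 2304

    printf("row_size      = %zu bytes\n", row_size);
    printf("old row bytes = %zu\n", old_row_bytes);
    printf("new row bytes = %zu\n", new_row_bytes);
    return 0;
}
```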
@@ -1888,17 +1900,19 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
     const int64_t ne3 = tensor->ne[3];

     const ggml_type type = tensor->type;
-    const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);
+    const size_t s_sz = ggml_type_size(type) * (size_t) (ne0 * ne1 / ggml_blck_size(type));
+    const size_t q_sz = s_sz * (size_t) (ne2 * ne3);

     size_t q_size;
     cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size);

     tensor->data = data;
     // copy tensor to device
+    size_t offset = 0;
     for (int64_t i3 = 0; i3 < ne3; i3++) {
         for (int64_t i2 = 0; i2 < ne2; i2++) {
-            int i = i3*ne2 + i2;
-            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, dst, i*ne0*ne1, tensor, i3, i2, NULL));
+            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, dst, offset, tensor, i3, i2, NULL));
+            offset += s_sz;
         }
     }

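Note on the second hunk: `ggml_cl_h2d_tensor_2d` receives a byte offset into the destination buffer, while the removed expression `i*ne0*ne1` counts elements, so the two only coincide for one-byte element types. The patch sizes one ne0 x ne1 slice in bytes (`s_sz`), derives the total allocation `q_sz` from it, and advances the destination offset by `s_sz` per (i2, i3) slice. The following standalone sketch of the offset arithmetic is not part of the commit and again assumes Q4_0's 18-byte/32-element blocks:

```cpp
#include <cstdio>
#include <cstddef>
#include <cstdint>

int main() {
    // assumed from ggml's Q4_0 layout: 32 elements packed into an 18-byte block
    const size_t  type_size = 18;
    const size_t  blck_size = 32;
    const int64_t ne0 = 4096, ne1 = 32, ne2 = 4, ne3 = 1;  // example shape

    // bytes per ne0 x ne1 slice and total buffer size, as computed by the patch
    const size_t s_sz = type_size * (size_t) (ne0 * ne1 / blck_size);
    const size_t q_sz = s_sz * (size_t) (ne2 * ne3);

    size_t offset = 0;  // byte offset handed to ggml_cl_h2d_tensor_2d per slice
    for (int64_t i3 = 0; i3 < ne3; i3++) {
        for (int64_t i2 = 0; i2 < ne2; i2++) {
            const int64_t i = i3*ne2 + i2;
            const size_t old_offset = (size_t) (i*ne0*ne1);  // element count, not bytes
            printf("slice %lld: element-count offset %zu vs byte offset %zu (buffer is %zu bytes)\n",
                   (long long) i, old_offset, offset, q_sz);
            offset += s_sz;  // advance by one slice, in bytes
        }
    }
    return 0;
}
```

With this example shape the buffer is 294912 bytes; the element-count offset of the last slice (393216) already lies past the end of that allocation, while the accumulated byte offsets stay within it.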