@@ -202,14 +202,14 @@ inline void get_scale_min_k4(int j, const __global uint8_t *q, uint8_t *d, uint8
 
 __kernel void dequantize_block_q2_K(__global const struct block_q2_K *x, __global float *yy)
 {
-    const int i = get_group_id(0);
+    const int i = get_group_id(0) + get_global_offset(0);
     const int tid = get_local_id(0);
     const int n = tid / 32;
     const int l = tid - 32 * n;
     const int is = 8 * n + l / 16;
 
     const uint8_t q = x[i].qs[32 * n + l];
-    __global float *y = yy + i * QK_K + 128 * n;
+    __global float *y = yy + get_group_id(0) * QK_K + 128 * n;
 
     const float dall = vload_half(0, &x[i].d);
     const float dmin = vload_half(0, &x[i].dmin);
@@ -223,7 +223,7 @@ __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __globa
 __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __global float *yy)
 {
     int r = get_local_id(0) / 4;
-    int i = get_group_id(0);
+    int i = get_group_id(0) + get_global_offset(0);
     int tid = r / 2;
     int is0 = r % 2;
     int l0 = 16 * is0 + 4 * (get_local_id(0) % 4);
@@ -241,7 +241,7 @@ __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __globa
     float d_all = vload_half(0, &x[i].d);
     float dl = d_all * (us - 32);
 
-    __global float *y = yy + i * QK_K + 128 * n + 32 * j;
+    __global float *y = yy + get_group_id(0) * QK_K + 128 * n + 32 * j;
     const __global uint8_t *q = x[i].qs + 32 * n;
     const __global uint8_t *hm = x[i].hmask;
 
@@ -251,14 +251,14 @@ __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __globa
 
 __kernel void dequantize_block_q4_K(__global const struct block_q4_K *x, __global float *yy)
 {
-    const int i = get_group_id(0);
+    const int i = get_group_id(0) + get_global_offset(0);
     const int tid = get_local_id(0);
     const int il = tid / 8;
     const int ir = tid % 8;
     const int is = 2 * il;
     const int n = 4;
 
-    __global float *y = yy + i * QK_K + 64 * il + n * ir;
+    __global float *y = yy + get_group_id(0) * QK_K + 64 * il + n * ir;
 
     const float dall = vload_half(0, &x[i].d);
     const float dmin = vload_half(0, &x[i].dmin);
@@ -281,13 +281,13 @@ __kernel void dequantize_block_q4_K(__global const struct block_q4_K *x, __globa
 
 __kernel void dequantize_block_q5_K(__global const struct block_q5_K *x, __global float *yy)
 {
-    const int i = get_group_id(0);
+    const int i = get_group_id(0) + get_global_offset(0);
    const int tid = get_local_id(0);
     const int il = tid / 16;
     const int ir = tid % 16;
     const int is = 2 * il;
 
-    __global float *y = yy + i * QK_K + 64 * il + 2 * ir;
+    __global float *y = yy + get_group_id(0) * QK_K + 64 * il + 2 * ir;
 
     const float dall = vload_half(0, &x[i].d);
     const float dmin = vload_half(0, &x[i].dmin);
@@ -313,13 +313,13 @@ __kernel void dequantize_block_q5_K(__global const struct block_q5_K *x, __globa
 
 __kernel void dequantize_block_q6_K(__global const struct block_q6_K *x, __global float *yy)
 {
-    const int i = get_group_id(0);
+    const int i = get_group_id(0) + get_global_offset(0);
     const int tid = get_local_id(0);
     const int ip = tid / 32;
     const int il = tid - 32 * ip;
     const int is = 8 * ip + il / 16;
 
-    __global float *y = yy + i * QK_K + 128 * ip + il;
+    __global float *y = yy + get_group_id(0) * QK_K + 128 * ip + il;
 
     const float d = vload_half(0, &x[i].d);
 
@@ -730,7 +730,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {
     const uint qk = QUANT_K;
     const uint qr = QUANT_R;
 
-    const int ib = i/qk; // block index
+    const int ib = i/qk + get_global_offset(0); // block index
     const int iqs = (i%qk)/qr; // quant index
     const int iybs = i - i%qk; // y block start index
     const int y_offset = qr == 1 ? 1 : qk/2;
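
Taken together, the kernel hunks above only change where the source block index comes from: it now includes get_global_offset(0), while the destination index keeps using the unoffset group id (note the y pointer lines switching from i * QK_K to get_group_id(0) * QK_K). This lets a kernel read a sub-range of a quantized buffer that already lives on the device and still write its output starting at the beginning of the destination buffer. The matching host-side launch appears further down in this diff (the to_fp32_cl call); below is a minimal illustrative sketch of that pattern, not part of this commit, where kernel, d_Q, d_X, global, local and offset are placeholder names and offset is the index of the first source block:

    /* sketch: dequantize a sub-range of a device-resident quantized buffer d_Q   */
    /* into the start of d_X; offset is picked up inside the kernel via           */
    /* get_global_offset(0), while the output index stays relative to zero        */
    CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_Q));
    CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_X));
    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1,
                                    offset > 0 ? &offset : NULL,   /* global work offset */
                                    &global, local > 0 ? &local : NULL,
                                    0, NULL, NULL));
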
@@ -1349,30 +1349,42 @@ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t o
     const enum ggml_type type = src->type;
     const size_t ts = ggml_type_size(type);
     const size_t bs = ggml_blck_size(type);
+    const uint64_t row_size = ts*ne0/bs;
 
-    const void * x = (const void *) ((const char *) src->data + i2*nb2 + i3*nb3);
-    if (nb0 == ts && nb1 == ts*ne0/bs) {
-        err = clEnqueueWriteBuffer(queue, dst, CL_FALSE, offset, ne1*nb1, x, 0, NULL, ev);
-        return err;
+    const char * x = (const char *) src->data + i2*nb2 + i3*nb3;
+    if (nb0 == ts && nb1 == row_size) {
+        return clEnqueueWriteBuffer(queue, dst, CL_FALSE, offset, ne1*row_size, x, 0, NULL, ev);
     }
     if (nb0 == ts) {
         const size_t buffer_origin[3] = { offset, 0, 0 };
         const size_t host_origin[3] = { 0, 0, 0 };
-        const size_t region[3] = { ts*ne0/bs, ne1, 1 };
-        err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, ts*ne0/bs, 0, nb1, 0, x, 0, NULL, ev);
-        return err;
+        const size_t region[3] = { row_size, ne1, 1 };
+        return clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, row_size, 0, nb1, 0, x, 0, NULL, ev);
     }
+    std::vector<cl_event> events;
+    if (ev && ne1>1) events.reserve(ne1-1);
     for (uint64_t i1 = 0; i1 < ne1; i1++) {
         // pretend the row is a matrix with cols=1
-        const size_t buffer_origin[3] = { offset, i1, 0 };
+        const size_t buffer_origin[3] = { offset + i1*row_size, 0, 0 };
         const size_t host_origin[3] = { 0, 0, 0 };
-        const size_t region[3] = { ts/bs, ne0, 1 };
-        err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, 0, 0, nb0, 0, ((const char *)x) + i1*nb0, 0, NULL, ev);
+        const size_t region[3] = { ts, ne0/bs, 1 };
+        // if an event is requested, make the last write wait for all previous writes to complete
+        if (ev && i1) {
+            events.push_back(*ev);
+        }
+        cl_uint nevents = i1 == ne1-1 ? events.size() : 0U;
+        err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, ts, 0, nb0, 0, x + i1*nb1, nevents, nevents ? events.data() : nullptr, ev);
         if (err != CL_SUCCESS) {
-            break;
+            for (auto event : events) {
+                clReleaseEvent(event);
+            }
+            return err;
         }
     }
-    return err;
+    for (auto event : events) {
+        CL_CHECK(clReleaseEvent(event));
+    }
+    return CL_SUCCESS;
 }
 
 static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
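
Two details of the rewritten fully strided fallback are easy to miss. First, each source row is now written as ne0/bs blocks of ts bytes into a dense destination, so the byte addressing works out as follows (names as in the function above; shown only as a sketch of the arithmetic):

    row_size        = ts * ne0 / bs                       // bytes per dense destination row
    dst_byte(i1, b) = offset + i1*row_size + b*ts         // block b of row i1 in dst
    src_byte(i1, b) = i2*nb2 + i3*nb3 + i1*nb1 + b*nb0    // same block in src->data

Second, only the last clEnqueueWriteBufferRect call waits on the events returned by the earlier writes, so the single ev handed back to the caller is not signalled until every row of the copy has completed.
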
@@ -1503,6 +1515,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
     cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
 
+    size_t x_offset = 0;
     int64_t pi02 = -1;
     int64_t pi03 = -1;
 
@@ -1513,7 +1526,9 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
             int64_t i02 = i12 / r2;
 
             // copy data to device
-            if (src0->backend != GGML_BACKEND_GPU && (i02 != pi02 || i03 != pi03)) {
+            if (src0->backend == GGML_BACKEND_GPU) {
+                x_offset = (i03 * ne02 + i02) * x_ne;
+            } else if (i02 != pi02 || i03 != pi03) {
                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
                 pi02 = i02;
                 pi03 = i03;
@@ -1528,7 +1543,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
                     clblast::Transpose::kYes, clblast::Transpose::kNo,
                     ne01, ne11, ne10,
                     alpha,
-                    d_X, 0, ne00,
+                    d_X, x_offset, ne00,
                     d_Y, 0, ne10,
                     beta,
                     d_D, 0, ne01,
@@ -1596,6 +1611,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     bool src1_cont_rows = nb10 == sizeof(float);
     bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);
 
+    size_t x_offset = 0;
     int64_t pi02 = -1;
     int64_t pi03 = -1;
 
@@ -1606,7 +1622,9 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
             int64_t i02 = i12 / r2;
 
             // copy src0 to device
-            if (src0->backend != GGML_BACKEND_GPU && (i02 != pi02 || i03 != pi03)) {
+            if (src0->backend == GGML_BACKEND_GPU) {
+                x_offset = (i03 * ne02 + i02) * x_ne;
+            } else if (i02 != pi02 || i03 != pi03) {
                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
                 pi02 = i02;
                 pi03 = i03;
@@ -1646,7 +1664,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
                     clblast::Transpose::kYes, clblast::Transpose::kNo,
                     ne01, ne11, ne10,
                     alpha,
-                    d_X, 0, ne00,
+                    d_X, x_offset, ne00,
                     d_Y, 0, ne10,
                     beta,
                     d_D, 0, ne01,
@@ -1696,7 +1714,8 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     const int x_ne = ne01 * ne00;
     const int y_ne = ne11 * ne10;
     const int d_ne = ne11 * ne01;
-    const size_t q_sz = ggml_type_size(type) * x_ne / ggml_blck_size(type);
+    const int x_bps = x_ne / ggml_blck_size(type); // blocks per 2D slice
+    const size_t q_sz = ggml_type_size(type) * x_bps;
 
     size_t x_size;
     size_t y_size;
@@ -1764,9 +1783,10 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             } else { // general dequantization kernel + CLBlast matrix matrix multiplication
                 // convert src0 to fp32 on device
                 const size_t global = x_ne / global_denom;
+                const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
                 CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
                 CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
-                CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
+                CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, offset > 0 ? &offset : NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
 
                 // copy src1 to device
                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
@@ -1888,17 +1908,19 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
     const int64_t ne3 = tensor->ne[3];
 
     const ggml_type type = tensor->type;
-    const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);
+    const size_t s_sz = ggml_type_size(type) * (size_t) (ne0 * ne1 / ggml_blck_size(type));
+    const size_t q_sz = s_sz * (size_t) (ne2 * ne3);
 
     size_t q_size;
     cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size);
 
     tensor->data = data;
     // copy tensor to device
+    size_t offset = 0;
     for (int64_t i3 = 0; i3 < ne3; i3++) {
         for (int64_t i2 = 0; i2 < ne2; i2++) {
-            int i = i3*ne2 + i2;
-            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, dst, i*ne0*ne1, tensor, i3, i2, NULL));
+            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, dst, offset, tensor, i3, i2, NULL));
+            offset += s_sz;
         }
     }
 
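
With the per-slice upload above, the buffer filled by ggml_cl_transform_tensor is laid out as ne2*ne3 dense 2D slices of s_sz bytes each, so slice (i3, i2) starts at byte (i3*ne2 + i2) * s_sz. This is the same slice indexing the matrix-multiplication paths in this diff rely on when src0 is GPU-resident: (i03 * ne02 + i02) * x_ne elements for the f32/f16 paths and (i03 * ne02 + i02) * x_bps blocks for the quantized path point at the start of that same slice, just expressed in different units. A sketch of the equivalence for a quantized type, using the names from the hunks above (the accumulated loop offset equals this closed form):

    s_sz                 = ggml_type_size(type) * ne0 * ne1 / ggml_blck_size(type)   // bytes per 2D slice
    offset_bytes(i3, i2) = (i3*ne2 + i2) * s_sz
                         = (i3*ne2 + i2) * x_bps * ggml_type_size(type)              // since s_sz = x_bps * ggml_type_size(type)
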