@@ -1476,10 +1476,15 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
 
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
 
     const int nb2  = dst->nb[2];
     const int nb3  = dst->nb[3];
 
+    const int64_t r2 = ne12 / ne02;
+    const int64_t r3 = ne13 / ne03;
+
     const float alpha = 1.0f;
     const float beta  = 0.0f;
     const int x_ne = ne01 * ne00;
@@ -1498,13 +1503,24 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
     cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
 
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
+    int64_t pi02 = -1;
+    int64_t pi03 = -1;
+
+    for (int64_t i13 = 0; i13 < ne13; i13++) {
+        int64_t i03 = i13 / r3;
+
+        for (int64_t i12 = 0; i12 < ne12; i12++) {
+            int64_t i02 = i12 / r2;
+
             // copy data to device
             if (src0->backend != GGML_BACKEND_GPU) {
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
+                if (i02 != pi02 || i03 != pi03) {
+                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
+                    pi02 = i02;
+                    pi03 = i03;
+                }
             }
-            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
+            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
 
             CL_CHECK(clFinish(queue));
 
@@ -1525,7 +1541,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
             }
 
             // copy dst to host
-            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
             CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
         }
     }
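
For context (my reading of the patch, not text from it): each mul_mat path now iterates over src1's batch dimensions (ne12, ne13) and recovers the matching src0 slice through the broadcast ratios r2 = ne12/ne02 and r3 = ne13/ne03, while pi02/pi03 remember the last src0 slice copied to the device so the host-to-device upload only runs when the mapped slice changes. A minimal standalone sketch of that index mapping and upload caching, with made-up batch sizes:

```cpp
// Sketch only: mirrors the loop structure introduced in the diff, assuming
// src1's batch dims are integer multiples of src0's. Sizes are hypothetical.
#include <cassert>
#include <cstdint>

int main() {
    const int64_t ne02 = 2, ne03 = 1;   // src0 batch dims (hypothetical)
    const int64_t ne12 = 8, ne13 = 1;   // src1 batch dims (hypothetical)
    const int64_t r2 = ne12 / ne02;     // = 4: each src0 slice serves 4 src1 slices
    const int64_t r3 = ne13 / ne03;     // = 1

    int64_t pi02 = -1, pi03 = -1;       // last src0 slice "uploaded"
    int uploads = 0;

    for (int64_t i13 = 0; i13 < ne13; i13++) {
        const int64_t i03 = i13 / r3;
        for (int64_t i12 = 0; i12 < ne12; i12++) {
            const int64_t i02 = i12 / r2;              // consecutive i12 map to the same i02
            if (i02 != pi02 || i03 != pi03) {
                uploads++;                             // stands in for ggml_cl_h2d_tensor_2d(..., src0, i03, i02, ...)
                pi02 = i02;
                pi03 = i03;
            }
        }
    }
    assert(uploads == ne02 * ne03);     // src0 moves to the device once per distinct slice,
    return 0;                           // not once per (i12, i13) pair
}
```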
@@ -1547,6 +1563,8 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
 
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
 
     const int nb10 = src1->nb[0];
     const int nb11 = src1->nb[1];
@@ -1556,6 +1574,9 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     const int nb2  = dst->nb[2];
     const int nb3  = dst->nb[3];
 
+    const int64_t r2 = ne12 / ne02;
+    const int64_t r3 = ne13 / ne03;
+
     const ggml_fp16_t alpha = ggml_fp32_to_fp16(1.0f);
     const ggml_fp16_t beta  = ggml_fp32_to_fp16(0.0f);
     const int x_ne = ne01 * ne00;
@@ -1577,32 +1598,43 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     bool src1_cont_rows = nb10 == sizeof(float);
     bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);
 
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
+    int64_t pi02 = -1;
+    int64_t pi03 = -1;
+
+    for (int64_t i13 = 0; i13 < ne13; i13++) {
+        int64_t i03 = i13 / r3;
+
+        for (int64_t i12 = 0; i12 < ne12; i12++) {
+            int64_t i02 = i12 / r2;
+
             // copy src0 to device
             if (src0->backend != GGML_BACKEND_GPU) {
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
+                if (i02 != pi02 || i03 != pi03) {
+                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
+                    pi02 = i02;
+                    pi03 = i03;
+                }
             }
 
             // convert src1 to fp16
             // TODO: use multiple threads
-            ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i03 * ne02 + i02);
-            char * src1i = (char *) src1->data + i03*nb13 + i02*nb12;
+            ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i13 * ne12 + i12);
+            char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
             if (src1_cont_rows) {
                 if (src1_cont_cols) {
                     ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
                 }
                 else {
-                    for (int64_t i01 = 0; i01 < ne11; i01++) {
-                        ggml_fp32_to_fp16_row((float *) (src1i + i01*nb11), tmp + i01*ne10, ne10);
+                    for (int64_t i11 = 0; i11 < ne11; i11++) {
+                        ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
                     }
                 }
             }
             else {
-                for (int64_t i01 = 0; i01 < ne11; i01++) {
-                    for (int64_t i00 = 0; i00 < ne10; i00++) {
+                for (int64_t i11 = 0; i11 < ne11; i11++) {
+                    for (int64_t i10 = 0; i10 < ne10; i10++) {
                         // very slow due to no inlining
-                        tmp[i01*ne10 + i00] = ggml_fp32_to_fp16(*(float *) (src1i + i01*nb11 + i00*nb10));
+                        tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10));
                     }
                 }
             }
@@ -1631,7 +1663,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
             // copy dst to host, then convert to float
             CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
 
-            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
 
             ggml_fp16_to_fp32_row(tmp, d, d_ne);
         }
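
Also worth noting (my reading of the diff): dst and the fp16 staging buffer are now addressed per src1 batch slice, so tmp sits at element offset (ne11 * ne10) * (i13 * ne12 + i12) within wdata and the result block at byte offset i12*nb2 + i13*nb3 within dst. A small self-contained check of that staging layout, with hypothetical sizes:

```cpp
// Sketch only (not from the patch): verifies that the wdata offsets used in the
// fp16 path tile the scratch contiguously and without overlap under broadcasting.
#include <cassert>
#include <cstdint>

int main() {
    const int64_t ne10 = 64, ne11 = 5;  // src1 row length / row count (hypothetical)
    const int64_t ne12 = 3,  ne13 = 2;  // src1 batch dims (hypothetical)

    int64_t expected = 0;
    for (int64_t i13 = 0; i13 < ne13; i13++) {
        for (int64_t i12 = 0; i12 < ne12; i12++) {
            // element offset used for tmp in the diff:
            const int64_t off = (ne11 * ne10) * (i13 * ne12 + i12);
            assert(off == expected);            // slices are laid out back to back
            expected += ne11 * ne10;
        }
    }
    assert(expected == ne10 * ne11 * ne12 * ne13);  // scratch covers every src1 slice exactly once
    return 0;
}
```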
@@ -1652,12 +1684,17 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
 
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
 
     const int nb2  = dst->nb[2];
     const int nb3  = dst->nb[3];
     const ggml_type type = src0->type;
     const bool mul_mat_vec = ne11 == 1;
 
+    const int64_t r2 = ne12 / ne02;
+    const int64_t r3 = ne13 / ne03;
+
     const float alpha = 1.0f;
     const float beta  = 0.0f;
     const int x_ne = ne01 * ne00;
@@ -1690,12 +1727,23 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     size_t ev_idx = 0;
     std::vector<cl_event> events;
 
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
+    int64_t pi02 = -1;
+    int64_t pi03 = -1;
+
+    for (int64_t i13 = 0; i13 < ne13; i13++) {
+        int64_t i03 = i13 / r3;
+
+        for (int64_t i12 = 0; i12 < ne12; i12++) {
+            int64_t i02 = i12 / r2;
+
             // copy src0 to device if necessary
             if (src0->backend == GGML_BACKEND_CPU) {
-                events.emplace_back();
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
+                if (i02 != pi02 || i03 != pi03) {
+                    events.emplace_back();
+                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
+                    pi02 = i02;
+                    pi03 = i03;
+                }
             } else if (src0->backend == GGML_BACKEND_GPU) {
                 d_Q = (cl_mem) src0->extra;
             } else {
@@ -1704,7 +1752,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
                 // copy src1 to device
                 events.emplace_back();
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, events.data() + ev_idx++));
+                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
 
                 // compute
                 const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
@@ -1725,7 +1773,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
                 CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
 
                 // copy src1 to device
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
+                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
 
                 events.emplace_back();
 
@@ -1749,7 +1797,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             }
 
             // copy dst to host
-            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
             CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
             for (auto *event : events) {
                 clReleaseEvent(event);