@@ -1476,10 +1476,15 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr

     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];

     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];

+    const int64_t r2 = ne12 / ne02;
+    const int64_t r3 = ne13 / ne03;
+
     const float alpha = 1.0f;
     const float beta = 0.0f;
     const int x_ne = ne01 * ne00;
@@ -1498,13 +1503,22 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
     cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);

-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
+    int64_t pi02 = -1;
+    int64_t pi03 = -1;
+
+    for (int64_t i13 = 0; i13 < ne13; i13++) {
+        int64_t i03 = i13 / r3;
+
+        for (int64_t i12 = 0; i12 < ne12; i12++) {
+            int64_t i02 = i12 / r2;
+
             // copy data to device
-            if (src0->backend != GGML_BACKEND_GPU) {
+            if (src0->backend != GGML_BACKEND_GPU && (i02 != pi02 || i03 != pi03)) {
                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
+                pi02 = i02;
+                pi03 = i03;
             }
-            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
+            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));

             CL_CHECK(clFinish(queue));

@@ -1525,7 +1539,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
             }

             // copy dst to host
-            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
             CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
         }
     }
@@ -1547,6 +1561,8 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr

     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];

     const int nb10 = src1->nb[0];
     const int nb11 = src1->nb[1];
@@ -1556,6 +1572,9 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];

+    const int64_t r2 = ne12 / ne02;
+    const int64_t r3 = ne13 / ne03;
+
     const ggml_fp16_t alpha = ggml_fp32_to_fp16(1.0f);
     const ggml_fp16_t beta = ggml_fp32_to_fp16(0.0f);
     const int x_ne = ne01 * ne00;
@@ -1577,32 +1596,41 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     bool src1_cont_rows = nb10 == sizeof(float);
     bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);

-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
+    int64_t pi02 = -1;
+    int64_t pi03 = -1;
+
+    for (int64_t i13 = 0; i13 < ne13; i13++) {
+        int64_t i03 = i13 / r3;
+
+        for (int64_t i12 = 0; i12 < ne12; i12++) {
+            int64_t i02 = i12 / r2;
+
             // copy src0 to device
-            if (src0->backend != GGML_BACKEND_GPU) {
+            if (src0->backend != GGML_BACKEND_GPU && (i02 != pi02 || i03 != pi03)) {
                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
+                pi02 = i02;
+                pi03 = i03;
             }

             // convert src1 to fp16
             // TODO: use multiple threads
-            ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i03 * ne02 + i02);
-            char * src1i = (char *) src1->data + i03*nb13 + i02*nb12;
+            ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i13 * ne12 + i12);
+            char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
             if (src1_cont_rows) {
                 if (src1_cont_cols) {
                     ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
                 }
                 else {
-                    for (int64_t i01 = 0; i01 < ne11; i01++) {
-                        ggml_fp32_to_fp16_row((float *) (src1i + i01*nb11), tmp + i01*ne10, ne10);
+                    for (int64_t i11 = 0; i11 < ne11; i11++) {
+                        ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
                     }
                 }
             }
             else {
-                for (int64_t i01 = 0; i01 < ne11; i01++) {
-                    for (int64_t i00 = 0; i00 < ne10; i00++) {
+                for (int64_t i11 = 0; i11 < ne11; i11++) {
+                    for (int64_t i10 = 0; i10 < ne10; i10++) {
                         // very slow due to no inlining
-                        tmp[i01*ne10 + i00] = ggml_fp32_to_fp16(*(float *) (src1i + i01*nb11 + i00*nb10));
+                        tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10));
                     }
                 }
             }
@@ -1631,7 +1659,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
             // copy dst to host, then convert to float
             CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));

-            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);

             ggml_fp16_to_fp32_row(tmp, d, d_ne);
         }
@@ -1652,12 +1680,17 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *

     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];

     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];
     const ggml_type type = src0->type;
     const bool mul_mat_vec = ne11 == 1;

+    const int64_t r2 = ne12 / ne02;
+    const int64_t r3 = ne13 / ne03;
+
     const float alpha = 1.0f;
     const float beta = 0.0f;
     const int x_ne = ne01 * ne00;
@@ -1690,12 +1723,23 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     size_t ev_idx = 0;
     std::vector<cl_event> events;

-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
+    int64_t pi02 = -1;
+    int64_t pi03 = -1;
+
+    for (int64_t i13 = 0; i13 < ne13; i13++) {
+        int64_t i03 = i13 / r3;
+
+        for (int64_t i12 = 0; i12 < ne12; i12++) {
+            int64_t i02 = i12 / r2;
+
             // copy src0 to device if necessary
             if (src0->backend == GGML_BACKEND_CPU) {
-                events.emplace_back();
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
+                if (i02 != pi02 || i03 != pi03) {
+                    events.emplace_back();
+                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
+                    pi02 = i02;
+                    pi03 = i03;
+                }
             } else if (src0->backend == GGML_BACKEND_GPU) {
                 d_Q = (cl_mem) src0->extra;
             } else {
@@ -1704,7 +1748,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
                 // copy src1 to device
                 events.emplace_back();
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, events.data() + ev_idx++));
+                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));

                 // compute
                 const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
@@ -1725,7 +1769,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
                 CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));

                 // copy src1 to device
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
+                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));

                 events.emplace_back();

@@ -1749,7 +1793,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             }

             // copy dst to host
-            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
            for (auto *event : events) {
                clReleaseEvent(event);
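
All three mul_mat paths (f32, f16, quantized) use the same broadcast mapping: src1's batch dimensions ne12/ne13 are walked directly, each (i13, i12) slice is matched to the src0 slice (i03, i02) = (i13 / r3, i12 / r2), and the previously uploaded src0 slice is remembered in pi02/pi03 so it is not re-copied to the device while it is still the current one. Below is a minimal standalone sketch of that index mapping, not part of the patch; the batch sizes are hypothetical, and it assumes (as the r2/r3 divisions imply) that ne12 and ne13 are whole multiples of ne02 and ne03.

```cpp
// Sketch of the broadcast index mapping and src0 upload caching used above.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t ne02 = 2, ne03 = 1;   // src0 batch dims (hypothetical sizes)
    const int64_t ne12 = 6, ne13 = 3;   // src1 batch dims, broadcast over src0

    const int64_t r2 = ne12 / ne02;     // src1 slices per src0 slice in dim 2
    const int64_t r3 = ne13 / ne03;     // src1 slices per src0 slice in dim 3

    int64_t pi02 = -1, pi03 = -1;       // last src0 slice "uploaded"

    for (int64_t i13 = 0; i13 < ne13; i13++) {
        const int64_t i03 = i13 / r3;
        for (int64_t i12 = 0; i12 < ne12; i12++) {
            const int64_t i02 = i12 / r2;
            // src0 would only be copied to the device when its slice changes
            const bool upload_src0 = (i02 != pi02 || i03 != pi03);
            if (upload_src0) {
                pi02 = i02;
                pi03 = i03;
            }
            printf("src1 slice (%lld,%lld) -> src0 slice (%lld,%lld)%s\n",
                   (long long) i13, (long long) i12,
                   (long long) i03, (long long) i02,
                   upload_src0 ? "  [upload src0]" : "");
        }
    }
    return 0;
}
```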