Skip to content

Commit e0dc41a

Browse files
Rbiessy authored and arthw committed
sycl: Use syclcompat::dp4a (ggml-org#10267)
* sycl: Use syclcompat::dp4a
* Using the syclcompat version allow the compiler to optimize the operation with native function
* Update news section
* Update CI Windows oneAPI version to 2025.0
* Reword doc
* Call syclcompat::dp4a inside dpct::dp4a

This reverts commit 90cb61d.
1 parent 136c8fd commit e0dc41a

File tree

4 files changed

+10
-27
lines changed

4 files changed

+10
-27
lines changed

.github/workflows/build.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -930,7 +930,7 @@ jobs:
930930
shell: bash
931931

932932
env:
933-
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7dff44ba-e3af-4448-841c-0d616c8da6e7/w_BaseKit_p_2024.1.0.595_offline.exe
933+
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b380d914-366b-4b77-a74a-05e3c38b3514/intel-oneapi-base-toolkit-2025.0.0.882_offline.exe
934934
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel
935935
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
936936
steps:

docs/backend/SYCL.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,9 @@ For CI and performance test summary, please refer to [llama.cpp CI for SYCL Back
4343

4444
## News
4545

46+
- 2024.11
47+
- Use syclcompat to improve the performance on some platforms. This requires to use oneAPI 2025.0 or newer.
48+
4649
- 2024.8
4750
- Use oneDNN as the default GEMM library, improve the compatibility for new Intel GPUs.
4851

ggml/src/ggml-sycl/dpct.hpp

Lines changed: 2 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
#include <sycl/sycl.hpp>
1717
#include <sycl/half_type.hpp>
18+
#include <syclcompat/math.hpp>
1819
#include <oneapi/mkl.hpp>
1920
#include <map>
2021

@@ -1840,31 +1841,10 @@ namespace dpct
18401841
: id);
18411842
}
18421843

1843-
template <typename T>
1844-
sycl::vec<T, 4> extract_and_sign_or_zero_extend4(T val)
1845-
{
1846-
return sycl::vec<T, 1>(val)
1847-
.template as<sycl::vec<
1848-
std::conditional_t<std::is_signed_v<T>, int8_t, uint8_t>, 4>>()
1849-
.template convert<T>();
1850-
}
1851-
1852-
template <typename T1, typename T2>
1853-
using dot_product_acc_t =
1854-
std::conditional_t<std::is_unsigned_v<T1> && std::is_unsigned_v<T2>,
1855-
uint32_t, int32_t>;
1856-
18571844
template <typename T1, typename T2, typename T3>
18581845
inline auto dp4a(T1 a, T2 b, T3 c)
18591846
{
1860-
dot_product_acc_t<T1, T2> res = c;
1861-
auto va = extract_and_sign_or_zero_extend4(a);
1862-
auto vb = extract_and_sign_or_zero_extend4(b);
1863-
res += va[0] * vb[0];
1864-
res += va[1] * vb[1];
1865-
res += va[2] * vb[2];
1866-
res += va[3] * vb[3];
1867-
return res;
1847+
return syclcompat::dp4a(a, b, c);
18681848
}
18691849

18701850
struct sub_sat

ggml/src/ggml-sycl/vecdotq.hpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -968,8 +968,8 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
968968
grid1[0] ^ signs[0], signs[0], std::minus<>());
969969
const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
970970
grid2[0] ^ signs[1], signs[1], std::minus<>());
971-
sumi = dpct::dp4a(grid_l, *((int *)q8 + 0), sumi);
972-
sumi = dpct::dp4a(grid_h, *((int *)q8 + 1), sumi);
971+
sumi = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi);
972+
sumi = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi);
973973
q8 += 8;
974974
aux32 >>= 7;
975975
}
@@ -1009,8 +1009,8 @@ vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
10091009
grid1[0] ^ signs0, signs0, std::minus<>());
10101010
const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
10111011
grid2[0] ^ signs1, signs1, std::minus<>());
1012-
sumi = dpct::dp4a(grid_l, *((int *)q8 + 0), sumi);
1013-
sumi = dpct::dp4a(grid_h, *((int *)q8 + 1), sumi);
1012+
sumi = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi);
1013+
sumi = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi);
10141014
q8 += 8;
10151015
}
10161016
const float d =

0 commit comments

Comments
 (0)