Skip to content

Commit 90cb61d

Browse files
committed
sycl: Use syclcompat::dp4a
* Using the syclcompat version allow the compiler to optimize the operation with native function
1 parent 1dc04b2 commit 90cb61d

File tree

3 files changed

+60
-86
lines changed

3 files changed

+60
-86
lines changed

ggml/src/ggml-sycl/dpct/helper.hpp

Lines changed: 0 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1830,33 +1830,6 @@ namespace dpct
18301830
: id);
18311831
}
18321832

1833-
template <typename T>
1834-
sycl::vec<T, 4> extract_and_sign_or_zero_extend4(T val)
1835-
{
1836-
return sycl::vec<T, 1>(val)
1837-
.template as<sycl::vec<
1838-
std::conditional_t<std::is_signed_v<T>, int8_t, uint8_t>, 4>>()
1839-
.template convert<T>();
1840-
}
1841-
1842-
template <typename T1, typename T2>
1843-
using dot_product_acc_t =
1844-
std::conditional_t<std::is_unsigned_v<T1> && std::is_unsigned_v<T2>,
1845-
uint32_t, int32_t>;
1846-
1847-
template <typename T1, typename T2, typename T3>
1848-
inline auto dp4a(T1 a, T2 b, T3 c)
1849-
{
1850-
dot_product_acc_t<T1, T2> res = c;
1851-
auto va = extract_and_sign_or_zero_extend4(a);
1852-
auto vb = extract_and_sign_or_zero_extend4(b);
1853-
res += va[0] * vb[0];
1854-
res += va[1] * vb[1];
1855-
res += va[2] * vb[2];
1856-
res += va[3] * vb[3];
1857-
return res;
1858-
}
1859-
18601833
struct sub_sat
18611834
{
18621835
template <typename T>

ggml/src/ggml-sycl/mmq.cpp

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -575,8 +575,8 @@ vec_dot_q2_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u,
575575

576576
#pragma unroll
577577
for (int i = i0; i < i0 + QI8_1/2; ++i) {
578-
sumi_d_sc = dpct::dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product
579-
sumi_m = dpct::dp4a(m, u[i],
578+
sumi_d_sc = syclcompat::dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product
579+
sumi_m = syclcompat::dp4a(m, u[i],
580580
sumi_m); // multiply sum of q8_1 values with m
581581
}
582582

@@ -730,7 +730,7 @@ vec_dot_q3_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u,
730730
int sumi_sc = 0;
731731

732732
for (int i = i0; i < i0 + QI8_1/2; ++i) {
733-
sumi_sc = dpct::dp4a(v[i], u[i], sumi_sc); // SIMD dot product
733+
sumi_sc = syclcompat::dp4a(v[i], u[i], sumi_sc); // SIMD dot product
734734
}
735735

736736
sumi += sumi_sc * scales[i0 / (QI8_1/2)];
@@ -873,7 +873,7 @@ static __dpct_inline__ float vec_dot_q4_K_q8_1_impl_mmq(
873873

874874
#pragma unroll
875875
for (int j = 0; j < QI8_1; ++j) {
876-
sumi_d = dpct::dp4a((v[j] >> (4 * i)) & 0x0F0F0F0F,
876+
sumi_d = syclcompat::dp4a((v[j] >> (4 * i)) & 0x0F0F0F0F,
877877
u[i * QI8_1 + j], sumi_d); // SIMD dot product
878878
}
879879

@@ -1018,7 +1018,7 @@ static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_mmq(
10181018

10191019
#pragma unroll
10201020
for (int j = 0; j < QI8_1; ++j) {
1021-
sumi_d = dpct::dp4a(v[i * QI8_1 + j], u[i * QI8_1 + j],
1021+
sumi_d = syclcompat::dp4a(v[i * QI8_1 + j], u[i * QI8_1 + j],
10221022
sumi_d); // SIMD dot product
10231023
}
10241024

@@ -1156,14 +1156,14 @@ vec_dot_q6_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u,
11561156

11571157
#pragma unroll
11581158
for (int i = i0; i < i0 + 2; ++i) {
1159-
sumi_d.x() = dpct::dp4a(v[2 * i + 0], u[2 * i + 0],
1159+
sumi_d.x() = syclcompat::dp4a(v[2 * i + 0], u[2 * i + 0],
11601160
sumi_d.x()); // SIMD dot product
1161-
sumi_d.x() = dpct::dp4a(v[2 * i + 1], u[2 * i + 1],
1161+
sumi_d.x() = syclcompat::dp4a(v[2 * i + 1], u[2 * i + 1],
11621162
sumi_d.x()); // SIMD dot product
11631163

1164-
sumi_d.y() = dpct::dp4a(v[2 * i + 4], u[2 * i + 4],
1164+
sumi_d.y() = syclcompat::dp4a(v[2 * i + 4], u[2 * i + 4],
11651165
sumi_d.y()); // SIMD dot product
1166-
sumi_d.y() = dpct::dp4a(v[2 * i + 5], u[2 * i + 5],
1166+
sumi_d.y() = syclcompat::dp4a(v[2 * i + 5], u[2 * i + 5],
11671167
sumi_d.y()); // SIMD dot product
11681168
}
11691169

ggml/src/ggml-sycl/vecdotq.hpp

Lines changed: 51 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#define GGML_SYCL_VECDOTQ_HPP
1515

1616
#include "dpct/helper.hpp"
17+
#include "syclcompat/math.hpp"
1718

1819
typedef float (*vec_dot_q_sycl_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
1920

@@ -89,14 +90,14 @@ static __dpct_inline__ float vec_dot_q2_K_q8_1_impl_mmvq(
8990
const int vi = (v >> (2*i)) & 0x03030303;
9091

9192
sumf_d +=
92-
d8[i] * (dpct::dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
93+
d8[i] * (syclcompat::dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
9394

9495
// fill int with 4x m
9596
int m = sc >> 4;
9697
m |= m << 8;
9798
m |= m << 16;
9899
sumf_m += d8[i] *
99-
dpct::dp4a(
100+
syclcompat::dp4a(
100101
m, u[i],
101102
0); // multiply constant q2_K part with sum of q8_1 values
102103
}
@@ -139,7 +140,7 @@ static __dpct_inline__ float vec_dot_q3_K_q8_1_impl_mmvq(
139140
const int vi =
140141
dpct::vectorized_binary<sycl::char4>(vil, vih, dpct::sub_sat());
141142

142-
sumf += d8[i] * (dpct::dp4a(vi, u[i], 0) * sc); // SIMD dot product
143+
sumf += d8[i] * (syclcompat::dp4a(vi, u[i], 0) * sc); // SIMD dot product
143144
}
144145

145146
return d3 * sumf;
@@ -162,11 +163,11 @@ static __dpct_inline__ float vec_dot_q4_K_q8_1_impl_vmmq(
162163
const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;
163164

164165
const int dot1 =
165-
dpct::dp4a(v1i, u[2 * i + 1],
166-
dpct::dp4a(v0i, u[2 * i + 0], 0)); // SIMD dot product
166+
syclcompat::dp4a(v1i, u[2 * i + 1],
167+
syclcompat::dp4a(v0i, u[2 * i + 0], 0)); // SIMD dot product
167168
const int dot2 =
168-
dpct::dp4a(0x01010101, u[2 * i + 1],
169-
dpct::dp4a(0x01010101, u[2 * i + 0], 0)); // sum of u
169+
syclcompat::dp4a(0x01010101, u[2 * i + 1],
170+
syclcompat::dp4a(0x01010101, u[2 * i + 0], 0)); // sum of u
170171

171172
sumf_d += d8[i] * (dot1 * sc[i]);
172173
sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
@@ -203,11 +204,11 @@ static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_vmmq(
203204
const int v1i = vl1i | vh1i;
204205

205206
const int dot1 =
206-
dpct::dp4a(v0i, u[2 * i + 0],
207-
dpct::dp4a(v1i, u[2 * i + 1], 0)); // SIMD dot product
207+
syclcompat::dp4a(v0i, u[2 * i + 0],
208+
syclcompat::dp4a(v1i, u[2 * i + 1], 0)); // SIMD dot product
208209
const int dot2 =
209-
dpct::dp4a(0x01010101, u[2 * i + 0],
210-
dpct::dp4a(0x01010101, u[2 * i + 1], 0)); // sum of u
210+
syclcompat::dp4a(0x01010101, u[2 * i + 0],
211+
syclcompat::dp4a(0x01010101, u[2 * i + 1], 0)); // sum of u
211212

212213
sumf_d += d8[i] * (dot1 * sc[i]);
213214
sumf_m += d8[i] * (dot2 * m[i]);
@@ -243,7 +244,7 @@ vec_dot_q6_K_q8_1_impl_mmvq(const int &vl, const int &vh,
243244
const int vi = dpct::vectorized_binary<sycl::char4>(
244245
(vil | vih), 0x20202020, dpct::sub_sat()); // vi = (vil | vih) - 32
245246

246-
sumf += d8[i] * (dpct::dp4a(vi, u[i], 0) * sc); // SIMD dot product
247+
sumf += d8[i] * (syclcompat::dp4a(vi, u[i], 0) * sc); // SIMD dot product
247248
}
248249

249250
return d*sumf;
@@ -266,8 +267,8 @@ static __dpct_inline__ float vec_dot_q4_0_q8_1_impl(const int *v, const int *u,
266267
const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
267268

268269
// SIMD dot product of quantized values
269-
sumi = dpct::dp4a(vi0, u[2 * i + 0], sumi);
270-
sumi = dpct::dp4a(vi1, u[2 * i + 1], sumi);
270+
sumi = syclcompat::dp4a(vi0, u[2 * i + 0], sumi);
271+
sumi = syclcompat::dp4a(vi1, u[2 * i + 1], sumi);
271272
}
272273

273274
const sycl::float2 ds8f =
@@ -293,8 +294,8 @@ static __dpct_inline__ float vec_dot_q4_1_q8_1_impl(const int *v, const int *u,
293294
const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
294295

295296
// SIMD dot product of quantized values
296-
sumi = dpct::dp4a(vi0, u[2 * i + 0], sumi);
297-
sumi = dpct::dp4a(vi1, u[2 * i + 1], sumi);
297+
sumi = syclcompat::dp4a(vi0, u[2 * i + 0], sumi);
298+
sumi = syclcompat::dp4a(vi1, u[2 * i + 1], sumi);
298299
}
299300

300301
#ifdef GGML_SYCL_F16
@@ -331,15 +332,15 @@ vec_dot_q5_0_q8_1_impl(const int *vl, const int *vh, const int *u,
331332
vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12
332333
vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20
333334
vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28
334-
sumi = dpct::dp4a(vi0, u[2 * i + 0],
335+
sumi = syclcompat::dp4a(vi0, u[2 * i + 0],
335336
sumi); // SIMD dot product of quantized values
336337

337338
int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
338339
vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4
339340
vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12
340341
vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20
341342
vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28
342-
sumi = dpct::dp4a(vi1, u[2 * i + 1],
343+
sumi = syclcompat::dp4a(vi1, u[2 * i + 1],
343344
sumi); // SIMD dot product of quantized values
344345
}
345346

@@ -367,15 +368,15 @@ vec_dot_q5_1_q8_1_impl(const int *vl, const int *vh, const int *u,
367368
vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12
368369
vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20
369370
vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28
370-
sumi = dpct::dp4a(vi0, u[2 * i + 0],
371+
sumi = syclcompat::dp4a(vi0, u[2 * i + 0],
371372
sumi); // SIMD dot product of quantized values
372373

373374
int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
374375
vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4
375376
vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12
376377
vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20
377378
vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28
378-
sumi = dpct::dp4a(vi1, u[2 * i + 1],
379+
sumi = syclcompat::dp4a(vi1, u[2 * i + 1],
379380
sumi); // SIMD dot product of quantized values
380381
}
381382

@@ -412,7 +413,7 @@ static __dpct_inline__ float vec_dot_q8_0_q8_1_impl(const int *v, const int *u,
412413
#pragma unroll
413414
for (int i = 0; i < vdr; ++i) {
414415
// SIMD dot product of quantized values
415-
sumi = dpct::dp4a(v[i], u[i], sumi);
416+
sumi = syclcompat::dp4a(v[i], u[i], sumi);
416417
}
417418

418419
return d8_0*d8_1 * sumi;
@@ -428,7 +429,7 @@ static __dpct_inline__ float vec_dot_q8_1_q8_1_impl(const int *v, const int *u,
428429
#pragma unroll
429430
for (int i = 0; i < vdr; ++i) {
430431
// SIMD dot product of quantized values
431-
sumi = dpct::dp4a(v[i], u[i], sumi);
432+
sumi = syclcompat::dp4a(v[i], u[i], sumi);
432433
}
433434

434435
#ifdef GGML_SYCL_F16
@@ -677,10 +678,10 @@ vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
677678
const int v1 = q4[0];
678679
const int v2 = q4[4];
679680

680-
const int dot1 = dpct::dp4a(ui2, v2 & 0x0f0f0f0f, dpct::dp4a(ui1, v1 & 0x0f0f0f0f, 0));
681-
const int dot2 = dpct::dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, dpct::dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
682-
const int dot3 = dpct::dp4a(0x01010101, ui2, dpct::dp4a(0x01010101, ui1, 0));
683-
const int dot4 = dpct::dp4a(0x01010101, ui4, dpct::dp4a(0x01010101, ui3, 0));
681+
const int dot1 = syclcompat::dp4a(ui2, v2 & 0x0f0f0f0f, syclcompat::dp4a(ui1, v1 & 0x0f0f0f0f, 0));
682+
const int dot2 = syclcompat::dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, syclcompat::dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
683+
const int dot3 = syclcompat::dp4a(0x01010101, ui2, syclcompat::dp4a(0x01010101, ui1, 0));
684+
const int dot4 = syclcompat::dp4a(0x01010101, ui4, syclcompat::dp4a(0x01010101, ui3, 0));
684685

685686
sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
686687
sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
@@ -772,8 +773,8 @@ vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
772773
const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
773774
const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
774775

775-
const float sumf_d = d8_1 * (dpct::dp4a(ui1, v1, 0) * s[0] + dpct::dp4a(ui2, v2, 0) * s[1])
776-
+ d8_2 * (dpct::dp4a(ui3, v3, 0) * s[2] + dpct::dp4a(ui4, v4, 0) * s[3]);
776+
const float sumf_d = d8_1 * (syclcompat::dp4a(ui1, v1, 0) * s[0] + syclcompat::dp4a(ui2, v2, 0) * s[1])
777+
+ d8_2 * (syclcompat::dp4a(ui3, v3, 0) * s[2] + syclcompat::dp4a(ui4, v4, 0) * s[3]);
777778

778779
return d * sumf_d;
779780

@@ -865,8 +866,8 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
865866
grid[0] ^ signs[0], signs[0], std::minus<>());
866867
const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
867868
grid[1] ^ signs[1], signs[1], std::minus<>());
868-
sumi1 = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi1);
869-
sumi1 = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi1);
869+
sumi1 = syclcompat::dp4a(grid_l, *((const int *)q8 + 0), sumi1);
870+
sumi1 = syclcompat::dp4a(grid_h, *((const int *)q8 + 1), sumi1);
870871
q8 += 8;
871872
}
872873
int sumi2 = 0;
@@ -877,8 +878,8 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
877878
grid[0] ^ signs[0], signs[0], std::minus<>());
878879
const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
879880
grid[1] ^ signs[1], signs[1], std::minus<>());
880-
sumi2 = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi2);
881-
sumi2 = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi2);
881+
sumi2 = syclcompat::dp4a(grid_l, *((const int *)q8 + 0), sumi2);
882+
sumi2 = syclcompat::dp4a(grid_h, *((const int *)q8 + 1), sumi2);
882883
q8 += 8;
883884
}
884885
const float d = (float)bq2->d * bq8_1[ib32].ds[0] * 0.25f;
@@ -917,8 +918,8 @@ vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
917918
grid[0] ^ signs0, signs0, std::minus<>());
918919
const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
919920
grid[1] ^ signs1, signs1, std::minus<>());
920-
sumi1 = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi1);
921-
sumi1 = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi1);
921+
sumi1 = syclcompat::dp4a(grid_l, *((const int *)q8 + 0), sumi1);
922+
sumi1 = syclcompat::dp4a(grid_h, *((const int *)q8 + 1), sumi1);
922923
q8 += 8;
923924
}
924925
int sumi2 = 0;
@@ -934,8 +935,8 @@ vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
934935
grid[0] ^ signs0, signs0, std::minus<>());
935936
const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
936937
grid[1] ^ signs1, signs1, std::minus<>());
937-
sumi2 = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi2);
938-
sumi2 = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi2);
938+
sumi2 = syclcompat::dp4a(grid_l, *((const int *)q8 + 0), sumi2);
939+
sumi2 = syclcompat::dp4a(grid_h, *((const int *)q8 + 1), sumi2);
939940
q8 += 8;
940941
}
941942
const float d = (float)bq2->d * bq8_1[ib32].ds[0] * 0.25f;
@@ -968,8 +969,8 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
968969
grid1[0] ^ signs[0], signs[0], std::minus<>());
969970
const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
970971
grid2[0] ^ signs[1], signs[1], std::minus<>());
971-
sumi = dpct::dp4a(grid_l, *((int *)q8 + 0), sumi);
972-
sumi = dpct::dp4a(grid_h, *((int *)q8 + 1), sumi);
972+
sumi = syclcompat::dp4a(grid_l, *((const int *)q8 + 0), sumi);
973+
sumi = syclcompat::dp4a(grid_h, *((const int *)q8 + 1), sumi);
973974
q8 += 8;
974975
aux32 >>= 7;
975976
}
@@ -1009,8 +1010,8 @@ vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
10091010
grid1[0] ^ signs0, signs0, std::minus<>());
10101011
const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
10111012
grid2[0] ^ signs1, signs1, std::minus<>());
1012-
sumi = dpct::dp4a(grid_l, *((int *)q8 + 0), sumi);
1013-
sumi = dpct::dp4a(grid_h, *((int *)q8 + 1), sumi);
1013+
sumi = syclcompat::dp4a(grid_l, *((const int *)q8 + 0), sumi);
1014+
sumi = syclcompat::dp4a(grid_h, *((const int *)q8 + 1), sumi);
10141015
q8 += 8;
10151016
}
10161017
const float d =
@@ -1037,8 +1038,8 @@ vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
10371038
const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8)));
10381039
int grid0 = grid[0] & 0x0f0f0f0f;
10391040
int grid1 = (grid[0] >> 4) & 0x0f0f0f0f;
1040-
sumi = dpct::dp4a(q8[2 * l + 1], grid1,
1041-
dpct::dp4a(q8[2 * l + 0], grid0, sumi));
1041+
sumi = syclcompat::dp4a(q8[2 * l + 1], grid1,
1042+
syclcompat::dp4a(q8[2 * l + 0], grid0, sumi));
10421043
}
10431044

10441045
const float delta = bq1->qh[ib32] & 0x8000 ? -1-IQ1S_DELTA : -1+IQ1S_DELTA;
@@ -1066,11 +1067,11 @@ vec_dot_iq1_m_q8_1(const void *__restrict__ vbq,
10661067
const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 7) << 8)));
10671068
int grid0 = grid[0] & 0x0f0f0f0f;
10681069
int grid1 = (grid[0] >> 4) & 0x0f0f0f0f;
1069-
sumi[l / 2] = dpct::dp4a(q8[2 * l + 1], grid1,
1070-
dpct::dp4a(q8[2 * l + 0], grid0, sumi[l / 2]));
1070+
sumi[l / 2] = syclcompat::dp4a(q8[2 * l + 1], grid1,
1071+
syclcompat::dp4a(q8[2 * l + 0], grid0, sumi[l / 2]));
10711072
const float delta = (bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 0x08 ? -1-IQ1M_DELTA : -1+IQ1M_DELTA;
1072-
const int sumy = dpct::dp4a(q8[2 * l + 1], 0x01010101,
1073-
dpct::dp4a(q8[2 * l + 0], 0x01010101, 0));
1073+
const int sumy = syclcompat::dp4a(q8[2 * l + 1], 0x01010101,
1074+
syclcompat::dp4a(q8[2 * l + 0], 0x01010101, 0));
10741075
sumf[l/2] += delta*sumy;
10751076
}
10761077

@@ -1101,8 +1102,8 @@ vec_dot_iq4_nl_q8_1(const void *__restrict__ vbq,
11011102
for (int l = 0; l < VDR_Q4_0_Q8_1_MMVQ; ++l) {
11021103
const uint32_t aux = q4[2*l] | (q4[2*l+1] << 16);
11031104
get_int_from_table_16(aux, values, v1, v2);
1104-
sumi1 = dpct::dp4a(v1, q8[l + 0], sumi1);
1105-
sumi2 = dpct::dp4a(v2, q8[l + 4], sumi2);
1105+
sumi1 = syclcompat::dp4a(v1, q8[l + 0], sumi1);
1106+
sumi2 = syclcompat::dp4a(v2, q8[l + 4], sumi2);
11061107
}
11071108

11081109
const float d = (float)bq->d * bq8_1->ds[0];
@@ -1128,8 +1129,8 @@ vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
11281129
int sumi1 = 0, sumi2 = 0;
11291130
for (int j = 0; j < 4; ++j) {
11301131
get_int_from_table_16(q4[j], values, v1, v2);
1131-
sumi1 = dpct::dp4a(v1, q8[j + 0], sumi1);
1132-
sumi2 = dpct::dp4a(v2, q8[j + 4], sumi2);
1132+
sumi1 = syclcompat::dp4a(v1, q8[j + 0], sumi1);
1133+
sumi2 = syclcompat::dp4a(v2, q8[j + 4], sumi2);
11331134
}
11341135
return d * (sumi1 + sumi2);
11351136
#else

0 commit comments

Comments
 (0)