14
14
#define GGML_SYCL_VECDOTQ_HPP
15
15
16
16
#include " dpct/helper.hpp"
17
+ #include " syclcompat/math.hpp"
17
18
18
19
typedef float (*vec_dot_q_sycl_t )(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
19
20
@@ -89,14 +90,14 @@ static __dpct_inline__ float vec_dot_q2_K_q8_1_impl_mmvq(
89
90
const int vi = (v >> (2 *i)) & 0x03030303 ;
90
91
91
92
sumf_d +=
92
- d8[i] * (dpct ::dp4a (vi, u[i], 0 ) * (sc & 0xF )); // SIMD dot product
93
+ d8[i] * (syclcompat ::dp4a (vi, u[i], 0 ) * (sc & 0xF )); // SIMD dot product
93
94
94
95
// fill int with 4x m
95
96
int m = sc >> 4 ;
96
97
m |= m << 8 ;
97
98
m |= m << 16 ;
98
99
sumf_m += d8[i] *
99
- dpct ::dp4a (
100
+ syclcompat ::dp4a (
100
101
m, u[i],
101
102
0 ); // multiply constant q2_K part with sum of q8_1 values
102
103
}
@@ -139,7 +140,7 @@ static __dpct_inline__ float vec_dot_q3_K_q8_1_impl_mmvq(
139
140
const int vi =
140
141
dpct::vectorized_binary<sycl::char4>(vil, vih, dpct::sub_sat ());
141
142
142
- sumf += d8[i] * (dpct ::dp4a (vi, u[i], 0 ) * sc); // SIMD dot product
143
+ sumf += d8[i] * (syclcompat ::dp4a (vi, u[i], 0 ) * sc); // SIMD dot product
143
144
}
144
145
145
146
return d3 * sumf;
@@ -162,11 +163,11 @@ static __dpct_inline__ float vec_dot_q4_K_q8_1_impl_vmmq(
162
163
const int v1i = (v[1 ] >> (4 *i)) & 0x0F0F0F0F ;
163
164
164
165
const int dot1 =
165
- dpct ::dp4a (v1i, u[2 * i + 1 ],
166
- dpct ::dp4a (v0i, u[2 * i + 0 ], 0 )); // SIMD dot product
166
+ syclcompat ::dp4a (v1i, u[2 * i + 1 ],
167
+ syclcompat ::dp4a (v0i, u[2 * i + 0 ], 0 )); // SIMD dot product
167
168
const int dot2 =
168
- dpct ::dp4a (0x01010101 , u[2 * i + 1 ],
169
- dpct ::dp4a (0x01010101 , u[2 * i + 0 ], 0 )); // sum of u
169
+ syclcompat ::dp4a (0x01010101 , u[2 * i + 1 ],
170
+ syclcompat ::dp4a (0x01010101 , u[2 * i + 0 ], 0 )); // sum of u
170
171
171
172
sumf_d += d8[i] * (dot1 * sc[i]);
172
173
sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
@@ -203,11 +204,11 @@ static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_vmmq(
203
204
const int v1i = vl1i | vh1i;
204
205
205
206
const int dot1 =
206
- dpct ::dp4a (v0i, u[2 * i + 0 ],
207
- dpct ::dp4a (v1i, u[2 * i + 1 ], 0 )); // SIMD dot product
207
+ syclcompat ::dp4a (v0i, u[2 * i + 0 ],
208
+ syclcompat ::dp4a (v1i, u[2 * i + 1 ], 0 )); // SIMD dot product
208
209
const int dot2 =
209
- dpct ::dp4a (0x01010101 , u[2 * i + 0 ],
210
- dpct ::dp4a (0x01010101 , u[2 * i + 1 ], 0 )); // sum of u
210
+ syclcompat ::dp4a (0x01010101 , u[2 * i + 0 ],
211
+ syclcompat ::dp4a (0x01010101 , u[2 * i + 1 ], 0 )); // sum of u
211
212
212
213
sumf_d += d8[i] * (dot1 * sc[i]);
213
214
sumf_m += d8[i] * (dot2 * m[i]);
@@ -243,7 +244,7 @@ vec_dot_q6_K_q8_1_impl_mmvq(const int &vl, const int &vh,
243
244
const int vi = dpct::vectorized_binary<sycl::char4>(
244
245
(vil | vih), 0x20202020 , dpct::sub_sat ()); // vi = (vil | vih) - 32
245
246
246
- sumf += d8[i] * (dpct ::dp4a (vi, u[i], 0 ) * sc); // SIMD dot product
247
+ sumf += d8[i] * (syclcompat ::dp4a (vi, u[i], 0 ) * sc); // SIMD dot product
247
248
}
248
249
249
250
return d*sumf;
@@ -266,8 +267,8 @@ static __dpct_inline__ float vec_dot_q4_0_q8_1_impl(const int *v, const int *u,
266
267
const int vi1 = (v[i] >> 4 ) & 0x0F0F0F0F ;
267
268
268
269
// SIMD dot product of quantized values
269
- sumi = dpct ::dp4a (vi0, u[2 * i + 0 ], sumi);
270
- sumi = dpct ::dp4a (vi1, u[2 * i + 1 ], sumi);
270
+ sumi = syclcompat ::dp4a (vi0, u[2 * i + 0 ], sumi);
271
+ sumi = syclcompat ::dp4a (vi1, u[2 * i + 1 ], sumi);
271
272
}
272
273
273
274
const sycl::float2 ds8f =
@@ -293,8 +294,8 @@ static __dpct_inline__ float vec_dot_q4_1_q8_1_impl(const int *v, const int *u,
293
294
const int vi1 = (v[i] >> 4 ) & 0x0F0F0F0F ;
294
295
295
296
// SIMD dot product of quantized values
296
- sumi = dpct ::dp4a (vi0, u[2 * i + 0 ], sumi);
297
- sumi = dpct ::dp4a (vi1, u[2 * i + 1 ], sumi);
297
+ sumi = syclcompat ::dp4a (vi0, u[2 * i + 0 ], sumi);
298
+ sumi = syclcompat ::dp4a (vi1, u[2 * i + 1 ], sumi);
298
299
}
299
300
300
301
#ifdef GGML_SYCL_F16
@@ -331,15 +332,15 @@ vec_dot_q5_0_q8_1_impl(const int *vl, const int *vh, const int *u,
331
332
vi0 |= (vh[i] << 11 ) & 0x00001000 ; // 1 -> 12
332
333
vi0 |= (vh[i] << 18 ) & 0x00100000 ; // 2 -> 20
333
334
vi0 |= (vh[i] << 25 ) & 0x10000000 ; // 3 -> 28
334
- sumi = dpct ::dp4a (vi0, u[2 * i + 0 ],
335
+ sumi = syclcompat ::dp4a (vi0, u[2 * i + 0 ],
335
336
sumi); // SIMD dot product of quantized values
336
337
337
338
int vi1 = (vl[i] >> 4 ) & 0x0F0F0F0F ; // upper 4 qs bits, still need qh as 5th bits
338
339
vi1 |= (vh[i] >> 12 ) & 0x00000010 ; // 16 -> 4
339
340
vi1 |= (vh[i] >> 5 ) & 0x00001000 ; // 17 -> 12
340
341
vi1 |= (vh[i] << 2 ) & 0x00100000 ; // 18 -> 20
341
342
vi1 |= (vh[i] << 9 ) & 0x10000000 ; // 19 -> 28
342
- sumi = dpct ::dp4a (vi1, u[2 * i + 1 ],
343
+ sumi = syclcompat ::dp4a (vi1, u[2 * i + 1 ],
343
344
sumi); // SIMD dot product of quantized values
344
345
}
345
346
@@ -367,15 +368,15 @@ vec_dot_q5_1_q8_1_impl(const int *vl, const int *vh, const int *u,
367
368
vi0 |= (vh[i] << 11 ) & 0x00001000 ; // 1 -> 12
368
369
vi0 |= (vh[i] << 18 ) & 0x00100000 ; // 2 -> 20
369
370
vi0 |= (vh[i] << 25 ) & 0x10000000 ; // 3 -> 28
370
- sumi = dpct ::dp4a (vi0, u[2 * i + 0 ],
371
+ sumi = syclcompat ::dp4a (vi0, u[2 * i + 0 ],
371
372
sumi); // SIMD dot product of quantized values
372
373
373
374
int vi1 = (vl[i] >> 4 ) & 0x0F0F0F0F ; // upper 4 qs bits, still need qh as 5th bits
374
375
vi1 |= (vh[i] >> 12 ) & 0x00000010 ; // 16 -> 4
375
376
vi1 |= (vh[i] >> 5 ) & 0x00001000 ; // 17 -> 12
376
377
vi1 |= (vh[i] << 2 ) & 0x00100000 ; // 18 -> 20
377
378
vi1 |= (vh[i] << 9 ) & 0x10000000 ; // 19 -> 28
378
- sumi = dpct ::dp4a (vi1, u[2 * i + 1 ],
379
+ sumi = syclcompat ::dp4a (vi1, u[2 * i + 1 ],
379
380
sumi); // SIMD dot product of quantized values
380
381
}
381
382
@@ -412,7 +413,7 @@ static __dpct_inline__ float vec_dot_q8_0_q8_1_impl(const int *v, const int *u,
412
413
#pragma unroll
413
414
for (int i = 0 ; i < vdr; ++i) {
414
415
// SIMD dot product of quantized values
415
- sumi = dpct ::dp4a (v[i], u[i], sumi);
416
+ sumi = syclcompat ::dp4a (v[i], u[i], sumi);
416
417
}
417
418
418
419
return d8_0*d8_1 * sumi;
@@ -428,7 +429,7 @@ static __dpct_inline__ float vec_dot_q8_1_q8_1_impl(const int *v, const int *u,
428
429
#pragma unroll
429
430
for (int i = 0 ; i < vdr; ++i) {
430
431
// SIMD dot product of quantized values
431
- sumi = dpct ::dp4a (v[i], u[i], sumi);
432
+ sumi = syclcompat ::dp4a (v[i], u[i], sumi);
432
433
}
433
434
434
435
#ifdef GGML_SYCL_F16
@@ -677,10 +678,10 @@ vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
677
678
const int v1 = q4[0 ];
678
679
const int v2 = q4[4 ];
679
680
680
- const int dot1 = dpct ::dp4a (ui2, v2 & 0x0f0f0f0f , dpct ::dp4a (ui1, v1 & 0x0f0f0f0f , 0 ));
681
- const int dot2 = dpct ::dp4a (ui4, (v2 >> 4 ) & 0x0f0f0f0f , dpct ::dp4a (ui3, (v1 >> 4 ) & 0x0f0f0f0f , 0 ));
682
- const int dot3 = dpct ::dp4a (0x01010101 , ui2, dpct ::dp4a (0x01010101 , ui1, 0 ));
683
- const int dot4 = dpct ::dp4a (0x01010101 , ui4, dpct ::dp4a (0x01010101 , ui3, 0 ));
681
+ const int dot1 = syclcompat ::dp4a (ui2, v2 & 0x0f0f0f0f , syclcompat ::dp4a (ui1, v1 & 0x0f0f0f0f , 0 ));
682
+ const int dot2 = syclcompat ::dp4a (ui4, (v2 >> 4 ) & 0x0f0f0f0f , syclcompat ::dp4a (ui3, (v1 >> 4 ) & 0x0f0f0f0f , 0 ));
683
+ const int dot3 = syclcompat ::dp4a (0x01010101 , ui2, syclcompat ::dp4a (0x01010101 , ui1, 0 ));
684
+ const int dot4 = syclcompat ::dp4a (0x01010101 , ui4, syclcompat ::dp4a (0x01010101 , ui3, 0 ));
684
685
685
686
sumf_d += d8_1 * (dot1 * s[0 ]) + d8_2 * (dot2 * s[1 ]);
686
687
sumf_m += d8_1 * (dot3 * s[2 ]) + d8_2 * (dot4 * s[3 ]);
@@ -772,8 +773,8 @@ vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
772
773
const int v3 = (((vh >> 0 ) & 0x10101010 ) ^ 0x10101010 ) | ((vl1 >> 4 ) & 0x0f0f0f0f );
773
774
const int v4 = (((vh >> 2 ) & 0x10101010 ) ^ 0x10101010 ) | ((vl2 >> 4 ) & 0x0f0f0f0f );
774
775
775
- const float sumf_d = d8_1 * (dpct ::dp4a (ui1, v1, 0 ) * s[0 ] + dpct ::dp4a (ui2, v2, 0 ) * s[1 ])
776
- + d8_2 * (dpct ::dp4a (ui3, v3, 0 ) * s[2 ] + dpct ::dp4a (ui4, v4, 0 ) * s[3 ]);
776
+ const float sumf_d = d8_1 * (syclcompat ::dp4a (ui1, v1, 0 ) * s[0 ] + syclcompat ::dp4a (ui2, v2, 0 ) * s[1 ])
777
+ + d8_2 * (syclcompat ::dp4a (ui3, v3, 0 ) * s[2 ] + syclcompat ::dp4a (ui4, v4, 0 ) * s[3 ]);
777
778
778
779
return d * sumf_d;
779
780
@@ -865,8 +866,8 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
865
866
grid[0 ] ^ signs[0 ], signs[0 ], std::minus<>());
866
867
const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
867
868
grid[1 ] ^ signs[1 ], signs[1 ], std::minus<>());
868
- sumi1 = dpct ::dp4a (grid_l, *((const int *)q8 + 0 ), sumi1);
869
- sumi1 = dpct ::dp4a (grid_h, *((const int *)q8 + 1 ), sumi1);
869
+ sumi1 = syclcompat ::dp4a (grid_l, *((const int *)q8 + 0 ), sumi1);
870
+ sumi1 = syclcompat ::dp4a (grid_h, *((const int *)q8 + 1 ), sumi1);
870
871
q8 += 8 ;
871
872
}
872
873
int sumi2 = 0 ;
@@ -877,8 +878,8 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
877
878
grid[0 ] ^ signs[0 ], signs[0 ], std::minus<>());
878
879
const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
879
880
grid[1 ] ^ signs[1 ], signs[1 ], std::minus<>());
880
- sumi2 = dpct ::dp4a (grid_l, *((const int *)q8 + 0 ), sumi2);
881
- sumi2 = dpct ::dp4a (grid_h, *((const int *)q8 + 1 ), sumi2);
881
+ sumi2 = syclcompat ::dp4a (grid_l, *((const int *)q8 + 0 ), sumi2);
882
+ sumi2 = syclcompat ::dp4a (grid_h, *((const int *)q8 + 1 ), sumi2);
882
883
q8 += 8 ;
883
884
}
884
885
const float d = (float )bq2->d * bq8_1[ib32].ds [0 ] * 0 .25f ;
@@ -917,8 +918,8 @@ vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
917
918
grid[0 ] ^ signs0, signs0, std::minus<>());
918
919
const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
919
920
grid[1 ] ^ signs1, signs1, std::minus<>());
920
- sumi1 = dpct ::dp4a (grid_l, *((const int *)q8 + 0 ), sumi1);
921
- sumi1 = dpct ::dp4a (grid_h, *((const int *)q8 + 1 ), sumi1);
921
+ sumi1 = syclcompat ::dp4a (grid_l, *((const int *)q8 + 0 ), sumi1);
922
+ sumi1 = syclcompat ::dp4a (grid_h, *((const int *)q8 + 1 ), sumi1);
922
923
q8 += 8 ;
923
924
}
924
925
int sumi2 = 0 ;
@@ -934,8 +935,8 @@ vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
934
935
grid[0 ] ^ signs0, signs0, std::minus<>());
935
936
const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
936
937
grid[1 ] ^ signs1, signs1, std::minus<>());
937
- sumi2 = dpct ::dp4a (grid_l, *((const int *)q8 + 0 ), sumi2);
938
- sumi2 = dpct ::dp4a (grid_h, *((const int *)q8 + 1 ), sumi2);
938
+ sumi2 = syclcompat ::dp4a (grid_l, *((const int *)q8 + 0 ), sumi2);
939
+ sumi2 = syclcompat ::dp4a (grid_h, *((const int *)q8 + 1 ), sumi2);
939
940
q8 += 8 ;
940
941
}
941
942
const float d = (float )bq2->d * bq8_1[ib32].ds [0 ] * 0 .25f ;
@@ -968,8 +969,8 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
968
969
grid1[0 ] ^ signs[0 ], signs[0 ], std::minus<>());
969
970
const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
970
971
grid2[0 ] ^ signs[1 ], signs[1 ], std::minus<>());
971
- sumi = dpct ::dp4a (grid_l, *((int *)q8 + 0 ), sumi);
972
- sumi = dpct ::dp4a (grid_h, *((int *)q8 + 1 ), sumi);
972
+ sumi = syclcompat ::dp4a (grid_l, *((const int *)q8 + 0 ), sumi);
973
+ sumi = syclcompat ::dp4a (grid_h, *((const int *)q8 + 1 ), sumi);
973
974
q8 += 8 ;
974
975
aux32 >>= 7 ;
975
976
}
@@ -1009,8 +1010,8 @@ vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
1009
1010
grid1[0 ] ^ signs0, signs0, std::minus<>());
1010
1011
const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
1011
1012
grid2[0 ] ^ signs1, signs1, std::minus<>());
1012
- sumi = dpct ::dp4a (grid_l, *((int *)q8 + 0 ), sumi);
1013
- sumi = dpct ::dp4a (grid_h, *((int *)q8 + 1 ), sumi);
1013
+ sumi = syclcompat ::dp4a (grid_l, *((const int *)q8 + 0 ), sumi);
1014
+ sumi = syclcompat ::dp4a (grid_h, *((const int *)q8 + 1 ), sumi);
1014
1015
q8 += 8 ;
1015
1016
}
1016
1017
const float d =
@@ -1037,8 +1038,8 @@ vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
1037
1038
const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs [4 *ib32+l] | (((bq1->qh [ib32] >> 3 *l) & 7 ) << 8 )));
1038
1039
int grid0 = grid[0 ] & 0x0f0f0f0f ;
1039
1040
int grid1 = (grid[0 ] >> 4 ) & 0x0f0f0f0f ;
1040
- sumi = dpct ::dp4a (q8[2 * l + 1 ], grid1,
1041
- dpct ::dp4a (q8[2 * l + 0 ], grid0, sumi));
1041
+ sumi = syclcompat ::dp4a (q8[2 * l + 1 ], grid1,
1042
+ syclcompat ::dp4a (q8[2 * l + 0 ], grid0, sumi));
1042
1043
}
1043
1044
1044
1045
const float delta = bq1->qh [ib32] & 0x8000 ? -1 -IQ1S_DELTA : -1 +IQ1S_DELTA;
@@ -1066,11 +1067,11 @@ vec_dot_iq1_m_q8_1(const void *__restrict__ vbq,
1066
1067
const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs [4 *ib32+l] | (((bq1->qh [2 *ib32+l/2 ] >> 4 *(l%2 )) & 7 ) << 8 )));
1067
1068
int grid0 = grid[0 ] & 0x0f0f0f0f ;
1068
1069
int grid1 = (grid[0 ] >> 4 ) & 0x0f0f0f0f ;
1069
- sumi[l / 2 ] = dpct ::dp4a (q8[2 * l + 1 ], grid1,
1070
- dpct ::dp4a (q8[2 * l + 0 ], grid0, sumi[l / 2 ]));
1070
+ sumi[l / 2 ] = syclcompat ::dp4a (q8[2 * l + 1 ], grid1,
1071
+ syclcompat ::dp4a (q8[2 * l + 0 ], grid0, sumi[l / 2 ]));
1071
1072
const float delta = (bq1->qh [2 *ib32+l/2 ] >> 4 *(l%2 )) & 0x08 ? -1 -IQ1M_DELTA : -1 +IQ1M_DELTA;
1072
- const int sumy = dpct ::dp4a (q8[2 * l + 1 ], 0x01010101 ,
1073
- dpct ::dp4a (q8[2 * l + 0 ], 0x01010101 , 0 ));
1073
+ const int sumy = syclcompat ::dp4a (q8[2 * l + 1 ], 0x01010101 ,
1074
+ syclcompat ::dp4a (q8[2 * l + 0 ], 0x01010101 , 0 ));
1074
1075
sumf[l/2 ] += delta*sumy;
1075
1076
}
1076
1077
@@ -1101,8 +1102,8 @@ vec_dot_iq4_nl_q8_1(const void *__restrict__ vbq,
1101
1102
for (int l = 0 ; l < VDR_Q4_0_Q8_1_MMVQ; ++l) {
1102
1103
const uint32_t aux = q4[2 *l] | (q4[2 *l+1 ] << 16 );
1103
1104
get_int_from_table_16 (aux, values, v1, v2);
1104
- sumi1 = dpct ::dp4a (v1, q8[l + 0 ], sumi1);
1105
- sumi2 = dpct ::dp4a (v2, q8[l + 4 ], sumi2);
1105
+ sumi1 = syclcompat ::dp4a (v1, q8[l + 0 ], sumi1);
1106
+ sumi2 = syclcompat ::dp4a (v2, q8[l + 4 ], sumi2);
1106
1107
}
1107
1108
1108
1109
const float d = (float )bq->d * bq8_1->ds [0 ];
@@ -1128,8 +1129,8 @@ vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
1128
1129
int sumi1 = 0 , sumi2 = 0 ;
1129
1130
for (int j = 0 ; j < 4 ; ++j) {
1130
1131
get_int_from_table_16 (q4[j], values, v1, v2);
1131
- sumi1 = dpct ::dp4a (v1, q8[j + 0 ], sumi1);
1132
- sumi2 = dpct ::dp4a (v2, q8[j + 4 ], sumi2);
1132
+ sumi1 = syclcompat ::dp4a (v1, q8[j + 0 ], sumi1);
1133
+ sumi2 = syclcompat ::dp4a (v2, q8[j + 4 ], sumi2);
1133
1134
}
1134
1135
return d * (sumi1 + sumi2);
1135
1136
#else
0 commit comments