@@ -50,8 +50,8 @@ std::string toString(dpas_argument_type T) {
50
50
return " bf16" ;
51
51
case dpas_argument_type::tf32:
52
52
return " tf32" ;
53
- case dpas_argument_type::S1 :
54
- case dpas_argument_type::U1 :
53
+ case dpas_argument_type::s1 :
54
+ case dpas_argument_type::u1 :
55
55
case dpas_argument_type::Invalid:
56
56
return " UNSUPPORTED" ;
57
57
}
@@ -65,7 +65,7 @@ template <dpas_argument_type T> struct DpasPrintType {
65
65
static constexpr bool is_uint = T == dpas_argument_type::u2 ||
66
66
T == dpas_argument_type::u4 ||
67
67
T == dpas_argument_type::u8 ;
68
- static constexpr bool is_fp = T == dpas_argument_type::FP16 ||
68
+ static constexpr bool is_fp = T == dpas_argument_type::fp16 ||
69
69
T == dpas_argument_type::bf16 ||
70
70
T == dpas_argument_type::tf32;
71
71
@@ -100,7 +100,7 @@ template <dpas_argument_type T> struct DpasNaturalOperandType {
100
100
is_uint, unsigned char ,
101
101
std::conditional_t <
102
102
is_fp16, sycl::half,
103
- std::conditional <
103
+ std::conditional_t <
104
104
is_bf16, sycl::ext::oneapi::experimental::bfloat16, void >>>>;
105
105
};
106
106
@@ -123,6 +123,11 @@ template <dpas_argument_type T> constexpr int getBitSize() {
123
123
124
124
case dpas_argument_type::tf32:
125
125
return 32 ;
126
+
127
+ case dpas_argument_type::Invalid:
128
+ case dpas_argument_type::s1:
129
+ case dpas_argument_type::u1:
130
+ break ;
126
131
}
127
132
return 0 ;
128
133
}
@@ -282,7 +287,8 @@ void printMatrix(void *Vec, std::string Msg) {
282
287
}
283
288
284
289
template <int SystolicDepth, int RepeatCount, dpas_argument_type BPrec,
285
- dpas_argument_type APrec, bool UseSrc0>
290
+ dpas_argument_type APrec, bool UseSrc0, int ExecSize,
291
+ bool LetDeduceArgs>
286
292
bool test (queue &Q, bool Print) {
287
293
constexpr unsigned Size = 128 ;
288
294
constexpr unsigned VL = 16 ;
@@ -300,12 +306,13 @@ bool test(queue &Q, bool Print) {
300
306
// where:
301
307
constexpr int M = RepeatCount;
302
308
constexpr int K = SystolicDepth * OpsPerChannel;
303
- constexpr int N = 16 ; // Execution size: 16 for PVC.
309
+ constexpr int N = ExecSize ; // 16 for PVC, 8 for DG2 .
304
310
305
311
auto Dev = Q.get_device ();
306
- std::cout << " Running test case " << toString (BPrec, APrec)
307
- << " with UseSrc0 = " << UseSrc0 << " on "
308
- << Dev.get_info <info::device::name>() << " \n " ;
312
+ std::cout << " Running on " << Dev.get_info <info::device::name>()
313
+ << " (ExecSize = " << ExecSize << " ): " << toString (BPrec, APrec)
314
+ << " , UseSrc0 = " << UseSrc0
315
+ << " , LetDeduceArgs = " << LetDeduceArgs << std::endl;
309
316
310
317
using ANaturalType = typename DpasNaturalOperandType<APrec>::type;
311
318
using BNaturalType = typename DpasNaturalOperandType<BPrec>::type;
@@ -317,10 +324,10 @@ bool test(queue &Q, bool Print) {
317
324
auto BPacked = aligned_alloc_shared<BNaturalType>(128 , BPackedSize, Q);
318
325
auto Res = aligned_alloc_shared<ResNaturalType>(128 , M * N, Q);
319
326
// Init APacked;
320
- int Value = 0 ;
327
+ float Value = 1.2 ;
321
328
for (int II = 0 ; II < M; II++) {
322
329
for (int JJ = 0 ; JJ < K; JJ++) {
323
- Value++ ;
330
+ Value += 1.1 ;
324
331
writeToHorizontallyPackedMatrix<M, K, APrec>(
325
332
APacked, II, JJ, static_cast <ANaturalType>(Value));
326
333
}
@@ -345,15 +352,27 @@ bool test(queue &Q, bool Print) {
345
352
simd<BNaturalType, BPackedSize> B (BPacked, overaligned_tag<16 >{});
346
353
simd<ResNaturalType, M * N> C;
347
354
348
- if constexpr (UseSrc0) {
349
- // Compute C = C + AxB;
350
- C = 1 ;
351
- C = dpas<8 , RepeatCount, ResNaturalType, ResNaturalType, BNaturalType,
352
- ANaturalType, BPrec, APrec>(C, B, A);
355
+ if constexpr (LetDeduceArgs) {
356
+ if constexpr (UseSrc0) {
357
+ // Compute C = C + AxB;
358
+ C = 1 ;
359
+ C = dpas<8 , RepeatCount, ResNaturalType>(C, B, A);
360
+ } else {
361
+ // Compute C = AxB;
362
+ C = dpas<8 , RepeatCount, ResNaturalType>(B, A);
363
+ }
364
+
353
365
} else {
354
- // Compute C = AxB;
355
- C = dpas<8 , RepeatCount, ResNaturalType, BNaturalType, ANaturalType,
356
- BPrec, APrec>(B, A);
366
+ if constexpr (UseSrc0) {
367
+ // Compute C = C + AxB;
368
+ C = 1 ;
369
+ C = dpas<8 , RepeatCount, ResNaturalType, ResNaturalType, BNaturalType,
370
+ ANaturalType, BPrec, APrec>(C, B, A);
371
+ } else {
372
+ // Compute C = AxB;
373
+ C = dpas<8 , RepeatCount, ResNaturalType, BNaturalType, ANaturalType,
374
+ BPrec, APrec>(B, A);
375
+ }
357
376
}
358
377
359
378
C.copy_to (Res);
@@ -396,11 +415,40 @@ bool test(queue &Q, bool Print) {
396
415
}
397
416
398
417
template <int SystolicDepth, int RepeatCount, dpas_argument_type T1,
399
- dpas_argument_type T2>
418
+ dpas_argument_type T2, bool LetDeduceArgs = false >
400
419
bool tests (queue Q, bool Print) {
401
420
bool Passed = true ;
402
421
constexpr bool UseSrc0 = true ;
403
- Passed &= test<SystolicDepth, RepeatCount, T1, T2, UseSrc0>(Q, Print);
404
- Passed &= test<SystolicDepth, RepeatCount, T1, T2, !UseSrc0>(Q, Print);
422
+ auto Dev = Q.get_device ();
423
+
424
+ // Detect the execution size.
425
+ // The device trait is not implemented for esimd_emulator. Use both 8 and 16.
426
+ int ExecSize;
427
+ bool IsEmulator = false ;
428
+ try {
429
+ ExecSize = Dev.get_info <ext::intel::info::device::gpu_eu_simd_width>();
430
+ } catch (sycl::exception e) {
431
+ IsEmulator = true ;
432
+ }
433
+ assert ((IsEmulator || (ExecSize == 8 || ExecSize == 16 )) &&
434
+ " Execution size must be 8 or 16" );
435
+
436
+ if (ExecSize == 16 || IsEmulator) {
437
+ Passed &=
438
+ test<SystolicDepth, RepeatCount, T1, T2, UseSrc0, 16 , LetDeduceArgs>(
439
+ Q, Print);
440
+ Passed &=
441
+ test<SystolicDepth, RepeatCount, T1, T2, !UseSrc0, 16 , LetDeduceArgs>(
442
+ Q, Print);
443
+ }
444
+ if (ExecSize == 8 || IsEmulator) {
445
+ Passed &=
446
+ test<SystolicDepth, RepeatCount, T1, T2, UseSrc0, 8 , LetDeduceArgs>(
447
+ Q, Print);
448
+ Passed &=
449
+ test<SystolicDepth, RepeatCount, T1, T2, !UseSrc0, 8 , LetDeduceArgs>(
450
+ Q, Print);
451
+ }
452
+
405
453
return Passed;
406
454
}
0 commit comments