Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

Commit a09a3ce

Browse files
committed
[ESIMD] Add more tests for new xmx::dpas()
* Add the test for bfloat16; * Add the tests for dg2; * Run DG2 and PVC tests on esimd_emulator; * Eliminated compilation warnings. Signed-off-by: Vyacheslav N Klochkov <[email protected]>
1 parent be408a8 commit a09a3ce

File tree

4 files changed

+118
-34
lines changed

4 files changed

+118
-34
lines changed

SYCL/ESIMD/dpas/dpas_bf16.cpp

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
//==---------------- dpas_bf16.cpp - DPC++ ESIMD on-device test ----------==//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
// REQUIRES: gpu-intel-pvc || gpu-intel-dg2 || esimd_emulator
9+
// UNSUPPORTED: cuda || hip
10+
// RUN: %clangxx -fsycl -fsycl-device-code-split=per_kernel %s -o %t.out
11+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
12+
13+
// This test verifies DPAS support for bfloat16.
14+
15+
#include "dpas_common.hpp"
16+
17+
int main(int argc, const char *argv[]) {
18+
queue Q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler());
19+
20+
bool Print = argc > 1 && std::string(argv[1]) == "-debug";
21+
bool Passed = true;
22+
23+
constexpr bool LetDeduceArgs = true;
24+
Passed &= tests<8, 8, bf16, bf16, LetDeduceArgs>(Q, Print);
25+
Passed &= tests<8, 4, bf16, bf16, LetDeduceArgs>(Q, Print);
26+
Passed &= tests<8, 1, bf16, bf16, LetDeduceArgs>(Q, Print);
27+
28+
// TODO: Enable these cases when esimd::simd(ptr) constructor is fixed.
29+
// Passed &= tests<8, 5, bf16, bf16, LetDeduceArgs>(Q, Print);
30+
// Passed &= tests<8, 3, bf16, bf16, LetDeduceArgs>(Q, Print);
31+
32+
std::cout << (Passed ? "Test Passed\n" : "Test FAILED\n");
33+
return Passed ? 0 : 1;
34+
}

SYCL/ESIMD/dpas/dpas_common.hpp

Lines changed: 70 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,8 @@ std::string toString(dpas_argument_type T) {
5050
return "bf16";
5151
case dpas_argument_type::tf32:
5252
return "tf32";
53-
case dpas_argument_type::S1:
54-
case dpas_argument_type::U1:
53+
case dpas_argument_type::s1:
54+
case dpas_argument_type::u1:
5555
case dpas_argument_type::Invalid:
5656
return "UNSUPPORTED";
5757
}
@@ -65,7 +65,7 @@ template <dpas_argument_type T> struct DpasPrintType {
6565
static constexpr bool is_uint = T == dpas_argument_type::u2 ||
6666
T == dpas_argument_type::u4 ||
6767
T == dpas_argument_type::u8;
68-
static constexpr bool is_fp = T == dpas_argument_type::FP16 ||
68+
static constexpr bool is_fp = T == dpas_argument_type::fp16 ||
6969
T == dpas_argument_type::bf16 ||
7070
T == dpas_argument_type::tf32;
7171

@@ -100,7 +100,7 @@ template <dpas_argument_type T> struct DpasNaturalOperandType {
100100
is_uint, unsigned char,
101101
std::conditional_t<
102102
is_fp16, sycl::half,
103-
std::conditional<
103+
std::conditional_t<
104104
is_bf16, sycl::ext::oneapi::experimental::bfloat16, void>>>>;
105105
};
106106

@@ -123,6 +123,11 @@ template <dpas_argument_type T> constexpr int getBitSize() {
123123

124124
case dpas_argument_type::tf32:
125125
return 32;
126+
127+
case dpas_argument_type::Invalid:
128+
case dpas_argument_type::s1:
129+
case dpas_argument_type::u1:
130+
break;
126131
}
127132
return 0;
128133
}
@@ -282,7 +287,8 @@ void printMatrix(void *Vec, std::string Msg) {
282287
}
283288

284289
template <int SystolicDepth, int RepeatCount, dpas_argument_type BPrec,
285-
dpas_argument_type APrec, bool UseSrc0>
290+
dpas_argument_type APrec, bool UseSrc0, int ExecSize,
291+
bool LetDeduceArgs>
286292
bool test(queue &Q, bool Print) {
287293
constexpr unsigned Size = 128;
288294
constexpr unsigned VL = 16;
@@ -300,12 +306,13 @@ bool test(queue &Q, bool Print) {
300306
// where:
301307
constexpr int M = RepeatCount;
302308
constexpr int K = SystolicDepth * OpsPerChannel;
303-
constexpr int N = 16; // Execution size: 16 for PVC.
309+
constexpr int N = ExecSize; // 16 for PVC, 8 for DG2.
304310

305311
auto Dev = Q.get_device();
306-
std::cout << "Running test case " << toString(BPrec, APrec)
307-
<< " with UseSrc0 = " << UseSrc0 << " on "
308-
<< Dev.get_info<info::device::name>() << "\n";
312+
std::cout << "Running on " << Dev.get_info<info::device::name>()
313+
<< " (ExecSize = " << ExecSize << "): " << toString(BPrec, APrec)
314+
<< ", UseSrc0 = " << UseSrc0
315+
<< ", LetDeduceArgs = " << LetDeduceArgs << std::endl;
309316

310317
using ANaturalType = typename DpasNaturalOperandType<APrec>::type;
311318
using BNaturalType = typename DpasNaturalOperandType<BPrec>::type;
@@ -317,10 +324,10 @@ bool test(queue &Q, bool Print) {
317324
auto BPacked = aligned_alloc_shared<BNaturalType>(128, BPackedSize, Q);
318325
auto Res = aligned_alloc_shared<ResNaturalType>(128, M * N, Q);
319326
// Init APacked;
320-
int Value = 0;
327+
float Value = 1.2;
321328
for (int II = 0; II < M; II++) {
322329
for (int JJ = 0; JJ < K; JJ++) {
323-
Value++;
330+
Value += 1.1;
324331
writeToHorizontallyPackedMatrix<M, K, APrec>(
325332
APacked, II, JJ, static_cast<ANaturalType>(Value));
326333
}
@@ -345,15 +352,27 @@ bool test(queue &Q, bool Print) {
345352
simd<BNaturalType, BPackedSize> B(BPacked, overaligned_tag<16>{});
346353
simd<ResNaturalType, M * N> C;
347354

348-
if constexpr (UseSrc0) {
349-
// Compute C = C + AxB;
350-
C = 1;
351-
C = dpas<8, RepeatCount, ResNaturalType, ResNaturalType, BNaturalType,
352-
ANaturalType, BPrec, APrec>(C, B, A);
355+
if constexpr (LetDeduceArgs) {
356+
if constexpr (UseSrc0) {
357+
// Compute C = C + AxB;
358+
C = 1;
359+
C = dpas<8, RepeatCount, ResNaturalType>(C, B, A);
360+
} else {
361+
// Compute C = AxB;
362+
C = dpas<8, RepeatCount, ResNaturalType>(B, A);
363+
}
364+
353365
} else {
354-
// Compute C = AxB;
355-
C = dpas<8, RepeatCount, ResNaturalType, BNaturalType, ANaturalType,
356-
BPrec, APrec>(B, A);
366+
if constexpr (UseSrc0) {
367+
// Compute C = C + AxB;
368+
C = 1;
369+
C = dpas<8, RepeatCount, ResNaturalType, ResNaturalType, BNaturalType,
370+
ANaturalType, BPrec, APrec>(C, B, A);
371+
} else {
372+
// Compute C = AxB;
373+
C = dpas<8, RepeatCount, ResNaturalType, BNaturalType, ANaturalType,
374+
BPrec, APrec>(B, A);
375+
}
357376
}
358377

359378
C.copy_to(Res);
@@ -396,11 +415,40 @@ bool test(queue &Q, bool Print) {
396415
}
397416

398417
template <int SystolicDepth, int RepeatCount, dpas_argument_type T1,
399-
dpas_argument_type T2>
418+
dpas_argument_type T2, bool LetDeduceArgs = false>
400419
bool tests(queue Q, bool Print) {
401420
bool Passed = true;
402421
constexpr bool UseSrc0 = true;
403-
Passed &= test<SystolicDepth, RepeatCount, T1, T2, UseSrc0>(Q, Print);
404-
Passed &= test<SystolicDepth, RepeatCount, T1, T2, !UseSrc0>(Q, Print);
422+
auto Dev = Q.get_device();
423+
424+
// Detect the execution size.
425+
// The device trait is not implemented for esimd_emulator. Use both 8 and 16.
426+
int ExecSize;
427+
bool IsEmulator = false;
428+
try {
429+
ExecSize = Dev.get_info<ext::intel::info::device::gpu_eu_simd_width>();
430+
} catch (sycl::exception e) {
431+
IsEmulator = true;
432+
}
433+
assert((IsEmulator || (ExecSize == 8 || ExecSize == 16)) &&
434+
"Execution size must be 8 or 16");
435+
436+
if (ExecSize == 16 || IsEmulator) {
437+
Passed &=
438+
test<SystolicDepth, RepeatCount, T1, T2, UseSrc0, 16, LetDeduceArgs>(
439+
Q, Print);
440+
Passed &=
441+
test<SystolicDepth, RepeatCount, T1, T2, !UseSrc0, 16, LetDeduceArgs>(
442+
Q, Print);
443+
}
444+
if (ExecSize == 8 || IsEmulator) {
445+
Passed &=
446+
test<SystolicDepth, RepeatCount, T1, T2, UseSrc0, 8, LetDeduceArgs>(
447+
Q, Print);
448+
Passed &=
449+
test<SystolicDepth, RepeatCount, T1, T2, !UseSrc0, 8, LetDeduceArgs>(
450+
Q, Print);
451+
}
452+
405453
return Passed;
406454
}

SYCL/ESIMD/dpas/dpas_fp16.cpp

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,9 @@
55
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
66
//
77
//===----------------------------------------------------------------------===//
8-
// REQUIRES: gpu-intel-pvc || esimd_emulator
8+
// REQUIRES: gpu-intel-pvc || gpu-intel-dg2 || esimd_emulator
99
// UNSUPPORTED: cuda || hip
10-
// RUN: %clangxx -fsycl %s -o %t.out
10+
// RUN: %clangxx -fsycl -fsycl-device-code-split=per_kernel %s -o %t.out
1111
// RUN: %GPU_RUN_PLACEHOLDER %t.out
1212

1313
// This test verifies DPAS support for float16.
@@ -20,14 +20,14 @@ int main(int argc, const char *argv[]) {
2020
bool Print = argc > 1 && std::string(argv[1]) == "-debug";
2121
bool Passed = true;
2222

23-
// Test unsigned 2-bit integers./
24-
Passed &= tests<8, 8, fp16, fp16>(Q, Print);
25-
Passed &= tests<8, 4, fp16, fp16>(Q, Print);
26-
Passed &= tests<8, 1, fp16, fp16>(Q, Print);
23+
constexpr bool LetDeduceArgs = true;
24+
Passed &= tests<8, 8, fp16, fp16, LetDeduceArgs>(Q, Print);
25+
Passed &= tests<8, 4, fp16, fp16, LetDeduceArgs>(Q, Print);
26+
Passed &= tests<8, 1, fp16, fp16, LetDeduceArgs>(Q, Print);
2727

2828
// TODO: Enable these cases when esimd::simd(ptr) constructor is fixed.
29-
// Passed &= tests<8, 5, fp16, fp16>(Q, Print);
30-
// Passed &= tests<8, 3, fp16, fp16>(Q, Print);
29+
// Passed &= tests<8, 5, fp16, fp16, LetDeduceArgs>(Q, Print);
30+
// Passed &= tests<8, 3, fp16, fp16, LetDeduceArgs>(Q, Print);
3131

3232
std::cout << (Passed ? "Test Passed\n" : "Test FAILED\n");
3333
return Passed ? 0 : 1;

SYCL/ESIMD/dpas/dpas_int.cpp

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,9 @@
55
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
66
//
77
//===----------------------------------------------------------------------===//
8-
// REQUIRES: gpu-intel-pvc || esimd_emulator
8+
// REQUIRES: gpu-intel-pvc || gpu-intel-dg2 || esimd_emulator
99
// UNSUPPORTED: cuda || hip
10-
// RUN: %clangxx -fsycl %s -o %t.out
10+
// RUN: %clangxx -fsycl -fsycl-device-code-split=per_kernel %s -o %t.out
1111
// RUN: %GPU_RUN_PLACEHOLDER %t.out
1212

1313
// This test verifies DPAS support for 2,4,8-bit integers.
@@ -20,7 +20,9 @@ int main(int argc, const char *argv[]) {
2020
bool Print = argc > 1 && std::string(argv[1]) == "-debug";
2121
bool Passed = true;
2222

23-
// Test unsigned 2-bit integers./
23+
constexpr bool LetDeduceArgs = true;
24+
25+
// Test unsigned 2-bit integers.
2426
Passed &= tests<8, 8, u2, u2>(Q, Print);
2527
Passed &= tests<8, 4, u2, u2>(Q, Print);
2628
// TODO: enable this case when the problem with simd constructor
@@ -46,7 +48,7 @@ int main(int argc, const char *argv[]) {
4648

4749
// Test couple combinations with 8-bit integers.
4850
Passed &= tests<8, 8, s8, s8>(Q, Print);
49-
Passed &= tests<8, 2, u8, s8>(Q, Print);
51+
Passed &= tests<8, 2, u8, s8, LetDeduceArgs>(Q, Print);
5052

5153
// Test some mixes of 2/4/8-bit integers.
5254
Passed &= tests<8, 8, s2, s4>(Q, Print);

0 commit comments

Comments
 (0)