diff --git a/SYCL/ESIMD/dpas/dpas_common.hpp b/SYCL/ESIMD/dpas/dpas_common.hpp new file mode 100644 index 0000000000..bc2d4452e3 --- /dev/null +++ b/SYCL/ESIMD/dpas/dpas_common.hpp @@ -0,0 +1,406 @@ +//==---------------- dpas_common.hpp - DPC++ ESIMD on-device test ---------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// This file contains the common utility/helper functions for the tests +// verifying DPAS functionality. + +#include "../esimd_test_utils.hpp" + +#include +#include +#include +#include + +using namespace sycl; +using namespace sycl::ext::intel::esimd; +using namespace sycl::ext::intel::esimd::xmx; + +constexpr dpas_argument_type s2 = dpas_argument_type::s2; +constexpr dpas_argument_type u2 = dpas_argument_type::u2; +constexpr dpas_argument_type s4 = dpas_argument_type::s4; +constexpr dpas_argument_type u4 = dpas_argument_type::u4; +constexpr dpas_argument_type s8 = dpas_argument_type::s8; +constexpr dpas_argument_type u8 = dpas_argument_type::u8; + +constexpr dpas_argument_type fp16 = dpas_argument_type::fp16; +constexpr dpas_argument_type bf16 = dpas_argument_type::bf16; + +std::string toString(dpas_argument_type T) { + switch (T) { + case dpas_argument_type::s2: + return "s2"; + case dpas_argument_type::u2: + return "u2"; + case dpas_argument_type::s4: + return "s4"; + case dpas_argument_type::u4: + return "u4"; + case dpas_argument_type::s8: + return "s8"; + case dpas_argument_type::u8: + return "u8"; + case dpas_argument_type::fp16: + return "fp16"; + case dpas_argument_type::bf16: + return "bf16"; + case dpas_argument_type::tf32: + return "tf32"; + case dpas_argument_type::S1: + case dpas_argument_type::U1: + case dpas_argument_type::Invalid: + return "UNSUPPORTED"; + } + return "UNRECOGNIZED"; +} + +template struct DpasPrintType { + static constexpr bool is_sint = T == dpas_argument_type::s2 || + T == dpas_argument_type::s4 || + T == dpas_argument_type::s8; + static constexpr bool is_uint = T == dpas_argument_type::u2 || + T == dpas_argument_type::u4 || + T == dpas_argument_type::u8; + static constexpr bool is_fp = T == dpas_argument_type::FP16 || + T == dpas_argument_type::bf16 || + T == dpas_argument_type::tf32; + + using type = std::conditional_t< + is_fp, float, + std::conditional_t>>; +}; + +template struct getIntTypeWithSize { + using type = + std::conditional_t>; +}; + +template struct DpasNaturalOperandType { + static constexpr bool is_sint = T == dpas_argument_type::s2 || + T == dpas_argument_type::s4 || + T == dpas_argument_type::s8; + static constexpr bool is_uint = T == dpas_argument_type::u2 || + T == dpas_argument_type::u4 || + T == dpas_argument_type::u8; + + static constexpr bool is_fp16 = T == dpas_argument_type::fp16; + static constexpr bool is_bf16 = T == dpas_argument_type::bf16; + static constexpr bool is_tf32 = T == dpas_argument_type::tf32; + + // TODO: support tf32 here. + using type = std::conditional_t< + is_sint, signed char, + std::conditional_t< + is_uint, unsigned char, + std::conditional_t< + is_fp16, sycl::half, + std::conditional< + is_bf16, sycl::ext::oneapi::experimental::bfloat16, void>>>>; +}; + +template constexpr int getBitSize() { + switch (T) { + case dpas_argument_type::s2: + case dpas_argument_type::u2: + return 2; + + case dpas_argument_type::s4: + case dpas_argument_type::u4: + return 4; + + case dpas_argument_type::s8: + case dpas_argument_type::u8: + return 8; + case dpas_argument_type::fp16: + case dpas_argument_type::bf16: + return 16; + + case dpas_argument_type::tf32: + return 32; + } + return 0; +} + +std::string toString(dpas_argument_type T1, dpas_argument_type T2) { + return std::string("{") + toString(T1) + ", " + toString(T2) + "}"; +} + +template +void writeToHorizontallyPackedMatrix(void *VVec, int Row, int Col, + ElemT Value) { + constexpr int ElemBitSize = getBitSize(); + + ElemT *Vec = reinterpret_cast(VVec); + + // 1. Find and read the target 'unsigned int' element. + // THe unpacked matrix has dimensions: NumRows*NumCols + constexpr int ElemsInElemT = sizeof(ElemT) * 8 / ElemBitSize; + int UnpackedLinearIndex = Row * NumCols + Col; + int PackedLinearIndex = UnpackedLinearIndex / ElemsInElemT; + + // 2. Update the corresponding bits of the target element. + if constexpr (ElemBitSize == sizeof(ElemT) * 8) { + Vec[PackedLinearIndex] = Value; + } else { + ElemT TargetElem = Vec[PackedLinearIndex]; + // TargetElem has 2 or more elements in it. Need to extract one. + // TODO: for now assume that is the case only for 2 or 4-bit integers. + assert((ElemBitSize == 2 || ElemBitSize == 4) && "Unexpected element type"); + + unsigned int Offset = (UnpackedLinearIndex % ElemsInElemT) * ElemBitSize; + unsigned int Mask = (1 << ElemBitSize) - 1; + + Value = (Value & Mask) << Offset; + Mask = Mask << Offset; + TargetElem = (TargetElem & ~Mask) | Value; + Vec[PackedLinearIndex] = TargetElem; + } +} + +template +ReadT readFromHorizontallyPackedMatrix(void *VVec, int Row, int Col) { + constexpr int ElemBitSize = ArgPrecision == dpas_argument_type::Invalid + ? (sizeof(ReadT) * 8) + : getBitSize(); + using ElemT = + std::conditional_t::type>; + ElemT *Vec = reinterpret_cast(VVec); + + // 1. Find and read the target 'unsigned int' element. + // The unpacked matrix has dimensions: NumRows*NumCols + constexpr int ElemsInElemT = sizeof(ElemT) * 8 / ElemBitSize; + int UnpackedLinearIndex = Row * NumCols + Col; + int PackedLinearIndex = UnpackedLinearIndex / ElemsInElemT; + ElemT TargetElem = Vec[PackedLinearIndex]; + + // 2. Extract, add sign and return the value. + if constexpr (ElemBitSize == sizeof(ElemT) * 8) { + return static_cast(TargetElem); + } else { + // TargetElem has 2 or more elements in it. Need to extract one. + // TODO: for now assume that is the case only for 2 or 4-bit integers. + assert((ElemBitSize == 2 || ElemBitSize == 4) && "Unexpected element type"); + unsigned int Offset = (UnpackedLinearIndex % ElemsInElemT) * ElemBitSize; + unsigned int Mask = (static_cast(1) << ElemBitSize) - 1; + ElemT Value = (TargetElem >> Offset) & Mask; + if constexpr (std::is_signed_v) { + Value <<= ((sizeof(ElemT) * 8) - ElemBitSize); + Value >>= ((sizeof(ElemT) * 8) - ElemBitSize); + } + return Value; + } +} + +template +void writeToVerticallyPackedMatrix(void *VVec, int Row, int Col, ElemT Value) { + int *Vec = reinterpret_cast(VVec); + constexpr int ElemBitSize = getBitSize(); + + // 1. Find and read the target 'int' element. + // The unpacked matrix has dimensions: NumRows*NumCols. + constexpr int ElemsInInt = 32 / ElemBitSize; + int PackedRow = Row / ElemsInInt; + int PackedLinearIndex = PackedRow * NumCols + Col; + int TargetElem = Vec[PackedLinearIndex]; + + // Insert sub-element 'Value' into 32-bit int and write back to matrix. + int ElemBitOffset = (Row % ElemsInInt) * ElemBitSize; + int Mask = (static_cast(1) << ElemBitSize) - 1; + using IType = typename getIntTypeWithSize::type; + int IValue = sycl::bit_cast(Value); + IValue = (IValue & Mask) << ElemBitOffset; + Mask = Mask << ElemBitOffset; + TargetElem = (TargetElem & ~Mask) | IValue; + Vec[PackedLinearIndex] = TargetElem; +} + +template +ReadT readFromVerticallyPackedMatrix(void *VVec, int Row, int Col) { + constexpr int ElemBitSize = getBitSize(); + using ElemT = typename DpasNaturalOperandType::type; + int *Vec = reinterpret_cast(VVec); + + // 1. Find and read the target 'int' element. + // The unpacked matrix has dimensions: NumRows*NumCols. + constexpr int ElemsInInt = 32 / ElemBitSize; + + int PackedRow = Row / ElemsInInt; + int TargetElem = Vec[PackedRow * NumCols + Col]; + + // 2. Extract the queried sub-elem from 32-bit int, bit-cast to ReadT and + // return. + int ElemBitOffset = (Row % ElemsInInt) * ElemBitSize; + unsigned int Mask = (static_cast(1) << ElemBitSize) - 1; + int Value = (TargetElem >> ElemBitOffset) & Mask; + if constexpr (std::is_signed_v && std::is_integral_v) { + Value <<= (32 - ElemBitSize); + Value >>= (32 - ElemBitSize); + return Value; + } else { + using IType = typename getIntTypeWithSize::type; + IType IValue = static_cast(Value); + return sycl::bit_cast(IValue); + } +} + +template +void printMatrix(void *Vec, std::string Msg) { + std::cout << Msg << "(" << M << "x" << N + << "), element precision = " << toString(ArgPrecision) << std::endl; + for (int I = 0; I < M; I++) { + for (int J = 0; J < N; J++) { + + ReadT Value; + if constexpr (IsHorizontalPack) + Value = readFromHorizontallyPackedMatrix( + Vec, I, J); + else + Value = readFromVerticallyPackedMatrix(Vec, + I, J); + + if constexpr (std::is_integral_v) + printf("%3d", Value); + else + std::cout << (float)Value; + if (J + 1 < N) + std::cout << ","; + } + std::cout << std::endl; + } +} + +template +bool test(queue &Q, bool Print) { + constexpr unsigned Size = 128; + constexpr unsigned VL = 16; + + constexpr int AElemBitSize = getBitSize(); + constexpr int BElemBitSize = getBitSize(); + constexpr int OpsPerChannel = + std::min(32 / std::max(AElemBitSize, BElemBitSize), 8); + + using BPrintT = typename DpasPrintType::type; + using APrintT = typename DpasPrintType::type; + using ABPrintT = decltype(std::declval() * std::declval()); + + // A(_Mx_K) * B(_Kx_N) + C(_Mx_N) + // where: + constexpr int M = RepeatCount; + constexpr int K = SystolicDepth * OpsPerChannel; + constexpr int N = 16; // Execution size: 16 for PVC. + + auto Dev = Q.get_device(); + std::cout << "Running test case " << toString(BPrec, APrec) + << " with UseSrc0 = " << UseSrc0 << " on " + << Dev.get_info() << "\n"; + + using ANaturalType = typename DpasNaturalOperandType::type; + using BNaturalType = typename DpasNaturalOperandType::type; + using ResNaturalType = ABPrintT; + constexpr int APackedSize = M * K * AElemBitSize / (sizeof(ANaturalType) * 8); + constexpr int BPackedSize = K * N * BElemBitSize / (sizeof(BNaturalType) * 8); + + auto APacked = aligned_alloc_shared(128, APackedSize, Q); + auto BPacked = aligned_alloc_shared(128, BPackedSize, Q); + auto Res = aligned_alloc_shared(128, M * N, Q); + // Init APacked; + int Value = 0; + for (int II = 0; II < M; II++) { + for (int JJ = 0; JJ < K; JJ++) { + Value++; + writeToHorizontallyPackedMatrix( + APacked, II, JJ, static_cast(Value)); + } + } + if (Print) + printMatrix(APacked, "A"); + + // Init BPacked; + for (int II = 0; II < K; II++) { + for (int JJ = 0; JJ < N; JJ++) { + int Value = (II + JJ % 4) == 0 ? 1 : (2 + II + JJ) % 3; + writeToVerticallyPackedMatrix( + BPacked, II, JJ, static_cast(Value)); + assert(Value == (int)(static_cast(Value)) && "ERROR"); + } + } + if (Print) + printMatrix(BPacked, "B"); + + Q.single_task([=]() SYCL_ESIMD_KERNEL { + simd A(APacked, overaligned_tag<16>{}); + simd B(BPacked, overaligned_tag<16>{}); + simd C; + + if constexpr (UseSrc0) { + // Compute C = C + AxB; + C = 1; + C = dpas<8, RepeatCount, ResNaturalType, ResNaturalType, BNaturalType, + ANaturalType, BPrec, APrec>(C, B, A); + } else { + // Compute C = AxB; + C = dpas<8, RepeatCount, ResNaturalType, BNaturalType, ANaturalType, + BPrec, APrec>(B, A); + } + + C.copy_to(Res); + }).wait(); + + if (Print) + printMatrix(Res, "C"); + + int NErrors = 0; + auto A = APacked; + auto B = BPacked; + for (int II = 0; II < M && NErrors < 10; II++) { + for (int JJ = 0; JJ < N && NErrors < 10; JJ++) { + ABPrintT GoldRes = 0; + if constexpr (UseSrc0) + GoldRes = 1; + + // Res(i,j) = C(i,j) = A(i,*)*B(*,j)) + for (int KK = 0; KK < K; KK++) { + APrintT AVal = + readFromHorizontallyPackedMatrix(A, II, KK); + BPrintT BVal = + readFromVerticallyPackedMatrix(B, KK, JJ); + GoldRes += AVal * BVal; + } + // Res(i,j) is Res[N*i + j] + if (Res[N * II + JJ] != GoldRes) { + NErrors++; + std::cerr << "Res[" << II << ", " << JJ << "] = (" << Res[M * II + JJ] + << ") != expected (" << GoldRes << ")" << std::endl; + } + } // end for JJ + } // end for II + + free(Res, Q); + free(APacked, Q); + free(BPacked, Q); + return NErrors == 0; +} + +template +bool tests(queue Q, bool Print) { + bool Passed = true; + constexpr bool UseSrc0 = true; + Passed &= test(Q, Print); + Passed &= test(Q, Print); + return Passed; +} diff --git a/SYCL/ESIMD/dpas/dpas_fp16.cpp b/SYCL/ESIMD/dpas/dpas_fp16.cpp new file mode 100644 index 0000000000..9e672a522d --- /dev/null +++ b/SYCL/ESIMD/dpas/dpas_fp16.cpp @@ -0,0 +1,34 @@ +//==---------------- dpas_fp16.cpp - DPC++ ESIMD on-device test ----------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: gpu-intel-pvc || esimd_emulator +// UNSUPPORTED: cuda || hip +// RUN: %clangxx -fsycl %s -o %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +// This test verifies DPAS support for float16. + +#include "dpas_common.hpp" + +int main(int argc, const char *argv[]) { + queue Q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler()); + + bool Print = argc > 1 && std::string(argv[1]) == "-debug"; + bool Passed = true; + + // Test unsigned 2-bit integers./ + Passed &= tests<8, 8, fp16, fp16>(Q, Print); + Passed &= tests<8, 4, fp16, fp16>(Q, Print); + Passed &= tests<8, 1, fp16, fp16>(Q, Print); + + // TODO: Enable these cases when esimd::simd(ptr) constructor is fixed. + // Passed &= tests<8, 5, fp16, fp16>(Q, Print); + // Passed &= tests<8, 3, fp16, fp16>(Q, Print); + + std::cout << (Passed ? "Test Passed\n" : "Test FAILED\n"); + return Passed ? 0 : 1; +} diff --git a/SYCL/ESIMD/dpas/dpas_int.cpp b/SYCL/ESIMD/dpas/dpas_int.cpp new file mode 100644 index 0000000000..122dea02d3 --- /dev/null +++ b/SYCL/ESIMD/dpas/dpas_int.cpp @@ -0,0 +1,58 @@ +//==---------------- dpas_int.cpp - DPC++ ESIMD on-device test ------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: gpu-intel-pvc || esimd_emulator +// UNSUPPORTED: cuda || hip +// RUN: %clangxx -fsycl %s -o %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +// This test verifies DPAS support for 2,4,8-bit integers. + +#include "dpas_common.hpp" + +int main(int argc, const char *argv[]) { + queue Q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler()); + + bool Print = argc > 1 && std::string(argv[1]) == "-debug"; + bool Passed = true; + + // Test unsigned 2-bit integers./ + Passed &= tests<8, 8, u2, u2>(Q, Print); + Passed &= tests<8, 4, u2, u2>(Q, Print); + // TODO: enable this case when the problem with simd constructor + // is resolved. + // Passed &= tests<8, 3, u2, u2>(Q, Print); + Passed &= tests<8, 1, u2, u2>(Q, Print); + + // Test signed 2-bit integers. + Passed &= tests<8, 8, s2, s2>(Q, Print); + // TODO: enable this case when the problem with simd constructor + // is resolved. + // Passed &= tests<8, 5, s2, s2>(Q, Print); + Passed &= tests<8, 2, s2, s2>(Q, Print); + Passed &= tests<8, 1, s2, s2>(Q, Print); + + // Test the mix of signed and unsigned 2-bit integers. + Passed &= tests<8, 1, u2, s2>(Q, Print); + Passed &= tests<8, 1, s2, u2>(Q, Print); + + // Test couple combinations with 4-bit integers. + Passed &= tests<8, 8, s4, s4>(Q, Print); + Passed &= tests<8, 4, u4, s4>(Q, Print); + + // Test couple combinations with 8-bit integers. + Passed &= tests<8, 8, s8, s8>(Q, Print); + Passed &= tests<8, 2, u8, s8>(Q, Print); + + // Test some mixes of 2/4/8-bit integers. + Passed &= tests<8, 8, s2, s4>(Q, Print); + Passed &= tests<8, 1, s2, s8>(Q, Print); + Passed &= tests<8, 4, s8, s4>(Q, Print); + + std::cout << (Passed ? "Test Passed\n" : "Test FAILED\n"); + return Passed ? 0 : 1; +}