From ab4118f78c6a3737c0012eb701acdf94b4a3a558 Mon Sep 17 00:00:00 2001 From: Bing1 Yu Date: Fri, 3 Sep 2021 14:55:01 +0800 Subject: [PATCH 1/2] [SYCL][Matrix] Add support for joint_matrix_mad when A is uint8/int8 and B is uint8/int8 With this patch, more cases for C=A*B+C can be realized: 1. A is uint8, B is int8 2. A is int8, B is uint8 3. A is uint8, B is uint8 --- sycl/include/CL/__spirv/spirv_ops.hpp | 36 ++++ .../sycl/ext/oneapi/matrix/matrix-jit.hpp | 25 ++- sycl/test/matrix/matrix-int8-uint8-test.cpp | 168 ++++++++++++++++++ sycl/test/matrix/matrix-uint8-int8-test.cpp | 168 ++++++++++++++++++ sycl/test/matrix/matrix-uint8-uint8-test.cpp | 168 ++++++++++++++++++ 5 files changed, 558 insertions(+), 7 deletions(-) create mode 100644 sycl/test/matrix/matrix-int8-uint8-test.cpp create mode 100644 sycl/test/matrix/matrix-uint8-int8-test.cpp create mode 100644 sycl/test/matrix/matrix-uint8-uint8-test.cpp diff --git a/sycl/include/CL/__spirv/spirv_ops.hpp b/sycl/include/CL/__spirv/spirv_ops.hpp index 18ef03cc70607..ff7c9982dd77a 100644 --- a/sycl/include/CL/__spirv/spirv_ops.hpp +++ b/sycl/include/CL/__spirv/spirv_ops.hpp @@ -50,6 +50,42 @@ __spirv_JointMatrixMadINTEL( __spv::__spirv_JointMatrixINTEL *C, __spv::Scope::Flag Sc = __spv::Scope::Flag::Subgroup); +template +extern SYCL_EXTERNAL __spv::__spirv_JointMatrixINTEL * +__spirv_JointMatrixUUMadINTEL( + __spv::__spirv_JointMatrixINTEL *A, + __spv::__spirv_JointMatrixINTEL *B, + __spv::__spirv_JointMatrixINTEL *C, + __spv::Scope::Flag Sc = __spv::Scope::Flag::Subgroup); + +template +extern SYCL_EXTERNAL __spv::__spirv_JointMatrixINTEL * +__spirv_JointMatrixUSMadINTEL( + __spv::__spirv_JointMatrixINTEL *A, + __spv::__spirv_JointMatrixINTEL *B, + __spv::__spirv_JointMatrixINTEL *C, + __spv::Scope::Flag Sc = __spv::Scope::Flag::Subgroup); + +template +extern SYCL_EXTERNAL __spv::__spirv_JointMatrixINTEL * +__spirv_JointMatrixSUMadINTEL( + __spv::__spirv_JointMatrixINTEL *A, + __spv::__spirv_JointMatrixINTEL *B, + __spv::__spirv_JointMatrixINTEL *C, + __spv::Scope::Flag Sc = __spv::Scope::Flag::Subgroup); + #ifndef __SPIRV_BUILTIN_DECLARATIONS__ #error \ "SPIR-V built-ins are not available. Please set -fdeclare-spirv-builtins flag." diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-jit.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-jit.hpp index 5343c3ccb301c..e81881e52f6a7 100644 --- a/sycl/include/sycl/ext/oneapi/matrix/matrix-jit.hpp +++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-jit.hpp @@ -160,16 +160,27 @@ joint_matrix_store(Group sg, #endif // __SYCL_DEVICE_ONLY__ } -template -inline __SYCL_ALWAYS_INLINE joint_matrix +inline __SYCL_ALWAYS_INLINE joint_matrix joint_matrix_mad(Group sg, joint_matrix &mA, - joint_matrix &mB, - joint_matrix &mC) { + joint_matrix &mB, + joint_matrix &mC) { #ifdef __SYCL_DEVICE_ONLY__ - joint_matrix res(sg); - res.spvm = __spirv_JointMatrixMadINTEL(mA.spvm, mB.spvm, mC.spvm); + joint_matrix res(sg); + if constexpr (std::is_same::value && + std::is_same::value && + std::is_same::value) + res.spvm = __spirv_JointMatrixMadINTEL(mA.spvm, mB.spvm, mC.spvm); + else if constexpr (std::is_unsigned::value && std::is_unsigned::value) + res.spvm = __spirv_JointMatrixUUMadINTEL(mA.spvm, mB.spvm, mC.spvm); + else if constexpr (std::is_signed::value && std::is_unsigned::value) + res.spvm = __spirv_JointMatrixSUMadINTEL(mA.spvm, mB.spvm, mC.spvm); + else if constexpr (std::is_unsigned::value && std::is_signed::value) + res.spvm = __spirv_JointMatrixUSMadINTEL(mA.spvm, mB.spvm, mC.spvm); + else + res.spvm = __spirv_JointMatrixMadINTEL(mA.spvm, mB.spvm, mC.spvm); return res; #else (void)sg; diff --git a/sycl/test/matrix/matrix-int8-uint8-test.cpp b/sycl/test/matrix/matrix-int8-uint8-test.cpp new file mode 100644 index 0000000000000..f6ed60f4ecf83 --- /dev/null +++ b/sycl/test/matrix/matrix-int8-uint8-test.cpp @@ -0,0 +1,168 @@ +// RUN: %clangxx -march=sapphirerapids -fsycl -O2 %s -o %t.out +#include +#if (SYCL_EXT_ONEAPI_MATRIX == 2) +#include + +using namespace sycl; +using namespace sycl::ext::oneapi::experimental::matrix; + +#define TILE_SZ 16 +#define TM (TILE_SZ-4) +#define TN (TILE_SZ-4) +#define TK (4 * TILE_SZ-16) + +#define SG_SZ 16 + +template struct big_matrix{ +public: + T *mat; + +public: + T *get_data() { return mat; } + void set_data(T *data) { mat = data; } + big_matrix(T *data) : mat(data) { + } +}; + +template +void matrix_multiply(big_matrix &C, big_matrix &A, big_matrix &B) { + size_t M = NUM_ROWS_C; + size_t N = NUM_COLS_C; + size_t K = NUM_COLS_A; + // B => K/4 x N*4, A => M x K, C => M, N + // stride should be X's cols, e.g., B's stirde = N*4 + assert(NUM_ROWS_C == NUM_ROWS_A && NUM_COLS_A == NUM_ROWS_B * 4); + size_t NDRangeM = M / TM; + size_t NDRangeN = N / TN; + buffer bufA(A.get_data(), range<2>(M, K)); + buffer bufB(B.get_data(), range<2>(K, N)); + buffer bufC(C.get_data(), range<2>(M, N)); + + queue q; + q.submit([&](handler &cgh) { + auto accC = bufC.get_access(cgh); + auto accA = bufA.get_access(cgh); + auto accB = bufB.get_access(cgh); + + cgh.parallel_for( + nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), + [accA, accB, accC, M, N, K](nd_item<2> spmd_item) + + { + // The submatrix API has to be accessed by all the workitems in a + // subgroup these functions will be called once by the subgroup no + // code divergence between the workitems + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + ext::oneapi::sub_group sg = spmd_item.get_sub_group(); + joint_matrix sub_a(sg); + // For B, since current implementation does not support non-packed layout, + // users need to specify the updated VNNI sizes along with the packed_b layout. + // By default, the layout is row_major and size is (TK, TN). + joint_matrix sub_b(sg); + joint_matrix sub_c(sg); + + // AMX: 8 register tiles : 1k byte size, SMmaxxSKmax =16x64 + // strideX = X's cols, so strideC = N, strideA = K, strideB = N*4 + joint_matrix_load(sg, sub_c, + accC.get_pointer() + (sg_startx * TM) * N + + sg_starty / SG_SZ * TN, + N, matrix_layout::row_major); + for (int k = 0; k < K / TK; k += 1) { + joint_matrix_load( + sg, sub_a, accA.get_pointer() + (sg_startx * TM) * K + k * TK, + K, matrix_layout::packed_a); + // Assuming B data is already in VNNI format. + joint_matrix_load(sg, sub_b, + accB.get_pointer() + (k * TK / 4) * (N * 4) + + sg_starty / SG_SZ * TN * 4, + N * 4, matrix_layout::packed_b); + sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); + } + joint_matrix_store(sg, sub_c, + accC.get_pointer() + (sg_startx * TM) * N + + sg_starty / SG_SZ * TN, + N, matrix_layout::row_major); + }); // parallel for + }).wait(); +} + +static constexpr size_t MATRIX_M = TM * 2; +static constexpr size_t MATRIX_N = TN * 2; +static constexpr size_t MATRIX_K = TK * 2; +int8_t A[MATRIX_M][MATRIX_K]; +uint8_t B[MATRIX_K / 4][MATRIX_N * 4]; +int32_t C[MATRIX_M][MATRIX_N]; +int32_t D[MATRIX_M][MATRIX_N]; + +void matrix_multiply_ref(int32_t *A_mem, int32_t *B_mem, int32_t *C_mem, int M, + int N, int K) { + // tiling + for (int m = 0; m < M; m++) + for (int n = 0; n < N; n++) { + for (int k = 0; k < K; k++) { + char *va = (char *)(A_mem + m * K + k); + char *vb = (char *)(B_mem + k * N + n); + int acc = *(C_mem + m * N + n); + for (int i = 0; i < 4; i++) { + acc += (va[i] * vb[i]); + } + *(C_mem + m * N + n) = acc; + } + } +} + +int main() { + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_K; j++) { + A[i][j] = i+2*j; + } + } + for (int i = 0; i < MATRIX_K / 4; i++) { + for (int j = 0; j < MATRIX_N * 4; j++) { + B[i][j] = i+j; + } + } + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_N; j++) { + C[i][j] = 1; + D[i][j] = 1; + } + } + + big_matrix MC((int32_t *)&C); + big_matrix MD((int32_t *)&D); + big_matrix MA((int8_t *)&A); + big_matrix MB((uint8_t *)&B); + matrix_multiply(MC, MA, MB); + matrix_multiply_ref((int32_t *)A, (int32_t *)B, (int32_t *)D, MATRIX_M, + MATRIX_N, MATRIX_K / 4); + + bool res = true; + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_N; j++) { + if (C[i][j] != D[i][j]) + res = false; + } + } + if (res) + std::cout << "passed\n"; + else + std::cout << "failed\n"; + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_N; j++) + std::cout << C[i][j] << ", "; + std::cout << "\n"; + } + std::cout << std::endl; + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_N; j++) + std::cout << D[i][j] << ", "; + std::cout << "\n"; + } +} +#endif // (SYCL_EXT_ONEAPI_MATRIX == 2) diff --git a/sycl/test/matrix/matrix-uint8-int8-test.cpp b/sycl/test/matrix/matrix-uint8-int8-test.cpp new file mode 100644 index 0000000000000..0be24e4cbd79a --- /dev/null +++ b/sycl/test/matrix/matrix-uint8-int8-test.cpp @@ -0,0 +1,168 @@ +// RUN: %clangxx -march=sapphirerapids -fsycl -O2 %s -o %t.out +#include +#if (SYCL_EXT_ONEAPI_MATRIX == 2) +#include + +using namespace sycl; +using namespace sycl::ext::oneapi::experimental::matrix; + +#define TILE_SZ 16 +#define TM (TILE_SZ-4) +#define TN (TILE_SZ-4) +#define TK (4 * TILE_SZ-16) + +#define SG_SZ 16 + +template struct big_matrix{ +public: + T *mat; + +public: + T *get_data() { return mat; } + void set_data(T *data) { mat = data; } + big_matrix(T *data) : mat(data) { + } +}; + +template +void matrix_multiply(big_matrix &C, big_matrix &A, big_matrix &B) { + size_t M = NUM_ROWS_C; + size_t N = NUM_COLS_C; + size_t K = NUM_COLS_A; + // B => K/4 x N*4, A => M x K, C => M, N + // stride should be X's cols, e.g., B's stirde = N*4 + assert(NUM_ROWS_C == NUM_ROWS_A && NUM_COLS_A == NUM_ROWS_B * 4); + size_t NDRangeM = M / TM; + size_t NDRangeN = N / TN; + buffer bufA(A.get_data(), range<2>(M, K)); + buffer bufB(B.get_data(), range<2>(K, N)); + buffer bufC(C.get_data(), range<2>(M, N)); + + queue q; + q.submit([&](handler &cgh) { + auto accC = bufC.get_access(cgh); + auto accA = bufA.get_access(cgh); + auto accB = bufB.get_access(cgh); + + cgh.parallel_for( + nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), + [accA, accB, accC, M, N, K](nd_item<2> spmd_item) + + { + // The submatrix API has to be accessed by all the workitems in a + // subgroup these functions will be called once by the subgroup no + // code divergence between the workitems + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + ext::oneapi::sub_group sg = spmd_item.get_sub_group(); + joint_matrix sub_a(sg); + // For B, since current implementation does not support non-packed layout, + // users need to specify the updated VNNI sizes along with the packed_b layout. + // By default, the layout is row_major and size is (TK, TN). + joint_matrix sub_b(sg); + joint_matrix sub_c(sg); + + // AMX: 8 register tiles : 1k byte size, SMmaxxSKmax =16x64 + // strideX = X's cols, so strideC = N, strideA = K, strideB = N*4 + joint_matrix_load(sg, sub_c, + accC.get_pointer() + (sg_startx * TM) * N + + sg_starty / SG_SZ * TN, + N, matrix_layout::row_major); + for (int k = 0; k < K / TK; k += 1) { + joint_matrix_load( + sg, sub_a, accA.get_pointer() + (sg_startx * TM) * K + k * TK, + K, matrix_layout::packed_a); + // Assuming B data is already in VNNI format. + joint_matrix_load(sg, sub_b, + accB.get_pointer() + (k * TK / 4) * (N * 4) + + sg_starty / SG_SZ * TN * 4, + N * 4, matrix_layout::packed_b); + sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); + } + joint_matrix_store(sg, sub_c, + accC.get_pointer() + (sg_startx * TM) * N + + sg_starty / SG_SZ * TN, + N, matrix_layout::row_major); + }); // parallel for + }).wait(); +} + +static constexpr size_t MATRIX_M = TM * 2; +static constexpr size_t MATRIX_N = TN * 2; +static constexpr size_t MATRIX_K = TK * 2; +uint8_t A[MATRIX_M][MATRIX_K]; +int8_t B[MATRIX_K / 4][MATRIX_N * 4]; +int32_t C[MATRIX_M][MATRIX_N]; +int32_t D[MATRIX_M][MATRIX_N]; + +void matrix_multiply_ref(int32_t *A_mem, int32_t *B_mem, int32_t *C_mem, int M, + int N, int K) { + // tiling + for (int m = 0; m < M; m++) + for (int n = 0; n < N; n++) { + for (int k = 0; k < K; k++) { + char *va = (char *)(A_mem + m * K + k); + char *vb = (char *)(B_mem + k * N + n); + int acc = *(C_mem + m * N + n); + for (int i = 0; i < 4; i++) { + acc += (va[i] * vb[i]); + } + *(C_mem + m * N + n) = acc; + } + } +} + +int main() { + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_K; j++) { + A[i][j] = i+2*j; + } + } + for (int i = 0; i < MATRIX_K / 4; i++) { + for (int j = 0; j < MATRIX_N * 4; j++) { + B[i][j] = i+j; + } + } + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_N; j++) { + C[i][j] = 1; + D[i][j] = 1; + } + } + + big_matrix MC((int32_t *)&C); + big_matrix MD((int32_t *)&D); + big_matrix MA((uint8_t *)&A); + big_matrix MB((int8_t *)&B); + matrix_multiply(MC, MA, MB); + matrix_multiply_ref((int32_t *)A, (int32_t *)B, (int32_t *)D, MATRIX_M, + MATRIX_N, MATRIX_K / 4); + + bool res = true; + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_N; j++) { + if (C[i][j] != D[i][j]) + res = false; + } + } + if (res) + std::cout << "passed\n"; + else + std::cout << "failed\n"; + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_N; j++) + std::cout << C[i][j] << ", "; + std::cout << "\n"; + } + std::cout << std::endl; + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_N; j++) + std::cout << D[i][j] << ", "; + std::cout << "\n"; + } +} +#endif // (SYCL_EXT_ONEAPI_MATRIX == 2) diff --git a/sycl/test/matrix/matrix-uint8-uint8-test.cpp b/sycl/test/matrix/matrix-uint8-uint8-test.cpp new file mode 100644 index 0000000000000..6f743c1e71dca --- /dev/null +++ b/sycl/test/matrix/matrix-uint8-uint8-test.cpp @@ -0,0 +1,168 @@ +// RUN: %clangxx -march=sapphirerapids -fsycl -O2 %s -o %t.out +#include +#if (SYCL_EXT_ONEAPI_MATRIX == 2) +#include + +using namespace sycl; +using namespace sycl::ext::oneapi::experimental::matrix; + +#define TILE_SZ 16 +#define TM (TILE_SZ-4) +#define TN (TILE_SZ-4) +#define TK (4 * TILE_SZ-16) + +#define SG_SZ 16 + +template struct big_matrix{ +public: + T *mat; + +public: + T *get_data() { return mat; } + void set_data(T *data) { mat = data; } + big_matrix(T *data) : mat(data) { + } +}; + +template +void matrix_multiply(big_matrix &C, big_matrix &A, big_matrix &B) { + size_t M = NUM_ROWS_C; + size_t N = NUM_COLS_C; + size_t K = NUM_COLS_A; + // B => K/4 x N*4, A => M x K, C => M, N + // stride should be X's cols, e.g., B's stirde = N*4 + assert(NUM_ROWS_C == NUM_ROWS_A && NUM_COLS_A == NUM_ROWS_B * 4); + size_t NDRangeM = M / TM; + size_t NDRangeN = N / TN; + buffer bufA(A.get_data(), range<2>(M, K)); + buffer bufB(B.get_data(), range<2>(K, N)); + buffer bufC(C.get_data(), range<2>(M, N)); + + queue q; + q.submit([&](handler &cgh) { + auto accC = bufC.get_access(cgh); + auto accA = bufA.get_access(cgh); + auto accB = bufB.get_access(cgh); + + cgh.parallel_for( + nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), + [accA, accB, accC, M, N, K](nd_item<2> spmd_item) + + { + // The submatrix API has to be accessed by all the workitems in a + // subgroup these functions will be called once by the subgroup no + // code divergence between the workitems + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + ext::oneapi::sub_group sg = spmd_item.get_sub_group(); + joint_matrix sub_a(sg); + // For B, since current implementation does not support non-packed layout, + // users need to specify the updated VNNI sizes along with the packed_b layout. + // By default, the layout is row_major and size is (TK, TN). + joint_matrix sub_b(sg); + joint_matrix sub_c(sg); + + // AMX: 8 register tiles : 1k byte size, SMmaxxSKmax =16x64 + // strideX = X's cols, so strideC = N, strideA = K, strideB = N*4 + joint_matrix_load(sg, sub_c, + accC.get_pointer() + (sg_startx * TM) * N + + sg_starty / SG_SZ * TN, + N, matrix_layout::row_major); + for (int k = 0; k < K / TK; k += 1) { + joint_matrix_load( + sg, sub_a, accA.get_pointer() + (sg_startx * TM) * K + k * TK, + K, matrix_layout::packed_a); + // Assuming B data is already in VNNI format. + joint_matrix_load(sg, sub_b, + accB.get_pointer() + (k * TK / 4) * (N * 4) + + sg_starty / SG_SZ * TN * 4, + N * 4, matrix_layout::packed_b); + sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); + } + joint_matrix_store(sg, sub_c, + accC.get_pointer() + (sg_startx * TM) * N + + sg_starty / SG_SZ * TN, + N, matrix_layout::row_major); + }); // parallel for + }).wait(); +} + +static constexpr size_t MATRIX_M = TM * 2; +static constexpr size_t MATRIX_N = TN * 2; +static constexpr size_t MATRIX_K = TK * 2; +uint8_t A[MATRIX_M][MATRIX_K]; +uint8_t B[MATRIX_K / 4][MATRIX_N * 4]; +int32_t C[MATRIX_M][MATRIX_N]; +int32_t D[MATRIX_M][MATRIX_N]; + +void matrix_multiply_ref(int32_t *A_mem, int32_t *B_mem, int32_t *C_mem, int M, + int N, int K) { + // tiling + for (int m = 0; m < M; m++) + for (int n = 0; n < N; n++) { + for (int k = 0; k < K; k++) { + char *va = (char *)(A_mem + m * K + k); + char *vb = (char *)(B_mem + k * N + n); + int acc = *(C_mem + m * N + n); + for (int i = 0; i < 4; i++) { + acc += (va[i] * vb[i]); + } + *(C_mem + m * N + n) = acc; + } + } +} + +int main() { + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_K; j++) { + A[i][j] = i+2*j; + } + } + for (int i = 0; i < MATRIX_K / 4; i++) { + for (int j = 0; j < MATRIX_N * 4; j++) { + B[i][j] = i+j; + } + } + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_N; j++) { + C[i][j] = 1; + D[i][j] = 1; + } + } + + big_matrix MC((int32_t *)&C); + big_matrix MD((int32_t *)&D); + big_matrix MA((uint8_t *)&A); + big_matrix MB((uint8_t *)&B); + matrix_multiply(MC, MA, MB); + matrix_multiply_ref((int32_t *)A, (int32_t *)B, (int32_t *)D, MATRIX_M, + MATRIX_N, MATRIX_K / 4); + + bool res = true; + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_N; j++) { + if (C[i][j] != D[i][j]) + res = false; + } + } + if (res) + std::cout << "passed\n"; + else + std::cout << "failed\n"; + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_N; j++) + std::cout << C[i][j] << ", "; + std::cout << "\n"; + } + std::cout << std::endl; + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_N; j++) + std::cout << D[i][j] << ", "; + std::cout << "\n"; + } +} +#endif // (SYCL_EXT_ONEAPI_MATRIX == 2) From 8c9127fe63de2c98bd10a3eec0bb2cf4f431ef3c Mon Sep 17 00:00:00 2001 From: Bing1 Yu Date: Tue, 7 Sep 2021 22:30:15 +0800 Subject: [PATCH 2/2] Address Dounia's comments: 1. Remove the newly-added testcases since we add them into llvm-test-suite 2. Remove "-march=sapphirerapids" from previous testcases --- sycl/test/matrix/matrix-bf16-test-SG-16.cpp | 2 +- sycl/test/matrix/matrix-bf16-test.cpp | 2 +- sycl/test/matrix/matrix-int8-test-SG-16.cpp | 2 +- sycl/test/matrix/matrix-int8-test.cpp | 2 +- sycl/test/matrix/matrix-int8-uint8-test.cpp | 168 ------------------- sycl/test/matrix/matrix-uint8-int8-test.cpp | 168 ------------------- sycl/test/matrix/matrix-uint8-uint8-test.cpp | 168 ------------------- 7 files changed, 4 insertions(+), 508 deletions(-) delete mode 100644 sycl/test/matrix/matrix-int8-uint8-test.cpp delete mode 100644 sycl/test/matrix/matrix-uint8-int8-test.cpp delete mode 100644 sycl/test/matrix/matrix-uint8-uint8-test.cpp diff --git a/sycl/test/matrix/matrix-bf16-test-SG-16.cpp b/sycl/test/matrix/matrix-bf16-test-SG-16.cpp index 0d7d7e2ef4629..0150c420e3bd9 100644 --- a/sycl/test/matrix/matrix-bf16-test-SG-16.cpp +++ b/sycl/test/matrix/matrix-bf16-test-SG-16.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx -march=sapphirerapids -fsycl -O2 %s -o %t.out +// RUN: %clangxx -fsycl -O2 %s -o %t.out #include #if (SYCL_EXT_ONEAPI_MATRIX == 2) #include diff --git a/sycl/test/matrix/matrix-bf16-test.cpp b/sycl/test/matrix/matrix-bf16-test.cpp index 224846fd6f9e7..efa118a33bff7 100644 --- a/sycl/test/matrix/matrix-bf16-test.cpp +++ b/sycl/test/matrix/matrix-bf16-test.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx -march=sapphirerapids -fsycl -O2 %s -o %t.out +// RUN: %clangxx -fsycl -O2 %s -o %t.out #include #if (SYCL_EXT_ONEAPI_MATRIX == 2) #include diff --git a/sycl/test/matrix/matrix-int8-test-SG-16.cpp b/sycl/test/matrix/matrix-int8-test-SG-16.cpp index 6fead706ea137..0aace5f46c036 100644 --- a/sycl/test/matrix/matrix-int8-test-SG-16.cpp +++ b/sycl/test/matrix/matrix-int8-test-SG-16.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx -march=sapphirerapids -fsycl -O2 %s -o %t.out +// RUN: %clangxx -fsycl -O2 %s -o %t.out #include #if (SYCL_EXT_ONEAPI_MATRIX == 2) #include diff --git a/sycl/test/matrix/matrix-int8-test.cpp b/sycl/test/matrix/matrix-int8-test.cpp index 9692f686e4859..263cc75c1097f 100644 --- a/sycl/test/matrix/matrix-int8-test.cpp +++ b/sycl/test/matrix/matrix-int8-test.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx -march=sapphirerapids -fsycl -O2 %s -o %t.out +// RUN: %clangxx -fsycl -O2 %s -o %t.out #include #if (SYCL_EXT_ONEAPI_MATRIX == 2) #include diff --git a/sycl/test/matrix/matrix-int8-uint8-test.cpp b/sycl/test/matrix/matrix-int8-uint8-test.cpp deleted file mode 100644 index f6ed60f4ecf83..0000000000000 --- a/sycl/test/matrix/matrix-int8-uint8-test.cpp +++ /dev/null @@ -1,168 +0,0 @@ -// RUN: %clangxx -march=sapphirerapids -fsycl -O2 %s -o %t.out -#include -#if (SYCL_EXT_ONEAPI_MATRIX == 2) -#include - -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; - -#define TILE_SZ 16 -#define TM (TILE_SZ-4) -#define TN (TILE_SZ-4) -#define TK (4 * TILE_SZ-16) - -#define SG_SZ 16 - -template struct big_matrix{ -public: - T *mat; - -public: - T *get_data() { return mat; } - void set_data(T *data) { mat = data; } - big_matrix(T *data) : mat(data) { - } -}; - -template -void matrix_multiply(big_matrix &C, big_matrix &A, big_matrix &B) { - size_t M = NUM_ROWS_C; - size_t N = NUM_COLS_C; - size_t K = NUM_COLS_A; - // B => K/4 x N*4, A => M x K, C => M, N - // stride should be X's cols, e.g., B's stirde = N*4 - assert(NUM_ROWS_C == NUM_ROWS_A && NUM_COLS_A == NUM_ROWS_B * 4); - size_t NDRangeM = M / TM; - size_t NDRangeN = N / TN; - buffer bufA(A.get_data(), range<2>(M, K)); - buffer bufB(B.get_data(), range<2>(K, N)); - buffer bufC(C.get_data(), range<2>(M, N)); - - queue q; - q.submit([&](handler &cgh) { - auto accC = bufC.get_access(cgh); - auto accA = bufA.get_access(cgh); - auto accB = bufB.get_access(cgh); - - cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), - [accA, accB, accC, M, N, K](nd_item<2> spmd_item) - - { - // The submatrix API has to be accessed by all the workitems in a - // subgroup these functions will be called once by the subgroup no - // code divergence between the workitems - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - ext::oneapi::sub_group sg = spmd_item.get_sub_group(); - joint_matrix sub_a(sg); - // For B, since current implementation does not support non-packed layout, - // users need to specify the updated VNNI sizes along with the packed_b layout. - // By default, the layout is row_major and size is (TK, TN). - joint_matrix sub_b(sg); - joint_matrix sub_c(sg); - - // AMX: 8 register tiles : 1k byte size, SMmaxxSKmax =16x64 - // strideX = X's cols, so strideC = N, strideA = K, strideB = N*4 - joint_matrix_load(sg, sub_c, - accC.get_pointer() + (sg_startx * TM) * N + - sg_starty / SG_SZ * TN, - N, matrix_layout::row_major); - for (int k = 0; k < K / TK; k += 1) { - joint_matrix_load( - sg, sub_a, accA.get_pointer() + (sg_startx * TM) * K + k * TK, - K, matrix_layout::packed_a); - // Assuming B data is already in VNNI format. - joint_matrix_load(sg, sub_b, - accB.get_pointer() + (k * TK / 4) * (N * 4) + - sg_starty / SG_SZ * TN * 4, - N * 4, matrix_layout::packed_b); - sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); - } - joint_matrix_store(sg, sub_c, - accC.get_pointer() + (sg_startx * TM) * N + - sg_starty / SG_SZ * TN, - N, matrix_layout::row_major); - }); // parallel for - }).wait(); -} - -static constexpr size_t MATRIX_M = TM * 2; -static constexpr size_t MATRIX_N = TN * 2; -static constexpr size_t MATRIX_K = TK * 2; -int8_t A[MATRIX_M][MATRIX_K]; -uint8_t B[MATRIX_K / 4][MATRIX_N * 4]; -int32_t C[MATRIX_M][MATRIX_N]; -int32_t D[MATRIX_M][MATRIX_N]; - -void matrix_multiply_ref(int32_t *A_mem, int32_t *B_mem, int32_t *C_mem, int M, - int N, int K) { - // tiling - for (int m = 0; m < M; m++) - for (int n = 0; n < N; n++) { - for (int k = 0; k < K; k++) { - char *va = (char *)(A_mem + m * K + k); - char *vb = (char *)(B_mem + k * N + n); - int acc = *(C_mem + m * N + n); - for (int i = 0; i < 4; i++) { - acc += (va[i] * vb[i]); - } - *(C_mem + m * N + n) = acc; - } - } -} - -int main() { - for (int i = 0; i < MATRIX_M; i++) { - for (int j = 0; j < MATRIX_K; j++) { - A[i][j] = i+2*j; - } - } - for (int i = 0; i < MATRIX_K / 4; i++) { - for (int j = 0; j < MATRIX_N * 4; j++) { - B[i][j] = i+j; - } - } - for (int i = 0; i < MATRIX_M; i++) { - for (int j = 0; j < MATRIX_N; j++) { - C[i][j] = 1; - D[i][j] = 1; - } - } - - big_matrix MC((int32_t *)&C); - big_matrix MD((int32_t *)&D); - big_matrix MA((int8_t *)&A); - big_matrix MB((uint8_t *)&B); - matrix_multiply(MC, MA, MB); - matrix_multiply_ref((int32_t *)A, (int32_t *)B, (int32_t *)D, MATRIX_M, - MATRIX_N, MATRIX_K / 4); - - bool res = true; - for (int i = 0; i < MATRIX_M; i++) { - for (int j = 0; j < MATRIX_N; j++) { - if (C[i][j] != D[i][j]) - res = false; - } - } - if (res) - std::cout << "passed\n"; - else - std::cout << "failed\n"; - for (int i = 0; i < MATRIX_M; i++) { - for (int j = 0; j < MATRIX_N; j++) - std::cout << C[i][j] << ", "; - std::cout << "\n"; - } - std::cout << std::endl; - for (int i = 0; i < MATRIX_M; i++) { - for (int j = 0; j < MATRIX_N; j++) - std::cout << D[i][j] << ", "; - std::cout << "\n"; - } -} -#endif // (SYCL_EXT_ONEAPI_MATRIX == 2) diff --git a/sycl/test/matrix/matrix-uint8-int8-test.cpp b/sycl/test/matrix/matrix-uint8-int8-test.cpp deleted file mode 100644 index 0be24e4cbd79a..0000000000000 --- a/sycl/test/matrix/matrix-uint8-int8-test.cpp +++ /dev/null @@ -1,168 +0,0 @@ -// RUN: %clangxx -march=sapphirerapids -fsycl -O2 %s -o %t.out -#include -#if (SYCL_EXT_ONEAPI_MATRIX == 2) -#include - -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; - -#define TILE_SZ 16 -#define TM (TILE_SZ-4) -#define TN (TILE_SZ-4) -#define TK (4 * TILE_SZ-16) - -#define SG_SZ 16 - -template struct big_matrix{ -public: - T *mat; - -public: - T *get_data() { return mat; } - void set_data(T *data) { mat = data; } - big_matrix(T *data) : mat(data) { - } -}; - -template -void matrix_multiply(big_matrix &C, big_matrix &A, big_matrix &B) { - size_t M = NUM_ROWS_C; - size_t N = NUM_COLS_C; - size_t K = NUM_COLS_A; - // B => K/4 x N*4, A => M x K, C => M, N - // stride should be X's cols, e.g., B's stirde = N*4 - assert(NUM_ROWS_C == NUM_ROWS_A && NUM_COLS_A == NUM_ROWS_B * 4); - size_t NDRangeM = M / TM; - size_t NDRangeN = N / TN; - buffer bufA(A.get_data(), range<2>(M, K)); - buffer bufB(B.get_data(), range<2>(K, N)); - buffer bufC(C.get_data(), range<2>(M, N)); - - queue q; - q.submit([&](handler &cgh) { - auto accC = bufC.get_access(cgh); - auto accA = bufA.get_access(cgh); - auto accB = bufB.get_access(cgh); - - cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), - [accA, accB, accC, M, N, K](nd_item<2> spmd_item) - - { - // The submatrix API has to be accessed by all the workitems in a - // subgroup these functions will be called once by the subgroup no - // code divergence between the workitems - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - ext::oneapi::sub_group sg = spmd_item.get_sub_group(); - joint_matrix sub_a(sg); - // For B, since current implementation does not support non-packed layout, - // users need to specify the updated VNNI sizes along with the packed_b layout. - // By default, the layout is row_major and size is (TK, TN). - joint_matrix sub_b(sg); - joint_matrix sub_c(sg); - - // AMX: 8 register tiles : 1k byte size, SMmaxxSKmax =16x64 - // strideX = X's cols, so strideC = N, strideA = K, strideB = N*4 - joint_matrix_load(sg, sub_c, - accC.get_pointer() + (sg_startx * TM) * N + - sg_starty / SG_SZ * TN, - N, matrix_layout::row_major); - for (int k = 0; k < K / TK; k += 1) { - joint_matrix_load( - sg, sub_a, accA.get_pointer() + (sg_startx * TM) * K + k * TK, - K, matrix_layout::packed_a); - // Assuming B data is already in VNNI format. - joint_matrix_load(sg, sub_b, - accB.get_pointer() + (k * TK / 4) * (N * 4) + - sg_starty / SG_SZ * TN * 4, - N * 4, matrix_layout::packed_b); - sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); - } - joint_matrix_store(sg, sub_c, - accC.get_pointer() + (sg_startx * TM) * N + - sg_starty / SG_SZ * TN, - N, matrix_layout::row_major); - }); // parallel for - }).wait(); -} - -static constexpr size_t MATRIX_M = TM * 2; -static constexpr size_t MATRIX_N = TN * 2; -static constexpr size_t MATRIX_K = TK * 2; -uint8_t A[MATRIX_M][MATRIX_K]; -int8_t B[MATRIX_K / 4][MATRIX_N * 4]; -int32_t C[MATRIX_M][MATRIX_N]; -int32_t D[MATRIX_M][MATRIX_N]; - -void matrix_multiply_ref(int32_t *A_mem, int32_t *B_mem, int32_t *C_mem, int M, - int N, int K) { - // tiling - for (int m = 0; m < M; m++) - for (int n = 0; n < N; n++) { - for (int k = 0; k < K; k++) { - char *va = (char *)(A_mem + m * K + k); - char *vb = (char *)(B_mem + k * N + n); - int acc = *(C_mem + m * N + n); - for (int i = 0; i < 4; i++) { - acc += (va[i] * vb[i]); - } - *(C_mem + m * N + n) = acc; - } - } -} - -int main() { - for (int i = 0; i < MATRIX_M; i++) { - for (int j = 0; j < MATRIX_K; j++) { - A[i][j] = i+2*j; - } - } - for (int i = 0; i < MATRIX_K / 4; i++) { - for (int j = 0; j < MATRIX_N * 4; j++) { - B[i][j] = i+j; - } - } - for (int i = 0; i < MATRIX_M; i++) { - for (int j = 0; j < MATRIX_N; j++) { - C[i][j] = 1; - D[i][j] = 1; - } - } - - big_matrix MC((int32_t *)&C); - big_matrix MD((int32_t *)&D); - big_matrix MA((uint8_t *)&A); - big_matrix MB((int8_t *)&B); - matrix_multiply(MC, MA, MB); - matrix_multiply_ref((int32_t *)A, (int32_t *)B, (int32_t *)D, MATRIX_M, - MATRIX_N, MATRIX_K / 4); - - bool res = true; - for (int i = 0; i < MATRIX_M; i++) { - for (int j = 0; j < MATRIX_N; j++) { - if (C[i][j] != D[i][j]) - res = false; - } - } - if (res) - std::cout << "passed\n"; - else - std::cout << "failed\n"; - for (int i = 0; i < MATRIX_M; i++) { - for (int j = 0; j < MATRIX_N; j++) - std::cout << C[i][j] << ", "; - std::cout << "\n"; - } - std::cout << std::endl; - for (int i = 0; i < MATRIX_M; i++) { - for (int j = 0; j < MATRIX_N; j++) - std::cout << D[i][j] << ", "; - std::cout << "\n"; - } -} -#endif // (SYCL_EXT_ONEAPI_MATRIX == 2) diff --git a/sycl/test/matrix/matrix-uint8-uint8-test.cpp b/sycl/test/matrix/matrix-uint8-uint8-test.cpp deleted file mode 100644 index 6f743c1e71dca..0000000000000 --- a/sycl/test/matrix/matrix-uint8-uint8-test.cpp +++ /dev/null @@ -1,168 +0,0 @@ -// RUN: %clangxx -march=sapphirerapids -fsycl -O2 %s -o %t.out -#include -#if (SYCL_EXT_ONEAPI_MATRIX == 2) -#include - -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; - -#define TILE_SZ 16 -#define TM (TILE_SZ-4) -#define TN (TILE_SZ-4) -#define TK (4 * TILE_SZ-16) - -#define SG_SZ 16 - -template struct big_matrix{ -public: - T *mat; - -public: - T *get_data() { return mat; } - void set_data(T *data) { mat = data; } - big_matrix(T *data) : mat(data) { - } -}; - -template -void matrix_multiply(big_matrix &C, big_matrix &A, big_matrix &B) { - size_t M = NUM_ROWS_C; - size_t N = NUM_COLS_C; - size_t K = NUM_COLS_A; - // B => K/4 x N*4, A => M x K, C => M, N - // stride should be X's cols, e.g., B's stirde = N*4 - assert(NUM_ROWS_C == NUM_ROWS_A && NUM_COLS_A == NUM_ROWS_B * 4); - size_t NDRangeM = M / TM; - size_t NDRangeN = N / TN; - buffer bufA(A.get_data(), range<2>(M, K)); - buffer bufB(B.get_data(), range<2>(K, N)); - buffer bufC(C.get_data(), range<2>(M, N)); - - queue q; - q.submit([&](handler &cgh) { - auto accC = bufC.get_access(cgh); - auto accA = bufA.get_access(cgh); - auto accB = bufB.get_access(cgh); - - cgh.parallel_for( - nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), - [accA, accB, accC, M, N, K](nd_item<2> spmd_item) - - { - // The submatrix API has to be accessed by all the workitems in a - // subgroup these functions will be called once by the subgroup no - // code divergence between the workitems - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - ext::oneapi::sub_group sg = spmd_item.get_sub_group(); - joint_matrix sub_a(sg); - // For B, since current implementation does not support non-packed layout, - // users need to specify the updated VNNI sizes along with the packed_b layout. - // By default, the layout is row_major and size is (TK, TN). - joint_matrix sub_b(sg); - joint_matrix sub_c(sg); - - // AMX: 8 register tiles : 1k byte size, SMmaxxSKmax =16x64 - // strideX = X's cols, so strideC = N, strideA = K, strideB = N*4 - joint_matrix_load(sg, sub_c, - accC.get_pointer() + (sg_startx * TM) * N + - sg_starty / SG_SZ * TN, - N, matrix_layout::row_major); - for (int k = 0; k < K / TK; k += 1) { - joint_matrix_load( - sg, sub_a, accA.get_pointer() + (sg_startx * TM) * K + k * TK, - K, matrix_layout::packed_a); - // Assuming B data is already in VNNI format. - joint_matrix_load(sg, sub_b, - accB.get_pointer() + (k * TK / 4) * (N * 4) + - sg_starty / SG_SZ * TN * 4, - N * 4, matrix_layout::packed_b); - sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); - } - joint_matrix_store(sg, sub_c, - accC.get_pointer() + (sg_startx * TM) * N + - sg_starty / SG_SZ * TN, - N, matrix_layout::row_major); - }); // parallel for - }).wait(); -} - -static constexpr size_t MATRIX_M = TM * 2; -static constexpr size_t MATRIX_N = TN * 2; -static constexpr size_t MATRIX_K = TK * 2; -uint8_t A[MATRIX_M][MATRIX_K]; -uint8_t B[MATRIX_K / 4][MATRIX_N * 4]; -int32_t C[MATRIX_M][MATRIX_N]; -int32_t D[MATRIX_M][MATRIX_N]; - -void matrix_multiply_ref(int32_t *A_mem, int32_t *B_mem, int32_t *C_mem, int M, - int N, int K) { - // tiling - for (int m = 0; m < M; m++) - for (int n = 0; n < N; n++) { - for (int k = 0; k < K; k++) { - char *va = (char *)(A_mem + m * K + k); - char *vb = (char *)(B_mem + k * N + n); - int acc = *(C_mem + m * N + n); - for (int i = 0; i < 4; i++) { - acc += (va[i] * vb[i]); - } - *(C_mem + m * N + n) = acc; - } - } -} - -int main() { - for (int i = 0; i < MATRIX_M; i++) { - for (int j = 0; j < MATRIX_K; j++) { - A[i][j] = i+2*j; - } - } - for (int i = 0; i < MATRIX_K / 4; i++) { - for (int j = 0; j < MATRIX_N * 4; j++) { - B[i][j] = i+j; - } - } - for (int i = 0; i < MATRIX_M; i++) { - for (int j = 0; j < MATRIX_N; j++) { - C[i][j] = 1; - D[i][j] = 1; - } - } - - big_matrix MC((int32_t *)&C); - big_matrix MD((int32_t *)&D); - big_matrix MA((uint8_t *)&A); - big_matrix MB((uint8_t *)&B); - matrix_multiply(MC, MA, MB); - matrix_multiply_ref((int32_t *)A, (int32_t *)B, (int32_t *)D, MATRIX_M, - MATRIX_N, MATRIX_K / 4); - - bool res = true; - for (int i = 0; i < MATRIX_M; i++) { - for (int j = 0; j < MATRIX_N; j++) { - if (C[i][j] != D[i][j]) - res = false; - } - } - if (res) - std::cout << "passed\n"; - else - std::cout << "failed\n"; - for (int i = 0; i < MATRIX_M; i++) { - for (int j = 0; j < MATRIX_N; j++) - std::cout << C[i][j] << ", "; - std::cout << "\n"; - } - std::cout << std::endl; - for (int i = 0; i < MATRIX_M; i++) { - for (int j = 0; j < MATRIX_N; j++) - std::cout << D[i][j] << ", "; - std::cout << "\n"; - } -} -#endif // (SYCL_EXT_ONEAPI_MATRIX == 2)