From 7179d3adb067ec14f85319dce2c85da76dbdc077 Mon Sep 17 00:00:00 2001 From: amochalo Date: Thu, 26 Mar 2020 23:25:56 +0300 Subject: [PATCH 01/10] [SYCL] Add tests for inline asm feature Signed-off-by: amochalo --- sycl/test/inline-asm/asm_16_empty.cpp | 73 ++++++++++++ sycl/test/inline-asm/asm_16_matrix_mult.cpp | 73 ++++++++++++ sycl/test/inline-asm/asm_16_no_input_int.cpp | 73 ++++++++++++ sycl/test/inline-asm/asm_16_no_opts.cpp | 75 +++++++++++++ sycl/test/inline-asm/asm_8_empty.cpp | 72 ++++++++++++ sycl/test/inline-asm/asm_8_no_input_int.cpp | 75 +++++++++++++ .../inline-asm/asm_arbitrary_ops_order.cpp | 98 ++++++++++++++++ sycl/test/inline-asm/asm_decl_in_scope.cpp | 103 +++++++++++++++++ sycl/test/inline-asm/asm_float_add.cpp | 94 ++++++++++++++++ sycl/test/inline-asm/asm_float_imm_arg.cpp | 89 +++++++++++++++ sycl/test/inline-asm/asm_float_neg.cpp | 89 +++++++++++++++ sycl/test/inline-asm/asm_imm_arg.cpp | 88 +++++++++++++++ sycl/test/inline-asm/asm_mul.cpp | 93 ++++++++++++++++ .../inline-asm/asm_multiple_instructions.cpp | 105 ++++++++++++++++++ sycl/test/inline-asm/asm_no_operands.cpp | 32 ++++++ sycl/test/inline-asm/asm_no_output.cpp | 78 +++++++++++++ sycl/test/inline-asm/asm_plus_mod.cpp | 87 +++++++++++++++ sycl/test/inline-asm/include/asmcheck.h | 17 +++ sycl/test/inline-asm/letter_example.cpp | 59 ++++++++++ sycl/test/inline-asm/malloc_shared_32.cpp | 87 +++++++++++++++ .../inline-asm/malloc_shared_in_out_dif.cpp | 65 +++++++++++ .../inline-asm/malloc_shared_no_input.cpp | 54 +++++++++ 22 files changed, 1679 insertions(+) create mode 100644 sycl/test/inline-asm/asm_16_empty.cpp create mode 100644 sycl/test/inline-asm/asm_16_matrix_mult.cpp create mode 100644 sycl/test/inline-asm/asm_16_no_input_int.cpp create mode 100644 sycl/test/inline-asm/asm_16_no_opts.cpp create mode 100644 sycl/test/inline-asm/asm_8_empty.cpp create mode 100644 sycl/test/inline-asm/asm_8_no_input_int.cpp create mode 100644 sycl/test/inline-asm/asm_arbitrary_ops_order.cpp create mode 100644 sycl/test/inline-asm/asm_decl_in_scope.cpp create mode 100644 sycl/test/inline-asm/asm_float_add.cpp create mode 100644 sycl/test/inline-asm/asm_float_imm_arg.cpp create mode 100644 sycl/test/inline-asm/asm_float_neg.cpp create mode 100644 sycl/test/inline-asm/asm_imm_arg.cpp create mode 100644 sycl/test/inline-asm/asm_mul.cpp create mode 100644 sycl/test/inline-asm/asm_multiple_instructions.cpp create mode 100644 sycl/test/inline-asm/asm_no_operands.cpp create mode 100644 sycl/test/inline-asm/asm_no_output.cpp create mode 100644 sycl/test/inline-asm/asm_plus_mod.cpp create mode 100644 sycl/test/inline-asm/include/asmcheck.h create mode 100644 sycl/test/inline-asm/letter_example.cpp create mode 100644 sycl/test/inline-asm/malloc_shared_32.cpp create mode 100644 sycl/test/inline-asm/malloc_shared_in_out_dif.cpp create mode 100644 sycl/test/inline-asm/malloc_shared_no_input.cpp diff --git a/sycl/test/inline-asm/asm_16_empty.cpp b/sycl/test/inline-asm/asm_16_empty.cpp new file mode 100644 index 000000000000..7b551b55e8a1 --- /dev/null +++ b/sycl/test/inline-asm/asm_16_empty.cpp @@ -0,0 +1,73 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out + +#include "include/asmcheck.h" +#include +#include +#include +#include + +constexpr int LIST_SIZE = 1024; +using arr_t = std::vector; +constexpr auto sycl_write = cl::sycl::access::mode::write; + +// class is used for kernel name +template +class no_opts; + +template +void process_buffers(cl::sycl::queue &deviceQueue, T *pc, size_t sz) { + cl::sycl::range<1> numOfItems{sz}; + cl::sycl::buffer bufferC(pc, numOfItems); + + deviceQueue.submit([&](cl::sycl::handler &cgh) { + auto C = bufferC.template get_access(cgh); + + auto kern = [C](cl::sycl::id<1> wiID) + [[cl::intel_reqd_sub_group_size(16)]] { + C[wiID] = 43; +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm volatile(""); +#endif + }; + cgh.parallel_for>(numOfItems, kern); + }); +}; + +int main() { + arr_t C(LIST_SIZE); + + cl::sycl::gpu_selector gpsel; + cl::sycl::queue deviceQueue(gpsel); + + sycl::device Device = deviceQueue.get_device(); + + if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) { + std::cout << "Skipping test\n"; + return 0; + } + + for (int i = 0; i < LIST_SIZE; i++) { + C[i] = 0; + } + + process_buffers(deviceQueue, C.data(), LIST_SIZE); + + bool all_right = true; + + for (int i = 0; i < LIST_SIZE; ++i) + if (C[i] != 43) { + std::cerr << "At index: " << i << ". "; + std::cerr << C[i] << " != " << 43 << "\n"; + all_right = false; + break; + } + if (all_right) { + std::cout << "Pass" << std::endl; + return 0; + } + std::cout << "Error" << std::endl; + return -1; +} diff --git a/sycl/test/inline-asm/asm_16_matrix_mult.cpp b/sycl/test/inline-asm/asm_16_matrix_mult.cpp new file mode 100644 index 000000000000..3de2b2966abc --- /dev/null +++ b/sycl/test/inline-asm/asm_16_matrix_mult.cpp @@ -0,0 +1,73 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out + +#include "include/asmcheck.h" +#include +#include +#include + +constexpr int LIST_SIZE = 8; +using arr_t = std::vector; +constexpr auto sycl_write = cl::sycl::access::mode::write; + +// class is used for kernel name +template +class simple_vector_add; + +template +void process_buffers(cl::sycl::queue &deviceQueue, T *pc, size_t sz) { + cl::sycl::range<1> numOfItems{sz}; + cl::sycl::buffer bufferC(pc, numOfItems); + + deviceQueue.submit([&](cl::sycl::handler &cgh) { + auto C = bufferC.template get_access(cgh); + + auto kern = [C](cl::sycl::id<1> wiID) + [[cl::intel_reqd_sub_group_size(16)]] { + volatile int output = 0; +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm volatile("mov (M1,16) %0(0,0)<1> 0x7:d" + : "=rw"(output)); +#else + output = 7; +#endif + C[wiID] = output; + }; + cgh.parallel_for>(numOfItems, kern); + }); +}; + +int main() { + arr_t C(LIST_SIZE); + + cl::sycl::gpu_selector gpsel; + cl::sycl::queue deviceQueue(gpsel); + sycl::device Device = deviceQueue.get_device(); + + if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) { + std::cout << "Skipping test\n"; + return 0; + } + for (int i = 0; i < LIST_SIZE; i++) { + C[i] = 0; + } + + process_buffers(deviceQueue, C.data(), LIST_SIZE); + + bool all_right = true; + for (int i = 0; i < LIST_SIZE; ++i) + if (C[i] != 7) { + std::cerr << "At index: " << i << ". "; + std::cerr << C[i] << " != " << 7 << "\n"; + all_right = false; + break; + } + if (all_right) { + std::cout << "Pass" << std::endl; + return 0; + } + std::cout << "Error" << std::endl; + return -1; +} diff --git a/sycl/test/inline-asm/asm_16_no_input_int.cpp b/sycl/test/inline-asm/asm_16_no_input_int.cpp new file mode 100644 index 000000000000..3de2b2966abc --- /dev/null +++ b/sycl/test/inline-asm/asm_16_no_input_int.cpp @@ -0,0 +1,73 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out + +#include "include/asmcheck.h" +#include +#include +#include + +constexpr int LIST_SIZE = 8; +using arr_t = std::vector; +constexpr auto sycl_write = cl::sycl::access::mode::write; + +// class is used for kernel name +template +class simple_vector_add; + +template +void process_buffers(cl::sycl::queue &deviceQueue, T *pc, size_t sz) { + cl::sycl::range<1> numOfItems{sz}; + cl::sycl::buffer bufferC(pc, numOfItems); + + deviceQueue.submit([&](cl::sycl::handler &cgh) { + auto C = bufferC.template get_access(cgh); + + auto kern = [C](cl::sycl::id<1> wiID) + [[cl::intel_reqd_sub_group_size(16)]] { + volatile int output = 0; +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm volatile("mov (M1,16) %0(0,0)<1> 0x7:d" + : "=rw"(output)); +#else + output = 7; +#endif + C[wiID] = output; + }; + cgh.parallel_for>(numOfItems, kern); + }); +}; + +int main() { + arr_t C(LIST_SIZE); + + cl::sycl::gpu_selector gpsel; + cl::sycl::queue deviceQueue(gpsel); + sycl::device Device = deviceQueue.get_device(); + + if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) { + std::cout << "Skipping test\n"; + return 0; + } + for (int i = 0; i < LIST_SIZE; i++) { + C[i] = 0; + } + + process_buffers(deviceQueue, C.data(), LIST_SIZE); + + bool all_right = true; + for (int i = 0; i < LIST_SIZE; ++i) + if (C[i] != 7) { + std::cerr << "At index: " << i << ". "; + std::cerr << C[i] << " != " << 7 << "\n"; + all_right = false; + break; + } + if (all_right) { + std::cout << "Pass" << std::endl; + return 0; + } + std::cout << "Error" << std::endl; + return -1; +} diff --git a/sycl/test/inline-asm/asm_16_no_opts.cpp b/sycl/test/inline-asm/asm_16_no_opts.cpp new file mode 100644 index 000000000000..cc2df932a7d8 --- /dev/null +++ b/sycl/test/inline-asm/asm_16_no_opts.cpp @@ -0,0 +1,75 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out + +#include "include/asmcheck.h" +#include +#include +#include + +constexpr int LIST_SIZE = 1024; +using arr_t = std::vector; +constexpr auto sycl_write = cl::sycl::access::mode::write; + +// class is used for kernel name +template +class simple_vector_add; + +template +void process_buffers(cl::sycl::queue &deviceQueue, T *pc, size_t sz) { + cl::sycl::range<1> numOfItems{sz}; + cl::sycl::buffer bufferC(pc, numOfItems); + + deviceQueue.submit([&](cl::sycl::handler &cgh) { + auto C = bufferC.template get_access(cgh); + + auto kern = [C](cl::sycl::id<1> wiID) + [[cl::intel_reqd_sub_group_size(16)]] { + for (int i = 0; i < 10; ++i) { +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm("fence_sw"); + C[wiID] += i; + +#else + C[wiID] += i; +#endif + } + }; + cgh.parallel_for>(numOfItems, kern); + }); +}; + +int main() { + arr_t C(LIST_SIZE); + + cl::sycl::gpu_selector gpsel; + cl::sycl::queue deviceQueue(gpsel); + sycl::device Device = deviceQueue.get_device(); + + if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) { + std::cout << "Skipping test\n"; + return 0; + } + for (int i = 0; i < LIST_SIZE; i++) { + C[i] = 0; + } + + process_buffers(deviceQueue, C.data(), LIST_SIZE); + + bool all_right = true; + + for (int i = 0; i < LIST_SIZE; ++i) + if (C[i] != 45) { + std::cerr << "At index: " << i << ". "; + std::cerr << C[i] << " != " << 45 << "\n"; + all_right = false; + break; + } + if (all_right) { + std::cout << "Pass" << std::endl; + return 0; + } + std::cout << "Error" << std::endl; + return -1; +} diff --git a/sycl/test/inline-asm/asm_8_empty.cpp b/sycl/test/inline-asm/asm_8_empty.cpp new file mode 100644 index 000000000000..a7f0b0bc6d4b --- /dev/null +++ b/sycl/test/inline-asm/asm_8_empty.cpp @@ -0,0 +1,72 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out + +#include "include/asmcheck.h" +#include +#include +#include + +constexpr int LIST_SIZE = 1024; +using arr_t = std::vector; +constexpr auto sycl_write = cl::sycl::access::mode::write; + +// class is used for kernel name +template +class no_opts; + +template +void process_buffers(cl::sycl::queue &deviceQueue, T *pc, size_t sz) { + cl::sycl::range<1> numOfItems{sz}; + cl::sycl::buffer bufferC(pc, numOfItems); + + deviceQueue.submit([&](cl::sycl::handler &cgh) { + auto C = bufferC.template get_access(cgh); + + auto kern = [C](cl::sycl::id<1> wiID) + [[cl::intel_reqd_sub_group_size(8)]] { + C[wiID] = 43; +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm volatile(""); +#endif + }; + cgh.parallel_for>(numOfItems, kern); + }); +}; + +int main() { + arr_t C(LIST_SIZE); + + cl::sycl::gpu_selector gpsel; + cl::sycl::queue deviceQueue(gpsel); + + sycl::device Device = deviceQueue.get_device(); + + if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) { + std::cout << "Skipping test\n"; + return 0; + } + + for (int i = 0; i < LIST_SIZE; i++) { + C[i] = 0; + } + + process_buffers(deviceQueue, C.data(), LIST_SIZE); + + bool all_right = true; + + for (int i = 0; i < LIST_SIZE; ++i) + if (C[i] != 43) { + std::cerr << "At index: " << i << ". "; + std::cerr << C[i] << " != " << 43 << "\n"; + all_right = false; + break; + } + if (all_right) { + std::cout << "Pass" << std::endl; + return 0; + } + std::cout << "Error" << std::endl; + return -1; +} diff --git a/sycl/test/inline-asm/asm_8_no_input_int.cpp b/sycl/test/inline-asm/asm_8_no_input_int.cpp new file mode 100644 index 000000000000..a370f77b3ada --- /dev/null +++ b/sycl/test/inline-asm/asm_8_no_input_int.cpp @@ -0,0 +1,75 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out + +#include "include/asmcheck.h" +#include +#include +#include + +constexpr int LIST_SIZE = 8; +using arr_t = std::vector; +constexpr auto sycl_write = cl::sycl::access::mode::write; + +// class is used for kernel name +template +class simple_vector_add; + +template +void process_buffers(cl::sycl::queue &deviceQueue, T *pc, size_t sz) { + cl::sycl::range<1> numOfItems{sz}; + cl::sycl::buffer bufferC(pc, numOfItems); + + deviceQueue.submit([&](cl::sycl::handler &cgh) { + auto C = bufferC.template get_access(cgh); + + auto kern = [C](cl::sycl::id<1> wiID) + [[cl::intel_reqd_sub_group_size(8)]] { + volatile int output = 0; +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm volatile("mov (M1,8) %0(0,0)<1> 0x7:d" + : "=rw"(output)); +#else + output = 7; +#endif + C[wiID] = output; + }; + cgh.parallel_for>(numOfItems, kern); + }); +}; + +int main() { + arr_t C(LIST_SIZE); + + cl::sycl::gpu_selector gpsel; + cl::sycl::queue deviceQueue(gpsel); + + for (int i = 0; i < LIST_SIZE; i++) { + C[i] = 0; + } + + sycl::device Device = deviceQueue.get_device(); + + if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) { + std::cout << "Skipping test\n"; + return 0; + } + + process_buffers(deviceQueue, C.data(), LIST_SIZE); + + bool all_right = true; + for (int i = 0; i < LIST_SIZE; ++i) + if (C[i] != 7) { + std::cerr << "At index: " << i << ". "; + std::cerr << C[i] << " != " << 7 << "\n"; + all_right = false; + break; + } + if (all_right) { + std::cout << "Pass" << std::endl; + return 0; + } + std::cout << "Error" << std::endl; + return -1; +} diff --git a/sycl/test/inline-asm/asm_arbitrary_ops_order.cpp b/sycl/test/inline-asm/asm_arbitrary_ops_order.cpp new file mode 100644 index 000000000000..1b75b63d0027 --- /dev/null +++ b/sycl/test/inline-asm/asm_arbitrary_ops_order.cpp @@ -0,0 +1,98 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out + +#include "include/asmcheck.h" +#include +#include +#include + +constexpr auto sycl_read = cl::sycl::access::mode::read; +constexpr auto sycl_write = cl::sycl::access::mode::write; + +constexpr int LIST_SIZE = 1024; +using arr_t = std::vector; + +// class is used for kernel name +template +class vector_mad; + +class ocl_ctx_t { + cl::sycl::queue deviceQueue; + +public: + ocl_ctx_t(const cl::sycl::device_selector &sel) : deviceQueue(sel) {} + + template + void process_buffers(T const *pa, T const *pb, const T *pc, T *pd, size_t sz); +}; + +int main() { + arr_t A(LIST_SIZE), B(LIST_SIZE), C(LIST_SIZE), D(LIST_SIZE); + + try { + cl::sycl::gpu_selector gpsel; + ocl_ctx_t ct{gpsel}; + + cl::sycl::queue deviceQueue(gpsel); + sycl::device Device = deviceQueue.get_device(); + + if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) { + std::cout << "Skipping test\n"; + return 0; + } + + for (int i = 0; i < LIST_SIZE; i++) { + A[i] = i; + B[i] = i; + C[i] = LIST_SIZE - i * i; + } + + ct.process_buffers(A.data(), B.data(), C.data(), D.data(), LIST_SIZE); + + for (int i = 0; i < LIST_SIZE; ++i) + if (D[i] != A[i] * B[i] + C[i]) { + std::cerr << "At index: " << i << ". "; + std::cerr << D[i] << " != " << A[i] * B[i] + C[i] << "\n"; + abort(); + } + + std::cout << "Everything is correct" << std::endl; + } catch (cl::sycl::exception const &err) { + std::cerr << "ERROR: " << err.what() << ":\n"; + return -1; + } +} + +template +void ocl_ctx_t::process_buffers(T const *pa, T const *pb, const T *pc, T *pd, + size_t sz) { + cl::sycl::range<1> numOfItems{sz}; + cl::sycl::buffer bufferA(pa, numOfItems); + cl::sycl::buffer bufferB(pb, numOfItems); + cl::sycl::buffer bufferC(pc, numOfItems); + cl::sycl::buffer bufferD(pd, numOfItems); + + bufferA.set_final_data(nullptr); + bufferB.set_final_data(nullptr); + bufferC.set_final_data(nullptr); + + deviceQueue.submit([&](cl::sycl::handler &cgh) { + auto A = bufferA.template get_access(cgh); + auto B = bufferB.template get_access(cgh); + auto C = bufferC.template get_access(cgh); + auto D = bufferD.template get_access(cgh); + + auto kern = [ A, B, C, D ](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] { +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm("mad (M1, 8) %0(0, 0)<1> %3(0, 0)<1;1,0> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>" + : "=rw"(D[wiID]) + : "rw"(B[wiID]), "rw"(C[wiID]), "rw"(A[wiID])); +#else + D[wiID] = A[wiID] * B[wiID] + C[wiID]; +#endif + }; + cgh.parallel_for>(numOfItems, kern); + }); +} diff --git a/sycl/test/inline-asm/asm_decl_in_scope.cpp b/sycl/test/inline-asm/asm_decl_in_scope.cpp new file mode 100644 index 000000000000..2164a602bae6 --- /dev/null +++ b/sycl/test/inline-asm/asm_decl_in_scope.cpp @@ -0,0 +1,103 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out + +#include "include/asmcheck.h" +#include +#include +#include + +constexpr auto sycl_read = cl::sycl::access::mode::read; +constexpr auto sycl_write = cl::sycl::access::mode::write; + +constexpr int LIST_SIZE = 1024 * 1024; +using arr_t = std::vector; + +// class is used for kernel name +template +class decl_in_scope_kernel; + +class ocl_ctx_t { + cl::sycl::queue deviceQueue; + +public: + ocl_ctx_t(const cl::sycl::device_selector &sel) : deviceQueue(sel) {} + + template + void process_buffers(T const *pa, T const *pb, T *pc, size_t sz); +}; + +int main() { + arr_t A(LIST_SIZE), B(LIST_SIZE), C(LIST_SIZE); + + try { + cl::sycl::gpu_selector gpsel; + ocl_ctx_t ct{gpsel}; + + cl::sycl::queue deviceQueue(gpsel); + sycl::device Device = deviceQueue.get_device(); + + if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) { + std::cout << "Skipping test\n"; + return 0; + } + + for (int i = 0; i < LIST_SIZE; i++) { + A[i] = i; + B[i] = 2; + } + + ct.process_buffers(A.data(), B.data(), C.data(), LIST_SIZE); + + for (int i = 0; i < LIST_SIZE; ++i) + if (C[i] != A[i] * B[i]) { + std::cerr << "At index: " << i << ". "; + std::cerr << C[i] << " != " << A[i] * B[i] << "\n"; + abort(); + } + + std::cout << "Everything is correct" << std::endl; + } catch (cl::sycl::exception const &err) { + std::cerr << "ERROR: " << err.what() << ":\n"; + return -1; + } +} + +template +void ocl_ctx_t::process_buffers(T const *pa, T const *pb, T *pc, size_t sz) { + cl::sycl::range<1> numOfItems{sz}; + cl::sycl::buffer bufferA(pa, numOfItems); + cl::sycl::buffer bufferB(pb, numOfItems); + cl::sycl::buffer bufferC(pc, numOfItems); + + bufferA.set_final_data(nullptr); + bufferB.set_final_data(nullptr); + + deviceQueue.submit([&](cl::sycl::handler &cgh) { + auto A = bufferA.template get_access(cgh); + auto B = bufferB.template get_access(cgh); + auto C = bufferC.template get_access(cgh); + + auto kern = [ A, B, C ](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] { + + // declaration of temp within and outside the scope +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm("{\n" + ".decl temp v_type=G type=d num_elts=16 align=GRF\n" + "mov (M1, 16) temp(0, 0)<1> %1(0, 0)<1;1,0>\n" + "mov (M1, 16) %0(0, 0)<1> temp(0, 0)<1;1,0>\n" + "}\n" + ".decl temp v_type=G type=d num_elts=16 align=GRF\n" + "mul (M1, 16) temp(0, 0)<1> %2(0, 0)<1;1,0> %0(0, 0)<1;1,0>\n" + "mov (M1, 16) %0(0, 0)<1> temp(0, 0)<1;1,0>\n" + : "+rw"(C[wiID]) + : "rw"(A[wiID]), "rw"(B[wiID])); +#else + C[wiID] = A[wiID]; + C[wiID] *= B[wiID]; +#endif + }; + cgh.parallel_for>(numOfItems, kern); + }); +} diff --git a/sycl/test/inline-asm/asm_float_add.cpp b/sycl/test/inline-asm/asm_float_add.cpp new file mode 100644 index 000000000000..4e58bb3b12d3 --- /dev/null +++ b/sycl/test/inline-asm/asm_float_add.cpp @@ -0,0 +1,94 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out + +#include "include/asmcheck.h" +#include +#include +#include +#include + +constexpr auto sycl_read = cl::sycl::access::mode::read; +constexpr auto sycl_write = cl::sycl::access::mode::write; + +constexpr int LIST_SIZE = 16; +using arr_t = std::vector; + +// class is used for kernel name +template +class asm_float_add_kernel; + +class ocl_ctx_t { + cl::sycl::queue deviceQueue; + +public: + ocl_ctx_t(const cl::sycl::device_selector &sel) : deviceQueue(sel) {} + + template + void process_buffers(T const *pa, T const *pb, T *pc, size_t sz); +}; + +int main() { + arr_t A(LIST_SIZE), B(LIST_SIZE), C(LIST_SIZE); + + try { + cl::sycl::gpu_selector gpsel; + ocl_ctx_t ct{gpsel}; + + cl::sycl::queue deviceQueue(gpsel); + sycl::device Device = deviceQueue.get_device(); + + if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) { + std::cout << "Skipping test\n"; + return 0; + } + + for (int i = 0; i < LIST_SIZE; i++) { + A[i] = (double)1 / std::pow(2, i); + B[i] = (double)2 / std::pow(2, i); + } + + ct.process_buffers(A.data(), B.data(), C.data(), LIST_SIZE); + + for (int i = 0; i < LIST_SIZE; i++) + if (C[i] != A[i] + B[i]) { + std::cerr << "At index: " << i << ". "; + std::cerr << C[i] << " != " << A[i] + B[i] << "\n"; + abort(); + } + + std::cout << "Everything is correct" << std::endl; + } catch (cl::sycl::exception const &err) { + std::cerr << "ERROR: " << err.what() << ":\n"; + return -1; + } +} + +template +void ocl_ctx_t::process_buffers(T const *pa, T const *pb, T *pc, size_t sz) { + cl::sycl::range<1> numOfItems{sz}; + cl::sycl::buffer bufferA(pa, numOfItems); + cl::sycl::buffer bufferB(pb, numOfItems); + cl::sycl::buffer bufferC(pc, numOfItems); + + bufferA.set_final_data(nullptr); + bufferB.set_final_data(nullptr); + + deviceQueue.submit([&](cl::sycl::handler &cgh) { + auto A = bufferA.template get_access(cgh); + auto B = bufferB.template get_access(cgh); + auto C = bufferC.template get_access(cgh); + + auto kern = [ A, B, C ](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] { +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm("add (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>" + : "=rw"(C[wiID]) + : "rw"(A[wiID]), "rw"(B[wiID])); +#else + C[wiID] = A[wiID] + B[wiID]; +#endif + }; + cgh.parallel_for>(numOfItems, kern); + }); +} diff --git a/sycl/test/inline-asm/asm_float_imm_arg.cpp b/sycl/test/inline-asm/asm_float_imm_arg.cpp new file mode 100644 index 000000000000..2b3f05772953 --- /dev/null +++ b/sycl/test/inline-asm/asm_float_imm_arg.cpp @@ -0,0 +1,89 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out + +#include "include/asmcheck.h" +#include +#include +#include +#include +constexpr auto sycl_read = cl::sycl::access::mode::read; +constexpr auto sycl_write = cl::sycl::access::mode::write; + +constexpr int LIST_SIZE = 8; +constexpr double IMM_ARGUMENT = 0.5; + +using arr_t = std::vector; + +// class is used for kernel name +template +class float_imm_arg_kernel; + +class ocl_ctx_t { + cl::sycl::queue deviceQueue; + +public: + ocl_ctx_t(const cl::sycl::device_selector &sel) : deviceQueue(sel) {} + + template + void process_buffers(const T *pa, T *pb, size_t sz); +}; + +int main() { + arr_t A(LIST_SIZE), B(LIST_SIZE); + + try { + cl::sycl::gpu_selector gpsel; + ocl_ctx_t ct{gpsel}; + + cl::sycl::queue deviceQueue(gpsel); + sycl::device Device = deviceQueue.get_device(); + + if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) { + std::cout << "Skipping test\n"; + return 0; + } + + for (int i = 0; i < LIST_SIZE; i++) + A[i] = (double)1 / std::pow(2, i); + + ct.process_buffers(A.data(), B.data(), LIST_SIZE); + + for (int i = 0; i < LIST_SIZE; ++i) { + if (B[i] != A[i] * IMM_ARGUMENT) { + std::cerr << "At index: " << i << ". "; + std::cerr << B[i] << " != " << A[i] * IMM_ARGUMENT << "\n"; + abort(); + } + } + + std::cout << "Everything is correct" << std::endl; + } catch (cl::sycl::exception const &err) { + std::cerr << "ERROR: " << err.what() << ":\n"; + return -1; + } +} + +template +void ocl_ctx_t::process_buffers(const T *pa, T *pb, size_t sz) { + cl::sycl::range<1> numOfItems{sz}; + cl::sycl::buffer bufferA(pa, numOfItems); + cl::sycl::buffer bufferB(pb, numOfItems); + + deviceQueue.submit([&](cl::sycl::handler &cgh) { + auto A = bufferA.template get_access(cgh); + auto B = bufferB.template get_access(cgh); + + auto kern = [ A, B ](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] { +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm("mul (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0> %2" + : "=rw"(B[wiID]) + : "rw"(A[wiID]), "rw"(IMM_ARGUMENT)); +#else + B[wiID] = A[wiID] * IMM_ARGUMENT; +#endif + }; + cgh.parallel_for>(numOfItems, kern); + }); +} diff --git a/sycl/test/inline-asm/asm_float_neg.cpp b/sycl/test/inline-asm/asm_float_neg.cpp new file mode 100644 index 000000000000..f96ea348c468 --- /dev/null +++ b/sycl/test/inline-asm/asm_float_neg.cpp @@ -0,0 +1,89 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out + +#include "include/asmcheck.h" +#include +#include +#include + +constexpr auto sycl_read = cl::sycl::access::mode::read; +constexpr auto sycl_write = cl::sycl::access::mode::write; + +constexpr int LIST_SIZE = 1024; + +using arr_t = std::vector; + +// class is used for kernel name +template +class float_neg_kernel; + +class ocl_ctx_t { + cl::sycl::queue deviceQueue; + +public: + ocl_ctx_t(const cl::sycl::device_selector &sel) : deviceQueue(sel) {} + + template + void process_buffers(const T *pa, T *pb, size_t sz); +}; + +int main() { + arr_t A(LIST_SIZE), B(LIST_SIZE); + + try { + cl::sycl::gpu_selector gpsel; + ocl_ctx_t ct{gpsel}; + + cl::sycl::queue deviceQueue(gpsel); + sycl::device Device = deviceQueue.get_device(); + + if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) { + std::cout << "Skipping test\n"; + return 0; + } + + for (int i = 0; i < LIST_SIZE; i++) + A[i] = 1.0 / i; + + ct.process_buffers(A.data(), B.data(), LIST_SIZE); + + for (int i = 0; i < LIST_SIZE; ++i) + if (B[i] != -A[i]) { + std::cerr << "At index: " << i << ". "; + std::cerr << B[i] << " != " << -A[i] << "\n"; + abort(); + } + + std::cout << "Everything is correct" << std::endl; + } catch (cl::sycl::exception const &err) { + std::cerr << "ERROR: " << err.what() << ":\n"; + return -1; + } +} + +template +void ocl_ctx_t::process_buffers(const T *pa, T *pb, size_t sz) { + cl::sycl::range<1> numOfItems{sz}; + cl::sycl::buffer bufferA(pa, numOfItems); + cl::sycl::buffer bufferB(pb, numOfItems); + + bufferA.set_final_data(nullptr); + + deviceQueue.submit([&](cl::sycl::handler &cgh) { + auto A = bufferA.template get_access(cgh); + auto B = bufferB.template get_access(cgh); + + auto kern = [ A, B ](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] { +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm("mov (M1, 8) %0(0, 0)<1> (-)%1(0, 0)<1;1,0>" + : "=rw"(B[wiID]) + : "rw"(A[wiID])); +#else + B[wiID] = -A[wiID]; +#endif + }; + cgh.parallel_for>(numOfItems, kern); + }); +} diff --git a/sycl/test/inline-asm/asm_imm_arg.cpp b/sycl/test/inline-asm/asm_imm_arg.cpp new file mode 100644 index 000000000000..361bcc4c48d1 --- /dev/null +++ b/sycl/test/inline-asm/asm_imm_arg.cpp @@ -0,0 +1,88 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out + +#include "include/asmcheck.h" +#include +#include +#include + +constexpr auto sycl_read = cl::sycl::access::mode::read; +constexpr auto sycl_write = cl::sycl::access::mode::write; + +constexpr int LIST_SIZE = 1024 * 1024; +constexpr int CONST_ARGUMENT = 0xabc; + +using arr_t = std::vector; + +// class is used for kernel name +template +class const_asm_arg_kernel; + +class ocl_ctx_t { + cl::sycl::queue deviceQueue; + +public: + ocl_ctx_t(const cl::sycl::device_selector &sel) : deviceQueue(sel) {} + + template + void process_buffers(const T *pa, T *pb, size_t sz); +}; + +int main() { + arr_t A(LIST_SIZE), B(LIST_SIZE); + + try { + cl::sycl::gpu_selector gpsel; + ocl_ctx_t ct{gpsel}; + + cl::sycl::queue deviceQueue(gpsel); + sycl::device Device = deviceQueue.get_device(); + + if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) { + std::cout << "Skipping test\n"; + return 0; + } + + for (int i = 0; i < LIST_SIZE; i++) + A[i] = i; + + ct.process_buffers(A.data(), B.data(), LIST_SIZE); + + for (int i = 0; i < LIST_SIZE; ++i) + if (B[i] != A[i] + CONST_ARGUMENT) { + std::cerr << "At index: " << i << ". "; + std::cerr << B[i] << " != " << A[i] + CONST_ARGUMENT << "\n"; + abort(); + } + + std::cout << "Everything is correct" << std::endl; + } catch (cl::sycl::exception const &err) { + std::cerr << "ERROR: " << err.what() << ":\n"; + return -1; + } +} + +template +void ocl_ctx_t::process_buffers(const T *pa, T *pb, size_t sz) { + cl::sycl::range<1> numOfItems{sz}; + cl::sycl::buffer bufferA(pa, numOfItems); + cl::sycl::buffer bufferB(pb, numOfItems); + + deviceQueue.submit([&](cl::sycl::handler &cgh) { + auto A = bufferA.template get_access(cgh); + auto B = bufferB.template get_access(cgh); + + auto kern = [ A, B ](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] { +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm("add (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0> %2" + : "=rw"(B[wiID]) + : "rw"(A[wiID]), "rw"(CONST_ARGUMENT)); +#else + B[wiID] = A[wiID] + CONST_ARGUMENT; +#endif + }; + cgh.parallel_for>(numOfItems, kern); + }); +} diff --git a/sycl/test/inline-asm/asm_mul.cpp b/sycl/test/inline-asm/asm_mul.cpp new file mode 100644 index 000000000000..15fd7126e7d2 --- /dev/null +++ b/sycl/test/inline-asm/asm_mul.cpp @@ -0,0 +1,93 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out + +#include "include/asmcheck.h" +#include +#include +#include + +constexpr auto sycl_read = cl::sycl::access::mode::read; +constexpr auto sycl_write = cl::sycl::access::mode::write; + +constexpr int LIST_SIZE = 1024 * 1024; +using arr_t = std::vector; + +// class is used for kernel name +template +class vector_mul; + +class ocl_ctx_t { + cl::sycl::queue deviceQueue; + +public: + ocl_ctx_t(const cl::sycl::device_selector &sel) : deviceQueue(sel) {} + + template + void process_buffers(T const *pa, T const *pb, T *pc, size_t sz); +}; + +int main() { + arr_t A(LIST_SIZE), B(LIST_SIZE), C(LIST_SIZE); + + try { + cl::sycl::gpu_selector gpsel; + ocl_ctx_t ct{gpsel}; + + cl::sycl::queue deviceQueue(gpsel); + sycl::device Device = deviceQueue.get_device(); + + if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) { + std::cout << "Skipping test\n"; + return 0; + } + + for (int i = 0; i < LIST_SIZE; i++) { + A[i] = i; + B[i] = LIST_SIZE - i; + } + + ct.process_buffers(A.data(), B.data(), C.data(), LIST_SIZE); + + for (int i = 0; i < LIST_SIZE; ++i) + if (C[i] != A[i] * B[i]) { + std::cerr << "At index: " << i << ". "; + std::cerr << C[i] << " != " << A[i] * B[i] << "\n"; + abort(); + } + + std::cout << "Everything is correct" << std::endl; + } catch (cl::sycl::exception const &err) { + std::cerr << "ERROR: " << err.what() << ":\n"; + return -1; + } +} + +template +void ocl_ctx_t::process_buffers(T const *pa, T const *pb, T *pc, size_t sz) { + cl::sycl::range<1> numOfItems{sz}; + cl::sycl::buffer bufferA(pa, numOfItems); + cl::sycl::buffer bufferB(pb, numOfItems); + cl::sycl::buffer bufferC(pc, numOfItems); + + bufferA.set_final_data(nullptr); + bufferB.set_final_data(nullptr); + + deviceQueue.submit([&](cl::sycl::handler &cgh) { + auto A = bufferA.template get_access(cgh); + auto B = bufferB.template get_access(cgh); + auto C = bufferC.template get_access(cgh); + + auto kern = [ A, B, C ](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] { +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm("mul (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>" + : "=rw"(C[wiID]) + : "rw"(A[wiID]), "rw"(B[wiID])); +#else + C[wiID] = A[wiID] * B[wiID]; +#endif + }; + cgh.parallel_for>(numOfItems, kern); + }); +} diff --git a/sycl/test/inline-asm/asm_multiple_instructions.cpp b/sycl/test/inline-asm/asm_multiple_instructions.cpp new file mode 100644 index 000000000000..edbd21cfae94 --- /dev/null +++ b/sycl/test/inline-asm/asm_multiple_instructions.cpp @@ -0,0 +1,105 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out + +#include "include/asmcheck.h" +#include +#include +#include + +constexpr auto sycl_read = cl::sycl::access::mode::read; +constexpr auto sycl_write = cl::sycl::access::mode::write; + +constexpr int LIST_SIZE = 1024; + +using arr_t = std::vector; + +// class is used for kernel name +template +class vector_mul; + +class ocl_ctx_t { + cl::sycl::queue deviceQueue; + +public: + ocl_ctx_t(const cl::sycl::device_selector &sel) : deviceQueue(sel) {} + + template + void process_buffers(T const *pa, T const *pb, T const *pc, T *pd, size_t sz); +}; + +int main() { + arr_t A(LIST_SIZE), B(LIST_SIZE), C(LIST_SIZE), D(LIST_SIZE); + + try { + cl::sycl::gpu_selector gpsel; + ocl_ctx_t ct{gpsel}; + + cl::sycl::queue deviceQueue(gpsel); + sycl::device Device = deviceQueue.get_device(); + + if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) { + std::cout << "Skipping test\n"; + return 0; + } + + for (int i = 0; i < LIST_SIZE; i++) { + A[i] = B[i] = i; + C[i] = LIST_SIZE - 2 * i; // A[i] + B[i] + C[i] = LIST_SIZE + } + + ct.process_buffers(A.data(), B.data(), C.data(), D.data(), LIST_SIZE); + + for (int i = 0; i < LIST_SIZE; ++i) + if (D[i] != LIST_SIZE) { + std::cerr << "At index: " << i << ". "; + std::cerr << D[i] << " != " << LIST_SIZE << "\n"; + abort(); + } + + std::cout << "Everything is correct" << std::endl; + } catch (cl::sycl::exception const &err) { + std::cerr << "ERROR: " << err.what() << ":\n"; + return -1; + } +} + +template +void ocl_ctx_t::process_buffers(T const *pa, T const *pb, T const *pc, T *pd, + size_t sz) { + cl::sycl::range<1> numOfItems{sz}; + cl::sycl::buffer bufferA(pa, numOfItems); + cl::sycl::buffer bufferB(pb, numOfItems); + cl::sycl::buffer bufferC(pc, numOfItems); + cl::sycl::buffer bufferD(pd, numOfItems); + + bufferA.set_final_data(nullptr); + bufferB.set_final_data(nullptr); + bufferC.set_final_data(nullptr); + + deviceQueue.submit([&](cl::sycl::handler &cgh) { + auto A = bufferA.template get_access(cgh); + auto B = bufferB.template get_access(cgh); + auto C = bufferC.template get_access(cgh); + auto D = bufferD.template get_access(cgh); + + auto kern = [ A, B, C, D ](cl::sycl::id<1> wiID) + [[cl::intel_reqd_sub_group_size(8)]] { +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm("{\n" + "add (M1, 8) %1(0, 0)<1> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>\n" + "add (M1, 8) %1(0, 0)<1> %1(0, 0)<1;1,0> %3(0, 0)<1;1,0>\n" + "mov (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0>\n" + "}\n" + : "=rw"(D[wiID]), "+rw"(A[wiID]) + : "rw"(B[wiID]), "rw"(C[wiID])); +#else + A[wiID] += B[wiID]; + A[wiID] += C[wiID]; + D[wiID] = A[wiID]; +#endif + }; + cgh.parallel_for>(numOfItems, kern); + }); +} diff --git a/sycl/test/inline-asm/asm_no_operands.cpp b/sycl/test/inline-asm/asm_no_operands.cpp new file mode 100644 index 000000000000..f7abfd523928 --- /dev/null +++ b/sycl/test/inline-asm/asm_no_operands.cpp @@ -0,0 +1,32 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out + +#include "include/asmcheck.h" +#include +class no_operands_kernel; + +int main() { + // Creating SYCL queue + cl::sycl::queue Queue; + sycl::device Device = Queue.get_device(); + + if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) { + std::cout << "Skipping test\n"; + return 0; + } + // Size of index space for kernel + cl::sycl::range<1> NumOfWorkItems{15}; + + // Submitting command group(work) to queue + Queue.submit([&](cl::sycl::handler &cgh) { + // Executing kernel + cgh.parallel_for( + NumOfWorkItems, [=](cl::sycl::id<1> WIid) [[cl::intel_reqd_sub_group_size(8)]] { +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm("barrier"); +#endif + }); + }); +} diff --git a/sycl/test/inline-asm/asm_no_output.cpp b/sycl/test/inline-asm/asm_no_output.cpp new file mode 100644 index 000000000000..2440fd1c17a2 --- /dev/null +++ b/sycl/test/inline-asm/asm_no_output.cpp @@ -0,0 +1,78 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out + +#include "include/asmcheck.h" +#include +#include +#include + +constexpr int LIST_SIZE = 8; +using arr_t = std::vector; +constexpr auto sycl_read = cl::sycl::access::mode::read; +constexpr auto sycl_write = cl::sycl::access::mode::write; + +// class is used for kernel name +template +class asm_no_output; + +template +void process_buffers(cl::sycl::queue &deviceQueue, T *pc, size_t sz) { + cl::sycl::range<1> numOfItems{sz}; + cl::sycl::buffer bufferC(pc, numOfItems); + + deviceQueue.submit([&](cl::sycl::handler &cgh) { + auto C = bufferC.template get_access(cgh); + + auto kern = [C]() + [[cl::intel_reqd_sub_group_size(16)]] { + volatile int local_var = 47; + local_var += C[0]; +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm volatile("{\n" + ".decl temp v_type=G type=w num_elts=8 align=GRF\n" + "mov (M1,16) temp(0, 0)<1> %0(0,0)<1;1,0>\n" + "}\n" ::"rw"(local_var)); +#else + volatile int temp = 0; + temp = local_var; +#endif + }; + cgh.single_task>(kern); + }); +}; + +int main() { + arr_t C(LIST_SIZE); + + cl::sycl::gpu_selector gpsel; + cl::sycl::queue deviceQueue(gpsel); + sycl::device Device = deviceQueue.get_device(); + + if (!isInlineASMSupported(Device)) { + std::cout << "Skipping test\n"; + return 0; + } + + for (int i = 0; i < LIST_SIZE; i++) { + C[i] = 0; + } + + process_buffers(deviceQueue, C.data(), LIST_SIZE); + + bool all_right = true; + for (int i = 0; i < LIST_SIZE; ++i) + if (C[i] != 0) { + std::cerr << "At index: " << i << ". "; + std::cerr << C[i] << " != " << 0 << "\n"; + all_right = false; + break; + } + if (all_right) { + std::cout << "Pass" << std::endl; + return 0; + } + std::cout << "Error" << std::endl; + return -1; +} diff --git a/sycl/test/inline-asm/asm_plus_mod.cpp b/sycl/test/inline-asm/asm_plus_mod.cpp new file mode 100644 index 000000000000..d460406c71cd --- /dev/null +++ b/sycl/test/inline-asm/asm_plus_mod.cpp @@ -0,0 +1,87 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out + +#include "include/asmcheck.h" +#include +#include +#include + +constexpr auto sycl_read = cl::sycl::access::mode::read; +constexpr auto sycl_write = cl::sycl::access::mode::write; + +constexpr int LIST_SIZE = 1024 * 1024; +using arr_t = std::vector; + +// class is used for kernel name +template +class plus_mod_kernel; + +class ocl_ctx_t { + cl::sycl::queue deviceQueue; + +public: + ocl_ctx_t(const cl::sycl::device_selector &sel) : deviceQueue(sel) {} + + template + void process_buffers(T const *pa, T *pb, size_t sz); +}; + +int main() { + arr_t A(LIST_SIZE), B(LIST_SIZE), C(LIST_SIZE); + + try { + cl::sycl::gpu_selector gpsel; + ocl_ctx_t ct{gpsel}; + cl::sycl::queue deviceQueue(gpsel); + sycl::device Device = deviceQueue.get_device(); + + if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) { + std::cout << "Skipping test\n"; + return 0; + } + for (int i = 0; i < LIST_SIZE; i++) { + A[i] = i; + B[i] = LIST_SIZE - i; + C[i] = A[i] + B[i]; + } + + ct.process_buffers(A.data(), B.data(), LIST_SIZE); + + for (int i = 0; i < LIST_SIZE; ++i) + if (B[i] != C[i]) { + std::cerr << "At index: " << i << ". "; + std::cerr << B[i] << " != " << C[i] << "\n"; + abort(); + } + + std::cout << "Everything is correct" << std::endl; + } catch (cl::sycl::exception const &err) { + std::cerr << "ERROR: " << err.what() << ":\n"; + return -1; + } +} + +template +void ocl_ctx_t::process_buffers(T const *pa, T *pb, size_t sz) { + cl::sycl::range<1> numOfItems{sz}; + cl::sycl::buffer bufferA(pa, numOfItems); + cl::sycl::buffer bufferB(pb, numOfItems); + + deviceQueue.submit([&](cl::sycl::handler &cgh) { + auto A = bufferA.template get_access(cgh); + auto B = bufferB.template get_access(cgh); + + auto kern = [ A, B ](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] { +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm("add (M1, 16) %0(0, 0)<1> %0(0, 0)<1;1,0> %1(0, 0)<1;1,0>" + : "+rw"(B[wiID]) + : "rw"(A[wiID])); +#else + B[wiID] += A[wiID]; +#endif + }; + cgh.parallel_for>(numOfItems, kern); + }); +} diff --git a/sycl/test/inline-asm/include/asmcheck.h b/sycl/test/inline-asm/include/asmcheck.h new file mode 100644 index 000000000000..2031455e214c --- /dev/null +++ b/sycl/test/inline-asm/include/asmcheck.h @@ -0,0 +1,17 @@ +#include + +#include +bool isInlineASMSupported(sycl::device Device) { + + sycl::string_class DriverVersion = Device.get_info(); + sycl::string_class DeviceVendorName = Device.get_info(); + if (DeviceVendorName.find("Intel") == sycl::string_class::npos) + return false; + if (DriverVersion.length() < 5) + return false; + if (DriverVersion[2] != '.') + return false; + if (std::stoi(DriverVersion.substr(0, 2), nullptr, 10) < 20 || std::stoi(DriverVersion.substr(3, 2), nullptr, 10) < 12) + return false; + return true; +} diff --git a/sycl/test/inline-asm/letter_example.cpp b/sycl/test/inline-asm/letter_example.cpp new file mode 100644 index 000000000000..5b9bac0e22d2 --- /dev/null +++ b/sycl/test/inline-asm/letter_example.cpp @@ -0,0 +1,59 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out + +#include "include/asmcheck.h" +#include +#include +#define N 100 +using namespace cl::sycl; +int main() { + int *a; + queue q; + sycl::device Device = q.get_device(); + + if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) { + std::cout << "Skipping test\n"; + return 0; + } + auto ctx = q.get_context(); + a = (int *)malloc_shared(sizeof(int) * N, q.get_device(), ctx); + for (int i = 0; i < N; i++) + a[i] = i; + q.submit([&](handler &cgh) { + cgh.parallel_for( + range<1>(N), [=](id<1> idx) + [[cl::intel_reqd_sub_group_size(16)]] { +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + int i = idx[0]; + asm volatile("{\n.decl V52 v_type=G type=d num_elts=16 align=GRF\n" + "svm_gather.4.1 (M1, 16) %0.0 V52.0\n" + "add(M1, 16) V52(0, 0)<1> V52(0, 0)<1; 1, 0> 0x1:w\n" + "svm_scatter.4.1 (M1, 16) %0.0 V52.0\n}" + : + : "rw"(&a[i])); +#else + a[idx[0]]++; +#endif + }); + }).wait(); + + bool currect = true; + for (int i = 0; i < N; i++) { + if (a[i] != (i + 1)) { + currect = false; + std::cerr << "error in a[" << i << "]=" + << a[i] << "!=" << (i + 1) << std::endl; + break; + } + } + if (!currect) { + std::cerr << "Error" << std::endl; + cl::sycl::free(a, ctx); + return -1; + } + std::cerr << "Pass" << std::endl; + cl::sycl::free(a, ctx); + return 0; +} diff --git a/sycl/test/inline-asm/malloc_shared_32.cpp b/sycl/test/inline-asm/malloc_shared_32.cpp new file mode 100644 index 000000000000..d521afabaaca --- /dev/null +++ b/sycl/test/inline-asm/malloc_shared_32.cpp @@ -0,0 +1,87 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out + +#include "include/asmcheck.h" +#include +#include +#define N 1000 +using namespace cl::sycl; +int main() { + int *a; + int *b; + int *c; + queue q; + + sycl::device Device = q.get_device(); + + if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) { + std::cout << "Skipping test\n"; + return 0; + } + + auto ctx = q.get_context(); + a = (int *)malloc_shared(sizeof(int) * N, q.get_device(), ctx); + b = (int *)malloc_shared(sizeof(int) * N, q.get_device(), ctx); + c = (int *)malloc_shared(sizeof(int) * N, q.get_device(), ctx); + for (int i = 0; i < N; i++) { + b[i] = -10; + a[i] = i; + c[i] = i; + } + q.submit([&](handler &cgh) { + cgh.parallel_for( + range<1>(N), + [=](id<1> idx) + [[cl::intel_reqd_sub_group_size(32)]] { + int i = idx[0]; +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm volatile(R"a( + { + .decl V52 v_type=G type=d num_elts=16 align=GRF + .decl V53 v_type=G type=d num_elts=16 align=GRF + .decl V54 v_type=G type=d num_elts=16 align=GRF + .decl V55 v_type=G type=d num_elts=16 align=GRF + .decl V56 v_type=G type=d num_elts=16 align=GRF + .decl V57 v_type=G type=d num_elts=16 align=GRF + svm_gather.4.1 (M1, 16) %2.0 V54.0 + svm_gather.4.1 (M1, 16) %3.0 V55.0 + svm_gather.4.1 (M1, 16) %4.0 V56.0 + svm_gather.4.1 (M1, 16) %5.0 V57.0 + mul (M1, 16) V52(0,0)<1> V54(0,0)<1;1,0> V56(0,0)<1;1,0> + mul (M1, 16) V53(0,0)<1> V55(0,0)<1;1,0> V57(0,0)<1;1,0> + svm_scatter.4.1 (M1, 16) %0.0 V52.0 + svm_scatter.4.1 (M1, 16) %1.0 V53.0 + } + )a" ::"rw"(&b[i]), + "rw"(&b[i] + 16), "rw"(&a[i]), "rw"(&a[i] + 16), "rw"(&c[i]), + "rw"(&c[i] + 16)); +#else + b[i] = a[i] * c[i]; +#endif + }); + }).wait(); + + bool currect = true; + for (int i = 0; i < N; i++) { + if (b[i] != a[i] * b[i]) { + currect = false; + std::cerr << "error in a[" << i << "]=" + << b[i] << "!=" << a[i] * b[i] << std::endl; + break; + } + } + if (!currect) { + std::cerr << "Error" << std::endl; + cl::sycl::free(a, ctx); + cl::sycl::free(b, ctx); + cl::sycl::free(c, ctx); + return -1; + } + std::cerr << "Pass" << std::endl; + cl::sycl::free(a, ctx); + cl::sycl::free(b, ctx); + cl::sycl::free(c, ctx); + return 0; +} diff --git a/sycl/test/inline-asm/malloc_shared_in_out_dif.cpp b/sycl/test/inline-asm/malloc_shared_in_out_dif.cpp new file mode 100644 index 000000000000..fc0fa139036f --- /dev/null +++ b/sycl/test/inline-asm/malloc_shared_in_out_dif.cpp @@ -0,0 +1,65 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out + +#include "include/asmcheck.h" +#include +#include +#define N 100 +using namespace cl::sycl; +int main() { + int *a; + int *b; + queue q; + + sycl::device Device = q.get_device(); + + if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) { + std::cout << "Skipping test\n"; + return 0; + } + + auto ctx = q.get_context(); + a = (int *)malloc_shared(sizeof(int) * N, q.get_device(), ctx); + b = (int *)malloc_shared(sizeof(int) * N, q.get_device(), ctx); + for (int i = 0; i < N; i++) { + b[i] = -1; + a[i] = i; + } + q.submit([&](handler &cgh) { + cgh.parallel_for( + range<1>(N), [=](id<1> idx) [[cl::intel_reqd_sub_group_size(16)]] { + int i = idx[0]; + volatile int tmp = a[i]; + tmp += 1; +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm volatile(" add (M1, 16) %0(0,0)<1> %0(0,0)<1;1,0> %1(0,0)<1;1,0>" + : "+rw"(b[i]) + : "rw"(tmp)); +#else + b[i] += tmp; +#endif + }); + }).wait(); + + bool currect = true; + for (int i = 0; i < N; i++) { + if (b[i] != a[i]) { + currect = false; + std::cerr << "error in a[" << i << "]=" + << b[i] << "!=" << a[i] << std::endl; + break; + } + } + if (!currect) { + std::cerr << "Error" << std::endl; + cl::sycl::free(a, ctx); + cl::sycl::free(b, ctx); + return -1; + } + std::cerr << "Pass" << std::endl; + cl::sycl::free(a, ctx); + cl::sycl::free(b, ctx); + return 0; +} diff --git a/sycl/test/inline-asm/malloc_shared_no_input.cpp b/sycl/test/inline-asm/malloc_shared_no_input.cpp new file mode 100644 index 000000000000..f2db2b8e7c9f --- /dev/null +++ b/sycl/test/inline-asm/malloc_shared_no_input.cpp @@ -0,0 +1,54 @@ +// UNSUPPORTED: cuda +// REQUIRES: gpu,linux +// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out +// RUN: %t.out + +#include "include/asmcheck.h" +#include +#include +#define N 100 +using namespace cl::sycl; +int main() { + int *a; + queue q; + sycl::device Device = q.get_device(); + + if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) { + std::cout << "Skipping test\n"; + return 0; + } + auto ctx = q.get_context(); + a = (int *)malloc_shared(sizeof(int) * N, q.get_device(), ctx); + for (int i = 0; i < N; i++) + a[i] = i; + q.submit([&](handler &cgh) { + cgh.parallel_for( + range<1>(N), [=](id<1> idx) [[cl::intel_reqd_sub_group_size(16)]] { + int i = idx[0]; +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm volatile("mov (M1, 16) %0(0,0)<1> 0x7:d" + : "=rw"(a[i])); +#else + a[i] = 7; +#endif + }); + }).wait(); + + bool currect = true; + for (int i = 0; i < N; i++) { + if (a[i] != 7) { + currect = false; + std::cerr << "error in a[" << i << "]=" + << a[i] << "!=" << 7 << std::endl; + break; + } + } + if (!currect) { + std::cerr << "Error" << std::endl; + cl::sycl::free(a, ctx); + return -1; + } + std::cerr << "Pass" << std::endl; + cl::sycl::free(a, ctx); + return 0; +} From aa01a1dfafb8913d8050937e5ab96a0b426ce30f Mon Sep 17 00:00:00 2001 From: Alexey Sachkov Date: Wed, 15 Apr 2020 12:58:57 +0300 Subject: [PATCH 02/10] [SYCL] Refactor inline asm tests to reduce code duplication Signed-off-by: Alexey Sachkov --- sycl/test/inline-asm/asm_16_empty.cpp | 73 +++------- sycl/test/inline-asm/asm_16_matrix_mult.cpp | 75 ++++------ sycl/test/inline-asm/asm_16_no_input_int.cpp | 75 ++++------ sycl/test/inline-asm/asm_16_no_opts.cpp | 76 ++++------ sycl/test/inline-asm/asm_8_empty.cpp | 72 +++------- sycl/test/inline-asm/asm_8_no_input_int.cpp | 77 ++++------- .../inline-asm/asm_arbitrary_ops_order.cpp | 117 ++++++---------- sycl/test/inline-asm/asm_decl_in_scope.cpp | 130 +++++++----------- sycl/test/inline-asm/asm_float_add.cpp | 109 +++++---------- sycl/test/inline-asm/asm_float_imm_arg.cpp | 105 +++++--------- sycl/test/inline-asm/asm_float_neg.cpp | 106 +++++--------- sycl/test/inline-asm/asm_imm_arg.cpp | 105 +++++--------- sycl/test/inline-asm/asm_mul.cpp | 110 +++++---------- .../inline-asm/asm_multiple_instructions.cpp | 124 ++++++----------- sycl/test/inline-asm/asm_no_operands.cpp | 10 +- sycl/test/inline-asm/asm_no_output.cpp | 83 ++++------- sycl/test/inline-asm/asm_plus_mod.cpp | 103 +++++--------- sycl/test/inline-asm/include/asmcheck.h | 17 --- sycl/test/inline-asm/include/asmhelper.h | 128 +++++++++++++++++ sycl/test/inline-asm/letter_example.cpp | 53 +++---- sycl/test/inline-asm/malloc_shared_32.cpp | 41 +++--- .../inline-asm/malloc_shared_in_out_dif.cpp | 34 +++-- .../inline-asm/malloc_shared_no_input.cpp | 33 +++-- 23 files changed, 729 insertions(+), 1127 deletions(-) delete mode 100644 sycl/test/inline-asm/include/asmcheck.h create mode 100644 sycl/test/inline-asm/include/asmhelper.h diff --git a/sycl/test/inline-asm/asm_16_empty.cpp b/sycl/test/inline-asm/asm_16_empty.cpp index 7b551b55e8a1..ad4285e8ecbd 100644 --- a/sycl/test/inline-asm/asm_16_empty.cpp +++ b/sycl/test/inline-asm/asm_16_empty.cpp @@ -2,72 +2,39 @@ // REQUIRES: gpu,linux // RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out // RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out -#include "include/asmcheck.h" +#include "include/asmhelper.h" #include #include -#include #include -constexpr int LIST_SIZE = 1024; -using arr_t = std::vector; -constexpr auto sycl_write = cl::sycl::access::mode::write; +using dataType = cl::sycl::cl_int; -// class is used for kernel name -template -class no_opts; +template +struct KernelFunctor : WithOutputBuffer { + KernelFunctor(size_t problem_size) : WithOutputBuffer(problem_size) {} -template -void process_buffers(cl::sycl::queue &deviceQueue, T *pc, size_t sz) { - cl::sycl::range<1> numOfItems{sz}; - cl::sycl::buffer bufferC(pc, numOfItems); - - deviceQueue.submit([&](cl::sycl::handler &cgh) { - auto C = bufferC.template get_access(cgh); - - auto kern = [C](cl::sycl::id<1> wiID) - [[cl::intel_reqd_sub_group_size(16)]] { - C[wiID] = 43; + void operator()(cl::sycl::handler &cgh) { + auto C = this->getOutputBuffer().template get_access(cgh); + cgh.parallel_for>( + cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] { + C[wiID] = 43; #if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) - asm volatile(""); + asm volatile(""); #endif - }; - cgh.parallel_for>(numOfItems, kern); - }); + }); + } }; int main() { - arr_t C(LIST_SIZE); - - cl::sycl::gpu_selector gpsel; - cl::sycl::queue deviceQueue(gpsel); - - sycl::device Device = deviceQueue.get_device(); - - if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) { - std::cout << "Skipping test\n"; + KernelFunctor<> f(DEFAULT_PROBLEM_SIZE); + if (!launchInlineASMTest(f)) return 0; - } - for (int i = 0; i < LIST_SIZE; i++) { - C[i] = 0; - } - - process_buffers(deviceQueue, C.data(), LIST_SIZE); - - bool all_right = true; - - for (int i = 0; i < LIST_SIZE; ++i) - if (C[i] != 43) { - std::cerr << "At index: " << i << ". "; - std::cerr << C[i] << " != " << 43 << "\n"; - all_right = false; - break; - } - if (all_right) { - std::cout << "Pass" << std::endl; + if (verify_all_the_same(f.getOutputBufferData(), 43)) return 0; - } - std::cout << "Error" << std::endl; - return -1; + + return 1; } diff --git a/sycl/test/inline-asm/asm_16_matrix_mult.cpp b/sycl/test/inline-asm/asm_16_matrix_mult.cpp index 3de2b2966abc..6ae1debb6748 100644 --- a/sycl/test/inline-asm/asm_16_matrix_mult.cpp +++ b/sycl/test/inline-asm/asm_16_matrix_mult.cpp @@ -2,72 +2,43 @@ // REQUIRES: gpu,linux // RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out // RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out -#include "include/asmcheck.h" +#include "include/asmhelper.h" #include #include #include -constexpr int LIST_SIZE = 8; -using arr_t = std::vector; -constexpr auto sycl_write = cl::sycl::access::mode::write; +using dataType = cl::sycl::cl_int; -// class is used for kernel name -template -class simple_vector_add; +template +struct KernelFunctor : WithOutputBuffer { + KernelFunctor(size_t problem_size) : WithOutputBuffer(problem_size) {} -template -void process_buffers(cl::sycl::queue &deviceQueue, T *pc, size_t sz) { - cl::sycl::range<1> numOfItems{sz}; - cl::sycl::buffer bufferC(pc, numOfItems); - - deviceQueue.submit([&](cl::sycl::handler &cgh) { - auto C = bufferC.template get_access(cgh); - - auto kern = [C](cl::sycl::id<1> wiID) - [[cl::intel_reqd_sub_group_size(16)]] { - volatile int output = 0; + void operator()(cl::sycl::handler &cgh) { + auto C = this->getOutputBuffer().template get_access(cgh); + cgh.parallel_for>( + cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] { + volatile int output = 0; #if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) - asm volatile("mov (M1,16) %0(0,0)<1> 0x7:d" - : "=rw"(output)); + asm volatile("mov (M1,16) %0(0,0)<1> 0x7:d" + : "=rw"(output)); #else - output = 7; + output = 7; #endif - C[wiID] = output; - }; - cgh.parallel_for>(numOfItems, kern); - }); + C[wiID] = output; + }); + } }; int main() { - arr_t C(LIST_SIZE); - - cl::sycl::gpu_selector gpsel; - cl::sycl::queue deviceQueue(gpsel); - sycl::device Device = deviceQueue.get_device(); - - if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) { - std::cout << "Skipping test\n"; + KernelFunctor<> f(DEFAULT_PROBLEM_SIZE); + if (!launchInlineASMTest(f)) return 0; - } - for (int i = 0; i < LIST_SIZE; i++) { - C[i] = 0; - } - - process_buffers(deviceQueue, C.data(), LIST_SIZE); - bool all_right = true; - for (int i = 0; i < LIST_SIZE; ++i) - if (C[i] != 7) { - std::cerr << "At index: " << i << ". "; - std::cerr << C[i] << " != " << 7 << "\n"; - all_right = false; - break; - } - if (all_right) { - std::cout << "Pass" << std::endl; + if (verify_all_the_same(f.getOutputBufferData(), 7)) return 0; - } - std::cout << "Error" << std::endl; - return -1; + + return 1; } diff --git a/sycl/test/inline-asm/asm_16_no_input_int.cpp b/sycl/test/inline-asm/asm_16_no_input_int.cpp index 3de2b2966abc..6ae1debb6748 100644 --- a/sycl/test/inline-asm/asm_16_no_input_int.cpp +++ b/sycl/test/inline-asm/asm_16_no_input_int.cpp @@ -2,72 +2,43 @@ // REQUIRES: gpu,linux // RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out // RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out -#include "include/asmcheck.h" +#include "include/asmhelper.h" #include #include #include -constexpr int LIST_SIZE = 8; -using arr_t = std::vector; -constexpr auto sycl_write = cl::sycl::access::mode::write; +using dataType = cl::sycl::cl_int; -// class is used for kernel name -template -class simple_vector_add; +template +struct KernelFunctor : WithOutputBuffer { + KernelFunctor(size_t problem_size) : WithOutputBuffer(problem_size) {} -template -void process_buffers(cl::sycl::queue &deviceQueue, T *pc, size_t sz) { - cl::sycl::range<1> numOfItems{sz}; - cl::sycl::buffer bufferC(pc, numOfItems); - - deviceQueue.submit([&](cl::sycl::handler &cgh) { - auto C = bufferC.template get_access(cgh); - - auto kern = [C](cl::sycl::id<1> wiID) - [[cl::intel_reqd_sub_group_size(16)]] { - volatile int output = 0; + void operator()(cl::sycl::handler &cgh) { + auto C = this->getOutputBuffer().template get_access(cgh); + cgh.parallel_for>( + cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] { + volatile int output = 0; #if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) - asm volatile("mov (M1,16) %0(0,0)<1> 0x7:d" - : "=rw"(output)); + asm volatile("mov (M1,16) %0(0,0)<1> 0x7:d" + : "=rw"(output)); #else - output = 7; + output = 7; #endif - C[wiID] = output; - }; - cgh.parallel_for>(numOfItems, kern); - }); + C[wiID] = output; + }); + } }; int main() { - arr_t C(LIST_SIZE); - - cl::sycl::gpu_selector gpsel; - cl::sycl::queue deviceQueue(gpsel); - sycl::device Device = deviceQueue.get_device(); - - if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) { - std::cout << "Skipping test\n"; + KernelFunctor<> f(DEFAULT_PROBLEM_SIZE); + if (!launchInlineASMTest(f)) return 0; - } - for (int i = 0; i < LIST_SIZE; i++) { - C[i] = 0; - } - - process_buffers(deviceQueue, C.data(), LIST_SIZE); - bool all_right = true; - for (int i = 0; i < LIST_SIZE; ++i) - if (C[i] != 7) { - std::cerr << "At index: " << i << ". "; - std::cerr << C[i] << " != " << 7 << "\n"; - all_right = false; - break; - } - if (all_right) { - std::cout << "Pass" << std::endl; + if (verify_all_the_same(f.getOutputBufferData(), 7)) return 0; - } - std::cout << "Error" << std::endl; - return -1; + + return 1; } diff --git a/sycl/test/inline-asm/asm_16_no_opts.cpp b/sycl/test/inline-asm/asm_16_no_opts.cpp index cc2df932a7d8..4b6d5146fd6b 100644 --- a/sycl/test/inline-asm/asm_16_no_opts.cpp +++ b/sycl/test/inline-asm/asm_16_no_opts.cpp @@ -2,74 +2,44 @@ // REQUIRES: gpu,linux // RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out // RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out -#include "include/asmcheck.h" +#include "include/asmhelper.h" #include #include #include -constexpr int LIST_SIZE = 1024; -using arr_t = std::vector; -constexpr auto sycl_write = cl::sycl::access::mode::write; +using dataType = cl::sycl::cl_int; -// class is used for kernel name -template -class simple_vector_add; +template +struct KernelFunctor : WithOutputBuffer { + KernelFunctor(size_t problem_size) : WithOutputBuffer(problem_size) {} -template -void process_buffers(cl::sycl::queue &deviceQueue, T *pc, size_t sz) { - cl::sycl::range<1> numOfItems{sz}; - cl::sycl::buffer bufferC(pc, numOfItems); - - deviceQueue.submit([&](cl::sycl::handler &cgh) { - auto C = bufferC.template get_access(cgh); - - auto kern = [C](cl::sycl::id<1> wiID) - [[cl::intel_reqd_sub_group_size(16)]] { - for (int i = 0; i < 10; ++i) { + void operator()(cl::sycl::handler &cgh) { + auto C = this->getOutputBuffer().template get_access(cgh); + cgh.parallel_for>( + cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] { + for (int i = 0; i < 10; ++i) { #if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) - asm("fence_sw"); - C[wiID] += i; + asm("fence_sw"); + C[wiID] += i; #else - C[wiID] += i; + C[wiID] += i; #endif - } - }; - cgh.parallel_for>(numOfItems, kern); - }); + } + }); + } }; int main() { - arr_t C(LIST_SIZE); - - cl::sycl::gpu_selector gpsel; - cl::sycl::queue deviceQueue(gpsel); - sycl::device Device = deviceQueue.get_device(); - - if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) { - std::cout << "Skipping test\n"; + KernelFunctor<> f(DEFAULT_PROBLEM_SIZE); + if (!launchInlineASMTest(f)) return 0; - } - for (int i = 0; i < LIST_SIZE; i++) { - C[i] = 0; - } - - process_buffers(deviceQueue, C.data(), LIST_SIZE); - - bool all_right = true; - for (int i = 0; i < LIST_SIZE; ++i) - if (C[i] != 45) { - std::cerr << "At index: " << i << ". "; - std::cerr << C[i] << " != " << 45 << "\n"; - all_right = false; - break; - } - if (all_right) { - std::cout << "Pass" << std::endl; + if (verify_all_the_same(f.getOutputBufferData(), 45)) return 0; - } - std::cout << "Error" << std::endl; - return -1; + + return 1; } diff --git a/sycl/test/inline-asm/asm_8_empty.cpp b/sycl/test/inline-asm/asm_8_empty.cpp index a7f0b0bc6d4b..97fae0ed4eb2 100644 --- a/sycl/test/inline-asm/asm_8_empty.cpp +++ b/sycl/test/inline-asm/asm_8_empty.cpp @@ -2,71 +2,39 @@ // REQUIRES: gpu,linux // RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out // RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out -#include "include/asmcheck.h" +#include "include/asmhelper.h" #include #include #include -constexpr int LIST_SIZE = 1024; -using arr_t = std::vector; -constexpr auto sycl_write = cl::sycl::access::mode::write; +using dataType = cl::sycl::cl_int; -// class is used for kernel name -template -class no_opts; +template +struct KernelFunctor : WithOutputBuffer { + KernelFunctor(size_t problem_size) : WithOutputBuffer(problem_size) {} -template -void process_buffers(cl::sycl::queue &deviceQueue, T *pc, size_t sz) { - cl::sycl::range<1> numOfItems{sz}; - cl::sycl::buffer bufferC(pc, numOfItems); - - deviceQueue.submit([&](cl::sycl::handler &cgh) { - auto C = bufferC.template get_access(cgh); - - auto kern = [C](cl::sycl::id<1> wiID) - [[cl::intel_reqd_sub_group_size(8)]] { - C[wiID] = 43; + void operator()(cl::sycl::handler &cgh) { + auto C = this->getOutputBuffer().template get_access(cgh); + cgh.parallel_for>( + cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] { + C[wiID] = 43; #if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) - asm volatile(""); + asm volatile(""); #endif - }; - cgh.parallel_for>(numOfItems, kern); - }); + }); + } }; int main() { - arr_t C(LIST_SIZE); - - cl::sycl::gpu_selector gpsel; - cl::sycl::queue deviceQueue(gpsel); - - sycl::device Device = deviceQueue.get_device(); - - if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) { - std::cout << "Skipping test\n"; + KernelFunctor<> f(DEFAULT_PROBLEM_SIZE); + if (!launchInlineASMTest(f)) return 0; - } - for (int i = 0; i < LIST_SIZE; i++) { - C[i] = 0; - } - - process_buffers(deviceQueue, C.data(), LIST_SIZE); - - bool all_right = true; - - for (int i = 0; i < LIST_SIZE; ++i) - if (C[i] != 43) { - std::cerr << "At index: " << i << ". "; - std::cerr << C[i] << " != " << 43 << "\n"; - all_right = false; - break; - } - if (all_right) { - std::cout << "Pass" << std::endl; + if (verify_all_the_same(f.getOutputBufferData(), 43)) return 0; - } - std::cout << "Error" << std::endl; - return -1; + + return 1; } diff --git a/sycl/test/inline-asm/asm_8_no_input_int.cpp b/sycl/test/inline-asm/asm_8_no_input_int.cpp index a370f77b3ada..6d1dcbb832cf 100644 --- a/sycl/test/inline-asm/asm_8_no_input_int.cpp +++ b/sycl/test/inline-asm/asm_8_no_input_int.cpp @@ -2,74 +2,43 @@ // REQUIRES: gpu,linux // RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out // RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out -#include "include/asmcheck.h" +#include "include/asmhelper.h" #include #include #include -constexpr int LIST_SIZE = 8; -using arr_t = std::vector; -constexpr auto sycl_write = cl::sycl::access::mode::write; +using dataType = cl::sycl::cl_int; -// class is used for kernel name -template -class simple_vector_add; +template +struct KernelFunctor : WithOutputBuffer { + KernelFunctor(size_t problem_size) : WithOutputBuffer(problem_size) {} -template -void process_buffers(cl::sycl::queue &deviceQueue, T *pc, size_t sz) { - cl::sycl::range<1> numOfItems{sz}; - cl::sycl::buffer bufferC(pc, numOfItems); - - deviceQueue.submit([&](cl::sycl::handler &cgh) { - auto C = bufferC.template get_access(cgh); - - auto kern = [C](cl::sycl::id<1> wiID) - [[cl::intel_reqd_sub_group_size(8)]] { - volatile int output = 0; + void operator()(cl::sycl::handler &cgh) { + auto C = this->getOutputBuffer().template get_access(cgh); + cgh.parallel_for>( + cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] { + volatile int output = 0; #if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) - asm volatile("mov (M1,8) %0(0,0)<1> 0x7:d" - : "=rw"(output)); + asm volatile("mov (M1,8) %0(0,0)<1> 0x7:d" + : "=rw"(output)); #else - output = 7; + output = 7; #endif - C[wiID] = output; - }; - cgh.parallel_for>(numOfItems, kern); - }); + C[wiID] = output; + }); + } }; int main() { - arr_t C(LIST_SIZE); - - cl::sycl::gpu_selector gpsel; - cl::sycl::queue deviceQueue(gpsel); - - for (int i = 0; i < LIST_SIZE; i++) { - C[i] = 0; - } - - sycl::device Device = deviceQueue.get_device(); - - if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) { - std::cout << "Skipping test\n"; + KernelFunctor<> f(DEFAULT_PROBLEM_SIZE); + if (!launchInlineASMTest(f)) return 0; - } - process_buffers(deviceQueue, C.data(), LIST_SIZE); - - bool all_right = true; - for (int i = 0; i < LIST_SIZE; ++i) - if (C[i] != 7) { - std::cerr << "At index: " << i << ". "; - std::cerr << C[i] << " != " << 7 << "\n"; - all_right = false; - break; - } - if (all_right) { - std::cout << "Pass" << std::endl; + if (verify_all_the_same(f.getOutputBufferData(), 7)) return 0; - } - std::cout << "Error" << std::endl; - return -1; + + return 1; } diff --git a/sycl/test/inline-asm/asm_arbitrary_ops_order.cpp b/sycl/test/inline-asm/asm_arbitrary_ops_order.cpp index 1b75b63d0027..28d0af1d455b 100644 --- a/sycl/test/inline-asm/asm_arbitrary_ops_order.cpp +++ b/sycl/test/inline-asm/asm_arbitrary_ops_order.cpp @@ -2,97 +2,58 @@ // REQUIRES: gpu,linux // RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out // RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out -#include "include/asmcheck.h" +#include "include/asmhelper.h" #include #include #include -constexpr auto sycl_read = cl::sycl::access::mode::read; -constexpr auto sycl_write = cl::sycl::access::mode::write; +using dataType = cl::sycl::cl_int; -constexpr int LIST_SIZE = 1024; -using arr_t = std::vector; +template +struct KernelFunctor : WithInputBuffers, WithOutputBuffer { + KernelFunctor(const std::vector &input1, const std::vector &input2, const std::vector &input3) : WithInputBuffers(input1, input2, input3), WithOutputBuffer(input1.size()) {} -// class is used for kernel name -template -class vector_mad; + void operator()(cl::sycl::handler &cgh) { + auto A = this->getInputBuffer(0).template get_access(cgh); + auto B = this->getInputBuffer(1).template get_access(cgh); + auto C = this->getInputBuffer(2).template get_access(cgh); + auto D = this->getOutputBuffer().template get_access(cgh); -class ocl_ctx_t { - cl::sycl::queue deviceQueue; - -public: - ocl_ctx_t(const cl::sycl::device_selector &sel) : deviceQueue(sel) {} - - template - void process_buffers(T const *pa, T const *pb, const T *pc, T *pd, size_t sz); + cgh.parallel_for>( + cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] { +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm("mad (M1, 8) %0(0, 0)<1> %3(0, 0)<1;1,0> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>" + : "=rw"(D[wiID]) + : "rw"(B[wiID]), "rw"(C[wiID]), "rw"(A[wiID])); +#else + D[wiID] = A[wiID] * B[wiID] + C[wiID]; +#endif + }); + } }; int main() { - arr_t A(LIST_SIZE), B(LIST_SIZE), C(LIST_SIZE), D(LIST_SIZE); - - try { - cl::sycl::gpu_selector gpsel; - ocl_ctx_t ct{gpsel}; + std::vector inputA(DEFAULT_PROBLEM_SIZE), inputB(DEFAULT_PROBLEM_SIZE), inputC(DEFAULT_PROBLEM_SIZE); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) { + inputA[i] = i; + inputB[i] = i; + inputC[i] = DEFAULT_PROBLEM_SIZE - i * i; + } - cl::sycl::queue deviceQueue(gpsel); - sycl::device Device = deviceQueue.get_device(); + KernelFunctor<> f(inputA, inputB, inputC); + if (!launchInlineASMTest(f)) + return 0; - if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) { - std::cout << "Skipping test\n"; - return 0; + auto &D = f.getOutputBufferData(); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; ++i) { + if (D[i] != inputA[i] * inputB[i] + inputC[i]) { + std::cerr << "At index: " << i << ". "; + std::cerr << D[i] << " != " << inputA[i] * inputB[i] + inputC[i] << "\n"; + return 1; } - - for (int i = 0; i < LIST_SIZE; i++) { - A[i] = i; - B[i] = i; - C[i] = LIST_SIZE - i * i; - } - - ct.process_buffers(A.data(), B.data(), C.data(), D.data(), LIST_SIZE); - - for (int i = 0; i < LIST_SIZE; ++i) - if (D[i] != A[i] * B[i] + C[i]) { - std::cerr << "At index: " << i << ". "; - std::cerr << D[i] << " != " << A[i] * B[i] + C[i] << "\n"; - abort(); - } - - std::cout << "Everything is correct" << std::endl; - } catch (cl::sycl::exception const &err) { - std::cerr << "ERROR: " << err.what() << ":\n"; - return -1; } -} - -template -void ocl_ctx_t::process_buffers(T const *pa, T const *pb, const T *pc, T *pd, - size_t sz) { - cl::sycl::range<1> numOfItems{sz}; - cl::sycl::buffer bufferA(pa, numOfItems); - cl::sycl::buffer bufferB(pb, numOfItems); - cl::sycl::buffer bufferC(pc, numOfItems); - cl::sycl::buffer bufferD(pd, numOfItems); - - bufferA.set_final_data(nullptr); - bufferB.set_final_data(nullptr); - bufferC.set_final_data(nullptr); - - deviceQueue.submit([&](cl::sycl::handler &cgh) { - auto A = bufferA.template get_access(cgh); - auto B = bufferB.template get_access(cgh); - auto C = bufferC.template get_access(cgh); - auto D = bufferD.template get_access(cgh); - - auto kern = [ A, B, C, D ](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] { -#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) - asm("mad (M1, 8) %0(0, 0)<1> %3(0, 0)<1;1,0> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>" - : "=rw"(D[wiID]) - : "rw"(B[wiID]), "rw"(C[wiID]), "rw"(A[wiID])); -#else - D[wiID] = A[wiID] * B[wiID] + C[wiID]; -#endif - }; - cgh.parallel_for>(numOfItems, kern); - }); + return 0; } diff --git a/sycl/test/inline-asm/asm_decl_in_scope.cpp b/sycl/test/inline-asm/asm_decl_in_scope.cpp index 2164a602bae6..db30e20f5e9e 100644 --- a/sycl/test/inline-asm/asm_decl_in_scope.cpp +++ b/sycl/test/inline-asm/asm_decl_in_scope.cpp @@ -2,102 +2,66 @@ // REQUIRES: gpu,linux // RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out // RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out -#include "include/asmcheck.h" +#include "include/asmhelper.h" #include #include #include -constexpr auto sycl_read = cl::sycl::access::mode::read; -constexpr auto sycl_write = cl::sycl::access::mode::write; +using dataType = cl::sycl::cl_int; -constexpr int LIST_SIZE = 1024 * 1024; -using arr_t = std::vector; +template +struct KernelFunctor : WithInputBuffers, WithOutputBuffer { + KernelFunctor(const std::vector &input1, const std::vector &input2) : WithInputBuffers(input1, input2), WithOutputBuffer(input1.size()) {} -// class is used for kernel name -template -class decl_in_scope_kernel; + void operator()(cl::sycl::handler &cgh) { + auto A = this->getInputBuffer(0).template get_access(cgh); + auto B = this->getInputBuffer(1).template get_access(cgh); + auto C = this->getOutputBuffer().template get_access(cgh); -class ocl_ctx_t { - cl::sycl::queue deviceQueue; - -public: - ocl_ctx_t(const cl::sycl::device_selector &sel) : deviceQueue(sel) {} - - template - void process_buffers(T const *pa, T const *pb, T *pc, size_t sz); + cgh.parallel_for>( + cl::sycl::range<1>{this->getOutputBufferSize()}, + [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] { + // declaration of temp within and outside the scope +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm("{\n" + ".decl temp v_type=G type=d num_elts=16 align=GRF\n" + "mov (M1, 16) temp(0, 0)<1> %1(0, 0)<1;1,0>\n" + "mov (M1, 16) %0(0, 0)<1> temp(0, 0)<1;1,0>\n" + "}\n" + ".decl temp v_type=G type=d num_elts=16 align=GRF\n" + "mul (M1, 16) temp(0, 0)<1> %2(0, 0)<1;1,0> %0(0, 0)<1;1,0>\n" + "mov (M1, 16) %0(0, 0)<1> temp(0, 0)<1;1,0>\n" + : "+rw"(C[wiID]) + : "rw"(A[wiID]), "rw"(B[wiID])); +#else + C[wiID] = A[wiID]; + C[wiID] *= B[wiID]; +#endif + }); + } }; int main() { - arr_t A(LIST_SIZE), B(LIST_SIZE), C(LIST_SIZE); - - try { - cl::sycl::gpu_selector gpsel; - ocl_ctx_t ct{gpsel}; + std::vector inputA(DEFAULT_PROBLEM_SIZE), inputB(DEFAULT_PROBLEM_SIZE); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) { + inputA[i] = i; + inputB[i] = 2; + } - cl::sycl::queue deviceQueue(gpsel); - sycl::device Device = deviceQueue.get_device(); + KernelFunctor<> f(inputA, inputB); + if (!launchInlineASMTest(f)) + return 0; - if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) { - std::cout << "Skipping test\n"; - return 0; + auto &C = f.getOutputBufferData(); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; ++i) { + if (C[i] != inputA[i] * inputB[i]) { + std::cerr << "At index: " << i << ". "; + std::cerr << C[i] << " != " << inputA[i] * inputB[i] << "\n"; + return 1; } - - for (int i = 0; i < LIST_SIZE; i++) { - A[i] = i; - B[i] = 2; - } - - ct.process_buffers(A.data(), B.data(), C.data(), LIST_SIZE); - - for (int i = 0; i < LIST_SIZE; ++i) - if (C[i] != A[i] * B[i]) { - std::cerr << "At index: " << i << ". "; - std::cerr << C[i] << " != " << A[i] * B[i] << "\n"; - abort(); - } - - std::cout << "Everything is correct" << std::endl; - } catch (cl::sycl::exception const &err) { - std::cerr << "ERROR: " << err.what() << ":\n"; - return -1; } -} - -template -void ocl_ctx_t::process_buffers(T const *pa, T const *pb, T *pc, size_t sz) { - cl::sycl::range<1> numOfItems{sz}; - cl::sycl::buffer bufferA(pa, numOfItems); - cl::sycl::buffer bufferB(pb, numOfItems); - cl::sycl::buffer bufferC(pc, numOfItems); - - bufferA.set_final_data(nullptr); - bufferB.set_final_data(nullptr); - - deviceQueue.submit([&](cl::sycl::handler &cgh) { - auto A = bufferA.template get_access(cgh); - auto B = bufferB.template get_access(cgh); - auto C = bufferC.template get_access(cgh); - - auto kern = [ A, B, C ](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] { - - // declaration of temp within and outside the scope -#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) - asm("{\n" - ".decl temp v_type=G type=d num_elts=16 align=GRF\n" - "mov (M1, 16) temp(0, 0)<1> %1(0, 0)<1;1,0>\n" - "mov (M1, 16) %0(0, 0)<1> temp(0, 0)<1;1,0>\n" - "}\n" - ".decl temp v_type=G type=d num_elts=16 align=GRF\n" - "mul (M1, 16) temp(0, 0)<1> %2(0, 0)<1;1,0> %0(0, 0)<1;1,0>\n" - "mov (M1, 16) %0(0, 0)<1> temp(0, 0)<1;1,0>\n" - : "+rw"(C[wiID]) - : "rw"(A[wiID]), "rw"(B[wiID])); -#else - C[wiID] = A[wiID]; - C[wiID] *= B[wiID]; -#endif - }; - cgh.parallel_for>(numOfItems, kern); - }); + return 0; } diff --git a/sycl/test/inline-asm/asm_float_add.cpp b/sycl/test/inline-asm/asm_float_add.cpp index 4e58bb3b12d3..c23b084317c5 100644 --- a/sycl/test/inline-asm/asm_float_add.cpp +++ b/sycl/test/inline-asm/asm_float_add.cpp @@ -2,93 +2,58 @@ // REQUIRES: gpu,linux // RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out // RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out -#include "include/asmcheck.h" +#include "include/asmhelper.h" #include #include #include #include -constexpr auto sycl_read = cl::sycl::access::mode::read; -constexpr auto sycl_write = cl::sycl::access::mode::write; +using dataType = cl::sycl::cl_double; -constexpr int LIST_SIZE = 16; -using arr_t = std::vector; +template +struct KernelFunctor : WithInputBuffers, WithOutputBuffer { + KernelFunctor(const std::vector &input1, const std::vector &input2) : WithInputBuffers(input1, input2), WithOutputBuffer(input1.size()) {} -// class is used for kernel name -template -class asm_float_add_kernel; + void operator()(cl::sycl::handler &cgh) { + auto A = this->getInputBuffer(0).template get_access(cgh); + auto B = this->getInputBuffer(1).template get_access(cgh); + auto C = this->getOutputBuffer().template get_access(cgh); -class ocl_ctx_t { - cl::sycl::queue deviceQueue; - -public: - ocl_ctx_t(const cl::sycl::device_selector &sel) : deviceQueue(sel) {} - - template - void process_buffers(T const *pa, T const *pb, T *pc, size_t sz); + cgh.parallel_for>( + cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] { +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm("add (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>" + : "=rw"(C[wiID]) + : "rw"(A[wiID]), "rw"(B[wiID])); +#else + C[wiID] = A[wiID] + B[wiID]; +#endif + }); + } }; int main() { - arr_t A(LIST_SIZE), B(LIST_SIZE), C(LIST_SIZE); - - try { - cl::sycl::gpu_selector gpsel; - ocl_ctx_t ct{gpsel}; - - cl::sycl::queue deviceQueue(gpsel); - sycl::device Device = deviceQueue.get_device(); + std::vector inputA(DEFAULT_PROBLEM_SIZE), inputB(DEFAULT_PROBLEM_SIZE); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) { + inputA[i] = (double)1 / std::pow(2, i); + inputB[i] = (double)2 / std::pow(2, i); + } - if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) { - std::cout << "Skipping test\n"; - return 0; - } + KernelFunctor<> f(inputA, inputB); + if (!launchInlineASMTest(f)) + return 0; - for (int i = 0; i < LIST_SIZE; i++) { - A[i] = (double)1 / std::pow(2, i); - B[i] = (double)2 / std::pow(2, i); + auto &C = f.getOutputBufferData(); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) { + if (C[i] != inputA[i] + inputB[i]) { + std::cerr << "At index: " << i << ". "; + std::cerr << C[i] << " != " << inputA[i] + inputB[i] << "\n"; + return 1; } - - ct.process_buffers(A.data(), B.data(), C.data(), LIST_SIZE); - - for (int i = 0; i < LIST_SIZE; i++) - if (C[i] != A[i] + B[i]) { - std::cerr << "At index: " << i << ". "; - std::cerr << C[i] << " != " << A[i] + B[i] << "\n"; - abort(); - } - - std::cout << "Everything is correct" << std::endl; - } catch (cl::sycl::exception const &err) { - std::cerr << "ERROR: " << err.what() << ":\n"; - return -1; } -} - -template -void ocl_ctx_t::process_buffers(T const *pa, T const *pb, T *pc, size_t sz) { - cl::sycl::range<1> numOfItems{sz}; - cl::sycl::buffer bufferA(pa, numOfItems); - cl::sycl::buffer bufferB(pb, numOfItems); - cl::sycl::buffer bufferC(pc, numOfItems); - bufferA.set_final_data(nullptr); - bufferB.set_final_data(nullptr); - - deviceQueue.submit([&](cl::sycl::handler &cgh) { - auto A = bufferA.template get_access(cgh); - auto B = bufferB.template get_access(cgh); - auto C = bufferC.template get_access(cgh); - - auto kern = [ A, B, C ](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] { -#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) - asm("add (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>" - : "=rw"(C[wiID]) - : "rw"(A[wiID]), "rw"(B[wiID])); -#else - C[wiID] = A[wiID] + B[wiID]; -#endif - }; - cgh.parallel_for>(numOfItems, kern); - }); + return 0; } diff --git a/sycl/test/inline-asm/asm_float_imm_arg.cpp b/sycl/test/inline-asm/asm_float_imm_arg.cpp index 2b3f05772953..c9683cf020f7 100644 --- a/sycl/test/inline-asm/asm_float_imm_arg.cpp +++ b/sycl/test/inline-asm/asm_float_imm_arg.cpp @@ -2,88 +2,55 @@ // REQUIRES: gpu,linux // RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out // RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out -#include "include/asmcheck.h" +#include "include/asmhelper.h" #include #include #include #include -constexpr auto sycl_read = cl::sycl::access::mode::read; -constexpr auto sycl_write = cl::sycl::access::mode::write; -constexpr int LIST_SIZE = 8; constexpr double IMM_ARGUMENT = 0.5; +using dataType = cl::sycl::cl_double; -using arr_t = std::vector; +template +struct KernelFunctor : WithInputBuffers, WithOutputBuffer { + KernelFunctor(const std::vector &input) : WithInputBuffers(input), WithOutputBuffer(input.size()) {} -// class is used for kernel name -template -class float_imm_arg_kernel; + void operator()(cl::sycl::handler &cgh) { + auto A = this->getInputBuffer(0).template get_access(cgh); + auto B = this->getOutputBuffer().template get_access(cgh); -class ocl_ctx_t { - cl::sycl::queue deviceQueue; - -public: - ocl_ctx_t(const cl::sycl::device_selector &sel) : deviceQueue(sel) {} - - template - void process_buffers(const T *pa, T *pb, size_t sz); + cgh.parallel_for>( + cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] { +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm("mul (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0> %2" + : "=rw"(B[wiID]) + : "rw"(A[wiID]), "rw"(IMM_ARGUMENT)); +#else + B[wiID] = A[wiID] * IMM_ARGUMENT; +#endif + }); + } }; int main() { - arr_t A(LIST_SIZE), B(LIST_SIZE); - - try { - cl::sycl::gpu_selector gpsel; - ocl_ctx_t ct{gpsel}; - - cl::sycl::queue deviceQueue(gpsel); - sycl::device Device = deviceQueue.get_device(); - - if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) { - std::cout << "Skipping test\n"; - return 0; + std::vector input(DEFAULT_PROBLEM_SIZE); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) + input[i] = (double)1 / std::pow(2, i); + + KernelFunctor<> f(input); + if (!launchInlineASMTest(f)) + return 0; + + auto &B = f.getOutputBufferData(); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; ++i) { + if (B[i] != input[i] * IMM_ARGUMENT) { + std::cerr << "At index: " << i << ". "; + std::cerr << B[i] << " != " << input[i] * IMM_ARGUMENT << "\n"; + return 1; } - - for (int i = 0; i < LIST_SIZE; i++) - A[i] = (double)1 / std::pow(2, i); - - ct.process_buffers(A.data(), B.data(), LIST_SIZE); - - for (int i = 0; i < LIST_SIZE; ++i) { - if (B[i] != A[i] * IMM_ARGUMENT) { - std::cerr << "At index: " << i << ". "; - std::cerr << B[i] << " != " << A[i] * IMM_ARGUMENT << "\n"; - abort(); - } - } - - std::cout << "Everything is correct" << std::endl; - } catch (cl::sycl::exception const &err) { - std::cerr << "ERROR: " << err.what() << ":\n"; - return -1; } -} - -template -void ocl_ctx_t::process_buffers(const T *pa, T *pb, size_t sz) { - cl::sycl::range<1> numOfItems{sz}; - cl::sycl::buffer bufferA(pa, numOfItems); - cl::sycl::buffer bufferB(pb, numOfItems); - - deviceQueue.submit([&](cl::sycl::handler &cgh) { - auto A = bufferA.template get_access(cgh); - auto B = bufferB.template get_access(cgh); - - auto kern = [ A, B ](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] { -#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) - asm("mul (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0> %2" - : "=rw"(B[wiID]) - : "rw"(A[wiID]), "rw"(IMM_ARGUMENT)); -#else - B[wiID] = A[wiID] * IMM_ARGUMENT; -#endif - }; - cgh.parallel_for>(numOfItems, kern); - }); + return 0; } diff --git a/sycl/test/inline-asm/asm_float_neg.cpp b/sycl/test/inline-asm/asm_float_neg.cpp index f96ea348c468..290b0898903e 100644 --- a/sycl/test/inline-asm/asm_float_neg.cpp +++ b/sycl/test/inline-asm/asm_float_neg.cpp @@ -2,88 +2,56 @@ // REQUIRES: gpu,linux // RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out // RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out -#include "include/asmcheck.h" +#include "include/asmhelper.h" #include #include #include -constexpr auto sycl_read = cl::sycl::access::mode::read; -constexpr auto sycl_write = cl::sycl::access::mode::write; +using dataType = cl::sycl::cl_float; -constexpr int LIST_SIZE = 1024; +template +struct KernelFunctor : WithInputBuffers, WithOutputBuffer { + KernelFunctor(const std::vector &input) : WithInputBuffers(input), WithOutputBuffer(input.size()) {} -using arr_t = std::vector; + void operator()(cl::sycl::handler &cgh) { + auto A = this->getInputBuffer().template get_access(cgh); + auto B = this->getOutputBuffer().template get_access(cgh); -// class is used for kernel name -template -class float_neg_kernel; - -class ocl_ctx_t { - cl::sycl::queue deviceQueue; - -public: - ocl_ctx_t(const cl::sycl::device_selector &sel) : deviceQueue(sel) {} + cgh.parallel_for>( + cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] { +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm("mov (M1, 8) %0(0, 0)<1> (-)%1(0, 0)<1;1,0>" + : "=rw"(B[wiID]) + : "rw"(A[wiID])); +#else + B[wiID] = -A[wiID]; +#endif + }); + } - template - void process_buffers(const T *pa, T *pb, size_t sz); + size_t problem_size = 0; }; int main() { - arr_t A(LIST_SIZE), B(LIST_SIZE); - - try { - cl::sycl::gpu_selector gpsel; - ocl_ctx_t ct{gpsel}; - - cl::sycl::queue deviceQueue(gpsel); - sycl::device Device = deviceQueue.get_device(); - - if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) { - std::cout << "Skipping test\n"; - return 0; + std::vector input(DEFAULT_PROBLEM_SIZE); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) + input[i] = 1.0 / i; + + KernelFunctor<> f(input); + if (!launchInlineASMTest(f)) + return 0; + + auto &R = f.getOutputBufferData(); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; ++i) { + if (R[i] != -input[i]) { + std::cerr << "At index: " << i << ". "; + std::cerr << R[i] << " != " << -input[i] << "\n"; + return 1; } - - for (int i = 0; i < LIST_SIZE; i++) - A[i] = 1.0 / i; - - ct.process_buffers(A.data(), B.data(), LIST_SIZE); - - for (int i = 0; i < LIST_SIZE; ++i) - if (B[i] != -A[i]) { - std::cerr << "At index: " << i << ". "; - std::cerr << B[i] << " != " << -A[i] << "\n"; - abort(); - } - - std::cout << "Everything is correct" << std::endl; - } catch (cl::sycl::exception const &err) { - std::cerr << "ERROR: " << err.what() << ":\n"; - return -1; } -} - -template -void ocl_ctx_t::process_buffers(const T *pa, T *pb, size_t sz) { - cl::sycl::range<1> numOfItems{sz}; - cl::sycl::buffer bufferA(pa, numOfItems); - cl::sycl::buffer bufferB(pb, numOfItems); - bufferA.set_final_data(nullptr); - - deviceQueue.submit([&](cl::sycl::handler &cgh) { - auto A = bufferA.template get_access(cgh); - auto B = bufferB.template get_access(cgh); - - auto kern = [ A, B ](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] { -#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) - asm("mov (M1, 8) %0(0, 0)<1> (-)%1(0, 0)<1;1,0>" - : "=rw"(B[wiID]) - : "rw"(A[wiID])); -#else - B[wiID] = -A[wiID]; -#endif - }; - cgh.parallel_for>(numOfItems, kern); - }); + return 0; } diff --git a/sycl/test/inline-asm/asm_imm_arg.cpp b/sycl/test/inline-asm/asm_imm_arg.cpp index 361bcc4c48d1..ce9a3ad948a4 100644 --- a/sycl/test/inline-asm/asm_imm_arg.cpp +++ b/sycl/test/inline-asm/asm_imm_arg.cpp @@ -2,87 +2,54 @@ // REQUIRES: gpu,linux // RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out // RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out -#include "include/asmcheck.h" +#include "include/asmhelper.h" #include #include #include -constexpr auto sycl_read = cl::sycl::access::mode::read; -constexpr auto sycl_write = cl::sycl::access::mode::write; - -constexpr int LIST_SIZE = 1024 * 1024; constexpr int CONST_ARGUMENT = 0xabc; +using dataType = cl::sycl::cl_int; -using arr_t = std::vector; - -// class is used for kernel name -template -class const_asm_arg_kernel; - -class ocl_ctx_t { - cl::sycl::queue deviceQueue; +template +struct KernelFunctor : WithInputBuffers, WithOutputBuffer { + KernelFunctor(const std::vector &input) : WithInputBuffers(input), WithOutputBuffer(input.size()) {} -public: - ocl_ctx_t(const cl::sycl::device_selector &sel) : deviceQueue(sel) {} + void operator()(cl::sycl::handler &cgh) { + auto A = this->getInputBuffer(0).template get_access(cgh); + auto B = this->getOutputBuffer().template get_access(cgh); - template - void process_buffers(const T *pa, T *pb, size_t sz); + cgh.parallel_for>( + cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] { +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm("add (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0> %2" + : "=rw"(B[wiID]) + : "rw"(A[wiID]), "rw"(CONST_ARGUMENT)); +#else + B[wiID] = A[wiID] + CONST_ARGUMENT; +#endif + }); + } }; int main() { - arr_t A(LIST_SIZE), B(LIST_SIZE); - - try { - cl::sycl::gpu_selector gpsel; - ocl_ctx_t ct{gpsel}; - - cl::sycl::queue deviceQueue(gpsel); - sycl::device Device = deviceQueue.get_device(); - - if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) { - std::cout << "Skipping test\n"; - return 0; + std::vector input(DEFAULT_PROBLEM_SIZE); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) + input[i] = i; + + KernelFunctor<> f(input); + if (!launchInlineASMTest(f)) + return 0; + + auto &B = f.getOutputBufferData(); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; ++i) { + if (B[i] != input[i] * CONST_ARGUMENT) { + std::cerr << "At index: " << i << ". "; + std::cerr << B[i] << " != " << input[i] * CONST_ARGUMENT << "\n"; + return 1; } - - for (int i = 0; i < LIST_SIZE; i++) - A[i] = i; - - ct.process_buffers(A.data(), B.data(), LIST_SIZE); - - for (int i = 0; i < LIST_SIZE; ++i) - if (B[i] != A[i] + CONST_ARGUMENT) { - std::cerr << "At index: " << i << ". "; - std::cerr << B[i] << " != " << A[i] + CONST_ARGUMENT << "\n"; - abort(); - } - - std::cout << "Everything is correct" << std::endl; - } catch (cl::sycl::exception const &err) { - std::cerr << "ERROR: " << err.what() << ":\n"; - return -1; } -} - -template -void ocl_ctx_t::process_buffers(const T *pa, T *pb, size_t sz) { - cl::sycl::range<1> numOfItems{sz}; - cl::sycl::buffer bufferA(pa, numOfItems); - cl::sycl::buffer bufferB(pb, numOfItems); - - deviceQueue.submit([&](cl::sycl::handler &cgh) { - auto A = bufferA.template get_access(cgh); - auto B = bufferB.template get_access(cgh); - - auto kern = [ A, B ](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] { -#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) - asm("add (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0> %2" - : "=rw"(B[wiID]) - : "rw"(A[wiID]), "rw"(CONST_ARGUMENT)); -#else - B[wiID] = A[wiID] + CONST_ARGUMENT; -#endif - }; - cgh.parallel_for>(numOfItems, kern); - }); + return 0; } diff --git a/sycl/test/inline-asm/asm_mul.cpp b/sycl/test/inline-asm/asm_mul.cpp index 15fd7126e7d2..726abcf787f2 100644 --- a/sycl/test/inline-asm/asm_mul.cpp +++ b/sycl/test/inline-asm/asm_mul.cpp @@ -2,92 +2,56 @@ // REQUIRES: gpu,linux // RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out // RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out -#include "include/asmcheck.h" +#include "include/asmhelper.h" #include #include #include -constexpr auto sycl_read = cl::sycl::access::mode::read; -constexpr auto sycl_write = cl::sycl::access::mode::write; +using dataType = cl::sycl::cl_int; -constexpr int LIST_SIZE = 1024 * 1024; -using arr_t = std::vector; +template +struct KernelFunctor : WithInputBuffers, WithOutputBuffer { + KernelFunctor(const std::vector &input1, const std::vector &input2) : WithInputBuffers(input1, input2), WithOutputBuffer(input1.size()) {} + void operator()(cl::sycl::handler &cgh) { + auto A = this->getInputBuffer(0).template get_access(cgh); + auto B = this->getInputBuffer(1).template get_access(cgh); + auto C = this->getOutputBuffer().template get_access(cgh); -// class is used for kernel name -template -class vector_mul; - -class ocl_ctx_t { - cl::sycl::queue deviceQueue; - -public: - ocl_ctx_t(const cl::sycl::device_selector &sel) : deviceQueue(sel) {} - - template - void process_buffers(T const *pa, T const *pb, T *pc, size_t sz); + cgh.parallel_for>( + cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] { +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm("mul (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>" + : "=rw"(C[wiID]) + : "rw"(A[wiID]), "rw"(B[wiID])); +#else + C[wiID] = A[wiID] * B[wiID]; +#endif + }); + } }; int main() { - arr_t A(LIST_SIZE), B(LIST_SIZE), C(LIST_SIZE); - - try { - cl::sycl::gpu_selector gpsel; - ocl_ctx_t ct{gpsel}; - - cl::sycl::queue deviceQueue(gpsel); - sycl::device Device = deviceQueue.get_device(); + std::vector inputA(DEFAULT_PROBLEM_SIZE), inputB(DEFAULT_PROBLEM_SIZE); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) { + inputA[i] = i; + inputB[i] = DEFAULT_PROBLEM_SIZE - i; + } - if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) { - std::cout << "Skipping test\n"; - return 0; - } + KernelFunctor<> f(inputA, inputB); + if (!launchInlineASMTest(f)) + return 0; - for (int i = 0; i < LIST_SIZE; i++) { - A[i] = i; - B[i] = LIST_SIZE - i; + auto &C = f.getOutputBufferData(); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; ++i) { + if (C[i] != inputA[i] * inputB[i]) { + std::cerr << "At index: " << i << ". "; + std::cerr << C[i] << " != " << inputA[i] * inputB[i] << "\n"; + return 1; } - - ct.process_buffers(A.data(), B.data(), C.data(), LIST_SIZE); - - for (int i = 0; i < LIST_SIZE; ++i) - if (C[i] != A[i] * B[i]) { - std::cerr << "At index: " << i << ". "; - std::cerr << C[i] << " != " << A[i] * B[i] << "\n"; - abort(); - } - - std::cout << "Everything is correct" << std::endl; - } catch (cl::sycl::exception const &err) { - std::cerr << "ERROR: " << err.what() << ":\n"; - return -1; } -} - -template -void ocl_ctx_t::process_buffers(T const *pa, T const *pb, T *pc, size_t sz) { - cl::sycl::range<1> numOfItems{sz}; - cl::sycl::buffer bufferA(pa, numOfItems); - cl::sycl::buffer bufferB(pb, numOfItems); - cl::sycl::buffer bufferC(pc, numOfItems); - bufferA.set_final_data(nullptr); - bufferB.set_final_data(nullptr); - - deviceQueue.submit([&](cl::sycl::handler &cgh) { - auto A = bufferA.template get_access(cgh); - auto B = bufferB.template get_access(cgh); - auto C = bufferC.template get_access(cgh); - - auto kern = [ A, B, C ](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] { -#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) - asm("mul (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>" - : "=rw"(C[wiID]) - : "rw"(A[wiID]), "rw"(B[wiID])); -#else - C[wiID] = A[wiID] * B[wiID]; -#endif - }; - cgh.parallel_for>(numOfItems, kern); - }); + return 0; } diff --git a/sycl/test/inline-asm/asm_multiple_instructions.cpp b/sycl/test/inline-asm/asm_multiple_instructions.cpp index edbd21cfae94..e8cf02a5292e 100644 --- a/sycl/test/inline-asm/asm_multiple_instructions.cpp +++ b/sycl/test/inline-asm/asm_multiple_instructions.cpp @@ -2,104 +2,58 @@ // REQUIRES: gpu,linux // RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out // RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out -#include "include/asmcheck.h" +#include "include/asmhelper.h" #include #include #include -constexpr auto sycl_read = cl::sycl::access::mode::read; -constexpr auto sycl_write = cl::sycl::access::mode::write; +using dataType = cl::sycl::cl_int; -constexpr int LIST_SIZE = 1024; +template +struct KernelFunctor : WithInputBuffers, WithOutputBuffer { + KernelFunctor(const std::vector &input1, const std::vector &input2, const std::vector &input3) : WithInputBuffers(input1, input2, input3), WithOutputBuffer(input1.size()) {} -using arr_t = std::vector; + void operator()(cl::sycl::handler &cgh) { + auto A = this->getInputBuffer(0).template get_access(cgh); + auto B = this->getInputBuffer(1).template get_access(cgh); + auto C = this->getInputBuffer(2).template get_access(cgh); + auto D = this->getOutputBuffer().template get_access(cgh); -// class is used for kernel name -template -class vector_mul; - -class ocl_ctx_t { - cl::sycl::queue deviceQueue; - -public: - ocl_ctx_t(const cl::sycl::device_selector &sel) : deviceQueue(sel) {} - - template - void process_buffers(T const *pa, T const *pb, T const *pc, T *pd, size_t sz); + cgh.parallel_for>( + cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] { +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm("{\n" + "add (M1, 8) %1(0, 0)<1> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>\n" + "add (M1, 8) %1(0, 0)<1> %1(0, 0)<1;1,0> %3(0, 0)<1;1,0>\n" + "mov (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0>\n" + "}\n" + : "=rw"(D[wiID]), "+rw"(A[wiID]) + : "rw"(B[wiID]), "rw"(C[wiID])); +#else + A[wiID] += B[wiID]; + A[wiID] += C[wiID]; + D[wiID] = A[wiID]; +#endif + }); + } }; int main() { - arr_t A(LIST_SIZE), B(LIST_SIZE), C(LIST_SIZE), D(LIST_SIZE); - - try { - cl::sycl::gpu_selector gpsel; - ocl_ctx_t ct{gpsel}; - - cl::sycl::queue deviceQueue(gpsel); - sycl::device Device = deviceQueue.get_device(); - - if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) { - std::cout << "Skipping test\n"; - return 0; - } - - for (int i = 0; i < LIST_SIZE; i++) { - A[i] = B[i] = i; - C[i] = LIST_SIZE - 2 * i; // A[i] + B[i] + C[i] = LIST_SIZE - } - - ct.process_buffers(A.data(), B.data(), C.data(), D.data(), LIST_SIZE); - - for (int i = 0; i < LIST_SIZE; ++i) - if (D[i] != LIST_SIZE) { - std::cerr << "At index: " << i << ". "; - std::cerr << D[i] << " != " << LIST_SIZE << "\n"; - abort(); - } - - std::cout << "Everything is correct" << std::endl; - } catch (cl::sycl::exception const &err) { - std::cerr << "ERROR: " << err.what() << ":\n"; - return -1; + std::vector inputA(DEFAULT_PROBLEM_SIZE), inputB(DEFAULT_PROBLEM_SIZE), inputC(DEFAULT_PROBLEM_SIZE); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) { + inputA[i] = inputB[i] = i; + inputC[i] = DEFAULT_PROBLEM_SIZE - 2 * i; // A[i] + B[i] + C[i] = LIST_SIZE } -} -template -void ocl_ctx_t::process_buffers(T const *pa, T const *pb, T const *pc, T *pd, - size_t sz) { - cl::sycl::range<1> numOfItems{sz}; - cl::sycl::buffer bufferA(pa, numOfItems); - cl::sycl::buffer bufferB(pb, numOfItems); - cl::sycl::buffer bufferC(pc, numOfItems); - cl::sycl::buffer bufferD(pd, numOfItems); + KernelFunctor<> f(inputA, inputB, inputC); + if (!launchInlineASMTest(f)) + return 0; - bufferA.set_final_data(nullptr); - bufferB.set_final_data(nullptr); - bufferC.set_final_data(nullptr); + if (verify_all_the_same(f.getOutputBufferData(), (dataType)DEFAULT_PROBLEM_SIZE)) + return 0; - deviceQueue.submit([&](cl::sycl::handler &cgh) { - auto A = bufferA.template get_access(cgh); - auto B = bufferB.template get_access(cgh); - auto C = bufferC.template get_access(cgh); - auto D = bufferD.template get_access(cgh); - - auto kern = [ A, B, C, D ](cl::sycl::id<1> wiID) - [[cl::intel_reqd_sub_group_size(8)]] { -#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) - asm("{\n" - "add (M1, 8) %1(0, 0)<1> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>\n" - "add (M1, 8) %1(0, 0)<1> %1(0, 0)<1;1,0> %3(0, 0)<1;1,0>\n" - "mov (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0>\n" - "}\n" - : "=rw"(D[wiID]), "+rw"(A[wiID]) - : "rw"(B[wiID]), "rw"(C[wiID])); -#else - A[wiID] += B[wiID]; - A[wiID] += C[wiID]; - D[wiID] = A[wiID]; -#endif - }; - cgh.parallel_for>(numOfItems, kern); - }); + return 1; } diff --git a/sycl/test/inline-asm/asm_no_operands.cpp b/sycl/test/inline-asm/asm_no_operands.cpp index f7abfd523928..3a3a919caa98 100644 --- a/sycl/test/inline-asm/asm_no_operands.cpp +++ b/sycl/test/inline-asm/asm_no_operands.cpp @@ -2,27 +2,29 @@ // REQUIRES: gpu,linux // RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out // RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out -#include "include/asmcheck.h" +#include "include/asmhelper.h" #include class no_operands_kernel; int main() { // Creating SYCL queue cl::sycl::queue Queue; - sycl::device Device = Queue.get_device(); + cl::sycl::device Device = Queue.get_device(); if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) { std::cout << "Skipping test\n"; return 0; } // Size of index space for kernel - cl::sycl::range<1> NumOfWorkItems{15}; + cl::sycl::range<1> NumOfWorkItems{16}; // Submitting command group(work) to queue Queue.submit([&](cl::sycl::handler &cgh) { // Executing kernel - cgh.parallel_for( + cgh.parallel_for( NumOfWorkItems, [=](cl::sycl::id<1> WIid) [[cl::intel_reqd_sub_group_size(8)]] { #if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) asm("barrier"); diff --git a/sycl/test/inline-asm/asm_no_output.cpp b/sycl/test/inline-asm/asm_no_output.cpp index 2440fd1c17a2..ab65346d1313 100644 --- a/sycl/test/inline-asm/asm_no_output.cpp +++ b/sycl/test/inline-asm/asm_no_output.cpp @@ -2,77 +2,46 @@ // REQUIRES: gpu,linux // RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out // RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out -#include "include/asmcheck.h" +#include "include/asmhelper.h" #include #include #include -constexpr int LIST_SIZE = 8; -using arr_t = std::vector; -constexpr auto sycl_read = cl::sycl::access::mode::read; -constexpr auto sycl_write = cl::sycl::access::mode::write; +using dataType = cl::sycl::cl_int; -// class is used for kernel name -template -class asm_no_output; +template +struct KernelFunctor : WithOutputBuffer { + KernelFunctor(size_t problem_size) : WithOutputBuffer(problem_size) {} -template -void process_buffers(cl::sycl::queue &deviceQueue, T *pc, size_t sz) { - cl::sycl::range<1> numOfItems{sz}; - cl::sycl::buffer bufferC(pc, numOfItems); - - deviceQueue.submit([&](cl::sycl::handler &cgh) { - auto C = bufferC.template get_access(cgh); - - auto kern = [C]() - [[cl::intel_reqd_sub_group_size(16)]] { - volatile int local_var = 47; - local_var += C[0]; + void operator()(cl::sycl::handler &cgh) { + auto C = this->getOutputBuffer().template get_access(cgh); + cgh.parallel_for>( + cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] { + volatile int local_var = 47; + local_var += C[0]; #if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) - asm volatile("{\n" - ".decl temp v_type=G type=w num_elts=8 align=GRF\n" - "mov (M1,16) temp(0, 0)<1> %0(0,0)<1;1,0>\n" - "}\n" ::"rw"(local_var)); + asm volatile("{\n" + ".decl temp v_type=G type=w num_elts=8 align=GRF\n" + "mov (M1,16) temp(0, 0)<1> %0(0,0)<1;1,0>\n" + "}\n" ::"rw"(local_var)); #else - volatile int temp = 0; - temp = local_var; + volatile int temp = 0; + temp = local_var; #endif - }; - cgh.single_task>(kern); - }); + }); + } }; int main() { - arr_t C(LIST_SIZE); - - cl::sycl::gpu_selector gpsel; - cl::sycl::queue deviceQueue(gpsel); - sycl::device Device = deviceQueue.get_device(); - - if (!isInlineASMSupported(Device)) { - std::cout << "Skipping test\n"; + KernelFunctor<> f(DEFAULT_PROBLEM_SIZE); + if (!launchInlineASMTest(f)) return 0; - } - for (int i = 0; i < LIST_SIZE; i++) { - C[i] = 0; - } - - process_buffers(deviceQueue, C.data(), LIST_SIZE); - - bool all_right = true; - for (int i = 0; i < LIST_SIZE; ++i) - if (C[i] != 0) { - std::cerr << "At index: " << i << ". "; - std::cerr << C[i] << " != " << 0 << "\n"; - all_right = false; - break; - } - if (all_right) { - std::cout << "Pass" << std::endl; + if (verify_all_the_same(f.getOutputBufferData(), 47)) return 0; - } - std::cout << "Error" << std::endl; - return -1; + + return 1; } diff --git a/sycl/test/inline-asm/asm_plus_mod.cpp b/sycl/test/inline-asm/asm_plus_mod.cpp index d460406c71cd..f65cda777ef9 100644 --- a/sycl/test/inline-asm/asm_plus_mod.cpp +++ b/sycl/test/inline-asm/asm_plus_mod.cpp @@ -2,86 +2,57 @@ // REQUIRES: gpu,linux // RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out // RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out -#include "include/asmcheck.h" +#include "include/asmhelper.h" #include #include #include -constexpr auto sycl_read = cl::sycl::access::mode::read; -constexpr auto sycl_write = cl::sycl::access::mode::write; +using dataType = cl::sycl::cl_int; -constexpr int LIST_SIZE = 1024 * 1024; -using arr_t = std::vector; +template +struct KernelFunctor : WithInputBuffers, WithOutputBuffer { + KernelFunctor(const std::vector &input1, const std::vector &input2) : WithInputBuffers(input1), WithOutputBuffer(input2) {} -// class is used for kernel name -template -class plus_mod_kernel; + void operator()(cl::sycl::handler &cgh) { + auto A = this->getInputBuffer(0).template get_access(cgh); + auto B = this->getOutputBuffer().template get_access(cgh); -class ocl_ctx_t { - cl::sycl::queue deviceQueue; - -public: - ocl_ctx_t(const cl::sycl::device_selector &sel) : deviceQueue(sel) {} - - template - void process_buffers(T const *pa, T *pb, size_t sz); + cgh.parallel_for>( + cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] { +#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) + asm("add (M1, 16) %0(0, 0)<1> %0(0, 0)<1;1,0> %1(0, 0)<1;1,0>" + : "+rw"(B[wiID]) + : "rw"(A[wiID])); +#else + B[wiID] += A[wiID]; +#endif + }); + } }; int main() { - arr_t A(LIST_SIZE), B(LIST_SIZE), C(LIST_SIZE); + std::vector inputA(DEFAULT_PROBLEM_SIZE), inputB(DEFAULT_PROBLEM_SIZE), R(DEFAULT_PROBLEM_SIZE); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) { + inputA[i] = i; + inputB[i] = DEFAULT_PROBLEM_SIZE - i; + R[i] = inputA[i] + inputB[i]; + } - try { - cl::sycl::gpu_selector gpsel; - ocl_ctx_t ct{gpsel}; - cl::sycl::queue deviceQueue(gpsel); - sycl::device Device = deviceQueue.get_device(); + KernelFunctor<> f(inputA, inputB); + if (!launchInlineASMTest(f)) + return 0; - if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) { - std::cout << "Skipping test\n"; - return 0; + auto &B = f.getOutputBufferData(); + for (int i = 0; i < DEFAULT_PROBLEM_SIZE; ++i) { + if (B[i] != R[i]) { + std::cerr << "At index: " << i << ". "; + std::cerr << B[i] << " != " << R[i] << "\n"; + return 1; } - for (int i = 0; i < LIST_SIZE; i++) { - A[i] = i; - B[i] = LIST_SIZE - i; - C[i] = A[i] + B[i]; - } - - ct.process_buffers(A.data(), B.data(), LIST_SIZE); - - for (int i = 0; i < LIST_SIZE; ++i) - if (B[i] != C[i]) { - std::cerr << "At index: " << i << ". "; - std::cerr << B[i] << " != " << C[i] << "\n"; - abort(); - } - - std::cout << "Everything is correct" << std::endl; - } catch (cl::sycl::exception const &err) { - std::cerr << "ERROR: " << err.what() << ":\n"; - return -1; } -} - -template -void ocl_ctx_t::process_buffers(T const *pa, T *pb, size_t sz) { - cl::sycl::range<1> numOfItems{sz}; - cl::sycl::buffer bufferA(pa, numOfItems); - cl::sycl::buffer bufferB(pb, numOfItems); - deviceQueue.submit([&](cl::sycl::handler &cgh) { - auto A = bufferA.template get_access(cgh); - auto B = bufferB.template get_access(cgh); - - auto kern = [ A, B ](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] { -#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) - asm("add (M1, 16) %0(0, 0)<1> %0(0, 0)<1;1,0> %1(0, 0)<1;1,0>" - : "+rw"(B[wiID]) - : "rw"(A[wiID])); -#else - B[wiID] += A[wiID]; -#endif - }; - cgh.parallel_for>(numOfItems, kern); - }); + return 0; } diff --git a/sycl/test/inline-asm/include/asmcheck.h b/sycl/test/inline-asm/include/asmcheck.h deleted file mode 100644 index 2031455e214c..000000000000 --- a/sycl/test/inline-asm/include/asmcheck.h +++ /dev/null @@ -1,17 +0,0 @@ -#include - -#include -bool isInlineASMSupported(sycl::device Device) { - - sycl::string_class DriverVersion = Device.get_info(); - sycl::string_class DeviceVendorName = Device.get_info(); - if (DeviceVendorName.find("Intel") == sycl::string_class::npos) - return false; - if (DriverVersion.length() < 5) - return false; - if (DriverVersion[2] != '.') - return false; - if (std::stoi(DriverVersion.substr(0, 2), nullptr, 10) < 20 || std::stoi(DriverVersion.substr(3, 2), nullptr, 10) < 12) - return false; - return true; -} diff --git a/sycl/test/inline-asm/include/asmhelper.h b/sycl/test/inline-asm/include/asmhelper.h new file mode 100644 index 000000000000..45d044161269 --- /dev/null +++ b/sycl/test/inline-asm/include/asmhelper.h @@ -0,0 +1,128 @@ +#include + +#include +#include + +constexpr const size_t DEFAULT_PROBLEM_SIZE = 16; + +template +struct WithOutputBuffer { + WithOutputBuffer(size_t size) { + _output_buffer_data.resize(size, 0); + _output_buffer = new cl::sycl::buffer(_output_buffer_data.data(), _output_buffer_data.size()); + } + + WithOutputBuffer(const std::vector &data) { + _output_buffer_data = data; + _output_buffer = new cl::sycl::buffer(_output_buffer_data.data(), _output_buffer_data.size()); + } + + ~WithOutputBuffer() { + if (_output_buffer) + delete _output_buffer; + } + + const std::vector &getOutputBufferData() const { + return _output_buffer_data; + } + + size_t getOutputBufferSize() const { + return _output_buffer_data.size(); + } + +protected: + cl::sycl::buffer &getOutputBuffer() { + return *_output_buffer; + } + + cl::sycl::buffer *_output_buffer = nullptr; + std::vector _output_buffer_data; +}; + +template +struct WithInputBuffers { + + template + WithInputBuffers(Args... inputs) { + static_assert(sizeof...(Args) == N, "All input buffers must be initialized"); + constructorHelper<0>(inputs...); + } + + ~WithInputBuffers() { + for (size_t i = 0; i < N; ++i) { + if (_input_buffers[i]) + delete _input_buffers[i]; + } + } + + cl::sycl::buffer &getInputBuffer(size_t i = 0) { + return *_input_buffers[i]; + } + +protected: + cl::sycl::buffer *_input_buffers[N] = {nullptr}; + std::vector _input_buffers_data[N]; + +private: + template + void constructorHelper(const std::vector &data, Args... rest) { + _input_buffers_data[Index] = data; + _input_buffers[Index] = new cl::sycl::buffer(_input_buffers_data[Index].data(), _input_buffers_data[Index].size()); + constructorHelper(rest...); + } + + template + void constructorHelper() { + // nothing to do, recursion stop + } +}; + +bool isInlineASMSupported(sycl::device Device) { + + sycl::string_class DriverVersion = Device.get_info(); + sycl::string_class DeviceVendorName = Device.get_info(); + // TODO: query for some extension/capability/whatever once interface is + // defined + if (DeviceVendorName.find("Intel") == sycl::string_class::npos) + return false; + if (DriverVersion.length() < 5) + return false; + if (DriverVersion[2] != '.') + return false; + if (std::stoi(DriverVersion.substr(0, 2), nullptr, 10) < 20 || std::stoi(DriverVersion.substr(3, 2), nullptr, 10) < 12) + return false; + return true; +} + +/// checks if device suppots inline asm feature and launches a test +/// +/// \returns false if test wasn't launched (i.e.was skipped) and true otherwise +template +bool launchInlineASMTest(F &f, bool requires_particular_sg_size = true) { + cl::sycl::queue deviceQueue(cl::sycl::gpu_selector{}); + cl::sycl::device device = deviceQueue.get_device(); + + if (!isInlineASMSupported(device)) { + std::cout << "Skipping test\n"; + return false; + } + + if (requires_particular_sg_size && !device.has_extension("cl_intel_required_subgroup_size")) { + std::cout << "Skipping test\n"; + return false; + } + + deviceQueue.submit(f).wait(); + return true; +} + +template +bool verify_all_the_same(const std::vector &input, T reference_value) { + for (int i = 0; i < input.size(); ++i) + if (input[i] != reference_value) { + std::cerr << "At index: " << i << " "; + std::cerr << input[i] << " != " << reference_value << "\n"; + return false; + } + return true; +} diff --git a/sycl/test/inline-asm/letter_example.cpp b/sycl/test/inline-asm/letter_example.cpp index 5b9bac0e22d2..22bf26648e78 100644 --- a/sycl/test/inline-asm/letter_example.cpp +++ b/sycl/test/inline-asm/letter_example.cpp @@ -2,45 +2,50 @@ // REQUIRES: gpu,linux // RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out // RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out -#include "include/asmcheck.h" +#include "include/asmhelper.h" #include #include -#define N 100 -using namespace cl::sycl; + +constexpr size_t problem_size = 16; + +class kernel_name; + int main() { - int *a; - queue q; - sycl::device Device = q.get_device(); + cl::sycl::queue q; + cl::sycl::device Device = q.get_device(); if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) { std::cout << "Skipping test\n"; return 0; } auto ctx = q.get_context(); - a = (int *)malloc_shared(sizeof(int) * N, q.get_device(), ctx); - for (int i = 0; i < N; i++) + int *a = (int *)malloc_shared(sizeof(int) * problem_size, q.get_device(), ctx); + for (int i = 0; i < problem_size; i++) { a[i] = i; - q.submit([&](handler &cgh) { - cgh.parallel_for( - range<1>(N), [=](id<1> idx) - [[cl::intel_reqd_sub_group_size(16)]] { + } + q.submit([&](cl::sycl::handler &cgh) { + cgh.parallel_for( + cl::sycl::range<1>(problem_size), [=](cl::sycl::id<1> idx) + [[cl::intel_reqd_sub_group_size(16)]] { #if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) - int i = idx[0]; - asm volatile("{\n.decl V52 v_type=G type=d num_elts=16 align=GRF\n" - "svm_gather.4.1 (M1, 16) %0.0 V52.0\n" - "add(M1, 16) V52(0, 0)<1> V52(0, 0)<1; 1, 0> 0x1:w\n" - "svm_scatter.4.1 (M1, 16) %0.0 V52.0\n}" - : - : "rw"(&a[i])); + int i = idx[0]; + asm volatile("{\n.decl V52 v_type=G type=d num_elts=16 align=GRF\n" + "svm_gather.4.1 (M1, 16) %0.0 V52.0\n" + "add(M1, 16) V52(0, 0)<1> V52(0, 0)<1; 1, 0> 0x1:w\n" + "svm_scatter.4.1 (M1, 16) %0.0 V52.0\n}" + : + : "rw"(&a[i])); #else - a[idx[0]]++; + a[idx[0]]++; #endif - }); + }); }).wait(); bool currect = true; - for (int i = 0; i < N; i++) { + for (int i = 0; i < problem_size; i++) { if (a[i] != (i + 1)) { currect = false; std::cerr << "error in a[" << i << "]=" @@ -48,11 +53,13 @@ int main() { break; } } + if (!currect) { std::cerr << "Error" << std::endl; cl::sycl::free(a, ctx); - return -1; + return 1; } + std::cerr << "Pass" << std::endl; cl::sycl::free(a, ctx); return 0; diff --git a/sycl/test/inline-asm/malloc_shared_32.cpp b/sycl/test/inline-asm/malloc_shared_32.cpp index d521afabaaca..8f058851c268 100644 --- a/sycl/test/inline-asm/malloc_shared_32.cpp +++ b/sycl/test/inline-asm/malloc_shared_32.cpp @@ -2,19 +2,21 @@ // REQUIRES: gpu,linux // RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out // RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out -#include "include/asmcheck.h" +#include "include/asmhelper.h" #include #include -#define N 1000 -using namespace cl::sycl; + +constexpr size_t problem_size = 32; + +class kernel_name; + int main() { - int *a; - int *b; - int *c; - queue q; + cl::sycl::queue q; - sycl::device Device = q.get_device(); + cl::sycl::device Device = q.get_device(); if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) { std::cout << "Skipping test\n"; @@ -22,18 +24,19 @@ int main() { } auto ctx = q.get_context(); - a = (int *)malloc_shared(sizeof(int) * N, q.get_device(), ctx); - b = (int *)malloc_shared(sizeof(int) * N, q.get_device(), ctx); - c = (int *)malloc_shared(sizeof(int) * N, q.get_device(), ctx); - for (int i = 0; i < N; i++) { + int *a = (int *)malloc_shared(sizeof(int) * problem_size, q.get_device(), ctx); + int *b = (int *)malloc_shared(sizeof(int) * problem_size, q.get_device(), ctx); + int *c = (int *)malloc_shared(sizeof(int) * problem_size, q.get_device(), ctx); + for (int i = 0; i < problem_size; i++) { b[i] = -10; a[i] = i; c[i] = i; } - q.submit([&](handler &cgh) { - cgh.parallel_for( - range<1>(N), - [=](id<1> idx) + + q.submit([&](cl::sycl::handler &cgh) { + cgh.parallel_for( + cl::sycl::range<1>(problem_size), + [=](cl::sycl::id<1> idx) [[cl::intel_reqd_sub_group_size(32)]] { int i = idx[0]; #if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) @@ -64,7 +67,7 @@ int main() { }).wait(); bool currect = true; - for (int i = 0; i < N; i++) { + for (int i = 0; i < problem_size; i++) { if (b[i] != a[i] * b[i]) { currect = false; std::cerr << "error in a[" << i << "]=" @@ -72,13 +75,15 @@ int main() { break; } } + if (!currect) { std::cerr << "Error" << std::endl; cl::sycl::free(a, ctx); cl::sycl::free(b, ctx); cl::sycl::free(c, ctx); - return -1; + return 1; } + std::cerr << "Pass" << std::endl; cl::sycl::free(a, ctx); cl::sycl::free(b, ctx); diff --git a/sycl/test/inline-asm/malloc_shared_in_out_dif.cpp b/sycl/test/inline-asm/malloc_shared_in_out_dif.cpp index fc0fa139036f..a6994bd37919 100644 --- a/sycl/test/inline-asm/malloc_shared_in_out_dif.cpp +++ b/sycl/test/inline-asm/malloc_shared_in_out_dif.cpp @@ -3,17 +3,18 @@ // RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out // RUN: %t.out -#include "include/asmcheck.h" +#include "include/asmhelper.h" #include #include -#define N 100 -using namespace cl::sycl; + +constexpr size_t problem_size = 100; + +class kernel_name; + int main() { - int *a; - int *b; - queue q; + cl::sycl::queue q; - sycl::device Device = q.get_device(); + cl::sycl::device Device = q.get_device(); if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) { std::cout << "Skipping test\n"; @@ -21,15 +22,16 @@ int main() { } auto ctx = q.get_context(); - a = (int *)malloc_shared(sizeof(int) * N, q.get_device(), ctx); - b = (int *)malloc_shared(sizeof(int) * N, q.get_device(), ctx); - for (int i = 0; i < N; i++) { + int *a = (int *)malloc_shared(sizeof(int) * problem_size, q.get_device(), ctx); + int *b = (int *)malloc_shared(sizeof(int) * problem_size, q.get_device(), ctx); + for (int i = 0; i < problem_size; i++) { b[i] = -1; a[i] = i; } - q.submit([&](handler &cgh) { - cgh.parallel_for( - range<1>(N), [=](id<1> idx) [[cl::intel_reqd_sub_group_size(16)]] { + + q.submit([&](cl::sycl::handler &cgh) { + cgh.parallel_for( + cl::sycl::range<1>(problem_size), [=](cl::sycl::id<1> idx) [[cl::intel_reqd_sub_group_size(16)]] { int i = idx[0]; volatile int tmp = a[i]; tmp += 1; @@ -44,7 +46,7 @@ int main() { }).wait(); bool currect = true; - for (int i = 0; i < N; i++) { + for (int i = 0; i < problem_size; i++) { if (b[i] != a[i]) { currect = false; std::cerr << "error in a[" << i << "]=" @@ -52,12 +54,14 @@ int main() { break; } } + if (!currect) { std::cerr << "Error" << std::endl; cl::sycl::free(a, ctx); cl::sycl::free(b, ctx); - return -1; + return 1; } + std::cerr << "Pass" << std::endl; cl::sycl::free(a, ctx); cl::sycl::free(b, ctx); diff --git a/sycl/test/inline-asm/malloc_shared_no_input.cpp b/sycl/test/inline-asm/malloc_shared_no_input.cpp index f2db2b8e7c9f..22cd47abd64a 100644 --- a/sycl/test/inline-asm/malloc_shared_no_input.cpp +++ b/sycl/test/inline-asm/malloc_shared_no_input.cpp @@ -2,28 +2,33 @@ // REQUIRES: gpu,linux // RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out // RUN: %t.out +// RUN: %clangxx -fsycl %s -o %t.ref.out +// RUN: %t.ref.out -#include "include/asmcheck.h" +#include "include/asmhelper.h" #include #include -#define N 100 -using namespace cl::sycl; + +constexpr size_t problem_size = 16; + +class kernel_name; + int main() { - int *a; - queue q; - sycl::device Device = q.get_device(); + cl::sycl::queue q; + cl::sycl::device Device = q.get_device(); if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) { std::cout << "Skipping test\n"; return 0; } auto ctx = q.get_context(); - a = (int *)malloc_shared(sizeof(int) * N, q.get_device(), ctx); - for (int i = 0; i < N; i++) + int *a = (int *)malloc_shared(sizeof(int) * problem_size, q.get_device(), ctx); + for (int i = 0; i < problem_size; i++) a[i] = i; - q.submit([&](handler &cgh) { - cgh.parallel_for( - range<1>(N), [=](id<1> idx) [[cl::intel_reqd_sub_group_size(16)]] { + + q.submit([&](cl::sycl::handler &cgh) { + cgh.parallel_for( + cl::sycl::range<1>(problem_size), [=](cl::sycl::id<1> idx) [[cl::intel_reqd_sub_group_size(16)]] { int i = idx[0]; #if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__) asm volatile("mov (M1, 16) %0(0,0)<1> 0x7:d" @@ -35,7 +40,7 @@ int main() { }).wait(); bool currect = true; - for (int i = 0; i < N; i++) { + for (int i = 0; i < problem_size; i++) { if (a[i] != 7) { currect = false; std::cerr << "error in a[" << i << "]=" @@ -43,11 +48,13 @@ int main() { break; } } + if (!currect) { std::cerr << "Error" << std::endl; cl::sycl::free(a, ctx); - return -1; + return 1; } + std::cerr << "Pass" << std::endl; cl::sycl::free(a, ctx); return 0; From d67e0e9edccdc173ddbdb20eeba0f179d49575b0 Mon Sep 17 00:00:00 2001 From: amochalo Date: Wed, 15 Apr 2020 19:02:00 +0300 Subject: [PATCH 03/10] Apply suggestion Signed-off-by: amochalo --- sycl/test/inline-asm/include/asmhelper.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sycl/test/inline-asm/include/asmhelper.h b/sycl/test/inline-asm/include/asmhelper.h index 45d044161269..c57abff8c708 100644 --- a/sycl/test/inline-asm/include/asmhelper.h +++ b/sycl/test/inline-asm/include/asmhelper.h @@ -99,6 +99,9 @@ bool isInlineASMSupported(sycl::device Device) { /// \returns false if test wasn't launched (i.e.was skipped) and true otherwise template bool launchInlineASMTest(F &f, bool requires_particular_sg_size = true) { +#if !defined(INLINE_ASM) + return true; // any device is capable to launch reference versions of tests +#endif cl::sycl::queue deviceQueue(cl::sycl::gpu_selector{}); cl::sycl::device device = deviceQueue.get_device(); From 29c858abbfd3ea31bb770ede564b7d4d5216a410 Mon Sep 17 00:00:00 2001 From: amochalo Date: Wed, 15 Apr 2020 19:35:04 +0300 Subject: [PATCH 04/10] Small fix Signed-off-by: amochalo --- sycl/test/inline-asm/include/asmhelper.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/sycl/test/inline-asm/include/asmhelper.h b/sycl/test/inline-asm/include/asmhelper.h index c57abff8c708..dd0adb2eba7f 100644 --- a/sycl/test/inline-asm/include/asmhelper.h +++ b/sycl/test/inline-asm/include/asmhelper.h @@ -100,15 +100,13 @@ bool isInlineASMSupported(sycl::device Device) { template bool launchInlineASMTest(F &f, bool requires_particular_sg_size = true) { #if !defined(INLINE_ASM) - return true; // any device is capable to launch reference versions of tests -#endif - cl::sycl::queue deviceQueue(cl::sycl::gpu_selector{}); - cl::sycl::device device = deviceQueue.get_device(); - - if (!isInlineASMSupported(device)) { + if (!isInlineASMSupported(device)) { std::cout << "Skipping test\n"; return false; } +#endif + cl::sycl::queue deviceQueue(cl::sycl::gpu_selector{}); + cl::sycl::device device = deviceQueue.get_device(); if (requires_particular_sg_size && !device.has_extension("cl_intel_required_subgroup_size")) { std::cout << "Skipping test\n"; From 0fc675dc0dadd3dffc96cc059a0cf21efb23e124 Mon Sep 17 00:00:00 2001 From: amochalo Date: Wed, 15 Apr 2020 20:18:24 +0300 Subject: [PATCH 05/10] Small fix Signed-off-by: amochalo --- sycl/test/inline-asm/include/asmhelper.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sycl/test/inline-asm/include/asmhelper.h b/sycl/test/inline-asm/include/asmhelper.h index dd0adb2eba7f..7b70376068ab 100644 --- a/sycl/test/inline-asm/include/asmhelper.h +++ b/sycl/test/inline-asm/include/asmhelper.h @@ -99,14 +99,15 @@ bool isInlineASMSupported(sycl::device Device) { /// \returns false if test wasn't launched (i.e.was skipped) and true otherwise template bool launchInlineASMTest(F &f, bool requires_particular_sg_size = true) { + cl::sycl::queue deviceQueue(cl::sycl::gpu_selector{}); + cl::sycl::device device = deviceQueue.get_device(); + #if !defined(INLINE_ASM) - if (!isInlineASMSupported(device)) { + if (!isInlineASMSupported(device)) { std::cout << "Skipping test\n"; return false; } #endif - cl::sycl::queue deviceQueue(cl::sycl::gpu_selector{}); - cl::sycl::device device = deviceQueue.get_device(); if (requires_particular_sg_size && !device.has_extension("cl_intel_required_subgroup_size")) { std::cout << "Skipping test\n"; From 063e02775a781d66d028b76c80a3b6430fbf88da Mon Sep 17 00:00:00 2001 From: amochalo Date: Thu, 16 Apr 2020 15:32:58 +0300 Subject: [PATCH 06/10] Small fix Signed-off-by: amochalo --- sycl/test/inline-asm/include/asmhelper.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test/inline-asm/include/asmhelper.h b/sycl/test/inline-asm/include/asmhelper.h index 7b70376068ab..41c01f6e8c71 100644 --- a/sycl/test/inline-asm/include/asmhelper.h +++ b/sycl/test/inline-asm/include/asmhelper.h @@ -102,7 +102,7 @@ bool launchInlineASMTest(F &f, bool requires_particular_sg_size = true) { cl::sycl::queue deviceQueue(cl::sycl::gpu_selector{}); cl::sycl::device device = deviceQueue.get_device(); -#if !defined(INLINE_ASM) +#if defined(INLINE_ASM) if (!isInlineASMSupported(device)) { std::cout << "Skipping test\n"; return false; From d4fb662e020891cb16d659a69615fb4e6a5e4861 Mon Sep 17 00:00:00 2001 From: amochalo Date: Thu, 16 Apr 2020 15:40:47 +0300 Subject: [PATCH 07/10] Small fix Signed-off-by: amochalo --- sycl/test/inline-asm/include/asmhelper.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test/inline-asm/include/asmhelper.h b/sycl/test/inline-asm/include/asmhelper.h index 41c01f6e8c71..59150909b836 100644 --- a/sycl/test/inline-asm/include/asmhelper.h +++ b/sycl/test/inline-asm/include/asmhelper.h @@ -8,7 +8,7 @@ constexpr const size_t DEFAULT_PROBLEM_SIZE = 16; template struct WithOutputBuffer { WithOutputBuffer(size_t size) { - _output_buffer_data.resize(size, 0); + _output_buffer_data.resize(size); _output_buffer = new cl::sycl::buffer(_output_buffer_data.data(), _output_buffer_data.size()); } From 8bb6e521695c8be9bed5c1c8126e8330edbe002a Mon Sep 17 00:00:00 2001 From: amochalo Date: Thu, 16 Apr 2020 21:26:04 +0300 Subject: [PATCH 08/10] Correctly handle buffer Signed-off-by: amochalo --- sycl/test/inline-asm/include/asmhelper.h | 56 ++++++++++++------------ 1 file changed, 27 insertions(+), 29 deletions(-) diff --git a/sycl/test/inline-asm/include/asmhelper.h b/sycl/test/inline-asm/include/asmhelper.h index 59150909b836..75585e161133 100644 --- a/sycl/test/inline-asm/include/asmhelper.h +++ b/sycl/test/inline-asm/include/asmhelper.h @@ -1,6 +1,7 @@ #include #include +#include #include constexpr const size_t DEFAULT_PROBLEM_SIZE = 16; @@ -9,20 +10,17 @@ template struct WithOutputBuffer { WithOutputBuffer(size_t size) { _output_buffer_data.resize(size); - _output_buffer = new cl::sycl::buffer(_output_buffer_data.data(), _output_buffer_data.size()); + _output_buffer.reset(new cl::sycl::buffer(_output_buffer_data.data(), _output_buffer_data.size())); } WithOutputBuffer(const std::vector &data) { _output_buffer_data = data; - _output_buffer = new cl::sycl::buffer(_output_buffer_data.data(), _output_buffer_data.size()); + _output_buffer.reset(new cl::sycl::buffer(_output_buffer_data.data(), _output_buffer_data.size())); } - ~WithOutputBuffer() { - if (_output_buffer) - delete _output_buffer; - } - - const std::vector &getOutputBufferData() const { + const std::vector &getOutputBufferData() { + // We cannoe access the data until the buffer is still alive + _output_buffer.reset(); return _output_buffer_data; } @@ -35,7 +33,10 @@ struct WithOutputBuffer { return *_output_buffer; } - cl::sycl::buffer *_output_buffer = nullptr; + // Functor is being passed by-copy into cl::sycl::queue::submit and destroyed + // one more time in there. We need to make sure that buffer is only released + // once. + std::shared_ptr> _output_buffer = nullptr; std::vector _output_buffer_data; }; @@ -48,26 +49,19 @@ struct WithInputBuffers { constructorHelper<0>(inputs...); } - ~WithInputBuffers() { - for (size_t i = 0; i < N; ++i) { - if (_input_buffers[i]) - delete _input_buffers[i]; - } - } - cl::sycl::buffer &getInputBuffer(size_t i = 0) { return *_input_buffers[i]; } protected: - cl::sycl::buffer *_input_buffers[N] = {nullptr}; + std::shared_ptr> _input_buffers[N] = {nullptr}; std::vector _input_buffers_data[N]; private: template void constructorHelper(const std::vector &data, Args... rest) { _input_buffers_data[Index] = data; - _input_buffers[Index] = new cl::sycl::buffer(_input_buffers_data[Index].data(), _input_buffers_data[Index].size()); + _input_buffers[Index].reset(new cl::sycl::buffer(_input_buffers_data[Index].data(), _input_buffers_data[Index].size())); constructorHelper(rest...); } @@ -99,22 +93,26 @@ bool isInlineASMSupported(sycl::device Device) { /// \returns false if test wasn't launched (i.e.was skipped) and true otherwise template bool launchInlineASMTest(F &f, bool requires_particular_sg_size = true) { - cl::sycl::queue deviceQueue(cl::sycl::gpu_selector{}); - cl::sycl::device device = deviceQueue.get_device(); + try { + cl::sycl::queue deviceQueue(cl::sycl::gpu_selector{}); + cl::sycl::device device = deviceQueue.get_device(); #if defined(INLINE_ASM) - if (!isInlineASMSupported(device)) { - std::cout << "Skipping test\n"; - return false; - } + if (!isInlineASMSupported(device)) { + std::cout << "Skipping test\n"; + return false; + } #endif - if (requires_particular_sg_size && !device.has_extension("cl_intel_required_subgroup_size")) { - std::cout << "Skipping test\n"; - return false; - } + if (requires_particular_sg_size && !device.has_extension("cl_intel_required_subgroup_size")) { + std::cout << "Skipping test\n"; + return false; + } - deviceQueue.submit(f).wait(); + deviceQueue.submit(f).wait(); + } catch (cl::sycl::exception &e) { + std::cerr << "Caught exception: " << e.what() << std::endl; + } return true; } From 2efe78f1af2e0781b48efb698ad212acc7f910a3 Mon Sep 17 00:00:00 2001 From: amochalo Date: Fri, 17 Apr 2020 15:32:48 +0300 Subject: [PATCH 09/10] Move inline asm tests to feature tests Signed-off-by: amochalo --- sycl/test/CMakeLists.txt | 8 ++++++++ sycl/test/{ => feature-tests}/inline-asm/asm_16_empty.cpp | 0 .../{ => feature-tests}/inline-asm/asm_16_matrix_mult.cpp | 0 .../inline-asm/asm_16_no_input_int.cpp | 0 .../{ => feature-tests}/inline-asm/asm_16_no_opts.cpp | 0 sycl/test/{ => feature-tests}/inline-asm/asm_8_empty.cpp | 0 .../{ => feature-tests}/inline-asm/asm_8_no_input_int.cpp | 0 .../inline-asm/asm_arbitrary_ops_order.cpp | 0 .../{ => feature-tests}/inline-asm/asm_decl_in_scope.cpp | 0 .../test/{ => feature-tests}/inline-asm/asm_float_add.cpp | 0 .../{ => feature-tests}/inline-asm/asm_float_imm_arg.cpp | 0 .../test/{ => feature-tests}/inline-asm/asm_float_neg.cpp | 0 sycl/test/{ => feature-tests}/inline-asm/asm_imm_arg.cpp | 0 sycl/test/{ => feature-tests}/inline-asm/asm_mul.cpp | 0 .../inline-asm/asm_multiple_instructions.cpp | 0 .../{ => feature-tests}/inline-asm/asm_no_operands.cpp | 0 .../test/{ => feature-tests}/inline-asm/asm_no_output.cpp | 0 sycl/test/{ => feature-tests}/inline-asm/asm_plus_mod.cpp | 0 .../{ => feature-tests}/inline-asm/include/asmhelper.h | 0 .../{ => feature-tests}/inline-asm/letter_example.cpp | 0 .../{ => feature-tests}/inline-asm/malloc_shared_32.cpp | 0 .../inline-asm/malloc_shared_in_out_dif.cpp | 0 .../inline-asm/malloc_shared_no_input.cpp | 0 sycl/test/lit.cfg.py | 3 ++- 24 files changed, 10 insertions(+), 1 deletion(-) rename sycl/test/{ => feature-tests}/inline-asm/asm_16_empty.cpp (100%) rename sycl/test/{ => feature-tests}/inline-asm/asm_16_matrix_mult.cpp (100%) rename sycl/test/{ => feature-tests}/inline-asm/asm_16_no_input_int.cpp (100%) rename sycl/test/{ => feature-tests}/inline-asm/asm_16_no_opts.cpp (100%) rename sycl/test/{ => feature-tests}/inline-asm/asm_8_empty.cpp (100%) rename sycl/test/{ => feature-tests}/inline-asm/asm_8_no_input_int.cpp (100%) rename sycl/test/{ => feature-tests}/inline-asm/asm_arbitrary_ops_order.cpp (100%) rename sycl/test/{ => feature-tests}/inline-asm/asm_decl_in_scope.cpp (100%) rename sycl/test/{ => feature-tests}/inline-asm/asm_float_add.cpp (100%) rename sycl/test/{ => feature-tests}/inline-asm/asm_float_imm_arg.cpp (100%) rename sycl/test/{ => feature-tests}/inline-asm/asm_float_neg.cpp (100%) rename sycl/test/{ => feature-tests}/inline-asm/asm_imm_arg.cpp (100%) rename sycl/test/{ => feature-tests}/inline-asm/asm_mul.cpp (100%) rename sycl/test/{ => feature-tests}/inline-asm/asm_multiple_instructions.cpp (100%) rename sycl/test/{ => feature-tests}/inline-asm/asm_no_operands.cpp (100%) rename sycl/test/{ => feature-tests}/inline-asm/asm_no_output.cpp (100%) rename sycl/test/{ => feature-tests}/inline-asm/asm_plus_mod.cpp (100%) rename sycl/test/{ => feature-tests}/inline-asm/include/asmhelper.h (100%) rename sycl/test/{ => feature-tests}/inline-asm/letter_example.cpp (100%) rename sycl/test/{ => feature-tests}/inline-asm/malloc_shared_32.cpp (100%) rename sycl/test/{ => feature-tests}/inline-asm/malloc_shared_in_out_dif.cpp (100%) rename sycl/test/{ => feature-tests}/inline-asm/malloc_shared_no_input.cpp (100%) diff --git a/sycl/test/CMakeLists.txt b/sycl/test/CMakeLists.txt index a152c4b34137..0fc16451d8fc 100644 --- a/sycl/test/CMakeLists.txt +++ b/sycl/test/CMakeLists.txt @@ -61,6 +61,14 @@ add_lit_testsuites(SYCL-DEPLOY ${CMAKE_CURRENT_SOURCE_DIR} EXCLUDE_FROM_CHECK_ALL ) +add_lit_target(check-sycl-inline-asm + "Running lit suite ${CMAKE_CURRENT_SOURCE_DIR}/feature-tests/inline-asm" + "feature-tests/inline-asm" + ARGS ${RT_TEST_ARGS} + PARAMS "SYCL_BE=PI_OPENCL" + DEPENDS ${SYCL_TEST_DEPS} + ) + if(SYCL_BUILD_PI_CUDA) add_lit_testsuite(check-sycl-cuda "Running the SYCL regression tests for CUDA" ${CMAKE_CURRENT_BINARY_DIR} diff --git a/sycl/test/inline-asm/asm_16_empty.cpp b/sycl/test/feature-tests/inline-asm/asm_16_empty.cpp similarity index 100% rename from sycl/test/inline-asm/asm_16_empty.cpp rename to sycl/test/feature-tests/inline-asm/asm_16_empty.cpp diff --git a/sycl/test/inline-asm/asm_16_matrix_mult.cpp b/sycl/test/feature-tests/inline-asm/asm_16_matrix_mult.cpp similarity index 100% rename from sycl/test/inline-asm/asm_16_matrix_mult.cpp rename to sycl/test/feature-tests/inline-asm/asm_16_matrix_mult.cpp diff --git a/sycl/test/inline-asm/asm_16_no_input_int.cpp b/sycl/test/feature-tests/inline-asm/asm_16_no_input_int.cpp similarity index 100% rename from sycl/test/inline-asm/asm_16_no_input_int.cpp rename to sycl/test/feature-tests/inline-asm/asm_16_no_input_int.cpp diff --git a/sycl/test/inline-asm/asm_16_no_opts.cpp b/sycl/test/feature-tests/inline-asm/asm_16_no_opts.cpp similarity index 100% rename from sycl/test/inline-asm/asm_16_no_opts.cpp rename to sycl/test/feature-tests/inline-asm/asm_16_no_opts.cpp diff --git a/sycl/test/inline-asm/asm_8_empty.cpp b/sycl/test/feature-tests/inline-asm/asm_8_empty.cpp similarity index 100% rename from sycl/test/inline-asm/asm_8_empty.cpp rename to sycl/test/feature-tests/inline-asm/asm_8_empty.cpp diff --git a/sycl/test/inline-asm/asm_8_no_input_int.cpp b/sycl/test/feature-tests/inline-asm/asm_8_no_input_int.cpp similarity index 100% rename from sycl/test/inline-asm/asm_8_no_input_int.cpp rename to sycl/test/feature-tests/inline-asm/asm_8_no_input_int.cpp diff --git a/sycl/test/inline-asm/asm_arbitrary_ops_order.cpp b/sycl/test/feature-tests/inline-asm/asm_arbitrary_ops_order.cpp similarity index 100% rename from sycl/test/inline-asm/asm_arbitrary_ops_order.cpp rename to sycl/test/feature-tests/inline-asm/asm_arbitrary_ops_order.cpp diff --git a/sycl/test/inline-asm/asm_decl_in_scope.cpp b/sycl/test/feature-tests/inline-asm/asm_decl_in_scope.cpp similarity index 100% rename from sycl/test/inline-asm/asm_decl_in_scope.cpp rename to sycl/test/feature-tests/inline-asm/asm_decl_in_scope.cpp diff --git a/sycl/test/inline-asm/asm_float_add.cpp b/sycl/test/feature-tests/inline-asm/asm_float_add.cpp similarity index 100% rename from sycl/test/inline-asm/asm_float_add.cpp rename to sycl/test/feature-tests/inline-asm/asm_float_add.cpp diff --git a/sycl/test/inline-asm/asm_float_imm_arg.cpp b/sycl/test/feature-tests/inline-asm/asm_float_imm_arg.cpp similarity index 100% rename from sycl/test/inline-asm/asm_float_imm_arg.cpp rename to sycl/test/feature-tests/inline-asm/asm_float_imm_arg.cpp diff --git a/sycl/test/inline-asm/asm_float_neg.cpp b/sycl/test/feature-tests/inline-asm/asm_float_neg.cpp similarity index 100% rename from sycl/test/inline-asm/asm_float_neg.cpp rename to sycl/test/feature-tests/inline-asm/asm_float_neg.cpp diff --git a/sycl/test/inline-asm/asm_imm_arg.cpp b/sycl/test/feature-tests/inline-asm/asm_imm_arg.cpp similarity index 100% rename from sycl/test/inline-asm/asm_imm_arg.cpp rename to sycl/test/feature-tests/inline-asm/asm_imm_arg.cpp diff --git a/sycl/test/inline-asm/asm_mul.cpp b/sycl/test/feature-tests/inline-asm/asm_mul.cpp similarity index 100% rename from sycl/test/inline-asm/asm_mul.cpp rename to sycl/test/feature-tests/inline-asm/asm_mul.cpp diff --git a/sycl/test/inline-asm/asm_multiple_instructions.cpp b/sycl/test/feature-tests/inline-asm/asm_multiple_instructions.cpp similarity index 100% rename from sycl/test/inline-asm/asm_multiple_instructions.cpp rename to sycl/test/feature-tests/inline-asm/asm_multiple_instructions.cpp diff --git a/sycl/test/inline-asm/asm_no_operands.cpp b/sycl/test/feature-tests/inline-asm/asm_no_operands.cpp similarity index 100% rename from sycl/test/inline-asm/asm_no_operands.cpp rename to sycl/test/feature-tests/inline-asm/asm_no_operands.cpp diff --git a/sycl/test/inline-asm/asm_no_output.cpp b/sycl/test/feature-tests/inline-asm/asm_no_output.cpp similarity index 100% rename from sycl/test/inline-asm/asm_no_output.cpp rename to sycl/test/feature-tests/inline-asm/asm_no_output.cpp diff --git a/sycl/test/inline-asm/asm_plus_mod.cpp b/sycl/test/feature-tests/inline-asm/asm_plus_mod.cpp similarity index 100% rename from sycl/test/inline-asm/asm_plus_mod.cpp rename to sycl/test/feature-tests/inline-asm/asm_plus_mod.cpp diff --git a/sycl/test/inline-asm/include/asmhelper.h b/sycl/test/feature-tests/inline-asm/include/asmhelper.h similarity index 100% rename from sycl/test/inline-asm/include/asmhelper.h rename to sycl/test/feature-tests/inline-asm/include/asmhelper.h diff --git a/sycl/test/inline-asm/letter_example.cpp b/sycl/test/feature-tests/inline-asm/letter_example.cpp similarity index 100% rename from sycl/test/inline-asm/letter_example.cpp rename to sycl/test/feature-tests/inline-asm/letter_example.cpp diff --git a/sycl/test/inline-asm/malloc_shared_32.cpp b/sycl/test/feature-tests/inline-asm/malloc_shared_32.cpp similarity index 100% rename from sycl/test/inline-asm/malloc_shared_32.cpp rename to sycl/test/feature-tests/inline-asm/malloc_shared_32.cpp diff --git a/sycl/test/inline-asm/malloc_shared_in_out_dif.cpp b/sycl/test/feature-tests/inline-asm/malloc_shared_in_out_dif.cpp similarity index 100% rename from sycl/test/inline-asm/malloc_shared_in_out_dif.cpp rename to sycl/test/feature-tests/inline-asm/malloc_shared_in_out_dif.cpp diff --git a/sycl/test/inline-asm/malloc_shared_no_input.cpp b/sycl/test/feature-tests/inline-asm/malloc_shared_no_input.cpp similarity index 100% rename from sycl/test/inline-asm/malloc_shared_no_input.cpp rename to sycl/test/feature-tests/inline-asm/malloc_shared_no_input.cpp diff --git a/sycl/test/lit.cfg.py b/sycl/test/lit.cfg.py index e6ae419047b9..6285c44d2763 100644 --- a/sycl/test/lit.cfg.py +++ b/sycl/test/lit.cfg.py @@ -26,7 +26,8 @@ # suffixes: A list of file extensions to treat as test files. config.suffixes = ['.c', '.cpp'] #add .spv. Currently not clear what to do with those -config.excludes = ['Inputs'] +# feature tests are considered not so lightweight, so, they are excluded by default +config.excludes = ['Inputs', 'feature-tests'] # test_source_root: The root path where tests are located. config.test_source_root = os.path.dirname(__file__) From aebe6a8759468229192e15518bdeddaca15e1d72 Mon Sep 17 00:00:00 2001 From: amochalo Date: Fri, 17 Apr 2020 15:39:50 +0300 Subject: [PATCH 10/10] Apply suggestion Signed-off-by: amochalo --- sycl/test/feature-tests/inline-asm/asm_imm_arg.cpp | 4 ++-- sycl/test/feature-tests/inline-asm/asm_no_output.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sycl/test/feature-tests/inline-asm/asm_imm_arg.cpp b/sycl/test/feature-tests/inline-asm/asm_imm_arg.cpp index ce9a3ad948a4..2dba04d1179b 100644 --- a/sycl/test/feature-tests/inline-asm/asm_imm_arg.cpp +++ b/sycl/test/feature-tests/inline-asm/asm_imm_arg.cpp @@ -45,9 +45,9 @@ int main() { auto &B = f.getOutputBufferData(); for (int i = 0; i < DEFAULT_PROBLEM_SIZE; ++i) { - if (B[i] != input[i] * CONST_ARGUMENT) { + if (B[i] != input[i] + CONST_ARGUMENT) { std::cerr << "At index: " << i << ". "; - std::cerr << B[i] << " != " << input[i] * CONST_ARGUMENT << "\n"; + std::cerr << B[i] << " != " << input[i] + CONST_ARGUMENT << "\n"; return 1; } } diff --git a/sycl/test/feature-tests/inline-asm/asm_no_output.cpp b/sycl/test/feature-tests/inline-asm/asm_no_output.cpp index ab65346d1313..ff6c65d48b31 100644 --- a/sycl/test/feature-tests/inline-asm/asm_no_output.cpp +++ b/sycl/test/feature-tests/inline-asm/asm_no_output.cpp @@ -40,7 +40,7 @@ int main() { if (!launchInlineASMTest(f)) return 0; - if (verify_all_the_same(f.getOutputBufferData(), 47)) + if (verify_all_the_same(f.getOutputBufferData(), 0)) return 0; return 1;