diff --git a/SYCL/InvokeSimd/invoke_simd_conv.cpp b/SYCL/InvokeSimd/invoke_simd_conv.cpp new file mode 100644 index 0000000000..83b4d6ab53 --- /dev/null +++ b/SYCL/InvokeSimd/invoke_simd_conv.cpp @@ -0,0 +1,182 @@ +// The test checks that invoke_simd implementation performs proper conversions +// on the actual arguments: +// - Case1: actual type is uniform, formal - T1 (scalar) +// standard C++ arithmetic conversion is applied +// - Case2: actual type is T, format - simd +// simd-simd conversion is applied according to the std::experimental::simd +// specification. Basically, only non-narrowing conversions are allowed: +// char -> int, float -> double, etc. int -> float is forbidden. + +// TODO: enable on Windows once driver is ready +// REQUIRES: gpu && linux +// UNSUPPORTED: cuda || hip + +// RUN: %clangxx -fsycl -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr %s -o %t.out +// RUN: env IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %GPU_RUN_PLACEHOLDER %t.out + +#include +#include +#include +#include + +#include +#include +#include + +using namespace sycl::ext::oneapi::experimental; +using namespace sycl; + +constexpr int VL = 16; + +template inline T calc(T val) { return val + val; } +template inline simd calc(simd val) { + // emulate '+' on simd operands + for (int i = 0; i < N; ++i) { + val[i] += val[i]; + } + return val; +} + +template +[[intel::device_indirectly_callable]] // required by FE for addr-taken functions +simd __regcall SIMD_CALLEE_UNIFORM(SimdElemT val) + SYCL_ESIMD_FUNCTION { + return simd(calc(val)); // broadcast +} + +template +[[intel::device_indirectly_callable]] simd __regcall SIMD_CALLEE( + simd val) SYCL_ESIMD_FUNCTION { + return calc(val); +} + +class ESIMDSelector : public device_selector { + // Require GPU device unless HOST is requested in SYCL_DEVICE_FILTER env + virtual int operator()(const device &device) const { + if (const char *dev_filter = getenv("SYCL_DEVICE_FILTER")) { + std::string filter_string(dev_filter); + if (filter_string.find("gpu") != std::string::npos) + return device.is_gpu() ? 1000 : -1; + if (filter_string.find("host") != std::string::npos) + return device.is_host() ? 1000 : -1; + std::cerr + << "Supported 'SYCL_DEVICE_FILTER' env var values are 'gpu' and " + "'host', '" + << filter_string << "' does not contain such substrings.\n"; + return -1; + } + // If "SYCL_DEVICE_FILTER" not defined, only allow gpu device + return device.is_gpu() ? 1000 : -1; + } +}; + +inline auto createExceptionHandler() { + return [](exception_list l) { + for (auto ep : l) { + try { + std::rethrow_exception(ep); + } catch (sycl::exception &e0) { + std::cout << "sycl::exception: " << e0.what() << std::endl; + } catch (std::exception &e) { + std::cout << "std::exception: " << e.what() << std::endl; + } catch (...) { + std::cout << "generic exception\n"; + } + } + }; +} + +template class TestID; + +template bool test(queue q) { + // 3 subgroups per workgroup + unsigned GroupSize = VL * 3; + unsigned NGroups = 7; + unsigned Size = GroupSize * NGroups; + SimdElemT *A = malloc_shared(Size, q); + + for (unsigned i = 0; i < Size; ++i) { + A[i] = (SimdElemT)i; + } + sycl::range<1> GlobalRange{Size}; + sycl::range<1> LocalRange{GroupSize}; + sycl::nd_range<1> Range(GlobalRange, LocalRange); + + try { + auto e = q.submit([&](handler &cgh) { + cgh.parallel_for>( + Range, [=](nd_item<1> ndi) [[intel::reqd_sub_group_size(VL)]] { + sub_group sg = ndi.get_sub_group(); + SpmdT val = (SpmdT)sg.get_group_linear_id(); // 0 .. GroupSize-1 + SimdElemT res = 0; + + if constexpr (IsUniform) { + res = + invoke_simd(sg, SIMD_CALLEE_UNIFORM, uniform{val}); + } else { + res = invoke_simd(sg, SIMD_CALLEE, val); + } + uint32_t i = ndi.get_global_linear_id(); + A[i] = res; + }); + }); + e.wait(); + } catch (sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + sycl::free(A, q); + return false; + } + int err_cnt = 0; + + for (unsigned i = 0; i < Size; ++i) { + unsigned group_id = i / GroupSize; + uint32_t sg_id = (i - (group_id * GroupSize)) / VL; + SimdElemT test = A[i]; + SimdElemT gold = calc((SimdElemT)sg_id); + if ((test != gold) && (++err_cnt < 10)) { + std::cout << "failed at index " << i << ", " << test << " != " << gold + << "(gold)\n"; + } + } + if (err_cnt > 0) { + std::cout << " pass rate: " + << ((float)(Size - err_cnt) / (float)Size) * 100.0f << "% (" + << (Size - err_cnt) << "/" << Size << ")\n"; + } + sycl::free(A, q); + return err_cnt == 0; +} + +int main(void) { + queue q(ESIMDSelector{}, createExceptionHandler()); + + auto dev = q.get_device(); + std::cout << "Running on " << dev.get_info() << "\n"; + bool passed = true; + + constexpr bool UNIFORM = true; + constexpr bool NON_UNIFORM = false; + + // With uniform parameters SPMD actual argument corresponds to SIMD scalar + // argument, and standard C++ arithmetic conversion are implicitly + // applied by the compiler. Any aritimetic type can be implicitly coverted to + // any other arithmetic type. + + passed &= test(q); + passed &= test(q); + passed &= test(q); + passed &= test(q); + + // With non-uniform parameters, SPMD actual argument of type T is "widened" to + // std::simd and then convered to SIMD vector argument + // (std::simd) using std::simd implicit conversion constructors. They + // allow only non-narrowing conversions (e.g. int -> float is narrowing and + // hence is prohibited). + + passed &= test(q); + passed &= test(q); + passed &= test(q); + + std::cout << (passed ? "Passed\n" : "FAILED\n"); + return passed ? 0 : 1; +} diff --git a/SYCL/InvokeSimd/invoke_simd_smoke.cpp b/SYCL/InvokeSimd/invoke_simd_smoke.cpp index 07780ae709..75225dc05f 100644 --- a/SYCL/InvokeSimd/invoke_simd_smoke.cpp +++ b/SYCL/InvokeSimd/invoke_simd_smoke.cpp @@ -6,7 +6,7 @@ // REQUIRES: gpu && linux // UNSUPPORTED: cuda || hip -// RUN: %clangxx -fsycl -fno-sycl-device-code-split-esimd %s -o %t.out +// RUN: %clangxx -fsycl -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr %s -o %t.out // RUN: env IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %GPU_RUN_PLACEHOLDER %t.out #include @@ -37,9 +37,14 @@ ESIMD_CALLEE(float *A, esimd::simd b, int i) SYCL_ESIMD_FUNCTION { return a + b; } -SYCL_EXTERNAL -simd __regcall SIMD_CALLEE(float *A, simd b, - int i) SYCL_ESIMD_FUNCTION; +// Use two functions with the same signature called via invoke_simd for better +// testing. +[[intel::device_indirectly_callable]] SYCL_EXTERNAL + simd __regcall SIMD_CALLEE1(float *A, simd b, + int i) SYCL_ESIMD_FUNCTION; +[[intel::device_indirectly_callable]] SYCL_EXTERNAL + simd __regcall SIMD_CALLEE2(float *A, simd b, + int i) SYCL_ESIMD_FUNCTION; float SPMD_CALLEE(float *A, float b, int i) { return A[i] + b; } @@ -115,10 +120,14 @@ int main(void) { float res = 0; if constexpr (use_invoke_simd) { - res = invoke_simd(sg, SIMD_CALLEE, uniform{A}, B[wi_id], + res = invoke_simd(sg, SIMD_CALLEE1, uniform{A}, B[wi_id], uniform{i}); + res += invoke_simd(sg, SIMD_CALLEE2, uniform{A}, B[wi_id], + uniform{i}); + } else { res = SPMD_CALLEE(A, B[wi_id], wi_id); + res += SPMD_CALLEE(A, B[wi_id], wi_id); } C[wi_id] = res; }); @@ -135,10 +144,10 @@ int main(void) { int err_cnt = 0; for (unsigned i = 0; i < Size; ++i) { - if (A[i] + B[i] != C[i]) { + if (2 * (A[i] + B[i]) != C[i]) { if (++err_cnt < 10) { - std::cout << "failed at index " << i << ", " << C[i] << " != " << A[i] - << " + " << B[i] << "\n"; + std::cout << "failed at index " << i << ", " << C[i] << " != 2*(" + << A[i] << " + " << B[i] << ")\n"; } } } @@ -156,8 +165,15 @@ int main(void) { } SYCL_EXTERNAL -simd __regcall SIMD_CALLEE(float *A, simd b, - int i) SYCL_ESIMD_FUNCTION { +simd __regcall SIMD_CALLEE1(float *A, simd b, + int i) SYCL_ESIMD_FUNCTION { + esimd::simd res = ESIMD_CALLEE(A, b, i); + return res; +} + +SYCL_EXTERNAL +simd __regcall SIMD_CALLEE2(float *A, simd b, + int i) SYCL_ESIMD_FUNCTION { esimd::simd res = ESIMD_CALLEE(A, b, i); return res; }