diff --git a/SYCL/ESIMD/BitonicSortK.cpp b/SYCL/ESIMD/BitonicSortK.cpp index e211b1a097..8113271680 100644 --- a/SYCL/ESIMD/BitonicSortK.cpp +++ b/SYCL/ESIMD/BitonicSortK.cpp @@ -601,7 +601,7 @@ int BitonicSort::Solve(uint32_t *pInputs, uint32_t *pOutputs, uint32_t size) { cl::sycl::range<1> SortLocalRange{1}; double total_time = 0; - { + try { buffer bufi(pInputs, range<1>(size)); buffer bufo(pOutputs, range<1>(size)); // enqueue sort265 kernel @@ -616,6 +616,9 @@ int BitonicSort::Solve(uint32_t *pInputs, uint32_t *pOutputs, uint32_t size) { }); e.wait(); total_time += report_time("kernel time", e, e); + } catch (cl::sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + return e.get_cl_code(); } // Each HW thread swap two 256-element chunks. Hence, we only need @@ -631,28 +634,34 @@ int BitonicSort::Solve(uint32_t *pInputs, uint32_t *pOutputs, uint32_t size) { // this loop is for stage 8 to stage LOG2_ELEMENTS. event mergeEvent[(LOG2_ELEMENTS - 8) * (LOG2_ELEMENTS - 7) / 2]; int k = 0; - for (int i = 8; i < LOG2_ELEMENTS; i++) { - // each step halves the stride distance of its prior step. - // 1<= 8; j--) { - buffer buf(pOutputs, range<1>(size)); - mergeEvent[k] = pQueue_->submit([&](handler &cgh) { - auto acc = buf.get_access(cgh); - cgh.parallel_for(MergeGlobalRange * MergeLocalRange, - [=](id<1> tid) SYCL_ESIMD_KERNEL { - using namespace sycl::INTEL::gpu; - cmk_bitonic_merge(acc, j, i, tid); - }); - }); - k++; + try { + for (int i = 8; i < LOG2_ELEMENTS; i++) { + // each step halves the stride distance of its prior step. + // 1<= 8; j--) { + buffer buf(pOutputs, range<1>(size)); + mergeEvent[k] = pQueue_->submit([&](handler &cgh) { + auto acc = buf.get_access(cgh); + cgh.parallel_for(MergeGlobalRange * MergeLocalRange, + [=](id<1> tid) SYCL_ESIMD_KERNEL { + using namespace sycl::INTEL::gpu; + cmk_bitonic_merge(acc, j, i, tid); + }); + }); + k++; + } } + } catch (cl::sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + return e.get_cl_code(); } + mergeEvent[k - 1].wait(); total_time += report_time("kernel time", mergeEvent[0], mergeEvent[k - 1]); diff --git a/SYCL/ESIMD/BitonicSortKv2.cpp b/SYCL/ESIMD/BitonicSortKv2.cpp index 32962069eb..b95c16fbaf 100644 --- a/SYCL/ESIMD/BitonicSortKv2.cpp +++ b/SYCL/ESIMD/BitonicSortKv2.cpp @@ -523,15 +523,20 @@ int BitonicSort::Solve(uint32_t *pInputs, uint32_t *pOutputs, uint32_t size) { // enqueue sort265 kernel double total_time = 0; - auto e = pQueue_->submit([&](handler &cgh) { - cgh.parallel_for( - SortGlobalRange * SortLocalRange, [=](id<1> i) SYCL_ESIMD_KERNEL { - using namespace sycl::INTEL::gpu; - cmk_bitonic_sort_256(pInputs, pOutputs, i); - }); - }); - e.wait(); - total_time += report_time("kernel time", e, e); + try { + auto e = pQueue_->submit([&](handler &cgh) { + cgh.parallel_for( + SortGlobalRange * SortLocalRange, [=](id<1> i) SYCL_ESIMD_KERNEL { + using namespace sycl::INTEL::gpu; + cmk_bitonic_sort_256(pInputs, pOutputs, i); + }); + }); + e.wait(); + total_time += report_time("kernel time", e, e); + } catch (cl::sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + return e.get_cl_code(); + } // Each HW thread swap two 256-element chunks. Hence, we only need // to launch size/ (base_sort_size*2) HW threads @@ -546,27 +551,34 @@ int BitonicSort::Solve(uint32_t *pInputs, uint32_t *pOutputs, uint32_t size) { // this loop is for stage 8 to stage LOG2_ELEMENTS. event mergeEvent[(LOG2_ELEMENTS - 8) * (LOG2_ELEMENTS - 7) / 2]; int k = 0; - for (int i = 8; i < LOG2_ELEMENTS; i++) { - // each step halves the stride distance of its prior step. - // 1<= 8; j--) { - mergeEvent[k] = pQueue_->submit([&](handler &cgh) { - cgh.parallel_for(MergeGlobalRange * MergeLocalRange, - [=](id<1> tid) SYCL_ESIMD_KERNEL { - using namespace sycl::INTEL::gpu; - cmk_bitonic_merge(pOutputs, j, i, tid); - }); - }); - // mergeEvent[k].wait(); - k++; + try { + for (int i = 8; i < LOG2_ELEMENTS; i++) { + // each step halves the stride distance of its prior step. + // 1<= 8; j--) { + mergeEvent[k] = pQueue_->submit([&](handler &cgh) { + cgh.parallel_for(MergeGlobalRange * MergeLocalRange, + [=](id<1> tid) SYCL_ESIMD_KERNEL { + using namespace sycl::INTEL::gpu; + cmk_bitonic_merge(pOutputs, j, i, + tid); + }); + }); + // mergeEvent[k].wait(); + k++; + } } + } catch (cl::sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + return e.get_cl_code(); } + mergeEvent[k - 1].wait(); total_time += report_time("kernel time", mergeEvent[0], mergeEvent[k - 1]); diff --git a/SYCL/ESIMD/PrefixSum.cpp b/SYCL/ESIMD/PrefixSum.cpp index eeb2a7ec61..78cb3a4679 100644 --- a/SYCL/ESIMD/PrefixSum.cpp +++ b/SYCL/ESIMD/PrefixSum.cpp @@ -298,76 +298,82 @@ void cmk_prefix_iterative(unsigned *buf, unsigned h_pos, void hierarchical_prefix(queue &q, unsigned *buf, unsigned elem_stride, unsigned thrd_stride, unsigned n_entries, unsigned entry_per_th) { - if (n_entries <= REMAINING_ENTRIES) { - std::cout << "... n_entries: " << n_entries + try { + if (n_entries <= REMAINING_ENTRIES) { + std::cout << "... n_entries: " << n_entries + << " elem_stide: " << elem_stride + << " thread_stride: " << thrd_stride + << " entry per thread: " << entry_per_th << std::endl; + // one single thread + q.submit([&](handler &cgh) { + cgh.parallel_for( + range<2>{1, 1} * range<2>{1, 1}, [=](item<2> it) SYCL_ESIMD_KERNEL { + cmk_acum_final(buf, it.get_id(0), elem_stride, n_entries); + }); + }); + return; + } + + std::cout << "*** n_entries: " << n_entries << " elem_stide: " << elem_stride << " thread_stride: " << thrd_stride << " entry per thread: " << entry_per_th << std::endl; - // one single thread - q.submit([&](handler &cgh) { - cgh.parallel_for( - range<2>{1, 1} * range<2>{1, 1}, [=](item<2> it) SYCL_ESIMD_KERNEL { - cmk_acum_final(buf, it.get_id(0), elem_stride, n_entries); - }); - }); - return; - } - std::cout << "*** n_entries: " << n_entries << " elem_stide: " << elem_stride - << " thread_stride: " << thrd_stride - << " entry per thread: " << entry_per_th << std::endl; - - if (entry_per_th == PREFIX_ENTRIES) { - q.submit([&](handler &cgh) { - cgh.parallel_for( - range<2>{n_entries / entry_per_th, 1} * range<2>{1, 1}, - [=](item<2> it) SYCL_ESIMD_KERNEL { - cmk_acum_iterative(buf, it.get_id(0), elem_stride, thrd_stride, - PREFIX_ENTRIES); - }); - }); - } else { - q.submit([&](handler &cgh) { - cgh.parallel_for( - range<2>{n_entries / entry_per_th, 1} * range<2>{1, 1}, - [=](item<2> it) SYCL_ESIMD_KERNEL { - cmk_acum_iterative(buf, it.get_id(0), elem_stride, thrd_stride, - PREFIX_ENTRIES_LOW); - }); - }); - } - - // if number of remaining entries <= 4K , each thread accumulates smaller - // number of entries to keep EUs saturated - if (n_entries / entry_per_th > 4096) - hierarchical_prefix(q, buf, thrd_stride, thrd_stride * PREFIX_ENTRIES, - n_entries / entry_per_th, PREFIX_ENTRIES); - else - hierarchical_prefix(q, buf, thrd_stride, thrd_stride * PREFIX_ENTRIES_LOW, - n_entries / entry_per_th, PREFIX_ENTRIES_LOW); - - std::cout << "=== n_entries: " << n_entries << " elem_stide: " << elem_stride - << " thread_stride: " << thrd_stride - << " entry per thread: " << entry_per_th << std::endl; - - if (entry_per_th == PREFIX_ENTRIES) { - q.submit([&](handler &cgh) { - cgh.parallel_for( - range<2>{n_entries / entry_per_th, 1} * range<2>{1, 1}, - [=](item<2> it) SYCL_ESIMD_KERNEL { - cmk_prefix_iterative(buf, it.get_id(0), elem_stride, thrd_stride, + if (entry_per_th == PREFIX_ENTRIES) { + q.submit([&](handler &cgh) { + cgh.parallel_for( + range<2>{n_entries / entry_per_th, 1} * range<2>{1, 1}, + [=](item<2> it) SYCL_ESIMD_KERNEL { + cmk_acum_iterative(buf, it.get_id(0), elem_stride, thrd_stride, PREFIX_ENTRIES); - }); - }); - } else { - q.submit([&](handler &cgh) { - cgh.parallel_for( - range<2>{n_entries / entry_per_th, 1} * range<2>{1, 1}, - [=](item<2> it) SYCL_ESIMD_KERNEL { - cmk_prefix_iterative(buf, it.get_id(0), elem_stride, thrd_stride, + }); + }); + } else { + q.submit([&](handler &cgh) { + cgh.parallel_for( + range<2>{n_entries / entry_per_th, 1} * range<2>{1, 1}, + [=](item<2> it) SYCL_ESIMD_KERNEL { + cmk_acum_iterative(buf, it.get_id(0), elem_stride, thrd_stride, PREFIX_ENTRIES_LOW); - }); - }); + }); + }); + } + + // if number of remaining entries <= 4K , each thread accumulates smaller + // number of entries to keep EUs saturated + if (n_entries / entry_per_th > 4096) + hierarchical_prefix(q, buf, thrd_stride, thrd_stride * PREFIX_ENTRIES, + n_entries / entry_per_th, PREFIX_ENTRIES); + else + hierarchical_prefix(q, buf, thrd_stride, thrd_stride * PREFIX_ENTRIES_LOW, + n_entries / entry_per_th, PREFIX_ENTRIES_LOW); + + std::cout << "=== n_entries: " << n_entries + << " elem_stide: " << elem_stride + << " thread_stride: " << thrd_stride + << " entry per thread: " << entry_per_th << std::endl; + + if (entry_per_th == PREFIX_ENTRIES) { + q.submit([&](handler &cgh) { + cgh.parallel_for( + range<2>{n_entries / entry_per_th, 1} * range<2>{1, 1}, + [=](item<2> it) SYCL_ESIMD_KERNEL { + cmk_prefix_iterative(buf, it.get_id(0), elem_stride, thrd_stride, + PREFIX_ENTRIES); + }); + }); + } else { + q.submit([&](handler &cgh) { + cgh.parallel_for( + range<2>{n_entries / entry_per_th, 1} * range<2>{1, 1}, + [=](item<2> it) SYCL_ESIMD_KERNEL { + cmk_prefix_iterative(buf, it.get_id(0), elem_stride, thrd_stride, + PREFIX_ENTRIES_LOW); + }); + }); + } + } catch (cl::sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; } } diff --git a/SYCL/ESIMD/Prefix_Local_sum1.cpp b/SYCL/ESIMD/Prefix_Local_sum1.cpp index a26b37f6a5..f46bb849a7 100644 --- a/SYCL/ESIMD/Prefix_Local_sum1.cpp +++ b/SYCL/ESIMD/Prefix_Local_sum1.cpp @@ -144,14 +144,20 @@ int main(int argc, char *argv[]) { // compute local sum for every chunk of PREFIX_ENTRIES cl::sycl::range<2> GlobalRange{size / PREFIX_ENTRIES, 1}; - auto e0 = q.submit([&](handler &cgh) { - cgh.parallel_for( - GlobalRange * LocalRange, [=](item<2> it) SYCL_ESIMD_KERNEL { - cmk_sum_tuple_count(pInputs, it.get_id(0)); - }); - }); - - e0.wait(); + try { + auto e0 = q.submit([&](handler &cgh) { + cgh.parallel_for( + GlobalRange * LocalRange, [=](item<2> it) SYCL_ESIMD_KERNEL { + cmk_sum_tuple_count(pInputs, it.get_id(0)); + }); + }); + e0.wait(); + } catch (cl::sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + free(pInputs, ctxt); + free(pExpectOutputs); + return e.get_cl_code(); + } bool pass = memcmp(pInputs, pExpectOutputs, size * TUPLE_SZ * sizeof(unsigned int)) == 0; diff --git a/SYCL/ESIMD/Prefix_Local_sum2.cpp b/SYCL/ESIMD/Prefix_Local_sum2.cpp index 96a68fb059..58b9a7f7be 100644 --- a/SYCL/ESIMD/Prefix_Local_sum2.cpp +++ b/SYCL/ESIMD/Prefix_Local_sum2.cpp @@ -147,15 +147,21 @@ int main(int argc, char *argv[]) { malloc(size * TUPLE_SZ * sizeof(unsigned int))); compute_local_prefixsum(pInputs, pExpectOutputs, size); - auto e1 = q.submit([&](handler &cgh) { - cgh.parallel_for( - range<2>{size / PREFIX_ENTRIES, 1} * LocalRange, - [=](item<2> it) SYCL_ESIMD_KERNEL { - cmk_acum_iterative(pInputs, it.get_id(0), 1, PREFIX_ENTRIES); - }); - }); - - e1.wait(); + try { + auto e1 = q.submit([&](handler &cgh) { + cgh.parallel_for( + range<2>{size / PREFIX_ENTRIES, 1} * LocalRange, + [=](item<2> it) SYCL_ESIMD_KERNEL { + cmk_acum_iterative(pInputs, it.get_id(0), 1, PREFIX_ENTRIES); + }); + }); + e1.wait(); + } catch (cl::sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + free(pInputs, ctxt); + free(pExpectOutputs); + return e.get_cl_code(); + } bool pass = memcmp(pInputs, pExpectOutputs, size * TUPLE_SZ * sizeof(unsigned int)) == 0; diff --git a/SYCL/ESIMD/Prefix_Local_sum3.cpp b/SYCL/ESIMD/Prefix_Local_sum3.cpp index b459463c0d..6a5004857a 100644 --- a/SYCL/ESIMD/Prefix_Local_sum3.cpp +++ b/SYCL/ESIMD/Prefix_Local_sum3.cpp @@ -236,44 +236,50 @@ void cmk_acum_final(unsigned *buf, unsigned h_pos, unsigned int stride_elems, void hierarchical_prefix(queue &q, unsigned *buf, unsigned elem_stride, unsigned thrd_stride, unsigned n_entries, unsigned entry_per_th) { - if (n_entries <= REMAINING_ENTRIES) { - std::cout << "... n_entries: " << n_entries + try { + if (n_entries <= REMAINING_ENTRIES) { + std::cout << "... n_entries: " << n_entries + << " elem_stide: " << elem_stride + << " thread_stride: " << thrd_stride + << " entry per thread: " << entry_per_th << std::endl; + // one single thread + q.submit([&](handler &cgh) { + cgh.parallel_for( + range<2>{1, 1} * range<2>{1, 1}, [=](item<2> it) SYCL_ESIMD_KERNEL { + cmk_acum_final(buf, it.get_id(0), elem_stride, n_entries); + }); + }); + q.wait(); + return; + } + + std::cout << "*** n_entries: " << n_entries << " elem_stide: " << elem_stride << " thread_stride: " << thrd_stride << " entry per thread: " << entry_per_th << std::endl; - // one single thread - q.submit([&](handler &cgh) { - cgh.parallel_for( - range<2>{1, 1} * range<2>{1, 1}, [=](item<2> it) SYCL_ESIMD_KERNEL { - cmk_acum_final(buf, it.get_id(0), elem_stride, n_entries); - }); - }); - q.wait(); - return; - } - std::cout << "*** n_entries: " << n_entries << " elem_stide: " << elem_stride - << " thread_stride: " << thrd_stride - << " entry per thread: " << entry_per_th << std::endl; - - if (entry_per_th == PREFIX_ENTRIES) { - q.submit([&](handler &cgh) { - cgh.parallel_for( - range<2>{n_entries / entry_per_th, 1} * range<2>{1, 1}, - [=](item<2> it) SYCL_ESIMD_KERNEL { - cmk_acum_iterative(buf, it.get_id(0), elem_stride, thrd_stride); - }); - }); - q.wait(); - } else { - q.submit([&](handler &cgh) { - cgh.parallel_for( - range<2>{n_entries / entry_per_th, 1} * range<2>{1, 1}, - [=](item<2> it) SYCL_ESIMD_KERNEL { - cmk_acum_iterative_low(buf, it.get_id(0), elem_stride, thrd_stride); - }); - }); - q.wait(); + if (entry_per_th == PREFIX_ENTRIES) { + q.submit([&](handler &cgh) { + cgh.parallel_for( + range<2>{n_entries / entry_per_th, 1} * range<2>{1, 1}, + [=](item<2> it) SYCL_ESIMD_KERNEL { + cmk_acum_iterative(buf, it.get_id(0), elem_stride, thrd_stride); + }); + }); + q.wait(); + } else { + q.submit([&](handler &cgh) { + cgh.parallel_for( + range<2>{n_entries / entry_per_th, 1} * range<2>{1, 1}, + [=](item<2> it) SYCL_ESIMD_KERNEL { + cmk_acum_iterative_low(buf, it.get_id(0), elem_stride, + thrd_stride); + }); + }); + q.wait(); + } + } catch (cl::sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; } // if number of remaining entries <= 4K , each thread accumulates smaller diff --git a/SYCL/ESIMD/Stencil.cpp b/SYCL/ESIMD/Stencil.cpp index 1a7f8fd155..ab18df1df0 100644 --- a/SYCL/ESIMD/Stencil.cpp +++ b/SYCL/ESIMD/Stencil.cpp @@ -104,75 +104,82 @@ int main(void) { InitializeSquareMatrix(inputMatrix, DIM_SIZE, false); InitializeSquareMatrix(outputMatrix, DIM_SIZE, true); - auto e = q.submit([&](handler &cgh) { - cgh.parallel_for( - GlobalRange * LocalRange, [=](item<2> it) SYCL_ESIMD_KERNEL { - using namespace sycl::INTEL::gpu; - uint h_pos = it.get_id(0); - uint v_pos = it.get_id(1); - - simd vin; - // matrix HEIGHT+10 x 32 - auto in = vin.format(); - - // - // rather than loading all data in - // the code will interleave data loading and compute - // first, we load enough data for the first 16 pixels - // - unsigned off = (v_pos * HEIGHT) * DIM_SIZE + h_pos * WIDTH; + try { + auto e = q.submit([&](handler &cgh) { + cgh.parallel_for( + GlobalRange * LocalRange, [=](item<2> it) SYCL_ESIMD_KERNEL { + using namespace sycl::INTEL::gpu; + uint h_pos = it.get_id(0); + uint v_pos = it.get_id(1); + + simd vin; + // matrix HEIGHT+10 x 32 + auto in = vin.format(); + + // + // rather than loading all data in + // the code will interleave data loading and compute + // first, we load enough data for the first 16 pixels + // + unsigned off = (v_pos * HEIGHT) * DIM_SIZE + h_pos * WIDTH; #pragma unroll - for (unsigned i = 0; i < 10; i++) { - in.row(i) = block_load(inputMatrix + off); - off += DIM_SIZE; - } + for (unsigned i = 0; i < 10; i++) { + in.row(i) = block_load(inputMatrix + off); + off += DIM_SIZE; + } - unsigned out_off = - (((v_pos * HEIGHT + 5) * DIM_SIZE + (h_pos * WIDTH) + 5)) * - sizeof(float); - simd elm16(0, 1); + unsigned out_off = + (((v_pos * HEIGHT + 5) * DIM_SIZE + (h_pos * WIDTH) + 5)) * + sizeof(float); + simd elm16(0, 1); #pragma unroll - for (unsigned i = 0; i < HEIGHT; i++) { - - in.row(10 + i) = block_load(inputMatrix + off); - off += DIM_SIZE; - - simd sum = - in.row(i + 0).select(5) * -0.02f + - in.row(i + 1).select(5) * -0.025f + - in.row(i + 2).select(5) * -0.0333333333333f + - in.row(i + 3).select(5) * -0.05f + - in.row(i + 4).select(5) * -0.1f + - in.row(i + 6).select(5) * 0.1f + - in.row(i + 7).select(5) * 0.05f + - in.row(i + 8).select(5) * 0.0333333333333f + - in.row(i + 9).select(5) * 0.025f + - in.row(i + 10).select(5) * 0.02f + - in.row(i + 5).select(0) * -0.02f + - in.row(i + 5).select(1) * -0.025f + - in.row(i + 5).select(2) * -0.0333333333333f + - in.row(i + 5).select(3) * -0.05f + - in.row(i + 5).select(4) * -0.1f + - in.row(i + 5).select(6) * 0.1f + - in.row(i + 5).select(7) * 0.05f + - in.row(i + 5).select(8) * 0.0333333333333f + - in.row(i + 5).select(9) * 0.025f + - in.row(i + 5).select(10) * 0.02f; - - // predciate output - simd p = (elm16 + h_pos * WIDTH) < DIM_SIZE - 10; - - simd elm16_off = elm16 * sizeof(float) + out_off; - scatter(outputMatrix, sum, elm16_off, p); - out_off += DIM_SIZE * sizeof(float); - - if (v_pos * HEIGHT + 10 + i >= DIM_SIZE - 1) - break; - } - }); - }); - e.wait(); + for (unsigned i = 0; i < HEIGHT; i++) { + + in.row(10 + i) = block_load(inputMatrix + off); + off += DIM_SIZE; + + simd sum = + in.row(i + 0).select(5) * -0.02f + + in.row(i + 1).select(5) * -0.025f + + in.row(i + 2).select(5) * -0.0333333333333f + + in.row(i + 3).select(5) * -0.05f + + in.row(i + 4).select(5) * -0.1f + + in.row(i + 6).select(5) * 0.1f + + in.row(i + 7).select(5) * 0.05f + + in.row(i + 8).select(5) * 0.0333333333333f + + in.row(i + 9).select(5) * 0.025f + + in.row(i + 10).select(5) * 0.02f + + in.row(i + 5).select(0) * -0.02f + + in.row(i + 5).select(1) * -0.025f + + in.row(i + 5).select(2) * -0.0333333333333f + + in.row(i + 5).select(3) * -0.05f + + in.row(i + 5).select(4) * -0.1f + + in.row(i + 5).select(6) * 0.1f + + in.row(i + 5).select(7) * 0.05f + + in.row(i + 5).select(8) * 0.0333333333333f + + in.row(i + 5).select(9) * 0.025f + + in.row(i + 5).select(10) * 0.02f; + + // predciate output + simd p = (elm16 + h_pos * WIDTH) < DIM_SIZE - 10; + + simd elm16_off = elm16 * sizeof(float) + out_off; + scatter(outputMatrix, sum, elm16_off, p); + out_off += DIM_SIZE * sizeof(float); + + if (v_pos * HEIGHT + 10 + i >= DIM_SIZE - 1) + break; + } + }); + }); + e.wait(); + } catch (cl::sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + free(inputMatrix, ctxt); + free(outputMatrix, ctxt); + return e.get_cl_code(); + } // check result bool passed = CheckResults(outputMatrix, inputMatrix); diff --git a/SYCL/ESIMD/accessor_gather_scatter.cpp b/SYCL/ESIMD/accessor_gather_scatter.cpp index 178515ade4..fe31e9b48c 100644 --- a/SYCL/ESIMD/accessor_gather_scatter.cpp +++ b/SYCL/ESIMD/accessor_gather_scatter.cpp @@ -58,7 +58,7 @@ template bool test(queue q) { A[i] = (T)i; } - { + try { buffer buf(A, range<1>(size)); range<1> glob_range{size / VL}; @@ -67,6 +67,10 @@ template bool test(queue q) { Kernel kernel(acc); cgh.parallel_for(glob_range, kernel); }); + } catch (cl::sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + delete[] A; + return e.get_cl_code(); } int err_cnt = 0; diff --git a/SYCL/ESIMD/accessor_load_store.cpp b/SYCL/ESIMD/accessor_load_store.cpp index 61b87fff08..0709f0044b 100644 --- a/SYCL/ESIMD/accessor_load_store.cpp +++ b/SYCL/ESIMD/accessor_load_store.cpp @@ -53,7 +53,7 @@ template bool test(queue q, size_t size) { A[i] = (T)i; } - { + try { buffer buf(A, range<1>(size)); range<1> glob_range{size}; @@ -62,6 +62,10 @@ template bool test(queue q, size_t size) { Kernel kernel(acc); cgh.parallel_for(glob_range, kernel); }); + } catch (cl::sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + delete[] A; + return e.get_cl_code(); } int err_cnt = 0; diff --git a/SYCL/ESIMD/histogram.cpp b/SYCL/ESIMD/histogram.cpp index 6709950a3d..892ea5b0f2 100644 --- a/SYCL/ESIMD/histogram.cpp +++ b/SYCL/ESIMD/histogram.cpp @@ -135,7 +135,7 @@ int main(int argc, char *argv[]) { image_channel_type::unsigned_int32, range<2>{width / sizeof(uint4), height}); - { + try { // create ranges // We need that many workitems auto GlobalRange = range<1>(range_width * range_height); @@ -207,7 +207,11 @@ int main(int argc, char *argv[]) { // SYCL will enqueue and run the kernel. Recall that the buffer's data is // given back to the host at the end of scope. - } // make sure data is given back to the host at the end of this scope + // make sure data is given back to the host at the end of this scope + } catch (cl::sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + return e.get_cl_code(); + } writeHist(bins); writeHist(cpuHistogram); diff --git a/SYCL/ESIMD/histogram_256_slm.cpp b/SYCL/ESIMD/histogram_256_slm.cpp index 040834911c..564c8c7e6f 100644 --- a/SYCL/ESIMD/histogram_256_slm.cpp +++ b/SYCL/ESIMD/histogram_256_slm.cpp @@ -157,7 +157,7 @@ int main() { auto LocalRange = cl::sycl::range<1>(NUM_BINS / 16); cl::sycl::nd_range<1> Range(GlobalRange, LocalRange); - { + try { auto e = q.submit([&](cl::sycl::handler &cgh) { cgh.parallel_for( Range, [=](cl::sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL { @@ -166,6 +166,9 @@ int main() { }); }); e.wait(); + } catch (cl::sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + return e.get_cl_code(); } std::cout << "finish GPU histogram\n"; diff --git a/SYCL/ESIMD/histogram_256_slm_spec.cpp b/SYCL/ESIMD/histogram_256_slm_spec.cpp index cfd919b9c6..24c9dffe59 100644 --- a/SYCL/ESIMD/histogram_256_slm_spec.cpp +++ b/SYCL/ESIMD/histogram_256_slm_spec.cpp @@ -171,7 +171,7 @@ int main(int argc, char **argv) { auto LocalRange = cl::sycl::range<1>(NUM_BINS / 16); cl::sycl::nd_range<1> Range(GlobalRange, LocalRange); - { + try { auto e = q.submit([&](cl::sycl::handler &cgh) { cgh.parallel_for( prg.get_kernel(), Range, @@ -181,6 +181,9 @@ int main(int argc, char **argv) { }); }); e.wait(); + } catch (cl::sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + return e.get_cl_code(); } std::cout << "finish GPU histogram\n"; diff --git a/SYCL/ESIMD/histogram_2d.cpp b/SYCL/ESIMD/histogram_2d.cpp index b1bfdda7cd..23cb598450 100644 --- a/SYCL/ESIMD/histogram_2d.cpp +++ b/SYCL/ESIMD/histogram_2d.cpp @@ -135,7 +135,7 @@ int main(int argc, char *argv[]) { image_channel_type::unsigned_int32, range<2>{width / sizeof(uint4), height}); - { + try { // create ranges // We need that many workitems auto GlobalRange = range<2>(range_width, range_height); @@ -206,7 +206,11 @@ int main(int argc, char *argv[]) { // SYCL will enqueue and run the kernel. Recall that the buffer's data is // given back to the host at the end of scope. - } // make sure data is given back to the host at the end of this scope + // make sure data is given back to the host at the end of this scope + } catch (cl::sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + return e.get_cl_code(); + } writeHist(bins); writeHist(cpuHistogram); diff --git a/SYCL/ESIMD/histogram_raw_send.cpp b/SYCL/ESIMD/histogram_raw_send.cpp index 64435c296d..b86538477e 100644 --- a/SYCL/ESIMD/histogram_raw_send.cpp +++ b/SYCL/ESIMD/histogram_raw_send.cpp @@ -159,7 +159,7 @@ int main(int argc, char *argv[]) { image_channel_type::unsigned_int32, range<2>{width / sizeof(uint4), height}); - { + try { // create ranges // We need that many task groups auto GlobalRange = range<1>(range_width * range_height); @@ -231,7 +231,11 @@ int main(int argc, char *argv[]) { // SYCL will enqueue and run the kernel. Recall that the buffer's data is // given back to the host at the end of scope. - } // make sure data is given back to the host at the end of this scope + // make sure data is given back to the host at the end of this scope + } catch (cl::sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + return e.get_cl_code(); + } writeHist(bins); writeHist(cpuHistogram); diff --git a/SYCL/ESIMD/kmeans/kmeans.cpp b/SYCL/ESIMD/kmeans/kmeans.cpp index 91fd646b87..d3abb00033 100644 --- a/SYCL/ESIMD/kmeans/kmeans.cpp +++ b/SYCL/ESIMD/kmeans/kmeans.cpp @@ -678,10 +678,17 @@ int main(int argc, char *argv[]) { kernel3_time_in_ns += report_time("kernel3", e2); }; - for (auto i = 0; i < NUM_ITERATIONS - 1; i++) { - submitJobs(false); + try { + for (auto i = 0; i < NUM_ITERATIONS - 1; i++) { + submitJobs(false); + } + submitJobs(true); + } catch (cl::sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + delete cpu_points; + delete cpu_centroids; + return e.get_cl_code(); } - submitJobs(true); //--- diff --git a/SYCL/ESIMD/linear/linear.cpp b/SYCL/ESIMD/linear/linear.cpp index f5323efd9a..fbf2d8ad4d 100644 --- a/SYCL/ESIMD/linear/linear.cpp +++ b/SYCL/ESIMD/linear/linear.cpp @@ -54,7 +54,7 @@ int main(int argc, char *argv[]) { // Sets output to blank image. output_image.setData(new unsigned char[img_size]); - { + try { unsigned int img_width = width * bpp / (8 * sizeof(int)); cl::sycl::image<2> imgInput( @@ -124,6 +124,9 @@ int main(int argc, char *argv[]) { }); }); e.wait(); + } catch (cl::sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + return e.get_cl_code(); } output_image.save("linear_out.bmp"); diff --git a/SYCL/ESIMD/mandelbrot/mandelbrot.cpp b/SYCL/ESIMD/mandelbrot/mandelbrot.cpp index f137e5b5ac..3f62b25681 100644 --- a/SYCL/ESIMD/mandelbrot/mandelbrot.cpp +++ b/SYCL/ESIMD/mandelbrot/mandelbrot.cpp @@ -84,7 +84,7 @@ int main(int argc, char *argv[]) { // Sets output to blank image. unsigned char *buf = new unsigned char[img_size]; - { + try { cl::sycl::image<2> imgOutput((unsigned int *)buf, image_channel_order::rgba, image_channel_type::unsigned_int8, range<2>{WIDTH, HEIGHT}); @@ -115,6 +115,10 @@ int main(int argc, char *argv[]) { }); }); e.wait(); + } catch (cl::sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + delete[] buf; + return e.get_cl_code(); } char *out_file = argv[1]; diff --git a/SYCL/ESIMD/mandelbrot/mandelbrot_spec.cpp b/SYCL/ESIMD/mandelbrot/mandelbrot_spec.cpp index 0af7422ca0..f885744880 100644 --- a/SYCL/ESIMD/mandelbrot/mandelbrot_spec.cpp +++ b/SYCL/ESIMD/mandelbrot/mandelbrot_spec.cpp @@ -93,7 +93,7 @@ int main(int argc, char *argv[]) { // Sets output to blank image. unsigned char *buf = new unsigned char[img_size]; - { + try { cl::sycl::image<2> imgOutput((unsigned int *)buf, image_channel_order::rgba, image_channel_type::unsigned_int8, range<2>{WIDTH, HEIGHT}); @@ -152,6 +152,10 @@ int main(int argc, char *argv[]) { }); }); e.wait(); + } catch (cl::sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + delete[] buf; + return e.get_cl_code(); } char *out_file = argv[1]; diff --git a/SYCL/ESIMD/matrix_transpose.cpp b/SYCL/ESIMD/matrix_transpose.cpp index f523094ee7..8bd2ac9720 100644 --- a/SYCL/ESIMD/matrix_transpose.cpp +++ b/SYCL/ESIMD/matrix_transpose.cpp @@ -348,37 +348,44 @@ bool runTest(unsigned MZ, unsigned block_size) { double kernel_times = 0; unsigned num_iters = 10; - // num_iters + 1, iteration#0 is for warmup - for (int i = 0; i <= num_iters; ++i) { - double etime = 0; - // make sure that buffer object has short live-range - // than M - buffer buf(M, range<1>(MZ * MZ)); - - if (block_size == 16 && MZ >= 16) { - auto e = q.submit([&](handler &cgh) { - auto acc = buf.get_access(cgh); - cgh.parallel_for( - Range, [=](nd_item<2> ndi) SYCL_ESIMD_KERNEL { - transpose16(acc, MZ, ndi.get_global_id(0), ndi.get_global_id(1)); - }); - }); - e.wait(); - etime = report_time("kernel time", e); - } else if (block_size == 8) { - auto e = q.submit([&](handler &cgh) { - auto acc = buf.get_access(cgh); - cgh.parallel_for( - Range, [=](nd_item<2> ndi) SYCL_ESIMD_KERNEL { - transpose8(acc, MZ, ndi.get_global_id(0), ndi.get_global_id(1)); - }); - }); - e.wait(); - etime = report_time("kernel time", e); - } + try { + // num_iters + 1, iteration#0 is for warmup + for (int i = 0; i <= num_iters; ++i) { + double etime = 0; + // make sure that buffer object has short live-range + // than M + buffer buf(M, range<1>(MZ * MZ)); + + if (block_size == 16 && MZ >= 16) { + auto e = q.submit([&](handler &cgh) { + auto acc = buf.get_access(cgh); + cgh.parallel_for( + Range, [=](nd_item<2> ndi) SYCL_ESIMD_KERNEL { + transpose16(acc, MZ, ndi.get_global_id(0), + ndi.get_global_id(1)); + }); + }); + e.wait(); + etime = report_time("kernel time", e); + } else if (block_size == 8) { + auto e = q.submit([&](handler &cgh) { + auto acc = buf.get_access(cgh); + cgh.parallel_for( + Range, [=](nd_item<2> ndi) SYCL_ESIMD_KERNEL { + transpose8(acc, MZ, ndi.get_global_id(0), ndi.get_global_id(1)); + }); + }); + e.wait(); + etime = report_time("kernel time", e); + } - if (i > 0) - kernel_times += etime; + if (i > 0) + kernel_times += etime; + } + } catch (cl::sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + delete[] M; + return e.get_cl_code(); } // End timer. diff --git a/SYCL/ESIMD/matrix_transpose_glb.cpp b/SYCL/ESIMD/matrix_transpose_glb.cpp index 1dfb4f6806..ce60f699c4 100644 --- a/SYCL/ESIMD/matrix_transpose_glb.cpp +++ b/SYCL/ESIMD/matrix_transpose_glb.cpp @@ -312,31 +312,37 @@ bool runTest(unsigned MZ, unsigned block_size) { double kernel_times = 0; unsigned num_iters = 10; - // num_iters + 1, iteration#0 is for warmup - for (int i = 0; i <= num_iters; ++i) { - double etime = 0; - if (block_size == 16 && MZ >= 16) { - auto e = q.submit([&](handler &cgh) { - cgh.parallel_for( - Range, [=](nd_item<2> ndi) SYCL_ESIMD_KERNEL { - transpose16(M, MZ, ndi.get_global_id(0), ndi.get_global_id(1)); - }); - }); - e.wait(); - etime = report_time("kernel time", e); - } else if (block_size == 8) { - auto e = q.submit([&](handler &cgh) { - cgh.parallel_for( - Range, [=](nd_item<2> ndi) SYCL_ESIMD_KERNEL { - transpose8(M, MZ, ndi.get_global_id(0), ndi.get_global_id(1)); - }); - }); - e.wait(); - etime = report_time("kernel time", e); - } + try { + // num_iters + 1, iteration#0 is for warmup + for (int i = 0; i <= num_iters; ++i) { + double etime = 0; + if (block_size == 16 && MZ >= 16) { + auto e = q.submit([&](handler &cgh) { + cgh.parallel_for( + Range, [=](nd_item<2> ndi) SYCL_ESIMD_KERNEL { + transpose16(M, MZ, ndi.get_global_id(0), ndi.get_global_id(1)); + }); + }); + e.wait(); + etime = report_time("kernel time", e); + } else if (block_size == 8) { + auto e = q.submit([&](handler &cgh) { + cgh.parallel_for( + Range, [=](nd_item<2> ndi) SYCL_ESIMD_KERNEL { + transpose8(M, MZ, ndi.get_global_id(0), ndi.get_global_id(1)); + }); + }); + e.wait(); + etime = report_time("kernel time", e); + } - if (i > 0) - kernel_times += etime; + if (i > 0) + kernel_times += etime; + } + } catch (cl::sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + free(M, ctxt); + return e.get_cl_code(); } // End timer. diff --git a/SYCL/ESIMD/matrix_transpose_usm.cpp b/SYCL/ESIMD/matrix_transpose_usm.cpp index 5077bb3c88..b56a3293f0 100644 --- a/SYCL/ESIMD/matrix_transpose_usm.cpp +++ b/SYCL/ESIMD/matrix_transpose_usm.cpp @@ -319,31 +319,37 @@ bool runTest(unsigned MZ, unsigned block_size) { double kernel_times = 0; unsigned num_iters = 10; - // num_iters + 1, iteration#0 is for warmup - for (int i = 0; i <= num_iters; ++i) { - double etime = 0; - if (block_size == 16 && MZ >= 16) { - auto e = q.submit([&](handler &cgh) { - cgh.parallel_for( - Range, [=](nd_item<2> ndi) SYCL_ESIMD_KERNEL { - transpose16(M, MZ, ndi.get_global_id(0), ndi.get_global_id(1)); - }); - }); - e.wait(); - etime = report_time("kernel time", e); - } else if (block_size == 8) { - auto e = q.submit([&](handler &cgh) { - cgh.parallel_for( - Range, [=](nd_item<2> ndi) SYCL_ESIMD_KERNEL { - transpose8(M, MZ, ndi.get_global_id(0), ndi.get_global_id(1)); - }); - }); - e.wait(); - etime = report_time("kernel time", e); - } + try { + // num_iters + 1, iteration#0 is for warmup + for (int i = 0; i <= num_iters; ++i) { + double etime = 0; + if (block_size == 16 && MZ >= 16) { + auto e = q.submit([&](handler &cgh) { + cgh.parallel_for( + Range, [=](nd_item<2> ndi) SYCL_ESIMD_KERNEL { + transpose16(M, MZ, ndi.get_global_id(0), ndi.get_global_id(1)); + }); + }); + e.wait(); + etime = report_time("kernel time", e); + } else if (block_size == 8) { + auto e = q.submit([&](handler &cgh) { + cgh.parallel_for( + Range, [=](nd_item<2> ndi) SYCL_ESIMD_KERNEL { + transpose8(M, MZ, ndi.get_global_id(0), ndi.get_global_id(1)); + }); + }); + e.wait(); + etime = report_time("kernel time", e); + } - if (i > 0) - kernel_times += etime; + if (i > 0) + kernel_times += etime; + } + } catch (cl::sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + free(M, ctxt); + return e.get_cl_code(); } // End timer. diff --git a/SYCL/ESIMD/reduction.cpp b/SYCL/ESIMD/reduction.cpp index 4bc4161cde..921de22fb9 100644 --- a/SYCL/ESIMD/reduction.cpp +++ b/SYCL/ESIMD/reduction.cpp @@ -33,6 +33,7 @@ int main(void) { auto dev = q.get_device(); std::cout << "Running on " << dev.get_info() << "\n"; auto ctxt = q.get_context(); + // TODO: release memory in the end of the test TYPE *A = static_cast(malloc_shared(InputSize * sizeof(TYPE), dev, ctxt)); int *B = @@ -46,7 +47,7 @@ int main(void) { } } - { + try { cl::sycl::range<1> GroupRange{InputSize / VL}; cl::sycl::range<1> TaskRange{GroupSize}; cl::sycl::nd_range<1> Range(GroupRange, TaskRange); @@ -68,6 +69,9 @@ int main(void) { }); }); e.wait(); + } catch (cl::sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + return e.get_cl_code(); } auto compute_reduce_sum = [](TYPE A[InputSize]) -> int { diff --git a/SYCL/ESIMD/slm_barrier.cpp b/SYCL/ESIMD/slm_barrier.cpp index 02d7ba7b15..7fe8146176 100644 --- a/SYCL/ESIMD/slm_barrier.cpp +++ b/SYCL/ESIMD/slm_barrier.cpp @@ -81,6 +81,7 @@ int main(void) { auto dev = q.get_device(); std::cout << "Running on " << dev.get_info() << "\n"; auto ctxt = q.get_context(); + // TODO: release memory in the end of the test uint *A = static_cast(malloc_shared(Size * sizeof(uint), dev, ctxt)); uint *B = static_cast(malloc_shared(Size * sizeof(uint), dev, ctxt)); @@ -101,34 +102,40 @@ int main(void) { cl::sycl::range<1> LocalRange{LOCAL_SIZE}; cl::sycl::nd_range<1> Range{GlobalRange * LocalRange, LocalRange}; - auto e = q.submit([&](handler &cgh) { - cgh.parallel_for( - Range, [=](cl::sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL { - simd v_slmData; - simd v_Off(0, 4); + try { + auto e = q.submit([&](handler &cgh) { + cgh.parallel_for( + Range, [=](cl::sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL { + simd v_slmData; + simd v_Off(0, 4); - uint localID = ndi.get_local_id(0); - uint groupSize = ndi.get_local_range(0); - uint globalID = ndi.get_global_id(0); - uint groupID = ndi.get_group(0); + uint localID = ndi.get_local_id(0); + uint groupSize = ndi.get_local_range(0); + uint globalID = ndi.get_global_id(0); + uint groupID = ndi.get_group(0); - slm_init(1024); + slm_init(1024); - int grpMemOffset = groupID * groupSize * VL * 4; + int grpMemOffset = groupID * groupSize * VL * 4; - load_to_slm(groupSize, localID, 0, (char *)A, grpMemOffset, - groupSize * VL * 4); + load_to_slm(groupSize, localID, 0, (char *)A, grpMemOffset, + groupSize * VL * 4); - auto shiftID = (localID + 1) % 4; + auto shiftID = (localID + 1) % 4; - v_Off = v_Off + shiftID * 64; + v_Off = v_Off + shiftID * 64; - v_slmData = slm_load(v_Off); + v_slmData = slm_load(v_Off); + + block_store(B + globalID * VL, v_slmData); + }); + }); + e.wait(); + } catch (cl::sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + return e.get_cl_code(); + } - block_store(B + globalID * VL, v_slmData); - }); - }); - e.wait(); std::cout << "result" << std::endl; int result = 0; for (int i = 0; i < NUM_THREADS; i++) { diff --git a/SYCL/ESIMD/stencil2.cpp b/SYCL/ESIMD/stencil2.cpp index ad51efc00c..9198a874e3 100644 --- a/SYCL/ESIMD/stencil2.cpp +++ b/SYCL/ESIMD/stencil2.cpp @@ -106,75 +106,82 @@ int main(void) { InitializeSquareMatrix(inputMatrix, DIM_SIZE, false); InitializeSquareMatrix(outputMatrix, DIM_SIZE, true); - auto e = q.submit([&](handler &cgh) { - cgh.parallel_for( - GlobalRange * LocalRange, [=](item<2> it) SYCL_ESIMD_KERNEL { - using namespace sycl::INTEL::gpu; - uint h_pos = it.get_id(0); - uint v_pos = it.get_id(1); - - simd vin; - // matrix HEIGHT+10 x 32 - auto in = vin.format(); - - // - // rather than loading all data in - // the code will interleave data loading and compute - // first, we load enough data for the first 16 pixels - // - unsigned off = (v_pos * HEIGHT) * DIM_SIZE + h_pos * WIDTH; + try { + auto e = q.submit([&](handler &cgh) { + cgh.parallel_for( + GlobalRange * LocalRange, [=](item<2> it) SYCL_ESIMD_KERNEL { + using namespace sycl::INTEL::gpu; + uint h_pos = it.get_id(0); + uint v_pos = it.get_id(1); + + simd vin; + // matrix HEIGHT+10 x 32 + auto in = vin.format(); + + // + // rather than loading all data in + // the code will interleave data loading and compute + // first, we load enough data for the first 16 pixels + // + unsigned off = (v_pos * HEIGHT) * DIM_SIZE + h_pos * WIDTH; #pragma unroll - for (unsigned i = 0; i < 10; i++) { - in.row(i) = block_load(inputMatrix + off); - off += DIM_SIZE; - } + for (unsigned i = 0; i < 10; i++) { + in.row(i) = block_load(inputMatrix + off); + off += DIM_SIZE; + } - unsigned out_off = - (((v_pos * HEIGHT + 5) * DIM_SIZE + (h_pos * WIDTH) + 5)) * - sizeof(float); - simd elm16(0, 1); + unsigned out_off = + (((v_pos * HEIGHT + 5) * DIM_SIZE + (h_pos * WIDTH) + 5)) * + sizeof(float); + simd elm16(0, 1); #pragma unroll - for (unsigned i = 0; i < HEIGHT; i++) { - - in.row(10 + i) = block_load(inputMatrix + off); - off += DIM_SIZE; - - simd sum = - vin.select(GET_IDX(i, 5)) * -0.02f + - vin.select(GET_IDX(i + 1, 5)) * -0.025f + - vin.select(GET_IDX(i + 2, 5)) * -0.0333333333333f + - vin.select(GET_IDX(i + 3, 5)) * -0.05f + - vin.select(GET_IDX(i + 4, 5)) * -0.1f + - vin.select(GET_IDX(i + 6, 5)) * 0.1f + - vin.select(GET_IDX(i + 7, 5)) * 0.05f + - vin.select(GET_IDX(i + 8, 5)) * 0.0333333333333f + - vin.select(GET_IDX(i + 9, 5)) * 0.025f + - vin.select(GET_IDX(i + 10, 5)) * 0.02f + - vin.select(GET_IDX(i + 5, 0)) * -0.02f + - vin.select(GET_IDX(i + 5, 1)) * -0.025f + - vin.select(GET_IDX(i + 5, 2)) * -0.0333333333333f + - vin.select(GET_IDX(i + 5, 3)) * -0.05f + - vin.select(GET_IDX(i + 5, 4)) * -0.1f + - vin.select(GET_IDX(i + 5, 6)) * 0.1f + - vin.select(GET_IDX(i + 5, 7)) * 0.05f + - vin.select(GET_IDX(i + 5, 8)) * 0.0333333333333f + - vin.select(GET_IDX(i + 5, 9)) * 0.025f + - vin.select(GET_IDX(i + 5, 10)) * 0.02f; - - // predciate output - simd p = (elm16 + h_pos * WIDTH) < DIM_SIZE - 10; - - simd elm16_off = elm16 * sizeof(float) + out_off; - scatter(outputMatrix, sum, elm16_off, p); - out_off += DIM_SIZE * sizeof(float); - - if (v_pos * HEIGHT + 10 + i >= DIM_SIZE - 1) - break; - } - }); - }); - e.wait(); + for (unsigned i = 0; i < HEIGHT; i++) { + + in.row(10 + i) = block_load(inputMatrix + off); + off += DIM_SIZE; + + simd sum = + vin.select(GET_IDX(i, 5)) * -0.02f + + vin.select(GET_IDX(i + 1, 5)) * -0.025f + + vin.select(GET_IDX(i + 2, 5)) * -0.0333333333333f + + vin.select(GET_IDX(i + 3, 5)) * -0.05f + + vin.select(GET_IDX(i + 4, 5)) * -0.1f + + vin.select(GET_IDX(i + 6, 5)) * 0.1f + + vin.select(GET_IDX(i + 7, 5)) * 0.05f + + vin.select(GET_IDX(i + 8, 5)) * 0.0333333333333f + + vin.select(GET_IDX(i + 9, 5)) * 0.025f + + vin.select(GET_IDX(i + 10, 5)) * 0.02f + + vin.select(GET_IDX(i + 5, 0)) * -0.02f + + vin.select(GET_IDX(i + 5, 1)) * -0.025f + + vin.select(GET_IDX(i + 5, 2)) * -0.0333333333333f + + vin.select(GET_IDX(i + 5, 3)) * -0.05f + + vin.select(GET_IDX(i + 5, 4)) * -0.1f + + vin.select(GET_IDX(i + 5, 6)) * 0.1f + + vin.select(GET_IDX(i + 5, 7)) * 0.05f + + vin.select(GET_IDX(i + 5, 8)) * 0.0333333333333f + + vin.select(GET_IDX(i + 5, 9)) * 0.025f + + vin.select(GET_IDX(i + 5, 10)) * 0.02f; + + // predciate output + simd p = (elm16 + h_pos * WIDTH) < DIM_SIZE - 10; + + simd elm16_off = elm16 * sizeof(float) + out_off; + scatter(outputMatrix, sum, elm16_off, p); + out_off += DIM_SIZE * sizeof(float); + + if (v_pos * HEIGHT + 10 + i >= DIM_SIZE - 1) + break; + } + }); + }); + e.wait(); + } catch (cl::sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + free(inputMatrix, ctxt); + free(outputMatrix, ctxt); + return e.get_cl_code(); + } // check result bool passed = CheckResults(outputMatrix, inputMatrix); diff --git a/SYCL/ESIMD/test_id_3d.cpp b/SYCL/ESIMD/test_id_3d.cpp index 0c21473da8..e4cae353df 100644 --- a/SYCL/ESIMD/test_id_3d.cpp +++ b/SYCL/ESIMD/test_id_3d.cpp @@ -40,20 +40,26 @@ int main(void) { int *C = static_cast( malloc_shared(ScalarGlobalRange.size() * sizeof(int), dev, ctxt)); - auto e = q.submit([&](handler &cgh) { - cgh.parallel_for( - GlobalRange, [=](item<3> it) SYCL_ESIMD_KERNEL { - using namespace sycl::INTEL::gpu; - auto id = it.get_id(); - // calculate linear ID: - size_t lin_id = id[0] * Y * X + id[1] * X + id[2]; - simd inc(0, 1); - int off = (int)(lin_id * VL); - simd val = inc + off; - block_store(C + off, val); - }); - }); - e.wait(); + try { + auto e = q.submit([&](handler &cgh) { + cgh.parallel_for( + GlobalRange, [=](item<3> it) SYCL_ESIMD_KERNEL { + using namespace sycl::INTEL::gpu; + auto id = it.get_id(); + // calculate linear ID: + size_t lin_id = id[0] * Y * X + id[1] * X + id[2]; + simd inc(0, 1); + int off = (int)(lin_id * VL); + simd val = inc + off; + block_store(C + off, val); + }); + }); + e.wait(); + } catch (cl::sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + return e.get_cl_code(); + } + int err_cnt = 0; for (size_t i = 0; i < ScalarGlobalRange.size(); ++i) { diff --git a/SYCL/ESIMD/vadd_1d.cpp b/SYCL/ESIMD/vadd_1d.cpp index ffb7b464b4..294d1b62fc 100644 --- a/SYCL/ESIMD/vadd_1d.cpp +++ b/SYCL/ESIMD/vadd_1d.cpp @@ -31,7 +31,7 @@ int main(void) { C[i] = 0.0f; } - { + try { buffer bufa(A, range<1>(Size)); buffer bufb(B, range<1>(Size)); buffer bufc(C, range<1>(Size)); @@ -62,6 +62,14 @@ int main(void) { }); }); e.wait(); + } catch (cl::sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + + delete[] A; + delete[] B; + delete[] C; + + return e.get_cl_code(); } int err_cnt = 0; diff --git a/SYCL/ESIMD/vadd_2d.cpp b/SYCL/ESIMD/vadd_2d.cpp index 7eb74b2fba..eff276ca9b 100644 --- a/SYCL/ESIMD/vadd_2d.cpp +++ b/SYCL/ESIMD/vadd_2d.cpp @@ -32,7 +32,7 @@ int main(void) { A[i] = B[i] = i; } - { + try { cl::sycl::image<2> imgA(A, image_channel_order::rgba, image_channel_type::unsigned_int32, range<2>{Size / 4, 1}); @@ -82,6 +82,9 @@ int main(void) { }); }); e.wait(); + } catch (cl::sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + return e.get_cl_code(); } for (unsigned i = 0; i < Size; ++i) { diff --git a/SYCL/ESIMD/vadd_raw_send.cpp b/SYCL/ESIMD/vadd_raw_send.cpp index d76043804e..62beaa6def 100644 --- a/SYCL/ESIMD/vadd_raw_send.cpp +++ b/SYCL/ESIMD/vadd_raw_send.cpp @@ -91,7 +91,7 @@ int main(void) { C[i] = 0.0f; } - { + try { buffer bufa(A, range<1>(Size)); buffer bufb(B, range<1>(Size)); buffer bufc(C, range<1>(Size)); @@ -124,6 +124,14 @@ int main(void) { }); }); e.wait(); + } catch (cl::sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + + delete[] A; + delete[] B; + delete[] C; + + return e.get_cl_code(); } int err_cnt = 0; diff --git a/SYCL/ESIMD/vadd_usm.cpp b/SYCL/ESIMD/vadd_usm.cpp index 927479e633..428e754447 100644 --- a/SYCL/ESIMD/vadd_usm.cpp +++ b/SYCL/ESIMD/vadd_usm.cpp @@ -29,6 +29,7 @@ int main(void) { auto dev = q.get_device(); std::cout << "Running on " << dev.get_info() << "\n"; auto ctxt = q.get_context(); + // TODO: release memory in the end of the test float *A = static_cast(malloc_shared(Size * sizeof(float), dev, ctxt)); float *B = @@ -47,18 +48,25 @@ int main(void) { cl::sycl::nd_range<1> Range(GlobalRange, LocalRange); - auto e = q.submit([&](handler &cgh) { - cgh.parallel_for(Range, [=](nd_item<1> ndi) SYCL_ESIMD_KERNEL { - using namespace sycl::INTEL::gpu; + try { + auto e = q.submit([&](handler &cgh) { + cgh.parallel_for( + Range, [=](nd_item<1> ndi) SYCL_ESIMD_KERNEL { + using namespace sycl::INTEL::gpu; - int i = ndi.get_global_id(0); - simd va = block_load(A + i * VL); - simd vb = block_load(B + i * VL); - simd vc = va + vb; - block_store(C + i * VL, vc); + int i = ndi.get_global_id(0); + simd va = block_load(A + i * VL); + simd vb = block_load(B + i * VL); + simd vc = va + vb; + block_store(C + i * VL, vc); + }); }); - }); - e.wait(); + e.wait(); + } catch (cl::sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + return e.get_cl_code(); + } + int err_cnt = 0; for (unsigned i = 0; i < Size; ++i) {