Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 30 additions & 21 deletions SYCL/ESIMD/BitonicSortK.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -601,7 +601,7 @@ int BitonicSort::Solve(uint32_t *pInputs, uint32_t *pOutputs, uint32_t size) {
cl::sycl::range<1> SortLocalRange{1};

double total_time = 0;
{
try {
buffer<uint32_t, 1> bufi(pInputs, range<1>(size));
buffer<uint32_t, 1> bufo(pOutputs, range<1>(size));
// enqueue sort256 kernel
Expand All @@ -616,6 +616,9 @@ int BitonicSort::Solve(uint32_t *pInputs, uint32_t *pOutputs, uint32_t size) {
});
e.wait();
total_time += report_time("kernel time", e, e);
} catch (cl::sycl::exception const &e) {
std::cout << "SYCL exception caught: " << e.what() << '\n';
return e.get_cl_code();
}

// Each HW thread swaps two 256-element chunks. Hence, we only need
Expand All @@ -631,28 +634,34 @@ int BitonicSort::Solve(uint32_t *pInputs, uint32_t *pOutputs, uint32_t size) {
// this loop is for stage 8 to stage LOG2_ELEMENTS.
event mergeEvent[(LOG2_ELEMENTS - 8) * (LOG2_ELEMENTS - 7) / 2];
int k = 0;
for (int i = 8; i < LOG2_ELEMENTS; i++) {
// each step halves the stride distance of its prior step.
// 1<<j is the stride distance that the invoked step will handle.
// The recursive steps continue until stride distance 1 is complete.
// For stride distance less than 1<<8, no global synchronization
// is needed, i.e., all work can be done locally within HW threads.
// Hence, the invocation of j==8 cmk_bitonic_merge finishes stride 256
// compare-and-swap and then performs stride 128, 64, 32, 16, 8, 4, 2, 1
// locally.
for (int j = i; j >= 8; j--) {
buffer<uint32_t, 1> buf(pOutputs, range<1>(size));
mergeEvent[k] = pQueue_->submit([&](handler &cgh) {
auto acc = buf.get_access<access::mode::read_write>(cgh);
cgh.parallel_for<class Merge>(MergeGlobalRange * MergeLocalRange,
[=](id<1> tid) SYCL_ESIMD_KERNEL {
using namespace sycl::INTEL::gpu;
cmk_bitonic_merge(acc, j, i, tid);
});
});
k++;
try {
for (int i = 8; i < LOG2_ELEMENTS; i++) {
// each step halves the stride distance of its prior step.
// 1<<j is the stride distance that the invoked step will handle.
// The recursive steps continue until stride distance 1 is complete.
// For stride distance less than 1<<8, no global synchronization
// is needed, i.e., all work can be done locally within HW threads.
// Hence, the invocation of j==8 cmk_bitonic_merge finishes stride 256
// compare-and-swap and then performs stride 128, 64, 32, 16, 8, 4, 2, 1
// locally.
for (int j = i; j >= 8; j--) {
buffer<uint32_t, 1> buf(pOutputs, range<1>(size));
mergeEvent[k] = pQueue_->submit([&](handler &cgh) {
auto acc = buf.get_access<access::mode::read_write>(cgh);
cgh.parallel_for<class Merge>(MergeGlobalRange * MergeLocalRange,
[=](id<1> tid) SYCL_ESIMD_KERNEL {
using namespace sycl::INTEL::gpu;
cmk_bitonic_merge(acc, j, i, tid);
});
});
k++;
}
}
} catch (cl::sycl::exception const &e) {
std::cout << "SYCL exception caught: " << e.what() << '\n';
return e.get_cl_code();
}

mergeEvent[k - 1].wait();
total_time += report_time("kernel time", mergeEvent[0], mergeEvent[k - 1]);

Expand Down
68 changes: 40 additions & 28 deletions SYCL/ESIMD/BitonicSortKv2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -523,15 +523,20 @@ int BitonicSort::Solve(uint32_t *pInputs, uint32_t *pOutputs, uint32_t size) {

// enqueue sort256 kernel
double total_time = 0;
auto e = pQueue_->submit([&](handler &cgh) {
cgh.parallel_for<class Sort256>(
SortGlobalRange * SortLocalRange, [=](id<1> i) SYCL_ESIMD_KERNEL {
using namespace sycl::INTEL::gpu;
cmk_bitonic_sort_256(pInputs, pOutputs, i);
});
});
e.wait();
total_time += report_time("kernel time", e, e);
try {
auto e = pQueue_->submit([&](handler &cgh) {
cgh.parallel_for<class Sort256>(
SortGlobalRange * SortLocalRange, [=](id<1> i) SYCL_ESIMD_KERNEL {
using namespace sycl::INTEL::gpu;
cmk_bitonic_sort_256(pInputs, pOutputs, i);
});
});
e.wait();
total_time += report_time("kernel time", e, e);
} catch (cl::sycl::exception const &e) {
std::cout << "SYCL exception caught: " << e.what() << '\n';
return e.get_cl_code();
}

// Each HW thread swaps two 256-element chunks. Hence, we only need
// to launch size/ (base_sort_size*2) HW threads
Expand All @@ -546,27 +551,34 @@ int BitonicSort::Solve(uint32_t *pInputs, uint32_t *pOutputs, uint32_t size) {
// this loop is for stage 8 to stage LOG2_ELEMENTS.
event mergeEvent[(LOG2_ELEMENTS - 8) * (LOG2_ELEMENTS - 7) / 2];
int k = 0;
for (int i = 8; i < LOG2_ELEMENTS; i++) {
// each step halves the stride distance of its prior step.
// 1<<j is the stride distance that the invoked step will handle.
// The recursive steps continue until stride distance 1 is complete.
// For stride distance less than 1<<8, no global synchronization
// is needed, i.e., all work can be done locally within HW threads.
// Hence, the invocation of j==8 cmk_bitonic_merge finishes stride 256
// compare-and-swap and then performs stride 128, 64, 32, 16, 8, 4, 2, 1
// locally.
for (int j = i; j >= 8; j--) {
mergeEvent[k] = pQueue_->submit([&](handler &cgh) {
cgh.parallel_for<class Merge>(MergeGlobalRange * MergeLocalRange,
[=](id<1> tid) SYCL_ESIMD_KERNEL {
using namespace sycl::INTEL::gpu;
cmk_bitonic_merge(pOutputs, j, i, tid);
});
});
// mergeEvent[k].wait();
k++;
try {
for (int i = 8; i < LOG2_ELEMENTS; i++) {
// each step halves the stride distance of its prior step.
// 1<<j is the stride distance that the invoked step will handle.
// The recursive steps continue until stride distance 1 is complete.
// For stride distance less than 1<<8, no global synchronization
// is needed, i.e., all work can be done locally within HW threads.
// Hence, the invocation of j==8 cmk_bitonic_merge finishes stride 256
// compare-and-swap and then performs stride 128, 64, 32, 16, 8, 4, 2, 1
// locally.
for (int j = i; j >= 8; j--) {
mergeEvent[k] = pQueue_->submit([&](handler &cgh) {
cgh.parallel_for<class Merge>(MergeGlobalRange * MergeLocalRange,
[=](id<1> tid) SYCL_ESIMD_KERNEL {
using namespace sycl::INTEL::gpu;
cmk_bitonic_merge(pOutputs, j, i,
tid);
});
});
// mergeEvent[k].wait();
k++;
}
}
} catch (cl::sycl::exception const &e) {
std::cout << "SYCL exception caught: " << e.what() << '\n';
return e.get_cl_code();
}

mergeEvent[k - 1].wait();
total_time += report_time("kernel time", mergeEvent[0], mergeEvent[k - 1]);

Expand Down
134 changes: 70 additions & 64 deletions SYCL/ESIMD/PrefixSum.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -298,76 +298,82 @@ void cmk_prefix_iterative(unsigned *buf, unsigned h_pos,
void hierarchical_prefix(queue &q, unsigned *buf, unsigned elem_stride,
unsigned thrd_stride, unsigned n_entries,
unsigned entry_per_th) {
if (n_entries <= REMAINING_ENTRIES) {
std::cout << "... n_entries: " << n_entries
try {
if (n_entries <= REMAINING_ENTRIES) {
std::cout << "... n_entries: " << n_entries
<< " elem_stide: " << elem_stride
<< " thread_stride: " << thrd_stride
<< " entry per thread: " << entry_per_th << std::endl;
// one single thread
q.submit([&](handler &cgh) {
cgh.parallel_for<class Accum_final>(
range<2>{1, 1} * range<2>{1, 1}, [=](item<2> it) SYCL_ESIMD_KERNEL {
cmk_acum_final(buf, it.get_id(0), elem_stride, n_entries);
});
});
return;
}

std::cout << "*** n_entries: " << n_entries
<< " elem_stide: " << elem_stride
<< " thread_stride: " << thrd_stride
<< " entry per thread: " << entry_per_th << std::endl;
// one single thread
q.submit([&](handler &cgh) {
cgh.parallel_for<class Accum_final>(
range<2>{1, 1} * range<2>{1, 1}, [=](item<2> it) SYCL_ESIMD_KERNEL {
cmk_acum_final(buf, it.get_id(0), elem_stride, n_entries);
});
});
return;
}

std::cout << "*** n_entries: " << n_entries << " elem_stide: " << elem_stride
<< " thread_stride: " << thrd_stride
<< " entry per thread: " << entry_per_th << std::endl;

if (entry_per_th == PREFIX_ENTRIES) {
q.submit([&](handler &cgh) {
cgh.parallel_for<class Accum_iterative1>(
range<2>{n_entries / entry_per_th, 1} * range<2>{1, 1},
[=](item<2> it) SYCL_ESIMD_KERNEL {
cmk_acum_iterative(buf, it.get_id(0), elem_stride, thrd_stride,
PREFIX_ENTRIES);
});
});
} else {
q.submit([&](handler &cgh) {
cgh.parallel_for<class Accum_iterative2>(
range<2>{n_entries / entry_per_th, 1} * range<2>{1, 1},
[=](item<2> it) SYCL_ESIMD_KERNEL {
cmk_acum_iterative(buf, it.get_id(0), elem_stride, thrd_stride,
PREFIX_ENTRIES_LOW);
});
});
}

// if number of remaining entries <= 4K , each thread accumulates smaller
// number of entries to keep EUs saturated
if (n_entries / entry_per_th > 4096)
hierarchical_prefix(q, buf, thrd_stride, thrd_stride * PREFIX_ENTRIES,
n_entries / entry_per_th, PREFIX_ENTRIES);
else
hierarchical_prefix(q, buf, thrd_stride, thrd_stride * PREFIX_ENTRIES_LOW,
n_entries / entry_per_th, PREFIX_ENTRIES_LOW);

std::cout << "=== n_entries: " << n_entries << " elem_stide: " << elem_stride
<< " thread_stride: " << thrd_stride
<< " entry per thread: " << entry_per_th << std::endl;

if (entry_per_th == PREFIX_ENTRIES) {
q.submit([&](handler &cgh) {
cgh.parallel_for<class Prefix_iterative1>(
range<2>{n_entries / entry_per_th, 1} * range<2>{1, 1},
[=](item<2> it) SYCL_ESIMD_KERNEL {
cmk_prefix_iterative(buf, it.get_id(0), elem_stride, thrd_stride,
if (entry_per_th == PREFIX_ENTRIES) {
q.submit([&](handler &cgh) {
cgh.parallel_for<class Accum_iterative1>(
range<2>{n_entries / entry_per_th, 1} * range<2>{1, 1},
[=](item<2> it) SYCL_ESIMD_KERNEL {
cmk_acum_iterative(buf, it.get_id(0), elem_stride, thrd_stride,
PREFIX_ENTRIES);
});
});
} else {
q.submit([&](handler &cgh) {
cgh.parallel_for<class Prefix_iterative2>(
range<2>{n_entries / entry_per_th, 1} * range<2>{1, 1},
[=](item<2> it) SYCL_ESIMD_KERNEL {
cmk_prefix_iterative(buf, it.get_id(0), elem_stride, thrd_stride,
});
});
} else {
q.submit([&](handler &cgh) {
cgh.parallel_for<class Accum_iterative2>(
range<2>{n_entries / entry_per_th, 1} * range<2>{1, 1},
[=](item<2> it) SYCL_ESIMD_KERNEL {
cmk_acum_iterative(buf, it.get_id(0), elem_stride, thrd_stride,
PREFIX_ENTRIES_LOW);
});
});
});
});
}

// if number of remaining entries <= 4K , each thread accumulates smaller
// number of entries to keep EUs saturated
if (n_entries / entry_per_th > 4096)
hierarchical_prefix(q, buf, thrd_stride, thrd_stride * PREFIX_ENTRIES,
n_entries / entry_per_th, PREFIX_ENTRIES);
else
hierarchical_prefix(q, buf, thrd_stride, thrd_stride * PREFIX_ENTRIES_LOW,
n_entries / entry_per_th, PREFIX_ENTRIES_LOW);

std::cout << "=== n_entries: " << n_entries
<< " elem_stide: " << elem_stride
<< " thread_stride: " << thrd_stride
<< " entry per thread: " << entry_per_th << std::endl;

if (entry_per_th == PREFIX_ENTRIES) {
q.submit([&](handler &cgh) {
cgh.parallel_for<class Prefix_iterative1>(
range<2>{n_entries / entry_per_th, 1} * range<2>{1, 1},
[=](item<2> it) SYCL_ESIMD_KERNEL {
cmk_prefix_iterative(buf, it.get_id(0), elem_stride, thrd_stride,
PREFIX_ENTRIES);
});
});
} else {
q.submit([&](handler &cgh) {
cgh.parallel_for<class Prefix_iterative2>(
range<2>{n_entries / entry_per_th, 1} * range<2>{1, 1},
[=](item<2> it) SYCL_ESIMD_KERNEL {
cmk_prefix_iterative(buf, it.get_id(0), elem_stride, thrd_stride,
PREFIX_ENTRIES_LOW);
});
});
}
} catch (cl::sycl::exception const &e) {
std::cout << "SYCL exception caught: " << e.what() << '\n';
}
}

Expand Down
22 changes: 14 additions & 8 deletions SYCL/ESIMD/Prefix_Local_sum1.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -144,14 +144,20 @@ int main(int argc, char *argv[]) {

// compute local sum for every chunk of PREFIX_ENTRIES
cl::sycl::range<2> GlobalRange{size / PREFIX_ENTRIES, 1};
auto e0 = q.submit([&](handler &cgh) {
cgh.parallel_for<class Sum_tuple>(
GlobalRange * LocalRange, [=](item<2> it) SYCL_ESIMD_KERNEL {
cmk_sum_tuple_count(pInputs, it.get_id(0));
});
});

e0.wait();
try {
auto e0 = q.submit([&](handler &cgh) {
cgh.parallel_for<class Sum_tuple>(
GlobalRange * LocalRange, [=](item<2> it) SYCL_ESIMD_KERNEL {
cmk_sum_tuple_count(pInputs, it.get_id(0));
});
});
e0.wait();
} catch (cl::sycl::exception const &e) {
std::cout << "SYCL exception caught: " << e.what() << '\n';
free(pInputs, ctxt);
free(pExpectOutputs);
return e.get_cl_code();
}

bool pass = memcmp(pInputs, pExpectOutputs,
size * TUPLE_SZ * sizeof(unsigned int)) == 0;
Expand Down
24 changes: 15 additions & 9 deletions SYCL/ESIMD/Prefix_Local_sum2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -147,15 +147,21 @@ int main(int argc, char *argv[]) {
malloc(size * TUPLE_SZ * sizeof(unsigned int)));
compute_local_prefixsum(pInputs, pExpectOutputs, size);

auto e1 = q.submit([&](handler &cgh) {
cgh.parallel_for<class Accum_iterative>(
range<2>{size / PREFIX_ENTRIES, 1} * LocalRange,
[=](item<2> it) SYCL_ESIMD_KERNEL {
cmk_acum_iterative(pInputs, it.get_id(0), 1, PREFIX_ENTRIES);
});
});

e1.wait();
try {
auto e1 = q.submit([&](handler &cgh) {
cgh.parallel_for<class Accum_iterative>(
range<2>{size / PREFIX_ENTRIES, 1} * LocalRange,
[=](item<2> it) SYCL_ESIMD_KERNEL {
cmk_acum_iterative(pInputs, it.get_id(0), 1, PREFIX_ENTRIES);
});
});
e1.wait();
} catch (cl::sycl::exception const &e) {
std::cout << "SYCL exception caught: " << e.what() << '\n';
free(pInputs, ctxt);
free(pExpectOutputs);
return e.get_cl_code();
}

bool pass = memcmp(pInputs, pExpectOutputs,
size * TUPLE_SZ * sizeof(unsigned int)) == 0;
Expand Down
Loading