diff --git a/SYCL/ESIMD/BitonicSortK.cpp b/SYCL/ESIMD/BitonicSortK.cpp
index e211b1a097..8113271680 100644
--- a/SYCL/ESIMD/BitonicSortK.cpp
+++ b/SYCL/ESIMD/BitonicSortK.cpp
@@ -601,7 +601,7 @@ int BitonicSort::Solve(uint32_t *pInputs, uint32_t *pOutputs, uint32_t size) {
   cl::sycl::range<1> SortLocalRange{1};
 
   double total_time = 0;
-  {
+  try {
     buffer<uint32_t, 1> bufi(pInputs, range<1>(size));
     buffer<uint32_t, 1> bufo(pOutputs, range<1>(size));
     // enqueue sort265 kernel
@@ -616,6 +616,9 @@ int BitonicSort::Solve(uint32_t *pInputs, uint32_t *pOutputs, uint32_t size) {
     });
     e.wait();
     total_time += report_time("kernel time", e, e);
+  } catch (cl::sycl::exception const &e) {
+    std::cout << "SYCL exception caught: " << e.what() << '\n';
+    return e.get_cl_code();
   }
 
   // Each HW thread swap two 256-element chunks. Hence, we only need
@@ -631,28 +634,34 @@ int BitonicSort::Solve(uint32_t *pInputs, uint32_t *pOutputs, uint32_t size) {
   // this loop is for stage 8 to stage LOG2_ELEMENTS.
   event mergeEvent[(LOG2_ELEMENTS - 8) * (LOG2_ELEMENTS - 7) / 2];
   int k = 0;
-  for (int i = 8; i < LOG2_ELEMENTS; i++) {
-    // each step halves the stride distance of its prior step.
-    // 1<<j is the stride distance that the invoked step will handle.
-    // The recursive steps continue until stride distance 1 is complete.
-    // For stride distance less than 1<<8, no global synchronization
-    // is needed, i.e., all work can be done locally within HW threads.
-    // Hence, the invocation of j==8 cmk_bitonic_merge finishes stride 256
-    // compare-and-swap and then performs stride 128, 64, 32, 16, 8, 4, 2, 1
-    // locally.
-    for (int j = i; j >= 8; j--) {
-      buffer<uint32_t, 1> buf(pOutputs, range<1>(size));
-      mergeEvent[k] = pQueue_->submit([&](handler &cgh) {
-        auto acc = buf.get_access<access::mode::read_write>(cgh);
-        cgh.parallel_for<class Merge>(MergeGlobalRange * MergeLocalRange,
-                                      [=](id<1> tid) SYCL_ESIMD_KERNEL {
-                                        using namespace sycl::INTEL::gpu;
-                                        cmk_bitonic_merge(acc, j, i, tid);
-                                      });
-      });
-      k++;
+  try {
+    for (int i = 8; i < LOG2_ELEMENTS; i++) {
+      // each step halves the stride distance of its prior step.
+      // 1<<j is the stride distance that the invoked step will handle.
+      // The recursive steps continue until stride distance 1 is complete.
+      // For stride distance less than 1<<8, no global synchronization
+      // is needed, i.e., all work can be done locally within HW threads.
+      // Hence, the invocation of j==8 cmk_bitonic_merge finishes stride 256
+      // compare-and-swap and then performs stride 128, 64, 32, 16, 8, 4, 2, 1
+      // locally.
+      for (int j = i; j >= 8; j--) {
+        buffer<uint32_t, 1> buf(pOutputs, range<1>(size));
+        mergeEvent[k] = pQueue_->submit([&](handler &cgh) {
+          auto acc = buf.get_access<access::mode::read_write>(cgh);
+          cgh.parallel_for<class Merge>(MergeGlobalRange * MergeLocalRange,
+                                        [=](id<1> tid) SYCL_ESIMD_KERNEL {
+                                          using namespace sycl::INTEL::gpu;
+                                          cmk_bitonic_merge(acc, j, i, tid);
+                                        });
+        });
+        k++;
+      }
     }
+  } catch (cl::sycl::exception const &e) {
+    std::cout << "SYCL exception caught: " << e.what() << '\n';
+    return e.get_cl_code();
   }
+
   mergeEvent[k - 1].wait();
   total_time += report_time("kernel time", mergeEvent[0], mergeEvent[k - 1]);
 
diff --git a/SYCL/ESIMD/BitonicSortKv2.cpp b/SYCL/ESIMD/BitonicSortKv2.cpp
index 32962069eb..b95c16fbaf 100644
--- a/SYCL/ESIMD/BitonicSortKv2.cpp
+++ b/SYCL/ESIMD/BitonicSortKv2.cpp
@@ -523,15 +523,20 @@ int BitonicSort::Solve(uint32_t *pInputs, uint32_t *pOutputs, uint32_t size) {
 
   // enqueue sort265 kernel
   double total_time = 0;
-  auto e = pQueue_->submit([&](handler &cgh) {
-    cgh.parallel_for<class Sort256>(
-        SortGlobalRange * SortLocalRange, [=](id<1> i) SYCL_ESIMD_KERNEL {
-          using namespace sycl::INTEL::gpu;
-          cmk_bitonic_sort_256(pInputs, pOutputs, i);
-        });
-  });
-  e.wait();
-  total_time += report_time("kernel time", e, e);
+  try {
+    auto e = pQueue_->submit([&](handler &cgh) {
+      cgh.parallel_for<class Sort256>(
+          SortGlobalRange * SortLocalRange, [=](id<1> i) SYCL_ESIMD_KERNEL {
+            using namespace sycl::INTEL::gpu;
+            cmk_bitonic_sort_256(pInputs, pOutputs, i);
+          });
+    });
+    e.wait();
+    total_time += report_time("kernel time", e, e);
+  } catch (cl::sycl::exception const &e) {
+    std::cout << "SYCL exception caught: " << e.what() << '\n';
+    return e.get_cl_code();
+  }
 
   // Each HW thread swap two 256-element chunks. Hence, we only need
   // to launch size/ (base_sort_size*2) HW threads
@@ -546,27 +551,34 @@ int BitonicSort::Solve(uint32_t *pInputs, uint32_t *pOutputs, uint32_t size) {
   // this loop is for stage 8 to stage LOG2_ELEMENTS.
   event mergeEvent[(LOG2_ELEMENTS - 8) * (LOG2_ELEMENTS - 7) / 2];
   int k = 0;
-  for (int i = 8; i < LOG2_ELEMENTS; i++) {
-    // each step halves the stride distance of its prior step.
-    // 1<<j is the stride distance that the invoked step will handle.
-    // The recursive steps continue until stride distance 1 is complete.
-    // For stride distance less than 1<<8, no global synchronization
-    // is needed, i.e., all work can be done locally within HW threads.
-    // Hence, the invocation of j==8 cmk_bitonic_merge finishes stride 256
-    // compare-and-swap and then performs stride 128, 64, 32, 16, 8, 4, 2, 1
-    // locally.
-    for (int j = i; j >= 8; j--) {
-      mergeEvent[k] = pQueue_->submit([&](handler &cgh) {
-        cgh.parallel_for<class Merge>(MergeGlobalRange * MergeLocalRange,
-                                      [=](id<1> tid) SYCL_ESIMD_KERNEL {
-                                        using namespace sycl::INTEL::gpu;
-                                        cmk_bitonic_merge(pOutputs, j, i, tid);
-                                      });
-      });
-      // mergeEvent[k].wait();
-      k++;
+  try {
+    for (int i = 8; i < LOG2_ELEMENTS; i++) {
+      // each step halves the stride distance of its prior step.
+      // 1<<j is the stride distance that the invoked step will handle.
+      // The recursive steps continue until stride distance 1 is complete.
+      // For stride distance less than 1<<8, no global synchronization
+      // is needed, i.e., all work can be done locally within HW threads.
+      // Hence, the invocation of j==8 cmk_bitonic_merge finishes stride 256
+      // compare-and-swap and then performs stride 128, 64, 32, 16, 8, 4, 2, 1
+      // locally.
+      for (int j = i; j >= 8; j--) {
+        mergeEvent[k] = pQueue_->submit([&](handler &cgh) {
+          cgh.parallel_for<class Merge>(MergeGlobalRange * MergeLocalRange,
+                                        [=](id<1> tid) SYCL_ESIMD_KERNEL {
+                                          using namespace sycl::INTEL::gpu;
+                                          cmk_bitonic_merge(pOutputs, j, i,
+                                                            tid);
+                                        });
+        });
+        // mergeEvent[k].wait();
+        k++;
+      }
     }
+  } catch (cl::sycl::exception const &e) {
+    std::cout << "SYCL exception caught: " << e.what() << '\n';
+    return e.get_cl_code();
   }
+
   mergeEvent[k - 1].wait();
   total_time += report_time("kernel time", mergeEvent[0], mergeEvent[k - 1]);
 
diff --git a/SYCL/ESIMD/PrefixSum.cpp b/SYCL/ESIMD/PrefixSum.cpp
index eeb2a7ec61..78cb3a4679 100644
--- a/SYCL/ESIMD/PrefixSum.cpp
+++ b/SYCL/ESIMD/PrefixSum.cpp
@@ -298,76 +298,82 @@ void cmk_prefix_iterative(unsigned *buf, unsigned h_pos,
 void hierarchical_prefix(queue &q, unsigned *buf, unsigned elem_stride,
                          unsigned thrd_stride, unsigned n_entries,
                          unsigned entry_per_th) {
-  if (n_entries <= REMAINING_ENTRIES) {
-    std::cout << "... n_entries: " << n_entries
+  try {
+    if (n_entries <= REMAINING_ENTRIES) {
+      std::cout << "... n_entries: " << n_entries
+                << " elem_stide: " << elem_stride
+                << " thread_stride: " << thrd_stride
+                << " entry per thread: " << entry_per_th << std::endl;
+      // one single thread
+      q.submit([&](handler &cgh) {
+        cgh.parallel_for<class Accum_final>(
+            range<2>{1, 1} * range<2>{1, 1}, [=](item<2> it) SYCL_ESIMD_KERNEL {
+              cmk_acum_final(buf, it.get_id(0), elem_stride, n_entries);
+            });
+      });
+      return;
+    }
+
+    std::cout << "*** n_entries: " << n_entries
               << " elem_stide: " << elem_stride
               << " thread_stride: " << thrd_stride
               << " entry per thread: " << entry_per_th << std::endl;
-    // one single thread
-    q.submit([&](handler &cgh) {
-      cgh.parallel_for<class Accum_final>(
-          range<2>{1, 1} * range<2>{1, 1}, [=](item<2> it) SYCL_ESIMD_KERNEL {
-            cmk_acum_final(buf, it.get_id(0), elem_stride, n_entries);
-          });
-    });
-    return;
-  }
 
-  std::cout << "*** n_entries: " << n_entries << " elem_stide: " << elem_stride
-            << " thread_stride: " << thrd_stride
-            << " entry per thread: " << entry_per_th << std::endl;
-
-  if (entry_per_th == PREFIX_ENTRIES) {
-    q.submit([&](handler &cgh) {
-      cgh.parallel_for<class Accum_iterative1>(
-          range<2>{n_entries / entry_per_th, 1} * range<2>{1, 1},
-          [=](item<2> it) SYCL_ESIMD_KERNEL {
-            cmk_acum_iterative(buf, it.get_id(0), elem_stride, thrd_stride,
-                               PREFIX_ENTRIES);
-          });
-    });
-  } else {
-    q.submit([&](handler &cgh) {
-      cgh.parallel_for<class Accum_iterative2>(
-          range<2>{n_entries / entry_per_th, 1} * range<2>{1, 1},
-          [=](item<2> it) SYCL_ESIMD_KERNEL {
-            cmk_acum_iterative(buf, it.get_id(0), elem_stride, thrd_stride,
-                               PREFIX_ENTRIES_LOW);
-          });
-    });
-  }
-
-  // if number of remaining entries <= 4K , each thread  accumulates smaller
-  // number of entries to keep EUs saturated
-  if (n_entries / entry_per_th > 4096)
-    hierarchical_prefix(q, buf, thrd_stride, thrd_stride * PREFIX_ENTRIES,
-                        n_entries / entry_per_th, PREFIX_ENTRIES);
-  else
-    hierarchical_prefix(q, buf, thrd_stride, thrd_stride * PREFIX_ENTRIES_LOW,
-                        n_entries / entry_per_th, PREFIX_ENTRIES_LOW);
-
-  std::cout << "=== n_entries: " << n_entries << " elem_stide: " << elem_stride
-            << " thread_stride: " << thrd_stride
-            << " entry per thread: " << entry_per_th << std::endl;
-
-  if (entry_per_th == PREFIX_ENTRIES) {
-    q.submit([&](handler &cgh) {
-      cgh.parallel_for<class Prefix_iterative1>(
-          range<2>{n_entries / entry_per_th, 1} * range<2>{1, 1},
-          [=](item<2> it) SYCL_ESIMD_KERNEL {
-            cmk_prefix_iterative(buf, it.get_id(0), elem_stride, thrd_stride,
+    if (entry_per_th == PREFIX_ENTRIES) {
+      q.submit([&](handler &cgh) {
+        cgh.parallel_for<class Accum_iterative1>(
+            range<2>{n_entries / entry_per_th, 1} * range<2>{1, 1},
+            [=](item<2> it) SYCL_ESIMD_KERNEL {
+              cmk_acum_iterative(buf, it.get_id(0), elem_stride, thrd_stride,
                                  PREFIX_ENTRIES);
-          });
-    });
-  } else {
-    q.submit([&](handler &cgh) {
-      cgh.parallel_for<class Prefix_iterative2>(
-          range<2>{n_entries / entry_per_th, 1} * range<2>{1, 1},
-          [=](item<2> it) SYCL_ESIMD_KERNEL {
-            cmk_prefix_iterative(buf, it.get_id(0), elem_stride, thrd_stride,
+            });
+      });
+    } else {
+      q.submit([&](handler &cgh) {
+        cgh.parallel_for<class Accum_iterative2>(
+            range<2>{n_entries / entry_per_th, 1} * range<2>{1, 1},
+            [=](item<2> it) SYCL_ESIMD_KERNEL {
+              cmk_acum_iterative(buf, it.get_id(0), elem_stride, thrd_stride,
                                  PREFIX_ENTRIES_LOW);
-          });
-    });
+            });
+      });
+    }
+
+    // if number of remaining entries <= 4K , each thread  accumulates smaller
+    // number of entries to keep EUs saturated
+    if (n_entries / entry_per_th > 4096)
+      hierarchical_prefix(q, buf, thrd_stride, thrd_stride * PREFIX_ENTRIES,
+                          n_entries / entry_per_th, PREFIX_ENTRIES);
+    else
+      hierarchical_prefix(q, buf, thrd_stride, thrd_stride * PREFIX_ENTRIES_LOW,
+                          n_entries / entry_per_th, PREFIX_ENTRIES_LOW);
+
+    std::cout << "=== n_entries: " << n_entries
+              << " elem_stide: " << elem_stride
+              << " thread_stride: " << thrd_stride
+              << " entry per thread: " << entry_per_th << std::endl;
+
+    if (entry_per_th == PREFIX_ENTRIES) {
+      q.submit([&](handler &cgh) {
+        cgh.parallel_for<class Prefix_iterative1>(
+            range<2>{n_entries / entry_per_th, 1} * range<2>{1, 1},
+            [=](item<2> it) SYCL_ESIMD_KERNEL {
+              cmk_prefix_iterative(buf, it.get_id(0), elem_stride, thrd_stride,
+                                   PREFIX_ENTRIES);
+            });
+      });
+    } else {
+      q.submit([&](handler &cgh) {
+        cgh.parallel_for<class Prefix_iterative2>(
+            range<2>{n_entries / entry_per_th, 1} * range<2>{1, 1},
+            [=](item<2> it) SYCL_ESIMD_KERNEL {
+              cmk_prefix_iterative(buf, it.get_id(0), elem_stride, thrd_stride,
+                                   PREFIX_ENTRIES_LOW);
+            });
+      });
+    }
+  } catch (cl::sycl::exception const &e) {
+    std::cout << "SYCL exception caught: " << e.what() << '\n';
   }
 }
 
diff --git a/SYCL/ESIMD/Prefix_Local_sum1.cpp b/SYCL/ESIMD/Prefix_Local_sum1.cpp
index a26b37f6a5..f46bb849a7 100644
--- a/SYCL/ESIMD/Prefix_Local_sum1.cpp
+++ b/SYCL/ESIMD/Prefix_Local_sum1.cpp
@@ -144,14 +144,20 @@ int main(int argc, char *argv[]) {
 
   // compute local sum for every chunk of PREFIX_ENTRIES
   cl::sycl::range<2> GlobalRange{size / PREFIX_ENTRIES, 1};
-  auto e0 = q.submit([&](handler &cgh) {
-    cgh.parallel_for<class Sum_tuple>(
-        GlobalRange * LocalRange, [=](item<2> it) SYCL_ESIMD_KERNEL {
-          cmk_sum_tuple_count(pInputs, it.get_id(0));
-        });
-  });
-
-  e0.wait();
+  try {
+    auto e0 = q.submit([&](handler &cgh) {
+      cgh.parallel_for<class Sum_tuple>(
+          GlobalRange * LocalRange, [=](item<2> it) SYCL_ESIMD_KERNEL {
+            cmk_sum_tuple_count(pInputs, it.get_id(0));
+          });
+    });
+    e0.wait();
+  } catch (cl::sycl::exception const &e) {
+    std::cout << "SYCL exception caught: " << e.what() << '\n';
+    free(pInputs, ctxt);
+    free(pExpectOutputs);
+    return e.get_cl_code();
+  }
 
   bool pass = memcmp(pInputs, pExpectOutputs,
                      size * TUPLE_SZ * sizeof(unsigned int)) == 0;
diff --git a/SYCL/ESIMD/Prefix_Local_sum2.cpp b/SYCL/ESIMD/Prefix_Local_sum2.cpp
index 96a68fb059..58b9a7f7be 100644
--- a/SYCL/ESIMD/Prefix_Local_sum2.cpp
+++ b/SYCL/ESIMD/Prefix_Local_sum2.cpp
@@ -147,15 +147,21 @@ int main(int argc, char *argv[]) {
       malloc(size * TUPLE_SZ * sizeof(unsigned int)));
   compute_local_prefixsum(pInputs, pExpectOutputs, size);
 
-  auto e1 = q.submit([&](handler &cgh) {
-    cgh.parallel_for<class Accum_iterative>(
-        range<2>{size / PREFIX_ENTRIES, 1} * LocalRange,
-        [=](item<2> it) SYCL_ESIMD_KERNEL {
-          cmk_acum_iterative(pInputs, it.get_id(0), 1, PREFIX_ENTRIES);
-        });
-  });
-
-  e1.wait();
+  try {
+    auto e1 = q.submit([&](handler &cgh) {
+      cgh.parallel_for<class Accum_iterative>(
+          range<2>{size / PREFIX_ENTRIES, 1} * LocalRange,
+          [=](item<2> it) SYCL_ESIMD_KERNEL {
+            cmk_acum_iterative(pInputs, it.get_id(0), 1, PREFIX_ENTRIES);
+          });
+    });
+    e1.wait();
+  } catch (cl::sycl::exception const &e) {
+    std::cout << "SYCL exception caught: " << e.what() << '\n';
+    free(pInputs, ctxt);
+    free(pExpectOutputs);
+    return e.get_cl_code();
+  }
 
   bool pass = memcmp(pInputs, pExpectOutputs,
                      size * TUPLE_SZ * sizeof(unsigned int)) == 0;
diff --git a/SYCL/ESIMD/Prefix_Local_sum3.cpp b/SYCL/ESIMD/Prefix_Local_sum3.cpp
index b459463c0d..6a5004857a 100644
--- a/SYCL/ESIMD/Prefix_Local_sum3.cpp
+++ b/SYCL/ESIMD/Prefix_Local_sum3.cpp
@@ -236,44 +236,50 @@ void cmk_acum_final(unsigned *buf, unsigned h_pos, unsigned int stride_elems,
 void hierarchical_prefix(queue &q, unsigned *buf, unsigned elem_stride,
                          unsigned thrd_stride, unsigned n_entries,
                          unsigned entry_per_th) {
-  if (n_entries <= REMAINING_ENTRIES) {
-    std::cout << "... n_entries: " << n_entries
+  try {
+    if (n_entries <= REMAINING_ENTRIES) {
+      std::cout << "... n_entries: " << n_entries
+                << " elem_stide: " << elem_stride
+                << " thread_stride: " << thrd_stride
+                << " entry per thread: " << entry_per_th << std::endl;
+      // one single thread
+      q.submit([&](handler &cgh) {
+        cgh.parallel_for<class Accum_final>(
+            range<2>{1, 1} * range<2>{1, 1}, [=](item<2> it) SYCL_ESIMD_KERNEL {
+              cmk_acum_final(buf, it.get_id(0), elem_stride, n_entries);
+            });
+      });
+      q.wait();
+      return;
+    }
+
+    std::cout << "*** n_entries: " << n_entries
               << " elem_stide: " << elem_stride
               << " thread_stride: " << thrd_stride
               << " entry per thread: " << entry_per_th << std::endl;
-    // one single thread
-    q.submit([&](handler &cgh) {
-      cgh.parallel_for<class Accum_final>(
-          range<2>{1, 1} * range<2>{1, 1}, [=](item<2> it) SYCL_ESIMD_KERNEL {
-            cmk_acum_final(buf, it.get_id(0), elem_stride, n_entries);
-          });
-    });
-    q.wait();
-    return;
-  }
 
-  std::cout << "*** n_entries: " << n_entries << " elem_stide: " << elem_stride
-            << " thread_stride: " << thrd_stride
-            << " entry per thread: " << entry_per_th << std::endl;
-
-  if (entry_per_th == PREFIX_ENTRIES) {
-    q.submit([&](handler &cgh) {
-      cgh.parallel_for<class Accum_iterative1>(
-          range<2>{n_entries / entry_per_th, 1} * range<2>{1, 1},
-          [=](item<2> it) SYCL_ESIMD_KERNEL {
-            cmk_acum_iterative(buf, it.get_id(0), elem_stride, thrd_stride);
-          });
-    });
-    q.wait();
-  } else {
-    q.submit([&](handler &cgh) {
-      cgh.parallel_for<class Accum_iterative2>(
-          range<2>{n_entries / entry_per_th, 1} * range<2>{1, 1},
-          [=](item<2> it) SYCL_ESIMD_KERNEL {
-            cmk_acum_iterative_low(buf, it.get_id(0), elem_stride, thrd_stride);
-          });
-    });
-    q.wait();
+    if (entry_per_th == PREFIX_ENTRIES) {
+      q.submit([&](handler &cgh) {
+        cgh.parallel_for<class Accum_iterative1>(
+            range<2>{n_entries / entry_per_th, 1} * range<2>{1, 1},
+            [=](item<2> it) SYCL_ESIMD_KERNEL {
+              cmk_acum_iterative(buf, it.get_id(0), elem_stride, thrd_stride);
+            });
+      });
+      q.wait();
+    } else {
+      q.submit([&](handler &cgh) {
+        cgh.parallel_for<class Accum_iterative2>(
+            range<2>{n_entries / entry_per_th, 1} * range<2>{1, 1},
+            [=](item<2> it) SYCL_ESIMD_KERNEL {
+              cmk_acum_iterative_low(buf, it.get_id(0), elem_stride,
+                                     thrd_stride);
+            });
+      });
+      q.wait();
+    }
+  } catch (cl::sycl::exception const &e) {
+    std::cout << "SYCL exception caught: " << e.what() << '\n';
   }
 
   // if number of remaining entries <= 4K , each thread  accumulates smaller
diff --git a/SYCL/ESIMD/Stencil.cpp b/SYCL/ESIMD/Stencil.cpp
index 1a7f8fd155..ab18df1df0 100644
--- a/SYCL/ESIMD/Stencil.cpp
+++ b/SYCL/ESIMD/Stencil.cpp
@@ -104,75 +104,82 @@ int main(void) {
   InitializeSquareMatrix(inputMatrix, DIM_SIZE, false);
   InitializeSquareMatrix(outputMatrix, DIM_SIZE, true);
 
-  auto e = q.submit([&](handler &cgh) {
-    cgh.parallel_for<class Stencil_kernel>(
-        GlobalRange * LocalRange, [=](item<2> it) SYCL_ESIMD_KERNEL {
-          using namespace sycl::INTEL::gpu;
-          uint h_pos = it.get_id(0);
-          uint v_pos = it.get_id(1);
-
-          simd<float, (HEIGHT + 10) * 32> vin;
-          // matrix HEIGHT+10 x 32
-          auto in = vin.format<float, HEIGHT + 10, 32>();
-
-          //
-          // rather than loading all data in
-          // the code will interleave data loading and compute
-          // first, we load enough data for the first 16 pixels
-          //
-          unsigned off = (v_pos * HEIGHT) * DIM_SIZE + h_pos * WIDTH;
+  try {
+    auto e = q.submit([&](handler &cgh) {
+      cgh.parallel_for<class Stencil_kernel>(
+          GlobalRange * LocalRange, [=](item<2> it) SYCL_ESIMD_KERNEL {
+            using namespace sycl::INTEL::gpu;
+            uint h_pos = it.get_id(0);
+            uint v_pos = it.get_id(1);
+
+            simd<float, (HEIGHT + 10) * 32> vin;
+            // matrix HEIGHT+10 x 32
+            auto in = vin.format<float, HEIGHT + 10, 32>();
+
+            //
+            // rather than loading all data in
+            // the code will interleave data loading and compute
+            // first, we load enough data for the first 16 pixels
+            //
+            unsigned off = (v_pos * HEIGHT) * DIM_SIZE + h_pos * WIDTH;
 #pragma unroll
-          for (unsigned i = 0; i < 10; i++) {
-            in.row(i) = block_load<float, 32>(inputMatrix + off);
-            off += DIM_SIZE;
-          }
+            for (unsigned i = 0; i < 10; i++) {
+              in.row(i) = block_load<float, 32>(inputMatrix + off);
+              off += DIM_SIZE;
+            }
 
-          unsigned out_off =
-              (((v_pos * HEIGHT + 5) * DIM_SIZE + (h_pos * WIDTH) + 5)) *
-              sizeof(float);
-          simd<unsigned, WIDTH> elm16(0, 1);
+            unsigned out_off =
+                (((v_pos * HEIGHT + 5) * DIM_SIZE + (h_pos * WIDTH) + 5)) *
+                sizeof(float);
+            simd<unsigned, WIDTH> elm16(0, 1);
 
 #pragma unroll
-          for (unsigned i = 0; i < HEIGHT; i++) {
-
-            in.row(10 + i) = block_load<float, 32>(inputMatrix + off);
-            off += DIM_SIZE;
-
-            simd<float, WIDTH> sum =
-                in.row(i + 0).select<WIDTH, 1>(5) * -0.02f +
-                in.row(i + 1).select<WIDTH, 1>(5) * -0.025f +
-                in.row(i + 2).select<WIDTH, 1>(5) * -0.0333333333333f +
-                in.row(i + 3).select<WIDTH, 1>(5) * -0.05f +
-                in.row(i + 4).select<WIDTH, 1>(5) * -0.1f +
-                in.row(i + 6).select<WIDTH, 1>(5) * 0.1f +
-                in.row(i + 7).select<WIDTH, 1>(5) * 0.05f +
-                in.row(i + 8).select<WIDTH, 1>(5) * 0.0333333333333f +
-                in.row(i + 9).select<WIDTH, 1>(5) * 0.025f +
-                in.row(i + 10).select<WIDTH, 1>(5) * 0.02f +
-                in.row(i + 5).select<WIDTH, 1>(0) * -0.02f +
-                in.row(i + 5).select<WIDTH, 1>(1) * -0.025f +
-                in.row(i + 5).select<WIDTH, 1>(2) * -0.0333333333333f +
-                in.row(i + 5).select<WIDTH, 1>(3) * -0.05f +
-                in.row(i + 5).select<WIDTH, 1>(4) * -0.1f +
-                in.row(i + 5).select<WIDTH, 1>(6) * 0.1f +
-                in.row(i + 5).select<WIDTH, 1>(7) * 0.05f +
-                in.row(i + 5).select<WIDTH, 1>(8) * 0.0333333333333f +
-                in.row(i + 5).select<WIDTH, 1>(9) * 0.025f +
-                in.row(i + 5).select<WIDTH, 1>(10) * 0.02f;
-
-            // predciate output
-            simd<ushort, WIDTH> p = (elm16 + h_pos * WIDTH) < DIM_SIZE - 10;
-
-            simd<unsigned, WIDTH> elm16_off = elm16 * sizeof(float) + out_off;
-            scatter<float, WIDTH>(outputMatrix, sum, elm16_off, p);
-            out_off += DIM_SIZE * sizeof(float);
-
-            if (v_pos * HEIGHT + 10 + i >= DIM_SIZE - 1)
-              break;
-          }
-        });
-  });
-  e.wait();
+            for (unsigned i = 0; i < HEIGHT; i++) {
+
+              in.row(10 + i) = block_load<float, 32>(inputMatrix + off);
+              off += DIM_SIZE;
+
+              simd<float, WIDTH> sum =
+                  in.row(i + 0).select<WIDTH, 1>(5) * -0.02f +
+                  in.row(i + 1).select<WIDTH, 1>(5) * -0.025f +
+                  in.row(i + 2).select<WIDTH, 1>(5) * -0.0333333333333f +
+                  in.row(i + 3).select<WIDTH, 1>(5) * -0.05f +
+                  in.row(i + 4).select<WIDTH, 1>(5) * -0.1f +
+                  in.row(i + 6).select<WIDTH, 1>(5) * 0.1f +
+                  in.row(i + 7).select<WIDTH, 1>(5) * 0.05f +
+                  in.row(i + 8).select<WIDTH, 1>(5) * 0.0333333333333f +
+                  in.row(i + 9).select<WIDTH, 1>(5) * 0.025f +
+                  in.row(i + 10).select<WIDTH, 1>(5) * 0.02f +
+                  in.row(i + 5).select<WIDTH, 1>(0) * -0.02f +
+                  in.row(i + 5).select<WIDTH, 1>(1) * -0.025f +
+                  in.row(i + 5).select<WIDTH, 1>(2) * -0.0333333333333f +
+                  in.row(i + 5).select<WIDTH, 1>(3) * -0.05f +
+                  in.row(i + 5).select<WIDTH, 1>(4) * -0.1f +
+                  in.row(i + 5).select<WIDTH, 1>(6) * 0.1f +
+                  in.row(i + 5).select<WIDTH, 1>(7) * 0.05f +
+                  in.row(i + 5).select<WIDTH, 1>(8) * 0.0333333333333f +
+                  in.row(i + 5).select<WIDTH, 1>(9) * 0.025f +
+                  in.row(i + 5).select<WIDTH, 1>(10) * 0.02f;
+
+              // predciate output
+              simd<ushort, WIDTH> p = (elm16 + h_pos * WIDTH) < DIM_SIZE - 10;
+
+              simd<unsigned, WIDTH> elm16_off = elm16 * sizeof(float) + out_off;
+              scatter<float, WIDTH>(outputMatrix, sum, elm16_off, p);
+              out_off += DIM_SIZE * sizeof(float);
+
+              if (v_pos * HEIGHT + 10 + i >= DIM_SIZE - 1)
+                break;
+            }
+          });
+    });
+    e.wait();
+  } catch (cl::sycl::exception const &e) {
+    std::cout << "SYCL exception caught: " << e.what() << '\n';
+    free(inputMatrix, ctxt);
+    free(outputMatrix, ctxt);
+    return e.get_cl_code();
+  }
 
   // check result
   bool passed = CheckResults(outputMatrix, inputMatrix);
diff --git a/SYCL/ESIMD/accessor_gather_scatter.cpp b/SYCL/ESIMD/accessor_gather_scatter.cpp
index 178515ade4..fe31e9b48c 100644
--- a/SYCL/ESIMD/accessor_gather_scatter.cpp
+++ b/SYCL/ESIMD/accessor_gather_scatter.cpp
@@ -58,7 +58,7 @@ template <typename T, unsigned VL, unsigned STRIDE> bool test(queue q) {
     A[i] = (T)i;
   }
 
-  {
+  try {
     buffer<T, 1> buf(A, range<1>(size));
     range<1> glob_range{size / VL};
 
@@ -67,6 +67,10 @@ template <typename T, unsigned VL, unsigned STRIDE> bool test(queue q) {
       Kernel<T, VL, STRIDE> kernel(acc);
       cgh.parallel_for(glob_range, kernel);
     });
+  } catch (cl::sycl::exception const &e) {
+    std::cout << "SYCL exception caught: " << e.what() << '\n';
+    delete[] A;
+    return e.get_cl_code();
   }
 
   int err_cnt = 0;
diff --git a/SYCL/ESIMD/accessor_load_store.cpp b/SYCL/ESIMD/accessor_load_store.cpp
index 61b87fff08..0709f0044b 100644
--- a/SYCL/ESIMD/accessor_load_store.cpp
+++ b/SYCL/ESIMD/accessor_load_store.cpp
@@ -53,7 +53,7 @@ template <typename T> bool test(queue q, size_t size) {
     A[i] = (T)i;
   }
 
-  {
+  try {
     buffer<T, 1> buf(A, range<1>(size));
     range<1> glob_range{size};
 
@@ -62,6 +62,10 @@ template <typename T> bool test(queue q, size_t size) {
       Kernel<T> kernel(acc);
       cgh.parallel_for(glob_range, kernel);
     });
+  } catch (cl::sycl::exception const &e) {
+    std::cout << "SYCL exception caught: " << e.what() << '\n';
+    delete[] A;
+    return e.get_cl_code();
   }
 
   int err_cnt = 0;
diff --git a/SYCL/ESIMD/histogram.cpp b/SYCL/ESIMD/histogram.cpp
index 6709950a3d..892ea5b0f2 100644
--- a/SYCL/ESIMD/histogram.cpp
+++ b/SYCL/ESIMD/histogram.cpp
@@ -135,7 +135,7 @@ int main(int argc, char *argv[]) {
                          image_channel_type::unsigned_int32,
                          range<2>{width / sizeof(uint4), height});
 
-  {
+  try {
     // create ranges
     // We need that many workitems
     auto GlobalRange = range<1>(range_width * range_height);
@@ -207,7 +207,11 @@ int main(int argc, char *argv[]) {
 
     // SYCL will enqueue and run the kernel. Recall that the buffer's data is
     // given back to the host at the end of scope.
-  } // make sure data is given back to the host at the end of this scope
+    // make sure data is given back to the host at the end of this scope
+  } catch (cl::sycl::exception const &e) {
+    std::cout << "SYCL exception caught: " << e.what() << '\n';
+    return e.get_cl_code();
+  }
 
   writeHist(bins);
   writeHist(cpuHistogram);
diff --git a/SYCL/ESIMD/histogram_256_slm.cpp b/SYCL/ESIMD/histogram_256_slm.cpp
index 040834911c..564c8c7e6f 100644
--- a/SYCL/ESIMD/histogram_256_slm.cpp
+++ b/SYCL/ESIMD/histogram_256_slm.cpp
@@ -157,7 +157,7 @@ int main() {
   auto LocalRange = cl::sycl::range<1>(NUM_BINS / 16);
   cl::sycl::nd_range<1> Range(GlobalRange, LocalRange);
 
-  {
+  try {
     auto e = q.submit([&](cl::sycl::handler &cgh) {
       cgh.parallel_for<class histogram_slm>(
           Range, [=](cl::sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL {
@@ -166,6 +166,9 @@ int main() {
           });
     });
     e.wait();
+  } catch (cl::sycl::exception const &e) {
+    std::cout << "SYCL exception caught: " << e.what() << '\n';
+    return e.get_cl_code();
   }
 
   std::cout << "finish GPU histogram\n";
diff --git a/SYCL/ESIMD/histogram_256_slm_spec.cpp b/SYCL/ESIMD/histogram_256_slm_spec.cpp
index cfd919b9c6..24c9dffe59 100644
--- a/SYCL/ESIMD/histogram_256_slm_spec.cpp
+++ b/SYCL/ESIMD/histogram_256_slm_spec.cpp
@@ -171,7 +171,7 @@ int main(int argc, char **argv) {
   auto LocalRange = cl::sycl::range<1>(NUM_BINS / 16);
   cl::sycl::nd_range<1> Range(GlobalRange, LocalRange);
 
-  {
+  try {
     auto e = q.submit([&](cl::sycl::handler &cgh) {
       cgh.parallel_for<histogram_slm>(
           prg.get_kernel<histogram_slm>(), Range,
@@ -181,6 +181,9 @@ int main(int argc, char **argv) {
           });
     });
     e.wait();
+  } catch (cl::sycl::exception const &e) {
+    std::cout << "SYCL exception caught: " << e.what() << '\n';
+    return e.get_cl_code();
   }
 
   std::cout << "finish GPU histogram\n";
diff --git a/SYCL/ESIMD/histogram_2d.cpp b/SYCL/ESIMD/histogram_2d.cpp
index b1bfdda7cd..23cb598450 100644
--- a/SYCL/ESIMD/histogram_2d.cpp
+++ b/SYCL/ESIMD/histogram_2d.cpp
@@ -135,7 +135,7 @@ int main(int argc, char *argv[]) {
                          image_channel_type::unsigned_int32,
                          range<2>{width / sizeof(uint4), height});
 
-  {
+  try {
     // create ranges
     // We need that many workitems
     auto GlobalRange = range<2>(range_width, range_height);
@@ -206,7 +206,11 @@ int main(int argc, char *argv[]) {
 
     // SYCL will enqueue and run the kernel. Recall that the buffer's data is
     // given back to the host at the end of scope.
-  } // make sure data is given back to the host at the end of this scope
+    // make sure data is given back to the host at the end of this scope
+  } catch (cl::sycl::exception const &e) {
+    std::cout << "SYCL exception caught: " << e.what() << '\n';
+    return e.get_cl_code();
+  }
 
   writeHist(bins);
   writeHist(cpuHistogram);
diff --git a/SYCL/ESIMD/histogram_raw_send.cpp b/SYCL/ESIMD/histogram_raw_send.cpp
index 64435c296d..b86538477e 100644
--- a/SYCL/ESIMD/histogram_raw_send.cpp
+++ b/SYCL/ESIMD/histogram_raw_send.cpp
@@ -159,7 +159,7 @@ int main(int argc, char *argv[]) {
                          image_channel_type::unsigned_int32,
                          range<2>{width / sizeof(uint4), height});
 
-  {
+  try {
     // create ranges
     // We need that many task groups
     auto GlobalRange = range<1>(range_width * range_height);
@@ -231,7 +231,11 @@ int main(int argc, char *argv[]) {
 
     // SYCL will enqueue and run the kernel. Recall that the buffer's data is
     // given back to the host at the end of scope.
-  } // make sure data is given back to the host at the end of this scope
+    // make sure data is given back to the host at the end of this scope
+  } catch (cl::sycl::exception const &e) {
+    std::cout << "SYCL exception caught: " << e.what() << '\n';
+    return e.get_cl_code();
+  }
 
   writeHist(bins);
   writeHist(cpuHistogram);
diff --git a/SYCL/ESIMD/kmeans/kmeans.cpp b/SYCL/ESIMD/kmeans/kmeans.cpp
index 91fd646b87..d3abb00033 100644
--- a/SYCL/ESIMD/kmeans/kmeans.cpp
+++ b/SYCL/ESIMD/kmeans/kmeans.cpp
@@ -678,10 +678,17 @@ int main(int argc, char *argv[]) {
     kernel3_time_in_ns += report_time("kernel3", e2);
   };
 
-  for (auto i = 0; i < NUM_ITERATIONS - 1; i++) {
-    submitJobs(false);
+  try {
+    for (auto i = 0; i < NUM_ITERATIONS - 1; i++) {
+      submitJobs(false);
+    }
+    submitJobs(true);
+  } catch (cl::sycl::exception const &e) {
+    std::cout << "SYCL exception caught: " << e.what() << '\n';
+    delete cpu_points;
+    delete cpu_centroids;
+    return e.get_cl_code();
   }
-  submitJobs(true);
 
   //---
 
diff --git a/SYCL/ESIMD/linear/linear.cpp b/SYCL/ESIMD/linear/linear.cpp
index f5323efd9a..fbf2d8ad4d 100644
--- a/SYCL/ESIMD/linear/linear.cpp
+++ b/SYCL/ESIMD/linear/linear.cpp
@@ -54,7 +54,7 @@ int main(int argc, char *argv[]) {
   // Sets output to blank image.
   output_image.setData(new unsigned char[img_size]);
 
-  {
+  try {
     unsigned int img_width = width * bpp / (8 * sizeof(int));
 
     cl::sycl::image<2> imgInput(
@@ -124,6 +124,9 @@ int main(int argc, char *argv[]) {
           });
     });
     e.wait();
+  } catch (cl::sycl::exception const &e) {
+    std::cout << "SYCL exception caught: " << e.what() << '\n';
+    return e.get_cl_code();
   }
 
   output_image.save("linear_out.bmp");
diff --git a/SYCL/ESIMD/mandelbrot/mandelbrot.cpp b/SYCL/ESIMD/mandelbrot/mandelbrot.cpp
index f137e5b5ac..3f62b25681 100644
--- a/SYCL/ESIMD/mandelbrot/mandelbrot.cpp
+++ b/SYCL/ESIMD/mandelbrot/mandelbrot.cpp
@@ -84,7 +84,7 @@ int main(int argc, char *argv[]) {
   // Sets output to blank image.
   unsigned char *buf = new unsigned char[img_size];
 
-  {
+  try {
     cl::sycl::image<2> imgOutput((unsigned int *)buf, image_channel_order::rgba,
                                  image_channel_type::unsigned_int8,
                                  range<2>{WIDTH, HEIGHT});
@@ -115,6 +115,10 @@ int main(int argc, char *argv[]) {
           });
     });
     e.wait();
+  } catch (cl::sycl::exception const &e) {
+    std::cout << "SYCL exception caught: " << e.what() << '\n';
+    delete[] buf;
+    return e.get_cl_code();
   }
 
   char *out_file = argv[1];
diff --git a/SYCL/ESIMD/mandelbrot/mandelbrot_spec.cpp b/SYCL/ESIMD/mandelbrot/mandelbrot_spec.cpp
index 0af7422ca0..f885744880 100644
--- a/SYCL/ESIMD/mandelbrot/mandelbrot_spec.cpp
+++ b/SYCL/ESIMD/mandelbrot/mandelbrot_spec.cpp
@@ -93,7 +93,7 @@ int main(int argc, char *argv[]) {
   // Sets output to blank image.
   unsigned char *buf = new unsigned char[img_size];
 
-  {
+  try {
     cl::sycl::image<2> imgOutput((unsigned int *)buf, image_channel_order::rgba,
                                  image_channel_type::unsigned_int8,
                                  range<2>{WIDTH, HEIGHT});
@@ -152,6 +152,10 @@ int main(int argc, char *argv[]) {
                              });
     });
     e.wait();
+  } catch (cl::sycl::exception const &e) {
+    std::cout << "SYCL exception caught: " << e.what() << '\n';
+    delete[] buf;
+    return e.get_cl_code();
   }
 
   char *out_file = argv[1];
diff --git a/SYCL/ESIMD/matrix_transpose.cpp b/SYCL/ESIMD/matrix_transpose.cpp
index f523094ee7..8bd2ac9720 100644
--- a/SYCL/ESIMD/matrix_transpose.cpp
+++ b/SYCL/ESIMD/matrix_transpose.cpp
@@ -348,37 +348,44 @@ bool runTest(unsigned MZ, unsigned block_size) {
   double kernel_times = 0;
   unsigned num_iters = 10;
 
-  // num_iters + 1, iteration#0 is for warmup
-  for (int i = 0; i <= num_iters; ++i) {
-    double etime = 0;
-    // make sure that buffer object has short live-range
-    // than M
-    buffer<int, 1> buf(M, range<1>(MZ * MZ));
-
-    if (block_size == 16 && MZ >= 16) {
-      auto e = q.submit([&](handler &cgh) {
-        auto acc = buf.get_access<access::mode::read_write>(cgh);
-        cgh.parallel_for<class K16>(
-            Range, [=](nd_item<2> ndi) SYCL_ESIMD_KERNEL {
-              transpose16(acc, MZ, ndi.get_global_id(0), ndi.get_global_id(1));
-            });
-      });
-      e.wait();
-      etime = report_time("kernel time", e);
-    } else if (block_size == 8) {
-      auto e = q.submit([&](handler &cgh) {
-        auto acc = buf.get_access<access::mode::read_write>(cgh);
-        cgh.parallel_for<class K08>(
-            Range, [=](nd_item<2> ndi) SYCL_ESIMD_KERNEL {
-              transpose8(acc, MZ, ndi.get_global_id(0), ndi.get_global_id(1));
-            });
-      });
-      e.wait();
-      etime = report_time("kernel time", e);
-    }
+  try {
+    // num_iters + 1, iteration#0 is for warmup
+    for (int i = 0; i <= num_iters; ++i) {
+      double etime = 0;
+      // make sure that buffer object has short live-range
+      // than M
+      buffer<int, 1> buf(M, range<1>(MZ * MZ));
+
+      if (block_size == 16 && MZ >= 16) {
+        auto e = q.submit([&](handler &cgh) {
+          auto acc = buf.get_access<access::mode::read_write>(cgh);
+          cgh.parallel_for<class K16>(
+              Range, [=](nd_item<2> ndi) SYCL_ESIMD_KERNEL {
+                transpose16(acc, MZ, ndi.get_global_id(0),
+                            ndi.get_global_id(1));
+              });
+        });
+        e.wait();
+        etime = report_time("kernel time", e);
+      } else if (block_size == 8) {
+        auto e = q.submit([&](handler &cgh) {
+          auto acc = buf.get_access<access::mode::read_write>(cgh);
+          cgh.parallel_for<class K08>(
+              Range, [=](nd_item<2> ndi) SYCL_ESIMD_KERNEL {
+                transpose8(acc, MZ, ndi.get_global_id(0), ndi.get_global_id(1));
+              });
+        });
+        e.wait();
+        etime = report_time("kernel time", e);
+      }
 
-    if (i > 0)
-      kernel_times += etime;
+      if (i > 0)
+        kernel_times += etime;
+    }
+  } catch (cl::sycl::exception const &e) {
+    std::cout << "SYCL exception caught: " << e.what() << '\n';
+    delete[] M;
+    return e.get_cl_code();
   }
 
   // End timer.
diff --git a/SYCL/ESIMD/matrix_transpose_glb.cpp b/SYCL/ESIMD/matrix_transpose_glb.cpp
index 1dfb4f6806..ce60f699c4 100644
--- a/SYCL/ESIMD/matrix_transpose_glb.cpp
+++ b/SYCL/ESIMD/matrix_transpose_glb.cpp
@@ -312,31 +312,37 @@ bool runTest(unsigned MZ, unsigned block_size) {
   double kernel_times = 0;
   unsigned num_iters = 10;
 
-  // num_iters + 1, iteration#0 is for warmup
-  for (int i = 0; i <= num_iters; ++i) {
-    double etime = 0;
-    if (block_size == 16 && MZ >= 16) {
-      auto e = q.submit([&](handler &cgh) {
-        cgh.parallel_for<class Transpose16>(
-            Range, [=](nd_item<2> ndi) SYCL_ESIMD_KERNEL {
-              transpose16(M, MZ, ndi.get_global_id(0), ndi.get_global_id(1));
-            });
-      });
-      e.wait();
-      etime = report_time("kernel time", e);
-    } else if (block_size == 8) {
-      auto e = q.submit([&](handler &cgh) {
-        cgh.parallel_for<class Transpose08>(
-            Range, [=](nd_item<2> ndi) SYCL_ESIMD_KERNEL {
-              transpose8(M, MZ, ndi.get_global_id(0), ndi.get_global_id(1));
-            });
-      });
-      e.wait();
-      etime = report_time("kernel time", e);
-    }
+  try {
+    // num_iters + 1, iteration#0 is for warmup
+    for (int i = 0; i <= num_iters; ++i) {
+      double etime = 0;
+      if (block_size == 16 && MZ >= 16) {
+        auto e = q.submit([&](handler &cgh) {
+          cgh.parallel_for<class Transpose16>(
+              Range, [=](nd_item<2> ndi) SYCL_ESIMD_KERNEL {
+                transpose16(M, MZ, ndi.get_global_id(0), ndi.get_global_id(1));
+              });
+        });
+        e.wait();
+        etime = report_time("kernel time", e);
+      } else if (block_size == 8) {
+        auto e = q.submit([&](handler &cgh) {
+          cgh.parallel_for<class Transpose08>(
+              Range, [=](nd_item<2> ndi) SYCL_ESIMD_KERNEL {
+                transpose8(M, MZ, ndi.get_global_id(0), ndi.get_global_id(1));
+              });
+        });
+        e.wait();
+        etime = report_time("kernel time", e);
+      }
 
-    if (i > 0)
-      kernel_times += etime;
+      if (i > 0)
+        kernel_times += etime;
+    }
+  } catch (cl::sycl::exception const &e) {
+    std::cout << "SYCL exception caught: " << e.what() << '\n';
+    free(M, ctxt);
+    return e.get_cl_code();
   }
 
   // End timer.
diff --git a/SYCL/ESIMD/matrix_transpose_usm.cpp b/SYCL/ESIMD/matrix_transpose_usm.cpp
index 5077bb3c88..b56a3293f0 100644
--- a/SYCL/ESIMD/matrix_transpose_usm.cpp
+++ b/SYCL/ESIMD/matrix_transpose_usm.cpp
@@ -319,31 +319,37 @@ bool runTest(unsigned MZ, unsigned block_size) {
   double kernel_times = 0;
   unsigned num_iters = 10;
 
-  // num_iters + 1, iteration#0 is for warmup
-  for (int i = 0; i <= num_iters; ++i) {
-    double etime = 0;
-    if (block_size == 16 && MZ >= 16) {
-      auto e = q.submit([&](handler &cgh) {
-        cgh.parallel_for<class Transpose16>(
-            Range, [=](nd_item<2> ndi) SYCL_ESIMD_KERNEL {
-              transpose16(M, MZ, ndi.get_global_id(0), ndi.get_global_id(1));
-            });
-      });
-      e.wait();
-      etime = report_time("kernel time", e);
-    } else if (block_size == 8) {
-      auto e = q.submit([&](handler &cgh) {
-        cgh.parallel_for<class Transpose08>(
-            Range, [=](nd_item<2> ndi) SYCL_ESIMD_KERNEL {
-              transpose8(M, MZ, ndi.get_global_id(0), ndi.get_global_id(1));
-            });
-      });
-      e.wait();
-      etime = report_time("kernel time", e);
-    }
+  try {
+    // num_iters + 1, iteration#0 is for warmup
+    for (int i = 0; i <= num_iters; ++i) {
+      double etime = 0;
+      if (block_size == 16 && MZ >= 16) {
+        auto e = q.submit([&](handler &cgh) {
+          cgh.parallel_for<class Transpose16>(
+              Range, [=](nd_item<2> ndi) SYCL_ESIMD_KERNEL {
+                transpose16(M, MZ, ndi.get_global_id(0), ndi.get_global_id(1));
+              });
+        });
+        e.wait();
+        etime = report_time("kernel time", e);
+      } else if (block_size == 8) {
+        auto e = q.submit([&](handler &cgh) {
+          cgh.parallel_for<class Transpose08>(
+              Range, [=](nd_item<2> ndi) SYCL_ESIMD_KERNEL {
+                transpose8(M, MZ, ndi.get_global_id(0), ndi.get_global_id(1));
+              });
+        });
+        e.wait();
+        etime = report_time("kernel time", e);
+      }
 
-    if (i > 0)
-      kernel_times += etime;
+      if (i > 0)
+        kernel_times += etime;
+    }
+  } catch (cl::sycl::exception const &e) {
+    std::cout << "SYCL exception caught: " << e.what() << '\n';
+    free(M, ctxt);
+    return e.get_cl_code();
   }
 
   // End timer.
diff --git a/SYCL/ESIMD/reduction.cpp b/SYCL/ESIMD/reduction.cpp
index 4bc4161cde..921de22fb9 100644
--- a/SYCL/ESIMD/reduction.cpp
+++ b/SYCL/ESIMD/reduction.cpp
@@ -33,6 +33,7 @@ int main(void) {
   auto dev = q.get_device();
   std::cout << "Running on " << dev.get_info<info::device::name>() << "\n";
   auto ctxt = q.get_context();
+  // TODO: release memory in the end of the test
   TYPE *A =
       static_cast<TYPE *>(malloc_shared(InputSize * sizeof(TYPE), dev, ctxt));
   int *B =
@@ -46,7 +47,7 @@ int main(void) {
     }
   }
 
-  {
+  try {
     cl::sycl::range<1> GroupRange{InputSize / VL};
     cl::sycl::range<1> TaskRange{GroupSize};
     cl::sycl::nd_range<1> Range(GroupRange, TaskRange);
@@ -68,6 +69,9 @@ int main(void) {
           });
     });
     e.wait();
+  } catch (cl::sycl::exception const &e) {
+    std::cout << "SYCL exception caught: " << e.what() << '\n';
+    return e.get_cl_code();
   }
 
   auto compute_reduce_sum = [](TYPE A[InputSize]) -> int {
diff --git a/SYCL/ESIMD/slm_barrier.cpp b/SYCL/ESIMD/slm_barrier.cpp
index 02d7ba7b15..7fe8146176 100644
--- a/SYCL/ESIMD/slm_barrier.cpp
+++ b/SYCL/ESIMD/slm_barrier.cpp
@@ -81,6 +81,7 @@ int main(void) {
   auto dev = q.get_device();
   std::cout << "Running on " << dev.get_info<info::device::name>() << "\n";
   auto ctxt = q.get_context();
+  // TODO: release memory in the end of the test
   uint *A = static_cast<uint *>(malloc_shared(Size * sizeof(uint), dev, ctxt));
   uint *B = static_cast<uint *>(malloc_shared(Size * sizeof(uint), dev, ctxt));
 
@@ -101,34 +102,40 @@ int main(void) {
   cl::sycl::range<1> LocalRange{LOCAL_SIZE};
   cl::sycl::nd_range<1> Range{GlobalRange * LocalRange, LocalRange};
 
-  auto e = q.submit([&](handler &cgh) {
-    cgh.parallel_for<class Test>(
-        Range, [=](cl::sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL {
-          simd<uint, VL> v_slmData;
-          simd<uint, VL> v_Off(0, 4);
+  try {
+    auto e = q.submit([&](handler &cgh) {
+      cgh.parallel_for<class Test>(
+          Range, [=](cl::sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL {
+            simd<uint, VL> v_slmData;
+            simd<uint, VL> v_Off(0, 4);
 
-          uint localID = ndi.get_local_id(0);
-          uint groupSize = ndi.get_local_range(0);
-          uint globalID = ndi.get_global_id(0);
-          uint groupID = ndi.get_group(0);
+            uint localID = ndi.get_local_id(0);
+            uint groupSize = ndi.get_local_range(0);
+            uint globalID = ndi.get_global_id(0);
+            uint groupID = ndi.get_group(0);
 
-          slm_init(1024);
+            slm_init(1024);
 
-          int grpMemOffset = groupID * groupSize * VL * 4;
+            int grpMemOffset = groupID * groupSize * VL * 4;
 
-          load_to_slm(groupSize, localID, 0, (char *)A, grpMemOffset,
-                      groupSize * VL * 4);
+            load_to_slm(groupSize, localID, 0, (char *)A, grpMemOffset,
+                        groupSize * VL * 4);
 
-          auto shiftID = (localID + 1) % 4;
+            auto shiftID = (localID + 1) % 4;
 
-          v_Off = v_Off + shiftID * 64;
+            v_Off = v_Off + shiftID * 64;
 
-          v_slmData = slm_load<uint, VL>(v_Off);
+            v_slmData = slm_load<uint, VL>(v_Off);
+
+            block_store<uint, VL>(B + globalID * VL, v_slmData);
+          });
+    });
+    e.wait();
+  } catch (cl::sycl::exception const &e) {
+    std::cout << "SYCL exception caught: " << e.what() << '\n';
+    return e.get_cl_code();
+  }
 
-          block_store<uint, VL>(B + globalID * VL, v_slmData);
-        });
-  });
-  e.wait();
   std::cout << "result" << std::endl;
   int result = 0;
   for (int i = 0; i < NUM_THREADS; i++) {
diff --git a/SYCL/ESIMD/stencil2.cpp b/SYCL/ESIMD/stencil2.cpp
index ad51efc00c..9198a874e3 100644
--- a/SYCL/ESIMD/stencil2.cpp
+++ b/SYCL/ESIMD/stencil2.cpp
@@ -106,75 +106,82 @@ int main(void) {
   InitializeSquareMatrix(inputMatrix, DIM_SIZE, false);
   InitializeSquareMatrix(outputMatrix, DIM_SIZE, true);
 
-  auto e = q.submit([&](handler &cgh) {
-    cgh.parallel_for<class Stencil_kernel>(
-        GlobalRange * LocalRange, [=](item<2> it) SYCL_ESIMD_KERNEL {
-          using namespace sycl::INTEL::gpu;
-          uint h_pos = it.get_id(0);
-          uint v_pos = it.get_id(1);
-
-          simd<float, (HEIGHT + 10) * 32> vin;
-          // matrix HEIGHT+10 x 32
-          auto in = vin.format<float, HEIGHT + 10, 32>();
-
-          //
-          // rather than loading all data in
-          // the code will interleave data loading and compute
-          // first, we load enough data for the first 16 pixels
-          //
-          unsigned off = (v_pos * HEIGHT) * DIM_SIZE + h_pos * WIDTH;
+  try {
+    auto e = q.submit([&](handler &cgh) {
+      cgh.parallel_for<class Stencil_kernel>(
+          GlobalRange * LocalRange, [=](item<2> it) SYCL_ESIMD_KERNEL {
+            using namespace sycl::INTEL::gpu;
+            uint h_pos = it.get_id(0);
+            uint v_pos = it.get_id(1);
+
+            simd<float, (HEIGHT + 10) * 32> vin;
+            // matrix HEIGHT+10 x 32
+            auto in = vin.format<float, HEIGHT + 10, 32>();
+
+            //
+            // rather than loading all data in
+            // the code will interleave data loading and compute
+            // first, we load enough data for the first 16 pixels
+            //
+            unsigned off = (v_pos * HEIGHT) * DIM_SIZE + h_pos * WIDTH;
 #pragma unroll
-          for (unsigned i = 0; i < 10; i++) {
-            in.row(i) = block_load<float, 32>(inputMatrix + off);
-            off += DIM_SIZE;
-          }
+            for (unsigned i = 0; i < 10; i++) {
+              in.row(i) = block_load<float, 32>(inputMatrix + off);
+              off += DIM_SIZE;
+            }
 
-          unsigned out_off =
-              (((v_pos * HEIGHT + 5) * DIM_SIZE + (h_pos * WIDTH) + 5)) *
-              sizeof(float);
-          simd<unsigned, WIDTH> elm16(0, 1);
+            unsigned out_off =
+                (((v_pos * HEIGHT + 5) * DIM_SIZE + (h_pos * WIDTH) + 5)) *
+                sizeof(float);
+            simd<unsigned, WIDTH> elm16(0, 1);
 
 #pragma unroll
-          for (unsigned i = 0; i < HEIGHT; i++) {
-
-            in.row(10 + i) = block_load<float, 32>(inputMatrix + off);
-            off += DIM_SIZE;
-
-            simd<float, WIDTH> sum =
-                vin.select<WIDTH, 1>(GET_IDX(i, 5)) * -0.02f +
-                vin.select<WIDTH, 1>(GET_IDX(i + 1, 5)) * -0.025f +
-                vin.select<WIDTH, 1>(GET_IDX(i + 2, 5)) * -0.0333333333333f +
-                vin.select<WIDTH, 1>(GET_IDX(i + 3, 5)) * -0.05f +
-                vin.select<WIDTH, 1>(GET_IDX(i + 4, 5)) * -0.1f +
-                vin.select<WIDTH, 1>(GET_IDX(i + 6, 5)) * 0.1f +
-                vin.select<WIDTH, 1>(GET_IDX(i + 7, 5)) * 0.05f +
-                vin.select<WIDTH, 1>(GET_IDX(i + 8, 5)) * 0.0333333333333f +
-                vin.select<WIDTH, 1>(GET_IDX(i + 9, 5)) * 0.025f +
-                vin.select<WIDTH, 1>(GET_IDX(i + 10, 5)) * 0.02f +
-                vin.select<WIDTH, 1>(GET_IDX(i + 5, 0)) * -0.02f +
-                vin.select<WIDTH, 1>(GET_IDX(i + 5, 1)) * -0.025f +
-                vin.select<WIDTH, 1>(GET_IDX(i + 5, 2)) * -0.0333333333333f +
-                vin.select<WIDTH, 1>(GET_IDX(i + 5, 3)) * -0.05f +
-                vin.select<WIDTH, 1>(GET_IDX(i + 5, 4)) * -0.1f +
-                vin.select<WIDTH, 1>(GET_IDX(i + 5, 6)) * 0.1f +
-                vin.select<WIDTH, 1>(GET_IDX(i + 5, 7)) * 0.05f +
-                vin.select<WIDTH, 1>(GET_IDX(i + 5, 8)) * 0.0333333333333f +
-                vin.select<WIDTH, 1>(GET_IDX(i + 5, 9)) * 0.025f +
-                vin.select<WIDTH, 1>(GET_IDX(i + 5, 10)) * 0.02f;
-
-            // predciate output
-            simd<ushort, WIDTH> p = (elm16 + h_pos * WIDTH) < DIM_SIZE - 10;
-
-            simd<unsigned, WIDTH> elm16_off = elm16 * sizeof(float) + out_off;
-            scatter<float, WIDTH>(outputMatrix, sum, elm16_off, p);
-            out_off += DIM_SIZE * sizeof(float);
-
-            if (v_pos * HEIGHT + 10 + i >= DIM_SIZE - 1)
-              break;
-          }
-        });
-  });
-  e.wait();
+            for (unsigned i = 0; i < HEIGHT; i++) {
+
+              in.row(10 + i) = block_load<float, 32>(inputMatrix + off);
+              off += DIM_SIZE;
+
+              simd<float, WIDTH> sum =
+                  vin.select<WIDTH, 1>(GET_IDX(i, 5)) * -0.02f +
+                  vin.select<WIDTH, 1>(GET_IDX(i + 1, 5)) * -0.025f +
+                  vin.select<WIDTH, 1>(GET_IDX(i + 2, 5)) * -0.0333333333333f +
+                  vin.select<WIDTH, 1>(GET_IDX(i + 3, 5)) * -0.05f +
+                  vin.select<WIDTH, 1>(GET_IDX(i + 4, 5)) * -0.1f +
+                  vin.select<WIDTH, 1>(GET_IDX(i + 6, 5)) * 0.1f +
+                  vin.select<WIDTH, 1>(GET_IDX(i + 7, 5)) * 0.05f +
+                  vin.select<WIDTH, 1>(GET_IDX(i + 8, 5)) * 0.0333333333333f +
+                  vin.select<WIDTH, 1>(GET_IDX(i + 9, 5)) * 0.025f +
+                  vin.select<WIDTH, 1>(GET_IDX(i + 10, 5)) * 0.02f +
+                  vin.select<WIDTH, 1>(GET_IDX(i + 5, 0)) * -0.02f +
+                  vin.select<WIDTH, 1>(GET_IDX(i + 5, 1)) * -0.025f +
+                  vin.select<WIDTH, 1>(GET_IDX(i + 5, 2)) * -0.0333333333333f +
+                  vin.select<WIDTH, 1>(GET_IDX(i + 5, 3)) * -0.05f +
+                  vin.select<WIDTH, 1>(GET_IDX(i + 5, 4)) * -0.1f +
+                  vin.select<WIDTH, 1>(GET_IDX(i + 5, 6)) * 0.1f +
+                  vin.select<WIDTH, 1>(GET_IDX(i + 5, 7)) * 0.05f +
+                  vin.select<WIDTH, 1>(GET_IDX(i + 5, 8)) * 0.0333333333333f +
+                  vin.select<WIDTH, 1>(GET_IDX(i + 5, 9)) * 0.025f +
+                  vin.select<WIDTH, 1>(GET_IDX(i + 5, 10)) * 0.02f;
+
+              // predciate output
+              simd<ushort, WIDTH> p = (elm16 + h_pos * WIDTH) < DIM_SIZE - 10;
+
+              simd<unsigned, WIDTH> elm16_off = elm16 * sizeof(float) + out_off;
+              scatter<float, WIDTH>(outputMatrix, sum, elm16_off, p);
+              out_off += DIM_SIZE * sizeof(float);
+
+              if (v_pos * HEIGHT + 10 + i >= DIM_SIZE - 1)
+                break;
+            }
+          });
+    });
+    e.wait();
+  } catch (cl::sycl::exception const &e) {
+    std::cout << "SYCL exception caught: " << e.what() << '\n';
+    free(inputMatrix, ctxt);
+    free(outputMatrix, ctxt);
+    return e.get_cl_code();
+  }
 
   // check result
   bool passed = CheckResults(outputMatrix, inputMatrix);
diff --git a/SYCL/ESIMD/test_id_3d.cpp b/SYCL/ESIMD/test_id_3d.cpp
index 0c21473da8..e4cae353df 100644
--- a/SYCL/ESIMD/test_id_3d.cpp
+++ b/SYCL/ESIMD/test_id_3d.cpp
@@ -40,20 +40,26 @@ int main(void) {
   int *C = static_cast<int *>(
       malloc_shared(ScalarGlobalRange.size() * sizeof(int), dev, ctxt));
 
-  auto e = q.submit([&](handler &cgh) {
-    cgh.parallel_for<class Test>(
-        GlobalRange, [=](item<3> it) SYCL_ESIMD_KERNEL {
-          using namespace sycl::INTEL::gpu;
-          auto id = it.get_id();
-          // calculate linear ID:
-          size_t lin_id = id[0] * Y * X + id[1] * X + id[2];
-          simd<int, VL> inc(0, 1);
-          int off = (int)(lin_id * VL);
-          simd<int, VL> val = inc + off;
-          block_store<int, VL>(C + off, val);
-        });
-  });
-  e.wait();
+  try {
+    auto e = q.submit([&](handler &cgh) {
+      cgh.parallel_for<class Test>(
+          GlobalRange, [=](item<3> it) SYCL_ESIMD_KERNEL {
+            using namespace sycl::INTEL::gpu;
+            auto id = it.get_id();
+            // calculate linear ID:
+            size_t lin_id = id[0] * Y * X + id[1] * X + id[2];
+            simd<int, VL> inc(0, 1);
+            int off = (int)(lin_id * VL);
+            simd<int, VL> val = inc + off;
+            block_store<int, VL>(C + off, val);
+          });
+    });
+    e.wait();
+  } catch (cl::sycl::exception const &e) {
+    std::cout << "SYCL exception caught: " << e.what() << '\n';
+    return e.get_cl_code();
+  }
+
   int err_cnt = 0;
 
   for (size_t i = 0; i < ScalarGlobalRange.size(); ++i) {
diff --git a/SYCL/ESIMD/vadd_1d.cpp b/SYCL/ESIMD/vadd_1d.cpp
index ffb7b464b4..294d1b62fc 100644
--- a/SYCL/ESIMD/vadd_1d.cpp
+++ b/SYCL/ESIMD/vadd_1d.cpp
@@ -31,7 +31,7 @@ int main(void) {
     C[i] = 0.0f;
   }
 
-  {
+  try {
     buffer<float, 1> bufa(A, range<1>(Size));
     buffer<float, 1> bufb(B, range<1>(Size));
     buffer<float, 1> bufc(C, range<1>(Size));
@@ -62,6 +62,14 @@ int main(void) {
           });
     });
     e.wait();
+  } catch (cl::sycl::exception const &e) {
+    std::cout << "SYCL exception caught: " << e.what() << '\n';
+
+    delete[] A;
+    delete[] B;
+    delete[] C;
+
+    return e.get_cl_code();
   }
 
   int err_cnt = 0;
diff --git a/SYCL/ESIMD/vadd_2d.cpp b/SYCL/ESIMD/vadd_2d.cpp
index 7eb74b2fba..eff276ca9b 100644
--- a/SYCL/ESIMD/vadd_2d.cpp
+++ b/SYCL/ESIMD/vadd_2d.cpp
@@ -32,7 +32,7 @@ int main(void) {
     A[i] = B[i] = i;
   }
 
-  {
+  try {
     cl::sycl::image<2> imgA(A, image_channel_order::rgba,
                             image_channel_type::unsigned_int32,
                             range<2>{Size / 4, 1});
@@ -82,6 +82,9 @@ int main(void) {
           });
     });
     e.wait();
+  } catch (cl::sycl::exception const &e) {
+    std::cout << "SYCL exception caught: " << e.what() << '\n';
+    return e.get_cl_code();
   }
 
   for (unsigned i = 0; i < Size; ++i) {
diff --git a/SYCL/ESIMD/vadd_raw_send.cpp b/SYCL/ESIMD/vadd_raw_send.cpp
index d76043804e..62beaa6def 100644
--- a/SYCL/ESIMD/vadd_raw_send.cpp
+++ b/SYCL/ESIMD/vadd_raw_send.cpp
@@ -91,7 +91,7 @@ int main(void) {
     C[i] = 0.0f;
   }
 
-  {
+  try {
     buffer<float, 1> bufa(A, range<1>(Size));
     buffer<float, 1> bufb(B, range<1>(Size));
     buffer<float, 1> bufc(C, range<1>(Size));
@@ -124,6 +124,14 @@ int main(void) {
           });
     });
     e.wait();
+  } catch (cl::sycl::exception const &e) {
+    std::cout << "SYCL exception caught: " << e.what() << '\n';
+
+    delete[] A;
+    delete[] B;
+    delete[] C;
+
+    return e.get_cl_code();
   }
 
   int err_cnt = 0;
diff --git a/SYCL/ESIMD/vadd_usm.cpp b/SYCL/ESIMD/vadd_usm.cpp
index 927479e633..428e754447 100644
--- a/SYCL/ESIMD/vadd_usm.cpp
+++ b/SYCL/ESIMD/vadd_usm.cpp
@@ -29,6 +29,7 @@ int main(void) {
   auto dev = q.get_device();
   std::cout << "Running on " << dev.get_info<info::device::name>() << "\n";
   auto ctxt = q.get_context();
+  // TODO: release memory in the end of the test
   float *A =
       static_cast<float *>(malloc_shared(Size * sizeof(float), dev, ctxt));
   float *B =
@@ -47,18 +48,25 @@ int main(void) {
 
   cl::sycl::nd_range<1> Range(GlobalRange, LocalRange);
 
-  auto e = q.submit([&](handler &cgh) {
-    cgh.parallel_for<class Test>(Range, [=](nd_item<1> ndi) SYCL_ESIMD_KERNEL {
-      using namespace sycl::INTEL::gpu;
+  try {
+    auto e = q.submit([&](handler &cgh) {
+      cgh.parallel_for<class Test>(
+          Range, [=](nd_item<1> ndi) SYCL_ESIMD_KERNEL {
+            using namespace sycl::INTEL::gpu;
 
-      int i = ndi.get_global_id(0);
-      simd<float, VL> va = block_load<float, VL>(A + i * VL);
-      simd<float, VL> vb = block_load<float, VL>(B + i * VL);
-      simd<float, VL> vc = va + vb;
-      block_store<float, VL>(C + i * VL, vc);
+            int i = ndi.get_global_id(0);
+            simd<float, VL> va = block_load<float, VL>(A + i * VL);
+            simd<float, VL> vb = block_load<float, VL>(B + i * VL);
+            simd<float, VL> vc = va + vb;
+            block_store<float, VL>(C + i * VL, vc);
+          });
     });
-  });
-  e.wait();
+    e.wait();
+  } catch (cl::sycl::exception const &e) {
+    std::cout << "SYCL exception caught: " << e.what() << '\n';
+    return e.get_cl_code();
+  }
+
   int err_cnt = 0;
 
   for (unsigned i = 0; i < Size; ++i) {