From 7179d3adb067ec14f85319dce2c85da76dbdc077 Mon Sep 17 00:00:00 2001
From: amochalo <anastasiya.mochalova@intel.com>
Date: Thu, 26 Mar 2020 23:25:56 +0300
Subject: [PATCH 01/10] [SYCL] Add tests for inline asm feature

Signed-off-by: amochalo <anastasiya.mochalova@intel.com>
---
 sycl/test/inline-asm/asm_16_empty.cpp         |  73 ++++++++++++
 sycl/test/inline-asm/asm_16_matrix_mult.cpp   |  73 ++++++++++++
 sycl/test/inline-asm/asm_16_no_input_int.cpp  |  73 ++++++++++++
 sycl/test/inline-asm/asm_16_no_opts.cpp       |  75 +++++++++++++
 sycl/test/inline-asm/asm_8_empty.cpp          |  72 ++++++++++++
 sycl/test/inline-asm/asm_8_no_input_int.cpp   |  75 +++++++++++++
 .../inline-asm/asm_arbitrary_ops_order.cpp    |  98 ++++++++++++++++
 sycl/test/inline-asm/asm_decl_in_scope.cpp    | 103 +++++++++++++++++
 sycl/test/inline-asm/asm_float_add.cpp        |  94 ++++++++++++++++
 sycl/test/inline-asm/asm_float_imm_arg.cpp    |  89 +++++++++++++++
 sycl/test/inline-asm/asm_float_neg.cpp        |  89 +++++++++++++++
 sycl/test/inline-asm/asm_imm_arg.cpp          |  88 +++++++++++++++
 sycl/test/inline-asm/asm_mul.cpp              |  93 ++++++++++++++++
 .../inline-asm/asm_multiple_instructions.cpp  | 105 ++++++++++++++++++
 sycl/test/inline-asm/asm_no_operands.cpp      |  32 ++++++
 sycl/test/inline-asm/asm_no_output.cpp        |  78 +++++++++++++
 sycl/test/inline-asm/asm_plus_mod.cpp         |  87 +++++++++++++++
 sycl/test/inline-asm/include/asmcheck.h       |  17 +++
 sycl/test/inline-asm/letter_example.cpp       |  59 ++++++++++
 sycl/test/inline-asm/malloc_shared_32.cpp     |  87 +++++++++++++++
 .../inline-asm/malloc_shared_in_out_dif.cpp   |  65 +++++++++++
 .../inline-asm/malloc_shared_no_input.cpp     |  54 +++++++++
 22 files changed, 1679 insertions(+)
 create mode 100644 sycl/test/inline-asm/asm_16_empty.cpp
 create mode 100644 sycl/test/inline-asm/asm_16_matrix_mult.cpp
 create mode 100644 sycl/test/inline-asm/asm_16_no_input_int.cpp
 create mode 100644 sycl/test/inline-asm/asm_16_no_opts.cpp
 create mode 100644 sycl/test/inline-asm/asm_8_empty.cpp
 create mode 100644 sycl/test/inline-asm/asm_8_no_input_int.cpp
 create mode 100644 sycl/test/inline-asm/asm_arbitrary_ops_order.cpp
 create mode 100644 sycl/test/inline-asm/asm_decl_in_scope.cpp
 create mode 100644 sycl/test/inline-asm/asm_float_add.cpp
 create mode 100644 sycl/test/inline-asm/asm_float_imm_arg.cpp
 create mode 100644 sycl/test/inline-asm/asm_float_neg.cpp
 create mode 100644 sycl/test/inline-asm/asm_imm_arg.cpp
 create mode 100644 sycl/test/inline-asm/asm_mul.cpp
 create mode 100644 sycl/test/inline-asm/asm_multiple_instructions.cpp
 create mode 100644 sycl/test/inline-asm/asm_no_operands.cpp
 create mode 100644 sycl/test/inline-asm/asm_no_output.cpp
 create mode 100644 sycl/test/inline-asm/asm_plus_mod.cpp
 create mode 100644 sycl/test/inline-asm/include/asmcheck.h
 create mode 100644 sycl/test/inline-asm/letter_example.cpp
 create mode 100644 sycl/test/inline-asm/malloc_shared_32.cpp
 create mode 100644 sycl/test/inline-asm/malloc_shared_in_out_dif.cpp
 create mode 100644 sycl/test/inline-asm/malloc_shared_no_input.cpp
diff --git a/sycl/test/inline-asm/asm_16_empty.cpp b/sycl/test/inline-asm/asm_16_empty.cpp
new file mode 100644
index 000000000000..7b551b55e8a1
--- /dev/null
+++ b/sycl/test/inline-asm/asm_16_empty.cpp
@@ -0,0 +1,73 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+
+#include "include/asmcheck.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <string>
+#include <vector>
+
+constexpr int LIST_SIZE = 1024;
+using arr_t = std::vector<cl::sycl::cl_int>;
+constexpr auto sycl_write = cl::sycl::access::mode::write;
+
+// class is used for kernel name
+template <typename T>
+class no_opts;
+
+template <typename T>
+void process_buffers(cl::sycl::queue &deviceQueue, T *pc, size_t sz) {
+  cl::sycl::range<1> numOfItems{sz};
+  cl::sycl::buffer<T, 1> bufferC(pc, numOfItems);
+
+  deviceQueue.submit([&](cl::sycl::handler &cgh) {
+    auto C = bufferC.template get_access<sycl_write>(cgh);
+
+    auto kern = [C](cl::sycl::id<1> wiID)
+        [[cl::intel_reqd_sub_group_size(16)]] {
+      C[wiID] = 43;
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+      asm volatile("");
+#endif
+    };
+    cgh.parallel_for<class no_opts<T>>(numOfItems, kern);
+  });
+};
+
+int main() {
+  arr_t C(LIST_SIZE);
+
+  cl::sycl::gpu_selector gpsel;
+  cl::sycl::queue deviceQueue(gpsel);
+
+  sycl::device Device = deviceQueue.get_device();
+
+  if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+
+  for (int i = 0; i < LIST_SIZE; i++) {
+    C[i] = 0;
+  }
+
+  process_buffers(deviceQueue, C.data(), LIST_SIZE);
+
+  bool all_right = true;
+
+  for (int i = 0; i < LIST_SIZE; ++i)
+    if (C[i] != 43) {
+      std::cerr << "At index: " << i << ". ";
+      std::cerr << C[i] << " != " << 43 << "\n";
+      all_right = false;
+      break;
+    }
+  if (all_right) {
+    std::cout << "Pass" << std::endl;
+    return 0;
+  }
+  std::cout << "Error" << std::endl;
+  return -1;
+}
diff --git a/sycl/test/inline-asm/asm_16_matrix_mult.cpp b/sycl/test/inline-asm/asm_16_matrix_mult.cpp
new file mode 100644
index 000000000000..3de2b2966abc
--- /dev/null
+++ b/sycl/test/inline-asm/asm_16_matrix_mult.cpp
@@ -0,0 +1,73 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+
+#include "include/asmcheck.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+constexpr int LIST_SIZE = 8;
+using arr_t = std::vector<cl::sycl::cl_int>;
+constexpr auto sycl_write = cl::sycl::access::mode::write;
+
+// class is used for kernel name
+template <typename T>
+class simple_vector_add;
+
+template <typename T>
+void process_buffers(cl::sycl::queue &deviceQueue, T *pc, size_t sz) {
+  cl::sycl::range<1> numOfItems{sz};
+  cl::sycl::buffer<T, 1> bufferC(pc, numOfItems);
+
+  deviceQueue.submit([&](cl::sycl::handler &cgh) {
+    auto C = bufferC.template get_access<sycl_write>(cgh);
+
+    auto kern = [C](cl::sycl::id<1> wiID)
+        [[cl::intel_reqd_sub_group_size(16)]] {
+      volatile int output = 0;
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+      asm volatile("mov (M1,16) %0(0,0)<1> 0x7:d"
+                   : "=rw"(output));
+#else
+      output = 7;
+#endif
+      C[wiID] = output;
+    };
+    cgh.parallel_for<class simple_vector_add<T>>(numOfItems, kern);
+  });
+};
+
+int main() {
+  arr_t C(LIST_SIZE);
+
+  cl::sycl::gpu_selector gpsel;
+  cl::sycl::queue deviceQueue(gpsel);
+  sycl::device Device = deviceQueue.get_device();
+
+  if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+  for (int i = 0; i < LIST_SIZE; i++) {
+    C[i] = 0;
+  }
+
+  process_buffers(deviceQueue, C.data(), LIST_SIZE);
+
+  bool all_right = true;
+  for (int i = 0; i < LIST_SIZE; ++i)
+    if (C[i] != 7) {
+      std::cerr << "At index: " << i << ". ";
+      std::cerr << C[i] << " != " << 7 << "\n";
+      all_right = false;
+      break;
+    }
+  if (all_right) {
+    std::cout << "Pass" << std::endl;
+    return 0;
+  }
+  std::cout << "Error" << std::endl;
+  return -1;
+}
diff --git a/sycl/test/inline-asm/asm_16_no_input_int.cpp b/sycl/test/inline-asm/asm_16_no_input_int.cpp
new file mode 100644
index 000000000000..3de2b2966abc
--- /dev/null
+++ b/sycl/test/inline-asm/asm_16_no_input_int.cpp
@@ -0,0 +1,73 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+
+#include "include/asmcheck.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+constexpr int LIST_SIZE = 8;
+using arr_t = std::vector<cl::sycl::cl_int>;
+constexpr auto sycl_write = cl::sycl::access::mode::write;
+
+// class is used for kernel name
+template <typename T>
+class simple_vector_add;
+
+template <typename T>
+void process_buffers(cl::sycl::queue &deviceQueue, T *pc, size_t sz) {
+  cl::sycl::range<1> numOfItems{sz};
+  cl::sycl::buffer<T, 1> bufferC(pc, numOfItems);
+
+  deviceQueue.submit([&](cl::sycl::handler &cgh) {
+    auto C = bufferC.template get_access<sycl_write>(cgh);
+
+    auto kern = [C](cl::sycl::id<1> wiID)
+        [[cl::intel_reqd_sub_group_size(16)]] {
+      volatile int output = 0;
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+      asm volatile("mov (M1,16) %0(0,0)<1> 0x7:d"
+                   : "=rw"(output));
+#else
+      output = 7;
+#endif
+      C[wiID] = output;
+    };
+    cgh.parallel_for<class simple_vector_add<T>>(numOfItems, kern);
+  });
+};
+
+int main() {
+  arr_t C(LIST_SIZE);
+
+  cl::sycl::gpu_selector gpsel;
+  cl::sycl::queue deviceQueue(gpsel);
+  sycl::device Device = deviceQueue.get_device();
+
+  if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+  for (int i = 0; i < LIST_SIZE; i++) {
+    C[i] = 0;
+  }
+
+  process_buffers(deviceQueue, C.data(), LIST_SIZE);
+
+  bool all_right = true;
+  for (int i = 0; i < LIST_SIZE; ++i)
+    if (C[i] != 7) {
+      std::cerr << "At index: " << i << ". ";
+      std::cerr << C[i] << " != " << 7 << "\n";
+      all_right = false;
+      break;
+    }
+  if (all_right) {
+    std::cout << "Pass" << std::endl;
+    return 0;
+  }
+  std::cout << "Error" << std::endl;
+  return -1;
+}
diff --git a/sycl/test/inline-asm/asm_16_no_opts.cpp b/sycl/test/inline-asm/asm_16_no_opts.cpp
new file mode 100644
index 000000000000..cc2df932a7d8
--- /dev/null
+++ b/sycl/test/inline-asm/asm_16_no_opts.cpp
@@ -0,0 +1,75 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+
+#include "include/asmcheck.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+constexpr int LIST_SIZE = 1024;
+using arr_t = std::vector<cl::sycl::cl_int>;
+constexpr auto sycl_write = cl::sycl::access::mode::write;
+
+// class is used for kernel name
+template <typename T>
+class simple_vector_add;
+
+template <typename T>
+void process_buffers(cl::sycl::queue &deviceQueue, T *pc, size_t sz) {
+  cl::sycl::range<1> numOfItems{sz};
+  cl::sycl::buffer<T, 1> bufferC(pc, numOfItems);
+
+  deviceQueue.submit([&](cl::sycl::handler &cgh) {
+    auto C = bufferC.template get_access<sycl_write>(cgh);
+
+    auto kern = [C](cl::sycl::id<1> wiID)
+        [[cl::intel_reqd_sub_group_size(16)]] {
+      for (int i = 0; i < 10; ++i) {
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+        asm("fence_sw");
+        C[wiID] += i;
+
+#else
+        C[wiID] += i;
+#endif
+      }
+    };
+    cgh.parallel_for<class simple_vector_add<T>>(numOfItems, kern);
+  });
+};
+
+int main() {
+  arr_t C(LIST_SIZE);
+
+  cl::sycl::gpu_selector gpsel;
+  cl::sycl::queue deviceQueue(gpsel);
+  sycl::device Device = deviceQueue.get_device();
+
+  if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+  for (int i = 0; i < LIST_SIZE; i++) {
+    C[i] = 0;
+  }
+
+  process_buffers(deviceQueue, C.data(), LIST_SIZE);
+
+  bool all_right = true;
+
+  for (int i = 0; i < LIST_SIZE; ++i)
+    if (C[i] != 45) {
+      std::cerr << "At index: " << i << ". ";
+      std::cerr << C[i] << " != " << 45 << "\n";
+      all_right = false;
+      break;
+    }
+  if (all_right) {
+    std::cout << "Pass" << std::endl;
+    return 0;
+  }
+  std::cout << "Error" << std::endl;
+  return -1;
+}
diff --git a/sycl/test/inline-asm/asm_8_empty.cpp b/sycl/test/inline-asm/asm_8_empty.cpp
new file mode 100644
index 000000000000..a7f0b0bc6d4b
--- /dev/null
+++ b/sycl/test/inline-asm/asm_8_empty.cpp
@@ -0,0 +1,72 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+
+#include "include/asmcheck.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+constexpr int LIST_SIZE = 1024;
+using arr_t = std::vector<cl::sycl::cl_int>;
+constexpr auto sycl_write = cl::sycl::access::mode::write;
+
+// class is used for kernel name
+template <typename T>
+class no_opts;
+
+template <typename T>
+void process_buffers(cl::sycl::queue &deviceQueue, T *pc, size_t sz) {
+  cl::sycl::range<1> numOfItems{sz};
+  cl::sycl::buffer<T, 1> bufferC(pc, numOfItems);
+
+  deviceQueue.submit([&](cl::sycl::handler &cgh) {
+    auto C = bufferC.template get_access<sycl_write>(cgh);
+
+    auto kern = [C](cl::sycl::id<1> wiID)
+        [[cl::intel_reqd_sub_group_size(8)]] {
+      C[wiID] = 43;
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+      asm volatile("");
+#endif
+    };
+    cgh.parallel_for<class no_opts<T>>(numOfItems, kern);
+  });
+};
+
+int main() {
+  arr_t C(LIST_SIZE);
+
+  cl::sycl::gpu_selector gpsel;
+  cl::sycl::queue deviceQueue(gpsel);
+
+  sycl::device Device = deviceQueue.get_device();
+
+  if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+
+  for (int i = 0; i < LIST_SIZE; i++) {
+    C[i] = 0;
+  }
+
+  process_buffers(deviceQueue, C.data(), LIST_SIZE);
+
+  bool all_right = true;
+
+  for (int i = 0; i < LIST_SIZE; ++i)
+    if (C[i] != 43) {
+      std::cerr << "At index: " << i << ". ";
+      std::cerr << C[i] << " != " << 43 << "\n";
+      all_right = false;
+      break;
+    }
+  if (all_right) {
+    std::cout << "Pass" << std::endl;
+    return 0;
+  }
+  std::cout << "Error" << std::endl;
+  return -1;
+}
diff --git a/sycl/test/inline-asm/asm_8_no_input_int.cpp b/sycl/test/inline-asm/asm_8_no_input_int.cpp
new file mode 100644
index 000000000000..a370f77b3ada
--- /dev/null
+++ b/sycl/test/inline-asm/asm_8_no_input_int.cpp
@@ -0,0 +1,75 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+
+#include "include/asmcheck.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+constexpr int LIST_SIZE = 8;
+using arr_t = std::vector<cl::sycl::cl_int>;
+constexpr auto sycl_write = cl::sycl::access::mode::write;
+
+// class is used for kernel name
+template <typename T>
+class simple_vector_add;
+
+template <typename T>
+void process_buffers(cl::sycl::queue &deviceQueue, T *pc, size_t sz) {
+  cl::sycl::range<1> numOfItems{sz};
+  cl::sycl::buffer<T, 1> bufferC(pc, numOfItems);
+
+  deviceQueue.submit([&](cl::sycl::handler &cgh) {
+    auto C = bufferC.template get_access<sycl_write>(cgh);
+
+    auto kern = [C](cl::sycl::id<1> wiID)
+        [[cl::intel_reqd_sub_group_size(8)]] {
+      volatile int output = 0;
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+      asm volatile("mov (M1,8) %0(0,0)<1> 0x7:d"
+                   : "=rw"(output));
+#else
+      output = 7;
+#endif
+      C[wiID] = output;
+    };
+    cgh.parallel_for<class simple_vector_add<T>>(numOfItems, kern);
+  });
+};
+
+int main() {
+  arr_t C(LIST_SIZE);
+
+  cl::sycl::gpu_selector gpsel;
+  cl::sycl::queue deviceQueue(gpsel);
+
+  for (int i = 0; i < LIST_SIZE; i++) {
+    C[i] = 0;
+  }
+
+  sycl::device Device = deviceQueue.get_device();
+
+  if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+
+  process_buffers(deviceQueue, C.data(), LIST_SIZE);
+
+  bool all_right = true;
+  for (int i = 0; i < LIST_SIZE; ++i)
+    if (C[i] != 7) {
+      std::cerr << "At index: " << i << ". ";
+      std::cerr << C[i] << " != " << 7 << "\n";
+      all_right = false;
+      break;
+    }
+  if (all_right) {
+    std::cout << "Pass" << std::endl;
+    return 0;
+  }
+  std::cout << "Error" << std::endl;
+  return -1;
+}
diff --git a/sycl/test/inline-asm/asm_arbitrary_ops_order.cpp b/sycl/test/inline-asm/asm_arbitrary_ops_order.cpp
new file mode 100644
index 000000000000..1b75b63d0027
--- /dev/null
+++ b/sycl/test/inline-asm/asm_arbitrary_ops_order.cpp
@@ -0,0 +1,98 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+
+#include "include/asmcheck.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+constexpr auto sycl_read = cl::sycl::access::mode::read;
+constexpr auto sycl_write = cl::sycl::access::mode::write;
+
+constexpr int LIST_SIZE = 1024;
+using arr_t = std::vector<cl::sycl::cl_int>;
+
+// class is used for kernel name
+template <typename T>
+class vector_mad;
+
+class ocl_ctx_t {
+  cl::sycl::queue deviceQueue;
+
+public:
+  ocl_ctx_t(const cl::sycl::device_selector &sel) : deviceQueue(sel) {}
+
+  template <typename T>
+  void process_buffers(T const *pa, T const *pb, const T *pc, T *pd, size_t sz);
+};
+
+int main() {
+  arr_t A(LIST_SIZE), B(LIST_SIZE), C(LIST_SIZE), D(LIST_SIZE);
+
+  try {
+    cl::sycl::gpu_selector gpsel;
+    ocl_ctx_t ct{gpsel};
+
+    cl::sycl::queue deviceQueue(gpsel);
+    sycl::device Device = deviceQueue.get_device();
+
+    if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) {
+      std::cout << "Skipping test\n";
+      return 0;
+    }
+
+    for (int i = 0; i < LIST_SIZE; i++) {
+      A[i] = i;
+      B[i] = i;
+      C[i] = LIST_SIZE - i * i;
+    }
+
+    ct.process_buffers(A.data(), B.data(), C.data(), D.data(), LIST_SIZE);
+
+    for (int i = 0; i < LIST_SIZE; ++i)
+      if (D[i] != A[i] * B[i] + C[i]) {
+        std::cerr << "At index: " << i << ". ";
+        std::cerr << D[i] << " != " << A[i] * B[i] + C[i] << "\n";
+        abort();
+      }
+
+    std::cout << "Everything is correct" << std::endl;
+  } catch (cl::sycl::exception const &err) {
+    std::cerr << "ERROR: " << err.what() << ":\n";
+    return -1;
+  }
+}
+
+template <typename T>
+void ocl_ctx_t::process_buffers(T const *pa, T const *pb, const T *pc, T *pd,
+                                size_t sz) {
+  cl::sycl::range<1> numOfItems{sz};
+  cl::sycl::buffer<T, 1> bufferA(pa, numOfItems);
+  cl::sycl::buffer<T, 1> bufferB(pb, numOfItems);
+  cl::sycl::buffer<T, 1> bufferC(pc, numOfItems);
+  cl::sycl::buffer<T, 1> bufferD(pd, numOfItems);
+
+  bufferA.set_final_data(nullptr);
+  bufferB.set_final_data(nullptr);
+  bufferC.set_final_data(nullptr);
+
+  deviceQueue.submit([&](cl::sycl::handler &cgh) {
+    auto A = bufferA.template get_access<sycl_read>(cgh);
+    auto B = bufferB.template get_access<sycl_read>(cgh);
+    auto C = bufferC.template get_access<sycl_read>(cgh);
+    auto D = bufferD.template get_access<sycl_write>(cgh);
+
+    auto kern = [ A, B, C, D ](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] {
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+      asm("mad (M1, 8) %0(0, 0)<1> %3(0, 0)<1;1,0> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>"
+          : "=rw"(D[wiID])
+          : "rw"(B[wiID]), "rw"(C[wiID]), "rw"(A[wiID]));
+#else
+      D[wiID] = A[wiID] * B[wiID] + C[wiID];
+#endif
+    };
+    cgh.parallel_for<class vector_mad<T>>(numOfItems, kern);
+  });
+}
diff --git a/sycl/test/inline-asm/asm_decl_in_scope.cpp b/sycl/test/inline-asm/asm_decl_in_scope.cpp
new file mode 100644
index 000000000000..2164a602bae6
--- /dev/null
+++ b/sycl/test/inline-asm/asm_decl_in_scope.cpp
@@ -0,0 +1,103 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+
+#include "include/asmcheck.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+constexpr auto sycl_read = cl::sycl::access::mode::read;
+constexpr auto sycl_write = cl::sycl::access::mode::write;
+
+constexpr int LIST_SIZE = 1024 * 1024;
+using arr_t = std::vector<cl::sycl::cl_int>;
+
+// class is used for kernel name
+template <typename T>
+class decl_in_scope_kernel;
+
+class ocl_ctx_t {
+  cl::sycl::queue deviceQueue;
+
+public:
+  ocl_ctx_t(const cl::sycl::device_selector &sel) : deviceQueue(sel) {}
+
+  template <typename T>
+  void process_buffers(T const *pa, T const *pb, T *pc, size_t sz);
+};
+
+int main() {
+  arr_t A(LIST_SIZE), B(LIST_SIZE), C(LIST_SIZE);
+
+  try {
+    cl::sycl::gpu_selector gpsel;
+    ocl_ctx_t ct{gpsel};
+
+    cl::sycl::queue deviceQueue(gpsel);
+    sycl::device Device = deviceQueue.get_device();
+
+    if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) {
+      std::cout << "Skipping test\n";
+      return 0;
+    }
+
+    for (int i = 0; i < LIST_SIZE; i++) {
+      A[i] = i;
+      B[i] = 2;
+    }
+
+    ct.process_buffers(A.data(), B.data(), C.data(), LIST_SIZE);
+
+    for (int i = 0; i < LIST_SIZE; ++i)
+      if (C[i] != A[i] * B[i]) {
+        std::cerr << "At index: " << i << ". ";
+        std::cerr << C[i] << " != " << A[i] * B[i] << "\n";
+        abort();
+      }
+
+    std::cout << "Everything is correct" << std::endl;
+  } catch (cl::sycl::exception const &err) {
+    std::cerr << "ERROR: " << err.what() << ":\n";
+    return -1;
+  }
+}
+
+template <typename T>
+void ocl_ctx_t::process_buffers(T const *pa, T const *pb, T *pc, size_t sz) {
+  cl::sycl::range<1> numOfItems{sz};
+  cl::sycl::buffer<T, 1> bufferA(pa, numOfItems);
+  cl::sycl::buffer<T, 1> bufferB(pb, numOfItems);
+  cl::sycl::buffer<T, 1> bufferC(pc, numOfItems);
+
+  bufferA.set_final_data(nullptr);
+  bufferB.set_final_data(nullptr);
+
+  deviceQueue.submit([&](cl::sycl::handler &cgh) {
+    auto A = bufferA.template get_access<sycl_read>(cgh);
+    auto B = bufferB.template get_access<sycl_read>(cgh);
+    auto C = bufferC.template get_access<sycl_write>(cgh);
+
+    auto kern = [ A, B, C ](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] {
+
+      // declaration of temp within and outside the scope
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+      asm("{\n"
+          ".decl temp v_type=G type=d num_elts=16 align=GRF\n"
+          "mov (M1, 16) temp(0, 0)<1> %1(0, 0)<1;1,0>\n"
+          "mov (M1, 16) %0(0, 0)<1>  temp(0, 0)<1;1,0>\n"
+          "}\n"
+          ".decl temp v_type=G type=d num_elts=16 align=GRF\n"
+          "mul (M1, 16) temp(0, 0)<1> %2(0, 0)<1;1,0> %0(0, 0)<1;1,0>\n"
+          "mov (M1, 16) %0(0, 0)<1>  temp(0, 0)<1;1,0>\n"
+          : "+rw"(C[wiID])
+          : "rw"(A[wiID]), "rw"(B[wiID]));
+#else
+      C[wiID] = A[wiID];
+      C[wiID] *= B[wiID];
+#endif
+    };
+    cgh.parallel_for<class decl_in_scope_kernel<T>>(numOfItems, kern);
+  });
+}
diff --git a/sycl/test/inline-asm/asm_float_add.cpp b/sycl/test/inline-asm/asm_float_add.cpp
new file mode 100644
index 000000000000..4e58bb3b12d3
--- /dev/null
+++ b/sycl/test/inline-asm/asm_float_add.cpp
@@ -0,0 +1,94 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+
+#include "include/asmcheck.h"
+#include <CL/sycl.hpp>
+#include <cmath>
+#include <iostream>
+#include <vector>
+
+constexpr auto sycl_read = cl::sycl::access::mode::read;
+constexpr auto sycl_write = cl::sycl::access::mode::write;
+
+constexpr int LIST_SIZE = 16;
+using arr_t = std::vector<cl::sycl::cl_double>;
+
+// class is used for kernel name
+template <typename T>
+class asm_float_add_kernel;
+
+class ocl_ctx_t {
+  cl::sycl::queue deviceQueue;
+
+public:
+  ocl_ctx_t(const cl::sycl::device_selector &sel) : deviceQueue(sel) {}
+
+  template <typename T>
+  void process_buffers(T const *pa, T const *pb, T *pc, size_t sz);
+};
+
+int main() {
+  arr_t A(LIST_SIZE), B(LIST_SIZE), C(LIST_SIZE);
+
+  try {
+    cl::sycl::gpu_selector gpsel;
+    ocl_ctx_t ct{gpsel};
+
+    cl::sycl::queue deviceQueue(gpsel);
+    sycl::device Device = deviceQueue.get_device();
+
+    if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) {
+      std::cout << "Skipping test\n";
+      return 0;
+    }
+
+    for (int i = 0; i < LIST_SIZE; i++) {
+      A[i] = (double)1 / std::pow(2, i);
+      B[i] = (double)2 / std::pow(2, i);
+    }
+
+    ct.process_buffers(A.data(), B.data(), C.data(), LIST_SIZE);
+
+    for (int i = 0; i < LIST_SIZE; i++)
+      if (C[i] != A[i] + B[i]) {
+        std::cerr << "At index: " << i << ". ";
+        std::cerr << C[i] << " != " << A[i] + B[i] << "\n";
+        abort();
+      }
+
+    std::cout << "Everything is correct" << std::endl;
+  } catch (cl::sycl::exception const &err) {
+    std::cerr << "ERROR: " << err.what() << ":\n";
+    return -1;
+  }
+}
+
+template <typename T>
+void ocl_ctx_t::process_buffers(T const *pa, T const *pb, T *pc, size_t sz) {
+  cl::sycl::range<1> numOfItems{sz};
+  cl::sycl::buffer<T, 1> bufferA(pa, numOfItems);
+  cl::sycl::buffer<T, 1> bufferB(pb, numOfItems);
+  cl::sycl::buffer<T, 1> bufferC(pc, numOfItems);
+
+  bufferA.set_final_data(nullptr);
+  bufferB.set_final_data(nullptr);
+
+  deviceQueue.submit([&](cl::sycl::handler &cgh) {
+    auto A = bufferA.template get_access<sycl_read>(cgh);
+    auto B = bufferB.template get_access<sycl_read>(cgh);
+    auto C = bufferC.template get_access<sycl_write>(cgh);
+
+    auto kern = [ A, B, C ](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] {
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+      asm("add (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>"
+          : "=rw"(C[wiID])
+          : "rw"(A[wiID]), "rw"(B[wiID]));
+#else
+      C[wiID] = A[wiID] + B[wiID];
+#endif
+    };
+    cgh.parallel_for<class asm_float_add_kernel<T>>(numOfItems, kern);
+  });
+}
diff --git a/sycl/test/inline-asm/asm_float_imm_arg.cpp b/sycl/test/inline-asm/asm_float_imm_arg.cpp
new file mode 100644
index 000000000000..2b3f05772953
--- /dev/null
+++ b/sycl/test/inline-asm/asm_float_imm_arg.cpp
@@ -0,0 +1,89 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+
+#include "include/asmcheck.h"
+#include <CL/sycl.hpp>
+#include <cmath>
+#include <iostream>
+#include <vector>
+constexpr auto sycl_read = cl::sycl::access::mode::read;
+constexpr auto sycl_write = cl::sycl::access::mode::write;
+
+constexpr int LIST_SIZE = 8;
+constexpr double IMM_ARGUMENT = 0.5;
+
+using arr_t = std::vector<cl::sycl::cl_double>;
+
+// class is used for kernel name
+template <typename T>
+class float_imm_arg_kernel;
+
+class ocl_ctx_t {
+  cl::sycl::queue deviceQueue;
+
+public:
+  ocl_ctx_t(const cl::sycl::device_selector &sel) : deviceQueue(sel) {}
+
+  template <typename T>
+  void process_buffers(const T *pa, T *pb, size_t sz);
+};
+
+int main() {
+  arr_t A(LIST_SIZE), B(LIST_SIZE);
+
+  try {
+    cl::sycl::gpu_selector gpsel;
+    ocl_ctx_t ct{gpsel};
+
+    cl::sycl::queue deviceQueue(gpsel);
+    sycl::device Device = deviceQueue.get_device();
+
+    if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) {
+      std::cout << "Skipping test\n";
+      return 0;
+    }
+
+    for (int i = 0; i < LIST_SIZE; i++)
+      A[i] = (double)1 / std::pow(2, i);
+
+    ct.process_buffers(A.data(), B.data(), LIST_SIZE);
+
+    for (int i = 0; i < LIST_SIZE; ++i) {
+      if (B[i] != A[i] * IMM_ARGUMENT) {
+        std::cerr << "At index: " << i << ". ";
+        std::cerr << B[i] << " != " << A[i] * IMM_ARGUMENT << "\n";
+        abort();
+      }
+    }
+
+    std::cout << "Everything is correct" << std::endl;
+  } catch (cl::sycl::exception const &err) {
+    std::cerr << "ERROR: " << err.what() << ":\n";
+    return -1;
+  }
+}
+
+template <typename T>
+void ocl_ctx_t::process_buffers(const T *pa, T *pb, size_t sz) {
+  cl::sycl::range<1> numOfItems{sz};
+  cl::sycl::buffer<T, 1> bufferA(pa, numOfItems);
+  cl::sycl::buffer<T, 1> bufferB(pb, numOfItems);
+
+  deviceQueue.submit([&](cl::sycl::handler &cgh) {
+    auto A = bufferA.template get_access<sycl_read>(cgh);
+    auto B = bufferB.template get_access<sycl_write>(cgh);
+
+    auto kern = [ A, B ](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] {
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+      asm("mul (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0> %2"
+          : "=rw"(B[wiID])
+          : "rw"(A[wiID]), "rw"(IMM_ARGUMENT));
+#else
+      B[wiID] = A[wiID] * IMM_ARGUMENT;
+#endif
+    };
+    cgh.parallel_for<class float_imm_arg_kernel<T>>(numOfItems, kern);
+  });
+}
diff --git a/sycl/test/inline-asm/asm_float_neg.cpp b/sycl/test/inline-asm/asm_float_neg.cpp
new file mode 100644
index 000000000000..f96ea348c468
--- /dev/null
+++ b/sycl/test/inline-asm/asm_float_neg.cpp
@@ -0,0 +1,89 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+
+#include "include/asmcheck.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+constexpr auto sycl_read = cl::sycl::access::mode::read;
+constexpr auto sycl_write = cl::sycl::access::mode::write;
+
+constexpr int LIST_SIZE = 1024;
+
+using arr_t = std::vector<cl::sycl::cl_float>;
+
+// class is used for kernel name
+template <typename T>
+class float_neg_kernel;
+
+class ocl_ctx_t {
+  cl::sycl::queue deviceQueue;
+
+public:
+  ocl_ctx_t(const cl::sycl::device_selector &sel) : deviceQueue(sel) {}
+
+  template <typename T>
+  void process_buffers(const T *pa, T *pb, size_t sz);
+};
+
+int main() {
+  arr_t A(LIST_SIZE), B(LIST_SIZE);
+
+  try {
+    cl::sycl::gpu_selector gpsel;
+    ocl_ctx_t ct{gpsel};
+
+    cl::sycl::queue deviceQueue(gpsel);
+    sycl::device Device = deviceQueue.get_device();
+
+    if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) {
+      std::cout << "Skipping test\n";
+      return 0;
+    }
+
+    for (int i = 0; i < LIST_SIZE; i++)
+      A[i] = 1.0 / i;
+
+    ct.process_buffers(A.data(), B.data(), LIST_SIZE);
+
+    for (int i = 0; i < LIST_SIZE; ++i)
+      if (B[i] != -A[i]) {
+        std::cerr << "At index: " << i << ". ";
+        std::cerr << B[i] << " != " << -A[i] << "\n";
+        abort();
+      }
+
+    std::cout << "Everything is correct" << std::endl;
+  } catch (cl::sycl::exception const &err) {
+    std::cerr << "ERROR: " << err.what() << ":\n";
+    return -1;
+  }
+}
+
+template <typename T>
+void ocl_ctx_t::process_buffers(const T *pa, T *pb, size_t sz) {
+  cl::sycl::range<1> numOfItems{sz};
+  cl::sycl::buffer<T, 1> bufferA(pa, numOfItems);
+  cl::sycl::buffer<T, 1> bufferB(pb, numOfItems);
+
+  bufferA.set_final_data(nullptr);
+
+  deviceQueue.submit([&](cl::sycl::handler &cgh) {
+    auto A = bufferA.template get_access<sycl_read>(cgh);
+    auto B = bufferB.template get_access<sycl_write>(cgh);
+
+    auto kern = [ A, B ](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] {
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+      asm("mov (M1, 8) %0(0, 0)<1> (-)%1(0, 0)<1;1,0>"
+          : "=rw"(B[wiID])
+          : "rw"(A[wiID]));
+#else
+      B[wiID] = -A[wiID];
+#endif
+    };
+    cgh.parallel_for<class float_neg_kernel<T>>(numOfItems, kern);
+  });
+}
diff --git a/sycl/test/inline-asm/asm_imm_arg.cpp b/sycl/test/inline-asm/asm_imm_arg.cpp
new file mode 100644
index 000000000000..361bcc4c48d1
--- /dev/null
+++ b/sycl/test/inline-asm/asm_imm_arg.cpp
@@ -0,0 +1,88 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+
+#include "include/asmcheck.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+constexpr auto sycl_read = cl::sycl::access::mode::read;
+constexpr auto sycl_write = cl::sycl::access::mode::write;
+
+constexpr int LIST_SIZE = 1024 * 1024;
+constexpr int CONST_ARGUMENT = 0xabc;
+
+using arr_t = std::vector<cl::sycl::cl_int>;
+
+// class is used for kernel name
+template <typename T>
+class const_asm_arg_kernel;
+
+class ocl_ctx_t {
+  cl::sycl::queue deviceQueue;
+
+public:
+  ocl_ctx_t(const cl::sycl::device_selector &sel) : deviceQueue(sel) {}
+
+  template <typename T>
+  void process_buffers(const T *pa, T *pb, size_t sz);
+};
+
+int main() {
+  arr_t A(LIST_SIZE), B(LIST_SIZE);
+
+  try {
+    cl::sycl::gpu_selector gpsel;
+    ocl_ctx_t ct{gpsel};
+
+    cl::sycl::queue deviceQueue(gpsel);
+    sycl::device Device = deviceQueue.get_device();
+
+    if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) {
+      std::cout << "Skipping test\n";
+      return 0;
+    }
+
+    for (int i = 0; i < LIST_SIZE; i++)
+      A[i] = i;
+
+    ct.process_buffers(A.data(), B.data(), LIST_SIZE);
+
+    for (int i = 0; i < LIST_SIZE; ++i)
+      if (B[i] != A[i] + CONST_ARGUMENT) {
+        std::cerr << "At index: " << i << ". ";
+        std::cerr << B[i] << " != " << A[i] + CONST_ARGUMENT << "\n";
+        abort();
+      }
+
+    std::cout << "Everything is correct" << std::endl;
+  } catch (cl::sycl::exception const &err) {
+    std::cerr << "ERROR: " << err.what() << ":\n";
+    return -1;
+  }
+}
+
+template <typename T>
+void ocl_ctx_t::process_buffers(const T *pa, T *pb, size_t sz) {
+  cl::sycl::range<1> numOfItems{sz};
+  cl::sycl::buffer<T, 1> bufferA(pa, numOfItems);
+  cl::sycl::buffer<T, 1> bufferB(pb, numOfItems);
+
+  deviceQueue.submit([&](cl::sycl::handler &cgh) {
+    auto A = bufferA.template get_access<sycl_read>(cgh);
+    auto B = bufferB.template get_access<sycl_write>(cgh);
+
+    auto kern = [ A, B ](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] {
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+      asm("add (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0> %2"
+          : "=rw"(B[wiID])
+          : "rw"(A[wiID]), "rw"(CONST_ARGUMENT));
+#else
+      B[wiID] = A[wiID] + CONST_ARGUMENT;
+#endif
+    };
+    cgh.parallel_for<class const_asm_arg_kernel<T>>(numOfItems, kern);
+  });
+}
diff --git a/sycl/test/inline-asm/asm_mul.cpp b/sycl/test/inline-asm/asm_mul.cpp
new file mode 100644
index 000000000000..15fd7126e7d2
--- /dev/null
+++ b/sycl/test/inline-asm/asm_mul.cpp
@@ -0,0 +1,93 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+
+#include "include/asmcheck.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+constexpr auto sycl_read = cl::sycl::access::mode::read;
+constexpr auto sycl_write = cl::sycl::access::mode::write;
+
+constexpr int LIST_SIZE = 1024 * 1024;
+using arr_t = std::vector<cl::sycl::cl_int>;
+
+// class is used for kernel name
+template <typename T>
+class vector_mul;
+
+class ocl_ctx_t {
+  cl::sycl::queue deviceQueue;
+
+public:
+  ocl_ctx_t(const cl::sycl::device_selector &sel) : deviceQueue(sel) {}
+
+  template <typename T>
+  void process_buffers(T const *pa, T const *pb, T *pc, size_t sz);
+};
+
+int main() {
+  arr_t A(LIST_SIZE), B(LIST_SIZE), C(LIST_SIZE);
+
+  try {
+    cl::sycl::gpu_selector gpsel;
+    ocl_ctx_t ct{gpsel};
+
+    cl::sycl::queue deviceQueue(gpsel);
+    sycl::device Device = deviceQueue.get_device();
+
+    if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) {
+      std::cout << "Skipping test\n";
+      return 0;
+    }
+
+    for (int i = 0; i < LIST_SIZE; i++) {
+      A[i] = i;
+      B[i] = LIST_SIZE - i;
+    }
+
+    ct.process_buffers(A.data(), B.data(), C.data(), LIST_SIZE);
+
+    for (int i = 0; i < LIST_SIZE; ++i)
+      if (C[i] != A[i] * B[i]) {
+        std::cerr << "At index: " << i << ". ";
+        std::cerr << C[i] << " != " << A[i] * B[i] << "\n";
+        abort();
+      }
+
+    std::cout << "Everything is correct" << std::endl;
+  } catch (cl::sycl::exception const &err) {
+    std::cerr << "ERROR: " << err.what() << ":\n";
+    return -1;
+  }
+}
+
+template <typename T>
+void ocl_ctx_t::process_buffers(T const *pa, T const *pb, T *pc, size_t sz) {
+  cl::sycl::range<1> numOfItems{sz};
+  cl::sycl::buffer<T, 1> bufferA(pa, numOfItems);
+  cl::sycl::buffer<T, 1> bufferB(pb, numOfItems);
+  cl::sycl::buffer<T, 1> bufferC(pc, numOfItems);
+
+  bufferA.set_final_data(nullptr);
+  bufferB.set_final_data(nullptr);
+
+  deviceQueue.submit([&](cl::sycl::handler &cgh) {
+    auto A = bufferA.template get_access<sycl_read>(cgh);
+    auto B = bufferB.template get_access<sycl_read>(cgh);
+    auto C = bufferC.template get_access<sycl_write>(cgh);
+
+    auto kern = [ A, B, C ](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] {
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+      asm("mul (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>"
+          : "=rw"(C[wiID])
+          : "rw"(A[wiID]), "rw"(B[wiID]));
+#else
+      C[wiID] = A[wiID] * B[wiID];
+#endif
+    };
+    cgh.parallel_for<class vector_mul<T>>(numOfItems, kern);
+  });
+}
diff --git a/sycl/test/inline-asm/asm_multiple_instructions.cpp b/sycl/test/inline-asm/asm_multiple_instructions.cpp
new file mode 100644
index 000000000000..edbd21cfae94
--- /dev/null
+++ b/sycl/test/inline-asm/asm_multiple_instructions.cpp
@@ -0,0 +1,105 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+
+#include "include/asmcheck.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+constexpr auto sycl_read = cl::sycl::access::mode::read;
+constexpr auto sycl_write = cl::sycl::access::mode::write;
+
+constexpr int LIST_SIZE = 1024;
+
+using arr_t = std::vector<cl::sycl::cl_int>;
+
+// class is used for kernel name
+template <typename T>
+class vector_mul;
+
+class ocl_ctx_t {
+  cl::sycl::queue deviceQueue;
+
+public:
+  ocl_ctx_t(const cl::sycl::device_selector &sel) : deviceQueue(sel) {}
+
+  template <typename T>
+  void process_buffers(T const *pa, T const *pb, T const *pc, T *pd, size_t sz);
+};
+
+int main() {
+  arr_t A(LIST_SIZE), B(LIST_SIZE), C(LIST_SIZE), D(LIST_SIZE);
+
+  try {
+    cl::sycl::gpu_selector gpsel;
+    ocl_ctx_t ct{gpsel};
+
+    cl::sycl::queue deviceQueue(gpsel);
+    sycl::device Device = deviceQueue.get_device();
+
+    if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) {
+      std::cout << "Skipping test\n";
+      return 0;
+    }
+
+    for (int i = 0; i < LIST_SIZE; i++) {
+      A[i] = B[i] = i;
+      C[i] = LIST_SIZE - 2 * i; // A[i] + B[i] + C[i] = LIST_SIZE
+    }
+
+    ct.process_buffers(A.data(), B.data(), C.data(), D.data(), LIST_SIZE);
+
+    for (int i = 0; i < LIST_SIZE; ++i)
+      if (D[i] != LIST_SIZE) {
+        std::cerr << "At index: " << i << ". ";
+        std::cerr << D[i] << " != " << LIST_SIZE << "\n";
+        abort();
+      }
+
+    std::cout << "Everything is correct" << std::endl;
+  } catch (cl::sycl::exception const &err) {
+    std::cerr << "ERROR: " << err.what() << ":\n";
+    return -1;
+  }
+}
+
+template <typename T>
+void ocl_ctx_t::process_buffers(T const *pa, T const *pb, T const *pc, T *pd,
+                                size_t sz) {
+  cl::sycl::range<1> numOfItems{sz};
+  cl::sycl::buffer<T, 1> bufferA(pa, numOfItems);
+  cl::sycl::buffer<T, 1> bufferB(pb, numOfItems);
+  cl::sycl::buffer<T, 1> bufferC(pc, numOfItems);
+  cl::sycl::buffer<T, 1> bufferD(pd, numOfItems);
+
+  bufferA.set_final_data(nullptr);
+  bufferB.set_final_data(nullptr);
+  bufferC.set_final_data(nullptr);
+
+  deviceQueue.submit([&](cl::sycl::handler &cgh) {
+    auto A = bufferA.template get_access<sycl_write>(cgh);
+    auto B = bufferB.template get_access<sycl_read>(cgh);
+    auto C = bufferC.template get_access<sycl_read>(cgh);
+    auto D = bufferD.template get_access<sycl_write>(cgh);
+
+    auto kern = [ A, B, C, D ](cl::sycl::id<1> wiID)
+        [[cl::intel_reqd_sub_group_size(8)]] {
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+      asm("{\n"
+          "add (M1, 8) %1(0, 0)<1> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>\n"
+          "add (M1, 8) %1(0, 0)<1> %1(0, 0)<1;1,0> %3(0, 0)<1;1,0>\n"
+          "mov (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0>\n"
+          "}\n"
+          : "=rw"(D[wiID]), "+rw"(A[wiID])
+          : "rw"(B[wiID]), "rw"(C[wiID]));
+#else
+      A[wiID] += B[wiID];
+      A[wiID] += C[wiID];
+      D[wiID] = A[wiID];
+#endif
+    };
+    cgh.parallel_for<class vector_mul<T>>(numOfItems, kern);
+  });
+}
diff --git a/sycl/test/inline-asm/asm_no_operands.cpp b/sycl/test/inline-asm/asm_no_operands.cpp
new file mode 100644
index 000000000000..f7abfd523928
--- /dev/null
+++ b/sycl/test/inline-asm/asm_no_operands.cpp
@@ -0,0 +1,32 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+
+#include "include/asmcheck.h"
+#include <CL/sycl.hpp>
+class no_operands_kernel;
+
+int main() {
+  // Creating SYCL queue
+  cl::sycl::queue Queue;
+  sycl::device Device = Queue.get_device();
+
+  if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+  // Size of index space for kernel
+  cl::sycl::range<1> NumOfWorkItems{15};
+
+  // Submitting command group(work) to queue
+  Queue.submit([&](cl::sycl::handler &cgh) {
+    // Executing kernel
+    cgh.parallel_for<class no_operands_kernel>(
+        NumOfWorkItems, [=](cl::sycl::id<1> WIid) [[cl::intel_reqd_sub_group_size(8)]] {
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          asm("barrier");
+#endif
+        });
+  });
+}
diff --git a/sycl/test/inline-asm/asm_no_output.cpp b/sycl/test/inline-asm/asm_no_output.cpp
new file mode 100644
index 000000000000..2440fd1c17a2
--- /dev/null
+++ b/sycl/test/inline-asm/asm_no_output.cpp
@@ -0,0 +1,78 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+
+#include "include/asmcheck.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+constexpr int LIST_SIZE = 8;
+using arr_t = std::vector<cl::sycl::cl_int>;
+constexpr auto sycl_read = cl::sycl::access::mode::read;
+constexpr auto sycl_write = cl::sycl::access::mode::write;
+
+// class is used for kernel name
+template <typename T>
+class asm_no_output;
+
+template <typename T>
+void process_buffers(cl::sycl::queue &deviceQueue, T *pc, size_t sz) {
+  cl::sycl::range<1> numOfItems{sz};
+  cl::sycl::buffer<T, 1> bufferC(pc, numOfItems);
+
+  deviceQueue.submit([&](cl::sycl::handler &cgh) {
+    auto C = bufferC.template get_access<sycl_read>(cgh);
+
+    auto kern = [C]()
+        [[cl::intel_reqd_sub_group_size(16)]] {
+      volatile int local_var = 47;
+      local_var += C[0];
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+      asm volatile("{\n"
+                   ".decl temp v_type=G type=w num_elts=8 align=GRF\n"
+                   "mov (M1,16) temp(0, 0)<1> %0(0,0)<1;1,0>\n"
+                   "}\n" ::"rw"(local_var));
+#else
+      volatile int temp = 0;
+      temp = local_var;
+#endif
+    };
+    cgh.single_task<class asm_no_output<T>>(kern);
+  });
+};
+
+int main() {
+  arr_t C(LIST_SIZE);
+
+  cl::sycl::gpu_selector gpsel;
+  cl::sycl::queue deviceQueue(gpsel);
+  sycl::device Device = deviceQueue.get_device();
+
+  if (!isInlineASMSupported(Device)) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+
+  for (int i = 0; i < LIST_SIZE; i++) {
+    C[i] = 0;
+  }
+
+  process_buffers(deviceQueue, C.data(), LIST_SIZE);
+
+  bool all_right = true;
+  for (int i = 0; i < LIST_SIZE; ++i)
+    if (C[i] != 0) {
+      std::cerr << "At index: " << i << ". ";
+      std::cerr << C[i] << " != " << 0 << "\n";
+      all_right = false;
+      break;
+    }
+  if (all_right) {
+    std::cout << "Pass" << std::endl;
+    return 0;
+  }
+  std::cout << "Error" << std::endl;
+  return -1;
+}
diff --git a/sycl/test/inline-asm/asm_plus_mod.cpp b/sycl/test/inline-asm/asm_plus_mod.cpp
new file mode 100644
index 000000000000..d460406c71cd
--- /dev/null
+++ b/sycl/test/inline-asm/asm_plus_mod.cpp
@@ -0,0 +1,87 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+
+#include "include/asmcheck.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <vector>
+
+constexpr auto sycl_read = cl::sycl::access::mode::read;
+constexpr auto sycl_write = cl::sycl::access::mode::write;
+
+constexpr int LIST_SIZE = 1024 * 1024;
+using arr_t = std::vector<cl::sycl::cl_int>;
+
+// class is used for kernel name
+template <typename T>
+class plus_mod_kernel;
+
+class ocl_ctx_t {
+  cl::sycl::queue deviceQueue;
+
+public:
+  ocl_ctx_t(const cl::sycl::device_selector &sel) : deviceQueue(sel) {}
+
+  template <typename T>
+  void process_buffers(T const *pa, T *pb, size_t sz);
+};
+
+int main() {
+  arr_t A(LIST_SIZE), B(LIST_SIZE), C(LIST_SIZE);
+
+  try {
+    cl::sycl::gpu_selector gpsel;
+    ocl_ctx_t ct{gpsel};
+    cl::sycl::queue deviceQueue(gpsel);
+    sycl::device Device = deviceQueue.get_device();
+
+    if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) {
+      std::cout << "Skipping test\n";
+      return 0;
+    }
+    for (int i = 0; i < LIST_SIZE; i++) {
+      A[i] = i;
+      B[i] = LIST_SIZE - i;
+      C[i] = A[i] + B[i];
+    }
+
+    ct.process_buffers(A.data(), B.data(), LIST_SIZE);
+
+    for (int i = 0; i < LIST_SIZE; ++i)
+      if (B[i] != C[i]) {
+        std::cerr << "At index: " << i << ". ";
+        std::cerr << B[i] << " != " << C[i] << "\n";
+        abort();
+      }
+
+    std::cout << "Everything is correct" << std::endl;
+  } catch (cl::sycl::exception const &err) {
+    std::cerr << "ERROR: " << err.what() << ":\n";
+    return -1;
+  }
+}
+
+template <typename T>
+void ocl_ctx_t::process_buffers(T const *pa, T *pb, size_t sz) {
+  cl::sycl::range<1> numOfItems{sz};
+  cl::sycl::buffer<T, 1> bufferA(pa, numOfItems);
+  cl::sycl::buffer<T, 1> bufferB(pb, numOfItems);
+
+  deviceQueue.submit([&](cl::sycl::handler &cgh) {
+    auto A = bufferA.template get_access<sycl_read>(cgh);
+    auto B = bufferB.template get_access<sycl_write>(cgh);
+
+    auto kern = [ A, B ](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] {
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+      asm("add (M1, 16) %0(0, 0)<1> %0(0, 0)<1;1,0> %1(0, 0)<1;1,0>"
+          : "+rw"(B[wiID])
+          : "rw"(A[wiID]));
+#else
+      B[wiID] += A[wiID];
+#endif
+    };
+    cgh.parallel_for<class plus_mod_kernel<T>>(numOfItems, kern);
+  });
+}
diff --git a/sycl/test/inline-asm/include/asmcheck.h b/sycl/test/inline-asm/include/asmcheck.h
new file mode 100644
index 000000000000..2031455e214c
--- /dev/null
+++ b/sycl/test/inline-asm/include/asmcheck.h
@@ -0,0 +1,17 @@
+#include <CL/sycl.hpp>
+
+#include <iostream>
+bool isInlineASMSupported(sycl::device Device) {
+
+  sycl::string_class DriverVersion = Device.get_info<sycl::info::device::driver_version>();
+  sycl::string_class DeviceVendorName = Device.get_info<sycl::info::device::vendor>();
+  if (DeviceVendorName.find("Intel") == sycl::string_class::npos)
+    return false;
+  if (DriverVersion.length() < 5)
+    return false;
+  if (DriverVersion[2] != '.')
+    return false;
+  if (std::stoi(DriverVersion.substr(0, 2), nullptr, 10) < 20 || std::stoi(DriverVersion.substr(3, 2), nullptr, 10) < 12)
+    return false;
+  return true;
+}
diff --git a/sycl/test/inline-asm/letter_example.cpp b/sycl/test/inline-asm/letter_example.cpp
new file mode 100644
index 000000000000..5b9bac0e22d2
--- /dev/null
+++ b/sycl/test/inline-asm/letter_example.cpp
@@ -0,0 +1,59 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+
+#include "include/asmcheck.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#define N 100
+using namespace cl::sycl;
+int main() {
+  int *a;
+  queue q;
+  sycl::device Device = q.get_device();
+
+  if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+  auto ctx = q.get_context();
+  a = (int *)malloc_shared(sizeof(int) * N, q.get_device(), ctx);
+  for (int i = 0; i < N; i++)
+    a[i] = i;
+  q.submit([&](handler &cgh) {
+     cgh.parallel_for<class kernel>(
+         range<1>(N), [=](id<1> idx)
+                          [[cl::intel_reqd_sub_group_size(16)]] {
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+                            int i = idx[0];
+                            asm volatile("{\n.decl V52 v_type=G type=d num_elts=16 align=GRF\n"
+                                         "svm_gather.4.1 (M1, 16) %0.0 V52.0\n"
+                                         "add(M1, 16) V52(0, 0)<1> V52(0, 0)<1; 1, 0> 0x1:w\n"
+                                         "svm_scatter.4.1 (M1, 16) %0.0 V52.0\n}"
+                                         :
+                                         : "rw"(&a[i]));
+#else
+                            a[idx[0]]++;
+#endif
+                          });
+   }).wait();
+
+  bool currect = true;
+  for (int i = 0; i < N; i++) {
+    if (a[i] != (i + 1)) {
+      currect = false;
+      std::cerr << "error in a[" << i << "]="
+                << a[i] << "!=" << (i + 1) << std::endl;
+      break;
+    }
+  }
+  if (!currect) {
+    std::cerr << "Error" << std::endl;
+    cl::sycl::free(a, ctx);
+    return -1;
+  }
+  std::cerr << "Pass" << std::endl;
+  cl::sycl::free(a, ctx);
+  return 0;
+}
diff --git a/sycl/test/inline-asm/malloc_shared_32.cpp b/sycl/test/inline-asm/malloc_shared_32.cpp
new file mode 100644
index 000000000000..d521afabaaca
--- /dev/null
+++ b/sycl/test/inline-asm/malloc_shared_32.cpp
@@ -0,0 +1,87 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+
+#include "include/asmcheck.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#define N 1000
+using namespace cl::sycl;
+int main() {
+  int *a;
+  int *b;
+  int *c;
+  queue q;
+
+  sycl::device Device = q.get_device();
+
+  if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+
+  auto ctx = q.get_context();
+  a = (int *)malloc_shared(sizeof(int) * N, q.get_device(), ctx);
+  b = (int *)malloc_shared(sizeof(int) * N, q.get_device(), ctx);
+  c = (int *)malloc_shared(sizeof(int) * N, q.get_device(), ctx);
+  for (int i = 0; i < N; i++) {
+    b[i] = -10;
+    a[i] = i;
+    c[i] = i;
+  }
+  q.submit([&](handler &cgh) {
+     cgh.parallel_for<class kernel>(
+         range<1>(N),
+         [=](id<1> idx)
+             [[cl::intel_reqd_sub_group_size(32)]] {
+               int i = idx[0];
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+               asm volatile(R"a(
+    {
+        .decl V52 v_type=G type=d num_elts=16 align=GRF
+        .decl V53 v_type=G type=d num_elts=16 align=GRF
+        .decl V54 v_type=G type=d num_elts=16 align=GRF
+        .decl V55 v_type=G type=d num_elts=16 align=GRF
+        .decl V56 v_type=G type=d num_elts=16 align=GRF
+        .decl V57 v_type=G type=d num_elts=16 align=GRF
+        svm_gather.4.1 (M1, 16) %2.0 V54.0
+        svm_gather.4.1 (M1, 16) %3.0 V55.0
+        svm_gather.4.1 (M1, 16) %4.0 V56.0
+        svm_gather.4.1 (M1, 16) %5.0 V57.0
+        mul (M1, 16) V52(0,0)<1> V54(0,0)<1;1,0> V56(0,0)<1;1,0>
+        mul (M1, 16) V53(0,0)<1> V55(0,0)<1;1,0> V57(0,0)<1;1,0>
+        svm_scatter.4.1 (M1, 16) %0.0 V52.0
+        svm_scatter.4.1 (M1, 16) %1.0 V53.0
+    }
+    )a" ::"rw"(&b[i]),
+                            "rw"(&b[i] + 16), "rw"(&a[i]), "rw"(&a[i] + 16), "rw"(&c[i]),
+                            "rw"(&c[i] + 16));
+#else
+               b[i] = a[i] * c[i];
+#endif
+             });
+   }).wait();
+
+  bool currect = true;
+  for (int i = 0; i < N; i++) {
+    if (b[i] != a[i] * b[i]) {
+      currect = false;
+      std::cerr << "error in a[" << i << "]="
+                << b[i] << "!=" << a[i] * b[i] << std::endl;
+      break;
+    }
+  }
+  if (!currect) {
+    std::cerr << "Error" << std::endl;
+    cl::sycl::free(a, ctx);
+    cl::sycl::free(b, ctx);
+    cl::sycl::free(c, ctx);
+    return -1;
+  }
+  std::cerr << "Pass" << std::endl;
+  cl::sycl::free(a, ctx);
+  cl::sycl::free(b, ctx);
+  cl::sycl::free(c, ctx);
+  return 0;
+}
diff --git a/sycl/test/inline-asm/malloc_shared_in_out_dif.cpp b/sycl/test/inline-asm/malloc_shared_in_out_dif.cpp
new file mode 100644
index 000000000000..fc0fa139036f
--- /dev/null
+++ b/sycl/test/inline-asm/malloc_shared_in_out_dif.cpp
@@ -0,0 +1,65 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+
+#include "include/asmcheck.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#define N 100
+using namespace cl::sycl;
+int main() {
+  int *a;
+  int *b;
+  queue q;
+
+  sycl::device Device = q.get_device();
+
+  if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+
+  auto ctx = q.get_context();
+  a = (int *)malloc_shared(sizeof(int) * N, q.get_device(), ctx);
+  b = (int *)malloc_shared(sizeof(int) * N, q.get_device(), ctx);
+  for (int i = 0; i < N; i++) {
+    b[i] = -1;
+    a[i] = i;
+  }
+  q.submit([&](handler &cgh) {
+     cgh.parallel_for<class kernel>(
+         range<1>(N), [=](id<1> idx) [[cl::intel_reqd_sub_group_size(16)]] {
+           int i = idx[0];
+           volatile int tmp = a[i];
+           tmp += 1;
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+           asm volatile(" add (M1, 16) %0(0,0)<1> %0(0,0)<1;1,0> %1(0,0)<1;1,0>"
+                        : "+rw"(b[i])
+                        : "rw"(tmp));
+#else
+           b[i] += tmp;
+#endif
+         });
+   }).wait();
+
+  bool currect = true;
+  for (int i = 0; i < N; i++) {
+    if (b[i] != a[i]) {
+      currect = false;
+      std::cerr << "error in a[" << i << "]="
+                << b[i] << "!=" << a[i] << std::endl;
+      break;
+    }
+  }
+  if (!currect) {
+    std::cerr << "Error" << std::endl;
+    cl::sycl::free(a, ctx);
+    cl::sycl::free(b, ctx);
+    return -1;
+  }
+  std::cerr << "Pass" << std::endl;
+  cl::sycl::free(a, ctx);
+  cl::sycl::free(b, ctx);
+  return 0;
+}
diff --git a/sycl/test/inline-asm/malloc_shared_no_input.cpp b/sycl/test/inline-asm/malloc_shared_no_input.cpp
new file mode 100644
index 000000000000..f2db2b8e7c9f
--- /dev/null
+++ b/sycl/test/inline-asm/malloc_shared_no_input.cpp
@@ -0,0 +1,54 @@
+// UNSUPPORTED: cuda
+// REQUIRES: gpu,linux
+// RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
+// RUN: %t.out
+
+#include "include/asmcheck.h"
+#include <CL/sycl.hpp>
+#include <iostream>
+#define N 100
+using namespace cl::sycl;
+int main() {
+  int *a;
+  queue q;
+  sycl::device Device = q.get_device();
+
+  if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+  auto ctx = q.get_context();
+  a = (int *)malloc_shared(sizeof(int) * N, q.get_device(), ctx);
+  for (int i = 0; i < N; i++)
+    a[i] = i;
+  q.submit([&](handler &cgh) {
+     cgh.parallel_for<class kernel>(
+         range<1>(N), [=](id<1> idx) [[cl::intel_reqd_sub_group_size(16)]] {
+           int i = idx[0];
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+           asm volatile("mov (M1, 16) %0(0,0)<1> 0x7:d"
+                        : "=rw"(a[i]));
+#else
+           a[i] = 7;
+#endif
+         });
+   }).wait();
+
+  bool currect = true;
+  for (int i = 0; i < N; i++) {
+    if (a[i] != 7) {
+      currect = false;
+      std::cerr << "error in a[" << i << "]="
+                << a[i] << "!=" << 7 << std::endl;
+      break;
+    }
+  }
+  if (!currect) {
+    std::cerr << "Error" << std::endl;
+    cl::sycl::free(a, ctx);
+    return -1;
+  }
+  std::cerr << "Pass" << std::endl;
+  cl::sycl::free(a, ctx);
+  return 0;
+}

From aa01a1dfafb8913d8050937e5ab96a0b426ce30f Mon Sep 17 00:00:00 2001
From: Alexey Sachkov <alexey.sachkov@intel.com>
Date: Wed, 15 Apr 2020 12:58:57 +0300
Subject: [PATCH 02/10] [SYCL] Refactor inline asm tests to reduce code
 duplication

Signed-off-by: Alexey Sachkov <alexey.sachkov@intel.com>
---
 sycl/test/inline-asm/asm_16_empty.cpp         |  73 +++-------
 sycl/test/inline-asm/asm_16_matrix_mult.cpp   |  75 ++++------
 sycl/test/inline-asm/asm_16_no_input_int.cpp  |  75 ++++------
 sycl/test/inline-asm/asm_16_no_opts.cpp       |  76 ++++------
 sycl/test/inline-asm/asm_8_empty.cpp          |  72 +++-------
 sycl/test/inline-asm/asm_8_no_input_int.cpp   |  77 ++++-------
 .../inline-asm/asm_arbitrary_ops_order.cpp    | 117 ++++++----------
 sycl/test/inline-asm/asm_decl_in_scope.cpp    | 130 +++++++-----------
 sycl/test/inline-asm/asm_float_add.cpp        | 109 +++++----------
 sycl/test/inline-asm/asm_float_imm_arg.cpp    | 105 +++++---------
 sycl/test/inline-asm/asm_float_neg.cpp        | 106 +++++---------
 sycl/test/inline-asm/asm_imm_arg.cpp          | 105 +++++---------
 sycl/test/inline-asm/asm_mul.cpp              | 110 +++++----------
 .../inline-asm/asm_multiple_instructions.cpp  | 124 ++++++-----------
 sycl/test/inline-asm/asm_no_operands.cpp      |  10 +-
 sycl/test/inline-asm/asm_no_output.cpp        |  83 ++++-------
 sycl/test/inline-asm/asm_plus_mod.cpp         | 103 +++++---------
 sycl/test/inline-asm/include/asmcheck.h       |  17 ---
 sycl/test/inline-asm/include/asmhelper.h      | 128 +++++++++++++++++
 sycl/test/inline-asm/letter_example.cpp       |  53 +++----
 sycl/test/inline-asm/malloc_shared_32.cpp     |  41 +++---
 .../inline-asm/malloc_shared_in_out_dif.cpp   |  34 +++--
 .../inline-asm/malloc_shared_no_input.cpp     |  33 +++--
 23 files changed, 729 insertions(+), 1127 deletions(-)
 delete mode 100644 sycl/test/inline-asm/include/asmcheck.h
 create mode 100644 sycl/test/inline-asm/include/asmhelper.h

diff --git a/sycl/test/inline-asm/asm_16_empty.cpp b/sycl/test/inline-asm/asm_16_empty.cpp
index 7b551b55e8a1..ad4285e8ecbd 100644
--- a/sycl/test/inline-asm/asm_16_empty.cpp
+++ b/sycl/test/inline-asm/asm_16_empty.cpp
@@ -2,72 +2,39 @@
 // REQUIRES: gpu,linux
 // RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
 // RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
 
-#include "include/asmcheck.h"
+#include "include/asmhelper.h"
 #include <CL/sycl.hpp>
 #include <iostream>
-#include <string>
 #include <vector>
 
-constexpr int LIST_SIZE = 1024;
-using arr_t = std::vector<cl::sycl::cl_int>;
-constexpr auto sycl_write = cl::sycl::access::mode::write;
+using dataType = cl::sycl::cl_int;
 
-// class is used for kernel name
-template <typename T>
-class no_opts;
+template <typename T = dataType>
+struct KernelFunctor : WithOutputBuffer<T> {
+  KernelFunctor(size_t problem_size) : WithOutputBuffer<T>(problem_size) {}
 
-template <typename T>
-void process_buffers(cl::sycl::queue &deviceQueue, T *pc, size_t sz) {
-  cl::sycl::range<1> numOfItems{sz};
-  cl::sycl::buffer<T, 1> bufferC(pc, numOfItems);
-
-  deviceQueue.submit([&](cl::sycl::handler &cgh) {
-    auto C = bufferC.template get_access<sycl_write>(cgh);
-
-    auto kern = [C](cl::sycl::id<1> wiID)
-        [[cl::intel_reqd_sub_group_size(16)]] {
-      C[wiID] = 43;
+  void operator()(cl::sycl::handler &cgh) {
+    auto C = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh);
+    cgh.parallel_for<KernelFunctor<T>>(
+        cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] {
+          C[wiID] = 43;
 #if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
-      asm volatile("");
+          asm volatile("");
 #endif
-    };
-    cgh.parallel_for<class no_opts<T>>(numOfItems, kern);
-  });
+        });
+  }
 };
 
 int main() {
-  arr_t C(LIST_SIZE);
-
-  cl::sycl::gpu_selector gpsel;
-  cl::sycl::queue deviceQueue(gpsel);
-
-  sycl::device Device = deviceQueue.get_device();
-
-  if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) {
-    std::cout << "Skipping test\n";
+  KernelFunctor<> f(DEFAULT_PROBLEM_SIZE);
+  if (!launchInlineASMTest(f))
     return 0;
-  }
 
-  for (int i = 0; i < LIST_SIZE; i++) {
-    C[i] = 0;
-  }
-
-  process_buffers(deviceQueue, C.data(), LIST_SIZE);
-
-  bool all_right = true;
-
-  for (int i = 0; i < LIST_SIZE; ++i)
-    if (C[i] != 43) {
-      std::cerr << "At index: " << i << ". ";
-      std::cerr << C[i] << " != " << 43 << "\n";
-      all_right = false;
-      break;
-    }
-  if (all_right) {
-    std::cout << "Pass" << std::endl;
+  if (verify_all_the_same(f.getOutputBufferData(), 43))
     return 0;
-  }
-  std::cout << "Error" << std::endl;
-  return -1;
+
+  return 1;
 }
diff --git a/sycl/test/inline-asm/asm_16_matrix_mult.cpp b/sycl/test/inline-asm/asm_16_matrix_mult.cpp
index 3de2b2966abc..6ae1debb6748 100644
--- a/sycl/test/inline-asm/asm_16_matrix_mult.cpp
+++ b/sycl/test/inline-asm/asm_16_matrix_mult.cpp
@@ -2,72 +2,43 @@
 // REQUIRES: gpu,linux
 // RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
 // RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
 
-#include "include/asmcheck.h"
+#include "include/asmhelper.h"
 #include <CL/sycl.hpp>
 #include <iostream>
 #include <vector>
 
-constexpr int LIST_SIZE = 8;
-using arr_t = std::vector<cl::sycl::cl_int>;
-constexpr auto sycl_write = cl::sycl::access::mode::write;
+using dataType = cl::sycl::cl_int;
 
-// class is used for kernel name
-template <typename T>
-class simple_vector_add;
+template <typename T = dataType>
+struct KernelFunctor : WithOutputBuffer<T> {
+  KernelFunctor(size_t problem_size) : WithOutputBuffer<T>(problem_size) {}
 
-template <typename T>
-void process_buffers(cl::sycl::queue &deviceQueue, T *pc, size_t sz) {
-  cl::sycl::range<1> numOfItems{sz};
-  cl::sycl::buffer<T, 1> bufferC(pc, numOfItems);
-
-  deviceQueue.submit([&](cl::sycl::handler &cgh) {
-    auto C = bufferC.template get_access<sycl_write>(cgh);
-
-    auto kern = [C](cl::sycl::id<1> wiID)
-        [[cl::intel_reqd_sub_group_size(16)]] {
-      volatile int output = 0;
+  void operator()(cl::sycl::handler &cgh) {
+    auto C = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh);
+    cgh.parallel_for<KernelFunctor<T>>(
+        cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] {
+          volatile int output = 0;
 #if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
-      asm volatile("mov (M1,16) %0(0,0)<1> 0x7:d"
-                   : "=rw"(output));
+          asm volatile("mov (M1,16) %0(0,0)<1> 0x7:d"
+                       : "=rw"(output));
 #else
-      output = 7;
+          output = 7;
 #endif
-      C[wiID] = output;
-    };
-    cgh.parallel_for<class simple_vector_add<T>>(numOfItems, kern);
-  });
+          C[wiID] = output;
+        });
+  }
 };
 
 int main() {
-  arr_t C(LIST_SIZE);
-
-  cl::sycl::gpu_selector gpsel;
-  cl::sycl::queue deviceQueue(gpsel);
-  sycl::device Device = deviceQueue.get_device();
-
-  if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) {
-    std::cout << "Skipping test\n";
+  KernelFunctor<> f(DEFAULT_PROBLEM_SIZE);
+  if (!launchInlineASMTest(f))
     return 0;
-  }
-  for (int i = 0; i < LIST_SIZE; i++) {
-    C[i] = 0;
-  }
-
-  process_buffers(deviceQueue, C.data(), LIST_SIZE);
 
-  bool all_right = true;
-  for (int i = 0; i < LIST_SIZE; ++i)
-    if (C[i] != 7) {
-      std::cerr << "At index: " << i << ". ";
-      std::cerr << C[i] << " != " << 7 << "\n";
-      all_right = false;
-      break;
-    }
-  if (all_right) {
-    std::cout << "Pass" << std::endl;
+  if (verify_all_the_same(f.getOutputBufferData(), 7))
     return 0;
-  }
-  std::cout << "Error" << std::endl;
-  return -1;
+
+  return 1;
 }
diff --git a/sycl/test/inline-asm/asm_16_no_input_int.cpp b/sycl/test/inline-asm/asm_16_no_input_int.cpp
index 3de2b2966abc..6ae1debb6748 100644
--- a/sycl/test/inline-asm/asm_16_no_input_int.cpp
+++ b/sycl/test/inline-asm/asm_16_no_input_int.cpp
@@ -2,72 +2,43 @@
 // REQUIRES: gpu,linux
 // RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
 // RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
 
-#include "include/asmcheck.h"
+#include "include/asmhelper.h"
 #include <CL/sycl.hpp>
 #include <iostream>
 #include <vector>
 
-constexpr int LIST_SIZE = 8;
-using arr_t = std::vector<cl::sycl::cl_int>;
-constexpr auto sycl_write = cl::sycl::access::mode::write;
+using dataType = cl::sycl::cl_int;
 
-// class is used for kernel name
-template <typename T>
-class simple_vector_add;
+template <typename T = dataType>
+struct KernelFunctor : WithOutputBuffer<T> {
+  KernelFunctor(size_t problem_size) : WithOutputBuffer<T>(problem_size) {}
 
-template <typename T>
-void process_buffers(cl::sycl::queue &deviceQueue, T *pc, size_t sz) {
-  cl::sycl::range<1> numOfItems{sz};
-  cl::sycl::buffer<T, 1> bufferC(pc, numOfItems);
-
-  deviceQueue.submit([&](cl::sycl::handler &cgh) {
-    auto C = bufferC.template get_access<sycl_write>(cgh);
-
-    auto kern = [C](cl::sycl::id<1> wiID)
-        [[cl::intel_reqd_sub_group_size(16)]] {
-      volatile int output = 0;
+  void operator()(cl::sycl::handler &cgh) {
+    auto C = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh);
+    cgh.parallel_for<KernelFunctor<T>>(
+        cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] {
+          volatile int output = 0;
 #if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
-      asm volatile("mov (M1,16) %0(0,0)<1> 0x7:d"
-                   : "=rw"(output));
+          asm volatile("mov (M1,16) %0(0,0)<1> 0x7:d"
+                       : "=rw"(output));
 #else
-      output = 7;
+          output = 7;
 #endif
-      C[wiID] = output;
-    };
-    cgh.parallel_for<class simple_vector_add<T>>(numOfItems, kern);
-  });
+          C[wiID] = output;
+        });
+  }
 };
 
 int main() {
-  arr_t C(LIST_SIZE);
-
-  cl::sycl::gpu_selector gpsel;
-  cl::sycl::queue deviceQueue(gpsel);
-  sycl::device Device = deviceQueue.get_device();
-
-  if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) {
-    std::cout << "Skipping test\n";
+  KernelFunctor<> f(DEFAULT_PROBLEM_SIZE);
+  if (!launchInlineASMTest(f))
     return 0;
-  }
-  for (int i = 0; i < LIST_SIZE; i++) {
-    C[i] = 0;
-  }
-
-  process_buffers(deviceQueue, C.data(), LIST_SIZE);
 
-  bool all_right = true;
-  for (int i = 0; i < LIST_SIZE; ++i)
-    if (C[i] != 7) {
-      std::cerr << "At index: " << i << ". ";
-      std::cerr << C[i] << " != " << 7 << "\n";
-      all_right = false;
-      break;
-    }
-  if (all_right) {
-    std::cout << "Pass" << std::endl;
+  if (verify_all_the_same(f.getOutputBufferData(), 7))
     return 0;
-  }
-  std::cout << "Error" << std::endl;
-  return -1;
+
+  return 1;
 }
diff --git a/sycl/test/inline-asm/asm_16_no_opts.cpp b/sycl/test/inline-asm/asm_16_no_opts.cpp
index cc2df932a7d8..4b6d5146fd6b 100644
--- a/sycl/test/inline-asm/asm_16_no_opts.cpp
+++ b/sycl/test/inline-asm/asm_16_no_opts.cpp
@@ -2,74 +2,44 @@
 // REQUIRES: gpu,linux
 // RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
 // RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
 
-#include "include/asmcheck.h"
+#include "include/asmhelper.h"
 #include <CL/sycl.hpp>
 #include <iostream>
 #include <vector>
 
-constexpr int LIST_SIZE = 1024;
-using arr_t = std::vector<cl::sycl::cl_int>;
-constexpr auto sycl_write = cl::sycl::access::mode::write;
+using dataType = cl::sycl::cl_int;
 
-// class is used for kernel name
-template <typename T>
-class simple_vector_add;
+template <typename T = dataType>
+struct KernelFunctor : WithOutputBuffer<T> {
+  KernelFunctor(size_t problem_size) : WithOutputBuffer<T>(problem_size) {}
 
-template <typename T>
-void process_buffers(cl::sycl::queue &deviceQueue, T *pc, size_t sz) {
-  cl::sycl::range<1> numOfItems{sz};
-  cl::sycl::buffer<T, 1> bufferC(pc, numOfItems);
-
-  deviceQueue.submit([&](cl::sycl::handler &cgh) {
-    auto C = bufferC.template get_access<sycl_write>(cgh);
-
-    auto kern = [C](cl::sycl::id<1> wiID)
-        [[cl::intel_reqd_sub_group_size(16)]] {
-      for (int i = 0; i < 10; ++i) {
+  void operator()(cl::sycl::handler &cgh) {
+    auto C = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh);
+    cgh.parallel_for<KernelFunctor<T>>(
+        cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] {
+          for (int i = 0; i < 10; ++i) {
 #if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
-        asm("fence_sw");
-        C[wiID] += i;
+            asm("fence_sw");
+            C[wiID] += i;
 
 #else
-        C[wiID] += i;
+            C[wiID] += i;
 #endif
-      }
-    };
-    cgh.parallel_for<class simple_vector_add<T>>(numOfItems, kern);
-  });
+          }
+        });
+  }
 };
 
 int main() {
-  arr_t C(LIST_SIZE);
-
-  cl::sycl::gpu_selector gpsel;
-  cl::sycl::queue deviceQueue(gpsel);
-  sycl::device Device = deviceQueue.get_device();
-
-  if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) {
-    std::cout << "Skipping test\n";
+  KernelFunctor<> f(DEFAULT_PROBLEM_SIZE);
+  if (!launchInlineASMTest(f))
     return 0;
-  }
-  for (int i = 0; i < LIST_SIZE; i++) {
-    C[i] = 0;
-  }
-
-  process_buffers(deviceQueue, C.data(), LIST_SIZE);
-
-  bool all_right = true;
 
-  for (int i = 0; i < LIST_SIZE; ++i)
-    if (C[i] != 45) {
-      std::cerr << "At index: " << i << ". ";
-      std::cerr << C[i] << " != " << 45 << "\n";
-      all_right = false;
-      break;
-    }
-  if (all_right) {
-    std::cout << "Pass" << std::endl;
+  if (verify_all_the_same(f.getOutputBufferData(), 45))
     return 0;
-  }
-  std::cout << "Error" << std::endl;
-  return -1;
+
+  return 1;
 }
diff --git a/sycl/test/inline-asm/asm_8_empty.cpp b/sycl/test/inline-asm/asm_8_empty.cpp
index a7f0b0bc6d4b..97fae0ed4eb2 100644
--- a/sycl/test/inline-asm/asm_8_empty.cpp
+++ b/sycl/test/inline-asm/asm_8_empty.cpp
@@ -2,71 +2,39 @@
 // REQUIRES: gpu,linux
 // RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
 // RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
 
-#include "include/asmcheck.h"
+#include "include/asmhelper.h"
 #include <CL/sycl.hpp>
 #include <iostream>
 #include <vector>
 
-constexpr int LIST_SIZE = 1024;
-using arr_t = std::vector<cl::sycl::cl_int>;
-constexpr auto sycl_write = cl::sycl::access::mode::write;
+using dataType = cl::sycl::cl_int;
 
-// class is used for kernel name
-template <typename T>
-class no_opts;
+template <typename T = dataType>
+struct KernelFunctor : WithOutputBuffer<T> {
+  KernelFunctor(size_t problem_size) : WithOutputBuffer<T>(problem_size) {}
 
-template <typename T>
-void process_buffers(cl::sycl::queue &deviceQueue, T *pc, size_t sz) {
-  cl::sycl::range<1> numOfItems{sz};
-  cl::sycl::buffer<T, 1> bufferC(pc, numOfItems);
-
-  deviceQueue.submit([&](cl::sycl::handler &cgh) {
-    auto C = bufferC.template get_access<sycl_write>(cgh);
-
-    auto kern = [C](cl::sycl::id<1> wiID)
-        [[cl::intel_reqd_sub_group_size(8)]] {
-      C[wiID] = 43;
+  void operator()(cl::sycl::handler &cgh) {
+    auto C = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh);
+    cgh.parallel_for<KernelFunctor<T>>(
+        cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] {
+          C[wiID] = 43;
 #if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
-      asm volatile("");
+          asm volatile("");
 #endif
-    };
-    cgh.parallel_for<class no_opts<T>>(numOfItems, kern);
-  });
+        });
+  }
 };
 
 int main() {
-  arr_t C(LIST_SIZE);
-
-  cl::sycl::gpu_selector gpsel;
-  cl::sycl::queue deviceQueue(gpsel);
-
-  sycl::device Device = deviceQueue.get_device();
-
-  if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) {
-    std::cout << "Skipping test\n";
+  KernelFunctor<> f(DEFAULT_PROBLEM_SIZE);
+  if (!launchInlineASMTest(f))
     return 0;
-  }
 
-  for (int i = 0; i < LIST_SIZE; i++) {
-    C[i] = 0;
-  }
-
-  process_buffers(deviceQueue, C.data(), LIST_SIZE);
-
-  bool all_right = true;
-
-  for (int i = 0; i < LIST_SIZE; ++i)
-    if (C[i] != 43) {
-      std::cerr << "At index: " << i << ". ";
-      std::cerr << C[i] << " != " << 43 << "\n";
-      all_right = false;
-      break;
-    }
-  if (all_right) {
-    std::cout << "Pass" << std::endl;
+  if (verify_all_the_same(f.getOutputBufferData(), 43))
     return 0;
-  }
-  std::cout << "Error" << std::endl;
-  return -1;
+
+  return 1;
 }
diff --git a/sycl/test/inline-asm/asm_8_no_input_int.cpp b/sycl/test/inline-asm/asm_8_no_input_int.cpp
index a370f77b3ada..6d1dcbb832cf 100644
--- a/sycl/test/inline-asm/asm_8_no_input_int.cpp
+++ b/sycl/test/inline-asm/asm_8_no_input_int.cpp
@@ -2,74 +2,43 @@
 // REQUIRES: gpu,linux
 // RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
 // RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
 
-#include "include/asmcheck.h"
+#include "include/asmhelper.h"
 #include <CL/sycl.hpp>
 #include <iostream>
 #include <vector>
 
-constexpr int LIST_SIZE = 8;
-using arr_t = std::vector<cl::sycl::cl_int>;
-constexpr auto sycl_write = cl::sycl::access::mode::write;
+using dataType = cl::sycl::cl_int;
 
-// class is used for kernel name
-template <typename T>
-class simple_vector_add;
+template <typename T = dataType>
+struct KernelFunctor : WithOutputBuffer<T> {
+  KernelFunctor(size_t problem_size) : WithOutputBuffer<T>(problem_size) {}
 
-template <typename T>
-void process_buffers(cl::sycl::queue &deviceQueue, T *pc, size_t sz) {
-  cl::sycl::range<1> numOfItems{sz};
-  cl::sycl::buffer<T, 1> bufferC(pc, numOfItems);
-
-  deviceQueue.submit([&](cl::sycl::handler &cgh) {
-    auto C = bufferC.template get_access<sycl_write>(cgh);
-
-    auto kern = [C](cl::sycl::id<1> wiID)
-        [[cl::intel_reqd_sub_group_size(8)]] {
-      volatile int output = 0;
+  void operator()(cl::sycl::handler &cgh) {
+    auto C = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh);
+    cgh.parallel_for<KernelFunctor<T>>(
+        cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] {
+          volatile int output = 0;
 #if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
-      asm volatile("mov (M1,8) %0(0,0)<1> 0x7:d"
-                   : "=rw"(output));
+          asm volatile("mov (M1,8) %0(0,0)<1> 0x7:d"
+                       : "=rw"(output));
 #else
-      output = 7;
+          output = 7;
 #endif
-      C[wiID] = output;
-    };
-    cgh.parallel_for<class simple_vector_add<T>>(numOfItems, kern);
-  });
+          C[wiID] = output;
+        });
+  }
 };
 
 int main() {
-  arr_t C(LIST_SIZE);
-
-  cl::sycl::gpu_selector gpsel;
-  cl::sycl::queue deviceQueue(gpsel);
-
-  for (int i = 0; i < LIST_SIZE; i++) {
-    C[i] = 0;
-  }
-
-  sycl::device Device = deviceQueue.get_device();
-
-  if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) {
-    std::cout << "Skipping test\n";
+  KernelFunctor<> f(DEFAULT_PROBLEM_SIZE);
+  if (!launchInlineASMTest(f))
     return 0;
-  }
 
-  process_buffers(deviceQueue, C.data(), LIST_SIZE);
-
-  bool all_right = true;
-  for (int i = 0; i < LIST_SIZE; ++i)
-    if (C[i] != 7) {
-      std::cerr << "At index: " << i << ". ";
-      std::cerr << C[i] << " != " << 7 << "\n";
-      all_right = false;
-      break;
-    }
-  if (all_right) {
-    std::cout << "Pass" << std::endl;
+  if (verify_all_the_same(f.getOutputBufferData(), 7))
     return 0;
-  }
-  std::cout << "Error" << std::endl;
-  return -1;
+
+  return 1;
 }
diff --git a/sycl/test/inline-asm/asm_arbitrary_ops_order.cpp b/sycl/test/inline-asm/asm_arbitrary_ops_order.cpp
index 1b75b63d0027..28d0af1d455b 100644
--- a/sycl/test/inline-asm/asm_arbitrary_ops_order.cpp
+++ b/sycl/test/inline-asm/asm_arbitrary_ops_order.cpp
@@ -2,97 +2,58 @@
 // REQUIRES: gpu,linux
 // RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
 // RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
 
-#include "include/asmcheck.h"
+#include "include/asmhelper.h"
 #include <CL/sycl.hpp>
 #include <iostream>
 #include <vector>
 
-constexpr auto sycl_read = cl::sycl::access::mode::read;
-constexpr auto sycl_write = cl::sycl::access::mode::write;
+using dataType = cl::sycl::cl_int;
 
-constexpr int LIST_SIZE = 1024;
-using arr_t = std::vector<cl::sycl::cl_int>;
+template <typename T = dataType>
+struct KernelFunctor : WithInputBuffers<T, 3>, WithOutputBuffer<T> {
+  KernelFunctor(const std::vector<T> &input1, const std::vector<T> &input2, const std::vector<T> &input3) : WithInputBuffers<T, 3>(input1, input2, input3), WithOutputBuffer<T>(input1.size()) {}
 
-// class is used for kernel name
-template <typename T>
-class vector_mad;
+  void operator()(cl::sycl::handler &cgh) {
+    auto A = this->getInputBuffer(0).template get_access<cl::sycl::access::mode::read>(cgh);
+    auto B = this->getInputBuffer(1).template get_access<cl::sycl::access::mode::read>(cgh);
+    auto C = this->getInputBuffer(2).template get_access<cl::sycl::access::mode::read>(cgh);
+    auto D = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh);
 
-class ocl_ctx_t {
-  cl::sycl::queue deviceQueue;
-
-public:
-  ocl_ctx_t(const cl::sycl::device_selector &sel) : deviceQueue(sel) {}
-
-  template <typename T>
-  void process_buffers(T const *pa, T const *pb, const T *pc, T *pd, size_t sz);
+    cgh.parallel_for<KernelFunctor<T>>(
+        cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] {
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          asm("mad (M1, 8) %0(0, 0)<1> %3(0, 0)<1;1,0> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>"
+              : "=rw"(D[wiID])
+              : "rw"(B[wiID]), "rw"(C[wiID]), "rw"(A[wiID]));
+#else
+          D[wiID] = A[wiID] * B[wiID] + C[wiID];
+#endif
+        });
+  }
 };
 
 int main() {
-  arr_t A(LIST_SIZE), B(LIST_SIZE), C(LIST_SIZE), D(LIST_SIZE);
-
-  try {
-    cl::sycl::gpu_selector gpsel;
-    ocl_ctx_t ct{gpsel};
+  std::vector<dataType> inputA(DEFAULT_PROBLEM_SIZE), inputB(DEFAULT_PROBLEM_SIZE), inputC(DEFAULT_PROBLEM_SIZE);
+  for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) {
+    inputA[i] = i;
+    inputB[i] = i;
+    inputC[i] = DEFAULT_PROBLEM_SIZE - i * i;
+  }
 
-    cl::sycl::queue deviceQueue(gpsel);
-    sycl::device Device = deviceQueue.get_device();
+  KernelFunctor<> f(inputA, inputB, inputC);
+  if (!launchInlineASMTest(f))
+    return 0;
 
-    if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) {
-      std::cout << "Skipping test\n";
-      return 0;
+  auto &D = f.getOutputBufferData();
+  for (int i = 0; i < DEFAULT_PROBLEM_SIZE; ++i) {
+    if (D[i] != inputA[i] * inputB[i] + inputC[i]) {
+      std::cerr << "At index: " << i << ". ";
+      std::cerr << D[i] << " != " << inputA[i] * inputB[i] + inputC[i] << "\n";
+      return 1;
     }
-
-    for (int i = 0; i < LIST_SIZE; i++) {
-      A[i] = i;
-      B[i] = i;
-      C[i] = LIST_SIZE - i * i;
-    }
-
-    ct.process_buffers(A.data(), B.data(), C.data(), D.data(), LIST_SIZE);
-
-    for (int i = 0; i < LIST_SIZE; ++i)
-      if (D[i] != A[i] * B[i] + C[i]) {
-        std::cerr << "At index: " << i << ". ";
-        std::cerr << D[i] << " != " << A[i] * B[i] + C[i] << "\n";
-        abort();
-      }
-
-    std::cout << "Everything is correct" << std::endl;
-  } catch (cl::sycl::exception const &err) {
-    std::cerr << "ERROR: " << err.what() << ":\n";
-    return -1;
   }
-}
-
-template <typename T>
-void ocl_ctx_t::process_buffers(T const *pa, T const *pb, const T *pc, T *pd,
-                                size_t sz) {
-  cl::sycl::range<1> numOfItems{sz};
-  cl::sycl::buffer<T, 1> bufferA(pa, numOfItems);
-  cl::sycl::buffer<T, 1> bufferB(pb, numOfItems);
-  cl::sycl::buffer<T, 1> bufferC(pc, numOfItems);
-  cl::sycl::buffer<T, 1> bufferD(pd, numOfItems);
-
-  bufferA.set_final_data(nullptr);
-  bufferB.set_final_data(nullptr);
-  bufferC.set_final_data(nullptr);
-
-  deviceQueue.submit([&](cl::sycl::handler &cgh) {
-    auto A = bufferA.template get_access<sycl_read>(cgh);
-    auto B = bufferB.template get_access<sycl_read>(cgh);
-    auto C = bufferC.template get_access<sycl_read>(cgh);
-    auto D = bufferD.template get_access<sycl_write>(cgh);
-
-    auto kern = [ A, B, C, D ](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] {
-#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
-      asm("mad (M1, 8) %0(0, 0)<1> %3(0, 0)<1;1,0> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>"
-          : "=rw"(D[wiID])
-          : "rw"(B[wiID]), "rw"(C[wiID]), "rw"(A[wiID]));
-#else
-      D[wiID] = A[wiID] * B[wiID] + C[wiID];
-#endif
-    };
-    cgh.parallel_for<class vector_mad<T>>(numOfItems, kern);
-  });
+  return 0;
 }
diff --git a/sycl/test/inline-asm/asm_decl_in_scope.cpp b/sycl/test/inline-asm/asm_decl_in_scope.cpp
index 2164a602bae6..db30e20f5e9e 100644
--- a/sycl/test/inline-asm/asm_decl_in_scope.cpp
+++ b/sycl/test/inline-asm/asm_decl_in_scope.cpp
@@ -2,102 +2,66 @@
 // REQUIRES: gpu,linux
 // RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
 // RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
 
-#include "include/asmcheck.h"
+#include "include/asmhelper.h"
 #include <CL/sycl.hpp>
 #include <iostream>
 #include <vector>
 
-constexpr auto sycl_read = cl::sycl::access::mode::read;
-constexpr auto sycl_write = cl::sycl::access::mode::write;
+using dataType = cl::sycl::cl_int;
 
-constexpr int LIST_SIZE = 1024 * 1024;
-using arr_t = std::vector<cl::sycl::cl_int>;
+template <typename T = dataType>
+struct KernelFunctor : WithInputBuffers<T, 2>, WithOutputBuffer<T> {
+  KernelFunctor(const std::vector<T> &input1, const std::vector<T> &input2) : WithInputBuffers<T, 2>(input1, input2), WithOutputBuffer<T>(input1.size()) {}
 
-// class is used for kernel name
-template <typename T>
-class decl_in_scope_kernel;
+  void operator()(cl::sycl::handler &cgh) {
+    auto A = this->getInputBuffer(0).template get_access<cl::sycl::access::mode::read>(cgh);
+    auto B = this->getInputBuffer(1).template get_access<cl::sycl::access::mode::read>(cgh);
+    auto C = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh);
 
-class ocl_ctx_t {
-  cl::sycl::queue deviceQueue;
-
-public:
-  ocl_ctx_t(const cl::sycl::device_selector &sel) : deviceQueue(sel) {}
-
-  template <typename T>
-  void process_buffers(T const *pa, T const *pb, T *pc, size_t sz);
+    cgh.parallel_for<KernelFunctor<T>>(
+        cl::sycl::range<1>{this->getOutputBufferSize()},
+        [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] {
+    // declaration of temp within and outside the scope
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          asm("{\n"
+              ".decl temp v_type=G type=d num_elts=16 align=GRF\n"
+              "mov (M1, 16) temp(0, 0)<1> %1(0, 0)<1;1,0>\n"
+              "mov (M1, 16) %0(0, 0)<1>  temp(0, 0)<1;1,0>\n"
+              "}\n"
+              ".decl temp v_type=G type=d num_elts=16 align=GRF\n"
+              "mul (M1, 16) temp(0, 0)<1> %2(0, 0)<1;1,0> %0(0, 0)<1;1,0>\n"
+              "mov (M1, 16) %0(0, 0)<1>  temp(0, 0)<1;1,0>\n"
+              : "+rw"(C[wiID])
+              : "rw"(A[wiID]), "rw"(B[wiID]));
+#else
+          C[wiID] = A[wiID];
+          C[wiID] *= B[wiID];
+#endif
+        });
+  }
 };
 
 int main() {
-  arr_t A(LIST_SIZE), B(LIST_SIZE), C(LIST_SIZE);
-
-  try {
-    cl::sycl::gpu_selector gpsel;
-    ocl_ctx_t ct{gpsel};
+  std::vector<dataType> inputA(DEFAULT_PROBLEM_SIZE), inputB(DEFAULT_PROBLEM_SIZE);
+  for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) {
+    inputA[i] = i;
+    inputB[i] = 2;
+  }
 
-    cl::sycl::queue deviceQueue(gpsel);
-    sycl::device Device = deviceQueue.get_device();
+  KernelFunctor<> f(inputA, inputB);
+  if (!launchInlineASMTest(f))
+    return 0;
 
-    if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) {
-      std::cout << "Skipping test\n";
-      return 0;
+  auto &C = f.getOutputBufferData();
+  for (int i = 0; i < DEFAULT_PROBLEM_SIZE; ++i) {
+    if (C[i] != inputA[i] * inputB[i]) {
+      std::cerr << "At index: " << i << ". ";
+      std::cerr << C[i] << " != " << inputA[i] * inputB[i] << "\n";
+      return 1;
     }
-
-    for (int i = 0; i < LIST_SIZE; i++) {
-      A[i] = i;
-      B[i] = 2;
-    }
-
-    ct.process_buffers(A.data(), B.data(), C.data(), LIST_SIZE);
-
-    for (int i = 0; i < LIST_SIZE; ++i)
-      if (C[i] != A[i] * B[i]) {
-        std::cerr << "At index: " << i << ". ";
-        std::cerr << C[i] << " != " << A[i] * B[i] << "\n";
-        abort();
-      }
-
-    std::cout << "Everything is correct" << std::endl;
-  } catch (cl::sycl::exception const &err) {
-    std::cerr << "ERROR: " << err.what() << ":\n";
-    return -1;
   }
-}
-
-template <typename T>
-void ocl_ctx_t::process_buffers(T const *pa, T const *pb, T *pc, size_t sz) {
-  cl::sycl::range<1> numOfItems{sz};
-  cl::sycl::buffer<T, 1> bufferA(pa, numOfItems);
-  cl::sycl::buffer<T, 1> bufferB(pb, numOfItems);
-  cl::sycl::buffer<T, 1> bufferC(pc, numOfItems);
-
-  bufferA.set_final_data(nullptr);
-  bufferB.set_final_data(nullptr);
-
-  deviceQueue.submit([&](cl::sycl::handler &cgh) {
-    auto A = bufferA.template get_access<sycl_read>(cgh);
-    auto B = bufferB.template get_access<sycl_read>(cgh);
-    auto C = bufferC.template get_access<sycl_write>(cgh);
-
-    auto kern = [ A, B, C ](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] {
-
-      // declaration of temp within and outside the scope
-#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
-      asm("{\n"
-          ".decl temp v_type=G type=d num_elts=16 align=GRF\n"
-          "mov (M1, 16) temp(0, 0)<1> %1(0, 0)<1;1,0>\n"
-          "mov (M1, 16) %0(0, 0)<1>  temp(0, 0)<1;1,0>\n"
-          "}\n"
-          ".decl temp v_type=G type=d num_elts=16 align=GRF\n"
-          "mul (M1, 16) temp(0, 0)<1> %2(0, 0)<1;1,0> %0(0, 0)<1;1,0>\n"
-          "mov (M1, 16) %0(0, 0)<1>  temp(0, 0)<1;1,0>\n"
-          : "+rw"(C[wiID])
-          : "rw"(A[wiID]), "rw"(B[wiID]));
-#else
-      C[wiID] = A[wiID];
-      C[wiID] *= B[wiID];
-#endif
-    };
-    cgh.parallel_for<class decl_in_scope_kernel<T>>(numOfItems, kern);
-  });
+  return 0;
 }
diff --git a/sycl/test/inline-asm/asm_float_add.cpp b/sycl/test/inline-asm/asm_float_add.cpp
index 4e58bb3b12d3..c23b084317c5 100644
--- a/sycl/test/inline-asm/asm_float_add.cpp
+++ b/sycl/test/inline-asm/asm_float_add.cpp
@@ -2,93 +2,58 @@
 // REQUIRES: gpu,linux
 // RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
 // RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
 
-#include "include/asmcheck.h"
+#include "include/asmhelper.h"
 #include <CL/sycl.hpp>
 #include <cmath>
 #include <iostream>
 #include <vector>
 
-constexpr auto sycl_read = cl::sycl::access::mode::read;
-constexpr auto sycl_write = cl::sycl::access::mode::write;
+using dataType = cl::sycl::cl_double;
 
-constexpr int LIST_SIZE = 16;
-using arr_t = std::vector<cl::sycl::cl_double>;
+template <typename T = dataType>
+struct KernelFunctor : WithInputBuffers<T, 2>, WithOutputBuffer<T> {
+  KernelFunctor(const std::vector<T> &input1, const std::vector<T> &input2) : WithInputBuffers<T, 2>(input1, input2), WithOutputBuffer<T>(input1.size()) {}
 
-// class is used for kernel name
-template <typename T>
-class asm_float_add_kernel;
+  void operator()(cl::sycl::handler &cgh) {
+    auto A = this->getInputBuffer(0).template get_access<cl::sycl::access::mode::read>(cgh);
+    auto B = this->getInputBuffer(1).template get_access<cl::sycl::access::mode::read>(cgh);
+    auto C = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh);
 
-class ocl_ctx_t {
-  cl::sycl::queue deviceQueue;
-
-public:
-  ocl_ctx_t(const cl::sycl::device_selector &sel) : deviceQueue(sel) {}
-
-  template <typename T>
-  void process_buffers(T const *pa, T const *pb, T *pc, size_t sz);
+    cgh.parallel_for<KernelFunctor<T>>(
+        cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] {
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          asm("add (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>"
+              : "=rw"(C[wiID])
+              : "rw"(A[wiID]), "rw"(B[wiID]));
+#else
+          C[wiID] = A[wiID] + B[wiID];
+#endif
+        });
+  }
 };
 
 int main() {
-  arr_t A(LIST_SIZE), B(LIST_SIZE), C(LIST_SIZE);
-
-  try {
-    cl::sycl::gpu_selector gpsel;
-    ocl_ctx_t ct{gpsel};
-
-    cl::sycl::queue deviceQueue(gpsel);
-    sycl::device Device = deviceQueue.get_device();
+  std::vector<dataType> inputA(DEFAULT_PROBLEM_SIZE), inputB(DEFAULT_PROBLEM_SIZE);
+  for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) {
+    inputA[i] = (double)1 / std::pow(2, i);
+    inputB[i] = (double)2 / std::pow(2, i);
+  }
 
-    if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) {
-      std::cout << "Skipping test\n";
-      return 0;
-    }
+  KernelFunctor<> f(inputA, inputB);
+  if (!launchInlineASMTest(f))
+    return 0;
 
-    for (int i = 0; i < LIST_SIZE; i++) {
-      A[i] = (double)1 / std::pow(2, i);
-      B[i] = (double)2 / std::pow(2, i);
+  auto &C = f.getOutputBufferData();
+  for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) {
+    if (C[i] != inputA[i] + inputB[i]) {
+      std::cerr << "At index: " << i << ". ";
+      std::cerr << C[i] << " != " << inputA[i] + inputB[i] << "\n";
+      return 1;
     }
-
-    ct.process_buffers(A.data(), B.data(), C.data(), LIST_SIZE);
-
-    for (int i = 0; i < LIST_SIZE; i++)
-      if (C[i] != A[i] + B[i]) {
-        std::cerr << "At index: " << i << ". ";
-        std::cerr << C[i] << " != " << A[i] + B[i] << "\n";
-        abort();
-      }
-
-    std::cout << "Everything is correct" << std::endl;
-  } catch (cl::sycl::exception const &err) {
-    std::cerr << "ERROR: " << err.what() << ":\n";
-    return -1;
   }
-}
-
-template <typename T>
-void ocl_ctx_t::process_buffers(T const *pa, T const *pb, T *pc, size_t sz) {
-  cl::sycl::range<1> numOfItems{sz};
-  cl::sycl::buffer<T, 1> bufferA(pa, numOfItems);
-  cl::sycl::buffer<T, 1> bufferB(pb, numOfItems);
-  cl::sycl::buffer<T, 1> bufferC(pc, numOfItems);
 
-  bufferA.set_final_data(nullptr);
-  bufferB.set_final_data(nullptr);
-
-  deviceQueue.submit([&](cl::sycl::handler &cgh) {
-    auto A = bufferA.template get_access<sycl_read>(cgh);
-    auto B = bufferB.template get_access<sycl_read>(cgh);
-    auto C = bufferC.template get_access<sycl_write>(cgh);
-
-    auto kern = [ A, B, C ](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] {
-#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
-      asm("add (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>"
-          : "=rw"(C[wiID])
-          : "rw"(A[wiID]), "rw"(B[wiID]));
-#else
-      C[wiID] = A[wiID] + B[wiID];
-#endif
-    };
-    cgh.parallel_for<class asm_float_add_kernel<T>>(numOfItems, kern);
-  });
+  return 0;
 }
diff --git a/sycl/test/inline-asm/asm_float_imm_arg.cpp b/sycl/test/inline-asm/asm_float_imm_arg.cpp
index 2b3f05772953..c9683cf020f7 100644
--- a/sycl/test/inline-asm/asm_float_imm_arg.cpp
+++ b/sycl/test/inline-asm/asm_float_imm_arg.cpp
@@ -2,88 +2,55 @@
 // REQUIRES: gpu,linux
 // RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
 // RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
 
-#include "include/asmcheck.h"
+#include "include/asmhelper.h"
 #include <CL/sycl.hpp>
 #include <cmath>
 #include <iostream>
 #include <vector>
-constexpr auto sycl_read = cl::sycl::access::mode::read;
-constexpr auto sycl_write = cl::sycl::access::mode::write;
 
-constexpr int LIST_SIZE = 8;
 constexpr double IMM_ARGUMENT = 0.5;
+using dataType = cl::sycl::cl_double;
 
-using arr_t = std::vector<cl::sycl::cl_double>;
+template <typename T = dataType>
+struct KernelFunctor : WithInputBuffers<T, 1>, WithOutputBuffer<T> {
+  KernelFunctor(const std::vector<T> &input) : WithInputBuffers<T, 1>(input), WithOutputBuffer<T>(input.size()) {}
 
-// class is used for kernel name
-template <typename T>
-class float_imm_arg_kernel;
+  void operator()(cl::sycl::handler &cgh) {
+    auto A = this->getInputBuffer(0).template get_access<cl::sycl::access::mode::read>(cgh);
+    auto B = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh);
 
-class ocl_ctx_t {
-  cl::sycl::queue deviceQueue;
-
-public:
-  ocl_ctx_t(const cl::sycl::device_selector &sel) : deviceQueue(sel) {}
-
-  template <typename T>
-  void process_buffers(const T *pa, T *pb, size_t sz);
+    cgh.parallel_for<KernelFunctor<T>>(
+        cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] {
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          asm("mul (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0> %2"
+              : "=rw"(B[wiID])
+              : "rw"(A[wiID]), "rw"(IMM_ARGUMENT));
+#else
+          B[wiID] = A[wiID] * IMM_ARGUMENT;
+#endif
+        });
+  }
 };
 
 int main() {
-  arr_t A(LIST_SIZE), B(LIST_SIZE);
-
-  try {
-    cl::sycl::gpu_selector gpsel;
-    ocl_ctx_t ct{gpsel};
-
-    cl::sycl::queue deviceQueue(gpsel);
-    sycl::device Device = deviceQueue.get_device();
-
-    if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) {
-      std::cout << "Skipping test\n";
-      return 0;
+  std::vector<dataType> input(DEFAULT_PROBLEM_SIZE);
+  for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++)
+    input[i] = (double)1 / std::pow(2, i);
+
+  KernelFunctor<> f(input);
+  if (!launchInlineASMTest(f))
+    return 0;
+
+  auto &B = f.getOutputBufferData();
+  for (int i = 0; i < DEFAULT_PROBLEM_SIZE; ++i) {
+    if (B[i] != input[i] * IMM_ARGUMENT) {
+      std::cerr << "At index: " << i << ". ";
+      std::cerr << B[i] << " != " << input[i] * IMM_ARGUMENT << "\n";
+      return 1;
     }
-
-    for (int i = 0; i < LIST_SIZE; i++)
-      A[i] = (double)1 / std::pow(2, i);
-
-    ct.process_buffers(A.data(), B.data(), LIST_SIZE);
-
-    for (int i = 0; i < LIST_SIZE; ++i) {
-      if (B[i] != A[i] * IMM_ARGUMENT) {
-        std::cerr << "At index: " << i << ". ";
-        std::cerr << B[i] << " != " << A[i] * IMM_ARGUMENT << "\n";
-        abort();
-      }
-    }
-
-    std::cout << "Everything is correct" << std::endl;
-  } catch (cl::sycl::exception const &err) {
-    std::cerr << "ERROR: " << err.what() << ":\n";
-    return -1;
   }
-}
-
-template <typename T>
-void ocl_ctx_t::process_buffers(const T *pa, T *pb, size_t sz) {
-  cl::sycl::range<1> numOfItems{sz};
-  cl::sycl::buffer<T, 1> bufferA(pa, numOfItems);
-  cl::sycl::buffer<T, 1> bufferB(pb, numOfItems);
-
-  deviceQueue.submit([&](cl::sycl::handler &cgh) {
-    auto A = bufferA.template get_access<sycl_read>(cgh);
-    auto B = bufferB.template get_access<sycl_write>(cgh);
-
-    auto kern = [ A, B ](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] {
-#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
-      asm("mul (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0> %2"
-          : "=rw"(B[wiID])
-          : "rw"(A[wiID]), "rw"(IMM_ARGUMENT));
-#else
-      B[wiID] = A[wiID] * IMM_ARGUMENT;
-#endif
-    };
-    cgh.parallel_for<class float_imm_arg_kernel<T>>(numOfItems, kern);
-  });
+  return 0;
 }
diff --git a/sycl/test/inline-asm/asm_float_neg.cpp b/sycl/test/inline-asm/asm_float_neg.cpp
index f96ea348c468..290b0898903e 100644
--- a/sycl/test/inline-asm/asm_float_neg.cpp
+++ b/sycl/test/inline-asm/asm_float_neg.cpp
@@ -2,88 +2,56 @@
 // REQUIRES: gpu,linux
 // RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
 // RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
 
-#include "include/asmcheck.h"
+#include "include/asmhelper.h"
 #include <CL/sycl.hpp>
 #include <iostream>
 #include <vector>
 
-constexpr auto sycl_read = cl::sycl::access::mode::read;
-constexpr auto sycl_write = cl::sycl::access::mode::write;
+using dataType = cl::sycl::cl_float;
 
-constexpr int LIST_SIZE = 1024;
+template <typename T = dataType>
+struct KernelFunctor : WithInputBuffers<T, 1>, WithOutputBuffer<T> {
+  KernelFunctor(const std::vector<T> &input) : WithInputBuffers<T, 1>(input), WithOutputBuffer<T>(input.size()) {}
 
-using arr_t = std::vector<cl::sycl::cl_float>;
+  void operator()(cl::sycl::handler &cgh) {
+    auto A = this->getInputBuffer().template get_access<cl::sycl::access::mode::read>(cgh);
+    auto B = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh);
 
-// class is used for kernel name
-template <typename T>
-class float_neg_kernel;
-
-class ocl_ctx_t {
-  cl::sycl::queue deviceQueue;
-
-public:
-  ocl_ctx_t(const cl::sycl::device_selector &sel) : deviceQueue(sel) {}
+    cgh.parallel_for<KernelFunctor<T>>(
+        cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] {
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          asm("mov (M1, 8) %0(0, 0)<1> (-)%1(0, 0)<1;1,0>"
+              : "=rw"(B[wiID])
+              : "rw"(A[wiID]));
+#else
+          B[wiID] = -A[wiID];
+#endif
+        });
+  }
 
-  template <typename T>
-  void process_buffers(const T *pa, T *pb, size_t sz);
+  size_t problem_size = 0;
 };
 
 int main() {
-  arr_t A(LIST_SIZE), B(LIST_SIZE);
-
-  try {
-    cl::sycl::gpu_selector gpsel;
-    ocl_ctx_t ct{gpsel};
-
-    cl::sycl::queue deviceQueue(gpsel);
-    sycl::device Device = deviceQueue.get_device();
-
-    if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) {
-      std::cout << "Skipping test\n";
-      return 0;
+  std::vector<dataType> input(DEFAULT_PROBLEM_SIZE);
+  for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++)
+    input[i] = 1.0 / i;
+
+  KernelFunctor<> f(input);
+  if (!launchInlineASMTest(f))
+    return 0;
+
+  auto &R = f.getOutputBufferData();
+  for (int i = 0; i < DEFAULT_PROBLEM_SIZE; ++i) {
+    if (R[i] != -input[i]) {
+      std::cerr << "At index: " << i << ". ";
+      std::cerr << R[i] << " != " << -input[i] << "\n";
+      return 1;
     }
-
-    for (int i = 0; i < LIST_SIZE; i++)
-      A[i] = 1.0 / i;
-
-    ct.process_buffers(A.data(), B.data(), LIST_SIZE);
-
-    for (int i = 0; i < LIST_SIZE; ++i)
-      if (B[i] != -A[i]) {
-        std::cerr << "At index: " << i << ". ";
-        std::cerr << B[i] << " != " << -A[i] << "\n";
-        abort();
-      }
-
-    std::cout << "Everything is correct" << std::endl;
-  } catch (cl::sycl::exception const &err) {
-    std::cerr << "ERROR: " << err.what() << ":\n";
-    return -1;
   }
-}
-
-template <typename T>
-void ocl_ctx_t::process_buffers(const T *pa, T *pb, size_t sz) {
-  cl::sycl::range<1> numOfItems{sz};
-  cl::sycl::buffer<T, 1> bufferA(pa, numOfItems);
-  cl::sycl::buffer<T, 1> bufferB(pb, numOfItems);
 
-  bufferA.set_final_data(nullptr);
-
-  deviceQueue.submit([&](cl::sycl::handler &cgh) {
-    auto A = bufferA.template get_access<sycl_read>(cgh);
-    auto B = bufferB.template get_access<sycl_write>(cgh);
-
-    auto kern = [ A, B ](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] {
-#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
-      asm("mov (M1, 8) %0(0, 0)<1> (-)%1(0, 0)<1;1,0>"
-          : "=rw"(B[wiID])
-          : "rw"(A[wiID]));
-#else
-      B[wiID] = -A[wiID];
-#endif
-    };
-    cgh.parallel_for<class float_neg_kernel<T>>(numOfItems, kern);
-  });
+  return 0;
 }
diff --git a/sycl/test/inline-asm/asm_imm_arg.cpp b/sycl/test/inline-asm/asm_imm_arg.cpp
index 361bcc4c48d1..ce9a3ad948a4 100644
--- a/sycl/test/inline-asm/asm_imm_arg.cpp
+++ b/sycl/test/inline-asm/asm_imm_arg.cpp
@@ -2,87 +2,54 @@
 // REQUIRES: gpu,linux
 // RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
 // RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
 
-#include "include/asmcheck.h"
+#include "include/asmhelper.h"
 #include <CL/sycl.hpp>
 #include <iostream>
 #include <vector>
 
-constexpr auto sycl_read = cl::sycl::access::mode::read;
-constexpr auto sycl_write = cl::sycl::access::mode::write;
-
-constexpr int LIST_SIZE = 1024 * 1024;
 constexpr int CONST_ARGUMENT = 0xabc;
+using dataType = cl::sycl::cl_int;
 
-using arr_t = std::vector<cl::sycl::cl_int>;
-
-// class is used for kernel name
-template <typename T>
-class const_asm_arg_kernel;
-
-class ocl_ctx_t {
-  cl::sycl::queue deviceQueue;
+template <typename T = dataType>
+struct KernelFunctor : WithInputBuffers<T, 1>, WithOutputBuffer<T> {
+  KernelFunctor(const std::vector<T> &input) : WithInputBuffers<T, 1>(input), WithOutputBuffer<T>(input.size()) {}
 
-public:
-  ocl_ctx_t(const cl::sycl::device_selector &sel) : deviceQueue(sel) {}
+  void operator()(cl::sycl::handler &cgh) {
+    auto A = this->getInputBuffer(0).template get_access<cl::sycl::access::mode::read>(cgh);
+    auto B = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh);
 
-  template <typename T>
-  void process_buffers(const T *pa, T *pb, size_t sz);
+    cgh.parallel_for<KernelFunctor<T>>(
+        cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] {
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          asm("add (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0> %2"
+              : "=rw"(B[wiID])
+              : "rw"(A[wiID]), "rw"(CONST_ARGUMENT));
+#else
+          B[wiID] = A[wiID] + CONST_ARGUMENT;
+#endif
+        });
+  }
 };
 
 int main() {
-  arr_t A(LIST_SIZE), B(LIST_SIZE);
-
-  try {
-    cl::sycl::gpu_selector gpsel;
-    ocl_ctx_t ct{gpsel};
-
-    cl::sycl::queue deviceQueue(gpsel);
-    sycl::device Device = deviceQueue.get_device();
-
-    if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) {
-      std::cout << "Skipping test\n";
-      return 0;
+  std::vector<dataType> input(DEFAULT_PROBLEM_SIZE);
+  for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++)
+    input[i] = i;
+
+  KernelFunctor<> f(input);
+  if (!launchInlineASMTest(f))
+    return 0;
+
+  auto &B = f.getOutputBufferData();
+  for (int i = 0; i < DEFAULT_PROBLEM_SIZE; ++i) {
+    if (B[i] != input[i] * CONST_ARGUMENT) {
+      std::cerr << "At index: " << i << ". ";
+      std::cerr << B[i] << " != " << input[i] * CONST_ARGUMENT << "\n";
+      return 1;
     }
-
-    for (int i = 0; i < LIST_SIZE; i++)
-      A[i] = i;
-
-    ct.process_buffers(A.data(), B.data(), LIST_SIZE);
-
-    for (int i = 0; i < LIST_SIZE; ++i)
-      if (B[i] != A[i] + CONST_ARGUMENT) {
-        std::cerr << "At index: " << i << ". ";
-        std::cerr << B[i] << " != " << A[i] + CONST_ARGUMENT << "\n";
-        abort();
-      }
-
-    std::cout << "Everything is correct" << std::endl;
-  } catch (cl::sycl::exception const &err) {
-    std::cerr << "ERROR: " << err.what() << ":\n";
-    return -1;
   }
-}
-
-template <typename T>
-void ocl_ctx_t::process_buffers(const T *pa, T *pb, size_t sz) {
-  cl::sycl::range<1> numOfItems{sz};
-  cl::sycl::buffer<T, 1> bufferA(pa, numOfItems);
-  cl::sycl::buffer<T, 1> bufferB(pb, numOfItems);
-
-  deviceQueue.submit([&](cl::sycl::handler &cgh) {
-    auto A = bufferA.template get_access<sycl_read>(cgh);
-    auto B = bufferB.template get_access<sycl_write>(cgh);
-
-    auto kern = [ A, B ](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] {
-#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
-      asm("add (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0> %2"
-          : "=rw"(B[wiID])
-          : "rw"(A[wiID]), "rw"(CONST_ARGUMENT));
-#else
-      B[wiID] = A[wiID] + CONST_ARGUMENT;
-#endif
-    };
-    cgh.parallel_for<class const_asm_arg_kernel<T>>(numOfItems, kern);
-  });
+  return 0;
 }
diff --git a/sycl/test/inline-asm/asm_mul.cpp b/sycl/test/inline-asm/asm_mul.cpp
index 15fd7126e7d2..726abcf787f2 100644
--- a/sycl/test/inline-asm/asm_mul.cpp
+++ b/sycl/test/inline-asm/asm_mul.cpp
@@ -2,92 +2,56 @@
 // REQUIRES: gpu,linux
 // RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
 // RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
 
-#include "include/asmcheck.h"
+#include "include/asmhelper.h"
 #include <CL/sycl.hpp>
 #include <iostream>
 #include <vector>
 
-constexpr auto sycl_read = cl::sycl::access::mode::read;
-constexpr auto sycl_write = cl::sycl::access::mode::write;
+using dataType = cl::sycl::cl_int;
 
-constexpr int LIST_SIZE = 1024 * 1024;
-using arr_t = std::vector<cl::sycl::cl_int>;
+template <typename T = dataType>
+struct KernelFunctor : WithInputBuffers<T, 2>, WithOutputBuffer<T> {
+  KernelFunctor(const std::vector<T> &input1, const std::vector<T> &input2) : WithInputBuffers<T, 2>(input1, input2), WithOutputBuffer<T>(input1.size()) {}
+  void operator()(cl::sycl::handler &cgh) {
+    auto A = this->getInputBuffer(0).template get_access<cl::sycl::access::mode::read>(cgh);
+    auto B = this->getInputBuffer(1).template get_access<cl::sycl::access::mode::read>(cgh);
+    auto C = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh);
 
-// class is used for kernel name
-template <typename T>
-class vector_mul;
-
-class ocl_ctx_t {
-  cl::sycl::queue deviceQueue;
-
-public:
-  ocl_ctx_t(const cl::sycl::device_selector &sel) : deviceQueue(sel) {}
-
-  template <typename T>
-  void process_buffers(T const *pa, T const *pb, T *pc, size_t sz);
+    cgh.parallel_for<KernelFunctor<T>>(
+        cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] {
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          asm("mul (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>"
+              : "=rw"(C[wiID])
+              : "rw"(A[wiID]), "rw"(B[wiID]));
+#else
+          C[wiID] = A[wiID] * B[wiID];
+#endif
+        });
+  }
 };
 
 int main() {
-  arr_t A(LIST_SIZE), B(LIST_SIZE), C(LIST_SIZE);
-
-  try {
-    cl::sycl::gpu_selector gpsel;
-    ocl_ctx_t ct{gpsel};
-
-    cl::sycl::queue deviceQueue(gpsel);
-    sycl::device Device = deviceQueue.get_device();
+  std::vector<dataType> inputA(DEFAULT_PROBLEM_SIZE), inputB(DEFAULT_PROBLEM_SIZE);
+  for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) {
+    inputA[i] = i;
+    inputB[i] = DEFAULT_PROBLEM_SIZE - i;
+  }
 
-    if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) {
-      std::cout << "Skipping test\n";
-      return 0;
-    }
+  KernelFunctor<> f(inputA, inputB);
+  if (!launchInlineASMTest(f))
+    return 0;
 
-    for (int i = 0; i < LIST_SIZE; i++) {
-      A[i] = i;
-      B[i] = LIST_SIZE - i;
+  auto &C = f.getOutputBufferData();
+  for (int i = 0; i < DEFAULT_PROBLEM_SIZE; ++i) {
+    if (C[i] != inputA[i] * inputB[i]) {
+      std::cerr << "At index: " << i << ". ";
+      std::cerr << C[i] << " != " << inputA[i] * inputB[i] << "\n";
+      return 1;
     }
-
-    ct.process_buffers(A.data(), B.data(), C.data(), LIST_SIZE);
-
-    for (int i = 0; i < LIST_SIZE; ++i)
-      if (C[i] != A[i] * B[i]) {
-        std::cerr << "At index: " << i << ". ";
-        std::cerr << C[i] << " != " << A[i] * B[i] << "\n";
-        abort();
-      }
-
-    std::cout << "Everything is correct" << std::endl;
-  } catch (cl::sycl::exception const &err) {
-    std::cerr << "ERROR: " << err.what() << ":\n";
-    return -1;
   }
-}
-
-template <typename T>
-void ocl_ctx_t::process_buffers(T const *pa, T const *pb, T *pc, size_t sz) {
-  cl::sycl::range<1> numOfItems{sz};
-  cl::sycl::buffer<T, 1> bufferA(pa, numOfItems);
-  cl::sycl::buffer<T, 1> bufferB(pb, numOfItems);
-  cl::sycl::buffer<T, 1> bufferC(pc, numOfItems);
 
-  bufferA.set_final_data(nullptr);
-  bufferB.set_final_data(nullptr);
-
-  deviceQueue.submit([&](cl::sycl::handler &cgh) {
-    auto A = bufferA.template get_access<sycl_read>(cgh);
-    auto B = bufferB.template get_access<sycl_read>(cgh);
-    auto C = bufferC.template get_access<sycl_write>(cgh);
-
-    auto kern = [ A, B, C ](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] {
-#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
-      asm("mul (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>"
-          : "=rw"(C[wiID])
-          : "rw"(A[wiID]), "rw"(B[wiID]));
-#else
-      C[wiID] = A[wiID] * B[wiID];
-#endif
-    };
-    cgh.parallel_for<class vector_mul<T>>(numOfItems, kern);
-  });
+  return 0;
 }
diff --git a/sycl/test/inline-asm/asm_multiple_instructions.cpp b/sycl/test/inline-asm/asm_multiple_instructions.cpp
index edbd21cfae94..e8cf02a5292e 100644
--- a/sycl/test/inline-asm/asm_multiple_instructions.cpp
+++ b/sycl/test/inline-asm/asm_multiple_instructions.cpp
@@ -2,104 +2,58 @@
 // REQUIRES: gpu,linux
 // RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
 // RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
 
-#include "include/asmcheck.h"
+#include "include/asmhelper.h"
 #include <CL/sycl.hpp>
 #include <iostream>
 #include <vector>
 
-constexpr auto sycl_read = cl::sycl::access::mode::read;
-constexpr auto sycl_write = cl::sycl::access::mode::write;
+using dataType = cl::sycl::cl_int;
 
-constexpr int LIST_SIZE = 1024;
+template <typename T = dataType>
+struct KernelFunctor : WithInputBuffers<T, 3>, WithOutputBuffer<T> {
+  KernelFunctor(const std::vector<T> &input1, const std::vector<T> &input2, const std::vector<T> &input3) : WithInputBuffers<T, 3>(input1, input2, input3), WithOutputBuffer<T>(input1.size()) {}
 
-using arr_t = std::vector<cl::sycl::cl_int>;
+  void operator()(cl::sycl::handler &cgh) {
+    auto A = this->getInputBuffer(0).template get_access<cl::sycl::access::mode::read_write>(cgh);
+    auto B = this->getInputBuffer(1).template get_access<cl::sycl::access::mode::read>(cgh);
+    auto C = this->getInputBuffer(2).template get_access<cl::sycl::access::mode::read>(cgh);
+    auto D = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh);
 
-// class is used for kernel name
-template <typename T>
-class vector_mul;
-
-class ocl_ctx_t {
-  cl::sycl::queue deviceQueue;
-
-public:
-  ocl_ctx_t(const cl::sycl::device_selector &sel) : deviceQueue(sel) {}
-
-  template <typename T>
-  void process_buffers(T const *pa, T const *pb, T const *pc, T *pd, size_t sz);
+    cgh.parallel_for<KernelFunctor<T>>(
+        cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] {
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          asm("{\n"
+              "add (M1, 8) %1(0, 0)<1> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>\n"
+              "add (M1, 8) %1(0, 0)<1> %1(0, 0)<1;1,0> %3(0, 0)<1;1,0>\n"
+              "mov (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0>\n"
+              "}\n"
+              : "=rw"(D[wiID]), "+rw"(A[wiID])
+              : "rw"(B[wiID]), "rw"(C[wiID]));
+#else
+          A[wiID] += B[wiID];
+          A[wiID] += C[wiID];
+          D[wiID] = A[wiID];
+#endif
+        });
+  }
 };
 
 int main() {
-  arr_t A(LIST_SIZE), B(LIST_SIZE), C(LIST_SIZE), D(LIST_SIZE);
-
-  try {
-    cl::sycl::gpu_selector gpsel;
-    ocl_ctx_t ct{gpsel};
-
-    cl::sycl::queue deviceQueue(gpsel);
-    sycl::device Device = deviceQueue.get_device();
-
-    if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) {
-      std::cout << "Skipping test\n";
-      return 0;
-    }
-
-    for (int i = 0; i < LIST_SIZE; i++) {
-      A[i] = B[i] = i;
-      C[i] = LIST_SIZE - 2 * i; // A[i] + B[i] + C[i] = LIST_SIZE
-    }
-
-    ct.process_buffers(A.data(), B.data(), C.data(), D.data(), LIST_SIZE);
-
-    for (int i = 0; i < LIST_SIZE; ++i)
-      if (D[i] != LIST_SIZE) {
-        std::cerr << "At index: " << i << ". ";
-        std::cerr << D[i] << " != " << LIST_SIZE << "\n";
-        abort();
-      }
-
-    std::cout << "Everything is correct" << std::endl;
-  } catch (cl::sycl::exception const &err) {
-    std::cerr << "ERROR: " << err.what() << ":\n";
-    return -1;
+  std::vector<dataType> inputA(DEFAULT_PROBLEM_SIZE), inputB(DEFAULT_PROBLEM_SIZE), inputC(DEFAULT_PROBLEM_SIZE);
+  for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) {
+    inputA[i] = inputB[i] = i;
+    inputC[i] = DEFAULT_PROBLEM_SIZE - 2 * i; // A[i] + B[i] + C[i] = LIST_SIZE
   }
-}
 
-template <typename T>
-void ocl_ctx_t::process_buffers(T const *pa, T const *pb, T const *pc, T *pd,
-                                size_t sz) {
-  cl::sycl::range<1> numOfItems{sz};
-  cl::sycl::buffer<T, 1> bufferA(pa, numOfItems);
-  cl::sycl::buffer<T, 1> bufferB(pb, numOfItems);
-  cl::sycl::buffer<T, 1> bufferC(pc, numOfItems);
-  cl::sycl::buffer<T, 1> bufferD(pd, numOfItems);
+  KernelFunctor<> f(inputA, inputB, inputC);
+  if (!launchInlineASMTest(f))
+    return 0;
 
-  bufferA.set_final_data(nullptr);
-  bufferB.set_final_data(nullptr);
-  bufferC.set_final_data(nullptr);
+  if (verify_all_the_same(f.getOutputBufferData(), (dataType)DEFAULT_PROBLEM_SIZE))
+    return 0;
 
-  deviceQueue.submit([&](cl::sycl::handler &cgh) {
-    auto A = bufferA.template get_access<sycl_write>(cgh);
-    auto B = bufferB.template get_access<sycl_read>(cgh);
-    auto C = bufferC.template get_access<sycl_read>(cgh);
-    auto D = bufferD.template get_access<sycl_write>(cgh);
-
-    auto kern = [ A, B, C, D ](cl::sycl::id<1> wiID)
-        [[cl::intel_reqd_sub_group_size(8)]] {
-#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
-      asm("{\n"
-          "add (M1, 8) %1(0, 0)<1> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>\n"
-          "add (M1, 8) %1(0, 0)<1> %1(0, 0)<1;1,0> %3(0, 0)<1;1,0>\n"
-          "mov (M1, 8) %0(0, 0)<1> %1(0, 0)<1;1,0>\n"
-          "}\n"
-          : "=rw"(D[wiID]), "+rw"(A[wiID])
-          : "rw"(B[wiID]), "rw"(C[wiID]));
-#else
-      A[wiID] += B[wiID];
-      A[wiID] += C[wiID];
-      D[wiID] = A[wiID];
-#endif
-    };
-    cgh.parallel_for<class vector_mul<T>>(numOfItems, kern);
-  });
+  return 1;
 }
diff --git a/sycl/test/inline-asm/asm_no_operands.cpp b/sycl/test/inline-asm/asm_no_operands.cpp
index f7abfd523928..3a3a919caa98 100644
--- a/sycl/test/inline-asm/asm_no_operands.cpp
+++ b/sycl/test/inline-asm/asm_no_operands.cpp
@@ -2,27 +2,29 @@
 // REQUIRES: gpu,linux
 // RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
 // RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
 
-#include "include/asmcheck.h"
+#include "include/asmhelper.h"
 #include <CL/sycl.hpp>
 class no_operands_kernel;
 
 int main() {
   // Creating SYCL queue
   cl::sycl::queue Queue;
-  sycl::device Device = Queue.get_device();
+  cl::sycl::device Device = Queue.get_device();
 
   if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) {
     std::cout << "Skipping test\n";
     return 0;
   }
   // Size of index space for kernel
-  cl::sycl::range<1> NumOfWorkItems{15};
+  cl::sycl::range<1> NumOfWorkItems{16};
 
   // Submitting command group(work) to queue
   Queue.submit([&](cl::sycl::handler &cgh) {
     // Executing kernel
-    cgh.parallel_for<class no_operands_kernel>(
+    cgh.parallel_for<no_operands_kernel>(
         NumOfWorkItems, [=](cl::sycl::id<1> WIid) [[cl::intel_reqd_sub_group_size(8)]] {
 #if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
           asm("barrier");
diff --git a/sycl/test/inline-asm/asm_no_output.cpp b/sycl/test/inline-asm/asm_no_output.cpp
index 2440fd1c17a2..ab65346d1313 100644
--- a/sycl/test/inline-asm/asm_no_output.cpp
+++ b/sycl/test/inline-asm/asm_no_output.cpp
@@ -2,77 +2,46 @@
 // REQUIRES: gpu,linux
 // RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
 // RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
 
-#include "include/asmcheck.h"
+#include "include/asmhelper.h"
 #include <CL/sycl.hpp>
 #include <iostream>
 #include <vector>
 
-constexpr int LIST_SIZE = 8;
-using arr_t = std::vector<cl::sycl::cl_int>;
-constexpr auto sycl_read = cl::sycl::access::mode::read;
-constexpr auto sycl_write = cl::sycl::access::mode::write;
+using dataType = cl::sycl::cl_int;
 
-// class is used for kernel name
-template <typename T>
-class asm_no_output;
+template <typename T = dataType>
+struct KernelFunctor : WithOutputBuffer<T> {
+  KernelFunctor(size_t problem_size) : WithOutputBuffer<T>(problem_size) {}
 
-template <typename T>
-void process_buffers(cl::sycl::queue &deviceQueue, T *pc, size_t sz) {
-  cl::sycl::range<1> numOfItems{sz};
-  cl::sycl::buffer<T, 1> bufferC(pc, numOfItems);
-
-  deviceQueue.submit([&](cl::sycl::handler &cgh) {
-    auto C = bufferC.template get_access<sycl_read>(cgh);
-
-    auto kern = [C]()
-        [[cl::intel_reqd_sub_group_size(16)]] {
-      volatile int local_var = 47;
-      local_var += C[0];
+  void operator()(cl::sycl::handler &cgh) {
+    auto C = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh);
+    cgh.parallel_for<KernelFunctor<T>>(
+        cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(8)]] {
+          volatile int local_var = 47;
+          local_var += C[0];
 #if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
-      asm volatile("{\n"
-                   ".decl temp v_type=G type=w num_elts=8 align=GRF\n"
-                   "mov (M1,16) temp(0, 0)<1> %0(0,0)<1;1,0>\n"
-                   "}\n" ::"rw"(local_var));
+          asm volatile("{\n"
+                       ".decl temp v_type=G type=w num_elts=8 align=GRF\n"
+                       "mov (M1,16) temp(0, 0)<1> %0(0,0)<1;1,0>\n"
+                       "}\n" ::"rw"(local_var));
 #else
-      volatile int temp = 0;
-      temp = local_var;
+          volatile int temp = 0;
+          temp = local_var;
 #endif
-    };
-    cgh.single_task<class asm_no_output<T>>(kern);
-  });
+        });
+  }
 };
 
 int main() {
-  arr_t C(LIST_SIZE);
-
-  cl::sycl::gpu_selector gpsel;
-  cl::sycl::queue deviceQueue(gpsel);
-  sycl::device Device = deviceQueue.get_device();
-
-  if (!isInlineASMSupported(Device)) {
-    std::cout << "Skipping test\n";
+  KernelFunctor<> f(DEFAULT_PROBLEM_SIZE);
+  if (!launchInlineASMTest(f))
     return 0;
-  }
 
-  for (int i = 0; i < LIST_SIZE; i++) {
-    C[i] = 0;
-  }
-
-  process_buffers(deviceQueue, C.data(), LIST_SIZE);
-
-  bool all_right = true;
-  for (int i = 0; i < LIST_SIZE; ++i)
-    if (C[i] != 0) {
-      std::cerr << "At index: " << i << ". ";
-      std::cerr << C[i] << " != " << 0 << "\n";
-      all_right = false;
-      break;
-    }
-  if (all_right) {
-    std::cout << "Pass" << std::endl;
+  if (verify_all_the_same(f.getOutputBufferData(), 47))
     return 0;
-  }
-  std::cout << "Error" << std::endl;
-  return -1;
+
+  return 1;
 }
diff --git a/sycl/test/inline-asm/asm_plus_mod.cpp b/sycl/test/inline-asm/asm_plus_mod.cpp
index d460406c71cd..f65cda777ef9 100644
--- a/sycl/test/inline-asm/asm_plus_mod.cpp
+++ b/sycl/test/inline-asm/asm_plus_mod.cpp
@@ -2,86 +2,57 @@
 // REQUIRES: gpu,linux
 // RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
 // RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
 
-#include "include/asmcheck.h"
+#include "include/asmhelper.h"
 #include <CL/sycl.hpp>
 #include <iostream>
 #include <vector>
 
-constexpr auto sycl_read = cl::sycl::access::mode::read;
-constexpr auto sycl_write = cl::sycl::access::mode::write;
+using dataType = cl::sycl::cl_int;
 
-constexpr int LIST_SIZE = 1024 * 1024;
-using arr_t = std::vector<cl::sycl::cl_int>;
+template <typename T = dataType>
+struct KernelFunctor : WithInputBuffers<T, 1>, WithOutputBuffer<T> {
+  KernelFunctor(const std::vector<T> &input1, const std::vector<T> &input2) : WithInputBuffers<T, 1>(input1), WithOutputBuffer<T>(input2) {}
 
-// class is used for kernel name
-template <typename T>
-class plus_mod_kernel;
+  void operator()(cl::sycl::handler &cgh) {
+    auto A = this->getInputBuffer(0).template get_access<cl::sycl::access::mode::read>(cgh);
+    auto B = this->getOutputBuffer().template get_access<cl::sycl::access::mode::write>(cgh);
 
-class ocl_ctx_t {
-  cl::sycl::queue deviceQueue;
-
-public:
-  ocl_ctx_t(const cl::sycl::device_selector &sel) : deviceQueue(sel) {}
-
-  template <typename T>
-  void process_buffers(T const *pa, T *pb, size_t sz);
+    cgh.parallel_for<KernelFunctor<T>>(
+        cl::sycl::range<1>{this->getOutputBufferSize()}, [=](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] {
+#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
+          asm("add (M1, 16) %0(0, 0)<1> %0(0, 0)<1;1,0> %1(0, 0)<1;1,0>"
+              : "+rw"(B[wiID])
+              : "rw"(A[wiID]));
+#else
+          B[wiID] += A[wiID];
+#endif
+        });
+  }
 };
 
 int main() {
-  arr_t A(LIST_SIZE), B(LIST_SIZE), C(LIST_SIZE);
+  std::vector<dataType> inputA(DEFAULT_PROBLEM_SIZE), inputB(DEFAULT_PROBLEM_SIZE), R(DEFAULT_PROBLEM_SIZE);
+  for (int i = 0; i < DEFAULT_PROBLEM_SIZE; i++) {
+    inputA[i] = i;
+    inputB[i] = DEFAULT_PROBLEM_SIZE - i;
+    R[i] = inputA[i] + inputB[i];
+  }
 
-  try {
-    cl::sycl::gpu_selector gpsel;
-    ocl_ctx_t ct{gpsel};
-    cl::sycl::queue deviceQueue(gpsel);
-    sycl::device Device = deviceQueue.get_device();
+  KernelFunctor<> f(inputA, inputB);
+  if (!launchInlineASMTest(f))
+    return 0;
 
-    if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) {
-      std::cout << "Skipping test\n";
-      return 0;
+  auto &B = f.getOutputBufferData();
+  for (int i = 0; i < DEFAULT_PROBLEM_SIZE; ++i) {
+    if (B[i] != R[i]) {
+      std::cerr << "At index: " << i << ". ";
+      std::cerr << B[i] << " != " << R[i] << "\n";
+      return 1;
     }
-    for (int i = 0; i < LIST_SIZE; i++) {
-      A[i] = i;
-      B[i] = LIST_SIZE - i;
-      C[i] = A[i] + B[i];
-    }
-
-    ct.process_buffers(A.data(), B.data(), LIST_SIZE);
-
-    for (int i = 0; i < LIST_SIZE; ++i)
-      if (B[i] != C[i]) {
-        std::cerr << "At index: " << i << ". ";
-        std::cerr << B[i] << " != " << C[i] << "\n";
-        abort();
-      }
-
-    std::cout << "Everything is correct" << std::endl;
-  } catch (cl::sycl::exception const &err) {
-    std::cerr << "ERROR: " << err.what() << ":\n";
-    return -1;
   }
-}
-
-template <typename T>
-void ocl_ctx_t::process_buffers(T const *pa, T *pb, size_t sz) {
-  cl::sycl::range<1> numOfItems{sz};
-  cl::sycl::buffer<T, 1> bufferA(pa, numOfItems);
-  cl::sycl::buffer<T, 1> bufferB(pb, numOfItems);
 
-  deviceQueue.submit([&](cl::sycl::handler &cgh) {
-    auto A = bufferA.template get_access<sycl_read>(cgh);
-    auto B = bufferB.template get_access<sycl_write>(cgh);
-
-    auto kern = [ A, B ](cl::sycl::id<1> wiID) [[cl::intel_reqd_sub_group_size(16)]] {
-#if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
-      asm("add (M1, 16) %0(0, 0)<1> %0(0, 0)<1;1,0> %1(0, 0)<1;1,0>"
-          : "+rw"(B[wiID])
-          : "rw"(A[wiID]));
-#else
-      B[wiID] += A[wiID];
-#endif
-    };
-    cgh.parallel_for<class plus_mod_kernel<T>>(numOfItems, kern);
-  });
+  return 0;
 }
diff --git a/sycl/test/inline-asm/include/asmcheck.h b/sycl/test/inline-asm/include/asmcheck.h
deleted file mode 100644
index 2031455e214c..000000000000
--- a/sycl/test/inline-asm/include/asmcheck.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#include <CL/sycl.hpp>
-
-#include <iostream>
-bool isInlineASMSupported(sycl::device Device) {
-
-  sycl::string_class DriverVersion = Device.get_info<sycl::info::device::driver_version>();
-  sycl::string_class DeviceVendorName = Device.get_info<sycl::info::device::vendor>();
-  if (DeviceVendorName.find("Intel") == sycl::string_class::npos)
-    return false;
-  if (DriverVersion.length() < 5)
-    return false;
-  if (DriverVersion[2] != '.')
-    return false;
-  if (std::stoi(DriverVersion.substr(0, 2), nullptr, 10) < 20 || std::stoi(DriverVersion.substr(3, 2), nullptr, 10) < 12)
-    return false;
-  return true;
-}
diff --git a/sycl/test/inline-asm/include/asmhelper.h b/sycl/test/inline-asm/include/asmhelper.h
new file mode 100644
index 000000000000..45d044161269
--- /dev/null
+++ b/sycl/test/inline-asm/include/asmhelper.h
@@ -0,0 +1,128 @@
+#include <CL/sycl.hpp>
+
+#include <iostream>
+#include <vector>
+
+constexpr const size_t DEFAULT_PROBLEM_SIZE = 16;
+
+template <typename T>
+struct WithOutputBuffer {
+  WithOutputBuffer(size_t size) {
+    _output_buffer_data.resize(size, 0);
+    _output_buffer = new cl::sycl::buffer<T>(_output_buffer_data.data(), _output_buffer_data.size());
+  }
+
+  WithOutputBuffer(const std::vector<T> &data) {
+    _output_buffer_data = data;
+    _output_buffer = new cl::sycl::buffer<T>(_output_buffer_data.data(), _output_buffer_data.size());
+  }
+
+  ~WithOutputBuffer() {
+    if (_output_buffer)
+      delete _output_buffer;
+  }
+
+  const std::vector<T> &getOutputBufferData() const {
+    return _output_buffer_data;
+  }
+
+  size_t getOutputBufferSize() const {
+    return _output_buffer_data.size();
+  }
+
+protected:
+  cl::sycl::buffer<T> &getOutputBuffer() {
+    return *_output_buffer;
+  }
+
+  cl::sycl::buffer<T> *_output_buffer = nullptr;
+  std::vector<T> _output_buffer_data;
+};
+
+template <typename T, size_t N>
+struct WithInputBuffers {
+
+  template <typename... Args>
+  WithInputBuffers(Args... inputs) {
+    static_assert(sizeof...(Args) == N, "All input buffers must be initialized");
+    constructorHelper<0>(inputs...);
+  }
+
+  ~WithInputBuffers() {
+    for (size_t i = 0; i < N; ++i) {
+      if (_input_buffers[i])
+        delete _input_buffers[i];
+    }
+  }
+
+  cl::sycl::buffer<T> &getInputBuffer(size_t i = 0) {
+    return *_input_buffers[i];
+  }
+
+protected:
+  cl::sycl::buffer<T> *_input_buffers[N] = {nullptr};
+  std::vector<T> _input_buffers_data[N];
+
+private:
+  template <int Index, typename... Args>
+  void constructorHelper(const std::vector<T> &data, Args... rest) {
+    _input_buffers_data[Index] = data;
+    _input_buffers[Index] = new cl::sycl::buffer<T>(_input_buffers_data[Index].data(), _input_buffers_data[Index].size());
+    constructorHelper<Index + 1>(rest...);
+  }
+
+  template <int Index>
+  void constructorHelper() {
+    // nothing to do, recursion stop
+  }
+};
+
+bool isInlineASMSupported(sycl::device Device) {
+
+  sycl::string_class DriverVersion = Device.get_info<sycl::info::device::driver_version>();
+  sycl::string_class DeviceVendorName = Device.get_info<sycl::info::device::vendor>();
+  // TODO: query for some extension/capability/whatever once interface is
+  // defined
+  if (DeviceVendorName.find("Intel") == sycl::string_class::npos)
+    return false;
+  if (DriverVersion.length() < 5)
+    return false;
+  if (DriverVersion[2] != '.')
+    return false;
+  if (std::stoi(DriverVersion.substr(0, 2), nullptr, 10) < 20 || std::stoi(DriverVersion.substr(3, 2), nullptr, 10) < 12)
+    return false;
+  return true;
+}
+
+/// checks if device suppots inline asm feature and launches a test
+///
+/// \returns false if test wasn't launched (i.e.was skipped) and true otherwise
+template <typename F>
+bool launchInlineASMTest(F &f, bool requires_particular_sg_size = true) {
+  cl::sycl::queue deviceQueue(cl::sycl::gpu_selector{});
+  cl::sycl::device device = deviceQueue.get_device();
+
+  if (!isInlineASMSupported(device)) {
+    std::cout << "Skipping test\n";
+    return false;
+  }
+
+  if (requires_particular_sg_size && !device.has_extension("cl_intel_required_subgroup_size")) {
+    std::cout << "Skipping test\n";
+    return false;
+  }
+
+  deviceQueue.submit(f).wait();
+  return true;
+}
+
+template <typename T>
+bool verify_all_the_same(const std::vector<T> &input, T reference_value) {
+  for (int i = 0; i < input.size(); ++i)
+    if (input[i] != reference_value) {
+      std::cerr << "At index: " << i << " ";
+      std::cerr << input[i] << " != " << reference_value << "\n";
+      return false;
+    }
+  return true;
+}
diff --git a/sycl/test/inline-asm/letter_example.cpp b/sycl/test/inline-asm/letter_example.cpp
index 5b9bac0e22d2..22bf26648e78 100644
--- a/sycl/test/inline-asm/letter_example.cpp
+++ b/sycl/test/inline-asm/letter_example.cpp
@@ -2,45 +2,50 @@
 // REQUIRES: gpu,linux
 // RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
 // RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
 
-#include "include/asmcheck.h"
+#include "include/asmhelper.h"
 #include <CL/sycl.hpp>
 #include <iostream>
-#define N 100
-using namespace cl::sycl;
+
+constexpr size_t problem_size = 16;
+
+class kernel_name;
+
 int main() {
-  int *a;
-  queue q;
-  sycl::device Device = q.get_device();
+  cl::sycl::queue q;
+  cl::sycl::device Device = q.get_device();
 
   if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) {
     std::cout << "Skipping test\n";
     return 0;
   }
   auto ctx = q.get_context();
-  a = (int *)malloc_shared(sizeof(int) * N, q.get_device(), ctx);
-  for (int i = 0; i < N; i++)
+  int *a = (int *)malloc_shared(sizeof(int) * problem_size, q.get_device(), ctx);
+  for (int i = 0; i < problem_size; i++) {
     a[i] = i;
-  q.submit([&](handler &cgh) {
-     cgh.parallel_for<class kernel>(
-         range<1>(N), [=](id<1> idx)
-                          [[cl::intel_reqd_sub_group_size(16)]] {
+  }
+  q.submit([&](cl::sycl::handler &cgh) {
+     cgh.parallel_for<kernel_name>(
+         cl::sycl::range<1>(problem_size), [=](cl::sycl::id<1> idx)
+                                               [[cl::intel_reqd_sub_group_size(16)]] {
 #if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
-                            int i = idx[0];
-                            asm volatile("{\n.decl V52 v_type=G type=d num_elts=16 align=GRF\n"
-                                         "svm_gather.4.1 (M1, 16) %0.0 V52.0\n"
-                                         "add(M1, 16) V52(0, 0)<1> V52(0, 0)<1; 1, 0> 0x1:w\n"
-                                         "svm_scatter.4.1 (M1, 16) %0.0 V52.0\n}"
-                                         :
-                                         : "rw"(&a[i]));
+                                                 int i = idx[0];
+                                                 asm volatile("{\n.decl V52 v_type=G type=d num_elts=16 align=GRF\n"
+                                                              "svm_gather.4.1 (M1, 16) %0.0 V52.0\n"
+                                                              "add(M1, 16) V52(0, 0)<1> V52(0, 0)<1; 1, 0> 0x1:w\n"
+                                                              "svm_scatter.4.1 (M1, 16) %0.0 V52.0\n}"
+                                                              :
+                                                              : "rw"(&a[i]));
 #else
-                            a[idx[0]]++;
+                                                 a[idx[0]]++;
 #endif
-                          });
+                                               });
    }).wait();
 
   bool currect = true;
-  for (int i = 0; i < N; i++) {
+  for (int i = 0; i < problem_size; i++) {
     if (a[i] != (i + 1)) {
       currect = false;
       std::cerr << "error in a[" << i << "]="
@@ -48,11 +53,13 @@ int main() {
       break;
     }
   }
+
   if (!currect) {
     std::cerr << "Error" << std::endl;
     cl::sycl::free(a, ctx);
-    return -1;
+    return 1;
   }
+
   std::cerr << "Pass" << std::endl;
   cl::sycl::free(a, ctx);
   return 0;
diff --git a/sycl/test/inline-asm/malloc_shared_32.cpp b/sycl/test/inline-asm/malloc_shared_32.cpp
index d521afabaaca..8f058851c268 100644
--- a/sycl/test/inline-asm/malloc_shared_32.cpp
+++ b/sycl/test/inline-asm/malloc_shared_32.cpp
@@ -2,19 +2,21 @@
 // REQUIRES: gpu,linux
 // RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
 // RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
 
-#include "include/asmcheck.h"
+#include "include/asmhelper.h"
 #include <CL/sycl.hpp>
 #include <iostream>
-#define N 1000
-using namespace cl::sycl;
+
+constexpr size_t problem_size = 32;
+
+class kernel_name;
+
 int main() {
-  int *a;
-  int *b;
-  int *c;
-  queue q;
+  cl::sycl::queue q;
 
-  sycl::device Device = q.get_device();
+  cl::sycl::device Device = q.get_device();
 
   if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) {
     std::cout << "Skipping test\n";
@@ -22,18 +24,19 @@ int main() {
   }
 
   auto ctx = q.get_context();
-  a = (int *)malloc_shared(sizeof(int) * N, q.get_device(), ctx);
-  b = (int *)malloc_shared(sizeof(int) * N, q.get_device(), ctx);
-  c = (int *)malloc_shared(sizeof(int) * N, q.get_device(), ctx);
-  for (int i = 0; i < N; i++) {
+  int *a = (int *)malloc_shared(sizeof(int) * problem_size, q.get_device(), ctx);
+  int *b = (int *)malloc_shared(sizeof(int) * problem_size, q.get_device(), ctx);
+  int *c = (int *)malloc_shared(sizeof(int) * problem_size, q.get_device(), ctx);
+  for (int i = 0; i < problem_size; i++) {
     b[i] = -10;
     a[i] = i;
     c[i] = i;
   }
-  q.submit([&](handler &cgh) {
-     cgh.parallel_for<class kernel>(
-         range<1>(N),
-         [=](id<1> idx)
+
+  q.submit([&](cl::sycl::handler &cgh) {
+     cgh.parallel_for<kernel_name>(
+         cl::sycl::range<1>(problem_size),
+         [=](cl::sycl::id<1> idx)
              [[cl::intel_reqd_sub_group_size(32)]] {
                int i = idx[0];
 #if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
@@ -64,7 +67,7 @@ int main() {
    }).wait();
 
   bool currect = true;
-  for (int i = 0; i < N; i++) {
+  for (int i = 0; i < problem_size; i++) {
     if (b[i] != a[i] * b[i]) {
       currect = false;
       std::cerr << "error in a[" << i << "]="
@@ -72,13 +75,15 @@ int main() {
       break;
     }
   }
+
   if (!currect) {
     std::cerr << "Error" << std::endl;
     cl::sycl::free(a, ctx);
     cl::sycl::free(b, ctx);
     cl::sycl::free(c, ctx);
-    return -1;
+    return 1;
   }
+
   std::cerr << "Pass" << std::endl;
   cl::sycl::free(a, ctx);
   cl::sycl::free(b, ctx);
diff --git a/sycl/test/inline-asm/malloc_shared_in_out_dif.cpp b/sycl/test/inline-asm/malloc_shared_in_out_dif.cpp
index fc0fa139036f..a6994bd37919 100644
--- a/sycl/test/inline-asm/malloc_shared_in_out_dif.cpp
+++ b/sycl/test/inline-asm/malloc_shared_in_out_dif.cpp
@@ -3,17 +3,18 @@
 // RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
 // RUN: %t.out
 
-#include "include/asmcheck.h"
+#include "include/asmhelper.h"
 #include <CL/sycl.hpp>
 #include <iostream>
-#define N 100
-using namespace cl::sycl;
+
+constexpr size_t problem_size = 100;
+
+class kernel_name;
+
 int main() {
-  int *a;
-  int *b;
-  queue q;
+  cl::sycl::queue q;
 
-  sycl::device Device = q.get_device();
+  cl::sycl::device Device = q.get_device();
 
   if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) {
     std::cout << "Skipping test\n";
@@ -21,15 +22,16 @@ int main() {
   }
 
   auto ctx = q.get_context();
-  a = (int *)malloc_shared(sizeof(int) * N, q.get_device(), ctx);
-  b = (int *)malloc_shared(sizeof(int) * N, q.get_device(), ctx);
-  for (int i = 0; i < N; i++) {
+  int *a = (int *)malloc_shared(sizeof(int) * problem_size, q.get_device(), ctx);
+  int *b = (int *)malloc_shared(sizeof(int) * problem_size, q.get_device(), ctx);
+  for (int i = 0; i < problem_size; i++) {
     b[i] = -1;
     a[i] = i;
   }
-  q.submit([&](handler &cgh) {
-     cgh.parallel_for<class kernel>(
-         range<1>(N), [=](id<1> idx) [[cl::intel_reqd_sub_group_size(16)]] {
+
+  q.submit([&](cl::sycl::handler &cgh) {
+     cgh.parallel_for<kernel_name>(
+         cl::sycl::range<1>(problem_size), [=](cl::sycl::id<1> idx) [[cl::intel_reqd_sub_group_size(16)]] {
            int i = idx[0];
            volatile int tmp = a[i];
            tmp += 1;
@@ -44,7 +46,7 @@ int main() {
    }).wait();
 
   bool currect = true;
-  for (int i = 0; i < N; i++) {
+  for (int i = 0; i < problem_size; i++) {
     if (b[i] != a[i]) {
       currect = false;
       std::cerr << "error in a[" << i << "]="
@@ -52,12 +54,14 @@ int main() {
       break;
     }
   }
+
   if (!currect) {
     std::cerr << "Error" << std::endl;
     cl::sycl::free(a, ctx);
     cl::sycl::free(b, ctx);
-    return -1;
+    return 1;
   }
+
   std::cerr << "Pass" << std::endl;
   cl::sycl::free(a, ctx);
   cl::sycl::free(b, ctx);
diff --git a/sycl/test/inline-asm/malloc_shared_no_input.cpp b/sycl/test/inline-asm/malloc_shared_no_input.cpp
index f2db2b8e7c9f..22cd47abd64a 100644
--- a/sycl/test/inline-asm/malloc_shared_no_input.cpp
+++ b/sycl/test/inline-asm/malloc_shared_no_input.cpp
@@ -2,28 +2,33 @@
 // REQUIRES: gpu,linux
 // RUN: %clangxx -fsycl %s -DINLINE_ASM -o %t.out
 // RUN: %t.out
+// RUN: %clangxx -fsycl %s -o %t.ref.out
+// RUN: %t.ref.out
 
-#include "include/asmcheck.h"
+#include "include/asmhelper.h"
 #include <CL/sycl.hpp>
 #include <iostream>
-#define N 100
-using namespace cl::sycl;
+
+constexpr size_t problem_size = 16;
+
+class kernel_name;
+
 int main() {
-  int *a;
-  queue q;
-  sycl::device Device = q.get_device();
+  cl::sycl::queue q;
+  cl::sycl::device Device = q.get_device();
 
   if (!isInlineASMSupported(Device) || !Device.has_extension("cl_intel_required_subgroup_size")) {
     std::cout << "Skipping test\n";
     return 0;
   }
   auto ctx = q.get_context();
-  a = (int *)malloc_shared(sizeof(int) * N, q.get_device(), ctx);
-  for (int i = 0; i < N; i++)
+  int *a = (int *)malloc_shared(sizeof(int) * problem_size, q.get_device(), ctx);
+  for (int i = 0; i < problem_size; i++)
     a[i] = i;
-  q.submit([&](handler &cgh) {
-     cgh.parallel_for<class kernel>(
-         range<1>(N), [=](id<1> idx) [[cl::intel_reqd_sub_group_size(16)]] {
+
+  q.submit([&](cl::sycl::handler &cgh) {
+     cgh.parallel_for<kernel_name>(
+         cl::sycl::range<1>(problem_size), [=](cl::sycl::id<1> idx) [[cl::intel_reqd_sub_group_size(16)]] {
            int i = idx[0];
 #if defined(INLINE_ASM) && defined(__SYCL_DEVICE_ONLY__)
            asm volatile("mov (M1, 16) %0(0,0)<1> 0x7:d"
@@ -35,7 +40,7 @@ int main() {
    }).wait();
 
   bool currect = true;
-  for (int i = 0; i < N; i++) {
+  for (int i = 0; i < problem_size; i++) {
     if (a[i] != 7) {
       currect = false;
       std::cerr << "error in a[" << i << "]="
@@ -43,11 +48,13 @@ int main() {
       break;
     }
   }
+
   if (!currect) {
     std::cerr << "Error" << std::endl;
     cl::sycl::free(a, ctx);
-    return -1;
+    return 1;
   }
+
   std::cerr << "Pass" << std::endl;
   cl::sycl::free(a, ctx);
   return 0;

From d67e0e9edccdc173ddbdb20eeba0f179d49575b0 Mon Sep 17 00:00:00 2001
From: amochalo <anastasiya.mochalova@intel.com>
Date: Wed, 15 Apr 2020 19:02:00 +0300
Subject: [PATCH 03/10] Apply suggestion

Signed-off-by: amochalo <anastasiya.mochalova@intel.com>
---
 sycl/test/inline-asm/include/asmhelper.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/sycl/test/inline-asm/include/asmhelper.h b/sycl/test/inline-asm/include/asmhelper.h
index 45d044161269..c57abff8c708 100644
--- a/sycl/test/inline-asm/include/asmhelper.h
+++ b/sycl/test/inline-asm/include/asmhelper.h
@@ -99,6 +99,9 @@ bool isInlineASMSupported(sycl::device Device) {
 /// \returns false if test wasn't launched (i.e.was skipped) and true otherwise
 template <typename F>
 bool launchInlineASMTest(F &f, bool requires_particular_sg_size = true) {
+#if !defined(INLINE_ASM)
+  return true; // any device is capable to launch reference versions of tests
+#endif
   cl::sycl::queue deviceQueue(cl::sycl::gpu_selector{});
   cl::sycl::device device = deviceQueue.get_device();
 

From 29c858abbfd3ea31bb770ede564b7d4d5216a410 Mon Sep 17 00:00:00 2001
From: amochalo <anastasiya.mochalova@intel.com>
Date: Wed, 15 Apr 2020 19:35:04 +0300
Subject: [PATCH 04/10] Small fix

Signed-off-by: amochalo <anastasiya.mochalova@intel.com>
---
 sycl/test/inline-asm/include/asmhelper.h | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/sycl/test/inline-asm/include/asmhelper.h b/sycl/test/inline-asm/include/asmhelper.h
index c57abff8c708..dd0adb2eba7f 100644
--- a/sycl/test/inline-asm/include/asmhelper.h
+++ b/sycl/test/inline-asm/include/asmhelper.h
@@ -100,15 +100,13 @@ bool isInlineASMSupported(sycl::device Device) {
 template <typename F>
 bool launchInlineASMTest(F &f, bool requires_particular_sg_size = true) {
 #if !defined(INLINE_ASM)
-  return true; // any device is capable to launch reference versions of tests
-#endif
-  cl::sycl::queue deviceQueue(cl::sycl::gpu_selector{});
-  cl::sycl::device device = deviceQueue.get_device();
-
-  if (!isInlineASMSupported(device)) {
+ if (!isInlineASMSupported(device)) {
     std::cout << "Skipping test\n";
     return false;
   }
+#endif
+  cl::sycl::queue deviceQueue(cl::sycl::gpu_selector{});
+  cl::sycl::device device = deviceQueue.get_device();
 
   if (requires_particular_sg_size && !device.has_extension("cl_intel_required_subgroup_size")) {
     std::cout << "Skipping test\n";

From 0fc675dc0dadd3dffc96cc059a0cf21efb23e124 Mon Sep 17 00:00:00 2001
From: amochalo <anastasiya.mochalova@intel.com>
Date: Wed, 15 Apr 2020 20:18:24 +0300
Subject: [PATCH 05/10] Small fix

Signed-off-by: amochalo <anastasiya.mochalova@intel.com>
---
 sycl/test/inline-asm/include/asmhelper.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/sycl/test/inline-asm/include/asmhelper.h b/sycl/test/inline-asm/include/asmhelper.h
index dd0adb2eba7f..7b70376068ab 100644
--- a/sycl/test/inline-asm/include/asmhelper.h
+++ b/sycl/test/inline-asm/include/asmhelper.h
@@ -99,14 +99,15 @@ bool isInlineASMSupported(sycl::device Device) {
 /// \returns false if test wasn't launched (i.e.was skipped) and true otherwise
 template <typename F>
 bool launchInlineASMTest(F &f, bool requires_particular_sg_size = true) {
+  cl::sycl::queue deviceQueue(cl::sycl::gpu_selector{});
+  cl::sycl::device device = deviceQueue.get_device();
+
 #if !defined(INLINE_ASM)
- if (!isInlineASMSupported(device)) {
+  if (!isInlineASMSupported(device)) {
     std::cout << "Skipping test\n";
     return false;
   }
 #endif
-  cl::sycl::queue deviceQueue(cl::sycl::gpu_selector{});
-  cl::sycl::device device = deviceQueue.get_device();
 
   if (requires_particular_sg_size && !device.has_extension("cl_intel_required_subgroup_size")) {
     std::cout << "Skipping test\n";

From 063e02775a781d66d028b76c80a3b6430fbf88da Mon Sep 17 00:00:00 2001
From: amochalo <anastasiya.mochalova@intel.com>
Date: Thu, 16 Apr 2020 15:32:58 +0300
Subject: [PATCH 06/10] Small fix

Signed-off-by: amochalo <anastasiya.mochalova@intel.com>
---
 sycl/test/inline-asm/include/asmhelper.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sycl/test/inline-asm/include/asmhelper.h b/sycl/test/inline-asm/include/asmhelper.h
index 7b70376068ab..41c01f6e8c71 100644
--- a/sycl/test/inline-asm/include/asmhelper.h
+++ b/sycl/test/inline-asm/include/asmhelper.h
@@ -102,7 +102,7 @@ bool launchInlineASMTest(F &f, bool requires_particular_sg_size = true) {
   cl::sycl::queue deviceQueue(cl::sycl::gpu_selector{});
   cl::sycl::device device = deviceQueue.get_device();
 
-#if !defined(INLINE_ASM)
+#if defined(INLINE_ASM)
   if (!isInlineASMSupported(device)) {
     std::cout << "Skipping test\n";
     return false;

From d4fb662e020891cb16d659a69615fb4e6a5e4861 Mon Sep 17 00:00:00 2001
From: amochalo <anastasiya.mochalova@intel.com>
Date: Thu, 16 Apr 2020 15:40:47 +0300
Subject: [PATCH 07/10] Small fix

Signed-off-by: amochalo <anastasiya.mochalova@intel.com>
---
 sycl/test/inline-asm/include/asmhelper.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sycl/test/inline-asm/include/asmhelper.h b/sycl/test/inline-asm/include/asmhelper.h
index 41c01f6e8c71..59150909b836 100644
--- a/sycl/test/inline-asm/include/asmhelper.h
+++ b/sycl/test/inline-asm/include/asmhelper.h
@@ -8,7 +8,7 @@ constexpr const size_t DEFAULT_PROBLEM_SIZE = 16;
 template <typename T>
 struct WithOutputBuffer {
   WithOutputBuffer(size_t size) {
-    _output_buffer_data.resize(size, 0);
+    _output_buffer_data.resize(size);
     _output_buffer = new cl::sycl::buffer<T>(_output_buffer_data.data(), _output_buffer_data.size());
   }
 

From 8bb6e521695c8be9bed5c1c8126e8330edbe002a Mon Sep 17 00:00:00 2001
From: amochalo <anastasiya.mochalova@intel.com>
Date: Thu, 16 Apr 2020 21:26:04 +0300
Subject: [PATCH 08/10] Correctly handle buffer

Signed-off-by: amochalo <anastasiya.mochalova@intel.com>
---
 sycl/test/inline-asm/include/asmhelper.h | 56 ++++++++++++------------
 1 file changed, 27 insertions(+), 29 deletions(-)

diff --git a/sycl/test/inline-asm/include/asmhelper.h b/sycl/test/inline-asm/include/asmhelper.h
index 59150909b836..75585e161133 100644
--- a/sycl/test/inline-asm/include/asmhelper.h
+++ b/sycl/test/inline-asm/include/asmhelper.h
@@ -1,6 +1,7 @@
 #include <CL/sycl.hpp>
 
 #include <iostream>
+#include <memory>
 #include <vector>
 
 constexpr const size_t DEFAULT_PROBLEM_SIZE = 16;
@@ -9,20 +10,17 @@ template <typename T>
 struct WithOutputBuffer {
   WithOutputBuffer(size_t size) {
     _output_buffer_data.resize(size);
-    _output_buffer = new cl::sycl::buffer<T>(_output_buffer_data.data(), _output_buffer_data.size());
+    _output_buffer.reset(new cl::sycl::buffer<T>(_output_buffer_data.data(), _output_buffer_data.size()));
   }
 
   WithOutputBuffer(const std::vector<T> &data) {
     _output_buffer_data = data;
-    _output_buffer = new cl::sycl::buffer<T>(_output_buffer_data.data(), _output_buffer_data.size());
+    _output_buffer.reset(new cl::sycl::buffer<T>(_output_buffer_data.data(), _output_buffer_data.size()));
   }
 
-  ~WithOutputBuffer() {
-    if (_output_buffer)
-      delete _output_buffer;
-  }
-
-  const std::vector<T> &getOutputBufferData() const {
+  const std::vector<T> &getOutputBufferData() {
+    // We cannoe access the data until the buffer is still alive
+    _output_buffer.reset();
     return _output_buffer_data;
   }
 
@@ -35,7 +33,10 @@ struct WithOutputBuffer {
     return *_output_buffer;
   }
 
-  cl::sycl::buffer<T> *_output_buffer = nullptr;
+  // Functor is being passed by-copy into cl::sycl::queue::submit and destroyed
+  // one more time in there. We need to make sure that buffer is only released
+  // once.
+  std::shared_ptr<cl::sycl::buffer<T>> _output_buffer = nullptr;
   std::vector<T> _output_buffer_data;
 };
 
@@ -48,26 +49,19 @@ struct WithInputBuffers {
     constructorHelper<0>(inputs...);
   }
 
-  ~WithInputBuffers() {
-    for (size_t i = 0; i < N; ++i) {
-      if (_input_buffers[i])
-        delete _input_buffers[i];
-    }
-  }
-
   cl::sycl::buffer<T> &getInputBuffer(size_t i = 0) {
     return *_input_buffers[i];
   }
 
 protected:
-  cl::sycl::buffer<T> *_input_buffers[N] = {nullptr};
+  std::shared_ptr<cl::sycl::buffer<T>> _input_buffers[N] = {nullptr};
   std::vector<T> _input_buffers_data[N];
 
 private:
   template <int Index, typename... Args>
   void constructorHelper(const std::vector<T> &data, Args... rest) {
     _input_buffers_data[Index] = data;
-    _input_buffers[Index] = new cl::sycl::buffer<T>(_input_buffers_data[Index].data(), _input_buffers_data[Index].size());
+    _input_buffers[Index].reset(new cl::sycl::buffer<T>(_input_buffers_data[Index].data(), _input_buffers_data[Index].size()));
     constructorHelper<Index + 1>(rest...);
   }
 
@@ -99,22 +93,26 @@ bool isInlineASMSupported(sycl::device Device) {
 /// \returns false if test wasn't launched (i.e.was skipped) and true otherwise
 template <typename F>
 bool launchInlineASMTest(F &f, bool requires_particular_sg_size = true) {
-  cl::sycl::queue deviceQueue(cl::sycl::gpu_selector{});
-  cl::sycl::device device = deviceQueue.get_device();
+  try {
+    cl::sycl::queue deviceQueue(cl::sycl::gpu_selector{});
+    cl::sycl::device device = deviceQueue.get_device();
 
 #if defined(INLINE_ASM)
-  if (!isInlineASMSupported(device)) {
-    std::cout << "Skipping test\n";
-    return false;
-  }
+    if (!isInlineASMSupported(device)) {
+      std::cout << "Skipping test\n";
+      return false;
+    }
 #endif
 
-  if (requires_particular_sg_size && !device.has_extension("cl_intel_required_subgroup_size")) {
-    std::cout << "Skipping test\n";
-    return false;
-  }
+    if (requires_particular_sg_size && !device.has_extension("cl_intel_required_subgroup_size")) {
+      std::cout << "Skipping test\n";
+      return false;
+    }
 
-  deviceQueue.submit(f).wait();
+    deviceQueue.submit(f).wait();
+  } catch (cl::sycl::exception &e) {
+    std::cerr << "Caught exception: " << e.what() << std::endl;
+  }
   return true;
 }
 

From 2efe78f1af2e0781b48efb698ad212acc7f910a3 Mon Sep 17 00:00:00 2001
From: amochalo <anastasiya.mochalova@intel.com>
Date: Fri, 17 Apr 2020 15:32:48 +0300
Subject: [PATCH 09/10] Move inline asm tests to feature tests

Signed-off-by: amochalo <anastasiya.mochalova@intel.com>
---
 sycl/test/CMakeLists.txt                                  | 8 ++++++++
 sycl/test/{ => feature-tests}/inline-asm/asm_16_empty.cpp | 0
 .../{ => feature-tests}/inline-asm/asm_16_matrix_mult.cpp | 0
 .../inline-asm/asm_16_no_input_int.cpp                    | 0
 .../{ => feature-tests}/inline-asm/asm_16_no_opts.cpp     | 0
 sycl/test/{ => feature-tests}/inline-asm/asm_8_empty.cpp  | 0
 .../{ => feature-tests}/inline-asm/asm_8_no_input_int.cpp | 0
 .../inline-asm/asm_arbitrary_ops_order.cpp                | 0
 .../{ => feature-tests}/inline-asm/asm_decl_in_scope.cpp  | 0
 .../test/{ => feature-tests}/inline-asm/asm_float_add.cpp | 0
 .../{ => feature-tests}/inline-asm/asm_float_imm_arg.cpp  | 0
 .../test/{ => feature-tests}/inline-asm/asm_float_neg.cpp | 0
 sycl/test/{ => feature-tests}/inline-asm/asm_imm_arg.cpp  | 0
 sycl/test/{ => feature-tests}/inline-asm/asm_mul.cpp      | 0
 .../inline-asm/asm_multiple_instructions.cpp              | 0
 .../{ => feature-tests}/inline-asm/asm_no_operands.cpp    | 0
 .../test/{ => feature-tests}/inline-asm/asm_no_output.cpp | 0
 sycl/test/{ => feature-tests}/inline-asm/asm_plus_mod.cpp | 0
 .../{ => feature-tests}/inline-asm/include/asmhelper.h    | 0
 .../{ => feature-tests}/inline-asm/letter_example.cpp     | 0
 .../{ => feature-tests}/inline-asm/malloc_shared_32.cpp   | 0
 .../inline-asm/malloc_shared_in_out_dif.cpp               | 0
 .../inline-asm/malloc_shared_no_input.cpp                 | 0
 sycl/test/lit.cfg.py                                      | 3 ++-
 24 files changed, 10 insertions(+), 1 deletion(-)
 rename sycl/test/{ => feature-tests}/inline-asm/asm_16_empty.cpp (100%)
 rename sycl/test/{ => feature-tests}/inline-asm/asm_16_matrix_mult.cpp (100%)
 rename sycl/test/{ => feature-tests}/inline-asm/asm_16_no_input_int.cpp (100%)
 rename sycl/test/{ => feature-tests}/inline-asm/asm_16_no_opts.cpp (100%)
 rename sycl/test/{ => feature-tests}/inline-asm/asm_8_empty.cpp (100%)
 rename sycl/test/{ => feature-tests}/inline-asm/asm_8_no_input_int.cpp (100%)
 rename sycl/test/{ => feature-tests}/inline-asm/asm_arbitrary_ops_order.cpp (100%)
 rename sycl/test/{ => feature-tests}/inline-asm/asm_decl_in_scope.cpp (100%)
 rename sycl/test/{ => feature-tests}/inline-asm/asm_float_add.cpp (100%)
 rename sycl/test/{ => feature-tests}/inline-asm/asm_float_imm_arg.cpp (100%)
 rename sycl/test/{ => feature-tests}/inline-asm/asm_float_neg.cpp (100%)
 rename sycl/test/{ => feature-tests}/inline-asm/asm_imm_arg.cpp (100%)
 rename sycl/test/{ => feature-tests}/inline-asm/asm_mul.cpp (100%)
 rename sycl/test/{ => feature-tests}/inline-asm/asm_multiple_instructions.cpp (100%)
 rename sycl/test/{ => feature-tests}/inline-asm/asm_no_operands.cpp (100%)
 rename sycl/test/{ => feature-tests}/inline-asm/asm_no_output.cpp (100%)
 rename sycl/test/{ => feature-tests}/inline-asm/asm_plus_mod.cpp (100%)
 rename sycl/test/{ => feature-tests}/inline-asm/include/asmhelper.h (100%)
 rename sycl/test/{ => feature-tests}/inline-asm/letter_example.cpp (100%)
 rename sycl/test/{ => feature-tests}/inline-asm/malloc_shared_32.cpp (100%)
 rename sycl/test/{ => feature-tests}/inline-asm/malloc_shared_in_out_dif.cpp (100%)
 rename sycl/test/{ => feature-tests}/inline-asm/malloc_shared_no_input.cpp (100%)

diff --git a/sycl/test/CMakeLists.txt b/sycl/test/CMakeLists.txt
index a152c4b34137..0fc16451d8fc 100644
--- a/sycl/test/CMakeLists.txt
+++ b/sycl/test/CMakeLists.txt
@@ -61,6 +61,14 @@ add_lit_testsuites(SYCL-DEPLOY ${CMAKE_CURRENT_SOURCE_DIR}
   EXCLUDE_FROM_CHECK_ALL
   )
 
+add_lit_target(check-sycl-inline-asm
+  "Running lit suite ${CMAKE_CURRENT_SOURCE_DIR}/feature-tests/inline-asm"
+  "feature-tests/inline-asm"
+  ARGS ${RT_TEST_ARGS}
+  PARAMS "SYCL_BE=PI_OPENCL"
+  DEPENDS ${SYCL_TEST_DEPS}
+  )
+
 if(SYCL_BUILD_PI_CUDA)
   add_lit_testsuite(check-sycl-cuda "Running the SYCL regression tests for CUDA"
     ${CMAKE_CURRENT_BINARY_DIR}
diff --git a/sycl/test/inline-asm/asm_16_empty.cpp b/sycl/test/feature-tests/inline-asm/asm_16_empty.cpp
similarity index 100%
rename from sycl/test/inline-asm/asm_16_empty.cpp
rename to sycl/test/feature-tests/inline-asm/asm_16_empty.cpp
diff --git a/sycl/test/inline-asm/asm_16_matrix_mult.cpp b/sycl/test/feature-tests/inline-asm/asm_16_matrix_mult.cpp
similarity index 100%
rename from sycl/test/inline-asm/asm_16_matrix_mult.cpp
rename to sycl/test/feature-tests/inline-asm/asm_16_matrix_mult.cpp
diff --git a/sycl/test/inline-asm/asm_16_no_input_int.cpp b/sycl/test/feature-tests/inline-asm/asm_16_no_input_int.cpp
similarity index 100%
rename from sycl/test/inline-asm/asm_16_no_input_int.cpp
rename to sycl/test/feature-tests/inline-asm/asm_16_no_input_int.cpp
diff --git a/sycl/test/inline-asm/asm_16_no_opts.cpp b/sycl/test/feature-tests/inline-asm/asm_16_no_opts.cpp
similarity index 100%
rename from sycl/test/inline-asm/asm_16_no_opts.cpp
rename to sycl/test/feature-tests/inline-asm/asm_16_no_opts.cpp
diff --git a/sycl/test/inline-asm/asm_8_empty.cpp b/sycl/test/feature-tests/inline-asm/asm_8_empty.cpp
similarity index 100%
rename from sycl/test/inline-asm/asm_8_empty.cpp
rename to sycl/test/feature-tests/inline-asm/asm_8_empty.cpp
diff --git a/sycl/test/inline-asm/asm_8_no_input_int.cpp b/sycl/test/feature-tests/inline-asm/asm_8_no_input_int.cpp
similarity index 100%
rename from sycl/test/inline-asm/asm_8_no_input_int.cpp
rename to sycl/test/feature-tests/inline-asm/asm_8_no_input_int.cpp
diff --git a/sycl/test/inline-asm/asm_arbitrary_ops_order.cpp b/sycl/test/feature-tests/inline-asm/asm_arbitrary_ops_order.cpp
similarity index 100%
rename from sycl/test/inline-asm/asm_arbitrary_ops_order.cpp
rename to sycl/test/feature-tests/inline-asm/asm_arbitrary_ops_order.cpp
diff --git a/sycl/test/inline-asm/asm_decl_in_scope.cpp b/sycl/test/feature-tests/inline-asm/asm_decl_in_scope.cpp
similarity index 100%
rename from sycl/test/inline-asm/asm_decl_in_scope.cpp
rename to sycl/test/feature-tests/inline-asm/asm_decl_in_scope.cpp
diff --git a/sycl/test/inline-asm/asm_float_add.cpp b/sycl/test/feature-tests/inline-asm/asm_float_add.cpp
similarity index 100%
rename from sycl/test/inline-asm/asm_float_add.cpp
rename to sycl/test/feature-tests/inline-asm/asm_float_add.cpp
diff --git a/sycl/test/inline-asm/asm_float_imm_arg.cpp b/sycl/test/feature-tests/inline-asm/asm_float_imm_arg.cpp
similarity index 100%
rename from sycl/test/inline-asm/asm_float_imm_arg.cpp
rename to sycl/test/feature-tests/inline-asm/asm_float_imm_arg.cpp
diff --git a/sycl/test/inline-asm/asm_float_neg.cpp b/sycl/test/feature-tests/inline-asm/asm_float_neg.cpp
similarity index 100%
rename from sycl/test/inline-asm/asm_float_neg.cpp
rename to sycl/test/feature-tests/inline-asm/asm_float_neg.cpp
diff --git a/sycl/test/inline-asm/asm_imm_arg.cpp b/sycl/test/feature-tests/inline-asm/asm_imm_arg.cpp
similarity index 100%
rename from sycl/test/inline-asm/asm_imm_arg.cpp
rename to sycl/test/feature-tests/inline-asm/asm_imm_arg.cpp
diff --git a/sycl/test/inline-asm/asm_mul.cpp b/sycl/test/feature-tests/inline-asm/asm_mul.cpp
similarity index 100%
rename from sycl/test/inline-asm/asm_mul.cpp
rename to sycl/test/feature-tests/inline-asm/asm_mul.cpp
diff --git a/sycl/test/inline-asm/asm_multiple_instructions.cpp b/sycl/test/feature-tests/inline-asm/asm_multiple_instructions.cpp
similarity index 100%
rename from sycl/test/inline-asm/asm_multiple_instructions.cpp
rename to sycl/test/feature-tests/inline-asm/asm_multiple_instructions.cpp
diff --git a/sycl/test/inline-asm/asm_no_operands.cpp b/sycl/test/feature-tests/inline-asm/asm_no_operands.cpp
similarity index 100%
rename from sycl/test/inline-asm/asm_no_operands.cpp
rename to sycl/test/feature-tests/inline-asm/asm_no_operands.cpp
diff --git a/sycl/test/inline-asm/asm_no_output.cpp b/sycl/test/feature-tests/inline-asm/asm_no_output.cpp
similarity index 100%
rename from sycl/test/inline-asm/asm_no_output.cpp
rename to sycl/test/feature-tests/inline-asm/asm_no_output.cpp
diff --git a/sycl/test/inline-asm/asm_plus_mod.cpp b/sycl/test/feature-tests/inline-asm/asm_plus_mod.cpp
similarity index 100%
rename from sycl/test/inline-asm/asm_plus_mod.cpp
rename to sycl/test/feature-tests/inline-asm/asm_plus_mod.cpp
diff --git a/sycl/test/inline-asm/include/asmhelper.h b/sycl/test/feature-tests/inline-asm/include/asmhelper.h
similarity index 100%
rename from sycl/test/inline-asm/include/asmhelper.h
rename to sycl/test/feature-tests/inline-asm/include/asmhelper.h
diff --git a/sycl/test/inline-asm/letter_example.cpp b/sycl/test/feature-tests/inline-asm/letter_example.cpp
similarity index 100%
rename from sycl/test/inline-asm/letter_example.cpp
rename to sycl/test/feature-tests/inline-asm/letter_example.cpp
diff --git a/sycl/test/inline-asm/malloc_shared_32.cpp b/sycl/test/feature-tests/inline-asm/malloc_shared_32.cpp
similarity index 100%
rename from sycl/test/inline-asm/malloc_shared_32.cpp
rename to sycl/test/feature-tests/inline-asm/malloc_shared_32.cpp
diff --git a/sycl/test/inline-asm/malloc_shared_in_out_dif.cpp b/sycl/test/feature-tests/inline-asm/malloc_shared_in_out_dif.cpp
similarity index 100%
rename from sycl/test/inline-asm/malloc_shared_in_out_dif.cpp
rename to sycl/test/feature-tests/inline-asm/malloc_shared_in_out_dif.cpp
diff --git a/sycl/test/inline-asm/malloc_shared_no_input.cpp b/sycl/test/feature-tests/inline-asm/malloc_shared_no_input.cpp
similarity index 100%
rename from sycl/test/inline-asm/malloc_shared_no_input.cpp
rename to sycl/test/feature-tests/inline-asm/malloc_shared_no_input.cpp
diff --git a/sycl/test/lit.cfg.py b/sycl/test/lit.cfg.py
index e6ae419047b9..6285c44d2763 100644
--- a/sycl/test/lit.cfg.py
+++ b/sycl/test/lit.cfg.py
@@ -26,7 +26,8 @@
 # suffixes: A list of file extensions to treat as test files.
 config.suffixes = ['.c', '.cpp'] #add .spv. Currently not clear what to do with those
 
-config.excludes = ['Inputs']
+# feature tests are considered not so lightweight, so, they are excluded by default
+config.excludes = ['Inputs', 'feature-tests']
 
 # test_source_root: The root path where tests are located.
 config.test_source_root = os.path.dirname(__file__)

From aebe6a8759468229192e15518bdeddaca15e1d72 Mon Sep 17 00:00:00 2001
From: amochalo <anastasiya.mochalova@intel.com>
Date: Fri, 17 Apr 2020 15:39:50 +0300
Subject: [PATCH 10/10] Apply suggestion

Signed-off-by: amochalo <anastasiya.mochalova@intel.com>
---
 sycl/test/feature-tests/inline-asm/asm_imm_arg.cpp   | 4 ++--
 sycl/test/feature-tests/inline-asm/asm_no_output.cpp | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/sycl/test/feature-tests/inline-asm/asm_imm_arg.cpp b/sycl/test/feature-tests/inline-asm/asm_imm_arg.cpp
index ce9a3ad948a4..2dba04d1179b 100644
--- a/sycl/test/feature-tests/inline-asm/asm_imm_arg.cpp
+++ b/sycl/test/feature-tests/inline-asm/asm_imm_arg.cpp
@@ -45,9 +45,9 @@ int main() {
 
   auto &B = f.getOutputBufferData();
   for (int i = 0; i < DEFAULT_PROBLEM_SIZE; ++i) {
-    if (B[i] != input[i] * CONST_ARGUMENT) {
+    if (B[i] != input[i] + CONST_ARGUMENT) {
       std::cerr << "At index: " << i << ". ";
-      std::cerr << B[i] << " != " << input[i] * CONST_ARGUMENT << "\n";
+      std::cerr << B[i] << " != " << input[i] + CONST_ARGUMENT << "\n";
       return 1;
     }
   }
diff --git a/sycl/test/feature-tests/inline-asm/asm_no_output.cpp b/sycl/test/feature-tests/inline-asm/asm_no_output.cpp
index ab65346d1313..ff6c65d48b31 100644
--- a/sycl/test/feature-tests/inline-asm/asm_no_output.cpp
+++ b/sycl/test/feature-tests/inline-asm/asm_no_output.cpp
@@ -40,7 +40,7 @@ int main() {
   if (!launchInlineASMTest(f))
     return 0;
 
-  if (verify_all_the_same(f.getOutputBufferData(), 47))
+  if (verify_all_the_same(f.getOutputBufferData(), 0))
     return 0;
 
   return 1;