diff --git a/README.md b/README.md
index 21ad58965..958878fae 100644
--- a/README.md
+++ b/README.md
@@ -58,7 +58,15 @@ cmake --build . --target gc-check
 Notes:
  * `/PATH/TO/llvm-project/llvm-install` should be the install path of LLVM. If you installed LLVM elsewhere by `-DCMAKE_INSTALL_PREFIX` option when building LLVM, you need to change the path in `-DMLIR_DIR` accordingly.
  * The cmake option `-DLLVM_EXTERNAL_LIT` is for the tests of this project. It requires the `lit` tool to be installed in the system. You can install it via `pip install lit`. If you don't need to run the tests of this repo, you can omit this option in the command line.
- * If GPU components are on (`-DGC_USE_GPU=ON`), make sure the Level-zero runtime is installed in your system. Either install Level-zero runtime via system package managers (e.g. `apt`), or follow the instructions of [IMEX](https://github.com/intel/mlir-extensions).
+
+More notes if GPU components are on (`-DGC_USE_GPU=ON`):
+ * Make sure the OpenCL runtime is installed on your system. You can either
+   install it via the OS-provided package (on Ubuntu 22.04):
+```sh
+sudo apt install -y intel-opencl-icd opencl-c-headers
+```
+   or download and install the package from: https://github.com/intel/compute-runtime/releases
+ * The LLVM codebase needs to be patched to support XeGPU lowering (from IMEX). Please follow the instructions of [IMEX](https://github.com/intel/mlir-extensions) on patching LLVM.
 
 Graph Compiler supports the following build-time options.
diff --git a/cmake/imex.cmake b/cmake/imex.cmake
index 0a21d0181..8ed61e3b3 100644
--- a/cmake/imex.cmake
+++ b/cmake/imex.cmake
@@ -4,11 +4,11 @@ get_property(IMEX_INCLUDES GLOBAL PROPERTY IMEX_INCLUDES)
 if (NOT DEFINED IMEX_INCLUDES)
     include(functions)
     set(IMEX_CHECK_LLVM_VERSION ON)
-    set(IMEX_ENABLE_L0_RUNTIME 1)
+    set(IMEX_ENABLE_L0_RUNTIME 0)
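+    # The OpenCL runtime wrappers under lib/gc/ExecutionEngine/OpenCLRuntime
+    # now provide the GPU runtime, so IMEX's Level-Zero runtime is disabled.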
     # TODO: Change to main https://github.com/oneapi-src/oneDNN.git when all the
     # required functionality is merged.
     gc_fetch_content(imex 496b240093b5e132b60c5ee69878300fe69be300 https://github.com/Menooker/mlir-extensions
-        CMAKE_ARGS "-DMLIR_DIR=${MLIR_DIR};-DIMEX_CHECK_LLVM_VERSION=ON;-DIMEX_ENABLE_L0_RUNTIME=1"
+        CMAKE_ARGS "-DMLIR_DIR=${MLIR_DIR};-DIMEX_CHECK_LLVM_VERSION=ON;-DIMEX_ENABLE_L0_RUNTIME=0"
     )
 
     set(IMEX_INCLUDES
diff --git a/include/gc/Transforms/Passes.td b/include/gc/Transforms/Passes.td
index fc727abb3..79a62f028 100644
--- a/include/gc/Transforms/Passes.td
+++ b/include/gc/Transforms/Passes.td
@@ -32,7 +32,7 @@ def ConvertOneDNNGraphToLinalg : Pass<"convert-onednn-graph-to-linalg"> {
   ];
 }
 
-
+#ifdef GC_USE_GPU
 def LinalgToXeGPU : Pass<"linalg-to-xegpu", "func::FuncOp"> {
   let summary = "Convert linalg dialect to XeGPU dialect.";
   let description = [{
@@ -57,5 +57,6 @@ def LinalgToXeGPU : Pass<"linalg-to-xegpu", "func::FuncOp"> {
                            "DPAS register block sizes MxNxK">,
   ];
 }
+#endif
 
 #endif // GC_DIALECT_GC_PASSES
diff --git a/lib/gc/CAPI/CMakeLists.txt b/lib/gc/CAPI/CMakeLists.txt
index 1d2e7687e..f7738c064 100644
--- a/lib/gc/CAPI/CMakeLists.txt
+++ b/lib/gc/CAPI/CMakeLists.txt
@@ -1,10 +1,16 @@
+set(GC_ALL_LIBS
+  MLIROneDNNGraph
+  MLIRCPURuntimeDialect
+  GCPasses
+  MLIRCPURuntimeTransforms)
+
+if(GC_USE_GPU)
+  list(APPEND GC_ALL_LIBS GCGPUPasses)
+endif()
+
 add_mlir_public_c_api_library(GcCAPI
   Dialects.cpp
   Passes.cpp
   LINK_LIBS PUBLIC
-  MLIROneDNNGraph
-  MLIRCPURuntimeDialect
-  GCPasses
-  GCGPUPasses
-  MLIRCPURuntimeTransforms
+  ${GC_ALL_LIBS}
 )
\ No newline at end of file
diff --git a/lib/gc/ExecutionEngine/CMakeLists.txt b/lib/gc/ExecutionEngine/CMakeLists.txt
index ae0c1c8df..d5279b044 100644
--- a/lib/gc/ExecutionEngine/CMakeLists.txt
+++ b/lib/gc/ExecutionEngine/CMakeLists.txt
@@ -1,2 +1,5 @@
 add_subdirectory(CPURuntime)
-add_subdirectory(Driver)
\ No newline at end of file
+add_subdirectory(Driver)
+if(GC_USE_GPU)
+  add_subdirectory(OpenCLRuntime)
+endif()
\ No newline at end of file
diff --git a/lib/gc/ExecutionEngine/Driver/CMakeLists.txt b/lib/gc/ExecutionEngine/Driver/CMakeLists.txt
index d04dbbb4e..8742ef6e9 100644
--- a/lib/gc/ExecutionEngine/Driver/CMakeLists.txt
+++ b/lib/gc/ExecutionEngine/Driver/CMakeLists.txt
@@ -26,6 +26,11 @@ else()
   )
 endif()
 
+set(GC_PASSES GCPasses)
+if(GC_USE_GPU)
+  list(APPEND GC_PASSES GCGPUPasses)
+endif()
+
 add_mlir_library(GCJitWrapper
   Driver.cpp
 
@@ -35,8 +40,7 @@ add_mlir_library(GCJitWrapper
   LINK_LIBS PUBLIC
     ${MLIR_LINK_COMPONENTS}
     ${dialect_libs}
-    ${conversion_libs}
-    GCPasses
-    GCGPUPasses
+    ${conversion_libs}
+    ${GC_PASSES}
   )
 
diff --git a/lib/gc/ExecutionEngine/OpenCLRuntime/CMakeLists.txt b/lib/gc/ExecutionEngine/OpenCLRuntime/CMakeLists.txt
new file mode 100644
index 000000000..62163496f
--- /dev/null
+++ b/lib/gc/ExecutionEngine/OpenCLRuntime/CMakeLists.txt
@@ -0,0 +1,23 @@
+find_package(OpenCL REQUIRED)
+
+add_mlir_library(mlir_opencl_runtime
+    SHARED
+    OpenCLRuntimeWrappers.cpp
+
+    EXCLUDE_FROM_LIBMLIR
+  )
+
+include(CheckCXXCompilerFlag)
+check_cxx_compiler_flag("-frtti" CXX_HAS_FRTTI_FLAG)
+if(NOT CXX_HAS_FRTTI_FLAG)
+  message(FATAL_ERROR "CXX compiler does not accept flag -frtti")
+endif()
+target_compile_options(mlir_opencl_runtime PUBLIC -fexceptions -frtti)
+
+target_include_directories(mlir_opencl_runtime PRIVATE
+    ${MLIR_INCLUDE_DIRS}
+    ${OpenCL_INCLUDE_DIRS}
+  )
+
+message(STATUS "OpenCL Libraries: ${OpenCL_LIBRARIES}")
+target_link_libraries(mlir_opencl_runtime PUBLIC ${OpenCL_LIBRARIES})
diff --git a/lib/gc/ExecutionEngine/OpenCLRuntime/OpenCLRuntimeWrappers.cpp b/lib/gc/ExecutionEngine/OpenCLRuntime/OpenCLRuntimeWrappers.cpp
new file mode 100644
index 000000000..b78ca2233
--- /dev/null
+++ b/lib/gc/ExecutionEngine/OpenCLRuntime/OpenCLRuntimeWrappers.cpp
@@ -0,0 +1,468 @@
+//===-- OpenCLRuntimeWrappers.cpp - OpenCLRuntimeWrappers -------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#define CL_TARGET_OPENCL_VERSION 300
+#include <CL/cl.h>
+#include <CL/cl_ext.h>
+#include <cassert>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <stdexcept>
+#include <utility>
+#include <vector>
+
+#ifdef _WIN32
+#define OCL_RUNTIME_EXPORT __declspec(dllexport)
+#else
+#define OCL_RUNTIME_EXPORT
+#endif // _WIN32
+
+namespace {
+
+#define CL_SAFE_CALL2(a)                                                      \
+  do {                                                                        \
+    (a);                                                                      \
+    if (err != CL_SUCCESS) {                                                  \
+      fprintf(stderr, "FAIL: err=%d @ line=%d (%s)\n", err, __LINE__, (#a));  \
+      abort();                                                                \
+    }                                                                         \
+  } while (0)
+
+#define CL_SAFE_CALL(call)                                                    \
+  {                                                                           \
+    auto status = (call);                                                     \
+    if (status != CL_SUCCESS) {                                               \
+      fprintf(stderr, "CL error %d @ line=%d (%s)\n", status, __LINE__,       \
+              (#call));                                                       \
+      abort();                                                                \
+    }                                                                         \
+  }
+
+constexpr char DeviceMemAllocName[] = "clDeviceMemAllocINTEL";
+constexpr char SharedMemAllocName[] = "clSharedMemAllocINTEL";
+constexpr char MemBlockingFreeName[] = "clMemBlockingFreeINTEL";
+constexpr char SetKernelArgMemPointerName[] = "clSetKernelArgMemPointerINTEL";
+static constexpr char EnqueueMemcpyName[] = "clEnqueueMemcpyINTEL";
+
+void *queryCLExtFunc(cl_platform_id CurPlatform, const char *FuncName) {
+  void *ret = clGetExtensionFunctionAddressForPlatform(CurPlatform, FuncName);
+  if (!ret) {
+    fprintf(stderr, "Failed to resolve OpenCL extension function: %s\n",
+            FuncName);
+    fflush(stderr);
+    abort();
+  }
+  return ret;
+}
+
+void *queryCLExtFunc(cl_device_id dev, const char *FuncName) {
+  cl_platform_id CurPlatform;
+  CL_SAFE_CALL(clGetDeviceInfo(dev, CL_DEVICE_PLATFORM, sizeof(cl_platform_id),
+                               &CurPlatform, nullptr));
+  return queryCLExtFunc(CurPlatform, FuncName);
+}
+
+struct CLExtTable {
+  clDeviceMemAllocINTEL_fn allocDev;
+  clSharedMemAllocINTEL_fn allocShared;
+  clMemBlockingFreeINTEL_fn blockingFree;
+  clSetKernelArgMemPointerINTEL_fn setKernelArgMemPtr;
+  clEnqueueMemcpyINTEL_fn enqueueMemcpy;
+  CLExtTable() = default;
+  CLExtTable(cl_platform_id plat) {
+    allocDev =
+        (clDeviceMemAllocINTEL_fn)queryCLExtFunc(plat, DeviceMemAllocName);
+    allocShared =
+        (clSharedMemAllocINTEL_fn)queryCLExtFunc(plat, SharedMemAllocName);
+    blockingFree =
+        (clMemBlockingFreeINTEL_fn)queryCLExtFunc(plat, MemBlockingFreeName);
+    setKernelArgMemPtr = (clSetKernelArgMemPointerINTEL_fn)queryCLExtFunc(
+        plat, SetKernelArgMemPointerName);
+    enqueueMemcpy =
+        (clEnqueueMemcpyINTEL_fn)queryCLExtFunc(plat, EnqueueMemcpyName);
+  }
+};
+
+struct CLExtTableCache {
+  cl_platform_id platform;
+  CLExtTable table;
+  CLExtTableCache(cl_platform_id CurPlatform)
+      : platform{CurPlatform}, table{CurPlatform} {}
+  static CLExtTable *get(cl_device_id dev) {
+    cl_platform_id CurPlatform;
+    CL_SAFE_CALL(clGetDeviceInfo(dev, CL_DEVICE_PLATFORM,
+                                 sizeof(cl_platform_id), &CurPlatform,
+                                 nullptr));
+    static CLExtTableCache v{CurPlatform};
+    if (v.platform == CurPlatform) {
+      return &v.table;
+    }
+    return nullptr;
+  }
+};
+
+struct ParamDesc {
+  void *data;
+  size_t size;
+
+  bool operator==(const ParamDesc &rhs) const {
+    return data == rhs.data && size == rhs.size;
+  }
+
+  bool operator!=(const ParamDesc &rhs) const { return !(*this == rhs); }
+};
+
+template <typename T> size_t countUntil(T *ptr, T &&elem) {
+  assert(ptr);
+  auto curr = ptr;
+  while (*curr != elem) {
+    ++curr;
+  }
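+  // `curr` now points at the sentinel; its distance from `ptr` is the number
+  // of elements preceding it.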
+  return static_cast<size_t>(curr - ptr);
+}
+} // namespace
+
+static cl_device_id getDevice(cl_device_type *devtype) {
+  cl_platform_id platform; // OpenCL platform
+  cl_device_id device;     // device ID
+  CL_SAFE_CALL(clGetPlatformIDs(1, &platform, NULL));
+  CL_SAFE_CALL(clGetDeviceIDs(platform, *devtype, 1, &device, NULL));
+  return device;
+}
+
+struct GPUCLQUEUE {
+  cl_device_id device_ = nullptr;
+  cl_context context_ = nullptr;
+  cl_command_queue queue_ = nullptr;
+  bool context_owned_ = false;
+  bool queue_owned_ = false;
+  CLExtTable *ext_table_ = nullptr;
+  std::vector<cl_program> programs_;
+  std::vector<cl_kernel> kernels_;
+
+  GPUCLQUEUE(cl_device_type *device, cl_context context,
+             cl_command_queue queue) {
+    cl_device_type defaultdev = CL_DEVICE_TYPE_GPU;
+    if (!device) {
+      device = &defaultdev;
+    }
+    device_ = getDevice(device);
+    init_context(context, queue, device_);
+    ext_table_ = CLExtTableCache::get(device_);
+  }
+  GPUCLQUEUE(cl_device_id device, cl_context context, cl_command_queue queue) {
+    if (!device) {
+      cl_device_type defaultdev = CL_DEVICE_TYPE_GPU;
+      device = getDevice(&defaultdev);
+    }
+    device_ = device;
+    init_context(context, queue, device_);
+    ext_table_ = CLExtTableCache::get(device_);
+  }
+  ~GPUCLQUEUE() {
+    for (auto p : kernels_) {
+      clReleaseKernel(p);
+    }
+    for (auto p : programs_) {
+      clReleaseProgram(p);
+    }
+    if (queue_ && queue_owned_)
+      clReleaseCommandQueue(queue_);
+    if (context_ && context_owned_)
+      clReleaseContext(context_);
+  }
+
+private:
+  void init_context(cl_context context, cl_command_queue queue,
+                    cl_device_id device) {
+    if (queue) {
+      if (!context) {
+        throw std::runtime_error(
+            "Cannot create QUEUE wrapper with queue and without context");
+      }
+      // Both handles are borrowed from the caller; the destructor must not
+      // release them.
+      queue_ = queue;
+      queue_owned_ = false;
+      context_ = context;
+      context_owned_ = false;
+      return;
+    }
+    cl_int err;
+    if (!context) {
+      CL_SAFE_CALL2(context_ =
+                        clCreateContext(NULL, 1, &device, NULL, NULL, &err));
+      context_owned_ = true;
+    } else {
+      context_ = context;
+    }
+    CL_SAFE_CALL2(queue_ = clCreateCommandQueueWithProperties(context_, device,
+                                                              0, &err));
+    queue_owned_ = true;
+  }
+}; // end of GPUCLQUEUE
+
+static void *allocDeviceMemory(GPUCLQUEUE *queue, size_t size,
+                               size_t alignment, bool isShared) {
+  void *memPtr = nullptr;
+  cl_int err;
+  if (isShared) {
+    auto func = queue->ext_table_ ? queue->ext_table_->allocShared
+                                  : (clSharedMemAllocINTEL_fn)queryCLExtFunc(
+                                        queue->device_, SharedMemAllocName);
+    CL_SAFE_CALL2(memPtr = func(queue->context_, queue->device_, nullptr, size,
+                                alignment, &err));
+  } else {
+    auto func = queue->ext_table_ ? queue->ext_table_->allocDev
+                                  : (clDeviceMemAllocINTEL_fn)queryCLExtFunc(
+                                        queue->device_, DeviceMemAllocName);
+    CL_SAFE_CALL2(memPtr = func(queue->context_, queue->device_, nullptr, size,
+                                alignment, &err));
+  }
+  return memPtr;
+}
+
+static void deallocDeviceMemory(GPUCLQUEUE *queue, void *ptr) {
+  auto func = queue->ext_table_
+                  ? queue->ext_table_->blockingFree
+                  : (clMemBlockingFreeINTEL_fn)queryCLExtFunc(
+                        queue->device_, MemBlockingFreeName);
+  CL_SAFE_CALL(func(queue->context_, ptr));
+}
+
+static cl_program loadModule(GPUCLQUEUE *queue, const unsigned char *data,
+                             size_t dataSize, bool takeOwnership) {
+  assert(data);
+  cl_int errNum = 0;
+  const unsigned char *codes[1] = {data};
+  size_t sizes[1] = {dataSize};
+  cl_program program;
+  cl_int err;
+  // The 6th argument receives the per-binary status; the last one is the
+  // error code, which is what CL_SAFE_CALL2 checks.
+  CL_SAFE_CALL2(program = clCreateProgramWithBinary(queue->context_, 1,
+                                                    &queue->device_, sizes,
+                                                    codes, &errNum, &err));
+  const char *build_flags = "-cl-kernel-arg-info -x spir";
+  // enable large register file if needed
+  if (getenv("IMEX_ENABLE_LARGE_REG_FILE")) {
+    build_flags =
+        "-vc-codegen -doubleGRF -Xfinalizer -noLocalSplit -Xfinalizer "
+        "-DPASTokenReduction -Xfinalizer -SWSBDepReduction -Xfinalizer "
+        "'-printregusage -enableBCR' -cl-kernel-arg-info -x spir";
+  }
+  CL_SAFE_CALL(clBuildProgram(program, 0, NULL, build_flags, NULL, NULL));
+  if (takeOwnership)
+    queue->programs_.push_back(program);
+  return program;
+}
+
+static cl_kernel getKernel(GPUCLQUEUE *queue, cl_program program,
+                           const char *name) {
+  cl_kernel kernel;
+  cl_int err;
+  CL_SAFE_CALL2(kernel = clCreateKernel(program, name, &err));
+  cl_bool TrueVal = CL_TRUE;
+  CL_SAFE_CALL(clSetKernelExecInfo(
+      kernel, CL_KERNEL_EXEC_INFO_INDIRECT_HOST_ACCESS_INTEL, sizeof(cl_bool),
+      &TrueVal));
+  CL_SAFE_CALL(clSetKernelExecInfo(
+      kernel, CL_KERNEL_EXEC_INFO_INDIRECT_DEVICE_ACCESS_INTEL,
+      sizeof(cl_bool), &TrueVal));
+  CL_SAFE_CALL(clSetKernelExecInfo(
+      kernel, CL_KERNEL_EXEC_INFO_INDIRECT_SHARED_ACCESS_INTEL,
+      sizeof(cl_bool), &TrueVal));
+  queue->kernels_.push_back(kernel);
+  return kernel;
+}
+
+template <typename NumArgsFuncT, typename GetParamFuncT>
+static void launchKernel(GPUCLQUEUE *queue, cl_kernel kernel, size_t gridX,
+                         size_t gridY, size_t gridZ, size_t blockX,
+                         size_t blockY, size_t blockZ, size_t sharedMemBytes,
+                         NumArgsFuncT &&fnGetNumArgs,
+                         GetParamFuncT &&fnGetParamFunc) {
+  auto clSetKernelArgMemPointerINTEL =
+      queue->ext_table_
+          ? queue->ext_table_->setKernelArgMemPtr
+          : (clSetKernelArgMemPointerINTEL_fn)queryCLExtFunc(
+                queue->device_, SetKernelArgMemPointerName);
+  auto paramsCount = fnGetNumArgs();
+  for (size_t i = 0; i < paramsCount; i++) {
+    cl_kernel_arg_address_qualifier name;
+    size_t nameSize = sizeof(name);
+    // we could do better here by caching the arg info for the kernel
+    CL_SAFE_CALL(clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_ADDRESS_QUALIFIER,
+                                    sizeof(name), &name, &nameSize));
+    auto [paramData, paramSize] = fnGetParamFunc(i);
+    if (paramSize == sizeof(void *) && name == CL_KERNEL_ARG_ADDRESS_GLOBAL) {
+      // pass the value of the pointer instead of a pointer to the pointer
+      CL_SAFE_CALL(
+          clSetKernelArgMemPointerINTEL(kernel, i, *(void **)paramData));
+    } else {
+      CL_SAFE_CALL(clSetKernelArg(kernel, i, paramSize, paramData));
+    }
+  }
+  if (sharedMemBytes) {
+    CL_SAFE_CALL(clSetKernelArg(kernel, paramsCount, sharedMemBytes, nullptr));
+  }
+  size_t globalSize[3] = {gridX * blockX, gridY * blockY, gridZ * blockZ};
+  size_t localSize[3] = {blockX, blockY, blockZ};
+  CL_SAFE_CALL(clEnqueueNDRangeKernel(queue->queue_, kernel, 3, NULL,
+                                      globalSize, localSize, 0, NULL, NULL));
+}
+
+static GPUCLQUEUE *getDefaultQueue() {
+  static GPUCLQUEUE defaultq(static_cast<cl_device_type *>(nullptr), nullptr,
+                             nullptr);
+  return &defaultq;
+}
+
+// Wrappers
+
+extern "C" OCL_RUNTIME_EXPORT GPUCLQUEUE *gpuCreateStream(void *device,
+                                                          void *context) {
+  // todo: this is a workaround for the issue of gpux generating multiple
+  // streams
+  if (!device && !context) {
+    return getDefaultQueue();
+  }
+  return new GPUCLQUEUE(reinterpret_cast<cl_device_id>(device),
+                        reinterpret_cast<cl_context>(context), nullptr);
+}
+
+extern "C" OCL_RUNTIME_EXPORT void gpuStreamDestroy(GPUCLQUEUE *queue) {
+  // todo: this is a workaround for the issue of gpux generating multiple
+  // streams; uncomment the line below to release the queue
+  // delete queue;
+}
+
+extern "C" OCL_RUNTIME_EXPORT void *
+gpuMemAlloc(GPUCLQUEUE *queue, size_t size, size_t alignment, bool isShared) {
+  if (queue) {
+    return allocDeviceMemory(queue, size, alignment, isShared);
+  }
+  return nullptr;
+}
+
+extern "C" OCL_RUNTIME_EXPORT void gpuMemFree(GPUCLQUEUE *queue, void *ptr) {
+  if (queue && ptr) {
+    deallocDeviceMemory(queue, ptr);
+  }
+}
+
+extern "C" OCL_RUNTIME_EXPORT cl_program
+gpuModuleLoad(GPUCLQUEUE *queue, const unsigned char *data, size_t dataSize) {
+  if (queue) {
+    return loadModule(queue, data, dataSize, false);
+  }
+  return nullptr;
+}
+
+extern "C" OCL_RUNTIME_EXPORT cl_kernel gpuKernelGet(GPUCLQUEUE *queue,
+                                                     cl_program module,
+                                                     const char *name) {
+  if (queue) {
+    return getKernel(queue, module, name);
+  }
+  return nullptr;
+}
+
+extern "C" OCL_RUNTIME_EXPORT void
+gpuLaunchKernel(GPUCLQUEUE *queue, cl_kernel kernel, size_t gridX, size_t gridY,
+                size_t gridZ, size_t blockX, size_t blockY, size_t blockZ,
+                size_t sharedMemBytes, void *params) {
+  if (queue) {
+    auto typedParams = static_cast<const ParamDesc *>(params);
+    launchKernel(
+        queue, kernel, gridX, gridY, gridZ, blockX, blockY, blockZ,
+        sharedMemBytes,
+        [&]() {
+          // The assumption is, if there is a param for the shared local
+          // memory, then that will always be the last argument.
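+          // The ParamDesc array is terminated by a {nullptr, 0} sentinel, so
+          // scanning up to the sentinel yields the argument count.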
+          auto paramsCount = countUntil(typedParams, ParamDesc{nullptr, 0});
+          if (sharedMemBytes) {
+            paramsCount = paramsCount - 1;
+          }
+          return paramsCount;
+        },
+        [&](size_t i) -> const ParamDesc & { return typedParams[i]; });
+  }
+}
+
+extern "C" OCL_RUNTIME_EXPORT void gpuWait(GPUCLQUEUE *queue) {
+  if (queue) {
+    CL_SAFE_CALL(clFinish(queue->queue_));
+  }
+}
+
+////////////////////////////////////////////////////////////////
+// Here starts the upstream OCL wrappers
+////////////////////////////////////////////////////////////////
+
+// a silly workaround for mgpuModuleLoad: OCL needs a context and device to
+// load a module, so we remember the queue of the last call to any mgpu* API
+static thread_local GPUCLQUEUE *lastQueue;
+extern "C" OCL_RUNTIME_EXPORT GPUCLQUEUE *mgpuStreamCreate() {
+  auto ret =
+      new GPUCLQUEUE(static_cast<cl_device_type *>(nullptr), nullptr, nullptr);
+  lastQueue = ret;
+  return ret;
+}
+
+extern "C" OCL_RUNTIME_EXPORT void mgpuStreamDestroy(GPUCLQUEUE *queue) {
+  lastQueue = nullptr;
+  delete queue;
+}
+
+extern "C" OCL_RUNTIME_EXPORT void *
+mgpuMemAlloc(uint64_t size, GPUCLQUEUE *queue, bool isShared) {
+  lastQueue = queue;
+  return allocDeviceMemory(queue, size, /*alignment*/ 64, isShared);
+}
+
+extern "C" OCL_RUNTIME_EXPORT void mgpuMemFree(void *ptr, GPUCLQUEUE *queue) {
+  lastQueue = queue;
+  if (ptr) {
+    deallocDeviceMemory(queue, ptr);
+  }
+}
+
+// mgpuModuleLoad and mgpuModuleGetFunction do not take a queue parameter, but
+// the OCL APIs require one. We implicitly use the queue of the last mgpu* API
+// call on the current thread for these functions. This is ugly and
+// error-prone; we might need another workaround.
+extern "C" OCL_RUNTIME_EXPORT cl_program mgpuModuleLoad(const void *data,
+                                                        size_t gpuBlobSize) {
+  return loadModule(lastQueue, (const unsigned char *)data, gpuBlobSize,
+                    false);
+}
+
+extern "C" OCL_RUNTIME_EXPORT cl_kernel
+mgpuModuleGetFunction(cl_program module, const char *name) {
+  // we need to push the kernel to lastQueue to avoid a cl_kernel resource
+  // leak
+  return getKernel(lastQueue, module, name);
+}
+
+extern "C" OCL_RUNTIME_EXPORT void mgpuModuleUnload(cl_program module) {
+  CL_SAFE_CALL(clReleaseProgram(module));
+}
+
+extern "C" OCL_RUNTIME_EXPORT void
+mgpuLaunchKernel(cl_kernel kernel, size_t gridX, size_t gridY, size_t gridZ,
+                 size_t blockX, size_t blockY, size_t blockZ,
+                 size_t sharedMemBytes, GPUCLQUEUE *queue, void **params,
+                 void ** /*extra*/, size_t paramsCount) {
+  launchKernel(
+      queue, kernel, gridX, gridY, gridZ, blockX, blockY, blockZ,
+      sharedMemBytes,
+      [&]() {
+        // todo (yijie): do we need to handle shared mem? If there is dynamic
+        // shared mem required, which value should paramsCount be?
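+        // Unlike gpuLaunchKernel above, paramsCount is supplied explicitly by
+        // the caller here, so no sentinel scan is needed.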
+        return paramsCount;
+      },
+      [&](size_t i) {
+        // todo (yijie): assuming all parameters are passed with pointer size
+        return std::make_pair(params[i], sizeof(void *));
+      });
+}
+
+extern "C" OCL_RUNTIME_EXPORT void mgpuStreamSynchronize(GPUCLQUEUE *queue) {
+  CL_SAFE_CALL(clFinish(queue->queue_));
+}
diff --git a/lib/gc/Transforms/CMakeLists.txt b/lib/gc/Transforms/CMakeLists.txt
index 1b4f2cb73..f60c8cec2 100644
--- a/lib/gc/Transforms/CMakeLists.txt
+++ b/lib/gc/Transforms/CMakeLists.txt
@@ -28,4 +28,6 @@ add_mlir_library(GCPasses
 )
 
 set_property(GLOBAL APPEND PROPERTY GC_PASS_LIBS GCPasses)
-add_subdirectory(GPU)
+if(GC_USE_GPU)
+  add_subdirectory(GPU)
+endif()
diff --git a/lib/gc/Transforms/GPU/LinalgToXeGPU.cpp b/lib/gc/Transforms/GPU/LinalgToXeGPU.cpp
index 7e9cf10d1..ea886631b 100644
--- a/lib/gc/Transforms/GPU/LinalgToXeGPU.cpp
+++ b/lib/gc/Transforms/GPU/LinalgToXeGPU.cpp
@@ -732,9 +732,9 @@ loadNdDescTiles(PatternRewriter &rewriter, Location loc, ValueRange loadTiles,
   VectorType vecLoadType =
       VectorType::get(tileType.getShape(), tileType.getElementType());
-  IntegerAttr vnniAxisAttr = nullptr;
+  UnitAttr vnniAxisAttr = nullptr;
   if (vnniConf) {
-    vnniAxisAttr = IntegerAttr::get(rewriter.getI64Type(), vnniConf->vnniAxis);
+    vnniAxisAttr = UnitAttr::get(rewriter.getContext());
     vecLoadType = getVnniVector(tileType.getShape(), tileType.getElementType(),
                                 *vnniConf);
   }
@@ -742,7 +742,7 @@ loadNdDescTiles(PatternRewriter &rewriter, Location loc, ValueRange loadTiles,
   SmallVector<Value> loadVec;
   for (auto tile : loadTiles) {
     auto loadOp = rewriter.create<xegpu::LoadNdOp>(
-        loc, vecLoadType, tile, vnniAxisAttr, transpose,
+        loc, vecLoadType, tile, vnniAxisAttr, transpose, nullptr,
         /*l1_hint=*/hint,
         /*l2_hint=*/hint, /*l3_hint=*/hint);
     loadVec.push_back(loadOp);
diff --git a/src/gc-opt/CMakeLists.txt b/src/gc-opt/CMakeLists.txt
index af2125870..f127653de 100644
--- a/src/gc-opt/CMakeLists.txt
+++ b/src/gc-opt/CMakeLists.txt
@@ -33,15 +33,15 @@ set(gc_opt_libs
         ${dialect_libs}
         ${conversion_libs}
         ${MLIR_LINK_COMPONENTS}
-        GCPasses
-        GCGPUPasses)
+        GCPasses)
 if(GC_USE_GPU)
     add_definitions(-DGC_USE_GPU=1)
     get_property(IMEX_INCLUDES GLOBAL PROPERTY IMEX_INCLUDES)
     include_directories(${IMEX_INCLUDES})
     list(APPEND gc_opt_libs IMEXGPUXDialect IMEXXeTileDialect IMEXRegionDialect IMEXRegionTransforms
-        IMEXTransforms IMEXGPUToGPUX IMEXGPUToSPIRV IMEXGPUXToLLVM IMEXXeGPUToVC IMEXXeTileToXeGPU IMEXUtil)
+        IMEXTransforms IMEXGPUToGPUX IMEXGPUToSPIRV IMEXGPUXToLLVM IMEXXeGPUToVC IMEXXeTileToXeGPU IMEXUtil
+        GCGPUPasses)
 endif()
 if(GC_MLIR_CXX_FLAGS)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GC_MLIR_CXX_FLAGS}")
diff --git a/test/mlir/test/CMakeLists.txt b/test/mlir/test/CMakeLists.txt
index 2a93c91e5..7ac5a89c6 100644
--- a/test/mlir/test/CMakeLists.txt
+++ b/test/mlir/test/CMakeLists.txt
@@ -25,7 +25,7 @@ set(GC_OPT_TEST_DEPENDS
 
 if(GC_USE_GPU)
     include(imex)
-    list(APPEND GC_OPT_TEST_DEPENDS level-zero-runtime)
+    list(APPEND GC_OPT_TEST_DEPENDS mlir_opencl_runtime)
 endif()
 
 if(GC_ENABLE_BINDINGS_PYTHON)
diff --git a/test/mlir/test/gc/Transforms/GPU/lit.local.cfg b/test/mlir/test/gc/Transforms/GPU/lit.local.cfg
new file mode 100644
index 000000000..f086e9be8
--- /dev/null
+++ b/test/mlir/test/gc/Transforms/GPU/lit.local.cfg
@@ -0,0 +1,2 @@
+if not config.gc_use_gpu:
+    config.unsupported = True
\ No newline at end of file
diff --git a/test/mlir/test/gc/gpu-runner/mlp.mlir b/test/mlir/test/gc/gpu-runner/mlp.mlir
index 4fb88983c..29dad1c8a 100644
--- a/test/mlir/test/gc/gpu-runner/mlp.mlir
+++ b/test/mlir/test/gc/gpu-runner/mlp.mlir
@@ -1,4 +1,4 @@
-// RUN: gc-opt %s --pass-pipeline='builtin.module(convert-tensor-to-linalg,func.func(empty-tensor-to-alloc-tensor),one-shot-bufferize{unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries},func.func(convert-linalg-to-parallel-loops,imex-add-outer-parallel-loop,gpu-map-parallel-loops,convert-parallel-loops-to-gpu),func.func(insert-gpu-allocs{client-api=opencl}),canonicalize,normalize-memrefs,func.func(lower-affine),gpu-kernel-outlining,canonicalize,cse,set-spirv-capabilities{client-api=opencl},gpu.module(set-spirv-abi-attrs{client-api=opencl}),canonicalize,fold-memref-alias-ops,imex-convert-gpu-to-spirv,spirv.module(spirv-lower-abi-attrs,spirv-update-vce),func.func(llvm-request-c-wrappers),serialize-spirv,convert-gpu-to-gpux,convert-func-to-llvm,convert-math-to-llvm,convert-gpux-to-llvm,expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' | gc-cpu-runner -e main -entry-point-result=void --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%levelzero_runtime | FileCheck %s
+// RUN: gc-opt %s --pass-pipeline='builtin.module(convert-tensor-to-linalg,func.func(empty-tensor-to-alloc-tensor),one-shot-bufferize{unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries},func.func(convert-linalg-to-parallel-loops,imex-add-outer-parallel-loop,gpu-map-parallel-loops,convert-parallel-loops-to-gpu),func.func(insert-gpu-allocs{client-api=opencl}),canonicalize,normalize-memrefs,func.func(lower-affine),gpu-kernel-outlining,canonicalize,cse,set-spirv-capabilities{client-api=opencl},gpu.module(set-spirv-abi-attrs{client-api=opencl}),canonicalize,fold-memref-alias-ops,imex-convert-gpu-to-spirv,spirv.module(spirv-lower-abi-attrs,spirv-update-vce),func.func(llvm-request-c-wrappers),serialize-spirv,convert-gpu-to-gpux,convert-func-to-llvm,convert-math-to-llvm,convert-gpux-to-llvm,expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' | gc-cpu-runner -e main -entry-point-result=void --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime | FileCheck %s
 #map0 = affine_map<(d0, d1) -> (d1)>
 #map1 = affine_map<(d0, d1) -> (d0, d1)>
 #map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
diff --git a/test/mlir/test/lit.cfg.py b/test/mlir/test/lit.cfg.py
index 5cabfe8f7..d53e105e1 100644
--- a/test/mlir/test/lit.cfg.py
+++ b/test/mlir/test/lit.cfg.py
@@ -34,7 +34,7 @@
 config.substitutions.append(('%mlir_c_runner_utils', config.mlir_c_runner_utils))
 
 if config.gc_use_gpu:
-    config.substitutions.append(('%levelzero_runtime', config.levelzero_runtime))
+    config.substitutions.append(('%opencl_runtime', config.opencl_runtime))
 
 llvm_config.with_system_environment(["HOME", "INCLUDE", "LIB", "TMP", "TEMP"])
diff --git a/test/mlir/test/lit.site.cfg.py.in b/test/mlir/test/lit.site.cfg.py.in
index f44b45662..6bea6d13e 100644
--- a/test/mlir/test/lit.site.cfg.py.in
+++ b/test/mlir/test/lit.site.cfg.py.in
@@ -42,7 +42,7 @@ config.mlir_runner_utils_dir = "@MLIR_RUNNER_UTILS_DIR@"
 config.mlir_runner_utils = os.path.normpath(os.path.join(config.mlir_runner_utils_dir, config.shlib_prefix + "mlir_runner_utils" + config.llvm_shlib_ext))
 config.mlir_c_runner_utils = os.path.normpath(os.path.join(config.mlir_runner_utils_dir, config.shlib_prefix + "mlir_c_runner_utils" + config.llvm_shlib_ext))
-config.levelzero_runtime = os.path.normpath(os.path.join(config.gc_lib_dir, config.shlib_prefix + "level-zero-runtime" + config.llvm_shlib_ext))
"level-zero-runtime" + config.llvm_shlib_ext)) +config.opencl_runtime = os.path.normpath(os.path.join(config.gc_lib_dir, config.shlib_prefix + "mlir_opencl_runtime" + config.llvm_shlib_ext)) import lit.llvm lit.llvm.initialize(lit_config, config)