diff --git a/mlir/CMakeLists.txt b/mlir/CMakeLists.txt
index fa4f6e76f985f..4a67e01827381 100644
--- a/mlir/CMakeLists.txt
+++ b/mlir/CMakeLists.txt
@@ -116,6 +116,7 @@ add_definitions(-DMLIR_ROCM_CONVERSIONS_ENABLED=${MLIR_ENABLE_ROCM_CONVERSIONS})
 
 set(MLIR_ENABLE_CUDA_RUNNER 0 CACHE BOOL "Enable building the mlir CUDA runner")
 set(MLIR_ENABLE_ROCM_RUNNER 0 CACHE BOOL "Enable building the mlir ROCm runner")
+set(MLIR_ENABLE_SYCL_RUNNER 0 CACHE BOOL "Enable building the mlir Sycl runner")
 set(MLIR_ENABLE_SPIRV_CPU_RUNNER 0 CACHE BOOL "Enable building the mlir SPIR-V cpu runner")
 set(MLIR_ENABLE_VULKAN_RUNNER 0 CACHE BOOL "Enable building the mlir Vulkan runner")
 set(MLIR_ENABLE_NVPTXCOMPILER 0 CACHE BOOL
diff --git a/mlir/cmake/modules/FindLevelZero.cmake b/mlir/cmake/modules/FindLevelZero.cmake
new file mode 100644
index 0000000000000..012187f0afc0b
--- /dev/null
+++ b/mlir/cmake/modules/FindLevelZero.cmake
@@ -0,0 +1,221 @@
+# CMake find_package() module for level-zero
+#
+# Example usage:
+#
+# find_package(LevelZero)
+#
+# If successful, the following variables will be defined:
+# LevelZero_FOUND
+# LevelZero_INCLUDE_DIRS
+# LevelZero_LIBRARY
+# LevelZero_LIBRARIES_DIR
+#
+# By default, the module searches the standard paths to locate the "ze_api.h"
+# and the ze_loader shared library. When using a custom level-zero installation,
+# the environment variable "LEVEL_ZERO_DIR" should be specified telling the
+# module to get the level-zero library and headers from that location.
+
+include(FindPackageHandleStandardArgs)
+
+# Search path priority
+# 1. CMake Variable LEVEL_ZERO_DIR
+# 2. Environment Variable LEVEL_ZERO_DIR
+
+if(NOT LEVEL_ZERO_DIR)
+    if(DEFINED ENV{LEVEL_ZERO_DIR})
+        set(LEVEL_ZERO_DIR "$ENV{LEVEL_ZERO_DIR}")
+    endif()
+endif()
+
+if(LEVEL_ZERO_DIR)
+    find_path(LevelZero_INCLUDE_DIR
+        NAMES level_zero/ze_api.h
+        PATHS ${LEVEL_ZERO_DIR}/include
+        NO_DEFAULT_PATH
+    )
+
+    if(LINUX)
+        find_library(LevelZero_LIBRARY
+            NAMES ze_loader
+            PATHS ${LEVEL_ZERO_DIR}/lib
+                  ${LEVEL_ZERO_DIR}/lib/x86_64-linux-gnu
+            NO_DEFAULT_PATH
+        )
+    else()
+        find_library(LevelZero_LIBRARY
+            NAMES ze_loader
+            PATHS ${LEVEL_ZERO_DIR}/lib
+            NO_DEFAULT_PATH
+        )
+    endif()
+else()
+    find_path(LevelZero_INCLUDE_DIR
+        NAMES level_zero/ze_api.h
+    )
+
+    find_library(LevelZero_LIBRARY
+        NAMES ze_loader
+    )
+endif()
+
+# Compares the two version string that are supposed to be in x.y.z format
+# and reports if the argument VERSION_STR1 is greater than or equal than
+# version_str2. The strings are compared lexicographically after conversion to
+# lists of equal lengths, with the shorter string getting zero-padded.
+function(compare_versions VERSION_STR1 VERSION_STR2 OUTPUT)
+    # Convert the strings to list
+    string(REPLACE  "." ";" VL1 ${VERSION_STR1})
+    string(REPLACE  "." ";" VL2 ${VERSION_STR2})
+    # get lengths of both lists
+    list(LENGTH VL1 VL1_LEN)
+    list(LENGTH VL2 VL2_LEN)
+    set(LEN ${VL1_LEN})
+    # If they differ in size pad the shorter list with 0s
+    if(VL1_LEN GREATER VL2_LEN)
+        math(EXPR DIFF "${VL1_LEN} - ${VL2_LEN}" OUTPUT_FORMAT DECIMAL)
+        foreach(IDX RANGE 1 ${DIFF} 1)
+            list(APPEND VL2 "0")
+        endforeach()
+    elseif(VL2_LEN GREATER VL2_LEN)
+        math(EXPR DIFF "${VL1_LEN} - ${VL2_LEN}" OUTPUT_FORMAT DECIMAL)
+        foreach(IDX RANGE 1 ${DIFF} 1)
+            list(APPEND VL2 "0")
+        endforeach()
+        set(LEN ${VL2_LEN})
+    endif()
+    math(EXPR LEN_SUB_ONE "${LEN}-1")
+    foreach(IDX RANGE 0 ${LEN_SUB_ONE} 1)
+        list(GET VL1 ${IDX} VAL1)
+        list(GET VL2 ${IDX} VAL2)
+
+        if(${VAL1} GREATER ${VAL2})
+            set(${OUTPUT} TRUE PARENT_SCOPE)
+            break()
+        elseif(${VAL1} LESS ${VAL2})
+            set(${OUTPUT} FALSE PARENT_SCOPE)
+            break()
+        else()
+            set(${OUTPUT} TRUE PARENT_SCOPE)
+        endif()
+    endforeach()
+
+    endfunction(compare_versions)
+
+# Creates a small function to run and extract the LevelZero loader version.
+function(get_l0_loader_version)
+
+    set(L0_VERSIONEER_SRC
+        [====[
+        #include <iostream>
+        #include <level_zero/loader/ze_loader.h>
+        #include <string>
+        int main() {
+            ze_result_t result;
+            std::string loader("loader");
+            zel_component_version_t *versions;
+            size_t size = 0;
+            result = zeInit(0);
+            if (result != ZE_RESULT_SUCCESS) {
+                std::cerr << "Failed to init ze driver" << std::endl;
+                return -1;
+            }
+            zelLoaderGetVersions(&size, nullptr);
+            versions = new zel_component_version_t[size];
+            zelLoaderGetVersions(&size, versions);
+            for (size_t i = 0; i < size; i++) {
+                if (loader.compare(versions[i].component_name) == 0) {
+                    std::cout << versions[i].component_lib_version.major << "."
+                              << versions[i].component_lib_version.minor << "."
+                              << versions[i].component_lib_version.patch;
+                    break;
+                }
+            }
+            delete[] versions;
+            return 0;
+        }
+        ]====]
+    )
+
+    set(L0_VERSIONEER_FILE ${CMAKE_BINARY_DIR}/temp/l0_versioneer.cpp)
+
+    file(WRITE ${L0_VERSIONEER_FILE} "${L0_VERSIONEER_SRC}")
+
+    # We need both the directories in the include path as ze_loader.h
+    # includes "ze_api.h" and not "level_zero/ze_api.h".
+    list(APPEND INCLUDE_DIRS ${LevelZero_INCLUDE_DIR})
+    list(APPEND INCLUDE_DIRS ${LevelZero_INCLUDE_DIR}/level_zero)
+    list(JOIN INCLUDE_DIRS ";" INCLUDE_DIRS_STR)
+    try_run(L0_VERSIONEER_RUN L0_VERSIONEER_COMPILE
+            "${CMAKE_BINARY_DIR}"
+            "${L0_VERSIONEER_FILE}"
+            LINK_LIBRARIES ${LevelZero_LIBRARY}
+            CMAKE_FLAGS
+                "-DINCLUDE_DIRECTORIES=${INCLUDE_DIRS_STR}"
+            RUN_OUTPUT_VARIABLE L0_VERSION
+    )
+    if(${L0_VERSIONEER_COMPILE} AND (DEFINED L0_VERSIONEER_RUN))
+        set(LevelZero_VERSION ${L0_VERSION} PARENT_SCOPE)
+        message(STATUS "Found Level Zero of version: ${L0_VERSION}")
+    else()
+        message(FATAL_ERROR
+            "Could not compile a level-zero program to extract loader version"
+        )
+    endif()
+endfunction(get_l0_loader_version)
+
+if(LevelZero_INCLUDE_DIR AND LevelZero_LIBRARY)
+    list(APPEND LevelZero_LIBRARIES "${LevelZero_LIBRARY}")
+    list(APPEND LevelZero_INCLUDE_DIRS ${LevelZero_INCLUDE_DIR})
+    if(OpenCL_FOUND)
+      list(APPEND LevelZero_INCLUDE_DIRS ${OpenCL_INCLUDE_DIRS})
+    endif()
+
+    cmake_path(GET LevelZero_LIBRARY PARENT_PATH LevelZero_LIBRARIES_PATH)
+    set(LevelZero_LIBRARIES_DIR ${LevelZero_LIBRARIES_PATH})
+
+    if(NOT TARGET LevelZero::LevelZero)
+      add_library(LevelZero::LevelZero INTERFACE IMPORTED)
+      set_target_properties(LevelZero::LevelZero
+        PROPERTIES INTERFACE_LINK_LIBRARIES "${LevelZero_LIBRARIES}"
+      )
+      set_target_properties(LevelZero::LevelZero
+        PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${LevelZero_INCLUDE_DIRS}"
+      )
+    endif()
+endif()
+
+# Check if a specific version of Level Zero is required
+if(LevelZero_FIND_VERSION)
+    get_l0_loader_version()
+    set(VERSION_GT_FIND_VERSION FALSE)
+    compare_versions(
+        ${LevelZero_VERSION}
+        ${LevelZero_FIND_VERSION}
+        VERSION_GT_FIND_VERSION
+    )
+    if(${VERSION_GT_FIND_VERSION})
+        set(LevelZero_FOUND TRUE)
+    else()
+        set(LevelZero_FOUND FALSE)
+    endif()
+else()
+    set(LevelZero_FOUND TRUE)
+endif()
+
+find_package_handle_standard_args(LevelZero
+    REQUIRED_VARS
+        LevelZero_FOUND
+        LevelZero_INCLUDE_DIRS
+        LevelZero_LIBRARY
+        LevelZero_LIBRARIES_DIR
+    HANDLE_COMPONENTS
+)
+mark_as_advanced(LevelZero_LIBRARY LevelZero_INCLUDE_DIRS)
+
+if(LevelZero_FOUND)
+    find_package_message(LevelZero "Found LevelZero: ${LevelZero_LIBRARY}"
+        "(found version ${LevelZero_VERSION})"
+    )
+else()
+    find_package_message(LevelZero "Could not find LevelZero" "")
+endif()
diff --git a/mlir/cmake/modules/FindSyclRuntime.cmake b/mlir/cmake/modules/FindSyclRuntime.cmake
new file mode 100644
index 0000000000000..38b065a3f284c
--- /dev/null
+++ b/mlir/cmake/modules/FindSyclRuntime.cmake
@@ -0,0 +1,68 @@
+# CMake find_package() module for SYCL Runtime
+#
+# Example usage:
+#
+# find_package(SyclRuntime)
+#
+# If successful, the following variables will be defined:
+# SyclRuntime_FOUND
+# SyclRuntime_INCLUDE_DIRS
+# SyclRuntime_LIBRARY
+# SyclRuntime_LIBRARIES_DIR
+#
+
+include(FindPackageHandleStandardArgs)
+
+if(NOT DEFINED ENV{CMPLR_ROOT})
+    message(WARNING "Please make sure to install Intel DPC++ Compiler and run setvars.(sh/bat)")
+    message(WARNING "You can download standalone Intel DPC++ Compiler from https://www.intel.com/content/www/us/en/developer/articles/tool/oneapi-standalone-components.html#compilers")
+else()
+    if(LINUX OR (${CMAKE_SYSTEM_NAME} MATCHES "Linux"))
+        set(SyclRuntime_ROOT "$ENV{CMPLR_ROOT}/linux")
+    elseif(WIN32)
+        set(SyclRuntime_ROOT "$ENV{CMPLR_ROOT}/windows")
+    endif()
+    list(APPEND SyclRuntime_INCLUDE_DIRS "${SyclRuntime_ROOT}/include")
+    list(APPEND SyclRuntime_INCLUDE_DIRS "${SyclRuntime_ROOT}/include/sycl")
+
+    set(SyclRuntime_LIBRARY_DIR "${SyclRuntime_ROOT}/lib")
+
+    message(STATUS "SyclRuntime_LIBRARY_DIR: ${SyclRuntime_LIBRARY_DIR}")
+    find_library(SyclRuntime_LIBRARY
+        NAMES sycl
+        PATHS ${SyclRuntime_LIBRARY_DIR}
+        NO_DEFAULT_PATH
+        )
+endif()
+
+if(SyclRuntime_LIBRARY)
+    set(SyclRuntime_FOUND TRUE)
+    if(NOT TARGET SyclRuntime::SyclRuntime)
+        add_library(SyclRuntime::SyclRuntime INTERFACE IMPORTED)
+        set_target_properties(SyclRuntime::SyclRuntime
+            PROPERTIES INTERFACE_LINK_LIBRARIES "${SyclRuntime_LIBRARY}"
+      )
+      set_target_properties(SyclRuntime::SyclRuntime
+          PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${SyclRuntime_INCLUDE_DIRS}"
+      )
+    endif()
+else()
+    set(SyclRuntime_FOUND FALSE)
+endif()
+
+find_package_handle_standard_args(SyclRuntime
+    REQUIRED_VARS
+        SyclRuntime_FOUND
+        SyclRuntime_INCLUDE_DIRS
+        SyclRuntime_LIBRARY
+        SyclRuntime_LIBRARY_DIR
+    HANDLE_COMPONENTS
+)
+
+mark_as_advanced(SyclRuntime_LIBRARY SyclRuntime_INCLUDE_DIRS)
+
+if(SyclRuntime_FOUND)
+    find_package_message(SyclRuntime "Found SyclRuntime: ${SyclRuntime_LIBRARY}" "")
+else()
+    find_package_message(SyclRuntime "Could not find SyclRuntime" "")
+endif()
diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
index ed37abf85275b..3bb6006a467fe 100644
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -568,7 +568,10 @@ def ConvertGPUToSPIRV : Pass<"convert-gpu-to-spirv", "ModuleOp"> {
   let options = [
     Option<"use64bitIndex", "use-64bit-index",
            "bool", /*default=*/"false",
-           "Use 64-bit integers to convert index types">
+           "Use 64-bit integers to convert index types">,
+    Option<"useOpenCL", "use-opencl",
+           "bool", /*default=*/"false",
+           "Use OpenCL instead of Vulkan">
   ];
 }
 
diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
index fc20bd2ed921a..f285f45448ecc 100644
--- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
@@ -187,4 +187,8 @@ def GpuROCDLAttachTarget: Pass<"rocdl-attach-target", ""> {
   ];
 }
 
+def GpuSerializeToSPIRVPass : Pass<"gpu-serialize-to-spirv", "ModuleOp"> {
+  let summary = "Serialize spirv dialect to spirv binary";
+}
+
 #endif // MLIR_DIALECT_GPU_PASSES
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
index eddf3e9a47d0b..808431b824724 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -101,7 +101,8 @@ class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> {
   FunctionCallBuilder moduleLoadCallBuilder = {
       "mgpuModuleLoad",
       llvmPointerType /* void *module */,
-      {llvmPointerType /* void *cubin */}};
+      {llvmPointerType, /* void *cubin */
+       llvmInt64Type /* size_t size */}};
   FunctionCallBuilder moduleUnloadCallBuilder = {
       "mgpuModuleUnload", llvmVoidType, {llvmPointerType /* void *module */}};
   FunctionCallBuilder moduleGetFunctionCallBuilder = {
@@ -125,7 +126,8 @@ class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> {
           llvmInt32Type,          /* unsigned int sharedMemBytes */
           llvmPointerType,        /* void *hstream */
           llvmPointerPointerType, /* void **kernelParams */
-          llvmPointerPointerType  /* void **extra */
+          llvmPointerPointerType, /* void **extra */
+          llvmInt64Type           /* size_t paramsCount */
       }};
   FunctionCallBuilder streamCreateCallBuilder = {
       "mgpuStreamCreate", llvmPointerType /* void *stream */, {}};
@@ -167,7 +169,8 @@ class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> {
       "mgpuMemAlloc",
       llvmPointerType /* void * */,
       {llvmIntPtrType /* intptr_t sizeBytes */,
-       llvmPointerType /* void *stream */}};
+       llvmPointerType /* void *stream */,
+       llvmInt64Type /* size_t isHostShared */}};
   FunctionCallBuilder deallocCallBuilder = {
       "mgpuMemFree",
       llvmVoidType,
@@ -786,10 +789,6 @@ LogicalResult ConvertHostUnregisterOpToGpuRuntimeCallPattern::matchAndRewrite(
 LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite(
     gpu::AllocOp allocOp, OpAdaptor adaptor,
     ConversionPatternRewriter &rewriter) const {
-  if (adaptor.getHostShared())
-    return rewriter.notifyMatchFailure(
-        allocOp, "host_shared allocation is not supported");
-
   MemRefType memRefType = allocOp.getType();
 
   if (failed(areAllLLVMTypes(allocOp, adaptor.getOperands(), rewriter)) ||
@@ -799,6 +798,8 @@ LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite(
 
   auto loc = allocOp.getLoc();
 
+  bool isShared = allocOp.getHostShared();
+
   // Get shape of the memref as values: static sizes are constant
   // values and dynamic sizes are passed to 'alloc' as operands.
   SmallVector<Value, 4> shape;
@@ -811,8 +812,13 @@ LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite(
   // descriptor.
   Type elementPtrType = this->getElementPtrType(memRefType);
   auto stream = adaptor.getAsyncDependencies().front();
+
+  auto isHostShared = rewriter.create<mlir::LLVM::ConstantOp>(
+      loc, llvmInt64Type, rewriter.getI64IntegerAttr(isShared));
+
   Value allocatedPtr =
-      allocCallBuilder.create(loc, rewriter, {sizeBytes, stream}).getResult();
+      allocCallBuilder.create(loc, rewriter, {sizeBytes, stream, isHostShared})
+          .getResult();
   if (!getTypeConverter()->useOpaquePointers())
     allocatedPtr =
         rewriter.create<LLVM::BitcastOp>(loc, elementPtrType, allocatedPtr);
@@ -1134,7 +1140,21 @@ LogicalResult ConvertLaunchFuncOpToGpuRuntimeCallPattern::matchAndRewrite(
       loc, rewriter, nameBuffer.str(), binaryAttr.getValue(),
       LLVM::Linkage::Internal, getTypeConverter()->useOpaquePointers());
 
-  auto module = moduleLoadCallBuilder.create(loc, rewriter, data);
+  // SPIRV requires binary size
+  auto gpuBlob = binaryAttr.getValue();
+  auto gpuBlobSize = rewriter.create<mlir::LLVM::ConstantOp>(
+      loc, llvmInt64Type,
+      mlir::IntegerAttr::get(llvmInt64Type,
+                             static_cast<int64_t>(gpuBlob.size())));
+
+  auto paramsCount = rewriter.create<mlir::LLVM::ConstantOp>(
+      loc, llvmInt64Type,
+      mlir::IntegerAttr::get(
+          llvmInt64Type,
+          static_cast<int64_t>(launchOp.getNumKernelOperands())));
+
+  auto module =
+      moduleLoadCallBuilder.create(loc, rewriter, {data, gpuBlobSize});
   // Get the function from the module. The name corresponds to the name of
   // the kernel function.
   auto kernelName = generateKernelNameConstant(
@@ -1158,7 +1178,7 @@ LogicalResult ConvertLaunchFuncOpToGpuRuntimeCallPattern::matchAndRewrite(
       {function.getResult(), adaptor.getGridSizeX(), adaptor.getGridSizeY(),
        adaptor.getGridSizeZ(), adaptor.getBlockSizeX(), adaptor.getBlockSizeY(),
        adaptor.getBlockSizeZ(), dynamicSharedMemorySize, stream, kernelParams,
-       /*extra=*/nullpointer});
+       /*extra=*/nullpointer, paramsCount});
 
   if (launchOp.getAsyncToken()) {
     // Async launch: make dependent ops use the same stream.
diff --git a/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp b/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp
index f37c70a771f59..a52c99ec9daec 100644
--- a/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp
+++ b/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp
@@ -71,7 +71,8 @@ void GPUToSPIRVPass::runOnOperation() {
       std::unique_ptr<ConversionTarget> target =
           spirv::getMemorySpaceToStorageClassTarget(*context);
       spirv::MemorySpaceToStorageClassMap memorySpaceMap =
-          spirv::mapMemorySpaceToVulkanStorageClass;
+          this->useOpenCL ? spirv::mapMemorySpaceToOpenCLStorageClass :
+              spirv::mapMemorySpaceToVulkanStorageClass;
       spirv::MemorySpaceToStorageClassConverter converter(memorySpaceMap);
 
       RewritePatternSet patterns(context);
diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt
index 6244132c073a4..38fa60ba06f59 100644
--- a/mlir/lib/Dialect/GPU/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/CMakeLists.txt
@@ -58,6 +58,7 @@ add_mlir_dialect_library(MLIRGPUTransforms
   Transforms/SerializeToBlob.cpp
   Transforms/SerializeToCubin.cpp
   Transforms/SerializeToHsaco.cpp
+  Transforms/SerializeToSPIRV.cpp
   Transforms/ShuffleRewriter.cpp
   Transforms/ROCDLAttachTarget.cpp
 
@@ -96,6 +97,7 @@ add_mlir_dialect_library(MLIRGPUTransforms
   MLIRSupport
   MLIRROCDLTarget
   MLIRTransformUtils
+  MLIRSPIRVSerialization
   )
 
 add_subdirectory(TransformOps)
diff --git a/mlir/lib/Dialect/GPU/Transforms/SerializeToSPIRV.cpp b/mlir/lib/Dialect/GPU/Transforms/SerializeToSPIRV.cpp
new file mode 100644
index 0000000000000..f013f531371de
--- /dev/null
+++ b/mlir/lib/Dialect/GPU/Transforms/SerializeToSPIRV.cpp
@@ -0,0 +1,70 @@
+//===- SerializeToSPIRV.cpp - Convert GPU kernel to SPIRV blob -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This pass iterates all the SPIR-V modules in the top module and serializes
+/// each SPIR-V module to SPIR-V binary and then attachs the binary blob as a
+/// string attribute to the corresponding gpu module.
+///
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/GPU/Transforms/Passes.h"
+
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/GPU/Transforms/Passes.h"
+#include "mlir/Dialect/SPIRV/IR/SPIRVDialect.h"
+#include "mlir/Dialect/SPIRV/IR/SPIRVOps.h"
+#include "mlir/Target/SPIRV/Serialization.h"
+
+namespace mlir {
+#define GEN_PASS_DEF_GPUSERIALIZETOSPIRVPASS
+#include "mlir/Dialect/GPU/Transforms/Passes.h.inc"
+} // namespace mlir
+
+using namespace mlir;
+
+struct GpuSerializeToSPIRVPass : public mlir::impl::GpuSerializeToSPIRVPassBase<GpuSerializeToSPIRVPass> {
+public:
+  void runOnOperation() override {
+    auto mod = getOperation();
+    llvm::SmallVector<uint32_t, 0> spvBinary;
+    for (mlir::gpu::GPUModuleOp gpuMod : mod.getOps<gpu::GPUModuleOp>()) {
+      auto name = gpuMod.getName();
+      // check that the spv module has the same name with gpu module except the
+      // prefix "__spv__"
+      auto isSameMod = [&](spirv::ModuleOp spvMod) -> bool {
+        auto spvModName = spvMod.getName();
+        return spvModName->consume_front("__spv__") && spvModName == name;
+      };
+      auto spvMods = mod.getOps<spirv::ModuleOp>();
+      auto it = llvm::find_if(spvMods, isSameMod);
+      if (it == spvMods.end()) {
+        gpuMod.emitError() << "Unable to find corresponding SPIR-V module";
+        signalPassFailure();
+        return;
+      }
+      auto spvMod = *it;
+
+      spvBinary.clear();
+      // serialize the spv module to spv binary
+      if (mlir::failed(spirv::serialize(spvMod, spvBinary))) {
+        spvMod.emitError() << "Failed to serialize SPIR-V module";
+        signalPassFailure();
+        return;
+      }
+
+      // attach the spv binary to the gpu module
+      auto spvData =
+          llvm::StringRef(reinterpret_cast<const char *>(spvBinary.data()),
+                          spvBinary.size() * sizeof(uint32_t));
+      auto spvAttr = mlir::StringAttr::get(&getContext(), spvData);
+      gpuMod->setAttr(gpu::getDefaultGpuBinaryAnnotation(), spvAttr);
+      spvMod->erase();
+    }
+  }
+};
diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt
index ea33c2c6ed261..101d9baafcccc 100644
--- a/mlir/lib/ExecutionEngine/CMakeLists.txt
+++ b/mlir/lib/ExecutionEngine/CMakeLists.txt
@@ -6,6 +6,7 @@ set(LLVM_OPTIONAL_SOURCES
   CRunnerUtils.cpp
   CudaRuntimeWrappers.cpp
   SparseTensorRuntime.cpp
+  SyclRuntimeWrappers.cpp
   ExecutionEngine.cpp
   Float16bits.cpp
   RocmRuntimeWrappers.cpp
@@ -328,4 +329,39 @@ if(LLVM_ENABLE_PIC)
       hip::host hip::amdhip64
     )
   endif()
+
+  if(MLIR_ENABLE_SYCL_RUNNER)
+    find_package(SyclRuntime)
+
+    if(NOT SyclRuntime_FOUND)
+      message(FATAL_ERROR "syclRuntime not found. Please set check oneapi installation and run setvars.sh.")
+    endif()
+
+    find_package(LevelZero)
+
+    if(NOT LevelZero_FOUND)
+      message(FATAL_ERROR "LevelZero not found. Please set LEVEL_ZERO_DIR.")
+    endif()
+
+    add_mlir_library(mlir_sycl_runtime
+      SHARED
+      SyclRuntimeWrappers.cpp
+
+      EXCLUDE_FROM_LIBMLIR
+    )
+
+    check_cxx_compiler_flag("-frtti" CXX_HAS_FRTTI_FLAG)
+    if(NOT CXX_HAS_FRTTI_FLAG)
+      message(FATAL_ERROR "CXX compiler does not accept flag -frtti")
+    endif()
+    target_compile_options (mlir_sycl_runtime PUBLIC -fexceptions -frtti)
+
+    target_include_directories(mlir_sycl_runtime PRIVATE
+      ${MLIR_INCLUDE_DIRS}
+    )
+
+    target_link_libraries(mlir_sycl_runtime PRIVATE LevelZero::LevelZero SyclRuntime::SyclRuntime)
+
+    set_property(TARGET mlir_sycl_runtime APPEND PROPERTY BUILD_RPATH "${LevelZero_LIBRARIES_DIR}" "${SyclRuntime_LIBRARIES_DIR}")
+  endif()
 endif()
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
index 1dba677ebe663..79dc2eed38f06 100644
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -119,7 +119,8 @@ static bool cusparseLt_initiated = false;
 #endif // MLIR_ENABLE_CUDA_CUSPARSELT
 #endif // MLIR_ENABLE_CUDA_CUSPARSE
 
-extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule mgpuModuleLoad(void *data) {
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule
+mgpuModuleLoad(void *data, size_t /*gpuBlobSize*/) {
   ScopedContext scopedContext;
   CUmodule module = nullptr;
   CUDA_REPORT_IF_ERROR(cuModuleLoadData(&module, data));
@@ -144,7 +145,7 @@ extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
 mgpuLaunchKernel(CUfunction function, intptr_t gridX, intptr_t gridY,
                  intptr_t gridZ, intptr_t blockX, intptr_t blockY,
                  intptr_t blockZ, int32_t smem, CUstream stream, void **params,
-                 void **extra) {
+                 void **extra, size_t /*paramsCount*/) {
   ScopedContext scopedContext;
   int32_t maxShmem = 0;
   CUdevice device = getDefaultCuDevice();
@@ -210,7 +211,8 @@ extern MLIR_CUDA_WRAPPERS_EXPORT "C" void mgpuEventRecord(CUevent event,
   CUDA_REPORT_IF_ERROR(cuEventRecord(event, stream));
 }
 
-extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, CUstream /*stream*/) {
+extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, CUstream /*stream*/,
+                              bool /*isHostShared*/) {
   ScopedContext scopedContext;
   CUdeviceptr ptr;
   CUDA_REPORT_IF_ERROR(cuMemAlloc(&ptr, sizeBytes));
diff --git a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
index bd3868a8e196f..b50fd7eb9d059 100644
--- a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
@@ -32,7 +32,7 @@
 
 thread_local static int32_t defaultDevice = 0;
 
-extern "C" hipModule_t mgpuModuleLoad(void *data) {
+extern "C" hipModule_t mgpuModuleLoad(void *data, size_t /*gpuBlobSize*/) {
   hipModule_t module = nullptr;
   HIP_REPORT_IF_ERROR(hipModuleLoadData(&module, data));
   return module;
@@ -57,7 +57,7 @@ extern "C" void mgpuLaunchKernel(hipFunction_t function, intptr_t gridX,
                                  intptr_t blockX, intptr_t blockY,
                                  intptr_t blockZ, int32_t smem,
                                  hipStream_t stream, void **params,
-                                 void **extra) {
+                                 void **extra, size_t /*paramsCount*/) {
   HIP_REPORT_IF_ERROR(hipModuleLaunchKernel(function, gridX, gridY, gridZ,
                                             blockX, blockY, blockZ, smem,
                                             stream, params, extra));
@@ -99,7 +99,8 @@ extern "C" void mgpuEventRecord(hipEvent_t event, hipStream_t stream) {
   HIP_REPORT_IF_ERROR(hipEventRecord(event, stream));
 }
 
-extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, hipStream_t /*stream*/) {
+extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, hipStream_t /*stream*/,
+                              bool /*isHostShared*/) {
   void *ptr;
   HIP_REPORT_IF_ERROR(hipMalloc(&ptr, sizeBytes));
   return ptr;
diff --git a/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp
new file mode 100644
index 0000000000000..6b40d4a6922c9
--- /dev/null
+++ b/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp
@@ -0,0 +1,223 @@
+//===- SyclRuntimeWrappers.cpp - MLIR SYCL wrapper library ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements C wrappers around the sycl runtime library.
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <array>
+#include <atomic>
+#include <cassert>
+#include <cfloat>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <stdexcept>
+#include <tuple>
+#include <vector>
+
+#include <CL/sycl.hpp>
+#include <level_zero/ze_api.h>
+#include <map>
+#include <mutex>
+#include <sycl/ext/oneapi/backend/level_zero.hpp>
+
+#ifdef _WIN32
+#define SYCL_RUNTIME_EXPORT __declspec(dllexport)
+#else
+#define SYCL_RUNTIME_EXPORT
+#endif // _WIN32
+
+namespace {
+
+template <typename F>
+auto catchAll(F &&func) {
+  try {
+    return func();
+  } catch (const std::exception &e) {
+    fprintf(stdout, "An exception was thrown: %s\n", e.what());
+    fflush(stdout);
+    abort();
+  } catch (...) {
+    fprintf(stdout, "An unknown exception was thrown\n");
+    fflush(stdout);
+    abort();
+  }
+}
+
+#define L0_SAFE_CALL(call)                                                     \
+  {                                                                            \
+    ze_result_t status = (call);                                               \
+    if (status != ZE_RESULT_SUCCESS) {                                         \
+      fprintf(stdout, "L0 error %d\n", status);                                \
+      fflush(stdout);                                                          \
+      abort();                                                                 \
+    }                                                                          \
+  }
+
+} // namespace
+
+static sycl::device getDefaultDevice() {
+  auto platformList = sycl::platform::get_platforms();
+  for (const auto &platform : platformList) {
+    auto platformName = platform.get_info<sycl::info::platform::name>();
+    bool isLevelZero = platformName.find("Level-Zero") != std::string::npos;
+    if (!isLevelZero)
+      continue;
+
+    return platform.get_devices()[0];
+  }
+  throw std::runtime_error("getDefaultDevice failed");
+}
+
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wglobal-constructors"
+
+// Create global device and context
+sycl::device syclDevice = getDefaultDevice();
+sycl::context syclContext = sycl::context(syclDevice);
+
+#pragma clang diagnostic pop
+
+struct QUEUE {
+  sycl::queue syclQueue_;
+
+  QUEUE() { syclQueue_ = sycl::queue(syclContext, syclDevice); }
+};
+
+static void *allocDeviceMemory(QUEUE *queue, size_t size, bool isShared) {
+  void *memPtr = nullptr;
+  if (isShared) {
+    memPtr = sycl::aligned_alloc_shared(64, size, syclDevice, syclContext);
+  } else {
+    memPtr = sycl::aligned_alloc_device(64, size, syclDevice, syclContext);
+  }
+  if (memPtr == nullptr) {
+    throw std::runtime_error("mem allocation failed!");
+  }
+  return memPtr;
+}
+
+static void deallocDeviceMemory(QUEUE *queue, void *ptr) {
+  sycl::free(ptr, queue->syclQueue_);
+}
+
+static ze_module_handle_t loadModule(const void *data, size_t dataSize) {
+  assert(data);
+  ze_module_handle_t zeModule;
+  ze_module_desc_t desc = {ZE_STRUCTURE_TYPE_MODULE_DESC,
+                           nullptr,
+                           ZE_MODULE_FORMAT_IL_SPIRV,
+                           dataSize,
+                           (const uint8_t *)data,
+                           nullptr,
+                           nullptr};
+  auto zeDevice =
+      sycl::get_native<sycl::backend::ext_oneapi_level_zero>(syclDevice);
+  auto zeContext =
+      sycl::get_native<sycl::backend::ext_oneapi_level_zero>(syclContext);
+  L0_SAFE_CALL(zeModuleCreate(zeContext, zeDevice, &desc, &zeModule, nullptr));
+  return zeModule;
+}
+
+static sycl::kernel *getKernel(ze_module_handle_t zeModule, const char *name) {
+  assert(zeModule);
+  assert(name);
+  ze_kernel_handle_t zeKernel;
+  sycl::kernel *syclKernel;
+  ze_kernel_desc_t desc = {};
+  desc.pKernelName = name;
+
+  L0_SAFE_CALL(zeKernelCreate(zeModule, &desc, &zeKernel));
+  sycl::kernel_bundle<sycl::bundle_state::executable> kernelBundle =
+      sycl::make_kernel_bundle<sycl::backend::ext_oneapi_level_zero,
+                               sycl::bundle_state::executable>({zeModule},
+                                                               syclContext);
+
+  auto kernel = sycl::make_kernel<sycl::backend::ext_oneapi_level_zero>(
+      {kernelBundle, zeKernel}, syclContext);
+  syclKernel = new sycl::kernel(kernel);
+  return syclKernel;
+}
+
+static void launchKernel(QUEUE *queue, sycl::kernel *kernel, size_t gridX,
+                         size_t gridY, size_t gridZ, size_t blockX,
+                         size_t blockY, size_t blockZ, size_t sharedMemBytes,
+                         void **params, size_t paramsCount) {
+  auto syclGlobalRange =
+      ::sycl::range<3>(blockZ * gridZ, blockY * gridY, blockX * gridX);
+  auto syclLocalRange = ::sycl::range<3>(blockZ, blockY, blockX);
+  sycl::nd_range<3> syclNdRange(
+      sycl::nd_range<3>(syclGlobalRange, syclLocalRange));
+
+  queue->syclQueue_.submit([&](sycl::handler &cgh) {
+    for (size_t i = 0; i < paramsCount; i++) {
+      cgh.set_arg(static_cast<uint32_t>(i), *(static_cast<void **>(params[i])));
+    }
+    cgh.parallel_for(syclNdRange, *kernel);
+  });
+}
+
+// Wrappers
+
+extern "C" SYCL_RUNTIME_EXPORT QUEUE *mgpuStreamCreate() {
+
+  return catchAll([&]() { return new QUEUE(); });
+}
+
+extern "C" SYCL_RUNTIME_EXPORT void mgpuStreamDestroy(QUEUE *queue) {
+  catchAll([&]() { delete queue; });
+}
+
+extern "C" SYCL_RUNTIME_EXPORT void *mgpuMemAlloc(uint64_t size, QUEUE *queue,
+                                                  bool isShared) {
+  return catchAll([&]() {
+    return allocDeviceMemory(queue, static_cast<size_t>(size), true);
+  });
+}
+
+extern "C" SYCL_RUNTIME_EXPORT void mgpuMemFree(void *ptr, QUEUE *queue) {
+  catchAll([&]() {
+    if (ptr) {
+      deallocDeviceMemory(queue, ptr);
+    }
+  });
+}
+
+extern "C" SYCL_RUNTIME_EXPORT ze_module_handle_t
+mgpuModuleLoad(const void *data, size_t gpuBlobSize) {
+  return catchAll([&]() { return loadModule(data, gpuBlobSize); });
+}
+
+extern "C" SYCL_RUNTIME_EXPORT sycl::kernel *
+mgpuModuleGetFunction(ze_module_handle_t module, const char *name) {
+  return catchAll([&]() { return getKernel(module, name); });
+}
+
+extern "C" SYCL_RUNTIME_EXPORT void
+mgpuLaunchKernel(sycl::kernel *kernel, size_t gridX, size_t gridY, size_t gridZ,
+                 size_t blockX, size_t blockY, size_t blockZ,
+                 size_t sharedMemBytes, QUEUE *queue, void **params,
+                 void **extra, size_t paramsCount) {
+  return catchAll([&]() {
+    launchKernel(queue, kernel, gridX, gridY, gridZ, blockX, blockY, blockZ,
+                 sharedMemBytes, params, paramsCount);
+  });
+}
+
+extern "C" SYCL_RUNTIME_EXPORT void mgpuStreamSynchronize(QUEUE *queue) {
+
+  catchAll([&]() { queue->syclQueue_.wait(); });
+}
+
+extern "C" SYCL_RUNTIME_EXPORT void
+mgpuModuleUnload(ze_module_handle_t module) {
+
+  catchAll([&]() { L0_SAFE_CALL(zeModuleDestroy(module)); });
+}
diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt
index 66a9cb01106ba..874e7718f4a36 100644
--- a/mlir/test/CMakeLists.txt
+++ b/mlir/test/CMakeLists.txt
@@ -135,6 +135,10 @@ if(MLIR_ENABLE_ROCM_RUNNER)
   list(APPEND MLIR_TEST_DEPENDS mlir_rocm_runtime)
 endif()
 
+if(MLIR_ENABLE_SYCL_RUNNER)
+  list(APPEND MLIR_TEST_DEPENDS mlir_sycl_runtime)
+endif()
+
 list(APPEND MLIR_TEST_DEPENDS MLIRUnitTests)
 
 if(LLVM_BUILD_EXAMPLES)
diff --git a/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir
index 2506c6ceb990e..f365dcb02daf4 100644
--- a/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir
+++ b/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir
@@ -8,7 +8,8 @@ module attributes {gpu.container_module} {
     %0 = gpu.wait async
     // CHECK: %[[gep:.*]] = llvm.getelementptr {{.*}}[%[[size]]]
     // CHECK: %[[size_bytes:.*]] = llvm.ptrtoint %[[gep]]
-    // CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[stream]])
+    // CHECK: %[[isHostShared:.*]] = llvm.mlir.constant 
+    // CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[stream]], %[[isHostShared]])
     %1, %2 = gpu.alloc async [%0] (%size) : memref<?xf32>
     // CHECK: %[[float_ptr:.*]] = llvm.extractvalue {{.*}}[0]
     // CHECK: llvm.call @mgpuMemFree(%[[float_ptr]], %[[stream]])
diff --git a/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir
index 2cdc4e8dbb1ad..96e8a6dbd35b1 100644
--- a/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir
+++ b/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir
@@ -34,8 +34,10 @@ module attributes {gpu.container_module} {
   // CHECK: [[ADDRESSOF:%.*]] = llvm.mlir.addressof @[[GLOBAL]]
   // CHECK: [[BINARY:%.*]] = llvm.getelementptr [[ADDRESSOF]]{{\[}}0, 0]
   // CHECK-SAME: -> !llvm.ptr
+  // CHECK: [[BINARYSIZE:%.*]] = llvm.mlir.constant
+  // CHECK: [[PARAMSCOUNT:%.*]] = llvm.mlir.constant
 
-  // CHECK: [[MODULE:%.*]] = llvm.call @mgpuModuleLoad([[BINARY]])
+  // CHECK: [[MODULE:%.*]] = llvm.call @mgpuModuleLoad([[BINARY]], [[BINARYSIZE]])
   // CHECK: [[FUNC:%.*]] = llvm.call @mgpuModuleGetFunction([[MODULE]], {{.*}})
 
   // CHECK: [[STREAM:%.*]] = llvm.call @mgpuStreamCreate
@@ -53,10 +55,11 @@ module attributes {gpu.container_module} {
   // CHECK: llvm.getelementptr %[[MEMREF]][0, 5] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct[[STRUCT_BODY:<.*>]]
 
   // CHECK: [[EXTRA_PARAMS:%.*]] = llvm.mlir.null : !llvm.ptr
+  
 
   // CHECK: llvm.call @mgpuLaunchKernel([[FUNC]], [[C8]], [[C8]], [[C8]],
   // CHECK-SAME: [[C8]], [[C8]], [[C8]], [[C256]], [[STREAM]],
-  // CHECK-SAME: [[PARAMS]], [[EXTRA_PARAMS]])
+  // CHECK-SAME: [[PARAMS]], [[EXTRA_PARAMS]], [[PARAMSCOUNT]])
   // CHECK: llvm.call @mgpuStreamSynchronize
   // CHECK: llvm.call @mgpuStreamDestroy
   // CHECK: llvm.call @mgpuModuleUnload
diff --git a/mlir/test/Conversion/GPUCommon/typed-pointers.mlir b/mlir/test/Conversion/GPUCommon/typed-pointers.mlir
index 2fa6c854c5678..e27162c7dbc19 100644
--- a/mlir/test/Conversion/GPUCommon/typed-pointers.mlir
+++ b/mlir/test/Conversion/GPUCommon/typed-pointers.mlir
@@ -8,7 +8,8 @@ module attributes {gpu.container_module} {
     %0 = gpu.wait async
     // CHECK: %[[gep:.*]] = llvm.getelementptr {{.*}}[%[[size]]]
     // CHECK: %[[size_bytes:.*]] = llvm.ptrtoint %[[gep]]
-    // CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[stream]])
+    // CHECK: %[[isHostShared:.*]] = llvm.mlir.constant
+    // CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[stream]], %[[isHostShared]])
     %1, %2 = gpu.alloc async [%0] (%size) : memref<?xf32>
     // CHECK: %[[float_ptr:.*]] = llvm.extractvalue {{.*}}[0]
     // CHECK: %[[void_ptr:.*]] = llvm.bitcast %[[float_ptr]]
diff --git a/mlir/test/Dialect/GPU/Transforms/serialize-spirv.mlir b/mlir/test/Dialect/GPU/Transforms/serialize-spirv.mlir
new file mode 100644
index 0000000000000..d70e18f3401d3
--- /dev/null
+++ b/mlir/test/Dialect/GPU/Transforms/serialize-spirv.mlir
@@ -0,0 +1,53 @@
+// RUN: mlir-opt -gpu-serialize-to-spirv %s | FileCheck %s
+module attributes {gpu.container_module, spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR], [SPV_EXT_shader_atomic_float_add, SPV_KHR_expect_assume]>, #spirv.resource_limits<>>} {
+  // CHECK:        gpu.module @addt_kernel attributes {gpu.binary =
+  spirv.module @__spv__addt_kernel Physical64 OpenCL requires #spirv.vce<v1.0, [Int64, Addresses, Kernel], []> {
+    spirv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spirv.ptr<vector<3xi64>, Input>
+    spirv.func @addt_kernel(%arg0: !spirv.ptr<f32, CrossWorkgroup>, %arg1: !spirv.ptr<f32, CrossWorkgroup>, %arg2: !spirv.ptr<f32, CrossWorkgroup>) "None" attributes {spirv.entry_point_abi = #spirv.entry_point_abi<>, workgroup_attributions = 0 : i64} {
+      %cst5_i64 = spirv.Constant 5 : i64
+      %__builtin_var_WorkgroupId___addr = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi64>, Input>
+      %0 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi64>
+      %1 = spirv.CompositeExtract %0[0 : i32] : vector<3xi64>
+      %__builtin_var_WorkgroupId___addr_0 = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr<vector<3xi64>, Input>
+      %2 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr_0 : vector<3xi64>
+      %3 = spirv.CompositeExtract %2[1 : i32] : vector<3xi64>
+      spirv.Branch ^bb1
+    ^bb1:  // pred: ^bb0
+      %4 = spirv.IMul %1, %cst5_i64 : i64
+      %5 = spirv.IAdd %4, %3 : i64
+      %6 = spirv.InBoundsPtrAccessChain %arg0[%5] : !spirv.ptr<f32, CrossWorkgroup>, i64
+      %7 = spirv.Load "CrossWorkgroup" %6 ["Aligned", 4] : f32
+      %8 = spirv.IMul %1, %cst5_i64 : i64
+      %9 = spirv.IAdd %8, %3 : i64
+      %10 = spirv.InBoundsPtrAccessChain %arg1[%9] : !spirv.ptr<f32, CrossWorkgroup>, i64
+      %11 = spirv.Load "CrossWorkgroup" %10 ["Aligned", 4] : f32
+      %12 = spirv.FAdd %7, %11 : f32
+      %13 = spirv.IMul %1, %cst5_i64 : i64
+      %14 = spirv.IAdd %13, %3 : i64
+      %15 = spirv.InBoundsPtrAccessChain %arg2[%14] : !spirv.ptr<f32, CrossWorkgroup>, i64
+      spirv.Store "CrossWorkgroup" %15, %12 ["Aligned", 4] : f32
+      spirv.Return
+    }
+    spirv.EntryPoint "Kernel" @addt_kernel, @__builtin_var_WorkgroupId__
+  }
+  gpu.module @addt_kernel {
+    gpu.func @addt_kernel(%arg0: memref<?xf32>, %arg1: memref<?xf32>, %arg2: memref<?xf32>) kernel attributes {spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+      %c5 = arith.constant 5 : index
+      %0 = gpu.block_id  x
+      %1 = gpu.block_id  y
+      cf.br ^bb1
+    ^bb1:  // pred: ^bb0
+      %2 = arith.muli %0, %c5 : index
+      %3 = arith.addi %2, %1 : index
+      %4 = memref.load %arg0[%3] : memref<?xf32>
+      %5 = arith.muli %0, %c5 : index
+      %6 = arith.addi %5, %1 : index
+      %7 = memref.load %arg1[%6] : memref<?xf32>
+      %8 = arith.addf %4, %7 : f32
+      %9 = arith.muli %0, %c5 : index
+      %10 = arith.addi %9, %1 : index
+      memref.store %8, %arg2[%10] : memref<?xf32>
+      gpu.return
+    }
+  }
+}
diff --git a/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir b/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir
new file mode 100644
index 0000000000000..36d132d0c94d3
--- /dev/null
+++ b/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir
@@ -0,0 +1,58 @@
+// RUN: mlir-opt %s -pass-pipeline='builtin.module(convert-gpu-to-spirv{use-64bit-index=true use-opencl=true},spirv.module(spirv-lower-abi-attrs,spirv-update-vce),func.func(llvm-request-c-wrappers),gpu-serialize-to-spirv,convert-scf-to-cf,convert-cf-to-llvm,convert-arith-to-llvm,convert-math-to-llvm,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
+// RUN: | mlir-cpu-runner \
+// RUN:   --shared-libs=%mlir_sycl_runtime \
+// RUN:   --shared-libs=%mlir_runner_utils \
+// RUN:   --entry-point-result=void \
+// RUN: | FileCheck %s
+
+module @add attributes {gpu.container_module} {
+  memref.global "private" constant @__constant_3x3xi64_0 : memref<3x3xi64> = dense<[[1, 4098, 3], [16777220, 5, 4294967302], [7, 1099511627784, 9]]>
+  memref.global "private" constant @__constant_3x3xi64 : memref<3x3xi64> = dense<[[1, 2, 3], [4, 5, 4102], [16777223, 4294967304, 1099511627785]]>
+  func.func @main() {
+    %0 = memref.get_global @__constant_3x3xi64 : memref<3x3xi64>
+    %1 = memref.get_global @__constant_3x3xi64_0 : memref<3x3xi64>
+    %2 = call @test(%0, %1) : (memref<3x3xi64>, memref<3x3xi64>) -> memref<3x3xi64>
+    %cast = memref.cast %2 : memref<3x3xi64> to memref<*xi64>
+    call @printMemrefI64(%cast) : (memref<*xi64>) -> ()
+    return
+  }
+  func.func private @printMemrefI64(memref<*xi64>)
+  func.func @test(%arg0: memref<3x3xi64>, %arg1: memref<3x3xi64>) -> memref<3x3xi64> {
+  %c3 = arith.constant 3 : index
+  %c1 = arith.constant 1 : index
+  %0 = gpu.wait async
+  %memref, %asyncToken = gpu.alloc async [%0] host_shared (): memref<3x3xi64>
+  gpu.wait [%asyncToken]
+  memref.copy %arg1, %memref : memref<3x3xi64> to memref<3x3xi64>
+  %1 = gpu.wait async
+  %memref_0, %asyncToken_1 = gpu.alloc async [%1] host_shared () : memref<3x3xi64>
+  gpu.wait [%asyncToken_1]
+  memref.copy %arg0, %memref_0 : memref<3x3xi64> to memref<3x3xi64>
+  %2 = gpu.wait async
+  %memref_2, %asyncToken_3 = gpu.alloc async [%2] host_shared () : memref<3x3xi64>
+  %3 = gpu.launch_func async [%asyncToken_3] @test_kernel::@test_kernel blocks in (%c3, %c3, %c1) threads in (%c1, %c1, %c1) args(%memref_0 : memref<3x3xi64>, %memref : memref<3x3xi64>, %memref_2 : memref<3x3xi64>)
+  gpu.wait [%3]
+  %alloc = memref.alloc() : memref<3x3xi64>
+  memref.copy %memref_2, %alloc : memref<3x3xi64> to memref<3x3xi64>
+  %4 = gpu.wait async
+  %5 = gpu.dealloc async [%4] %memref_2 : memref<3x3xi64>
+  %6 = gpu.dealloc async [%5] %memref_0 : memref<3x3xi64>
+  %7 = gpu.dealloc async [%6] %memref : memref<3x3xi64>
+  gpu.wait [%7]
+  return %alloc : memref<3x3xi64>
+  }
+  gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR], [SPV_KHR_expect_assume]>, api=OpenCL, #spirv.resource_limits<>>} {
+    gpu.func @test_kernel(%arg0: memref<3x3xi64>, %arg1: memref<3x3xi64>, %arg2: memref<3x3xi64>) kernel attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 3, 3, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+      %0 = gpu.block_id  x
+      %1 = gpu.block_id  y
+      %2 = memref.load %arg0[%0, %1] : memref<3x3xi64>
+      %3 = memref.load %arg1[%0, %1] : memref<3x3xi64>
+      %4 = arith.addi %2, %3 : i64
+      memref.store %4, %arg2[%0, %1] : memref<3x3xi64>
+      gpu.return
+    }
+  }
+  // CHECK: [2,   4100,   6],
+  // CHECK: [16777224,   10,   4294971404],
+  // CHECK: [16777230,   1103806595088,   1099511627794]
+}
diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py
index f265ac794c6f6..5d3a4dc575a7b 100644
--- a/mlir/test/lit.cfg.py
+++ b/mlir/test/lit.cfg.py
@@ -119,6 +119,9 @@ def add_runtime(name):
 if config.enable_cuda_runner:
     tools.extend([add_runtime("mlir_cuda_runtime")])
 
+if config.enable_sycl_runner:
+    tools.extend([add_runtime("mlir_sycl_runtime")])
+
 # The following tools are optional
 tools.extend(
     [
diff --git a/mlir/test/lit.site.cfg.py.in b/mlir/test/lit.site.cfg.py.in
index ef1fdbc0cba07..897c12f3abcac 100644
--- a/mlir/test/lit.site.cfg.py.in
+++ b/mlir/test/lit.site.cfg.py.in
@@ -30,6 +30,7 @@ config.enable_cuda_runner = @MLIR_ENABLE_CUDA_RUNNER@
 config.run_rocm_tests = @MLIR_ENABLE_ROCM_CONVERSIONS@
 config.enable_rocm_runner = @MLIR_ENABLE_ROCM_RUNNER@
 config.rocm_test_chipset = "@ROCM_TEST_CHIPSET@"
+config.enable_sycl_runner = @MLIR_ENABLE_SYCL_RUNNER@
 config.enable_spirv_cpu_runner = @MLIR_ENABLE_SPIRV_CPU_RUNNER@
 config.enable_vulkan_runner = @MLIR_ENABLE_VULKAN_RUNNER@
 config.enable_bindings_python = @MLIR_ENABLE_BINDINGS_PYTHON@