diff --git a/mlir/CMakeLists.txt b/mlir/CMakeLists.txt index fa4f6e76f985f..4a67e01827381 100644 --- a/mlir/CMakeLists.txt +++ b/mlir/CMakeLists.txt @@ -116,6 +116,7 @@ add_definitions(-DMLIR_ROCM_CONVERSIONS_ENABLED=${MLIR_ENABLE_ROCM_CONVERSIONS}) set(MLIR_ENABLE_CUDA_RUNNER 0 CACHE BOOL "Enable building the mlir CUDA runner") set(MLIR_ENABLE_ROCM_RUNNER 0 CACHE BOOL "Enable building the mlir ROCm runner") +set(MLIR_ENABLE_SYCL_RUNNER 0 CACHE BOOL "Enable building the mlir Sycl runner") set(MLIR_ENABLE_SPIRV_CPU_RUNNER 0 CACHE BOOL "Enable building the mlir SPIR-V cpu runner") set(MLIR_ENABLE_VULKAN_RUNNER 0 CACHE BOOL "Enable building the mlir Vulkan runner") set(MLIR_ENABLE_NVPTXCOMPILER 0 CACHE BOOL diff --git a/mlir/cmake/modules/FindLevelZero.cmake b/mlir/cmake/modules/FindLevelZero.cmake new file mode 100644 index 0000000000000..012187f0afc0b --- /dev/null +++ b/mlir/cmake/modules/FindLevelZero.cmake @@ -0,0 +1,221 @@ +# CMake find_package() module for level-zero +# +# Example usage: +# +# find_package(LevelZero) +# +# If successful, the following variables will be defined: +# LevelZero_FOUND +# LevelZero_INCLUDE_DIRS +# LevelZero_LIBRARY +# LevelZero_LIBRARIES_DIR +# +# By default, the module searches the standard paths to locate the "ze_api.h" +# and the ze_loader shared library. When using a custom level-zero installation, +# the environment variable "LEVEL_ZERO_DIR" should be specified telling the +# module to get the level-zero library and headers from that location. + +include(FindPackageHandleStandardArgs) + +# Search path priority +# 1. CMake Variable LEVEL_ZERO_DIR +# 2. Environment Variable LEVEL_ZERO_DIR + +if(NOT LEVEL_ZERO_DIR) + if(DEFINED ENV{LEVEL_ZERO_DIR}) + set(LEVEL_ZERO_DIR "$ENV{LEVEL_ZERO_DIR}") + endif() +endif() + +if(LEVEL_ZERO_DIR) + find_path(LevelZero_INCLUDE_DIR + NAMES level_zero/ze_api.h + PATHS ${LEVEL_ZERO_DIR}/include + NO_DEFAULT_PATH + ) + + if(LINUX) + find_library(LevelZero_LIBRARY + NAMES ze_loader + PATHS ${LEVEL_ZERO_DIR}/lib + ${LEVEL_ZERO_DIR}/lib/x86_64-linux-gnu + NO_DEFAULT_PATH + ) + else() + find_library(LevelZero_LIBRARY + NAMES ze_loader + PATHS ${LEVEL_ZERO_DIR}/lib + NO_DEFAULT_PATH + ) + endif() +else() + find_path(LevelZero_INCLUDE_DIR + NAMES level_zero/ze_api.h + ) + + find_library(LevelZero_LIBRARY + NAMES ze_loader + ) +endif() + +# Compares the two version string that are supposed to be in x.y.z format +# and reports if the argument VERSION_STR1 is greater than or equal than +# version_str2. The strings are compared lexicographically after conversion to +# lists of equal lengths, with the shorter string getting zero-padded. +function(compare_versions VERSION_STR1 VERSION_STR2 OUTPUT) + # Convert the strings to list + string(REPLACE "." ";" VL1 ${VERSION_STR1}) + string(REPLACE "." ";" VL2 ${VERSION_STR2}) + # get lengths of both lists + list(LENGTH VL1 VL1_LEN) + list(LENGTH VL2 VL2_LEN) + set(LEN ${VL1_LEN}) + # If they differ in size pad the shorter list with 0s + if(VL1_LEN GREATER VL2_LEN) + math(EXPR DIFF "${VL1_LEN} - ${VL2_LEN}" OUTPUT_FORMAT DECIMAL) + foreach(IDX RANGE 1 ${DIFF} 1) + list(APPEND VL2 "0") + endforeach() + elseif(VL2_LEN GREATER VL2_LEN) + math(EXPR DIFF "${VL1_LEN} - ${VL2_LEN}" OUTPUT_FORMAT DECIMAL) + foreach(IDX RANGE 1 ${DIFF} 1) + list(APPEND VL2 "0") + endforeach() + set(LEN ${VL2_LEN}) + endif() + math(EXPR LEN_SUB_ONE "${LEN}-1") + foreach(IDX RANGE 0 ${LEN_SUB_ONE} 1) + list(GET VL1 ${IDX} VAL1) + list(GET VL2 ${IDX} VAL2) + + if(${VAL1} GREATER ${VAL2}) + set(${OUTPUT} TRUE PARENT_SCOPE) + break() + elseif(${VAL1} LESS ${VAL2}) + set(${OUTPUT} FALSE PARENT_SCOPE) + break() + else() + set(${OUTPUT} TRUE PARENT_SCOPE) + endif() + endforeach() + + endfunction(compare_versions) + +# Creates a small function to run and extract the LevelZero loader version. +function(get_l0_loader_version) + + set(L0_VERSIONEER_SRC + [====[ + #include + #include + #include + int main() { + ze_result_t result; + std::string loader("loader"); + zel_component_version_t *versions; + size_t size = 0; + result = zeInit(0); + if (result != ZE_RESULT_SUCCESS) { + std::cerr << "Failed to init ze driver" << std::endl; + return -1; + } + zelLoaderGetVersions(&size, nullptr); + versions = new zel_component_version_t[size]; + zelLoaderGetVersions(&size, versions); + for (size_t i = 0; i < size; i++) { + if (loader.compare(versions[i].component_name) == 0) { + std::cout << versions[i].component_lib_version.major << "." + << versions[i].component_lib_version.minor << "." + << versions[i].component_lib_version.patch; + break; + } + } + delete[] versions; + return 0; + } + ]====] + ) + + set(L0_VERSIONEER_FILE ${CMAKE_BINARY_DIR}/temp/l0_versioneer.cpp) + + file(WRITE ${L0_VERSIONEER_FILE} "${L0_VERSIONEER_SRC}") + + # We need both the directories in the include path as ze_loader.h + # includes "ze_api.h" and not "level_zero/ze_api.h". + list(APPEND INCLUDE_DIRS ${LevelZero_INCLUDE_DIR}) + list(APPEND INCLUDE_DIRS ${LevelZero_INCLUDE_DIR}/level_zero) + list(JOIN INCLUDE_DIRS ";" INCLUDE_DIRS_STR) + try_run(L0_VERSIONEER_RUN L0_VERSIONEER_COMPILE + "${CMAKE_BINARY_DIR}" + "${L0_VERSIONEER_FILE}" + LINK_LIBRARIES ${LevelZero_LIBRARY} + CMAKE_FLAGS + "-DINCLUDE_DIRECTORIES=${INCLUDE_DIRS_STR}" + RUN_OUTPUT_VARIABLE L0_VERSION + ) + if(${L0_VERSIONEER_COMPILE} AND (DEFINED L0_VERSIONEER_RUN)) + set(LevelZero_VERSION ${L0_VERSION} PARENT_SCOPE) + message(STATUS "Found Level Zero of version: ${L0_VERSION}") + else() + message(FATAL_ERROR + "Could not compile a level-zero program to extract loader version" + ) + endif() +endfunction(get_l0_loader_version) + +if(LevelZero_INCLUDE_DIR AND LevelZero_LIBRARY) + list(APPEND LevelZero_LIBRARIES "${LevelZero_LIBRARY}") + list(APPEND LevelZero_INCLUDE_DIRS ${LevelZero_INCLUDE_DIR}) + if(OpenCL_FOUND) + list(APPEND LevelZero_INCLUDE_DIRS ${OpenCL_INCLUDE_DIRS}) + endif() + + cmake_path(GET LevelZero_LIBRARY PARENT_PATH LevelZero_LIBRARIES_PATH) + set(LevelZero_LIBRARIES_DIR ${LevelZero_LIBRARIES_PATH}) + + if(NOT TARGET LevelZero::LevelZero) + add_library(LevelZero::LevelZero INTERFACE IMPORTED) + set_target_properties(LevelZero::LevelZero + PROPERTIES INTERFACE_LINK_LIBRARIES "${LevelZero_LIBRARIES}" + ) + set_target_properties(LevelZero::LevelZero + PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${LevelZero_INCLUDE_DIRS}" + ) + endif() +endif() + +# Check if a specific version of Level Zero is required +if(LevelZero_FIND_VERSION) + get_l0_loader_version() + set(VERSION_GT_FIND_VERSION FALSE) + compare_versions( + ${LevelZero_VERSION} + ${LevelZero_FIND_VERSION} + VERSION_GT_FIND_VERSION + ) + if(${VERSION_GT_FIND_VERSION}) + set(LevelZero_FOUND TRUE) + else() + set(LevelZero_FOUND FALSE) + endif() +else() + set(LevelZero_FOUND TRUE) +endif() + +find_package_handle_standard_args(LevelZero + REQUIRED_VARS + LevelZero_FOUND + LevelZero_INCLUDE_DIRS + LevelZero_LIBRARY + LevelZero_LIBRARIES_DIR + HANDLE_COMPONENTS +) +mark_as_advanced(LevelZero_LIBRARY LevelZero_INCLUDE_DIRS) + +if(LevelZero_FOUND) + find_package_message(LevelZero "Found LevelZero: ${LevelZero_LIBRARY}" + "(found version ${LevelZero_VERSION})" + ) +else() + find_package_message(LevelZero "Could not find LevelZero" "") +endif() diff --git a/mlir/cmake/modules/FindSyclRuntime.cmake b/mlir/cmake/modules/FindSyclRuntime.cmake new file mode 100644 index 0000000000000..38b065a3f284c --- /dev/null +++ b/mlir/cmake/modules/FindSyclRuntime.cmake @@ -0,0 +1,68 @@ +# CMake find_package() module for SYCL Runtime +# +# Example usage: +# +# find_package(SyclRuntime) +# +# If successful, the following variables will be defined: +# SyclRuntime_FOUND +# SyclRuntime_INCLUDE_DIRS +# SyclRuntime_LIBRARY +# SyclRuntime_LIBRARIES_DIR +# + +include(FindPackageHandleStandardArgs) + +if(NOT DEFINED ENV{CMPLR_ROOT}) + message(WARNING "Please make sure to install Intel DPC++ Compiler and run setvars.(sh/bat)") + message(WARNING "You can download standalone Intel DPC++ Compiler from https://www.intel.com/content/www/us/en/developer/articles/tool/oneapi-standalone-components.html#compilers") +else() + if(LINUX OR (${CMAKE_SYSTEM_NAME} MATCHES "Linux")) + set(SyclRuntime_ROOT "$ENV{CMPLR_ROOT}/linux") + elseif(WIN32) + set(SyclRuntime_ROOT "$ENV{CMPLR_ROOT}/windows") + endif() + list(APPEND SyclRuntime_INCLUDE_DIRS "${SyclRuntime_ROOT}/include") + list(APPEND SyclRuntime_INCLUDE_DIRS "${SyclRuntime_ROOT}/include/sycl") + + set(SyclRuntime_LIBRARY_DIR "${SyclRuntime_ROOT}/lib") + + message(STATUS "SyclRuntime_LIBRARY_DIR: ${SyclRuntime_LIBRARY_DIR}") + find_library(SyclRuntime_LIBRARY + NAMES sycl + PATHS ${SyclRuntime_LIBRARY_DIR} + NO_DEFAULT_PATH + ) +endif() + +if(SyclRuntime_LIBRARY) + set(SyclRuntime_FOUND TRUE) + if(NOT TARGET SyclRuntime::SyclRuntime) + add_library(SyclRuntime::SyclRuntime INTERFACE IMPORTED) + set_target_properties(SyclRuntime::SyclRuntime + PROPERTIES INTERFACE_LINK_LIBRARIES "${SyclRuntime_LIBRARY}" + ) + set_target_properties(SyclRuntime::SyclRuntime + PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${SyclRuntime_INCLUDE_DIRS}" + ) + endif() +else() + set(SyclRuntime_FOUND FALSE) +endif() + +find_package_handle_standard_args(SyclRuntime + REQUIRED_VARS + SyclRuntime_FOUND + SyclRuntime_INCLUDE_DIRS + SyclRuntime_LIBRARY + SyclRuntime_LIBRARY_DIR + HANDLE_COMPONENTS +) + +mark_as_advanced(SyclRuntime_LIBRARY SyclRuntime_INCLUDE_DIRS) + +if(SyclRuntime_FOUND) + find_package_message(SyclRuntime "Found SyclRuntime: ${SyclRuntime_LIBRARY}" "") +else() + find_package_message(SyclRuntime "Could not find SyclRuntime" "") +endif() diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td index ed37abf85275b..3bb6006a467fe 100644 --- a/mlir/include/mlir/Conversion/Passes.td +++ b/mlir/include/mlir/Conversion/Passes.td @@ -568,7 +568,10 @@ def ConvertGPUToSPIRV : Pass<"convert-gpu-to-spirv", "ModuleOp"> { let options = [ Option<"use64bitIndex", "use-64bit-index", "bool", /*default=*/"false", - "Use 64-bit integers to convert index types"> + "Use 64-bit integers to convert index types">, + Option<"useOpenCL", "use-opencl", + "bool", /*default=*/"false", + "Use OpenCL instead of Vulkan"> ]; } diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td index fc20bd2ed921a..f285f45448ecc 100644 --- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td @@ -187,4 +187,8 @@ def GpuROCDLAttachTarget: Pass<"rocdl-attach-target", ""> { ]; } +def GpuSerializeToSPIRVPass : Pass<"gpu-serialize-to-spirv", "ModuleOp"> { + let summary = "Serialize spirv dialect to spirv binary"; +} + #endif // MLIR_DIALECT_GPU_PASSES diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp index eddf3e9a47d0b..808431b824724 100644 --- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp @@ -101,7 +101,8 @@ class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern { FunctionCallBuilder moduleLoadCallBuilder = { "mgpuModuleLoad", llvmPointerType /* void *module */, - {llvmPointerType /* void *cubin */}}; + {llvmPointerType, /* void *cubin */ + llvmInt64Type /* size_t size */}}; FunctionCallBuilder moduleUnloadCallBuilder = { "mgpuModuleUnload", llvmVoidType, {llvmPointerType /* void *module */}}; FunctionCallBuilder moduleGetFunctionCallBuilder = { @@ -125,7 +126,8 @@ class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern { llvmInt32Type, /* unsigned int sharedMemBytes */ llvmPointerType, /* void *hstream */ llvmPointerPointerType, /* void **kernelParams */ - llvmPointerPointerType /* void **extra */ + llvmPointerPointerType, /* void **extra */ + llvmInt64Type /* size_t paramsCount */ }}; FunctionCallBuilder streamCreateCallBuilder = { "mgpuStreamCreate", llvmPointerType /* void *stream */, {}}; @@ -167,7 +169,8 @@ class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern { "mgpuMemAlloc", llvmPointerType /* void * */, {llvmIntPtrType /* intptr_t sizeBytes */, - llvmPointerType /* void *stream */}}; + llvmPointerType /* void *stream */, + llvmInt64Type /* size_t isHostShared */}}; FunctionCallBuilder deallocCallBuilder = { "mgpuMemFree", llvmVoidType, @@ -786,10 +789,6 @@ LogicalResult ConvertHostUnregisterOpToGpuRuntimeCallPattern::matchAndRewrite( LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite( gpu::AllocOp allocOp, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { - if (adaptor.getHostShared()) - return rewriter.notifyMatchFailure( - allocOp, "host_shared allocation is not supported"); - MemRefType memRefType = allocOp.getType(); if (failed(areAllLLVMTypes(allocOp, adaptor.getOperands(), rewriter)) || @@ -799,6 +798,8 @@ LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite( auto loc = allocOp.getLoc(); + bool isShared = allocOp.getHostShared(); + // Get shape of the memref as values: static sizes are constant // values and dynamic sizes are passed to 'alloc' as operands. SmallVector shape; @@ -811,8 +812,13 @@ LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite( // descriptor. Type elementPtrType = this->getElementPtrType(memRefType); auto stream = adaptor.getAsyncDependencies().front(); + + auto isHostShared = rewriter.create( + loc, llvmInt64Type, rewriter.getI64IntegerAttr(isShared)); + Value allocatedPtr = - allocCallBuilder.create(loc, rewriter, {sizeBytes, stream}).getResult(); + allocCallBuilder.create(loc, rewriter, {sizeBytes, stream, isHostShared}) + .getResult(); if (!getTypeConverter()->useOpaquePointers()) allocatedPtr = rewriter.create(loc, elementPtrType, allocatedPtr); @@ -1134,7 +1140,21 @@ LogicalResult ConvertLaunchFuncOpToGpuRuntimeCallPattern::matchAndRewrite( loc, rewriter, nameBuffer.str(), binaryAttr.getValue(), LLVM::Linkage::Internal, getTypeConverter()->useOpaquePointers()); - auto module = moduleLoadCallBuilder.create(loc, rewriter, data); + // SPIRV requires binary size + auto gpuBlob = binaryAttr.getValue(); + auto gpuBlobSize = rewriter.create( + loc, llvmInt64Type, + mlir::IntegerAttr::get(llvmInt64Type, + static_cast(gpuBlob.size()))); + + auto paramsCount = rewriter.create( + loc, llvmInt64Type, + mlir::IntegerAttr::get( + llvmInt64Type, + static_cast(launchOp.getNumKernelOperands()))); + + auto module = + moduleLoadCallBuilder.create(loc, rewriter, {data, gpuBlobSize}); // Get the function from the module. The name corresponds to the name of // the kernel function. auto kernelName = generateKernelNameConstant( @@ -1158,7 +1178,7 @@ LogicalResult ConvertLaunchFuncOpToGpuRuntimeCallPattern::matchAndRewrite( {function.getResult(), adaptor.getGridSizeX(), adaptor.getGridSizeY(), adaptor.getGridSizeZ(), adaptor.getBlockSizeX(), adaptor.getBlockSizeY(), adaptor.getBlockSizeZ(), dynamicSharedMemorySize, stream, kernelParams, - /*extra=*/nullpointer}); + /*extra=*/nullpointer, paramsCount}); if (launchOp.getAsyncToken()) { // Async launch: make dependent ops use the same stream. diff --git a/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp b/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp index f37c70a771f59..a52c99ec9daec 100644 --- a/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp +++ b/mlir/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp @@ -71,7 +71,8 @@ void GPUToSPIRVPass::runOnOperation() { std::unique_ptr target = spirv::getMemorySpaceToStorageClassTarget(*context); spirv::MemorySpaceToStorageClassMap memorySpaceMap = - spirv::mapMemorySpaceToVulkanStorageClass; + this->useOpenCL ? spirv::mapMemorySpaceToOpenCLStorageClass : + spirv::mapMemorySpaceToVulkanStorageClass; spirv::MemorySpaceToStorageClassConverter converter(memorySpaceMap); RewritePatternSet patterns(context); diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt index 6244132c073a4..38fa60ba06f59 100644 --- a/mlir/lib/Dialect/GPU/CMakeLists.txt +++ b/mlir/lib/Dialect/GPU/CMakeLists.txt @@ -58,6 +58,7 @@ add_mlir_dialect_library(MLIRGPUTransforms Transforms/SerializeToBlob.cpp Transforms/SerializeToCubin.cpp Transforms/SerializeToHsaco.cpp + Transforms/SerializeToSPIRV.cpp Transforms/ShuffleRewriter.cpp Transforms/ROCDLAttachTarget.cpp @@ -96,6 +97,7 @@ add_mlir_dialect_library(MLIRGPUTransforms MLIRSupport MLIRROCDLTarget MLIRTransformUtils + MLIRSPIRVSerialization ) add_subdirectory(TransformOps) diff --git a/mlir/lib/Dialect/GPU/Transforms/SerializeToSPIRV.cpp b/mlir/lib/Dialect/GPU/Transforms/SerializeToSPIRV.cpp new file mode 100644 index 0000000000000..f013f531371de --- /dev/null +++ b/mlir/lib/Dialect/GPU/Transforms/SerializeToSPIRV.cpp @@ -0,0 +1,70 @@ +//===- SerializeToSPIRV.cpp - Convert GPU kernel to SPIRV blob -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This pass iterates all the SPIR-V modules in the top module and serializes +/// each SPIR-V module to SPIR-V binary and then attachs the binary blob as a +/// string attribute to the corresponding gpu module. +/// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/GPU/Transforms/Passes.h" + +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/GPU/Transforms/Passes.h" +#include "mlir/Dialect/SPIRV/IR/SPIRVDialect.h" +#include "mlir/Dialect/SPIRV/IR/SPIRVOps.h" +#include "mlir/Target/SPIRV/Serialization.h" + +namespace mlir { +#define GEN_PASS_DEF_GPUSERIALIZETOSPIRVPASS +#include "mlir/Dialect/GPU/Transforms/Passes.h.inc" +} // namespace mlir + +using namespace mlir; + +struct GpuSerializeToSPIRVPass : public mlir::impl::GpuSerializeToSPIRVPassBase { +public: + void runOnOperation() override { + auto mod = getOperation(); + llvm::SmallVector spvBinary; + for (mlir::gpu::GPUModuleOp gpuMod : mod.getOps()) { + auto name = gpuMod.getName(); + // check that the spv module has the same name with gpu module except the + // prefix "__spv__" + auto isSameMod = [&](spirv::ModuleOp spvMod) -> bool { + auto spvModName = spvMod.getName(); + return spvModName->consume_front("__spv__") && spvModName == name; + }; + auto spvMods = mod.getOps(); + auto it = llvm::find_if(spvMods, isSameMod); + if (it == spvMods.end()) { + gpuMod.emitError() << "Unable to find corresponding SPIR-V module"; + signalPassFailure(); + return; + } + auto spvMod = *it; + + spvBinary.clear(); + // serialize the spv module to spv binary + if (mlir::failed(spirv::serialize(spvMod, spvBinary))) { + spvMod.emitError() << "Failed to serialize SPIR-V module"; + signalPassFailure(); + return; + } + + // attach the spv binary to the gpu module + auto spvData = + llvm::StringRef(reinterpret_cast(spvBinary.data()), + spvBinary.size() * sizeof(uint32_t)); + auto spvAttr = mlir::StringAttr::get(&getContext(), spvData); + gpuMod->setAttr(gpu::getDefaultGpuBinaryAnnotation(), spvAttr); + spvMod->erase(); + } + } +}; diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt index ea33c2c6ed261..101d9baafcccc 100644 --- a/mlir/lib/ExecutionEngine/CMakeLists.txt +++ b/mlir/lib/ExecutionEngine/CMakeLists.txt @@ -6,6 +6,7 @@ set(LLVM_OPTIONAL_SOURCES CRunnerUtils.cpp CudaRuntimeWrappers.cpp SparseTensorRuntime.cpp + SyclRuntimeWrappers.cpp ExecutionEngine.cpp Float16bits.cpp RocmRuntimeWrappers.cpp @@ -328,4 +329,39 @@ if(LLVM_ENABLE_PIC) hip::host hip::amdhip64 ) endif() + + if(MLIR_ENABLE_SYCL_RUNNER) + find_package(SyclRuntime) + + if(NOT SyclRuntime_FOUND) + message(FATAL_ERROR "syclRuntime not found. Please set check oneapi installation and run setvars.sh.") + endif() + + find_package(LevelZero) + + if(NOT LevelZero_FOUND) + message(FATAL_ERROR "LevelZero not found. Please set LEVEL_ZERO_DIR.") + endif() + + add_mlir_library(mlir_sycl_runtime + SHARED + SyclRuntimeWrappers.cpp + + EXCLUDE_FROM_LIBMLIR + ) + + check_cxx_compiler_flag("-frtti" CXX_HAS_FRTTI_FLAG) + if(NOT CXX_HAS_FRTTI_FLAG) + message(FATAL_ERROR "CXX compiler does not accept flag -frtti") + endif() + target_compile_options (mlir_sycl_runtime PUBLIC -fexceptions -frtti) + + target_include_directories(mlir_sycl_runtime PRIVATE + ${MLIR_INCLUDE_DIRS} + ) + + target_link_libraries(mlir_sycl_runtime PRIVATE LevelZero::LevelZero SyclRuntime::SyclRuntime) + + set_property(TARGET mlir_sycl_runtime APPEND PROPERTY BUILD_RPATH "${LevelZero_LIBRARIES_DIR}" "${SyclRuntime_LIBRARIES_DIR}") + endif() endif() diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp index 1dba677ebe663..79dc2eed38f06 100644 --- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp +++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp @@ -119,7 +119,8 @@ static bool cusparseLt_initiated = false; #endif // MLIR_ENABLE_CUDA_CUSPARSELT #endif // MLIR_ENABLE_CUDA_CUSPARSE -extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule mgpuModuleLoad(void *data) { +extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule +mgpuModuleLoad(void *data, size_t /*gpuBlobSize*/) { ScopedContext scopedContext; CUmodule module = nullptr; CUDA_REPORT_IF_ERROR(cuModuleLoadData(&module, data)); @@ -144,7 +145,7 @@ extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuLaunchKernel(CUfunction function, intptr_t gridX, intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY, intptr_t blockZ, int32_t smem, CUstream stream, void **params, - void **extra) { + void **extra, size_t /*paramsCount*/) { ScopedContext scopedContext; int32_t maxShmem = 0; CUdevice device = getDefaultCuDevice(); @@ -210,7 +211,8 @@ extern MLIR_CUDA_WRAPPERS_EXPORT "C" void mgpuEventRecord(CUevent event, CUDA_REPORT_IF_ERROR(cuEventRecord(event, stream)); } -extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, CUstream /*stream*/) { +extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, CUstream /*stream*/, + bool /*isHostShared*/) { ScopedContext scopedContext; CUdeviceptr ptr; CUDA_REPORT_IF_ERROR(cuMemAlloc(&ptr, sizeBytes)); diff --git a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp index bd3868a8e196f..b50fd7eb9d059 100644 --- a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp +++ b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp @@ -32,7 +32,7 @@ thread_local static int32_t defaultDevice = 0; -extern "C" hipModule_t mgpuModuleLoad(void *data) { +extern "C" hipModule_t mgpuModuleLoad(void *data, size_t /*gpuBlobSize*/) { hipModule_t module = nullptr; HIP_REPORT_IF_ERROR(hipModuleLoadData(&module, data)); return module; @@ -57,7 +57,7 @@ extern "C" void mgpuLaunchKernel(hipFunction_t function, intptr_t gridX, intptr_t blockX, intptr_t blockY, intptr_t blockZ, int32_t smem, hipStream_t stream, void **params, - void **extra) { + void **extra, size_t /*paramsCount*/) { HIP_REPORT_IF_ERROR(hipModuleLaunchKernel(function, gridX, gridY, gridZ, blockX, blockY, blockZ, smem, stream, params, extra)); @@ -99,7 +99,8 @@ extern "C" void mgpuEventRecord(hipEvent_t event, hipStream_t stream) { HIP_REPORT_IF_ERROR(hipEventRecord(event, stream)); } -extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, hipStream_t /*stream*/) { +extern "C" void *mgpuMemAlloc(uint64_t sizeBytes, hipStream_t /*stream*/, + bool /*isHostShared*/) { void *ptr; HIP_REPORT_IF_ERROR(hipMalloc(&ptr, sizeBytes)); return ptr; diff --git a/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp new file mode 100644 index 0000000000000..6b40d4a6922c9 --- /dev/null +++ b/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp @@ -0,0 +1,223 @@ +//===- SyclRuntimeWrappers.cpp - MLIR SYCL wrapper library ------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Implements C wrappers around the sycl runtime library. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#ifdef _WIN32 +#define SYCL_RUNTIME_EXPORT __declspec(dllexport) +#else +#define SYCL_RUNTIME_EXPORT +#endif // _WIN32 + +namespace { + +template +auto catchAll(F &&func) { + try { + return func(); + } catch (const std::exception &e) { + fprintf(stdout, "An exception was thrown: %s\n", e.what()); + fflush(stdout); + abort(); + } catch (...) { + fprintf(stdout, "An unknown exception was thrown\n"); + fflush(stdout); + abort(); + } +} + +#define L0_SAFE_CALL(call) \ + { \ + ze_result_t status = (call); \ + if (status != ZE_RESULT_SUCCESS) { \ + fprintf(stdout, "L0 error %d\n", status); \ + fflush(stdout); \ + abort(); \ + } \ + } + +} // namespace + +static sycl::device getDefaultDevice() { + auto platformList = sycl::platform::get_platforms(); + for (const auto &platform : platformList) { + auto platformName = platform.get_info(); + bool isLevelZero = platformName.find("Level-Zero") != std::string::npos; + if (!isLevelZero) + continue; + + return platform.get_devices()[0]; + } + throw std::runtime_error("getDefaultDevice failed"); +} + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wglobal-constructors" + +// Create global device and context +sycl::device syclDevice = getDefaultDevice(); +sycl::context syclContext = sycl::context(syclDevice); + +#pragma clang diagnostic pop + +struct QUEUE { + sycl::queue syclQueue_; + + QUEUE() { syclQueue_ = sycl::queue(syclContext, syclDevice); } +}; + +static void *allocDeviceMemory(QUEUE *queue, size_t size, bool isShared) { + void *memPtr = nullptr; + if (isShared) { + memPtr = sycl::aligned_alloc_shared(64, size, syclDevice, syclContext); + } else { + memPtr = sycl::aligned_alloc_device(64, size, syclDevice, syclContext); + } + if (memPtr == nullptr) { + throw std::runtime_error("mem allocation failed!"); + } + return memPtr; +} + +static void deallocDeviceMemory(QUEUE *queue, void *ptr) { + sycl::free(ptr, queue->syclQueue_); +} + +static ze_module_handle_t loadModule(const void *data, size_t dataSize) { + assert(data); + ze_module_handle_t zeModule; + ze_module_desc_t desc = {ZE_STRUCTURE_TYPE_MODULE_DESC, + nullptr, + ZE_MODULE_FORMAT_IL_SPIRV, + dataSize, + (const uint8_t *)data, + nullptr, + nullptr}; + auto zeDevice = + sycl::get_native(syclDevice); + auto zeContext = + sycl::get_native(syclContext); + L0_SAFE_CALL(zeModuleCreate(zeContext, zeDevice, &desc, &zeModule, nullptr)); + return zeModule; +} + +static sycl::kernel *getKernel(ze_module_handle_t zeModule, const char *name) { + assert(zeModule); + assert(name); + ze_kernel_handle_t zeKernel; + sycl::kernel *syclKernel; + ze_kernel_desc_t desc = {}; + desc.pKernelName = name; + + L0_SAFE_CALL(zeKernelCreate(zeModule, &desc, &zeKernel)); + sycl::kernel_bundle kernelBundle = + sycl::make_kernel_bundle({zeModule}, + syclContext); + + auto kernel = sycl::make_kernel( + {kernelBundle, zeKernel}, syclContext); + syclKernel = new sycl::kernel(kernel); + return syclKernel; +} + +static void launchKernel(QUEUE *queue, sycl::kernel *kernel, size_t gridX, + size_t gridY, size_t gridZ, size_t blockX, + size_t blockY, size_t blockZ, size_t sharedMemBytes, + void **params, size_t paramsCount) { + auto syclGlobalRange = + ::sycl::range<3>(blockZ * gridZ, blockY * gridY, blockX * gridX); + auto syclLocalRange = ::sycl::range<3>(blockZ, blockY, blockX); + sycl::nd_range<3> syclNdRange( + sycl::nd_range<3>(syclGlobalRange, syclLocalRange)); + + queue->syclQueue_.submit([&](sycl::handler &cgh) { + for (size_t i = 0; i < paramsCount; i++) { + cgh.set_arg(static_cast(i), *(static_cast(params[i]))); + } + cgh.parallel_for(syclNdRange, *kernel); + }); +} + +// Wrappers + +extern "C" SYCL_RUNTIME_EXPORT QUEUE *mgpuStreamCreate() { + + return catchAll([&]() { return new QUEUE(); }); +} + +extern "C" SYCL_RUNTIME_EXPORT void mgpuStreamDestroy(QUEUE *queue) { + catchAll([&]() { delete queue; }); +} + +extern "C" SYCL_RUNTIME_EXPORT void *mgpuMemAlloc(uint64_t size, QUEUE *queue, + bool isShared) { + return catchAll([&]() { + return allocDeviceMemory(queue, static_cast(size), true); + }); +} + +extern "C" SYCL_RUNTIME_EXPORT void mgpuMemFree(void *ptr, QUEUE *queue) { + catchAll([&]() { + if (ptr) { + deallocDeviceMemory(queue, ptr); + } + }); +} + +extern "C" SYCL_RUNTIME_EXPORT ze_module_handle_t +mgpuModuleLoad(const void *data, size_t gpuBlobSize) { + return catchAll([&]() { return loadModule(data, gpuBlobSize); }); +} + +extern "C" SYCL_RUNTIME_EXPORT sycl::kernel * +mgpuModuleGetFunction(ze_module_handle_t module, const char *name) { + return catchAll([&]() { return getKernel(module, name); }); +} + +extern "C" SYCL_RUNTIME_EXPORT void +mgpuLaunchKernel(sycl::kernel *kernel, size_t gridX, size_t gridY, size_t gridZ, + size_t blockX, size_t blockY, size_t blockZ, + size_t sharedMemBytes, QUEUE *queue, void **params, + void **extra, size_t paramsCount) { + return catchAll([&]() { + launchKernel(queue, kernel, gridX, gridY, gridZ, blockX, blockY, blockZ, + sharedMemBytes, params, paramsCount); + }); +} + +extern "C" SYCL_RUNTIME_EXPORT void mgpuStreamSynchronize(QUEUE *queue) { + + catchAll([&]() { queue->syclQueue_.wait(); }); +} + +extern "C" SYCL_RUNTIME_EXPORT void +mgpuModuleUnload(ze_module_handle_t module) { + + catchAll([&]() { L0_SAFE_CALL(zeModuleDestroy(module)); }); +} diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt index 66a9cb01106ba..874e7718f4a36 100644 --- a/mlir/test/CMakeLists.txt +++ b/mlir/test/CMakeLists.txt @@ -135,6 +135,10 @@ if(MLIR_ENABLE_ROCM_RUNNER) list(APPEND MLIR_TEST_DEPENDS mlir_rocm_runtime) endif() +if(MLIR_ENABLE_SYCL_RUNNER) + list(APPEND MLIR_TEST_DEPENDS mlir_sycl_runtime) +endif() + list(APPEND MLIR_TEST_DEPENDS MLIRUnitTests) if(LLVM_BUILD_EXAMPLES) diff --git a/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir index 2506c6ceb990e..f365dcb02daf4 100644 --- a/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir +++ b/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir @@ -8,7 +8,8 @@ module attributes {gpu.container_module} { %0 = gpu.wait async // CHECK: %[[gep:.*]] = llvm.getelementptr {{.*}}[%[[size]]] // CHECK: %[[size_bytes:.*]] = llvm.ptrtoint %[[gep]] - // CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[stream]]) + // CHECK: %[[isHostShared:.*]] = llvm.mlir.constant + // CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[stream]], %[[isHostShared]]) %1, %2 = gpu.alloc async [%0] (%size) : memref // CHECK: %[[float_ptr:.*]] = llvm.extractvalue {{.*}}[0] // CHECK: llvm.call @mgpuMemFree(%[[float_ptr]], %[[stream]]) diff --git a/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir index 2cdc4e8dbb1ad..96e8a6dbd35b1 100644 --- a/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir +++ b/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir @@ -34,8 +34,10 @@ module attributes {gpu.container_module} { // CHECK: [[ADDRESSOF:%.*]] = llvm.mlir.addressof @[[GLOBAL]] // CHECK: [[BINARY:%.*]] = llvm.getelementptr [[ADDRESSOF]]{{\[}}0, 0] // CHECK-SAME: -> !llvm.ptr + // CHECK: [[BINARYSIZE:%.*]] = llvm.mlir.constant + // CHECK: [[PARAMSCOUNT:%.*]] = llvm.mlir.constant - // CHECK: [[MODULE:%.*]] = llvm.call @mgpuModuleLoad([[BINARY]]) + // CHECK: [[MODULE:%.*]] = llvm.call @mgpuModuleLoad([[BINARY]], [[BINARYSIZE]]) // CHECK: [[FUNC:%.*]] = llvm.call @mgpuModuleGetFunction([[MODULE]], {{.*}}) // CHECK: [[STREAM:%.*]] = llvm.call @mgpuStreamCreate @@ -53,10 +55,11 @@ module attributes {gpu.container_module} { // CHECK: llvm.getelementptr %[[MEMREF]][0, 5] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct[[STRUCT_BODY:<.*>]] // CHECK: [[EXTRA_PARAMS:%.*]] = llvm.mlir.null : !llvm.ptr + // CHECK: llvm.call @mgpuLaunchKernel([[FUNC]], [[C8]], [[C8]], [[C8]], // CHECK-SAME: [[C8]], [[C8]], [[C8]], [[C256]], [[STREAM]], - // CHECK-SAME: [[PARAMS]], [[EXTRA_PARAMS]]) + // CHECK-SAME: [[PARAMS]], [[EXTRA_PARAMS]], [[PARAMSCOUNT]]) // CHECK: llvm.call @mgpuStreamSynchronize // CHECK: llvm.call @mgpuStreamDestroy // CHECK: llvm.call @mgpuModuleUnload diff --git a/mlir/test/Conversion/GPUCommon/typed-pointers.mlir b/mlir/test/Conversion/GPUCommon/typed-pointers.mlir index 2fa6c854c5678..e27162c7dbc19 100644 --- a/mlir/test/Conversion/GPUCommon/typed-pointers.mlir +++ b/mlir/test/Conversion/GPUCommon/typed-pointers.mlir @@ -8,7 +8,8 @@ module attributes {gpu.container_module} { %0 = gpu.wait async // CHECK: %[[gep:.*]] = llvm.getelementptr {{.*}}[%[[size]]] // CHECK: %[[size_bytes:.*]] = llvm.ptrtoint %[[gep]] - // CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[stream]]) + // CHECK: %[[isHostShared:.*]] = llvm.mlir.constant + // CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[stream]], %[[isHostShared]]) %1, %2 = gpu.alloc async [%0] (%size) : memref // CHECK: %[[float_ptr:.*]] = llvm.extractvalue {{.*}}[0] // CHECK: %[[void_ptr:.*]] = llvm.bitcast %[[float_ptr]] diff --git a/mlir/test/Dialect/GPU/Transforms/serialize-spirv.mlir b/mlir/test/Dialect/GPU/Transforms/serialize-spirv.mlir new file mode 100644 index 0000000000000..d70e18f3401d3 --- /dev/null +++ b/mlir/test/Dialect/GPU/Transforms/serialize-spirv.mlir @@ -0,0 +1,53 @@ +// RUN: mlir-opt -gpu-serialize-to-spirv %s | FileCheck %s +module attributes {gpu.container_module, spirv.target_env = #spirv.target_env<#spirv.vce, #spirv.resource_limits<>>} { + // CHECK: gpu.module @addt_kernel attributes {gpu.binary = + spirv.module @__spv__addt_kernel Physical64 OpenCL requires #spirv.vce { + spirv.GlobalVariable @__builtin_var_WorkgroupId__ built_in("WorkgroupId") : !spirv.ptr, Input> + spirv.func @addt_kernel(%arg0: !spirv.ptr, %arg1: !spirv.ptr, %arg2: !spirv.ptr) "None" attributes {spirv.entry_point_abi = #spirv.entry_point_abi<>, workgroup_attributions = 0 : i64} { + %cst5_i64 = spirv.Constant 5 : i64 + %__builtin_var_WorkgroupId___addr = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr, Input> + %0 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr : vector<3xi64> + %1 = spirv.CompositeExtract %0[0 : i32] : vector<3xi64> + %__builtin_var_WorkgroupId___addr_0 = spirv.mlir.addressof @__builtin_var_WorkgroupId__ : !spirv.ptr, Input> + %2 = spirv.Load "Input" %__builtin_var_WorkgroupId___addr_0 : vector<3xi64> + %3 = spirv.CompositeExtract %2[1 : i32] : vector<3xi64> + spirv.Branch ^bb1 + ^bb1: // pred: ^bb0 + %4 = spirv.IMul %1, %cst5_i64 : i64 + %5 = spirv.IAdd %4, %3 : i64 + %6 = spirv.InBoundsPtrAccessChain %arg0[%5] : !spirv.ptr, i64 + %7 = spirv.Load "CrossWorkgroup" %6 ["Aligned", 4] : f32 + %8 = spirv.IMul %1, %cst5_i64 : i64 + %9 = spirv.IAdd %8, %3 : i64 + %10 = spirv.InBoundsPtrAccessChain %arg1[%9] : !spirv.ptr, i64 + %11 = spirv.Load "CrossWorkgroup" %10 ["Aligned", 4] : f32 + %12 = spirv.FAdd %7, %11 : f32 + %13 = spirv.IMul %1, %cst5_i64 : i64 + %14 = spirv.IAdd %13, %3 : i64 + %15 = spirv.InBoundsPtrAccessChain %arg2[%14] : !spirv.ptr, i64 + spirv.Store "CrossWorkgroup" %15, %12 ["Aligned", 4] : f32 + spirv.Return + } + spirv.EntryPoint "Kernel" @addt_kernel, @__builtin_var_WorkgroupId__ + } + gpu.module @addt_kernel { + gpu.func @addt_kernel(%arg0: memref, %arg1: memref, %arg2: memref) kernel attributes {spirv.entry_point_abi = #spirv.entry_point_abi<>} { + %c5 = arith.constant 5 : index + %0 = gpu.block_id x + %1 = gpu.block_id y + cf.br ^bb1 + ^bb1: // pred: ^bb0 + %2 = arith.muli %0, %c5 : index + %3 = arith.addi %2, %1 : index + %4 = memref.load %arg0[%3] : memref + %5 = arith.muli %0, %c5 : index + %6 = arith.addi %5, %1 : index + %7 = memref.load %arg1[%6] : memref + %8 = arith.addf %4, %7 : f32 + %9 = arith.muli %0, %c5 : index + %10 = arith.addi %9, %1 : index + memref.store %8, %arg2[%10] : memref + gpu.return + } + } +} diff --git a/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir b/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir new file mode 100644 index 0000000000000..36d132d0c94d3 --- /dev/null +++ b/mlir/test/Integration/GPU/SYCL/gpu-to-spirv.mlir @@ -0,0 +1,58 @@ +// RUN: mlir-opt %s -pass-pipeline='builtin.module(convert-gpu-to-spirv{use-64bit-index=true use-opencl=true},spirv.module(spirv-lower-abi-attrs,spirv-update-vce),func.func(llvm-request-c-wrappers),gpu-serialize-to-spirv,convert-scf-to-cf,convert-cf-to-llvm,convert-arith-to-llvm,convert-math-to-llvm,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \ +// RUN: | mlir-cpu-runner \ +// RUN: --shared-libs=%mlir_sycl_runtime \ +// RUN: --shared-libs=%mlir_runner_utils \ +// RUN: --entry-point-result=void \ +// RUN: | FileCheck %s + +module @add attributes {gpu.container_module} { + memref.global "private" constant @__constant_3x3xi64_0 : memref<3x3xi64> = dense<[[1, 4098, 3], [16777220, 5, 4294967302], [7, 1099511627784, 9]]> + memref.global "private" constant @__constant_3x3xi64 : memref<3x3xi64> = dense<[[1, 2, 3], [4, 5, 4102], [16777223, 4294967304, 1099511627785]]> + func.func @main() { + %0 = memref.get_global @__constant_3x3xi64 : memref<3x3xi64> + %1 = memref.get_global @__constant_3x3xi64_0 : memref<3x3xi64> + %2 = call @test(%0, %1) : (memref<3x3xi64>, memref<3x3xi64>) -> memref<3x3xi64> + %cast = memref.cast %2 : memref<3x3xi64> to memref<*xi64> + call @printMemrefI64(%cast) : (memref<*xi64>) -> () + return + } + func.func private @printMemrefI64(memref<*xi64>) + func.func @test(%arg0: memref<3x3xi64>, %arg1: memref<3x3xi64>) -> memref<3x3xi64> { + %c3 = arith.constant 3 : index + %c1 = arith.constant 1 : index + %0 = gpu.wait async + %memref, %asyncToken = gpu.alloc async [%0] host_shared (): memref<3x3xi64> + gpu.wait [%asyncToken] + memref.copy %arg1, %memref : memref<3x3xi64> to memref<3x3xi64> + %1 = gpu.wait async + %memref_0, %asyncToken_1 = gpu.alloc async [%1] host_shared () : memref<3x3xi64> + gpu.wait [%asyncToken_1] + memref.copy %arg0, %memref_0 : memref<3x3xi64> to memref<3x3xi64> + %2 = gpu.wait async + %memref_2, %asyncToken_3 = gpu.alloc async [%2] host_shared () : memref<3x3xi64> + %3 = gpu.launch_func async [%asyncToken_3] @test_kernel::@test_kernel blocks in (%c3, %c3, %c1) threads in (%c1, %c1, %c1) args(%memref_0 : memref<3x3xi64>, %memref : memref<3x3xi64>, %memref_2 : memref<3x3xi64>) + gpu.wait [%3] + %alloc = memref.alloc() : memref<3x3xi64> + memref.copy %memref_2, %alloc : memref<3x3xi64> to memref<3x3xi64> + %4 = gpu.wait async + %5 = gpu.dealloc async [%4] %memref_2 : memref<3x3xi64> + %6 = gpu.dealloc async [%5] %memref_0 : memref<3x3xi64> + %7 = gpu.dealloc async [%6] %memref : memref<3x3xi64> + gpu.wait [%7] + return %alloc : memref<3x3xi64> + } + gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { + gpu.func @test_kernel(%arg0: memref<3x3xi64>, %arg1: memref<3x3xi64>, %arg2: memref<3x3xi64>) kernel attributes {gpu.known_block_size = array, gpu.known_grid_size = array, spirv.entry_point_abi = #spirv.entry_point_abi<>} { + %0 = gpu.block_id x + %1 = gpu.block_id y + %2 = memref.load %arg0[%0, %1] : memref<3x3xi64> + %3 = memref.load %arg1[%0, %1] : memref<3x3xi64> + %4 = arith.addi %2, %3 : i64 + memref.store %4, %arg2[%0, %1] : memref<3x3xi64> + gpu.return + } + } + // CHECK: [2, 4100, 6], + // CHECK: [16777224, 10, 4294971404], + // CHECK: [16777230, 1103806595088, 1099511627794] +} diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py index f265ac794c6f6..5d3a4dc575a7b 100644 --- a/mlir/test/lit.cfg.py +++ b/mlir/test/lit.cfg.py @@ -119,6 +119,9 @@ def add_runtime(name): if config.enable_cuda_runner: tools.extend([add_runtime("mlir_cuda_runtime")]) +if config.enable_sycl_runner: + tools.extend([add_runtime("mlir_sycl_runtime")]) + # The following tools are optional tools.extend( [ diff --git a/mlir/test/lit.site.cfg.py.in b/mlir/test/lit.site.cfg.py.in index ef1fdbc0cba07..897c12f3abcac 100644 --- a/mlir/test/lit.site.cfg.py.in +++ b/mlir/test/lit.site.cfg.py.in @@ -30,6 +30,7 @@ config.enable_cuda_runner = @MLIR_ENABLE_CUDA_RUNNER@ config.run_rocm_tests = @MLIR_ENABLE_ROCM_CONVERSIONS@ config.enable_rocm_runner = @MLIR_ENABLE_ROCM_RUNNER@ config.rocm_test_chipset = "@ROCM_TEST_CHIPSET@" +config.enable_sycl_runner = @MLIR_ENABLE_SYCL_RUNNER@ config.enable_spirv_cpu_runner = @MLIR_ENABLE_SPIRV_CPU_RUNNER@ config.enable_vulkan_runner = @MLIR_ENABLE_VULKAN_RUNNER@ config.enable_bindings_python = @MLIR_ENABLE_BINDINGS_PYTHON@