diff --git a/README.md b/README.md
index 21ad58965..958878fae 100644
--- a/README.md
+++ b/README.md
@@ -58,7 +58,15 @@ cmake --build . --target gc-check
 Notes:
  * `/PATH/TO/llvm-project/llvm-install` should be the install path of LLVM. If you installed LLVM elsewhere by `-DCMAKE_INSTALL_PREFIX` option when building LLVM, you need to change the path in `-DMLIR_DIR` accordingly.
  * The cmake option `-DLLVM_EXTERNAL_LIT` is for the tests of this project. It requires the `lit` tool to be installed in the system. You can install it via `pip install lit`. If you don't need to run the tests of this repo, you can omit this option in the command line.
- * If GPU components are on (`-DGC_USE_GPU=ON`), make sure the Level-zero runtime is installed in your system. Either install Level-zero runtime via system package managers (e.g. `apt`), or follow the instructions of [IMEX](https://github.com/intel/mlir-extensions).
+
+More notes if GPU components are on (`-DGC_USE_GPU=ON`):
+ * Make sure the OpenCL runtime is installed on your system. You can either
+   install it via the OS-provided package (on Ubuntu 22.04):
+```sh
+sudo apt install -y intel-opencl-icd opencl-c-headers
+```
+   or download and install the package from: https://github.com/intel/compute-runtime/releases
+ * The LLVM codebase needs to be patched to support XeGPU lowering (from IMEX). Please follow the instructions of [IMEX](https://github.com/intel/mlir-extensions) on patching LLVM.
 
 Graph Compiler supports the following build-time options.
diff --git a/cmake/imex.cmake b/cmake/imex.cmake
index 0a21d0181..8ed61e3b3 100644
--- a/cmake/imex.cmake
+++ b/cmake/imex.cmake
@@ -4,11 +4,11 @@ get_property(IMEX_INCLUDES GLOBAL PROPERTY IMEX_INCLUDES)
 if (NOT DEFINED IMEX_INCLUDES)
     include(functions)
     set(IMEX_CHECK_LLVM_VERSION ON)
-    set(IMEX_ENABLE_L0_RUNTIME 1)
+    set(IMEX_ENABLE_L0_RUNTIME 0)
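+    # The OpenCL runtime wrappers under lib/gc/ExecutionEngine/OpenCLRuntime
+    # now provide the GPU runtime, so IMEX's Level-Zero runtime is disabled.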
     # TODO: Change to main https://github.com/oneapi-src/oneDNN.git when all the
     # required functionality is merged.
     gc_fetch_content(imex 496b240093b5e132b60c5ee69878300fe69be300 https://github.com/Menooker/mlir-extensions
-        CMAKE_ARGS "-DMLIR_DIR=${MLIR_DIR};-DIMEX_CHECK_LLVM_VERSION=ON;-DIMEX_ENABLE_L0_RUNTIME=1"
+        CMAKE_ARGS "-DMLIR_DIR=${MLIR_DIR};-DIMEX_CHECK_LLVM_VERSION=ON;-DIMEX_ENABLE_L0_RUNTIME=0"
     )
 
     set(IMEX_INCLUDES
diff --git a/include/gc/Transforms/Passes.td b/include/gc/Transforms/Passes.td
index fc727abb3..79a62f028 100644
--- a/include/gc/Transforms/Passes.td
+++ b/include/gc/Transforms/Passes.td
@@ -32,7 +32,7 @@ def ConvertOneDNNGraphToLinalg : Pass<"convert-onednn-graph-to-linalg"> {
   ];
 }
 
-
+#ifdef GC_USE_GPU
 def LinalgToXeGPU : Pass<"linalg-to-xegpu", "func::FuncOp"> {
   let summary = "Convert linalg dialect to XeGPU dialect.";
   let description = [{
@@ -57,5 +57,6 @@ def LinalgToXeGPU : Pass<"linalg-to-xegpu", "func::FuncOp"> {
                            "DPAS register block sizes MxNxK">,
   ];
 }
+#endif
 
 #endif // GC_DIALECT_GC_PASSES
diff --git a/lib/gc/CAPI/CMakeLists.txt b/lib/gc/CAPI/CMakeLists.txt
index 1d2e7687e..f7738c064 100644
--- a/lib/gc/CAPI/CMakeLists.txt
+++ b/lib/gc/CAPI/CMakeLists.txt
@@ -1,10 +1,16 @@
+set(GC_ALL_LIBS
+  MLIROneDNNGraph
+  MLIRCPURuntimeDialect
+  GCPasses
+  MLIRCPURuntimeTransforms)
+
+if(GC_USE_GPU)
+  list(APPEND GC_ALL_LIBS GCGPUPasses)
+endif()
+
 add_mlir_public_c_api_library(GcCAPI
   Dialects.cpp
   Passes.cpp
   LINK_LIBS PUBLIC
-  MLIROneDNNGraph
-  MLIRCPURuntimeDialect
-  GCPasses
-  GCGPUPasses
-  MLIRCPURuntimeTransforms
+  ${GC_ALL_LIBS}
 )
\ No newline at end of file
diff --git a/lib/gc/ExecutionEngine/CMakeLists.txt b/lib/gc/ExecutionEngine/CMakeLists.txt
index ae0c1c8df..d5279b044 100644
--- a/lib/gc/ExecutionEngine/CMakeLists.txt
+++ b/lib/gc/ExecutionEngine/CMakeLists.txt
@@ -1,2 +1,5 @@
 add_subdirectory(CPURuntime)
-add_subdirectory(Driver)
\ No newline at end of file
+add_subdirectory(Driver)
+if(GC_USE_GPU)
+  add_subdirectory(OpenCLRuntime)
+endif()
\ No newline at end of file
diff --git a/lib/gc/ExecutionEngine/Driver/CMakeLists.txt b/lib/gc/ExecutionEngine/Driver/CMakeLists.txt
index d04dbbb4e..8742ef6e9 100644
--- a/lib/gc/ExecutionEngine/Driver/CMakeLists.txt
+++ b/lib/gc/ExecutionEngine/Driver/CMakeLists.txt
@@ -26,6 +26,11 @@ else()
   )
 endif()
 
+set(GC_PASSES GCPasses)
+if(GC_USE_GPU)
+  list(APPEND GC_PASSES GCGPUPasses)
+endif()
+
 add_mlir_library(GCJitWrapper
   Driver.cpp
 
@@ -35,8 +40,7 @@ add_mlir_library(GCJitWrapper
   LINK_LIBS PUBLIC
     ${MLIR_LINK_COMPONENTS}
     ${dialect_libs}
-    ${conversion_libs}
-    GCPasses
-    GCGPUPasses
+    ${conversion_libs}
+    ${GC_PASSES}
   )
 
diff --git a/lib/gc/ExecutionEngine/OpenCLRuntime/CMakeLists.txt b/lib/gc/ExecutionEngine/OpenCLRuntime/CMakeLists.txt
new file mode 100644
index 000000000..62163496f
--- /dev/null
+++ b/lib/gc/ExecutionEngine/OpenCLRuntime/CMakeLists.txt
@@ -0,0 +1,23 @@
+find_package(OpenCL REQUIRED)
+
+add_mlir_library(mlir_opencl_runtime
+    SHARED
+    OpenCLRuntimeWrappers.cpp
+
+    EXCLUDE_FROM_LIBMLIR
+  )
+
+include(CheckCXXCompilerFlag)
+check_cxx_compiler_flag("-frtti" CXX_HAS_FRTTI_FLAG)
+if(NOT CXX_HAS_FRTTI_FLAG)
+  message(FATAL_ERROR "CXX compiler does not accept flag -frtti")
+endif()
+target_compile_options(mlir_opencl_runtime PUBLIC -fexceptions -frtti)
+
+target_include_directories(mlir_opencl_runtime PRIVATE
+    ${MLIR_INCLUDE_DIRS}
+    ${OpenCL_INCLUDE_DIRS}
+  )
+
+message(STATUS "OpenCL Libraries: ${OpenCL_LIBRARIES}")
+target_link_libraries(mlir_opencl_runtime PUBLIC ${OpenCL_LIBRARIES})
diff --git a/lib/gc/ExecutionEngine/OpenCLRuntime/OpenCLRuntimeWrappers.cpp b/lib/gc/ExecutionEngine/OpenCLRuntime/OpenCLRuntimeWrappers.cpp
new file mode 100644
index 000000000..b78ca2233
--- /dev/null
+++ b/lib/gc/ExecutionEngine/OpenCLRuntime/OpenCLRuntimeWrappers.cpp
@@ -0,0 +1,468 @@
+//===-- OpenCLRuntimeWrappers.cpp - OpenCLRuntimeWrappers -------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#define CL_TARGET_OPENCL_VERSION 300
+#include <CL/cl.h>
+#include <CL/cl_ext.h>
+#include <cassert>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <stdexcept>
+#include <utility>
+#include <vector>
+
+#ifdef _WIN32
+#define OCL_RUNTIME_EXPORT __declspec(dllexport)
+#else
+#define OCL_RUNTIME_EXPORT
+#endif // _WIN32
+
+namespace {
+
+#define CL_SAFE_CALL2(a)                                                      \
+  do {                                                                        \
+    (a);                                                                      \
+    if (err != CL_SUCCESS) {                                                  \
+      fprintf(stderr, "FAIL: err=%d @ line=%d (%s)\n", err, __LINE__, (#a));  \
+      abort();                                                                \
+    }                                                                         \
+  } while (0)
+
+#define CL_SAFE_CALL(call)                                                    \
+  {                                                                           \
+    auto status = (call);                                                     \
+    if (status != CL_SUCCESS) {                                               \
+      fprintf(stderr, "CL error %d @ line=%d (%s)\n", status, __LINE__,       \
+              (#call));                                                       \
+      abort();                                                                \
+    }                                                                         \
+  }
+
+constexpr char DeviceMemAllocName[] = "clDeviceMemAllocINTEL";
+constexpr char SharedMemAllocName[] = "clSharedMemAllocINTEL";
+constexpr char MemBlockingFreeName[] = "clMemBlockingFreeINTEL";
+constexpr char SetKernelArgMemPointerName[] = "clSetKernelArgMemPointerINTEL";
+static constexpr char EnqueueMemcpyName[] = "clEnqueueMemcpyINTEL";
+
+void *queryCLExtFunc(cl_platform_id CurPlatform, const char *FuncName) {
+  void *ret = clGetExtensionFunctionAddressForPlatform(CurPlatform, FuncName);
+  if (!ret) {
+    fprintf(stderr, "Failed to resolve OpenCL extension function: %s\n",
+            FuncName);
+    fflush(stderr);
+    abort();
+  }
+  return ret;
+}
+
+void *queryCLExtFunc(cl_device_id dev, const char *FuncName) {
+  cl_platform_id CurPlatform;
+  CL_SAFE_CALL(clGetDeviceInfo(dev, CL_DEVICE_PLATFORM, sizeof(cl_platform_id),
+                               &CurPlatform, nullptr));
+  return queryCLExtFunc(CurPlatform, FuncName);
+}
+
+struct CLExtTable {
+  clDeviceMemAllocINTEL_fn allocDev;
+  clSharedMemAllocINTEL_fn allocShared;
+  clMemBlockingFreeINTEL_fn blockingFree;
+  clSetKernelArgMemPointerINTEL_fn setKernelArgMemPtr;
+  clEnqueueMemcpyINTEL_fn enqueueMemcpy;
+  CLExtTable() = default;
+  CLExtTable(cl_platform_id plat) {
+    allocDev =
+        (clDeviceMemAllocINTEL_fn)queryCLExtFunc(plat, DeviceMemAllocName);
+    allocShared =
+        (clSharedMemAllocINTEL_fn)queryCLExtFunc(plat, SharedMemAllocName);
+    blockingFree =
+        (clMemBlockingFreeINTEL_fn)queryCLExtFunc(plat, MemBlockingFreeName);
+    setKernelArgMemPtr = (clSetKernelArgMemPointerINTEL_fn)queryCLExtFunc(
+        plat, SetKernelArgMemPointerName);
+    enqueueMemcpy =
+        (clEnqueueMemcpyINTEL_fn)queryCLExtFunc(plat, EnqueueMemcpyName);
+  }
+};
+
+struct CLExtTableCache {
+  cl_platform_id platform;
+  CLExtTable table;
+  CLExtTableCache(cl_platform_id CurPlatform)
+      : platform{CurPlatform}, table{CurPlatform} {}
+  static CLExtTable *get(cl_device_id dev) {
+    cl_platform_id CurPlatform;
+    CL_SAFE_CALL(clGetDeviceInfo(dev, CL_DEVICE_PLATFORM,
+                                 sizeof(cl_platform_id), &CurPlatform,
+                                 nullptr));
+    static CLExtTableCache v{CurPlatform};
+    if (v.platform == CurPlatform) {
+      return &v.table;
+    }
+    return nullptr;
+  }
+};
+
+struct ParamDesc {
+  void *data;
+  size_t size;
+
+  bool operator==(const ParamDesc &rhs) const {
+    return data == rhs.data && size == rhs.size;
+  }
+
+  bool operator!=(const ParamDesc &rhs) const { return !(*this == rhs); }
+};
+
+template <typename T> size_t countUntil(T *ptr, T &&elem) {
+  assert(ptr);
+  auto curr = ptr;
+  while (*curr != elem) {
+    ++curr;
+  }
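+  // `curr` now points at the sentinel; its distance from `ptr` is the number
+  // of elements preceding it.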
+  return static_cast<size_t>(curr - ptr);
+}
+} // namespace
+
+static cl_device_id getDevice(cl_device_type *devtype) {
+  cl_platform_id platform; // OpenCL platform
+  cl_device_id device;     // device ID
+  CL_SAFE_CALL(clGetPlatformIDs(1, &platform, NULL));
+  CL_SAFE_CALL(clGetDeviceIDs(platform, *devtype, 1, &device, NULL));
+  return device;
+}
+
+struct GPUCLQUEUE {
+  cl_device_id device_ = nullptr;
+  cl_context context_ = nullptr;
+  cl_command_queue queue_ = nullptr;
+  bool context_owned_ = false;
+  bool queue_owned_ = false;
+  CLExtTable *ext_table_ = nullptr;
+  std::vector<cl_program> programs_;
+  std::vector<cl_kernel> kernels_;
+
+  GPUCLQUEUE(cl_device_type *device, cl_context context,
+             cl_command_queue queue) {
+    cl_device_type defaultdev = CL_DEVICE_TYPE_GPU;
+    if (!device) {
+      device = &defaultdev;
+    }
+    device_ = getDevice(device);
+    init_context(context, queue, device_);
+    ext_table_ = CLExtTableCache::get(device_);
+  }
+  GPUCLQUEUE(cl_device_id device, cl_context context, cl_command_queue queue) {
+    if (!device) {
+      cl_device_type defaultdev = CL_DEVICE_TYPE_GPU;
+      device = getDevice(&defaultdev);
+    }
+    device_ = device;
+    init_context(context, queue, device_);
+    ext_table_ = CLExtTableCache::get(device_);
+  }
+  ~GPUCLQUEUE() {
+    for (auto p : kernels_) {
+      clReleaseKernel(p);
+    }
+    for (auto p : programs_) {
+      clReleaseProgram(p);
+    }
+    if (queue_ && queue_owned_)
+      clReleaseCommandQueue(queue_);
+    if (context_ && context_owned_)
+      clReleaseContext(context_);
+  }
+
+private:
+  void init_context(cl_context context, cl_command_queue queue,
+                    cl_device_id device) {
+    if (queue) {
+      if (!context) {
+        throw std::runtime_error(
+            "Cannot create QUEUE wrapper with queue and without context");
+      }
+      // Both handles are borrowed from the caller; the destructor must not
+      // release them.
+      queue_ = queue;
+      queue_owned_ = false;
+      context_ = context;
+      context_owned_ = false;
+      return;
+    }
+    cl_int err;
+    if (!context) {
+      CL_SAFE_CALL2(context_ =
+                        clCreateContext(NULL, 1, &device, NULL, NULL, &err));
+      context_owned_ = true;
+    } else {
+      context_ = context;
+    }
+    CL_SAFE_CALL2(queue_ = clCreateCommandQueueWithProperties(context_, device,
+                                                              0, &err));
+    queue_owned_ = true;
+  }
+}; // end of GPUCLQUEUE
+
+static void *allocDeviceMemory(GPUCLQUEUE *queue, size_t size,
+                               size_t alignment, bool isShared) {
+  void *memPtr = nullptr;
+  cl_int err;
+  if (isShared) {
+    auto func = queue->ext_table_ ? queue->ext_table_->allocShared
+                                  : (clSharedMemAllocINTEL_fn)queryCLExtFunc(
+                                        queue->device_, SharedMemAllocName);
+    CL_SAFE_CALL2(memPtr = func(queue->context_, queue->device_, nullptr, size,
+                                alignment, &err));
+  } else {
+    auto func = queue->ext_table_ ? queue->ext_table_->allocDev
+                                  : (clDeviceMemAllocINTEL_fn)queryCLExtFunc(
+                                        queue->device_, DeviceMemAllocName);
+    CL_SAFE_CALL2(memPtr = func(queue->context_, queue->device_, nullptr, size,
+                                alignment, &err));
+  }
+  return memPtr;
+}
+
+static void deallocDeviceMemory(GPUCLQUEUE *queue, void *ptr) {
+  auto func = queue->ext_table_
+                  ? queue->ext_table_->blockingFree
+                  : (clMemBlockingFreeINTEL_fn)queryCLExtFunc(
+                        queue->device_, MemBlockingFreeName);
+  CL_SAFE_CALL(func(queue->context_, ptr));
+}
+
+static cl_program loadModule(GPUCLQUEUE *queue, const unsigned char *data,
+                             size_t dataSize, bool takeOwnership) {
+  assert(data);
+  cl_int errNum = 0;
+  const unsigned char *codes[1] = {data};
+  size_t sizes[1] = {dataSize};
+  cl_program program;
+  cl_int err;
+  // The 6th argument receives the per-binary status; the last one is the
+  // error code, which is what CL_SAFE_CALL2 checks.
+  CL_SAFE_CALL2(program = clCreateProgramWithBinary(queue->context_, 1,
+                                                    &queue->device_, sizes,
+                                                    codes, &errNum, &err));
+  const char *build_flags = "-cl-kernel-arg-info -x spir";
+  // enable large register file if needed
+  if (getenv("IMEX_ENABLE_LARGE_REG_FILE")) {
+    build_flags =
+        "-vc-codegen -doubleGRF -Xfinalizer -noLocalSplit -Xfinalizer "
+        "-DPASTokenReduction -Xfinalizer -SWSBDepReduction -Xfinalizer "
+        "'-printregusage -enableBCR' -cl-kernel-arg-info -x spir";
+  }
+  CL_SAFE_CALL(clBuildProgram(program, 0, NULL, build_flags, NULL, NULL));
+  if (takeOwnership)
+    queue->programs_.push_back(program);
+  return program;
+}
+
+static cl_kernel getKernel(GPUCLQUEUE *queue, cl_program program,
+                           const char *name) {
+  cl_kernel kernel;
+  cl_int err;
+  CL_SAFE_CALL2(kernel = clCreateKernel(program, name, &err));
+  cl_bool TrueVal = CL_TRUE;
+  CL_SAFE_CALL(clSetKernelExecInfo(
+      kernel, CL_KERNEL_EXEC_INFO_INDIRECT_HOST_ACCESS_INTEL, sizeof(cl_bool),
+      &TrueVal));
+  CL_SAFE_CALL(clSetKernelExecInfo(
+      kernel, CL_KERNEL_EXEC_INFO_INDIRECT_DEVICE_ACCESS_INTEL,
+      sizeof(cl_bool), &TrueVal));
+  CL_SAFE_CALL(clSetKernelExecInfo(
+      kernel, CL_KERNEL_EXEC_INFO_INDIRECT_SHARED_ACCESS_INTEL,
+      sizeof(cl_bool), &TrueVal));
+  queue->kernels_.push_back(kernel);
+  return kernel;
+}
+
+template <typename NumArgsFuncT, typename GetParamFuncT>
+static void launchKernel(GPUCLQUEUE *queue, cl_kernel kernel, size_t gridX,
+                         size_t gridY, size_t gridZ, size_t blockX,
+                         size_t blockY, size_t blockZ, size_t sharedMemBytes,
+                         NumArgsFuncT &&fnGetNumArgs,
+                         GetParamFuncT &&fnGetParamFunc) {
+  auto clSetKernelArgMemPointerINTEL =
+      queue->ext_table_
+          ? queue->ext_table_->setKernelArgMemPtr
+          : (clSetKernelArgMemPointerINTEL_fn)queryCLExtFunc(
+                queue->device_, SetKernelArgMemPointerName);
+  auto paramsCount = fnGetNumArgs();
+  for (size_t i = 0; i < paramsCount; i++) {
+    cl_kernel_arg_address_qualifier name;
+    size_t nameSize = sizeof(name);
+    // we could do better here by caching the arg info for the kernel
+    CL_SAFE_CALL(clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_ADDRESS_QUALIFIER,
+                                    sizeof(name), &name, &nameSize));
+    auto [paramData, paramSize] = fnGetParamFunc(i);
+    if (paramSize == sizeof(void *) && name == CL_KERNEL_ARG_ADDRESS_GLOBAL) {
+      // pass the value of the pointer instead of a pointer to the pointer
+      CL_SAFE_CALL(
+          clSetKernelArgMemPointerINTEL(kernel, i, *(void **)paramData));
+    } else {
+      CL_SAFE_CALL(clSetKernelArg(kernel, i, paramSize, paramData));
+    }
+  }
+  if (sharedMemBytes) {
+    CL_SAFE_CALL(clSetKernelArg(kernel, paramsCount, sharedMemBytes, nullptr));
+  }
+  size_t globalSize[3] = {gridX * blockX, gridY * blockY, gridZ * blockZ};
+  size_t localSize[3] = {blockX, blockY, blockZ};
+  CL_SAFE_CALL(clEnqueueNDRangeKernel(queue->queue_, kernel, 3, NULL,
+                                      globalSize, localSize, 0, NULL, NULL));
+}
+
+static GPUCLQUEUE *getDefaultQueue() {
+  static GPUCLQUEUE defaultq(static_cast<cl_device_type *>(nullptr), nullptr,
+                             nullptr);
+  return &defaultq;
+}
+
+// Wrappers
+
+extern "C" OCL_RUNTIME_EXPORT GPUCLQUEUE *gpuCreateStream(void *device,
+                                                          void *context) {
+  // todo: this is a workaround for the issue of gpux generating multiple
+  // streams
+  if (!device && !context) {
+    return getDefaultQueue();
+  }
+  return new GPUCLQUEUE(reinterpret_cast<cl_device_id>(device),
+                        reinterpret_cast<cl_context>(context), nullptr);
+}
+
+extern "C" OCL_RUNTIME_EXPORT void gpuStreamDestroy(GPUCLQUEUE *queue) {
+  // todo: this is a workaround for the issue of gpux generating multiple
+  // streams; uncomment the line below to release the queue
+  // delete queue;
+}
+
+extern "C" OCL_RUNTIME_EXPORT void *
+gpuMemAlloc(GPUCLQUEUE *queue, size_t size, size_t alignment, bool isShared) {
+  if (queue) {
+    return allocDeviceMemory(queue, size, alignment, isShared);
+  }
+  return nullptr;
+}
+
+extern "C" OCL_RUNTIME_EXPORT void gpuMemFree(GPUCLQUEUE *queue, void *ptr) {
+  if (queue && ptr) {
+    deallocDeviceMemory(queue, ptr);
+  }
+}
+
+extern "C" OCL_RUNTIME_EXPORT cl_program
+gpuModuleLoad(GPUCLQUEUE *queue, const unsigned char *data, size_t dataSize) {
+  if (queue) {
+    return loadModule(queue, data, dataSize, false);
+  }
+  return nullptr;
+}
+
+extern "C" OCL_RUNTIME_EXPORT cl_kernel gpuKernelGet(GPUCLQUEUE *queue,
+                                                     cl_program module,
+                                                     const char *name) {
+  if (queue) {
+    return getKernel(queue, module, name);
+  }
+  return nullptr;
+}
+
+extern "C" OCL_RUNTIME_EXPORT void
+gpuLaunchKernel(GPUCLQUEUE *queue, cl_kernel kernel, size_t gridX, size_t gridY,
+                size_t gridZ, size_t blockX, size_t blockY, size_t blockZ,
+                size_t sharedMemBytes, void *params) {
+  if (queue) {
+    auto typedParams = static_cast<const ParamDesc *>(params);
+    launchKernel(
+        queue, kernel, gridX, gridY, gridZ, blockX, blockY, blockZ,
+        sharedMemBytes,
+        [&]() {
+          // The assumption is, if there is a param for the shared local
+          // memory, then that will always be the last argument.
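+          // The ParamDesc array is terminated by a {nullptr, 0} sentinel, so
+          // scanning up to the sentinel yields the argument count.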
+          auto paramsCount = countUntil(typedParams, ParamDesc{nullptr, 0});
+          if (sharedMemBytes) {
+            paramsCount = paramsCount - 1;
+          }
+          return paramsCount;
+        },
+        [&](size_t i) -> const ParamDesc & { return typedParams[i]; });
+  }
+}
+
+extern "C" OCL_RUNTIME_EXPORT void gpuWait(GPUCLQUEUE *queue) {
+  if (queue) {
+    CL_SAFE_CALL(clFinish(queue->queue_));
+  }
+}
+
+////////////////////////////////////////////////////////////////
+// Here starts the upstream OCL wrappers
+////////////////////////////////////////////////////////////////
+
+// a silly workaround for mgpuModuleLoad: OCL needs a context and device to
+// load a module, so we remember the queue of the last call to any mgpu* API
+static thread_local GPUCLQUEUE *lastQueue;
+extern "C" OCL_RUNTIME_EXPORT GPUCLQUEUE *mgpuStreamCreate() {
+  auto ret =
+      new GPUCLQUEUE(static_cast<cl_device_type *>(nullptr), nullptr, nullptr);
+  lastQueue = ret;
+  return ret;
+}
+
+extern "C" OCL_RUNTIME_EXPORT void mgpuStreamDestroy(GPUCLQUEUE *queue) {
+  lastQueue = nullptr;
+  delete queue;
+}
+
+extern "C" OCL_RUNTIME_EXPORT void *
+mgpuMemAlloc(uint64_t size, GPUCLQUEUE *queue, bool isShared) {
+  lastQueue = queue;
+  return allocDeviceMemory(queue, size, /*alignment*/ 64, isShared);
+}
+
+extern "C" OCL_RUNTIME_EXPORT void mgpuMemFree(void *ptr, GPUCLQUEUE *queue) {
+  lastQueue = queue;
+  if (ptr) {
+    deallocDeviceMemory(queue, ptr);
+  }
+}
+
+// mgpuModuleLoad and mgpuModuleGetFunction do not take a queue parameter, but
+// the OCL APIs require one. We implicitly use the queue of the last mgpu* API
+// call on the current thread for these functions. This is ugly and
+// error-prone; we might need another workaround.
+extern "C" OCL_RUNTIME_EXPORT cl_program mgpuModuleLoad(const void *data,
+                                                        size_t gpuBlobSize) {
+  return loadModule(lastQueue, (const unsigned char *)data, gpuBlobSize,
+                    false);
+}
+
+extern "C" OCL_RUNTIME_EXPORT cl_kernel
+mgpuModuleGetFunction(cl_program module, const char *name) {
+  // we need to push the kernel to lastQueue to avoid a cl_kernel resource
+  // leak
+  return getKernel(lastQueue, module, name);
+}
+
+extern "C" OCL_RUNTIME_EXPORT void mgpuModuleUnload(cl_program module) {
+  CL_SAFE_CALL(clReleaseProgram(module));
+}
+
+extern "C" OCL_RUNTIME_EXPORT void
+mgpuLaunchKernel(cl_kernel kernel, size_t gridX, size_t gridY, size_t gridZ,
+                 size_t blockX, size_t blockY, size_t blockZ,
+                 size_t sharedMemBytes, GPUCLQUEUE *queue, void **params,
+                 void ** /*extra*/, size_t paramsCount) {
+  launchKernel(
+      queue, kernel, gridX, gridY, gridZ, blockX, blockY, blockZ,
+      sharedMemBytes,
+      [&]() {
+        // todo (yijie): do we need to handle shared mem? If there is dynamic
+        // shared mem required, which value should paramsCount be?
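+        // Unlike gpuLaunchKernel above, paramsCount is supplied explicitly by
+        // the caller here, so no sentinel scan is needed.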
+        return paramsCount;
+      },
+      [&](size_t i) {
+        // todo (yijie): assuming all parameters are passed with pointer size
+        return std::make_pair(params[i], sizeof(void *));
+      });
+}
+
+extern "C" OCL_RUNTIME_EXPORT void mgpuStreamSynchronize(GPUCLQUEUE *queue) {
+  CL_SAFE_CALL(clFinish(queue->queue_));
+}
diff --git a/lib/gc/Transforms/CMakeLists.txt b/lib/gc/Transforms/CMakeLists.txt
index 1b4f2cb73..f60c8cec2 100644
--- a/lib/gc/Transforms/CMakeLists.txt
+++ b/lib/gc/Transforms/CMakeLists.txt
@@ -28,4 +28,6 @@ add_mlir_library(GCPasses
 )
 
 set_property(GLOBAL APPEND PROPERTY GC_PASS_LIBS GCPasses)
-add_subdirectory(GPU)
+if(GC_USE_GPU)
+  add_subdirectory(GPU)
+endif()
diff --git a/lib/gc/Transforms/GPU/LinalgToXeGPU.cpp b/lib/gc/Transforms/GPU/LinalgToXeGPU.cpp
index 7e9cf10d1..ea886631b 100644
--- a/lib/gc/Transforms/GPU/LinalgToXeGPU.cpp
+++ b/lib/gc/Transforms/GPU/LinalgToXeGPU.cpp
@@ -732,9 +732,9 @@ loadNdDescTiles(PatternRewriter &rewriter, Location loc, ValueRange loadTiles,
   VectorType vecLoadType =
       VectorType::get(tileType.getShape(), tileType.getElementType());
-  IntegerAttr vnniAxisAttr = nullptr;
+  UnitAttr vnniAxisAttr = nullptr;
   if (vnniConf) {
-    vnniAxisAttr = IntegerAttr::get(rewriter.getI64Type(), vnniConf->vnniAxis);
+    vnniAxisAttr = UnitAttr::get(rewriter.getContext());
     vecLoadType = getVnniVector(tileType.getShape(), tileType.getElementType(),
                                 *vnniConf);
   }
@@ -742,7 +742,7 @@ loadNdDescTiles(PatternRewriter &rewriter, Location loc, ValueRange loadTiles,
   SmallVector<Value> loadVec;
   for (auto tile : loadTiles) {
     auto loadOp = rewriter.create<xegpu::LoadNdOp>(
-        loc, vecLoadType, tile, vnniAxisAttr, transpose,
+        loc, vecLoadType, tile, vnniAxisAttr, transpose, nullptr,
         /*l1_hint=*/hint,
         /*l2_hint=*/hint, /*l3_hint=*/hint);
     loadVec.push_back(loadOp);
diff --git a/src/gc-opt/CMakeLists.txt b/src/gc-opt/CMakeLists.txt
index af2125870..f127653de 100644
--- a/src/gc-opt/CMakeLists.txt
+++ b/src/gc-opt/CMakeLists.txt
@@ -33,15 +33,15 @@ set(gc_opt_libs
         ${dialect_libs}
         ${conversion_libs}
         ${MLIR_LINK_COMPONENTS}
-        GCPasses
-        GCGPUPasses)
+        GCPasses)
 if(GC_USE_GPU)
     add_definitions(-DGC_USE_GPU=1)
     get_property(IMEX_INCLUDES GLOBAL PROPERTY IMEX_INCLUDES)
     include_directories(${IMEX_INCLUDES})
     list(APPEND gc_opt_libs IMEXGPUXDialect IMEXXeTileDialect IMEXRegionDialect IMEXRegionTransforms
-        IMEXTransforms IMEXGPUToGPUX IMEXGPUToSPIRV IMEXGPUXToLLVM IMEXXeGPUToVC IMEXXeTileToXeGPU IMEXUtil)
+        IMEXTransforms IMEXGPUToGPUX IMEXGPUToSPIRV IMEXGPUXToLLVM IMEXXeGPUToVC IMEXXeTileToXeGPU IMEXUtil
+        GCGPUPasses)
 endif()
 if(GC_MLIR_CXX_FLAGS)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GC_MLIR_CXX_FLAGS}")
diff --git a/test/mlir/test/CMakeLists.txt b/test/mlir/test/CMakeLists.txt
index 2a93c91e5..7ac5a89c6 100644
--- a/test/mlir/test/CMakeLists.txt
+++ b/test/mlir/test/CMakeLists.txt
@@ -25,7 +25,7 @@ set(GC_OPT_TEST_DEPENDS
 
 if(GC_USE_GPU)
     include(imex)
-    list(APPEND GC_OPT_TEST_DEPENDS level-zero-runtime)
+    list(APPEND GC_OPT_TEST_DEPENDS mlir_opencl_runtime)
 endif()
 
 if(GC_ENABLE_BINDINGS_PYTHON)
diff --git a/test/mlir/test/gc/Transforms/GPU/lit.local.cfg b/test/mlir/test/gc/Transforms/GPU/lit.local.cfg
new file mode 100644
index 000000000..f086e9be8
--- /dev/null
+++ b/test/mlir/test/gc/Transforms/GPU/lit.local.cfg
@@ -0,0 +1,2 @@
+if not config.gc_use_gpu:
+    config.unsupported = True
\ No newline at end of file
diff --git a/test/mlir/test/gc/gpu-runner/mlp.mlir b/test/mlir/test/gc/gpu-runner/mlp.mlir
index 4fb88983c..29dad1c8a 100644
--- a/test/mlir/test/gc/gpu-runner/mlp.mlir
+++ b/test/mlir/test/gc/gpu-runner/mlp.mlir
@@ -1,4 +1,4 @@
-// RUN: gc-opt %s --pass-pipeline='builtin.module(convert-tensor-to-linalg,func.func(empty-tensor-to-alloc-tensor),one-shot-bufferize{unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries},func.func(convert-linalg-to-parallel-loops,imex-add-outer-parallel-loop,gpu-map-parallel-loops,convert-parallel-loops-to-gpu),func.func(insert-gpu-allocs{client-api=opencl}),canonicalize,normalize-memrefs,func.func(lower-affine),gpu-kernel-outlining,canonicalize,cse,set-spirv-capabilities{client-api=opencl},gpu.module(set-spirv-abi-attrs{client-api=opencl}),canonicalize,fold-memref-alias-ops,imex-convert-gpu-to-spirv,spirv.module(spirv-lower-abi-attrs,spirv-update-vce),func.func(llvm-request-c-wrappers),serialize-spirv,convert-gpu-to-gpux,convert-func-to-llvm,convert-math-to-llvm,convert-gpux-to-llvm,expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' | gc-cpu-runner -e main -entry-point-result=void --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%levelzero_runtime | FileCheck %s
+// RUN: gc-opt %s --pass-pipeline='builtin.module(convert-tensor-to-linalg,func.func(empty-tensor-to-alloc-tensor),one-shot-bufferize{unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries},func.func(convert-linalg-to-parallel-loops,imex-add-outer-parallel-loop,gpu-map-parallel-loops,convert-parallel-loops-to-gpu),func.func(insert-gpu-allocs{client-api=opencl}),canonicalize,normalize-memrefs,func.func(lower-affine),gpu-kernel-outlining,canonicalize,cse,set-spirv-capabilities{client-api=opencl},gpu.module(set-spirv-abi-attrs{client-api=opencl}),canonicalize,fold-memref-alias-ops,imex-convert-gpu-to-spirv,spirv.module(spirv-lower-abi-attrs,spirv-update-vce),func.func(llvm-request-c-wrappers),serialize-spirv,convert-gpu-to-gpux,convert-func-to-llvm,convert-math-to-llvm,convert-gpux-to-llvm,expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' | gc-cpu-runner -e main -entry-point-result=void --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime | FileCheck %s
 #map0 = affine_map<(d0, d1) -> (d1)>
 #map1 = affine_map<(d0, d1) -> (d0, d1)>
 #map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
diff --git a/test/mlir/test/lit.cfg.py b/test/mlir/test/lit.cfg.py
index 5cabfe8f7..d53e105e1 100644
--- a/test/mlir/test/lit.cfg.py
+++ b/test/mlir/test/lit.cfg.py
@@ -34,7 +34,7 @@
 config.substitutions.append(('%mlir_c_runner_utils', config.mlir_c_runner_utils))
 
 if config.gc_use_gpu:
-    config.substitutions.append(('%levelzero_runtime', config.levelzero_runtime))
+    config.substitutions.append(('%opencl_runtime', config.opencl_runtime))
 
 llvm_config.with_system_environment(["HOME", "INCLUDE", "LIB", "TMP", "TEMP"])
diff --git a/test/mlir/test/lit.site.cfg.py.in b/test/mlir/test/lit.site.cfg.py.in
index f44b45662..6bea6d13e 100644
--- a/test/mlir/test/lit.site.cfg.py.in
+++ b/test/mlir/test/lit.site.cfg.py.in
@@ -42,7 +42,7 @@ config.mlir_runner_utils_dir = "@MLIR_RUNNER_UTILS_DIR@"
 config.mlir_runner_utils = os.path.normpath(os.path.join(config.mlir_runner_utils_dir, config.shlib_prefix + "mlir_runner_utils" + config.llvm_shlib_ext))
 config.mlir_c_runner_utils = os.path.normpath(os.path.join(config.mlir_runner_utils_dir, config.shlib_prefix + "mlir_c_runner_utils" + config.llvm_shlib_ext))
-config.levelzero_runtime = os.path.normpath(os.path.join(config.gc_lib_dir, config.shlib_prefix + "level-zero-runtime" + config.llvm_shlib_ext))
"level-zero-runtime" + config.llvm_shlib_ext)) +config.opencl_runtime = os.path.normpath(os.path.join(config.gc_lib_dir, config.shlib_prefix + "mlir_opencl_runtime" + config.llvm_shlib_ext)) import lit.llvm lit.llvm.initialize(lit_config, config)