Skip to content

[GpuOclRuntime] Add DLTI attributes from the device info #406

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
177 changes: 149 additions & 28 deletions lib/gc/ExecutionEngine/GPURuntime/ocl/GpuOclRuntime.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,10 @@
#include "llvm/ExecutionEngine/Orc/LLJIT.h"
#include "llvm/Support/Error.h"

#include "mlir/Dialect/DLTI/DLTI.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Interfaces/DataLayoutInterfaces.h"
#include "mlir/Pass/PassManager.h"

namespace mlir::gc::gpu {
Expand Down Expand Up @@ -148,10 +150,12 @@ struct Kernel {
}

~Kernel() {
CL_CHECKR(clReleaseKernel(kernel), "Failed to release OpenCL kernel.");
gcLogD("Released OpenCL kernel: ", kernel);
CL_CHECKR(clReleaseProgram(program), "Failed to release OpenCL program.");
gcLogD("Released OpenCL program: ", program);
if (kernel != nullptr) {
CL_CHECKR(clReleaseKernel(kernel), "Failed to release OpenCL kernel.");
gcLogD("Released OpenCL kernel: ", kernel);
CL_CHECKR(clReleaseProgram(program), "Failed to release OpenCL program.");
gcLogD("Released OpenCL program: ", program);
}
}
};

Expand Down Expand Up @@ -220,7 +224,14 @@ struct OclRuntime::Exports {
gcLogD("The program has been built: ", program);

auto kernel = clCreateKernel(program, name, &err);
CL_CHECKR(err, "Failed to create OpenCL kernel from program: ", program);
if (err != CL_SUCCESS) {
// This is a special case, handled by OclModuleBuilder::build(), that
// allows rebuilding the kernel with different options in case of failure.
clReleaseProgram(program);
gcLogD("OpenCL error ", err,
": Failed to create OpenCL kernel from program: ", program);
return new Kernel(nullptr, nullptr, gridSize, blockSize, argNum, argSize);
}
gcLogD("Created new OpenCL kernel ", kernel, " from program ", program);

cl_bool enable = CL_TRUE;
Expand Down Expand Up @@ -639,8 +650,7 @@ void OclContext::setLastEvent(cl_event event) {
}
}

OclModule::~OclModule() {
assert(engine);
static void destroyKernels(const std::unique_ptr<ExecutionEngine> &engine) {
auto fn = engine->lookup(GPU_OCL_MOD_DESTRUCTOR);
if (fn) {
reinterpret_cast<void (*)()>(fn.get())();
Expand All @@ -649,13 +659,19 @@ OclModule::~OclModule() {
}
}

// Destructor: asserts that the execution engine is set, then hands it to
// destroyKernels(), which looks up GPU_OCL_MOD_DESTRUCTOR in the engine and
// calls it when the lookup succeeds.
OclModule::~OclModule() {
assert(engine);
destroyKernels(engine);
}

// If all arguments of 'origFunc' are memrefs with static shape, create a new
// function called gcGpuOclStaticMain, that accepts 2 arguments: a pointer to
// OclContext and a pointer to an array, containing pointers to aligned memory
// buffers. The function will call the original function with the context,
// buffers and the offset/shape/strides, statically created from the
// memref descriptor.
StringRef createStaticMain(ModuleOp &module, const StringRef &funcName,
StringRef createStaticMain(OpBuilder &builder, ModuleOp &module,
const StringRef &funcName,
const ArrayRef<Type> argTypes) {
auto mainFunc = module.lookupSymbol<LLVM::LLVMFuncOp>(funcName);
if (!mainFunc) {
Expand All @@ -670,11 +686,8 @@ StringRef createStaticMain(ModuleOp &module, const StringRef &funcName,
"' must have an least 3 arguments.");
}

auto ctx = module.getContext();
ctx->getOrLoadDialect<LLVM::LLVMDialect>();
OpBuilder builder(ctx);
auto i64Type = builder.getI64Type();
auto ptrType = LLVM::LLVMPointerType::get(ctx);
auto ptrType = LLVM::LLVMPointerType::get(builder.getContext());

if (mainArgTypes[nargs - 3] != ptrType ||
mainArgTypes[nargs - 2] != ptrType ||
Expand Down Expand Up @@ -722,7 +735,7 @@ StringRef createStaticMain(ModuleOp &module, const StringRef &funcName,
auto loc = mainFunc.getLoc();
auto newFuncType = LLVM::LLVMFunctionType::get(
mainFunc.getNumResults() ? mainFunc->getResult(0).getType()
: LLVM::LLVMVoidType::get(ctx),
: LLVM::LLVMVoidType::get(builder.getContext()),
{ptrType, ptrType});
auto newFunc =
OpBuilder::atBlockEnd(module.getBody())
Expand Down Expand Up @@ -848,17 +861,58 @@ OclModuleBuilder::build(cl_device_id device, cl_context context) {

llvm::Expected<std::shared_ptr<const OclModule>>
OclModuleBuilder::build(const OclRuntime::Ext &ext) {
auto mod = mlirModule.clone();
PassManager pm{mod.getContext()};
pipeline(pm);
CHECK(!pm.run(mod).failed(), "GPU pipeline failed!");
auto ctx = mlirModule.getContext();
ctx->getOrLoadDialect<DLTIDialect>();
ctx->getOrLoadDialect<LLVM::LLVMDialect>();
OpBuilder builder(ctx);
DataLayoutEntryInterface dltiAttrs[6];

auto staticMain = createStaticMain(mod, funcName, argTypes);
{
struct DevInfo {
cl_device_info key;
const char *attrName;
};
DevInfo devInfo[]{
{CL_DEVICE_MAX_COMPUTE_UNITS, "num_exec_units"},
{CL_DEVICE_NUM_EUS_PER_SUB_SLICE_INTEL, "num_exec_units_per_slice"},
{CL_DEVICE_NUM_THREADS_PER_EU_INTEL, "num_threads_per_eu"},
// Assuming the cache size is equal to the local mem
{CL_DEVICE_LOCAL_MEM_SIZE, "L1_cache_size_in_bytes"},
};

if (printIr) {
mod.dump();
}
unsigned i = 0;
for (auto &[key, attrName] : devInfo) {
int64_t value = 0;
CL_CHECK(
clGetDeviceInfo(ext.device, key, sizeof(cl_ulong), &value, nullptr),
"Failed to get the device property ", attrName);
gcLogD("Device property ", attrName, "=", value);
dltiAttrs[i++] =
DataLayoutEntryAttr::get(ctx, builder.getStringAttr(attrName),
builder.getI64IntegerAttr(value));
}

// There is no corresponding property in the OpenCL API, so a hardcoded
// value is used.
// TODO: Get the real value.
dltiAttrs[i] = DataLayoutEntryAttr::get(
ctx, builder.getStringAttr("max_vector_op_width"),
builder.getI64IntegerAttr(512));
}

OclRuntime rt(ext);
auto expectedQueue = rt.createQueue();
CHECKE(expectedQueue, "Failed to create queue!");
struct OclQueue {
cl_command_queue queue;
~OclQueue() { clReleaseCommandQueue(queue); }
} queue{*expectedQueue};
OclContext oclCtx{rt, queue.queue, false};

ModuleOp mod;
StringRef staticMain;
std::unique_ptr<ExecutionEngine> eng;
auto devStr = builder.getStringAttr("GPU" /* device ID*/);
ExecutionEngineOptions opts;
opts.jitCodeGenOptLevel = llvm::CodeGenOptLevel::Aggressive;
opts.enableObjectDump = enableObjectDump;
Expand All @@ -868,18 +922,86 @@ OclModuleBuilder::build(const OclRuntime::Ext &ext) {
opts.enablePerfNotificationListener = false;
#endif

auto eng = ExecutionEngine::create(mod, opts);
CHECKE(eng, "Failed to create ExecutionEngine!");
eng->get()->registerSymbols(OclRuntime::Exports::symbolMap);
// Build the module and check the kernels workgroup size. If the workgroup
// size is different, rebuild the module with the new size.
for (size_t wgSize = 64, maxSize = std::numeric_limits<size_t>::max();;) {
dltiAttrs[sizeof(dltiAttrs) / sizeof(DataLayoutEntryInterface) - 1] =
DataLayoutEntryAttr::get(
ctx, builder.getStringAttr("max_work_group_size"),
builder.getI64IntegerAttr(static_cast<int64_t>(wgSize)));
TargetDeviceSpecInterface devSpec =
TargetDeviceSpecAttr::get(ctx, dltiAttrs);
auto sysSpec =
TargetSystemSpecAttr::get(ctx, ArrayRef(std::pair(devStr, devSpec)));
mod = mlirModule.clone();
mod.getOperation()->setAttr("#dlti.sys_spec", sysSpec);
PassManager pm{ctx};
pipeline(pm);
CHECK(!pm.run(mod).failed(), "GPU pipeline failed!");
staticMain = createStaticMain(builder, mod, funcName, argTypes);
auto expectedEng = ExecutionEngine::create(mod, opts);
CHECKE(expectedEng, "Failed to create ExecutionEngine!");
expectedEng->get()->registerSymbols(OclRuntime::Exports::symbolMap);

// Find all kernels and query the workgroup size
size_t minSize = maxSize;
mod.walk<>([&](LLVM::LLVMFuncOp func) {
auto name = func.getName();
if (!name.starts_with("createGcGpuOclKernel_")) {
return WalkResult::skip();
}
auto fn = expectedEng.get()->lookup(name);
if (!fn) {
gcLogE("Function not found: ", name.data());
return WalkResult::skip();
}

Kernel *kernel =
reinterpret_cast<Kernel *(*)(OclContext *)>(fn.get())(&oclCtx);

if (kernel->kernel == nullptr) {
maxSize = wgSize / 2;
if (maxSize == 0) {
gcReportErr("Failed to build the kernel.");
}
minSize = maxSize;
return WalkResult::interrupt();
}

size_t s = 0;
auto err = clGetKernelWorkGroupInfo(kernel->kernel, ext.device,
CL_KERNEL_WORK_GROUP_SIZE,
sizeof(size_t), &s, nullptr);
if (err == CL_SUCCESS) {
minSize = std::min(minSize, s);
} else {
gcLogE("Failed to get the kernel workgroup size: ", err);
}
return WalkResult::skip();
});

if (minSize == wgSize || minSize == std::numeric_limits<size_t>::max()) {
eng = std::move(*expectedEng);
break;
}

destroyKernels(expectedEng.get());
gcLogD("Changing the workgroup size from ", wgSize, " to ", minSize);
wgSize = minSize;
}

if (printIr) {
mod.dump();
}

OclModule::MainFunc main = {nullptr};

if (staticMain.empty()) {
auto expect = eng.get()->lookupPacked(funcName);
auto expect = eng->lookupPacked(funcName);
CHECKE(expect, "Packed function '", funcName.begin(), "' not found!");
main.wrappedMain = *expect;
} else {
auto expect = eng.get()->lookup(staticMain);
auto expect = eng->lookup(staticMain);
CHECKE(expect, "Compiled function '", staticMain.begin(), "' not found!");
main.staticMain = reinterpret_cast<OclModule::StaticMainFunc>(*expect);
}
Expand All @@ -889,8 +1011,7 @@ OclModuleBuilder::build(const OclRuntime::Ext &ext) {
return it->second;
}
std::shared_ptr<const OclModule> ptr(
new OclModule(OclRuntime(ext), !staticMain.empty(), main, argTypes,
std::move(eng.get())));
new OclModule(rt, !staticMain.empty(), main, argTypes, std::move(eng)));
return cache.emplace(OclDevCtxPair(ext.device, ext.context), ptr)
.first->second;
}
Expand Down
3 changes: 1 addition & 2 deletions lib/gc/Transforms/GPU/GpuToGpuOcl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -381,8 +381,7 @@ struct ConvertLaunch final : ConvertOpPattern<gpu::LaunchFuncOp> {

auto function = rewriter.create<LLVM::LLVMFuncOp>(
loc, funcName,
LLVM::LLVMFunctionType::get(helper.ptrType, {helper.ptrType}),
LLVM::Linkage::Internal);
LLVM::LLVMFunctionType::get(helper.ptrType, {helper.ptrType}));
rewriter.setInsertionPointToStart(function.addEntryBlock(rewriter));

auto ptr = mod.lookupSymbol<LLVM::GlobalOp>(str("Ptr"));
Expand Down
2 changes: 1 addition & 1 deletion test/mlir/test/gc/Transforms/GPU/gpu-to-gpuocl.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ module @test attributes {gpu.container_module} {
// CHECK: llvm.mlir.global internal constant @gcGpuOclKernel_entry_kernel_Name
// CHECK: llvm.mlir.global internal @gcGpuOclKernel_entry_kernel_Ptr

// CHECK: llvm.func internal @createGcGpuOclKernel_entry_kernel([[CTX:%.+]]: !llvm.ptr) -> !llvm.ptr
// CHECK: llvm.func @createGcGpuOclKernel_entry_kernel([[CTX:%.+]]: !llvm.ptr) -> !llvm.ptr
// CHECK: [[NEW_PTR:%.+]] = llvm.call @gcGpuOclKernelCreate([[CTX]]
// CHECK: [[ZERO:%.+]] = llvm.mlir.zero
// CHECK: [[PTR_ADDR:%.+]] = llvm.mlir.addressof @gcGpuOclKernel_entry_kernel_Ptr
Expand Down