diff --git a/llvm/lib/SYCLLowerIR/LowerESIMD.cpp b/llvm/lib/SYCLLowerIR/LowerESIMD.cpp
index d3451c5f21231..a615f907c780c 100644
--- a/llvm/lib/SYCLLowerIR/LowerESIMD.cpp
+++ b/llvm/lib/SYCLLowerIR/LowerESIMD.cpp
@@ -79,7 +79,7 @@ namespace {
 // /^_Z(\d+)__esimd_\w+/
 static constexpr char ESIMD_INTRIN_PREF0[] = "_Z";
 static constexpr char ESIMD_INTRIN_PREF1[] = "__esimd_";
-static constexpr char SPIRV_INTRIN_PREF[] = "__spirv_";
+static constexpr char SPIRV_INTRIN_PREF[] = "__spirv_BuiltIn";
 
 static constexpr char GENX_KERNEL_METADATA[] = "genx.kernels";
 
@@ -778,108 +778,122 @@ static int getIndexForSuffix(StringRef Suff) {
       .Default(-1);
 }
 
-// Helper function to convert SPIRV intrinsic into GenX intrinsic,
-// that returns vector of coordinates.
-// Example:
-//   %call = call spir_func i64 @_Z23__spirv_WorkgroupSize_xv()
-//     =>
-//   %call.esimd = tail call <3 x i32> @llvm.genx.local.size.v3i32()
-//   %wgsize.x = extractelement <3 x i32> %call.esimd, i32 0
-//   %wgsize.x.cast.ty = zext i32 %wgsize.x to i64
-static Instruction *generateVectorGenXForSpirv(CallInst &CI, StringRef Suff,
+// Helper function to convert an extractelement instruction associated with a
+// load from a SPIRV builtin global into the GenX intrinsic that returns a
+// vector of coordinates. It also generates the required extractelement and
+// cast instructions. Example:
+//   %0 = load <3 x i64>, <3 x i64> addrspace(4)* addrspacecast
+//        (<3 x i64> addrspace(1)* @__spirv_BuiltInLocalInvocationId
+//        to <3 x i64> addrspace(4)*), align 32
+//   %1 = extractelement <3 x i64> %0, i64 0
+//
+//     =>
+//
+//   %.esimd = call <3 x i32> @llvm.genx.local.id.v3i32()
+//   %local_id.x = extractelement <3 x i32> %.esimd, i32 0
+//   %local_id.x.cast.ty = zext i32 %local_id.x to i64
+static Instruction *generateVectorGenXForSpirv(ExtractElementInst *EEI,
+                                               StringRef Suff,
                                                const std::string &IntrinName,
                                                StringRef ValueName) {
   std::string IntrName = std::string(GenXIntrinsic::getGenXIntrinsicPrefix()) +
                          IntrinName;
   auto ID = GenXIntrinsic::lookupGenXIntrinsicID(IntrName);
-  LLVMContext &Ctx = CI.getModule()->getContext();
+  LLVMContext &Ctx = EEI->getModule()->getContext();
   Type *I32Ty = Type::getInt32Ty(Ctx);
   Function *NewFDecl = GenXIntrinsic::getGenXDeclaration(
-      CI.getModule(), ID, {FixedVectorType::get(I32Ty, 3)});
+      EEI->getModule(), ID, {FixedVectorType::get(I32Ty, 3)});
   Instruction *IntrI =
-      IntrinsicInst::Create(NewFDecl, {}, CI.getName() + ".esimd", &CI);
+      IntrinsicInst::Create(NewFDecl, {}, EEI->getName() + ".esimd", EEI);
   int ExtractIndex = getIndexForSuffix(Suff);
   assert(ExtractIndex != -1 && "Extract index is invalid.");
   Twine ExtractName = ValueName + Suff;
+
   Instruction *ExtrI = ExtractElementInst::Create(
-      IntrI, ConstantInt::get(I32Ty, ExtractIndex), ExtractName, &CI);
-  Instruction *CastI = addCastInstIfNeeded(&CI, ExtrI);
+      IntrI, ConstantInt::get(I32Ty, ExtractIndex), ExtractName, EEI);
+  Instruction *CastI = addCastInstIfNeeded(EEI, ExtrI);
   return CastI;
 }
 
-// Helper function to convert SPIRV intrinsic into GenX intrinsic,
-// that has exact mapping.
-// Example:
-//   %call = call spir_func i64 @_Z21__spirv_WorkgroupId_xv()
-//     =>
-//   %group.id.x = tail call i32 @llvm.genx.group.id.x()
-//   %group.id.x.cast.ty = zext i32 %group.id.x to i64
-static Instruction *generateGenXForSpirv(CallInst &CI, StringRef Suff,
+// Helper function to convert an extractelement instruction associated with a
+// load from a SPIRV builtin global into the GenX intrinsic. It also generates
+// the required cast instructions.
+// Example:
+//   %0 = load <3 x i64>, <3 x i64> addrspace(4)* addrspacecast
+//        (<3 x i64> addrspace(1)* @__spirv_BuiltInWorkgroupId
+//        to <3 x i64> addrspace(4)*), align 32
+//   %1 = extractelement <3 x i64> %0, i64 0
+//
+//     =>
+//
+//   %0 = load <3 x i64>, <3 x i64> addrspace(4)* addrspacecast
+//        (<3 x i64> addrspace(1)* @__spirv_BuiltInWorkgroupId
+//        to <3 x i64> addrspace(4)*), align 32
+//   %group.id.x = call i32 @llvm.genx.group.id.x()
+//   %group.id.x.cast.ty = zext i32 %group.id.x to i64
+static Instruction *generateGenXForSpirv(ExtractElementInst *EEI,
+                                         StringRef Suff,
                                          const std::string &IntrinName) {
   std::string IntrName = std::string(GenXIntrinsic::getGenXIntrinsicPrefix()) +
                          IntrinName + Suff.str();
   auto ID = GenXIntrinsic::lookupGenXIntrinsicID(IntrName);
   Function *NewFDecl =
-      GenXIntrinsic::getGenXDeclaration(CI.getModule(), ID, {});
+      GenXIntrinsic::getGenXDeclaration(EEI->getModule(), ID, {});
+
   Instruction *IntrI =
-      IntrinsicInst::Create(NewFDecl, {}, IntrinName + Suff.str(), &CI);
-  Instruction *CastI = addCastInstIfNeeded(&CI, IntrI);
+      IntrinsicInst::Create(NewFDecl, {}, IntrinName + Suff.str(), EEI);
+  Instruction *CastI = addCastInstIfNeeded(EEI, IntrI);
   return CastI;
 }
 
-// This function translates SPIRV intrinsic into GenX intrinsic.
-// TODO: Currently, we do not support mixing SYCL and ESIMD kernels.
-// Later for ESIMD and SYCL kernels to coexist, we likely need to
-// clone call graph that lead from ESIMD kernel to SPIRV intrinsic and
-// translate SPIRV intrinsics to GenX intrinsics only in cloned subgraph.
-static void
-translateSpirvIntrinsic(CallInst *CI, StringRef SpirvIntrName,
-                        SmallVector<Instruction *, 8> &ESIMDToErases) {
-  auto translateSpirvIntr = [&SpirvIntrName, &ESIMDToErases,
-                             CI](StringRef SpvIName, auto TranslateFunc) {
-    if (SpirvIntrName.consume_front(SpvIName)) {
-      Value *TranslatedV = TranslateFunc(*CI, SpirvIntrName.substr(1, 1));
-      CI->replaceAllUsesWith(TranslatedV);
-      ESIMDToErases.push_back(CI);
-    }
-  };
+// This function translates one occurrence of a SPIRV builtin use into a GenX
+// intrinsic.
+static Value *translateSpirvGlobalUse(ExtractElementInst *EEI,
+                                      StringRef SpirvGlobalName) {
+  Value *IndexV = EEI->getIndexOperand();
+  assert(isa<ConstantInt>(IndexV) &&
+         "Extract element index should be a constant");
 
-  translateSpirvIntr("WorkgroupSize", [](CallInst &CI, StringRef Suff) {
-    return generateVectorGenXForSpirv(CI, Suff, "local.size.v3i32", "wgsize.");
-  });
-  translateSpirvIntr("LocalInvocationId", [](CallInst &CI, StringRef Suff) {
-    return generateVectorGenXForSpirv(CI, Suff, "local.id.v3i32", "local_id.");
-  });
-  translateSpirvIntr("WorkgroupId", [](CallInst &CI, StringRef Suff) {
-    return generateGenXForSpirv(CI, Suff, "group.id.");
-  });
-  translateSpirvIntr("GlobalInvocationId", [](CallInst &CI, StringRef Suff) {
+  // Get the suffix based on the index of the extractelement instruction
+  ConstantInt *IndexC = cast<ConstantInt>(IndexV);
+  std::string Suff;
+  if (IndexC->equalsInt(0))
+    Suff = 'x';
+  else if (IndexC->equalsInt(1))
+    Suff = 'y';
+  else if (IndexC->equalsInt(2))
+    Suff = 'z';
+  else
+    assert(false && "Extract element index should be either 0, 1, or 2");
+
+  // Translate SPIRV into GenX intrinsic.
+  if (SpirvGlobalName == "WorkgroupSize") {
+    return generateVectorGenXForSpirv(EEI, Suff, "local.size.v3i32", "wgsize.");
+  } else if (SpirvGlobalName == "LocalInvocationId") {
+    return generateVectorGenXForSpirv(EEI, Suff, "local.id.v3i32", "local_id.");
+  } else if (SpirvGlobalName == "WorkgroupId") {
+    return generateGenXForSpirv(EEI, Suff, "group.id.");
+  } else if (SpirvGlobalName == "GlobalInvocationId") {
     // GlobalId = LocalId + WorkGroupSize * GroupId
     Instruction *LocalIdI =
-        generateVectorGenXForSpirv(CI, Suff, "local.id.v3i32", "local_id.");
+        generateVectorGenXForSpirv(EEI, Suff, "local.id.v3i32", "local_id.");
     Instruction *WGSizeI =
-        generateVectorGenXForSpirv(CI, Suff, "local.size.v3i32", "wgsize.");
-    Instruction *GroupIdI = generateGenXForSpirv(CI, Suff, "group.id.");
+        generateVectorGenXForSpirv(EEI, Suff, "local.size.v3i32", "wgsize.");
+    Instruction *GroupIdI = generateGenXForSpirv(EEI, Suff, "group.id.");
     Instruction *MulI =
-        BinaryOperator::CreateMul(WGSizeI, GroupIdI, "mul", &CI);
-    return BinaryOperator::CreateAdd(LocalIdI, MulI, "add", &CI);
-  });
-  translateSpirvIntr("GlobalSize", [](CallInst &CI, StringRef Suff) {
+        BinaryOperator::CreateMul(WGSizeI, GroupIdI, "mul", EEI);
+    return BinaryOperator::CreateAdd(LocalIdI, MulI, "add", EEI);
+  } else if (SpirvGlobalName == "GlobalSize") {
     // GlobalSize = WorkGroupSize * NumWorkGroups
     Instruction *WGSizeI =
-        generateVectorGenXForSpirv(CI, Suff, "local.size.v3i32", "wgsize.");
+        generateVectorGenXForSpirv(EEI, Suff, "local.size.v3i32", "wgsize.");
     Instruction *NumWGI = generateVectorGenXForSpirv(
-        CI, Suff, "group.count.v3i32", "group_count.");
-    return BinaryOperator::CreateMul(WGSizeI, NumWGI, "mul", &CI);
-  });
-  // TODO: Support GlobalOffset SPIRV intrinsics
-  translateSpirvIntr("GlobalOffset", [](CallInst &CI, StringRef Suff) {
-    return llvm::Constant::getNullValue(CI.getType());
-  });
-  translateSpirvIntr("NumWorkgroups", [](CallInst &CI, StringRef Suff) {
-    return generateVectorGenXForSpirv(CI, Suff, "group.count.v3i32",
+        EEI, Suff, "group.count.v3i32", "group_count.");
+    return BinaryOperator::CreateMul(WGSizeI, NumWGI, "mul", EEI);
+  } else if (SpirvGlobalName == "GlobalOffset") {
+    // TODO: Support GlobalOffset SPIRV intrinsics
+    return llvm::Constant::getNullValue(EEI->getType());
+  } else if (SpirvGlobalName == "NumWorkgroups") {
+    return generateVectorGenXForSpirv(EEI, Suff, "group.count.v3i32",
                                       "group_count.");
-  });
+  }
+
+  return nullptr;
 }
 
 static void createESIMDIntrinsicArgs(const ESIMDIntrinDesc &Desc,
@@ -1272,68 +1286,102 @@ size_t SYCLLowerESIMDPass::runOnFunction(Function &F,
     auto *CI = dyn_cast<CallInst>(&I);
     Function *Callee = nullptr;
-    if (!CI || !(Callee = CI->getCalledFunction()))
-      continue;
-    StringRef Name = Callee->getName();
+    if (CI && (Callee = CI->getCalledFunction())) {
-    // See if the Name represents an ESIMD intrinsic and demangle only if it
-    // does.
-    if (!Name.consume_front(ESIMD_INTRIN_PREF0))
-      continue;
-    // now skip the digits
-    Name = Name.drop_while([](char C) { return std::isdigit(C); });
-
-    // process ESIMD builtins that go through special handling instead of
-    // the translation procedure
-    if (Name.startswith("N2cl4sycl5INTEL3gpu8slm_init")) {
-      // tag the kernel with meta-data SLMSize, and remove this builtin
-      translateSLMInit(*CI);
-      ESIMDToErases.push_back(CI);
-      continue;
-    }
-    if (Name.startswith("__esimd_pack_mask")) {
-      translatePackMask(*CI);
-      ESIMDToErases.push_back(CI);
-      continue;
-    }
-    if (Name.startswith("__esimd_unpack_mask")) {
-      translateUnPackMask(*CI);
-      ESIMDToErases.push_back(CI);
-      continue;
-    }
-    // If vload/vstore is not about the vector-types used by
-    // those globals marked as genx_volatile, We can translate
-    // them directly into generic load/store inst. In this way
-    // those insts can be optimized by llvm ASAP.
-    if (Name.startswith("__esimd_vload")) {
-      if (translateVLoad(*CI, GVTS)) {
+      StringRef Name = Callee->getName();
+
+      // See if the Name represents an ESIMD intrinsic and demangle only if it
+      // does.
+      if (!Name.consume_front(ESIMD_INTRIN_PREF0))
+        continue;
+      // now skip the digits
+      Name = Name.drop_while([](char C) { return std::isdigit(C); });
+
+      // process ESIMD builtins that go through special handling instead of
+      // the translation procedure
+      if (Name.startswith("N2cl4sycl5INTEL3gpu8slm_init")) {
+        // tag the kernel with meta-data SLMSize, and remove this builtin
+        translateSLMInit(*CI);
         ESIMDToErases.push_back(CI);
         continue;
       }
-    }
-    if (Name.startswith("__esimd_vstore")) {
-      if (translateVStore(*CI, GVTS)) {
+      if (Name.startswith("__esimd_pack_mask")) {
+        translatePackMask(*CI);
         ESIMDToErases.push_back(CI);
         continue;
       }
-    }
+      if (Name.startswith("__esimd_unpack_mask")) {
+        translateUnPackMask(*CI);
+        ESIMDToErases.push_back(CI);
+        continue;
+      }
+      // If vload/vstore is not about the vector-types used by
+      // those globals marked as genx_volatile, We can translate
+      // them directly into generic load/store inst. In this way
+      // those insts can be optimized by llvm ASAP.
+      if (Name.startswith("__esimd_vload")) {
+        if (translateVLoad(*CI, GVTS)) {
+          ESIMDToErases.push_back(CI);
+          continue;
+        }
+      }
+      if (Name.startswith("__esimd_vstore")) {
+        if (translateVStore(*CI, GVTS)) {
+          ESIMDToErases.push_back(CI);
+          continue;
+        }
+      }
 
-    if (Name.startswith("__esimd_get_value")) {
-      translateGetValue(*CI);
-      ESIMDToErases.push_back(CI);
-      continue;
-    }
+      if (Name.startswith("__esimd_get_value")) {
+        translateGetValue(*CI);
+        ESIMDToErases.push_back(CI);
+        continue;
+      }
 
-    if (Name.consume_front(SPIRV_INTRIN_PREF)) {
-      translateSpirvIntrinsic(CI, Name, ESIMDToErases);
-      // For now: if no match, just let it go untranslated.
-      continue;
+      if (Name.empty() || !Name.startswith(ESIMD_INTRIN_PREF1))
+        continue;
+      // this is ESIMD intrinsic - record for later translation
+      ESIMDIntrCalls.push_back(CI);
     }
-    if (Name.empty() || !Name.startswith(ESIMD_INTRIN_PREF1))
-      continue;
-    // this is ESIMD intrinsic - record for later translation
-    ESIMDIntrCalls.push_back(CI);
+    // Translate loads from SPIRV builtin globals into GenX intrinsics
+    auto *LI = dyn_cast<LoadInst>(&I);
+    if (LI) {
+      Value *LoadPtrOp = LI->getPointerOperand();
+      Value *SpirvGlobal = nullptr;
+      // Look through casts to find SPIRV builtin globals
+      auto *CE = dyn_cast<ConstantExpr>(LoadPtrOp);
+      if (CE) {
+        assert(CE->isCast() && "ConstExpr should be a cast");
+        SpirvGlobal = CE->getOperand(0);
+      } else {
+        SpirvGlobal = LoadPtrOp;
+      }
+
+      if (!isa<GlobalVariable>(SpirvGlobal) ||
+          !SpirvGlobal->getName().startswith(SPIRV_INTRIN_PREF))
+        continue;
+
+      auto PrefLen = StringRef(SPIRV_INTRIN_PREF).size();
+
+      // Go through all the uses of the load instruction from SPIRV builtin
+      // globals, which are required to be extractelement instructions.
+      // Translate each of them.
+      for (auto *LU : LI->users()) {
+        auto *EEI = dyn_cast<ExtractElementInst>(LU);
+        assert(EEI && "User of load from global SPIRV builtin is not an "
+                      "extractelement instruction");
+        Value *TranslatedVal = translateSpirvGlobalUse(
+            EEI, SpirvGlobal->getName().drop_front(PrefLen));
+        assert(TranslatedVal &&
+               "Load from global SPIRV builtin was not translated");
+        EEI->replaceAllUsesWith(TranslatedVal);
+        ESIMDToErases.push_back(EEI);
+      }
+      // After all users of the load were translated, we get rid of the load
+      // itself.
+      ESIMDToErases.push_back(LI);
+    }
   }
   // Now demangle and translate found ESIMD intrinsic calls
   for (auto *CI : ESIMDIntrCalls) {
diff --git a/llvm/test/SYCLLowerIR/esimd_lower_intrins.ll b/llvm/test/SYCLLowerIR/esimd_lower_intrins.ll
index aedd42865c8ad..ed57197e042d9 100644
--- a/llvm/test/SYCLLowerIR/esimd_lower_intrins.ll
+++ b/llvm/test/SYCLLowerIR/esimd_lower_intrins.ll
@@ -172,16 +172,6 @@ define dso_local spir_kernel void @FUNC_30() {
 ; CHECK-NEXT: ret void
 }
 
-define dso_local spir_kernel void @FUNC_31() {
-; CHECK: define dso_local spir_kernel void @FUNC_31()
-  %call = call spir_func i64 @_Z27__spirv_LocalInvocationId_xv()
-; CHECK-NEXT: %call.esimd = call <3 x i32> @llvm.genx.local.id.v3i32()
-; CHECK-NEXT: %local_id.x = extractelement <3 x i32> %call.esimd, i32 0
-; CHECK-NEXT: %local_id.x.cast.ty = zext i32 %local_id.x to i64
-  ret void
-; CHECK-NEXT: ret void
-}
-
 define dso_local spir_func <16 x i32> @FUNC_32() {
   %a_1 = alloca <16 x i32>
   %1 = load <16 x i32>, <16 x i32>* %a_1
@@ -318,7 +308,6 @@ define dso_local spir_func <16 x i32> @FUNC_44() {
   ret <16 x i32> %ret_val
 }
 
-declare dso_local spir_func i64 @_Z27__spirv_LocalInvocationId_xv()
 declare dso_local spir_func <32 x i32> @_Z20__esimd_flat_atomic0ILN2cm3gen14CmAtomicOpTypeE2EjLi32ELNS1_9CacheHintE0ELS3_0EENS1_13__vector_typeIT0_XT1_EE4typeENS4_IyXT1_EE4typeENS4_ItXT1_EE4typeE(<32 x i64> %0, <32 x i16> %1)
 declare dso_local spir_func <32 x i32> @_Z20__esimd_flat_atomic1ILN2cm3gen14CmAtomicOpTypeE0EjLi32ELNS1_9CacheHintE0ELS3_0EENS1_13__vector_typeIT0_XT1_EE4typeENS4_IyXT1_EE4typeES7_NS4_ItXT1_EE4typeE(<32 x i64> %0, <32 x i32> %1, <32 x i16> %2)
 declare dso_local spir_func <32 x i32> @_Z20__esimd_flat_atomic2ILN2cm3gen14CmAtomicOpTypeE7EjLi32ELNS1_9CacheHintE0ELS3_0EENS1_13__vector_typeIT0_XT1_EE4typeENS4_IyXT1_EE4typeES7_S7_NS4_ItXT1_EE4typeE(<32 x i64> %0, <32 x i32> %1, <32 x i32> %2, <32 x i16> %3)
diff --git a/llvm/test/SYCLLowerIR/esimd_lower_spirv_intrins.ll b/llvm/test/SYCLLowerIR/esimd_lower_spirv_intrins.ll
new file mode 100644
index 0000000000000..49300c110ceaf
--- /dev/null
+++ b/llvm/test/SYCLLowerIR/esimd_lower_spirv_intrins.ll
@@ -0,0 +1,73 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+;
+; RUN: opt < %s -LowerESIMD -S | FileCheck %s
+
+; This test checks the result of lowering a function that has
+; LLVM-IR instructions that work with SPIR-V builtins.
+; This is a complete test just to make sure the correct code gets generated.
+; In this example, there are many duplicate calls to the same GenX
+; intrinsics, which will be optimized by the -early-cse pass.
+
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64"
+target triple = "spir64-unknown-unknown-sycldevice"
+
+@__spirv_BuiltInGlobalInvocationId = external dso_local local_unnamed_addr addrspace(1) constant <3 x i64>, align 32
+
+define spir_kernel void @"__spirv_GlobalInvocationId_xyz"(i64 addrspace(1)* %_arg_) {
+; CHECK-LABEL: @__spirv_GlobalInvocationId_xyz(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DOTESIMD6:%.*]] = call <3 x i32> @llvm.genx.local.id.v3i32()
+; CHECK-NEXT:    [[LOCAL_ID_X:%.*]] = extractelement <3 x i32> [[DOTESIMD6]], i32 0
+; CHECK-NEXT:    [[LOCAL_ID_X_CAST_TY:%.*]] = zext i32 [[LOCAL_ID_X]] to i64
+; CHECK-NEXT:    [[DOTESIMD7:%.*]] = call <3 x i32> @llvm.genx.local.size.v3i32()
+; CHECK-NEXT:    [[WGSIZE_X:%.*]] = extractelement <3 x i32> [[DOTESIMD7]], i32 0
+; CHECK-NEXT:    [[WGSIZE_X_CAST_TY:%.*]] = zext i32 [[WGSIZE_X]] to i64
+; CHECK-NEXT:    [[GROUP_ID_X:%.*]] = call i32 @llvm.genx.group.id.x()
+; CHECK-NEXT:    [[GROUP_ID_X_CAST_TY:%.*]] = zext i32 [[GROUP_ID_X]] to i64
+; CHECK-NEXT:    [[MUL8:%.*]] = mul i64 [[WGSIZE_X_CAST_TY]], [[GROUP_ID_X_CAST_TY]]
+; CHECK-NEXT:    [[ADD9:%.*]] = add i64 [[LOCAL_ID_X_CAST_TY]], [[MUL8]]
+; CHECK-NEXT:    [[PTRIDX_ASCAST_I18_I:%.*]] = addrspacecast i64 addrspace(1)* [[_ARG_:%.*]] to i64 addrspace(4)*
+; CHECK-NEXT:    store i64 [[ADD9]], i64 addrspace(4)* [[PTRIDX_ASCAST_I18_I]], align 8
+; CHECK-NEXT:    [[DOTESIMD2:%.*]] = call <3 x i32> @llvm.genx.local.id.v3i32()
+; CHECK-NEXT:    [[LOCAL_ID_Y:%.*]] = extractelement <3 x i32> [[DOTESIMD2]], i32 1
+; CHECK-NEXT:    [[LOCAL_ID_Y_CAST_TY:%.*]] = zext i32 [[LOCAL_ID_Y]] to i64
+; CHECK-NEXT:    [[DOTESIMD3:%.*]] = call <3 x i32> @llvm.genx.local.size.v3i32()
+; CHECK-NEXT:    [[WGSIZE_Y:%.*]] = extractelement <3 x i32> [[DOTESIMD3]], i32 1
+; CHECK-NEXT:    [[WGSIZE_Y_CAST_TY:%.*]] = zext i32 [[WGSIZE_Y]] to i64
+; CHECK-NEXT:    [[GROUP_ID_Y:%.*]] = call i32 @llvm.genx.group.id.y()
+; CHECK-NEXT:    [[GROUP_ID_Y_CAST_TY:%.*]] = zext i32 [[GROUP_ID_Y]] to i64
+; CHECK-NEXT:    [[MUL4:%.*]] = mul i64 [[WGSIZE_Y_CAST_TY]], [[GROUP_ID_Y_CAST_TY]]
+; CHECK-NEXT:    [[ADD5:%.*]] = add i64 [[LOCAL_ID_Y_CAST_TY]], [[MUL4]]
+; CHECK-NEXT:    [[PTRIDX_I12_I:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[_ARG_]], i64 1
+; CHECK-NEXT:    [[PTRIDX_ASCAST_I13_I:%.*]] = addrspacecast i64 addrspace(1)* [[PTRIDX_I12_I]] to i64 addrspace(4)*
+; CHECK-NEXT:    store i64 [[ADD5]], i64 addrspace(4)* [[PTRIDX_ASCAST_I13_I]], align 8
+; CHECK-NEXT:    [[DOTESIMD:%.*]] = call <3 x i32> @llvm.genx.local.id.v3i32()
+; CHECK-NEXT:    [[LOCAL_ID_Z:%.*]] = extractelement <3 x i32> [[DOTESIMD]], i32 2
+; CHECK-NEXT:    [[LOCAL_ID_Z_CAST_TY:%.*]] = zext i32 [[LOCAL_ID_Z]] to i64
+; CHECK-NEXT:    [[DOTESIMD1:%.*]] = call <3 x i32> @llvm.genx.local.size.v3i32()
+; CHECK-NEXT:    [[WGSIZE_Z:%.*]] = extractelement <3 x i32> [[DOTESIMD1]], i32 2
+; CHECK-NEXT:    [[WGSIZE_Z_CAST_TY:%.*]] = zext i32 [[WGSIZE_Z]] to i64
+; CHECK-NEXT:    [[GROUP_ID_Z:%.*]] = call i32 @llvm.genx.group.id.z()
+; CHECK-NEXT:    [[GROUP_ID_Z_CAST_TY:%.*]] = zext i32 [[GROUP_ID_Z]] to i64
+; CHECK-NEXT:    [[MUL:%.*]] = mul i64 [[WGSIZE_Z_CAST_TY]], [[GROUP_ID_Z_CAST_TY]]
+; CHECK-NEXT:    [[ADD:%.*]] = add i64 [[LOCAL_ID_Z_CAST_TY]], [[MUL]]
+; CHECK-NEXT:    [[PTRIDX_I_I:%.*]] = getelementptr inbounds i64, i64 addrspace(1)* [[_ARG_]], i64 2
+; CHECK-NEXT:    [[PTRIDX_ASCAST_I_I:%.*]] = addrspacecast i64 addrspace(1)* [[PTRIDX_I_I]] to i64 addrspace(4)*
+; CHECK-NEXT:    store i64 [[ADD]], i64 addrspace(4)* [[PTRIDX_ASCAST_I_I]], align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load <3 x i64>, <3 x i64> addrspace(4)* addrspacecast (<3 x i64> addrspace(1)* @__spirv_BuiltInGlobalInvocationId to <3 x i64> addrspace(4)*), align 32
+  %1 = extractelement <3 x i64> %0, i64 0
+  %ptridx.ascast.i18.i = addrspacecast i64 addrspace(1)* %_arg_ to i64 addrspace(4)*
+  store i64 %1, i64 addrspace(4)* %ptridx.ascast.i18.i
+  %2 = extractelement <3 x i64> %0, i64 1
+  %ptridx.i12.i = getelementptr inbounds i64, i64 addrspace(1)* %_arg_, i64 1
+  %ptridx.ascast.i13.i = addrspacecast i64 addrspace(1)* %ptridx.i12.i to i64 addrspace(4)*
+  store i64 %2, i64 addrspace(4)* %ptridx.ascast.i13.i
+  %3 = extractelement <3 x i64> %0, i64 2
+  %ptridx.i.i = getelementptr inbounds i64, i64 addrspace(1)* %_arg_, i64 2
+  %ptridx.ascast.i.i = addrspacecast i64 addrspace(1)* %ptridx.i.i to i64 addrspace(4)*
+  store i64 %3, i64 addrspace(4)* %ptridx.ascast.i.i
+  ret void
+}
diff --git a/llvm/test/tools/sycl-post-link/sycl-esimd/basic-esimd-lower.ll b/llvm/test/tools/sycl-post-link/sycl-esimd/basic-esimd-lower.ll
index 2c3dd515392d6..5614c36c43ba6 100644
--- a/llvm/test/tools/sycl-post-link/sycl-esimd/basic-esimd-lower.ll
+++ b/llvm/test/tools/sycl-post-link/sycl-esimd/basic-esimd-lower.ll
@@ -21,11 +21,12 @@
 target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
 target triple = "spir64-unknown-linux-sycldevice"
 
-declare dso_local spir_func i64 @_Z28__spirv_GlobalInvocationId_xv()
+@__spirv_BuiltInGlobalInvocationId = external dso_local local_unnamed_addr addrspace(1) constant <3 x i64>, align 32
 
 define dso_local spir_kernel void @ESIMD_kernel() #0 !sycl_explicit_simd !3 {
 entry:
-  %call = tail call spir_func i64 @_Z28__spirv_GlobalInvocationId_xv()
+  %0 = load <3 x i64>, <3 x i64> addrspace(4)* addrspacecast (<3 x i64> addrspace(1)* @__spirv_BuiltInGlobalInvocationId to <3 x i64> addrspace(4)*), align 32
+  %1 = extractelement <3 x i64> %0, i64 0
   ret void
 }
 
@@ -41,15 +42,14 @@ attributes #0 = { "sycl-module-id"="a.cpp" }
 !3 = !{}
 
 ; By default, no lowering is performed
-; CHECK-NO-LOWERING: declare dso_local spir_func i64 @_Z28__spirv_GlobalInvocationId_xv()
 ; CHECK-NO-LOWERING: define dso_local spir_kernel void @ESIMD_kernel()
 ; CHECK-NO-LOWERING: entry:
-; CHECK-NO-LOWERING: %call = tail call spir_func i64 @_Z28__spirv_GlobalInvocationId_xv()
+; CHECK-NO-LOWERING: %0 = load <3 x i64>, {{.*}} addrspacecast {{.*}} @__spirv_BuiltInGlobalInvocationId
+; CHECK-NO-LOWERING: %1 = extractelement <3 x i64> %0, i64 0
 ; CHECK-NO-LOWERING: ret void
 ; CHECK-NO-LOWERING: }
 
 ; With -O0, we only lower ESIMD code, but no other optimizations
-; CHECK-O0: declare dso_local spir_func i64 @_Z28__spirv_GlobalInvocationId_xv()
 ; CHECK-O0: define dso_local spir_kernel void @ESIMD_kernel() #1 !sycl_explicit_simd !3 !intel_reqd_sub_group_size !4 {
 ; CHECK-O0: entry:
 ; CHECK-O0: call <3 x i32> @llvm.genx.local.id.v3i32()
@@ -59,7 +59,6 @@ attributes #0 = { "sycl-module-id"="a.cpp" }
 ; CHECK-O0: }
 
 ; With -O2, unused call was optimized away
-; CHECK-O2: declare dso_local spir_func i64 @_Z28__spirv_GlobalInvocationId_xv()
 ; CHECK-O2: define dso_local spir_kernel void @ESIMD_kernel()
 ; CHECK-O2: entry:
 ; CHECK-O2: ret void
diff --git a/sycl/include/CL/__spirv/spirv_vars.hpp b/sycl/include/CL/__spirv/spirv_vars.hpp
index 53315cd9f72eb..bbc6f75ddf87b 100644
--- a/sycl/include/CL/__spirv/spirv_vars.hpp
+++ b/sycl/include/CL/__spirv/spirv_vars.hpp
@@ -15,7 +15,7 @@
 
 #define __SPIRV_VAR_QUALIFIERS extern "C" const
 
-#if defined(__SYCL_NVPTX__) || defined(__SYCL_EXPLICIT_SIMD__)
+#if defined(__SYCL_NVPTX__)
 
 SYCL_EXTERNAL size_t __spirv_GlobalInvocationId_x();
 SYCL_EXTERNAL size_t __spirv_GlobalInvocationId_y();
diff --git a/sycl/test/esimd/spirv_intrins_trans.cpp b/sycl/test/esimd/spirv_intrins_trans.cpp
index 4106e88e6f781..512d2391b1de4 100644
--- a/sycl/test/esimd/spirv_intrins_trans.cpp
+++ b/sycl/test/esimd/spirv_intrins_trans.cpp
@@ -1,41 +1,14 @@
-// RUN: %clangxx -fsycl -fsycl-explicit-simd -fsycl-device-only -O0 -S -emit-llvm -x c++ %s -o %t
-// RUN: sycl-post-link -split-esimd -lower-esimd -O2 -S %t -o %t.table
+// RUN: %clangxx -fsycl -fsycl-explicit-simd -fsycl-device-only -S -emit-llvm -x c++ %s -o %t
+// RUN: sycl-post-link -split-esimd -lower-esimd -O0 -S %t -o %t.table
 // RUN: FileCheck %s -input-file=%t_esimd_0.ll
 
-// This test checks that all SPIR-V intrinsics are correctly
-// translated into GenX counterparts (implemented in LowerESIMD.cpp)
+// This test checks that all LLVM-IR instructions that work with SPIR-V builtins
+// are correctly translated into GenX counterparts (implemented in
+// LowerESIMD.cpp)
 
 #include <CL/sycl.hpp>
 #include <CL/sycl/INTEL/esimd.hpp>
 
-SYCL_EXTERNAL size_t __spirv_GlobalInvocationId_x();
-SYCL_EXTERNAL size_t __spirv_GlobalInvocationId_y();
-SYCL_EXTERNAL size_t __spirv_GlobalInvocationId_z();
-
-SYCL_EXTERNAL size_t __spirv_GlobalSize_x();
-SYCL_EXTERNAL size_t __spirv_GlobalSize_y();
-SYCL_EXTERNAL size_t __spirv_GlobalSize_z();
-
-SYCL_EXTERNAL size_t __spirv_GlobalOffset_x();
-SYCL_EXTERNAL size_t __spirv_GlobalOffset_y();
-SYCL_EXTERNAL size_t __spirv_GlobalOffset_z();
-
-SYCL_EXTERNAL size_t __spirv_NumWorkgroups_x();
-SYCL_EXTERNAL size_t __spirv_NumWorkgroups_y();
-SYCL_EXTERNAL size_t __spirv_NumWorkgroups_z();
-
-SYCL_EXTERNAL size_t __spirv_WorkgroupSize_x();
-SYCL_EXTERNAL size_t __spirv_WorkgroupSize_y();
-SYCL_EXTERNAL size_t __spirv_WorkgroupSize_z();
-
-SYCL_EXTERNAL size_t __spirv_WorkgroupId_x();
-SYCL_EXTERNAL size_t __spirv_WorkgroupId_y();
-SYCL_EXTERNAL size_t __spirv_WorkgroupId_z();
-
-SYCL_EXTERNAL size_t __spirv_LocalInvocationId_x();
-SYCL_EXTERNAL size_t __spirv_LocalInvocationId_y();
-SYCL_EXTERNAL size_t __spirv_LocalInvocationId_z();
-
 template <typename name, typename Func>
 __attribute__((sycl_kernel)) void kernel(Func kernelFunc) {
   kernelFunc();
@@ -45,6 +18,10 @@ size_t caller() {
   size_t DoNotOpt;
   cl::sycl::buffer<size_t, 1> buf(&DoNotOpt, 1);
+
+  size_t DoNotOptXYZ[3];
+  cl::sycl::buffer<size_t, 1> bufXYZ(&DoNotOptXYZ[0], sycl::range<1>(3));
+
   cl::sycl::queue().submit([&](cl::sycl::handler &cgh) {
     auto DoNotOptimize = buf.get_access<cl::sycl::access::mode::write>(cgh);
@@ -203,6 +180,39 @@ size_t caller() {
     // CHECK-LABEL: @{{.*}}kernel_LocalInvocationId_z
     // CHECK: [[CALL_ESIMD:%.*]] = call <3 x i32> @llvm.genx.local.id.v3i32()
     // CHECK: {{.*}} extractelement <3 x i32> [[CALL_ESIMD]], i32 2
+
+    // Tests below check correct translation of loads from SPIRV builtin
+    // globals, when a load has multiple uses, e.g.:
+    //   %0 = load <3 x i64>, <3 x i64> addrspace(4)* addrspacecast
+    //        (<3 x i64> addrspace(1)* @__spirv_BuiltInWorkgroupId
+    //        to <3 x i64> addrspace(4)*), align 32
+    //   %1 = extractelement <3 x i64> %0, i64 0
+    //   %2 = extractelement <3 x i64> %0, i64 1
+    //   %3 = extractelement <3 x i64> %0, i64 2
+    // In this case we will generate 3 calls to the same GenX intrinsic,
+    // but -early-cse will later remove this redundancy.
+    auto DoNotOptimizeXYZ =
+        bufXYZ.get_access<cl::sycl::access::mode::write>(cgh);
+    kernel<class kernel_LocalInvocationId_xyz>([=]() SYCL_ESIMD_KERNEL {
+      DoNotOptimizeXYZ[0] = __spirv_LocalInvocationId_x();
+      DoNotOptimizeXYZ[1] = __spirv_LocalInvocationId_y();
+      DoNotOptimizeXYZ[2] = __spirv_LocalInvocationId_z();
+    });
+    // CHECK-LABEL: @{{.*}}kernel_LocalInvocationId_xyz
+    // CHECK: [[CALL_ESIMD1:%.*]] = call <3 x i32> @llvm.genx.local.id.v3i32()
+    // CHECK: {{.*}} extractelement <3 x i32> [[CALL_ESIMD1]], i32 0
+    // CHECK: [[CALL_ESIMD2:%.*]] = call <3 x i32> @llvm.genx.local.id.v3i32()
+    // CHECK: {{.*}} extractelement <3 x i32> [[CALL_ESIMD2]], i32 1
+    // CHECK: [[CALL_ESIMD3:%.*]] = call <3 x i32> @llvm.genx.local.id.v3i32()
+    // CHECK: {{.*}} extractelement <3 x i32> [[CALL_ESIMD3]], i32 2
+
+    kernel<class kernel_WorkgroupId_xyz>([=]() SYCL_ESIMD_KERNEL {
+      DoNotOptimizeXYZ[0] = __spirv_WorkgroupId_x();
+      DoNotOptimizeXYZ[1] = __spirv_WorkgroupId_y();
+      DoNotOptimizeXYZ[2] = __spirv_WorkgroupId_z();
+    });
+    // CHECK-LABEL: @{{.*}}kernel_WorkgroupId_xyz
+    // CHECK: {{.*}} call i32 @llvm.genx.group.id.x()
+    // CHECK: {{.*}} call i32 @llvm.genx.group.id.y()
+    // CHECK: {{.*}} call i32 @llvm.genx.group.id.z()
   });
   return DoNotOpt;
 }
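
Reviewer note, not part of the patch: the hunks above mainly exercise the
LocalInvocationId, WorkgroupId, and GlobalInvocationId paths of
translateSpirvGlobalUse. For reference, below is a minimal hand-written IR
sketch of the GlobalSize path, which the pass expands as
WorkGroupSize * NumWorkGroups. The input follows the load/extractelement
pattern the headers now emit; the kernel name and SSA numbering are invented
for illustration, while the value names (%wgsize.x, %group_count.x, %mul)
follow the prefixes hard-coded in the pass.

  ; Input: an ESIMD kernel reading the x component of the GlobalSize builtin.
  @__spirv_BuiltInGlobalSize = external dso_local local_unnamed_addr addrspace(1) constant <3 x i64>, align 32

  define spir_kernel void @global_size_x_sketch(i64 addrspace(1)* %out) {
  entry:
    %0 = load <3 x i64>, <3 x i64> addrspace(4)* addrspacecast (<3 x i64> addrspace(1)* @__spirv_BuiltInGlobalSize to <3 x i64> addrspace(4)*), align 32
    %1 = extractelement <3 x i64> %0, i64 0
    store i64 %1, i64 addrspace(1)* %out
    ret void
  }

  ; Expected lowering of the extractelement (GlobalSize = WorkGroupSize * NumWorkGroups):
  ;   %.esimd = call <3 x i32> @llvm.genx.local.size.v3i32()
  ;   %wgsize.x = extractelement <3 x i32> %.esimd, i32 0
  ;   %wgsize.x.cast.ty = zext i32 %wgsize.x to i64
  ;   %.esimd1 = call <3 x i32> @llvm.genx.group.count.v3i32()
  ;   %group_count.x = extractelement <3 x i32> %.esimd1, i32 0
  ;   %group_count.x.cast.ty = zext i32 %group_count.x to i64
  ;   %mul = mul i64 %wgsize.x.cast.ty, %group_count.x.cast.ty
  ;   store i64 %mul, i64 addrspace(1)* %out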