From bd44f1b499596eefbba1f5374b7206ce38274257 Mon Sep 17 00:00:00 2001 From: Abid Qadeer Date: Thu, 10 Jul 2025 12:45:36 +0100 Subject: [PATCH] [OMPIRBuilder] Don't use invalid debug loc in reduction functions. We have this pattern of code in OMPIRBuilder for many functions that are used in reduction operations. Function *LtGRFunc = Function::Create BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc); Builder.SetInsertPoint(EntryBlock); The insertion point is moved to the new function but the debug location is not updated. This means that reduction function will use the debug location that points to another function. This problem gets hidden because these functions gets inlined but the potential for failure exists. This patch resets the debug location when insertion point is moved to new function. Some InsertPointGuard have been added to make sure why restore the debug location correctly when we are done with the reduction function. --- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 26 ++-- .../LLVMIR/omptarget-debug-reduc-fn-loc.mlir | 121 ++++++++++++++++++ 2 files changed, 137 insertions(+), 10 deletions(-) create mode 100644 mlir/test/Target/LLVMIR/omptarget-debug-reduc-fn-loc.mlir diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index db792a3b52d24..170224616ac64 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -2617,7 +2617,7 @@ void OpenMPIRBuilder::emitReductionListCopy( Expected OpenMPIRBuilder::emitInterWarpCopyFunction( const LocationDescription &Loc, ArrayRef ReductionInfos, AttributeList FuncAttrs) { - InsertPointTy SavedIP = Builder.saveIP(); + IRBuilder<>::InsertPointGuard IPG(Builder); LLVMContext &Ctx = M.getContext(); FunctionType *FuncTy = FunctionType::get( Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty()}, @@ -2630,6 +2630,7 @@ Expected OpenMPIRBuilder::emitInterWarpCopyFunction( WcFunc->addParamAttr(1, Attribute::NoUndef); BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", WcFunc); Builder.SetInsertPoint(EntryBB); + Builder.SetCurrentDebugLocation(llvm::DebugLoc()); // ReduceList: thread local Reduce list. // At the stage of the computation when this function is called, partially @@ -2844,7 +2845,6 @@ Expected OpenMPIRBuilder::emitInterWarpCopyFunction( } Builder.CreateRetVoid(); - Builder.restoreIP(SavedIP); return WcFunc; } @@ -2853,6 +2853,7 @@ Function *OpenMPIRBuilder::emitShuffleAndReduceFunction( ArrayRef ReductionInfos, Function *ReduceFn, AttributeList FuncAttrs) { LLVMContext &Ctx = M.getContext(); + IRBuilder<>::InsertPointGuard IPG(Builder); FunctionType *FuncTy = FunctionType::get(Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt16Ty(), @@ -2871,6 +2872,7 @@ Function *OpenMPIRBuilder::emitShuffleAndReduceFunction( SarFunc->addParamAttr(3, Attribute::SExt); BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", SarFunc); Builder.SetInsertPoint(EntryBB); + Builder.SetCurrentDebugLocation(llvm::DebugLoc()); // Thread local Reduce list used to host the values of data to be reduced. Argument *ReduceListArg = SarFunc->getArg(0); @@ -3017,7 +3019,7 @@ Function *OpenMPIRBuilder::emitShuffleAndReduceFunction( Function *OpenMPIRBuilder::emitListToGlobalCopyFunction( ArrayRef ReductionInfos, Type *ReductionsBufferTy, AttributeList FuncAttrs) { - OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP(); + IRBuilder<>::InsertPointGuard IPG(Builder); LLVMContext &Ctx = M.getContext(); FunctionType *FuncTy = FunctionType::get( Builder.getVoidTy(), @@ -3033,6 +3035,7 @@ Function *OpenMPIRBuilder::emitListToGlobalCopyFunction( BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc); Builder.SetInsertPoint(EntryBlock); + Builder.SetCurrentDebugLocation(llvm::DebugLoc()); // Buffer: global reduction buffer. Argument *BufferArg = LtGCFunc->getArg(0); @@ -3120,14 +3123,13 @@ Function *OpenMPIRBuilder::emitListToGlobalCopyFunction( } Builder.CreateRetVoid(); - Builder.restoreIP(OldIP); return LtGCFunc; } Function *OpenMPIRBuilder::emitListToGlobalReduceFunction( ArrayRef ReductionInfos, Function *ReduceFn, Type *ReductionsBufferTy, AttributeList FuncAttrs) { - OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP(); + IRBuilder<>::InsertPointGuard IPG(Builder); LLVMContext &Ctx = M.getContext(); FunctionType *FuncTy = FunctionType::get( Builder.getVoidTy(), @@ -3143,6 +3145,7 @@ Function *OpenMPIRBuilder::emitListToGlobalReduceFunction( BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc); Builder.SetInsertPoint(EntryBlock); + Builder.SetCurrentDebugLocation(llvm::DebugLoc()); // Buffer: global reduction buffer. Argument *BufferArg = LtGRFunc->getArg(0); @@ -3203,14 +3206,13 @@ Function *OpenMPIRBuilder::emitListToGlobalReduceFunction( Builder.CreateCall(ReduceFn, {LocalReduceListAddrCast, ReduceList}) ->addFnAttr(Attribute::NoUnwind); Builder.CreateRetVoid(); - Builder.restoreIP(OldIP); return LtGRFunc; } Function *OpenMPIRBuilder::emitGlobalToListCopyFunction( ArrayRef ReductionInfos, Type *ReductionsBufferTy, AttributeList FuncAttrs) { - OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP(); + IRBuilder<>::InsertPointGuard IPG(Builder); LLVMContext &Ctx = M.getContext(); FunctionType *FuncTy = FunctionType::get( Builder.getVoidTy(), @@ -3226,6 +3228,7 @@ Function *OpenMPIRBuilder::emitGlobalToListCopyFunction( BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc); Builder.SetInsertPoint(EntryBlock); + Builder.SetCurrentDebugLocation(llvm::DebugLoc()); // Buffer: global reduction buffer. Argument *BufferArg = LtGCFunc->getArg(0); @@ -3311,14 +3314,13 @@ Function *OpenMPIRBuilder::emitGlobalToListCopyFunction( } Builder.CreateRetVoid(); - Builder.restoreIP(OldIP); return LtGCFunc; } Function *OpenMPIRBuilder::emitGlobalToListReduceFunction( ArrayRef ReductionInfos, Function *ReduceFn, Type *ReductionsBufferTy, AttributeList FuncAttrs) { - OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP(); + IRBuilder<>::InsertPointGuard IPG(Builder); LLVMContext &Ctx = M.getContext(); auto *FuncTy = FunctionType::get( Builder.getVoidTy(), @@ -3334,6 +3336,7 @@ Function *OpenMPIRBuilder::emitGlobalToListReduceFunction( BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc); Builder.SetInsertPoint(EntryBlock); + Builder.SetCurrentDebugLocation(llvm::DebugLoc()); // Buffer: global reduction buffer. Argument *BufferArg = LtGRFunc->getArg(0); @@ -3394,7 +3397,6 @@ Function *OpenMPIRBuilder::emitGlobalToListReduceFunction( Builder.CreateCall(ReduceFn, {ReduceList, ReductionList}) ->addFnAttr(Attribute::NoUnwind); Builder.CreateRetVoid(); - Builder.restoreIP(OldIP); return LtGRFunc; } @@ -3407,6 +3409,7 @@ std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const { Expected OpenMPIRBuilder::createReductionFunction( StringRef ReducerName, ArrayRef ReductionInfos, ReductionGenCBKind ReductionGenCBKind, AttributeList FuncAttrs) { + IRBuilder<>::InsertPointGuard IPG(Builder); auto *FuncTy = FunctionType::get(Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getPtrTy()}, /* IsVarArg */ false); @@ -3419,6 +3422,7 @@ Expected OpenMPIRBuilder::createReductionFunction( BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", ReductionFunc); Builder.SetInsertPoint(EntryBB); + Builder.SetCurrentDebugLocation(llvm::DebugLoc()); // Need to alloca memory here and deal with the pointers before getting // LHS/RHS pointers out @@ -3746,10 +3750,12 @@ static Error populateReductionFunction( Function *ReductionFunc, ArrayRef ReductionInfos, IRBuilder<> &Builder, ArrayRef IsByRef, bool IsGPU) { + IRBuilder<>::InsertPointGuard IPG(Builder); Module *Module = ReductionFunc->getParent(); BasicBlock *ReductionFuncBlock = BasicBlock::Create(Module->getContext(), "", ReductionFunc); Builder.SetInsertPoint(ReductionFuncBlock); + Builder.SetCurrentDebugLocation(llvm::DebugLoc()); Value *LHSArrayPtr = nullptr; Value *RHSArrayPtr = nullptr; if (IsGPU) { diff --git a/mlir/test/Target/LLVMIR/omptarget-debug-reduc-fn-loc.mlir b/mlir/test/Target/LLVMIR/omptarget-debug-reduc-fn-loc.mlir new file mode 100644 index 0000000000000..d889ef4f5700c --- /dev/null +++ b/mlir/test/Target/LLVMIR/omptarget-debug-reduc-fn-loc.mlir @@ -0,0 +1,121 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memory_space", 5 : ui32>>, llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = true} { + omp.private {type = private} @_QFEi_private_i32 : i32 loc(#loc1) + omp.declare_reduction @add_reduction_i32 : i32 init { + ^bb0(%arg0: i32 loc("test.f90":8:7)): + %0 = llvm.mlir.constant(0 : i32) : i32 loc(#loc2) + omp.yield(%0 : i32) loc(#loc2) + } combiner { + ^bb0(%arg0: i32 loc("test.f90":8:7), %arg1: i32 loc("test.f90":8:7)): + %0 = llvm.add %arg0, %arg1 : i32 loc(#loc2) + omp.yield(%0 : i32) loc(#loc2) + } loc(#loc2) + llvm.func @_QQmain() { + %0 = llvm.mlir.constant(1 : i64) : i64 loc(#loc4) + %1 = llvm.alloca %0 x i32 {bindc_name = "x"} : (i64) -> !llvm.ptr<5> loc(#loc4) + %2 = llvm.addrspacecast %1 : !llvm.ptr<5> to !llvm.ptr loc(#loc4) + %3 = llvm.mlir.constant(1 : i64) : i64 loc(#loc1) + %4 = llvm.alloca %3 x i32 {bindc_name = "i"} : (i64) -> !llvm.ptr<5> loc(#loc1) + %5 = llvm.addrspacecast %4 : !llvm.ptr<5> to !llvm.ptr loc(#loc1) + %6 = llvm.mlir.constant(8191 : index) : i64 loc(#loc5) + %7 = llvm.mlir.constant(0 : index) : i64 loc(#loc5) + %8 = llvm.mlir.constant(1 : index) : i64 loc(#loc5) + %9 = llvm.mlir.constant(0 : i32) : i32 loc(#loc5) + %10 = llvm.mlir.constant(8192 : index) : i64 loc(#loc5) + %11 = llvm.mlir.addressof @_QFEarr : !llvm.ptr<1> loc(#loc6) + %12 = llvm.addrspacecast %11 : !llvm.ptr<1> to !llvm.ptr loc(#loc6) + llvm.store %9, %2 : i32, !llvm.ptr loc(#loc7) + %15 = omp.map.info var_ptr(%2 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = "x"} loc(#loc4) + %16 = omp.map.info var_ptr(%5 : !llvm.ptr, i32) map_clauses(implicit, exit_release_or_enter_alloc) capture(ByCopy) -> !llvm.ptr {name = "i"} loc(#loc7) + %17 = omp.map.bounds lower_bound(%7 : i64) upper_bound(%6 : i64) extent(%10 : i64) stride(%8 : i64) start_idx(%8 : i64) loc(#loc7) + %18 = omp.map.info var_ptr(%12 : !llvm.ptr, !llvm.array<8192 x i32>) map_clauses(implicit, tofrom) capture(ByRef) bounds(%17) -> !llvm.ptr {name = "arr"} loc(#loc7) + omp.target map_entries(%15 -> %arg0, %16 -> %arg1, %18 -> %arg2 : !llvm.ptr, !llvm.ptr, !llvm.ptr) { + %19 = llvm.mlir.constant(8192 : i32) : i32 loc(#loc5) + %20 = llvm.mlir.constant(1 : i32) : i32 loc(#loc5) + %21 = llvm.mlir.constant(8192 : index) : i64 loc(#loc6) + omp.teams reduction(@add_reduction_i32 %arg0 -> %arg3 : !llvm.ptr) { + omp.parallel private(@_QFEi_private_i32 %arg1 -> %arg4 : !llvm.ptr) { + omp.distribute { + omp.wsloop reduction(@add_reduction_i32 %arg3 -> %arg5 : !llvm.ptr) { + omp.loop_nest (%arg6) : i32 = (%20) to (%19) inclusive step (%20) { + llvm.store %arg6, %arg4 : i32, !llvm.ptr loc(#loc2) + %22 = llvm.load %arg5 : !llvm.ptr -> i32 loc(#loc8) + %23 = llvm.load %arg4 : !llvm.ptr -> i32 loc(#loc8) + %34 = llvm.add %22, %23 : i32 loc(#loc8) + llvm.store %34, %arg5 : i32, !llvm.ptr loc(#loc8) + omp.yield loc(#loc2) + } loc(#loc2) + } {omp.composite} loc(#loc2) + } {omp.composite} loc(#loc2) + omp.terminator loc(#loc2) + } {omp.composite} loc(#loc2) + omp.terminator loc(#loc2) + } loc(#loc2) + omp.terminator loc(#loc2) + } loc(#loc13) + llvm.return loc(#loc9) + } loc(#loc12) + llvm.mlir.global internal @_QFEarr() {addr_space = 1 : i32} : !llvm.array<8192 x i32> { + %0 = llvm.mlir.zero : !llvm.array<8192 x i32> loc(#loc6) + llvm.return %0 : !llvm.array<8192 x i32> loc(#loc6) + } loc(#loc6) +} loc(#loc) + +#loc = loc("test.f90":4:18) +#loc1 = loc("test.f90":4:18) +#loc2 = loc("test.f90":8:7) +#loc3 = loc("test.f90":1:7) +#loc4 = loc("test.f90":3:18) +#loc5 = loc(unknown) +#loc6 = loc("test.f90":5:18) +#loc7 = loc("test.f90":6:7) +#loc8 = loc("test.f90":10:7) +#loc9 = loc("test.f90":16:7) + +#di_file = #llvm.di_file<"target7.f90" in ""> +#di_null_type = #llvm.di_null_type +#di_compile_unit = #llvm.di_compile_unit, + sourceLanguage = DW_LANG_Fortran95, file = #di_file, producer = "flang", + isOptimized = false, emissionKind = LineTablesOnly> +#di_subroutine_type = #llvm.di_subroutine_type< + callingConvention = DW_CC_program, types = #di_null_type> +#di_subprogram = #llvm.di_subprogram, + compileUnit = #di_compile_unit, scope = #di_file, name = "main", + file = #di_file, subprogramFlags = "Definition|MainSubprogram", + type = #di_subroutine_type> +#di_subprogram1 = #llvm.di_subprogram + + +#loc12 = loc(fused<#di_subprogram>[#loc3]) +#loc13 = loc(fused<#di_subprogram1>[#loc2]) + +// CHECK-DAG: define internal void @_omp_reduction_shuffle_and_reduce_func +// CHECK-NOT: !dbg +// CHECK: } +// CHECK-DAG: define internal void @_omp_reduction_inter_warp_copy_func +// CHECK-NOT: !dbg +// CHECK: } +// CHECK-DAG: define internal void @"__omp_offloading_{{.*}}__QQmain_l8_omp$reduction$reduction_func.1" +// CHECK-NOT: !dbg +// CHECK: } +// CHECK-DAG: define internal void @_omp_reduction_shuffle_and_reduce_func.2 +// CHECK-NOT: !dbg +// CHECK: } +// CHECK-DAG: define internal void @_omp_reduction_inter_warp_copy_func.3 +// CHECK-NOT: !dbg +// CHECK: } +// CHECK-DAG: define internal void @_omp_reduction_list_to_global_copy_func +// CHECK-NOT: !dbg +// CHECK: } +// CHECK-DAG: define internal void @_omp_reduction_list_to_global_reduce_func +// CHECK-NOT: !dbg +// CHECK: } +// CHECK-DAG: define internal void @_omp_reduction_global_to_list_copy_func +// CHECK-NOT: !dbg +// CHECK: } +// CHECK-DAG: define internal void @_omp_reduction_global_to_list_reduce_func +// CHECK-NOT: !dbg +// CHECK: }