Skip to content

Commit bb9363d

Browse files
committed
Use the runtime lower bound in multideps when building a taskloop-iterator-dependent multidep
Closes llvm#99
1 parent ac70008 commit bb9363d

File tree

2 files changed

+160
-5
lines changed

2 files changed

+160
-5
lines changed

llvm/lib/Transforms/OmpSs/OmpSsTransform.cpp

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -886,23 +886,27 @@ struct OmpSs : public ModulePass {
886886
LoopInfo.StepSigned = 1;
887887

888888
Function *ComputeMultiDepFun = MDI.ComputeMultiDepFun;
889-
ArrayRef<Value *> Args = MDI.Args;
889+
auto Args = MDI.Args;
890+
891+
if (IsTaskLoop) {
892+
std::replace(Args.begin(), Args.end(), IndVar, NewIndVarLBound);
893+
}
890894

891895
for (size_t i = 0; i < MDI.Iters.size(); i++) {
892896
LoopInfo.IndVar = MDI.Iters[i];
893-
auto LBoundGen = [ComputeMultiDepFun, Args, i](IRBuilder<> &IRB) {
897+
auto LBoundGen = [ComputeMultiDepFun, &Args, i](IRBuilder<> &IRB) {
894898
Value *ComputeMultiDepCall = IRB.CreateCall(ComputeMultiDepFun, Args);
895899
return IRB.CreateExtractValue(ComputeMultiDepCall, i*(3 + 1) + 0);
896900
};
897-
auto RemapGen = [ComputeMultiDepFun, Args, i](IRBuilder<> &IRB) {
901+
auto RemapGen = [ComputeMultiDepFun, &Args, i](IRBuilder<> &IRB) {
898902
Value *ComputeMultiDepCall = IRB.CreateCall(ComputeMultiDepFun, Args);
899903
return IRB.CreateExtractValue(ComputeMultiDepCall, i*(3 + 1) + 1);
900904
};
901-
auto UBoundGen = [ComputeMultiDepFun, Args, i](IRBuilder<> &IRB) {
905+
auto UBoundGen = [ComputeMultiDepFun, &Args, i](IRBuilder<> &IRB) {
902906
Value *ComputeMultiDepCall = IRB.CreateCall(ComputeMultiDepFun, Args);
903907
return IRB.CreateExtractValue(ComputeMultiDepCall, i*(3 + 1) + 2);
904908
};
905-
auto IncrGen = [ComputeMultiDepFun, Args, i](IRBuilder<> &IRB) {
909+
auto IncrGen = [ComputeMultiDepFun, &Args, i](IRBuilder<> &IRB) {
906910
Value *ComputeMultiDepCall = IRB.CreateCall(ComputeMultiDepFun, Args);
907911
return IRB.CreateExtractValue(ComputeMultiDepCall, i*(3 + 1) + 3);
908912
};
Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
; RUN: opt %s -ompss-2 -S | FileCheck %s
2+
; ModuleID = 'taskloop_multideps.ll'
3+
source_filename = "taskloop_multideps.ll"
4+
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
5+
target triple = "x86_64-unknown-linux-gnu"
6+
7+
; This test checks that we use the nanos6 lower bound to build the
8+
; multidep loop and the call that registers the dependency
9+
10+
; int v[10];
11+
; int main() {
12+
; #pragma oss taskloop out( { v[i], i=0;j } )
13+
; for (int j = 0; j < 10; ++j) { }
14+
; }
15+
16+
%struct._depend_unpack_t = type { i32, i32, i32, i32 }
17+
%struct._depend_unpack_t.0 = type { i32*, i64, i64, i64 }
18+
19+
@v = global [10 x i32] zeroinitializer, align 16
20+
21+
; Function Attrs: noinline nounwind optnone
22+
; Test input function. Allocates %j and %i, zero-initializes both, then opens
; and immediately closes a directive region tagged "DIR.OSS" = "TASKLOOP"
; (there are no instructions between region.entry and region.exit).
; The operand bundle marks @v as shared and %i/%j as private, names %j as the
; loop induction variable with lower bound 0, upper bound 10 and step 1, and
; carries a "QUAL.OSS.MULTIDEP.RANGE.OUT" entry listing, in order:
;   %i, @compute_dep, %i, %j, @v, @compute_dep.1, %i, %j, @v
; NOTE(review): presumably this is <iterator, iterator-bounds fn + its args,
; dependency base, dependency fn + its args> -- confirm against the OmpSs-2
; bundle layout expected by the transform pass.
; Always returns 0.
define i32 @main() #0 !dbg !6 {
23+
entry:
24+
%j = alloca i32, align 4
25+
%i = alloca i32, align 4
26+
store i32 0, i32* %j, align 4, !dbg !9
27+
store i32 0, i32* %i, align 4, !dbg !10
28+
%0 = call token @llvm.directive.region.entry() [ "DIR.OSS"([9 x i8] c"TASKLOOP\00"), "QUAL.OSS.SHARED"([10 x i32]* @v), "QUAL.OSS.PRIVATE"(i32* %i), "QUAL.OSS.PRIVATE"(i32* %j), "QUAL.OSS.LOOP.IND.VAR"(i32* %j), "QUAL.OSS.LOOP.LOWER.BOUND"(i32 0), "QUAL.OSS.LOOP.UPPER.BOUND"(i32 10), "QUAL.OSS.LOOP.STEP"(i32 1), "QUAL.OSS.LOOP.TYPE"(i64 0, i64 1, i64 1, i64 1, i64 1), "QUAL.OSS.CAPTURED"(i32 0, i32 10, i32 1), "QUAL.OSS.MULTIDEP.RANGE.OUT"(i32* %i, %struct._depend_unpack_t (i32*, i32*)* @compute_dep, i32* %i, i32* %j, [10 x i32]* @v, %struct._depend_unpack_t.0 (i32*, i32*, [10 x i32]*)* @compute_dep.1, i32* %i, i32* %j, [10 x i32]* @v) ], !dbg !9
29+
call void @llvm.directive.region.exit(token %0), !dbg !9
30+
ret i32 0, !dbg !11
31+
}
32+
33+
; Function Attrs: nounwind
34+
declare token @llvm.directive.region.entry() #1
35+
36+
; Function Attrs: nounwind
37+
declare void @llvm.directive.region.exit(token) #1
38+
39+
; Builds and returns a %struct._depend_unpack_t ({ i32, i32, i32, i32 }) from
; the values currently stored in %i and %j:
;   field 0 = 0
;   field 1 = *%i
;   field 2 = (0 + *%j) - 1
;   field 3 = 1
; NOTE(review): the transform pass extracts indices 0/1/2/3 of such a result
; as lower bound / remap / upper bound / increment of the multidep iterator,
; which would match the "i=0;j" range in the pragma above -- confirm.
; Because field 2 depends on *%j, the pointer passed for %j determines the
; iterator's upper bound; the CHECK lines below expect %j.lb to be passed.
define internal %struct._depend_unpack_t @compute_dep(i32* %i, i32* %j) {
40+
entry:
41+
%return.val = alloca %struct._depend_unpack_t, align 4
42+
%0 = load i32, i32* %i, align 4, !dbg !10
43+
%1 = load i32, i32* %j, align 4, !dbg !10
44+
%2 = add i32 0, %1
45+
%3 = add i32 %2, -1
46+
%4 = getelementptr inbounds %struct._depend_unpack_t, %struct._depend_unpack_t* %return.val, i32 0, i32 0
47+
store i32 0, i32* %4, align 4
48+
%5 = getelementptr inbounds %struct._depend_unpack_t, %struct._depend_unpack_t* %return.val, i32 0, i32 1
49+
store i32 %0, i32* %5, align 4
50+
%6 = getelementptr inbounds %struct._depend_unpack_t, %struct._depend_unpack_t* %return.val, i32 0, i32 2
51+
store i32 %3, i32* %6, align 4
52+
%7 = getelementptr inbounds %struct._depend_unpack_t, %struct._depend_unpack_t* %return.val, i32 0, i32 3
53+
store i32 1, i32* %7, align 4
54+
%8 = load %struct._depend_unpack_t, %struct._depend_unpack_t* %return.val, align 4
55+
ret %struct._depend_unpack_t %8
56+
}
57+
58+
; Builds and returns a %struct._depend_unpack_t.0 ({ i32*, i64, i64, i64 })
; describing the dependency on v[i] for the current value of *%i:
;   field 0 = pointer to the first element of %v
;   field 1 = 40
;   field 2 = sext(*%i) * 4
;   field 3 = (sext(*%i) + 1) * 4
; NOTE(review): presumably 40 is the byte size of [10 x i32] and fields 2/3
; are the start/end byte offsets of element i (4 bytes per i32) -- confirm
; against the nanos6 region-dependency API.
; %j is accepted but never read in this body.
define internal %struct._depend_unpack_t.0 @compute_dep.1(i32* %i, i32* %j, [10 x i32]* %v) {
59+
entry:
60+
%return.val = alloca %struct._depend_unpack_t.0, align 8
61+
%0 = load i32, i32* %i, align 4, !dbg !10
62+
%1 = sext i32 %0 to i64
63+
%2 = add i64 %1, 1
64+
%arraydecay = getelementptr inbounds [10 x i32], [10 x i32]* %v, i64 0, i64 0, !dbg !10
65+
%3 = mul i64 %1, 4
66+
%4 = mul i64 %2, 4
67+
%5 = getelementptr inbounds %struct._depend_unpack_t.0, %struct._depend_unpack_t.0* %return.val, i32 0, i32 0
68+
store i32* %arraydecay, i32** %5, align 8
69+
%6 = getelementptr inbounds %struct._depend_unpack_t.0, %struct._depend_unpack_t.0* %return.val, i32 0, i32 1
70+
store i64 40, i64* %6, align 8
71+
%7 = getelementptr inbounds %struct._depend_unpack_t.0, %struct._depend_unpack_t.0* %return.val, i32 0, i32 2
72+
store i64 %3, i64* %7, align 8
73+
%8 = getelementptr inbounds %struct._depend_unpack_t.0, %struct._depend_unpack_t.0* %return.val, i32 0, i32 3
74+
store i64 %4, i64* %8, align 8
75+
%9 = load %struct._depend_unpack_t.0, %struct._depend_unpack_t.0* %return.val, align 8
76+
ret %struct._depend_unpack_t.0 %9
77+
}
78+
79+
; CHECK: define internal void @nanos6_unpacked_deps_main0([10 x i32]* %v, i32* %i, i32* %j, i32 %0, i32 %1, i32 %2, %nanos6_loop_bounds_t* %loop_bounds, i8* %handler) {
80+
; CHECK: entry:
81+
; CHECK-NEXT: %lb_gep = getelementptr %nanos6_loop_bounds_t, %nanos6_loop_bounds_t* %loop_bounds, i32 0, i32 0
82+
; CHECK-NEXT: %3 = load i64, i64* %lb_gep, align 8
83+
; CHECK-NEXT: %lb = trunc i64 %3 to i32
84+
; CHECK-NEXT: %ub_gep = getelementptr %nanos6_loop_bounds_t, %nanos6_loop_bounds_t* %loop_bounds, i32 0, i32 1
85+
; CHECK-NEXT: %4 = load i64, i64* %ub_gep, align 8
86+
; CHECK-NEXT: %5 = trunc i64 %4 to i32
87+
; CHECK-NEXT: %ub = sub i32 %5, 1
88+
; CHECK-NEXT: %j.lb = alloca i32, align 4
89+
; CHECK-NEXT: %j.ub = alloca i32, align 4
90+
; CHECK-NEXT: %6 = mul i32 1, %lb
91+
; CHECK-NEXT: %7 = add i32 %6, 0
92+
; CHECK-NEXT: store i32 %7, i32* %j.lb, align 4
93+
; CHECK-NEXT: %8 = mul i32 1, %ub
94+
; CHECK-NEXT: %9 = add i32 %8, 0
95+
; CHECK-NEXT: store i32 %9, i32* %j.ub, align 4
96+
; CHECK-NEXT: br label %10
97+
; CHECK: 10: ; preds = %entry
98+
; CHECK-NEXT: %i.remap = alloca i32, align 4
99+
; CHECK-NEXT: %11 = call %struct._depend_unpack_t @compute_dep(i32* %i, i32* %j.lb)
100+
; CHECK-NEXT: %12 = extractvalue %struct._depend_unpack_t %11, 0
101+
; CHECK-NEXT: store i32 %12, i32* %i, align 4
102+
; CHECK-NEXT: br label %for.cond
103+
; CHECK: for.cond: ; preds = %for.incr, %10
104+
; CHECK-NEXT: %13 = call %struct._depend_unpack_t @compute_dep(i32* %i, i32* %j.lb)
105+
; CHECK-NEXT: %14 = extractvalue %struct._depend_unpack_t %13, 2
106+
; CHECK-NEXT: %15 = load i32, i32* %i, align 4
107+
; CHECK-NEXT: %16 = icmp sle i32 %15, %14
108+
; CHECK-NEXT: br i1 %16, label %for.body, label %26
109+
; CHECK: for.body: ; preds = %for.cond
110+
; CHECK-NEXT: %17 = call %struct._depend_unpack_t @compute_dep(i32* %i, i32* %j.lb)
111+
; CHECK-NEXT: %18 = extractvalue %struct._depend_unpack_t %17, 1
112+
; CHECK-NEXT: store i32 %18, i32* %i.remap, align 4
113+
; CHECK-NEXT: %19 = call %struct._depend_unpack_t.0 @compute_dep.1(i32* %i.remap, i32* %j.lb, [10 x i32]* %v)
114+
; CHECK-NEXT: %20 = call %struct._depend_unpack_t.0 @compute_dep.1(i32* %i.remap, i32* %j.ub, [10 x i32]* %v)
115+
; CHECK-NEXT: %21 = extractvalue %struct._depend_unpack_t.0 %19, 0
116+
; CHECK-NEXT: %22 = bitcast i32* %21 to i8*
117+
; CHECK-NEXT: %23 = extractvalue %struct._depend_unpack_t.0 %19, 1
118+
; CHECK-NEXT: %24 = extractvalue %struct._depend_unpack_t.0 %19, 2
119+
; CHECK-NEXT: %25 = extractvalue %struct._depend_unpack_t.0 %20, 3
120+
; CHECK-NEXT: call void @nanos6_register_region_write_depinfo1(i8* %handler, i32 0, i8* null, i8* %22, i64 %23, i64 %24, i64 %25)
121+
; CHECK-NEXT: br label %for.incr
122+
; CHECK: 26: ; preds = %for.cond
123+
; CHECK-NEXT: ret void
124+
; CHECK: for.incr: ; preds = %for.body
125+
; CHECK-NEXT: %27 = call %struct._depend_unpack_t @compute_dep(i32* %i, i32* %j.lb)
126+
; CHECK-NEXT: %28 = extractvalue %struct._depend_unpack_t %27, 3
127+
; CHECK-NEXT: %29 = load i32, i32* %i, align 4
128+
; CHECK-NEXT: %30 = add i32 %29, %28
129+
; CHECK-NEXT: store i32 %30, i32* %i, align 4
130+
; CHECK-NEXT: br label %for.cond
131+
; CHECK-NEXT: }
132+
133+
attributes #0 = { noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+cx8,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
134+
attributes #1 = { nounwind }
135+
136+
!llvm.dbg.cu = !{!0}
137+
!llvm.module.flags = !{!3, !4}
138+
!llvm.ident = !{!5}
139+
140+
!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "human", isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug, enums: !2, nameTableKind: None)
141+
!1 = !DIFile(filename: "<stdin>", directory: "")
142+
!2 = !{}
143+
!3 = !{i32 2, !"Debug Info Version", i32 3}
144+
!4 = !{i32 1, !"wchar_size", i32 4}
145+
!5 = !{!"clang version 11.0.0 "}
146+
!6 = distinct !DISubprogram(name: "main", scope: !7, file: !7, line: 2, type: !8, scopeLine: 2, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2)
147+
!7 = !DIFile(filename: "taskloop_multideps.ll", directory: "")
148+
!8 = !DISubroutineType(types: !2)
149+
!9 = !DILocation(line: 4, scope: !6)
150+
!10 = !DILocation(line: 3, scope: !6)
151+
!11 = !DILocation(line: 5, scope: !6)

0 commit comments

Comments
 (0)