@@ -627,7 +627,7 @@ for.end: ; preds = %for.body, %entry
627627 ret float %result.0.lcssa
628628}
629629
630- ; Sub we can create a reduction, but not inloop
630+ ; Sub: we can create a reduction, and it can be performed in-loop.
631631define i32 @reduction_sub_lhs (ptr noalias nocapture %A ) {
632632; CHECK-LABEL: @reduction_sub_lhs(
633633; CHECK-NEXT: entry:
@@ -636,15 +636,16 @@ define i32 @reduction_sub_lhs(ptr noalias nocapture %A) {
636636; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
637637; CHECK: vector.body:
638638; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
639- ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer , [[VECTOR_PH]] ], [ [[TMP1 :%.*]], [[VECTOR_BODY]] ]
639+ ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0 , [[VECTOR_PH]] ], [ [[TMP3 :%.*]], [[VECTOR_BODY]] ]
640640; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
641641; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4
642- ; CHECK-NEXT: [[TMP1]] = sub <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
642+ ; CHECK-NEXT: [[TMP4:%.*]] = sub nsw <4 x i32> zeroinitializer, [[WIDE_LOAD]]
643+ ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
644+ ; CHECK-NEXT: [[TMP3]] = add i32 [[TMP1]], [[VEC_PHI]]
643645; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
644646; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
645647; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
646648; CHECK: middle.block:
647- ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
648649; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
649650; CHECK: scalar.ph:
650651; CHECK-NEXT: br label [[FOR_BODY:%.*]]
0 commit comments