Commit 8ef171e
[msan] Handle horizontal add/subtract intrinsic by applying to shadow (#124159)
Horizontal add (hadd) and subtract (hsub) are currently handled heuristically by `maybeHandleSimpleNomemIntrinsic()` (via `handleUnknownIntrinsic()`), which computes the shadow by bitwise OR'ing the shadows of the two operands. This has false positives for hadd/hsub. For example, suppose the shadows of the two operands are 00000000 and 11111111 respectively: the expected shadow for the result is 00001111, but `maybeHandleSimpleNomemIntrinsic` would compute it as 11111111.

This patch handles horizontal add/subtract using `handleIntrinsicByApplyingToShadow` (from #114490), which has no false positives for hadd/hsub: if each pair of adjacent shadow values is zero (fully initialized), the result will be zero (fully initialized). More generally, it is precise for hadd/hsub whenever at least one of the two adjacent shadow values in each pair is zero. It does have some false negatives: if we add/subtract two adjacent non-zero shadow values, some bits of the result may incorrectly be zero. We consider this an acceptable tradeoff for performance.

To make shadow propagation precise, we want the equivalent of "horizontal OR", but this is not available. Reducing horizontal OR to (permutation plus bitwise OR) is left as an exercise for the reader.
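To make the tradeoff concrete, here is a small standalone C++ sketch (not part of the patch; `orApprox`/`haddApprox` and the lane layout are illustrative) that models per-lane shadows as booleans and compares the old OR-based approximation with a hadd-shaped propagation on the 8-lane example above. Lane placement is simplified to "low half from A, high half from B"; the 256-bit AVX forms actually pair lanes within each 128-bit half.

```cpp
#include <array>
#include <cstdio>

// Per-lane shadow: true = uninitialized (poisoned), false = initialized.
using Shadow = std::array<bool, 8>;

// Old heuristic (maybeHandleSimpleNomemIntrinsic): lane i of the result is
// poisoned if lane i of *either* operand is poisoned.
static Shadow orApprox(const Shadow &A, const Shadow &B) {
  Shadow R{};
  for (int I = 0; I < 8; ++I)
    R[I] = A[I] || B[I];
  return R;
}

// Hadd-shaped propagation: a result lane is poisoned if either lane of the
// pair it was computed from is poisoned. (The real instrumentation re-runs
// the intrinsic on the shadows, i.e. it *adds* each pair instead of OR'ing
// it, which is where the false negatives come from.)
static Shadow haddApprox(const Shadow &A, const Shadow &B) {
  Shadow R{};
  for (int I = 0; I < 4; ++I) {
    R[I] = A[2 * I] || A[2 * I + 1];     // low half: pairs from A
    R[I + 4] = B[2 * I] || B[2 * I + 1]; // high half: pairs from B
  }
  return R;
}

static void print(const char *Name, const Shadow &S) {
  std::printf("%-20s", Name);
  for (bool Lane : S)
    std::printf("%d", Lane ? 1 : 0);
  std::printf("\n");
}

int main() {
  Shadow A{};     // 00000000: fully initialized
  Shadow B;
  B.fill(true);   // 11111111: fully uninitialized
  print("OR approximation:", orApprox(A, B));     // prints 11111111 (false positives)
  print("hadd approximation:", haddApprox(A, B)); // prints 00001111 (expected)
  return 0;
}
```

For this input the two strategies print 11111111 and 00001111 respectively, matching the example in the description.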
1 parent: 45d83ae; commit: 8ef171e

File tree

7 files changed: +132, -61 lines


llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp

Lines changed: 47 additions & 0 deletions
@@ -3906,6 +3906,23 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     setOriginForNaryOp(I);
   }
 
+  void handleAVXHorizontalAddSubIntrinsic(IntrinsicInst &I) {
+    // Approximation only:
+    //   output = horizontal_add(A, B)
+    //   => shadow[output] = horizontal_add(shadow[A], shadow[B])
+    //
+    // - If we add/subtract two adjacent zero (initialized) shadow values, the
+    //   result will always be zero, i.e., no false positives.
+    // - If we add/subtract two shadows, one of which is uninitialized, the
+    //   result will always be non-zero, i.e., no false negatives.
+    // - However, we can have false negatives if we subtract two non-zero
+    //   shadows of the same value (or do an addition that wraps to zero); we
+    //   consider this an acceptable tradeoff for performance.
+    // To make shadow propagation precise, we want the equivalent of
+    // "horizontal OR", but this is not available.
+    return handleIntrinsicByApplyingToShadow(I, /* trailingVerbatimArgs */ 0);
+  }
+
   /// Handle Arm NEON vector store intrinsics (vst{2,3,4}, vst1x_{2,3,4},
   /// and vst{2,3,4}lane).
   ///
@@ -4418,6 +4435,36 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       handleVtestIntrinsic(I);
       break;
 
+    case Intrinsic::x86_sse3_hadd_ps:
+    case Intrinsic::x86_sse3_hadd_pd:
+    case Intrinsic::x86_ssse3_phadd_d:
+    case Intrinsic::x86_ssse3_phadd_d_128:
+    case Intrinsic::x86_ssse3_phadd_w:
+    case Intrinsic::x86_ssse3_phadd_w_128:
+    case Intrinsic::x86_ssse3_phadd_sw:
+    case Intrinsic::x86_ssse3_phadd_sw_128:
+    case Intrinsic::x86_avx_hadd_pd_256:
+    case Intrinsic::x86_avx_hadd_ps_256:
+    case Intrinsic::x86_avx2_phadd_d:
+    case Intrinsic::x86_avx2_phadd_w:
+    case Intrinsic::x86_avx2_phadd_sw:
+    case Intrinsic::x86_sse3_hsub_ps:
+    case Intrinsic::x86_sse3_hsub_pd:
+    case Intrinsic::x86_ssse3_phsub_d:
+    case Intrinsic::x86_ssse3_phsub_d_128:
+    case Intrinsic::x86_ssse3_phsub_w:
+    case Intrinsic::x86_ssse3_phsub_w_128:
+    case Intrinsic::x86_ssse3_phsub_sw:
+    case Intrinsic::x86_ssse3_phsub_sw_128:
+    case Intrinsic::x86_avx_hsub_pd_256:
+    case Intrinsic::x86_avx_hsub_ps_256:
+    case Intrinsic::x86_avx2_phsub_d:
+    case Intrinsic::x86_avx2_phsub_w:
+    case Intrinsic::x86_avx2_phsub_sw: {
+      handleAVXHorizontalAddSubIntrinsic(I);
+      break;
+    }
+
     case Intrinsic::fshl:
     case Intrinsic::fshr:
       handleFunnelShift(I);
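As a companion to the handler's comment, here is a hedged IRBuilder sketch of the "permutation plus bitwise OR" idea from the commit message, for the 128-bit pairwise forms (e.g. `llvm.x86.ssse3.phadd.d.128` on `<4 x i32>`). It is illustrative only and not part of this patch: the function name is made up, it assumes the visitor's usual `getShadow`/`setShadow` helpers, and the 256-bit AVX variants would need different shuffle masks because they pair lanes within each 128-bit half.

```cpp
// Hypothetical precise shadow propagation for a 128-bit horizontal add/sub:
// the result is [a0 op a1, a2 op a3, b0 op b1, b2 op b3], so a result lane
// should be poisoned iff either element of its pair is poisoned. Gather the
// even and odd lanes of the concatenated operand shadows and OR them.
void handlePairwiseShadowOr(IntrinsicInst &I) {
  IRBuilder<> IRB(&I);
  Value *SA = getShadow(&I, 0); // shadow of operand A (an integer vector)
  Value *SB = getShadow(&I, 1); // shadow of operand B
  Value *Even = IRB.CreateShuffleVector(SA, SB, {0, 2, 4, 6});
  Value *Odd = IRB.CreateShuffleVector(SA, SB, {1, 3, 5, 7});
  setShadow(&I, IRB.CreateOr(Even, Odd));
  setOriginForNaryOp(I);
}
```

Because MSan shadows are integer vectors even for floating-point operands, this variant would not need the bitcasts that `handleIntrinsicByApplyingToShadow` emits in the `hadd_ps`/`hadd_pd` test updates below.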

llvm/test/Instrumentation/MemorySanitizer/X86/avx-intrinsics-x86.ll

Lines changed: 24 additions & 12 deletions
@@ -435,10 +435,13 @@ define <4 x double> @test_x86_avx_hadd_pd_256(<4 x double> %a0, <4 x double> %a1
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[RES:%.*]] = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]])
+; CHECK-NEXT:    [[A0:%.*]] = bitcast <4 x i64> [[TMP1]] to <4 x double>
+; CHECK-NEXT:    [[A1:%.*]] = bitcast <4 x i64> [[TMP2]] to <4 x double>
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> [[A0]], <4 x double> [[A1]])
+; CHECK-NEXT:    [[_MSPROP:%.*]] = bitcast <4 x double> [[RES]] to <4 x i64>
+; CHECK-NEXT:    [[RES1:%.*]] = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> [[A2:%.*]], <4 x double> [[A3:%.*]])
 ; CHECK-NEXT:    store <4 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT:    ret <4 x double> [[RES]]
+; CHECK-NEXT:    ret <4 x double> [[RES1]]
 ;
   %res = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a0, <4 x double> %a1) ; <<4 x double>> [#uses=1]
   ret <4 x double> %res
@@ -451,10 +454,13 @@ define <8 x float> @test_x86_avx_hadd_ps_256(<8 x float> %a0, <8 x float> %a1) #
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]])
+; CHECK-NEXT:    [[A0:%.*]] = bitcast <8 x i32> [[TMP1]] to <8 x float>
+; CHECK-NEXT:    [[A1:%.*]] = bitcast <8 x i32> [[TMP2]] to <8 x float>
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> [[A0]], <8 x float> [[A1]])
+; CHECK-NEXT:    [[_MSPROP:%.*]] = bitcast <8 x float> [[RES]] to <8 x i32>
+; CHECK-NEXT:    [[RES1:%.*]] = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> [[A2:%.*]], <8 x float> [[A3:%.*]])
 ; CHECK-NEXT:    store <8 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT:    ret <8 x float> [[RES]]
+; CHECK-NEXT:    ret <8 x float> [[RES1]]
 ;
   %res = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %a1) ; <<8 x float>> [#uses=1]
   ret <8 x float> %res
@@ -467,10 +473,13 @@ define <4 x double> @test_x86_avx_hsub_pd_256(<4 x double> %a0, <4 x double> %a1
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <4 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[RES:%.*]] = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> [[A0:%.*]], <4 x double> [[A1:%.*]])
+; CHECK-NEXT:    [[A0:%.*]] = bitcast <4 x i64> [[TMP1]] to <4 x double>
+; CHECK-NEXT:    [[A1:%.*]] = bitcast <4 x i64> [[TMP2]] to <4 x double>
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> [[A0]], <4 x double> [[A1]])
+; CHECK-NEXT:    [[_MSPROP:%.*]] = bitcast <4 x double> [[RES]] to <4 x i64>
+; CHECK-NEXT:    [[RES1:%.*]] = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> [[A2:%.*]], <4 x double> [[A3:%.*]])
 ; CHECK-NEXT:    store <4 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT:    ret <4 x double> [[RES]]
+; CHECK-NEXT:    ret <4 x double> [[RES1]]
 ;
   %res = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a0, <4 x double> %a1) ; <<4 x double>> [#uses=1]
   ret <4 x double> %res
@@ -483,10 +492,13 @@ define <8 x float> @test_x86_avx_hsub_ps_256(<8 x float> %a0, <8 x float> %a1) #
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> [[A0:%.*]], <8 x float> [[A1:%.*]])
+; CHECK-NEXT:    [[A0:%.*]] = bitcast <8 x i32> [[TMP1]] to <8 x float>
+; CHECK-NEXT:    [[A1:%.*]] = bitcast <8 x i32> [[TMP2]] to <8 x float>
+; CHECK-NEXT:    [[RES:%.*]] = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> [[A0]], <8 x float> [[A1]])
+; CHECK-NEXT:    [[_MSPROP:%.*]] = bitcast <8 x float> [[RES]] to <8 x i32>
+; CHECK-NEXT:    [[RES1:%.*]] = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> [[A2:%.*]], <8 x float> [[A3:%.*]])
 ; CHECK-NEXT:    store <8 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT:    ret <8 x float> [[RES]]
+; CHECK-NEXT:    ret <8 x float> [[RES1]]
 ;
   %res = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %a0, <8 x float> %a1) ; <<8 x float>> [#uses=1]
   ret <8 x float> %res

llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll

Lines changed: 6 additions & 6 deletions
@@ -569,7 +569,7 @@ define <8 x i32> @test_x86_avx2_phadd_d(<8 x i32> %a0, <8 x i32> %a1) #0 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP:%.*]] = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]])
 ; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> [[A0:%.*]], <8 x i32> [[A1:%.*]])
 ; CHECK-NEXT:    store <8 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[RES]]
@@ -585,7 +585,7 @@ define <16 x i16> @test_x86_avx2_phadd_sw(<16 x i16> %a0, <16 x i16> %a1) #0 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP:%.*]] = call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]])
 ; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> [[A0:%.*]], <16 x i16> [[A1:%.*]])
 ; CHECK-NEXT:    store <16 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i16> [[RES]]
@@ -601,7 +601,7 @@ define <16 x i16> @test_x86_avx2_phadd_w(<16 x i16> %a0, <16 x i16> %a1) #0 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP:%.*]] = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]])
 ; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> [[A0:%.*]], <16 x i16> [[A1:%.*]])
 ; CHECK-NEXT:    store <16 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i16> [[RES]]
@@ -617,7 +617,7 @@ define <8 x i32> @test_x86_avx2_phsub_d(<8 x i32> %a0, <8 x i32> %a1) #0 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP:%.*]] = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]])
 ; CHECK-NEXT:    [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> [[A0:%.*]], <8 x i32> [[A1:%.*]])
 ; CHECK-NEXT:    store <8 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <8 x i32> [[RES]]
@@ -633,7 +633,7 @@ define <16 x i16> @test_x86_avx2_phsub_sw(<16 x i16> %a0, <16 x i16> %a1) #0 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP:%.*]] = call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]])
 ; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> [[A0:%.*]], <16 x i16> [[A1:%.*]])
 ; CHECK-NEXT:    store <16 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i16> [[RES]]
@@ -649,7 +649,7 @@ define <16 x i16> @test_x86_avx2_phsub_w(<16 x i16> %a0, <16 x i16> %a1) #0 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
 ; CHECK-NEXT:    call void @llvm.donothing()
-; CHECK-NEXT:    [[_MSPROP:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[_MSPROP:%.*]] = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]])
 ; CHECK-NEXT:    [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> [[A0:%.*]], <16 x i16> [[A1:%.*]])
 ; CHECK-NEXT:    store <16 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT:    ret <16 x i16> [[RES]]
