diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 1bda187810a63..4347b1dec4cf0 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -31906,7 +31906,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
   // especially clever.
 
   // Use `fence seq_cst` over `llvm.x64.sse2.mfence` here to get the correct
-  // lowering for SSID == SyncScope::SingleThread and !hasMFence
+  // lowering for SSID == SyncScope::SingleThread and avoidMFence || !hasMFence
   Builder.CreateFence(AtomicOrdering::SequentiallyConsistent, SSID);
 
   // Finally we can emit the atomic load.
@@ -31995,7 +31995,7 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
   // cross-thread fence.
   if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
       FenceSSID == SyncScope::System) {
-    if (Subtarget.hasMFence())
+    if (!Subtarget.avoidMFence() && Subtarget.hasMFence())
       return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
 
     SDValue Chain = Op.getOperand(0);
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
index 722076ca88c9c..8f2d326a69398 100644
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -280,6 +280,9 @@ class X86Subtarget final : public X86GenSubtargetInfo {
   /// supports it.
   bool hasMFence() const { return hasSSE2() || is64Bit(); }
 
+  /// Avoid use of `mfence` for `fence seq_cst`, and instead use `lock or`.
+  bool avoidMFence() const { return is64Bit(); }
+
   const Triple &getTargetTriple() const { return TargetTriple; }
 
   bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
diff --git a/llvm/test/CodeGen/X86/atomic-idempotent.ll b/llvm/test/CodeGen/X86/atomic-idempotent.ll
index 91355bd64cade..020f9eb793102 100644
--- a/llvm/test/CodeGen/X86/atomic-idempotent.ll
+++ b/llvm/test/CodeGen/X86/atomic-idempotent.ll
@@ -14,7 +14,7 @@
 define i8 @add8(ptr %p) #0 {
 ; X64-LABEL: add8:
 ; X64:       # %bb.0:
-; X64-NEXT:    mfence
+; X64-NEXT:    lock orl $0, -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    movzbl (%rdi), %eax
 ; X64-NEXT:    retq
 ;
@@ -47,7 +47,7 @@
 define i16 @or16(ptr %p) #0 {
 ; X64-LABEL: or16:
 ; X64:       # %bb.0:
-; X64-NEXT:    mfence
+; X64-NEXT:    lock orl $0, -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    movzwl (%rdi), %eax
 ; X64-NEXT:    retq
 ;
@@ -80,7 +80,7 @@
 define i32 @xor32(ptr %p) #0 {
 ; X64-LABEL: xor32:
 ; X64:       # %bb.0:
-; X64-NEXT:    mfence
+; X64-NEXT:    lock orl $0, -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    movl (%rdi), %eax
 ; X64-NEXT:    retq
 ;
@@ -113,7 +113,7 @@
 define i64 @sub64(ptr %p) #0 {
 ; X64-LABEL: sub64:
 ; X64:       # %bb.0:
-; X64-NEXT:    mfence
+; X64-NEXT:    lock orl $0, -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    movq (%rdi), %rax
 ; X64-NEXT:    retq
 ;
@@ -265,7 +265,7 @@ define i128 @or128(ptr %p) #0 {
 define i32 @and32 (ptr %p) #0 {
 ; X64-LABEL: and32:
 ; X64:       # %bb.0:
-; X64-NEXT:    mfence
+; X64-NEXT:    lock orl $0, -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    movl (%rdi), %eax
 ; X64-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/atomic-unordered.ll b/llvm/test/CodeGen/X86/atomic-unordered.ll
index 3fb994cdb751a..e8e0ee0b7ef49 100644
--- a/llvm/test/CodeGen/X86/atomic-unordered.ll
+++ b/llvm/test/CodeGen/X86/atomic-unordered.ll
@@ -2096,7 +2096,7 @@ define i64 @nofold_fence(ptr %p) {
 ; CHECK-LABEL: nofold_fence:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movq (%rdi), %rax
-; CHECK-NEXT:    mfence
+; CHECK-NEXT:    lock orl $0, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    addq $15, %rax
 ; CHECK-NEXT:    retq
   %v = load atomic i64, ptr %p unordered, align 8
@@ -2170,7 +2170,7 @@ define i64 @fold_constant_fence(i64 %arg) {
 ; CHECK-LABEL: fold_constant_fence:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movq Constant(%rip), %rax
-; CHECK-NEXT:    mfence
+; CHECK-NEXT:    lock orl $0, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    addq %rdi, %rax
 ; CHECK-NEXT:    retq
   %v = load atomic i64, ptr @Constant unordered, align 8
@@ -2197,7 +2197,7 @@ define i64 @fold_invariant_fence(ptr dereferenceable(8) %p, i64 %arg) {
 ; CHECK-LABEL: fold_invariant_fence:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movq (%rdi), %rax
-; CHECK-NEXT:    mfence
+; CHECK-NEXT:    lock orl $0, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    addq %rsi, %rax
 ; CHECK-NEXT:    retq
   %v = load atomic i64, ptr %p unordered, align 8, !invariant.load !{}
@@ -2321,7 +2321,7 @@ define i1 @fold_cmp_over_fence(ptr %p, i32 %v1) {
 ; CHECK-O0-LABEL: fold_cmp_over_fence:
 ; CHECK-O0:       # %bb.0:
 ; CHECK-O0-NEXT:    movl (%rdi), %eax
-; CHECK-O0-NEXT:    mfence
+; CHECK-O0-NEXT:    lock orl $0, -{{[0-9]+}}(%rsp)
 ; CHECK-O0-NEXT:    cmpl %eax, %esi
 ; CHECK-O0-NEXT:    jne .LBB116_2
 ; CHECK-O0-NEXT:  # %bb.1: # %taken
@@ -2335,7 +2335,7 @@ define i1 @fold_cmp_over_fence(ptr %p, i32 %v1) {
 ; CHECK-O3-LABEL: fold_cmp_over_fence:
 ; CHECK-O3:       # %bb.0:
 ; CHECK-O3-NEXT:    movl (%rdi), %eax
-; CHECK-O3-NEXT:    mfence
+; CHECK-O3-NEXT:    lock orl $0, -{{[0-9]+}}(%rsp)
 ; CHECK-O3-NEXT:    cmpl %eax, %esi
 ; CHECK-O3-NEXT:    jne .LBB116_2
 ; CHECK-O3-NEXT:  # %bb.1: # %taken
diff --git a/llvm/test/CodeGen/X86/implicit-null-check.ll b/llvm/test/CodeGen/X86/implicit-null-check.ll
index fc81f703f5d40..de63c9ae209df 100644
--- a/llvm/test/CodeGen/X86/implicit-null-check.ll
+++ b/llvm/test/CodeGen/X86/implicit-null-check.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -verify-machineinstrs -O3 -mtriple=x86_64-apple-macosx -enable-implicit-null-checks < %s | FileCheck %s
 
 define i32 @imp_null_check_load(ptr %x) {
@@ -465,7 +466,7 @@ define i32 @imp_null_check_load_fence2(ptr %x) {
 ; CHECK-NEXT:    testq %rdi, %rdi
 ; CHECK-NEXT:    je LBB17_1
 ; CHECK-NEXT:  ## %bb.2: ## %not_null
-; CHECK-NEXT:    mfence
+; CHECK-NEXT:    lock orl $0, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movl (%rdi), %eax
 ; CHECK-NEXT:    retq
 ; CHECK-NEXT:  LBB17_1: ## %is_null
diff --git a/llvm/test/CodeGen/X86/membarrier.ll b/llvm/test/CodeGen/X86/membarrier.ll
index 55f2a2f210139..2773f01f7ab82 100644
--- a/llvm/test/CodeGen/X86/membarrier.ll
+++ b/llvm/test/CodeGen/X86/membarrier.ll
@@ -6,9 +6,9 @@ define i32 @t() {
 ; CHECK-LABEL: t:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl $1, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    mfence
+; CHECK-NEXT:    lock orl $0, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    lock decl -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    mfence
+; CHECK-NEXT:    lock orl $0, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    retq
   %i = alloca i32, align 4
diff --git a/llvm/test/CodeGen/X86/mfence.ll b/llvm/test/CodeGen/X86/mfence.ll
index 8c29af8648712..ce74d2dd69f9b 100644
--- a/llvm/test/CodeGen/X86/mfence.ll
+++ b/llvm/test/CodeGen/X86/mfence.ll
@@ -5,10 +5,15 @@
 ; It doesn't matter if an x86-64 target has specified "no-sse2"; we still can use mfence.
 
 define void @test() {
-; CHECK-LABEL: test:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    mfence
-; CHECK-NEXT:    ret{{[l|q]}}
+; X86-LABEL: test:
+; X86:       # %bb.0:
+; X86-NEXT:    mfence
+; X86-NEXT:    retl
+;
+; X64-LABEL: test:
+; X64:       # %bb.0:
+; X64-NEXT:    lock orl $0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    retq
   fence seq_cst
   ret void
 }
@@ -23,10 +28,25 @@ define i32 @fence(ptr %ptr) {
 ;
 ; X64-LABEL: fence:
 ; X64:       # %bb.0:
-; X64-NEXT:    mfence
+; X64-NEXT:    lock orl $0, -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    movl (%rdi), %eax
 ; X64-NEXT:    retq
   %atomic = atomicrmw add ptr %ptr, i32 0 seq_cst
   ret i32 %atomic
 }
 
+define void @mfence() nounwind {
+; X86-LABEL: mfence:
+; X86:       # %bb.0:
+; X86-NEXT:    mfence
+; X86-NEXT:    retl
+;
+; X64-LABEL: mfence:
+; X64:       # %bb.0:
+; X64-NEXT:    mfence
+; X64-NEXT:    retq
+  call void @llvm.x86.sse2.mfence()
+  ret void
+}
+declare void @llvm.x86.sse2.mfence() nounwind readnone
+
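
For context, and not part of the patch itself: the lowering exercised by the updated tests is the one produced for a sequentially consistent thread fence. Below is a minimal C++ sketch of source code that feeds this path, with the expected x86-64 output under this change noted in comments; the function name is illustrative, and the exact stack offset used by the `lock or` is whatever the backend picks.

// Illustrative sketch only; it mirrors the FileCheck lines in the tests above.
#include <atomic>

void full_barrier() {
  // Emits `fence seq_cst` in LLVM IR. With this patch, x86-64 lowers it to
  //   lock orl $0, -N(%rsp)   (N is a small stack offset)
  // instead of `mfence`. Per the X86 check lines above, 32-bit x86 with SSE2
  // still emits `mfence`, and an explicit call to the llvm.x86.sse2.mfence
  // intrinsic still produces an `mfence` instruction on both targets.
  std::atomic_thread_fence(std::memory_order_seq_cst);
}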