Skip to content

Commit d95ac94

Browse files
vchuravy authored and giordano committed
[X86] Prefer lock or over mfence (llvm#106555)
Originally discussed in https://reviews.llvm.org/D129947. LLVM currently emits `mfence` for `__atomic_thread_fence(seq_cst)`. On modern CPUs `lock or` is more efficient and provides the same sequential consistency. GCC 11 made this switch as well (see https://gcc.gnu.org/pipermail/gcc-cvs/2020-July/314418.html), and https://reviews.llvm.org/D61863 and https://reviews.llvm.org/D58632 moved in this direction as well, but didn't touch `fence seq_cst`. This switches to `lock or` on all x64 systems, and leaves `__builtin_ia32_mfence` for folks who want this precise instruction. (cherry picked from commit b334321)
1 parent 0c6be71 commit d95ac94

File tree

7 files changed

+44
-20
lines changed

7 files changed

+44
-20
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31789,7 +31789,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
3178931789
// especially clever.
3179031790

3179131791
// Use `fence seq_cst` over `llvm.x86.sse2.mfence` here to get the correct
31792-
// lowering for SSID == SyncScope::SingleThread and !hasMFence
31792+
// lowering for SSID == SyncScope::SingleThread and avoidMFence || !hasMFence
3179331793
Builder.CreateFence(AtomicOrdering::SequentiallyConsistent, SSID);
3179431794

3179531795
// Finally we can emit the atomic load.
@@ -31878,7 +31878,7 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
3187831878
// cross-thread fence.
3187931879
if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
3188031880
FenceSSID == SyncScope::System) {
31881-
if (Subtarget.hasMFence())
31881+
if (!Subtarget.avoidMFence() && Subtarget.hasMFence())
3188231882
return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
3188331883

3188431884
SDValue Chain = Op.getOperand(0);

llvm/lib/Target/X86/X86Subtarget.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,9 @@ class X86Subtarget final : public X86GenSubtargetInfo {
280280
/// supports it.
281281
bool hasMFence() const { return hasSSE2() || is64Bit(); }
282282

283+
/// Avoid use of `mfence` for `fence seq_cst`, and instead use `lock or`.
284+
bool avoidMFence() const { return is64Bit(); }
285+
283286
const Triple &getTargetTriple() const { return TargetTriple; }
284287

285288
bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }

llvm/test/CodeGen/X86/atomic-idempotent.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
define i8 @add8(ptr %p) #0 {
1515
; X64-LABEL: add8:
1616
; X64: # %bb.0:
17-
; X64-NEXT: mfence
17+
; X64-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
1818
; X64-NEXT: movzbl (%rdi), %eax
1919
; X64-NEXT: retq
2020
;
@@ -47,7 +47,7 @@ define i8 @add8(ptr %p) #0 {
4747
define i16 @or16(ptr %p) #0 {
4848
; X64-LABEL: or16:
4949
; X64: # %bb.0:
50-
; X64-NEXT: mfence
50+
; X64-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
5151
; X64-NEXT: movzwl (%rdi), %eax
5252
; X64-NEXT: retq
5353
;
@@ -80,7 +80,7 @@ define i16 @or16(ptr %p) #0 {
8080
define i32 @xor32(ptr %p) #0 {
8181
; X64-LABEL: xor32:
8282
; X64: # %bb.0:
83-
; X64-NEXT: mfence
83+
; X64-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
8484
; X64-NEXT: movl (%rdi), %eax
8585
; X64-NEXT: retq
8686
;
@@ -113,7 +113,7 @@ define i32 @xor32(ptr %p) #0 {
113113
define i64 @sub64(ptr %p) #0 {
114114
; X64-LABEL: sub64:
115115
; X64: # %bb.0:
116-
; X64-NEXT: mfence
116+
; X64-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
117117
; X64-NEXT: movq (%rdi), %rax
118118
; X64-NEXT: retq
119119
;
@@ -265,7 +265,7 @@ define i128 @or128(ptr %p) #0 {
265265
define i32 @and32 (ptr %p) #0 {
266266
; X64-LABEL: and32:
267267
; X64: # %bb.0:
268-
; X64-NEXT: mfence
268+
; X64-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
269269
; X64-NEXT: movl (%rdi), %eax
270270
; X64-NEXT: retq
271271
;

llvm/test/CodeGen/X86/atomic-unordered.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2096,7 +2096,7 @@ define i64 @nofold_fence(ptr %p) {
20962096
; CHECK-LABEL: nofold_fence:
20972097
; CHECK: # %bb.0:
20982098
; CHECK-NEXT: movq (%rdi), %rax
2099-
; CHECK-NEXT: mfence
2099+
; CHECK-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
21002100
; CHECK-NEXT: addq $15, %rax
21012101
; CHECK-NEXT: retq
21022102
%v = load atomic i64, ptr %p unordered, align 8
@@ -2170,7 +2170,7 @@ define i64 @fold_constant_fence(i64 %arg) {
21702170
; CHECK-LABEL: fold_constant_fence:
21712171
; CHECK: # %bb.0:
21722172
; CHECK-NEXT: movq Constant(%rip), %rax
2173-
; CHECK-NEXT: mfence
2173+
; CHECK-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
21742174
; CHECK-NEXT: addq %rdi, %rax
21752175
; CHECK-NEXT: retq
21762176
%v = load atomic i64, ptr @Constant unordered, align 8
@@ -2197,7 +2197,7 @@ define i64 @fold_invariant_fence(ptr dereferenceable(8) %p, i64 %arg) {
21972197
; CHECK-LABEL: fold_invariant_fence:
21982198
; CHECK: # %bb.0:
21992199
; CHECK-NEXT: movq (%rdi), %rax
2200-
; CHECK-NEXT: mfence
2200+
; CHECK-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
22012201
; CHECK-NEXT: addq %rsi, %rax
22022202
; CHECK-NEXT: retq
22032203
%v = load atomic i64, ptr %p unordered, align 8, !invariant.load !{}
@@ -2321,7 +2321,7 @@ define i1 @fold_cmp_over_fence(ptr %p, i32 %v1) {
23212321
; CHECK-O0-LABEL: fold_cmp_over_fence:
23222322
; CHECK-O0: # %bb.0:
23232323
; CHECK-O0-NEXT: movl (%rdi), %eax
2324-
; CHECK-O0-NEXT: mfence
2324+
; CHECK-O0-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
23252325
; CHECK-O0-NEXT: cmpl %eax, %esi
23262326
; CHECK-O0-NEXT: jne .LBB116_2
23272327
; CHECK-O0-NEXT: # %bb.1: # %taken
@@ -2335,7 +2335,7 @@ define i1 @fold_cmp_over_fence(ptr %p, i32 %v1) {
23352335
; CHECK-O3-LABEL: fold_cmp_over_fence:
23362336
; CHECK-O3: # %bb.0:
23372337
; CHECK-O3-NEXT: movl (%rdi), %eax
2338-
; CHECK-O3-NEXT: mfence
2338+
; CHECK-O3-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
23392339
; CHECK-O3-NEXT: cmpl %eax, %esi
23402340
; CHECK-O3-NEXT: jne .LBB116_2
23412341
; CHECK-O3-NEXT: # %bb.1: # %taken

llvm/test/CodeGen/X86/implicit-null-check.ll

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
12
; RUN: llc -verify-machineinstrs -O3 -mtriple=x86_64-apple-macosx -enable-implicit-null-checks < %s | FileCheck %s
23

34
define i32 @imp_null_check_load(ptr %x) {
@@ -465,7 +466,7 @@ define i32 @imp_null_check_load_fence2(ptr %x) {
465466
; CHECK-NEXT: testq %rdi, %rdi
466467
; CHECK-NEXT: je LBB17_1
467468
; CHECK-NEXT: ## %bb.2: ## %not_null
468-
; CHECK-NEXT: mfence
469+
; CHECK-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
469470
; CHECK-NEXT: movl (%rdi), %eax
470471
; CHECK-NEXT: retq
471472
; CHECK-NEXT: LBB17_1: ## %is_null

llvm/test/CodeGen/X86/membarrier.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,9 @@ define i32 @t() {
66
; CHECK-LABEL: t:
77
; CHECK: # %bb.0:
88
; CHECK-NEXT: movl $1, -{{[0-9]+}}(%rsp)
9-
; CHECK-NEXT: mfence
9+
; CHECK-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
1010
; CHECK-NEXT: lock decl -{{[0-9]+}}(%rsp)
11-
; CHECK-NEXT: mfence
11+
; CHECK-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
1212
; CHECK-NEXT: xorl %eax, %eax
1313
; CHECK-NEXT: retq
1414
%i = alloca i32, align 4

llvm/test/CodeGen/X86/mfence.ll

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,15 @@
55
; It doesn't matter if an x86-64 target has specified "no-sse2"; we still can use mfence.
66

77
define void @test() {
8-
; CHECK-LABEL: test:
9-
; CHECK: # %bb.0:
10-
; CHECK-NEXT: mfence
11-
; CHECK-NEXT: ret{{[l|q]}}
8+
; X86-LABEL: test:
9+
; X86: # %bb.0:
10+
; X86-NEXT: mfence
11+
; X86-NEXT: retl
12+
;
13+
; X64-LABEL: test:
14+
; X64: # %bb.0:
15+
; X64-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
16+
; X64-NEXT: retq
1217
fence seq_cst
1318
ret void
1419
}
@@ -23,10 +28,25 @@ define i32 @fence(ptr %ptr) {
2328
;
2429
; X64-LABEL: fence:
2530
; X64: # %bb.0:
26-
; X64-NEXT: mfence
31+
; X64-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
2732
; X64-NEXT: movl (%rdi), %eax
2833
; X64-NEXT: retq
2934
%atomic = atomicrmw add ptr %ptr, i32 0 seq_cst
3035
ret i32 %atomic
3136
}
3237

38+
define void @mfence() nounwind {
39+
; X32-LABEL: mfence:
40+
; X32: # %bb.0:
41+
; X32-NEXT: mfence
42+
; X32-NEXT: retl
43+
;
44+
; CHECK-LABEL: mfence:
45+
; CHECK: # %bb.0:
46+
; CHECK-NEXT: mfence
47+
; CHECK-NEXT: ret{{[l|q]}}
48+
call void @llvm.x86.sse2.mfence()
49+
ret void
50+
}
51+
declare void @llvm.x86.sse2.mfence() nounwind readnone
52+

0 commit comments

Comments
 (0)