[X86] Prefer `lock or` over `mfence` #106555
Conversation
@llvm/pr-subscribers-backend-x86

Author: Valentin Churavy (vchuravy)

Changes

Originally opened as https://reviews.llvm.org/D129947

LLVM currently emits `mfence` for `__atomic_thread_fence(seq_cst)`. On modern CPUs `lock or` is more efficient and provides the same sequential consistency. GCC 11 made this switch as well (see https://gcc.gnu.org/pipermail/gcc-cvs/2020-July/314418.html), and https://reviews.llvm.org/D61863 and https://reviews.llvm.org/D58632 moved in this direction as well, but didn't touch fence seq_cst.

Amusingly this came up elsewhere: https://www.reddit.com/r/cpp_questions/comments/16uer2g/how_do_i_stop_clang_generating_mfence/

After another 2 years it doesn't look like anyone complained about the GCC switch. And there is still `__builtin_ia32_mfence` for folks who want this precise instruction.

Full diff: https://github.com/llvm/llvm-project/pull/106555.diff

3 Files Affected:
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 988966fa6a6c46..dfa534a69e7024 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -754,6 +754,10 @@ def TuningUseGLMDivSqrtCosts
def TuningBranchHint: SubtargetFeature<"branch-hint", "HasBranchHint", "true",
"Target has branch hint feature">;
+def TuningAvoidMFENCE
+ : SubtargetFeature<"avoid-mfence", "AvoidMFence", "true",
+ "Avoid MFENCE for fence seq_cst, and instead use lock or">;
+
//===----------------------------------------------------------------------===//
// X86 CPU Families
// TODO: Remove these - use general tuning features to determine codegen.
@@ -882,7 +886,8 @@ def ProcessorFeatures {
list<SubtargetFeature> NHMTuning = [TuningMacroFusion,
TuningSlowDivide64,
TuningInsertVZEROUPPER,
- TuningNoDomainDelayMov];
+ TuningNoDomainDelayMov,
+ TuningAvoidMFENCE];
// Westmere
list<SubtargetFeature> WSMAdditionalFeatures = [FeaturePCLMUL];
@@ -903,7 +908,8 @@ def ProcessorFeatures {
TuningFast15ByteNOP,
TuningPOPCNTFalseDeps,
TuningInsertVZEROUPPER,
- TuningNoDomainDelayMov];
+ TuningNoDomainDelayMov,
+ TuningAvoidMFENCE];
list<SubtargetFeature> SNBFeatures =
!listconcat(WSMFeatures, SNBAdditionalFeatures);
@@ -969,7 +975,8 @@ def ProcessorFeatures {
TuningAllowLight256Bit,
TuningNoDomainDelayMov,
TuningNoDomainDelayShuffle,
- TuningNoDomainDelayBlend];
+ TuningNoDomainDelayBlend,
+ TuningAvoidMFENCE];
list<SubtargetFeature> SKLFeatures =
!listconcat(BDWFeatures, SKLAdditionalFeatures);
@@ -1004,7 +1011,8 @@ def ProcessorFeatures {
TuningNoDomainDelayMov,
TuningNoDomainDelayShuffle,
TuningNoDomainDelayBlend,
- TuningFastImmVectorShift];
+ TuningFastImmVectorShift,
+ TuningAvoidMFENCE];
list<SubtargetFeature> SKXFeatures =
!listconcat(BDWFeatures, SKXAdditionalFeatures);
@@ -1047,7 +1055,8 @@ def ProcessorFeatures {
TuningNoDomainDelayMov,
TuningNoDomainDelayShuffle,
TuningNoDomainDelayBlend,
- TuningFastImmVectorShift];
+ TuningFastImmVectorShift,
+ TuningAvoidMFENCE];
list<SubtargetFeature> CNLFeatures =
!listconcat(SKLFeatures, CNLAdditionalFeatures);
@@ -1076,7 +1085,8 @@ def ProcessorFeatures {
TuningNoDomainDelayMov,
TuningNoDomainDelayShuffle,
TuningNoDomainDelayBlend,
- TuningFastImmVectorShift];
+ TuningFastImmVectorShift,
+ TuningAvoidMFENCE];
list<SubtargetFeature> ICLFeatures =
!listconcat(CNLFeatures, ICLAdditionalFeatures);
@@ -1222,7 +1232,8 @@ def ProcessorFeatures {
// Tremont
list<SubtargetFeature> TRMAdditionalFeatures = [FeatureCLWB,
FeatureGFNI];
- list<SubtargetFeature> TRMTuning = GLPTuning;
+ list<SubtargetFeature> TRMAdditionalTuning = [TuningAvoidMFENCE];
+ list<SubtargetFeature> TRMTuning = !listconcat(GLPTuning, TRMAdditionalTuning);
list<SubtargetFeature> TRMFeatures =
!listconcat(GLPFeatures, TRMAdditionalFeatures);
@@ -1429,7 +1440,8 @@ def ProcessorFeatures {
TuningFastScalarShiftMasks,
TuningBranchFusion,
TuningSBBDepBreaking,
- TuningInsertVZEROUPPER];
+ TuningInsertVZEROUPPER,
+ TuningAvoidMFENCE];
// PileDriver
list<SubtargetFeature> BdVer2AdditionalFeatures = [FeatureF16C,
@@ -1509,7 +1521,8 @@ def ProcessorFeatures {
TuningSlowSHLD,
TuningSBBDepBreaking,
TuningInsertVZEROUPPER,
- TuningAllowLight256Bit];
+ TuningAllowLight256Bit,
+ TuningAvoidMFENCE];
list<SubtargetFeature> ZN2AdditionalFeatures = [FeatureCLWB,
FeatureRDPID,
FeatureRDPRU,
@@ -1664,7 +1677,8 @@ def : ProcModel<"nocona", GenericPostRAModel, [
],
[
TuningSlowUAMem16,
- TuningInsertVZEROUPPER
+ TuningInsertVZEROUPPER,
+ TuningAvoidMFENCE
]>;
// Intel Core 2 Solo/Duo.
@@ -1684,7 +1698,8 @@ def : ProcModel<P, SandyBridgeModel, [
[
TuningMacroFusion,
TuningSlowUAMem16,
- TuningInsertVZEROUPPER
+ TuningInsertVZEROUPPER,
+ TuningAvoidMFENCE
]>;
}
foreach P = ["penryn", "core_2_duo_sse4_1"] in {
@@ -1703,7 +1718,8 @@ def : ProcModel<P, SandyBridgeModel, [
[
TuningMacroFusion,
TuningSlowUAMem16,
- TuningInsertVZEROUPPER
+ TuningInsertVZEROUPPER,
+ TuningAvoidMFENCE
]>;
}
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f011249d295040..aade718c1efe80 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -31103,7 +31103,7 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
// cross-thread fence.
if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
FenceSSID == SyncScope::System) {
- if (Subtarget.hasMFence())
+ if (!Subtarget.avoidMFence() && Subtarget.hasMFence())
return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
SDValue Chain = Op.getOperand(0);
diff --git a/llvm/test/CodeGen/X86/atomic-unordered.ll b/llvm/test/CodeGen/X86/atomic-unordered.ll
index 3fb994cdb751a3..e8e0ee0b7ef492 100644
--- a/llvm/test/CodeGen/X86/atomic-unordered.ll
+++ b/llvm/test/CodeGen/X86/atomic-unordered.ll
@@ -2096,7 +2096,7 @@ define i64 @nofold_fence(ptr %p) {
; CHECK-LABEL: nofold_fence:
; CHECK: # %bb.0:
; CHECK-NEXT: movq (%rdi), %rax
-; CHECK-NEXT: mfence
+; CHECK-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: addq $15, %rax
; CHECK-NEXT: retq
%v = load atomic i64, ptr %p unordered, align 8
@@ -2170,7 +2170,7 @@ define i64 @fold_constant_fence(i64 %arg) {
; CHECK-LABEL: fold_constant_fence:
; CHECK: # %bb.0:
; CHECK-NEXT: movq Constant(%rip), %rax
-; CHECK-NEXT: mfence
+; CHECK-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: addq %rdi, %rax
; CHECK-NEXT: retq
%v = load atomic i64, ptr @Constant unordered, align 8
@@ -2197,7 +2197,7 @@ define i64 @fold_invariant_fence(ptr dereferenceable(8) %p, i64 %arg) {
; CHECK-LABEL: fold_invariant_fence:
; CHECK: # %bb.0:
; CHECK-NEXT: movq (%rdi), %rax
-; CHECK-NEXT: mfence
+; CHECK-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: addq %rsi, %rax
; CHECK-NEXT: retq
%v = load atomic i64, ptr %p unordered, align 8, !invariant.load !{}
@@ -2321,7 +2321,7 @@ define i1 @fold_cmp_over_fence(ptr %p, i32 %v1) {
; CHECK-O0-LABEL: fold_cmp_over_fence:
; CHECK-O0: # %bb.0:
; CHECK-O0-NEXT: movl (%rdi), %eax
-; CHECK-O0-NEXT: mfence
+; CHECK-O0-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
; CHECK-O0-NEXT: cmpl %eax, %esi
; CHECK-O0-NEXT: jne .LBB116_2
; CHECK-O0-NEXT: # %bb.1: # %taken
@@ -2335,7 +2335,7 @@ define i1 @fold_cmp_over_fence(ptr %p, i32 %v1) {
; CHECK-O3-LABEL: fold_cmp_over_fence:
; CHECK-O3: # %bb.0:
; CHECK-O3-NEXT: movl (%rdi), %eax
-; CHECK-O3-NEXT: mfence
+; CHECK-O3-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
; CHECK-O3-NEXT: cmpl %eax, %esi
; CHECK-O3-NEXT: jne .LBB116_2
; CHECK-O3-NEXT: # %bb.1: # %taken
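For readers skimming the test updates above: the replacement sequence is an idempotent locked read-modify-write of a stack slot, used only for its fencing effect. A minimal sketch in GNU extended asm (illustrative only; the compiler picks its own scratch slot below `%rsp`, not this exact operand):

```cpp
// Sketch of the `lock or`-as-fence idiom, assuming a GCC/Clang x86 target.
// ORing 0 into memory changes nothing, but the LOCK prefix makes it a
// serializing atomic RMW, which gives the same ordering as a seq_cst fence.
void lock_or_fence() {
  unsigned dummy = 0;
  asm volatile("lock orl $0, %0" : "+m"(dummy) : : "memory");
}
```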
Force-pushed from 0bfeaaf to 796b0c1
Thank you
Force-pushed from 796b0c1 to d32af20
On a Genoa machine (AMD EPYC 9384X), a benchmark of mine takes 14.13s to execute with seq_cst defaulting to
Force-pushed from d32af20 to 707ca0e
Friendly bump! Can we get another round of review on this PR? Thanks!
Force-pushed from 707ca0e to 522ff6e
This extends the optimization to scenarios where the subtarget has `!hasMFence` or we have SyncScope SingleThread, by avoiding direct usage of `llvm.x86.sse2.mfence`. Originally part of #106555.
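For context, a minimal C++ sketch of the two fence flavors that distinction covers (function names here are made up for illustration): a cross-thread seq_cst fence needs a real ordering instruction, while a single-thread fence only needs a compiler barrier.

```cpp
#include <atomic>

// Cross-thread fence: lowers to a real ordering instruction on x86-64
// (historically mfence, with this patch series typically `lock orl $0`).
void full_fence() {
  std::atomic_thread_fence(std::memory_order_seq_cst);
}

// Single-thread ("signal") fence: only a compiler barrier, so no mfence
// or lock-prefixed instruction needs to be emitted at all.
void signal_fence() {
  std::atomic_signal_fence(std::memory_order_seq_cst);
}
```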
Force-pushed from 522ff6e to b5f6005
We try to only use X32 for the gnux32 triple. Noticed while reviewing #106555.
Instead of a tuning flag I'm very tempted to say that we create a general avoidMFence() method that always returns true for 64-bit targets - similar to what we do for hasCLFLUSH() etc.
@phoebewang any thoughts?
Sounds good. I don't like the tedious tunings for all modern targets.
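A rough sketch of the kind of predicate being suggested (names and structure are illustrative, not the exact LLVM code): instead of a per-CPU tuning flag, a hasCLFLUSH()-style subtarget helper that simply prefers `lock or` on every 64-bit target.

```cpp
#include <cassert>

// Hypothetical, simplified model of the suggestion above.
struct SubtargetModel {
  bool Is64Bit = false;

  bool is64Bit() const { return Is64Bit; }

  // Prefer `lock orl $0, (%rsp)` over mfence on every x86-64 target.
  bool avoidMFence() const { return is64Bit(); }
};

int main() {
  assert(SubtargetModel{/*Is64Bit=*/true}.avoidMFence());
  assert(!SubtargetModel{/*Is64Bit=*/false}.avoidMFence());
}
```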
Force-pushed from b5f6005 to 422a1d1
@phoebewang @RKSimon adjusted it to be that. The only thing "lost" is the ability to flip it back.
LGTM - cheers
We're seeing a ~40% regression in a macrobenchmark after this change. Is Skylake-X considered modern enough to benefit from `lock or`?
All Core architectures should benefit from this change. Is your macrobenchmark single-threaded or contention heavy? Skylake-X has 33 cycles for mfence and 18 for lock or according to uops.info.
Thanks for the information. I don't know much about the nature of the benchmark, except that it deals with I/O. It's being looked at now by someone else. I just wanted to give an early heads-up of a possible problem with this change.
One thing to confirm is whether the benchmark uses non-temporal stores in some way, and additionally whether GCC has a similar regression (they made this change a few years ahead of us).
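For anyone checking for that pattern, a minimal sketch of the non-temporal-store case in question (illustrative names; the usual caveat is that weakly-ordered streaming stores should carry their own `sfence`/`mfence` rather than rely on how a seq_cst thread fence happens to be lowered):

```cpp
#include <emmintrin.h>  // SSE2: _mm_stream_si32, _mm_sfence

// Publish data written with a non-temporal (write-combining) store.
// Streaming stores are weakly ordered, so this code drains them with an
// explicit _mm_sfence() before the flag store that readers poll on.
void publish(int *data, volatile int *flag) {
  _mm_stream_si32(data, 42);  // non-temporal store
  _mm_sfence();               // make the streaming store globally visible
  *flag = 1;                  // ordinary store observed by readers
}
```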
Originally opened as https://reviews.llvm.org/D129947
LLVM currently emits `mfence` for `__atomic_thread_fence(seq_cst)`. On modern CPUs `lock or` is more efficient and provides the same sequential consistency. GCC 11 made this switch as well (see https://gcc.gnu.org/pipermail/gcc-cvs/2020-July/314418.html), and https://reviews.llvm.org/D61863 and https://reviews.llvm.org/D58632 moved in this direction as well, but didn't touch fence seq_cst.

Amusingly this came up elsewhere: https://www.reddit.com/r/cpp_questions/comments/16uer2g/how_do_i_stop_clang_generating_mfence/

After another 2 years it doesn't look like anyone complained about the GCC switch. And there is still `__builtin_ia32_mfence` for folks who want this precise instruction.
Fixes #91731
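As a quick illustration of the user-facing constructs involved (a sketch; the exact asm depends on the target and optimization level):

```cpp
#include <atomic>

// With this change, a seq_cst thread fence on x86-64 is expected to lower
// to `lock orl $0, -N(%rsp)` rather than `mfence`.
void fence_seq_cst() {
  std::atomic_thread_fence(std::memory_order_seq_cst);
}

// The escape hatch stays: callers who want the precise instruction can
// still request it directly via the x86 builtin.
void fence_mfence() {
  __builtin_ia32_mfence();
}
```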