Skip to content

Commit cc0e78a

Browse files
committed
[X86] Allow speculative BSR/BSF instructions on targets with CMOV
1 parent 254da5a commit cc0e78a

20 files changed

+516
-682
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

+2 -2
Original file line number | Diff line number | Diff line change
@@ -3239,14 +3239,14 @@ bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,
32393239

32403240
bool X86TargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
32413241
// Speculate cttz only if we can directly use TZCNT or can promote to i32/i64.
3242-
return Subtarget.hasBMI() ||
3242+
return Subtarget.hasBMI() || Subtarget.canUseCMOV() ||
32433243
(!Ty->isVectorTy() &&
32443244
Ty->getScalarSizeInBits() < (Subtarget.is64Bit() ? 64u : 32u));
32453245
}
32463246

32473247
bool X86TargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
32483248
// Speculate ctlz only if we can directly use LZCNT.
3249-
return Subtarget.hasLZCNT();
3249+
return Subtarget.hasLZCNT() || Subtarget.canUseCMOV();
32503250
}
32513251

32523252
bool X86TargetLowering::ShouldShrinkFPConstant(EVT VT) const {

llvm/lib/Target/X86/X86TargetTransformInfo.cpp

+5 -5
Original file line number | Diff line number | Diff line change
@@ -4210,9 +4210,9 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
42104210
{ ISD::ABS, MVT::i64, { 1, 2, 3, 3 } }, // SUB+CMOV
42114211
{ ISD::BITREVERSE, MVT::i64, { 10, 12, 20, 22 } },
42124212
{ ISD::BSWAP, MVT::i64, { 1, 2, 1, 2 } },
4213-
{ ISD::CTLZ, MVT::i64, { 3, 2, 6, 6 } }, // BSR+XOR or BSR+XOR+CMOV
4213+
{ ISD::CTLZ, MVT::i64, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV
42144214
{ ISD::CTLZ_ZERO_UNDEF, MVT::i64,{ 1, 2, 2, 2 } }, // BSR+XOR
4215-
{ ISD::CTTZ, MVT::i64, { 2, 2, 5, 5 } }, // TEST+BSF+CMOV/BRANCH
4215+
{ ISD::CTTZ, MVT::i64, { 2, 2, 3, 4 } }, // TEST+BSF+CMOV/BRANCH
42164216
{ ISD::CTTZ_ZERO_UNDEF, MVT::i64,{ 1, 2, 1, 2 } }, // BSF
42174217
{ ISD::CTPOP, MVT::i64, { 10, 6, 19, 19 } },
42184218
{ ISD::ROTL, MVT::i64, { 2, 3, 1, 3 } },
@@ -4241,9 +4241,9 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
42414241
{ ISD::BITREVERSE, MVT::i8, { 7, 9, 13, 14 } },
42424242
{ ISD::BSWAP, MVT::i32, { 1, 1, 1, 1 } },
42434243
{ ISD::BSWAP, MVT::i16, { 1, 2, 1, 2 } }, // ROL
4244-
{ ISD::CTLZ, MVT::i32, { 3, 2, 6, 6 } }, // BSR+XOR or BSR+XOR+CMOV
4245-
{ ISD::CTLZ, MVT::i16, { 3, 2, 6, 6 } }, // BSR+XOR or BSR+XOR+CMOV
4246-
{ ISD::CTLZ, MVT::i8, { 3, 2, 7, 7 } }, // BSR+XOR or BSR+XOR+CMOV
4244+
{ ISD::CTLZ, MVT::i32, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV
4245+
{ ISD::CTLZ, MVT::i16, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV
4246+
{ ISD::CTLZ, MVT::i8, { 2, 2, 5, 6 } }, // BSR+XOR or BSR+XOR+CMOV
42474247
{ ISD::CTLZ_ZERO_UNDEF, MVT::i32,{ 1, 2, 2, 2 } }, // BSR+XOR
42484248
{ ISD::CTLZ_ZERO_UNDEF, MVT::i16,{ 2, 2, 2, 2 } }, // BSR+XOR
42494249
{ ISD::CTLZ_ZERO_UNDEF, MVT::i8, { 2, 2, 3, 3 } }, // BSR+XOR

llvm/test/Analysis/CostModel/X86/ctlz-codesize.ll

+4 -4
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,7 @@ declare i8 @llvm.ctlz.i8(i8, i1)
1717

1818
define i64 @var_ctlz_i64(i64 %a) {
1919
; NOLZCNT-LABEL: 'var_ctlz_i64'
20-
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
20+
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
2121
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ctlz
2222
;
2323
; LZCNT-LABEL: 'var_ctlz_i64'
@@ -43,7 +43,7 @@ define i64 @var_ctlz_i64u(i64 %a) {
4343

4444
define i32 @var_ctlz_i32(i32 %a) {
4545
; NOLZCNT-LABEL: 'var_ctlz_i32'
46-
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
46+
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
4747
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %ctlz
4848
;
4949
; LZCNT-LABEL: 'var_ctlz_i32'
@@ -69,7 +69,7 @@ define i32 @var_ctlz_i32u(i32 %a) {
6969

7070
define i16 @var_ctlz_i16(i16 %a) {
7171
; NOLZCNT-LABEL: 'var_ctlz_i16'
72-
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false)
72+
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false)
7373
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %ctlz
7474
;
7575
; LZCNT-LABEL: 'var_ctlz_i16'
@@ -95,7 +95,7 @@ define i16 @var_ctlz_i16u(i16 %a) {
9595

9696
define i8 @var_ctlz_i8(i8 %a) {
9797
; NOLZCNT-LABEL: 'var_ctlz_i8'
98-
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false)
98+
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false)
9999
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %ctlz
100100
;
101101
; LZCNT-LABEL: 'var_ctlz_i8'

llvm/test/Analysis/CostModel/X86/ctlz-sizelatency.ll

+4 -4
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,7 @@ declare i8 @llvm.ctlz.i8(i8, i1)
1717

1818
define i64 @var_ctlz_i64(i64 %a) {
1919
; NOLZCNT-LABEL: 'var_ctlz_i64'
20-
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
20+
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
2121
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ctlz
2222
;
2323
; LZCNT-LABEL: 'var_ctlz_i64'
@@ -43,7 +43,7 @@ define i64 @var_ctlz_i64u(i64 %a) {
4343

4444
define i32 @var_ctlz_i32(i32 %a) {
4545
; NOLZCNT-LABEL: 'var_ctlz_i32'
46-
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
46+
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
4747
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %ctlz
4848
;
4949
; LZCNT-LABEL: 'var_ctlz_i32'
@@ -69,7 +69,7 @@ define i32 @var_ctlz_i32u(i32 %a) {
6969

7070
define i16 @var_ctlz_i16(i16 %a) {
7171
; NOLZCNT-LABEL: 'var_ctlz_i16'
72-
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false)
72+
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false)
7373
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %ctlz
7474
;
7575
; LZCNT-LABEL: 'var_ctlz_i16'
@@ -95,7 +95,7 @@ define i16 @var_ctlz_i16u(i16 %a) {
9595

9696
define i8 @var_ctlz_i8(i8 %a) {
9797
; NOLZCNT-LABEL: 'var_ctlz_i8'
98-
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false)
98+
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false)
9999
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %ctlz
100100
;
101101
; LZCNT-LABEL: 'var_ctlz_i8'

llvm/test/Analysis/CostModel/X86/ctlz.ll

+4 -4
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,7 @@ declare i8 @llvm.ctlz.i8(i8, i1)
1717

1818
define i64 @var_ctlz_i64(i64 %a) {
1919
; NOLZCNT-LABEL: 'var_ctlz_i64'
20-
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
20+
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false)
2121
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %ctlz
2222
;
2323
; LZCNT-LABEL: 'var_ctlz_i64'
@@ -43,7 +43,7 @@ define i64 @var_ctlz_i64u(i64 %a) {
4343

4444
define i32 @var_ctlz_i32(i32 %a) {
4545
; NOLZCNT-LABEL: 'var_ctlz_i32'
46-
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
46+
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false)
4747
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %ctlz
4848
;
4949
; LZCNT-LABEL: 'var_ctlz_i32'
@@ -69,7 +69,7 @@ define i32 @var_ctlz_i32u(i32 %a) {
6969

7070
define i16 @var_ctlz_i16(i16 %a) {
7171
; NOLZCNT-LABEL: 'var_ctlz_i16'
72-
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false)
72+
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false)
7373
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %ctlz
7474
;
7575
; LZCNT-LABEL: 'var_ctlz_i16'
@@ -95,7 +95,7 @@ define i16 @var_ctlz_i16u(i16 %a) {
9595

9696
define i8 @var_ctlz_i8(i8 %a) {
9797
; NOLZCNT-LABEL: 'var_ctlz_i8'
98-
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false)
98+
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false)
9999
; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %ctlz
100100
;
101101
; LZCNT-LABEL: 'var_ctlz_i8'

llvm/test/Analysis/CostModel/X86/cttz-codesize.ll

+1 -1
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,7 @@ declare i8 @llvm.cttz.i8(i8, i1)
1818

1919
define i64 @var_cttz_i64(i64 %a) {
2020
; NOBMI-LABEL: 'var_cttz_i64'
21-
; NOBMI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false)
21+
; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false)
2222
; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %cttz
2323
;
2424
; BMI-LABEL: 'var_cttz_i64'

llvm/test/Analysis/CostModel/X86/cttz-sizelatency.ll

+1 -1
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,7 @@ declare i8 @llvm.cttz.i8(i8, i1)
1818

1919
define i64 @var_cttz_i64(i64 %a) {
2020
; NOBMI-LABEL: 'var_cttz_i64'
21-
; NOBMI-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false)
21+
; NOBMI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false)
2222
; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %cttz
2323
;
2424
; BMI-LABEL: 'var_cttz_i64'

llvm/test/CodeGen/X86/atomic-bit-test.ll

-1
Original file line number | Diff line number | Diff line change
@@ -582,7 +582,6 @@ define i32 @split_hoist_and(i32 %0) nounwind {
582582
; X64-NEXT: lock btsl $3, v32(%rip)
583583
; X64-NEXT: setb %al
584584
; X64-NEXT: shll $3, %eax
585-
; X64-NEXT: testl %edi, %edi
586585
; X64-NEXT: retq
587586
%2 = atomicrmw or ptr @v32, i32 8 monotonic, align 4
588587
%3 = tail call i32 @llvm.ctlz.i32(i32 %0, i1 false)

llvm/test/CodeGen/X86/bit_ceil.ll

+17 -36
Original file line number | Diff line number | Diff line change
@@ -8,16 +8,12 @@
88
define i32 @bit_ceil_i32(i32 %x) {
99
; NOBMI-LABEL: bit_ceil_i32:
1010
; NOBMI: # %bb.0:
11-
; NOBMI-NEXT: movl %edi, %eax
12-
; NOBMI-NEXT: decl %eax
13-
; NOBMI-NEXT: je .LBB0_1
14-
; NOBMI-NEXT: # %bb.2: # %cond.false
15-
; NOBMI-NEXT: bsrl %eax, %ecx
11+
; NOBMI-NEXT: # kill: def $edi killed $edi def $rdi
12+
; NOBMI-NEXT: leal -1(%rdi), %eax
13+
; NOBMI-NEXT: bsrl %eax, %eax
14+
; NOBMI-NEXT: movl $63, %ecx
15+
; NOBMI-NEXT: cmovnel %eax, %ecx
1616
; NOBMI-NEXT: xorl $31, %ecx
17-
; NOBMI-NEXT: jmp .LBB0_3
18-
; NOBMI-NEXT: .LBB0_1:
19-
; NOBMI-NEXT: movl $32, %ecx
20-
; NOBMI-NEXT: .LBB0_3: # %cond.end
2117
; NOBMI-NEXT: negb %cl
2218
; NOBMI-NEXT: movl $1, %edx
2319
; NOBMI-NEXT: movl $1, %eax
@@ -51,15 +47,10 @@ define i32 @bit_ceil_i32(i32 %x) {
5147
define i32 @bit_ceil_i32_plus1(i32 noundef %x) {
5248
; NOBMI-LABEL: bit_ceil_i32_plus1:
5349
; NOBMI: # %bb.0: # %entry
54-
; NOBMI-NEXT: testl %edi, %edi
55-
; NOBMI-NEXT: je .LBB1_1
56-
; NOBMI-NEXT: # %bb.2: # %cond.false
57-
; NOBMI-NEXT: bsrl %edi, %ecx
50+
; NOBMI-NEXT: bsrl %edi, %eax
51+
; NOBMI-NEXT: movl $63, %ecx
52+
; NOBMI-NEXT: cmovnel %eax, %ecx
5853
; NOBMI-NEXT: xorl $31, %ecx
59-
; NOBMI-NEXT: jmp .LBB1_3
60-
; NOBMI-NEXT: .LBB1_1:
61-
; NOBMI-NEXT: movl $32, %ecx
62-
; NOBMI-NEXT: .LBB1_3: # %cond.end
6354
; NOBMI-NEXT: negb %cl
6455
; NOBMI-NEXT: movl $1, %edx
6556
; NOBMI-NEXT: movl $1, %eax
@@ -94,16 +85,11 @@ entry:
9485
define i64 @bit_ceil_i64(i64 %x) {
9586
; NOBMI-LABEL: bit_ceil_i64:
9687
; NOBMI: # %bb.0:
97-
; NOBMI-NEXT: movq %rdi, %rax
98-
; NOBMI-NEXT: decq %rax
99-
; NOBMI-NEXT: je .LBB2_1
100-
; NOBMI-NEXT: # %bb.2: # %cond.false
101-
; NOBMI-NEXT: bsrq %rax, %rcx
102-
; NOBMI-NEXT: xorq $63, %rcx
103-
; NOBMI-NEXT: jmp .LBB2_3
104-
; NOBMI-NEXT: .LBB2_1:
105-
; NOBMI-NEXT: movl $64, %ecx
106-
; NOBMI-NEXT: .LBB2_3: # %cond.end
88+
; NOBMI-NEXT: leaq -1(%rdi), %rax
89+
; NOBMI-NEXT: bsrq %rax, %rax
90+
; NOBMI-NEXT: movl $127, %ecx
91+
; NOBMI-NEXT: cmovneq %rax, %rcx
92+
; NOBMI-NEXT: xorl $63, %ecx
10793
; NOBMI-NEXT: negb %cl
10894
; NOBMI-NEXT: movl $1, %edx
10995
; NOBMI-NEXT: movl $1, %eax
@@ -136,15 +122,10 @@ define i64 @bit_ceil_i64(i64 %x) {
136122
define i64 @bit_ceil_i64_plus1(i64 noundef %x) {
137123
; NOBMI-LABEL: bit_ceil_i64_plus1:
138124
; NOBMI: # %bb.0: # %entry
139-
; NOBMI-NEXT: testq %rdi, %rdi
140-
; NOBMI-NEXT: je .LBB3_1
141-
; NOBMI-NEXT: # %bb.2: # %cond.false
142-
; NOBMI-NEXT: bsrq %rdi, %rcx
143-
; NOBMI-NEXT: xorq $63, %rcx
144-
; NOBMI-NEXT: jmp .LBB3_3
145-
; NOBMI-NEXT: .LBB3_1:
146-
; NOBMI-NEXT: movl $64, %ecx
147-
; NOBMI-NEXT: .LBB3_3: # %cond.end
125+
; NOBMI-NEXT: bsrq %rdi, %rax
126+
; NOBMI-NEXT: movl $127, %ecx
127+
; NOBMI-NEXT: cmovneq %rax, %rcx
128+
; NOBMI-NEXT: xorl $63, %ecx
148129
; NOBMI-NEXT: negb %cl
149130
; NOBMI-NEXT: movl $1, %edx
150131
; NOBMI-NEXT: movl $1, %eax

llvm/test/CodeGen/X86/combine-or.ll

+20 -27
Original file line number | Diff line number | Diff line change
@@ -213,21 +213,18 @@ define i64 @PR89533(<64 x i8> %a0) {
213213
; SSE-NEXT: shll $16, %ecx
214214
; SSE-NEXT: orl %eax, %ecx
215215
; SSE-NEXT: pcmpeqb %xmm4, %xmm2
216-
; SSE-NEXT: pmovmskb %xmm2, %edx
217-
; SSE-NEXT: xorl $65535, %edx # imm = 0xFFFF
216+
; SSE-NEXT: pmovmskb %xmm2, %eax
217+
; SSE-NEXT: xorl $65535, %eax # imm = 0xFFFF
218218
; SSE-NEXT: pcmpeqb %xmm4, %xmm3
219-
; SSE-NEXT: pmovmskb %xmm3, %eax
220-
; SSE-NEXT: notl %eax
221-
; SSE-NEXT: shll $16, %eax
222-
; SSE-NEXT: orl %edx, %eax
223-
; SSE-NEXT: shlq $32, %rax
224-
; SSE-NEXT: orq %rcx, %rax
225-
; SSE-NEXT: je .LBB11_2
226-
; SSE-NEXT: # %bb.1: # %cond.false
227-
; SSE-NEXT: rep bsfq %rax, %rax
228-
; SSE-NEXT: retq
229-
; SSE-NEXT: .LBB11_2: # %cond.end
219+
; SSE-NEXT: pmovmskb %xmm3, %edx
220+
; SSE-NEXT: notl %edx
221+
; SSE-NEXT: shll $16, %edx
222+
; SSE-NEXT: orl %eax, %edx
223+
; SSE-NEXT: shlq $32, %rdx
224+
; SSE-NEXT: orq %rcx, %rdx
225+
; SSE-NEXT: bsfq %rdx, %rcx
230226
; SSE-NEXT: movl $64, %eax
227+
; SSE-NEXT: cmovneq %rcx, %rax
231228
; SSE-NEXT: retq
232229
;
233230
; AVX1-LABEL: PR89533:
@@ -243,23 +240,19 @@ define i64 @PR89533(<64 x i8> %a0) {
243240
; AVX1-NEXT: shll $16, %ecx
244241
; AVX1-NEXT: orl %eax, %ecx
245242
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm0
246-
; AVX1-NEXT: vpmovmskb %xmm0, %edx
247-
; AVX1-NEXT: xorl $65535, %edx # imm = 0xFFFF
243+
; AVX1-NEXT: vpmovmskb %xmm0, %eax
244+
; AVX1-NEXT: xorl $65535, %eax # imm = 0xFFFF
248245
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
249246
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
250-
; AVX1-NEXT: vpmovmskb %xmm0, %eax
251-
; AVX1-NEXT: notl %eax
252-
; AVX1-NEXT: shll $16, %eax
253-
; AVX1-NEXT: orl %edx, %eax
254-
; AVX1-NEXT: shlq $32, %rax
255-
; AVX1-NEXT: orq %rcx, %rax
256-
; AVX1-NEXT: je .LBB11_2
257-
; AVX1-NEXT: # %bb.1: # %cond.false
258-
; AVX1-NEXT: rep bsfq %rax, %rax
259-
; AVX1-NEXT: vzeroupper
260-
; AVX1-NEXT: retq
261-
; AVX1-NEXT: .LBB11_2: # %cond.end
247+
; AVX1-NEXT: vpmovmskb %xmm0, %edx
248+
; AVX1-NEXT: notl %edx
249+
; AVX1-NEXT: shll $16, %edx
250+
; AVX1-NEXT: orl %eax, %edx
251+
; AVX1-NEXT: shlq $32, %rdx
252+
; AVX1-NEXT: orq %rcx, %rdx
253+
; AVX1-NEXT: bsfq %rdx, %rcx
262254
; AVX1-NEXT: movl $64, %eax
255+
; AVX1-NEXT: cmovneq %rcx, %rax
263256
; AVX1-NEXT: vzeroupper
264257
; AVX1-NEXT: retq
265258
;

0 commit comments

Comments
 (0)