Skip to content

Commit 5ee98b4

Browse files
committed
[X86] X86FixupVectorConstants - shrink vector load to movsd/movss/movd/movq 'zero upper' instructions
If we're loading a vector constant that is known to be zero in the upper elements, then attempt to shrink the constant and just scalar load the lower 32/64 bits. I've extended the constant fixup code to always attempt to use the smallest possible constant load, but for the same constant width prefer the scalar vzload over broadcasts (better chance of avoiding a domain clash).
1 parent 72f10f7 commit 5ee98b4

File tree

103 files changed

+959
-1135
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

103 files changed

+959
-1135
lines changed

llvm/lib/Target/X86/X86FixupVectorConstants.cpp

+116-61
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,9 @@ FunctionPass *llvm::createX86FixupVectorConstants() {
6767
static std::optional<APInt> extractConstantBits(const Constant *C) {
6868
unsigned NumBits = C->getType()->getPrimitiveSizeInBits();
6969

70+
if (auto *CUndef = dyn_cast<UndefValue>(C))
71+
return APInt::getZero(NumBits);
72+
7073
if (auto *CInt = dyn_cast<ConstantInt>(C))
7174
return CInt->getValue();
7275

@@ -80,6 +83,18 @@ static std::optional<APInt> extractConstantBits(const Constant *C) {
8083
return APInt::getSplat(NumBits, *Bits);
8184
}
8285
}
86+
87+
APInt Bits = APInt::getZero(NumBits);
88+
for (unsigned I = 0, E = CV->getNumOperands(); I != E; ++I) {
89+
Constant *Elt = CV->getOperand(I);
90+
std::optional<APInt> SubBits = extractConstantBits(Elt);
91+
if (!SubBits)
92+
return std::nullopt;
93+
assert(NumBits == (E * SubBits->getBitWidth()) &&
94+
"Illegal vector element size");
95+
Bits.insertBits(*SubBits, I * SubBits->getBitWidth());
96+
}
97+
return Bits;
8398
}
8499

85100
if (auto *CDS = dyn_cast<ConstantDataSequential>(C)) {
@@ -223,6 +238,35 @@ static Constant *rebuildSplatableConstant(const Constant *C,
223238
return rebuildConstant(OriginalType->getContext(), SclTy, *Splat, NumSclBits);
224239
}
225240

241+
static Constant *rebuildZeroUpperConstant(const Constant *C,
242+
unsigned ScalarBitWidth) {
243+
Type *Ty = C->getType();
244+
Type *SclTy = Ty->getScalarType();
245+
unsigned NumBits = Ty->getPrimitiveSizeInBits();
246+
unsigned NumSclBits = SclTy->getPrimitiveSizeInBits();
247+
LLVMContext &Ctx = C->getContext();
248+
249+
if (NumBits > ScalarBitWidth) {
250+
// Determine if the upper bits are all zero.
251+
if (std::optional<APInt> Bits = extractConstantBits(C)) {
252+
if (Bits->countLeadingZeros() >= (NumBits - ScalarBitWidth)) {
253+
// If the original constant was made of smaller elements, try to retain
254+
// those types.
255+
if (ScalarBitWidth > NumSclBits && (ScalarBitWidth % NumSclBits) == 0)
256+
return rebuildConstant(Ctx, SclTy, *Bits, NumSclBits);
257+
258+
// Fallback to raw integer bits.
259+
APInt RawBits = Bits->zextOrTrunc(ScalarBitWidth);
260+
return ConstantInt::get(Ctx, RawBits);
261+
}
262+
}
263+
}
264+
265+
return nullptr;
266+
}
267+
268+
typedef std::function<Constant *(const Constant *, unsigned)> RebuildFn;
269+
226270
bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
227271
MachineBasicBlock &MBB,
228272
MachineInstr &MI) {
@@ -233,117 +277,128 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
233277
bool HasBWI = ST->hasBWI();
234278
bool HasVLX = ST->hasVLX();
235279

236-
auto ConvertToBroadcast = [&](unsigned OpBcst256, unsigned OpBcst128,
237-
unsigned OpBcst64, unsigned OpBcst32,
238-
unsigned OpBcst16, unsigned OpBcst8,
239-
unsigned OperandNo) {
240-
assert(MI.getNumOperands() >= (OperandNo + X86::AddrNumOperands) &&
241-
"Unexpected number of operands!");
242-
243-
if (auto *C = X86::getConstantFromPool(MI, OperandNo)) {
244-
// Attempt to detect a suitable splat from increasing splat widths.
245-
std::pair<unsigned, unsigned> Broadcasts[] = {
246-
{8, OpBcst8}, {16, OpBcst16}, {32, OpBcst32},
247-
{64, OpBcst64}, {128, OpBcst128}, {256, OpBcst256},
248-
};
249-
for (auto [BitWidth, OpBcst] : Broadcasts) {
250-
if (OpBcst) {
251-
// Construct a suitable splat constant and adjust the MI to
252-
// use the new constant pool entry.
253-
if (Constant *NewCst = rebuildSplatableConstant(C, BitWidth)) {
254-
unsigned NewCPI =
255-
CP->getConstantPoolIndex(NewCst, Align(BitWidth / 8));
256-
MI.setDesc(TII->get(OpBcst));
257-
MI.getOperand(OperandNo + X86::AddrDisp).setIndex(NewCPI);
258-
return true;
280+
auto FixupConstant =
281+
[&](unsigned OpBcst256, unsigned OpBcst128, unsigned OpBcst64,
282+
unsigned OpBcst32, unsigned OpBcst16, unsigned OpBcst8,
283+
unsigned OpUpper64, unsigned OpUpper32, unsigned OperandNo) {
284+
assert(MI.getNumOperands() >= (OperandNo + X86::AddrNumOperands) &&
285+
"Unexpected number of operands!");
286+
287+
if (auto *C = X86::getConstantFromPool(MI, OperandNo)) {
288+
// Attempt to detect a suitable splat/vzload from increasing constant
289+
// bitwidths.
290+
// Prefer vzload vs broadcast for same bitwidth to avoid domain flips.
291+
std::tuple<unsigned, unsigned, RebuildFn> FixupLoad[] = {
292+
{8, OpBcst8, rebuildSplatableConstant},
293+
{16, OpBcst16, rebuildSplatableConstant},
294+
{32, OpUpper32, rebuildZeroUpperConstant},
295+
{32, OpBcst32, rebuildSplatableConstant},
296+
{64, OpUpper64, rebuildZeroUpperConstant},
297+
{64, OpBcst64, rebuildSplatableConstant},
298+
{128, OpBcst128, rebuildSplatableConstant},
299+
{256, OpBcst256, rebuildSplatableConstant},
300+
};
301+
for (auto [BitWidth, Op, RebuildConstant] : FixupLoad) {
302+
if (Op) {
303+
// Construct a suitable constant and adjust the MI to use the new
304+
// constant pool entry.
305+
if (Constant *NewCst = RebuildConstant(C, BitWidth)) {
306+
unsigned NewCPI =
307+
CP->getConstantPoolIndex(NewCst, Align(BitWidth / 8));
308+
MI.setDesc(TII->get(Op));
309+
MI.getOperand(OperandNo + X86::AddrDisp).setIndex(NewCPI);
310+
return true;
311+
}
312+
}
259313
}
260314
}
261-
}
262-
}
263-
return false;
264-
};
315+
return false;
316+
};
265317

266-
// Attempt to convert full width vector loads into broadcast loads.
318+
// Attempt to convert full width vector loads into broadcast/vzload loads.
267319
switch (Opc) {
268320
/* FP Loads */
269321
case X86::MOVAPDrm:
270322
case X86::MOVAPSrm:
271323
case X86::MOVUPDrm:
272324
case X86::MOVUPSrm:
273325
// TODO: SSE3 MOVDDUP Handling
274-
return false;
326+
return FixupConstant(0, 0, 0, 0, 0, 0, X86::MOVSDrm, X86::MOVSSrm, 1);
275327
case X86::VMOVAPDrm:
276328
case X86::VMOVAPSrm:
277329
case X86::VMOVUPDrm:
278330
case X86::VMOVUPSrm:
279-
return ConvertToBroadcast(0, 0, X86::VMOVDDUPrm, X86::VBROADCASTSSrm, 0, 0,
280-
1);
331+
return FixupConstant(0, 0, X86::VMOVDDUPrm, X86::VBROADCASTSSrm, 0, 0,
332+
X86::VMOVSDrm, X86::VMOVSSrm, 1);
281333
case X86::VMOVAPDYrm:
282334
case X86::VMOVAPSYrm:
283335
case X86::VMOVUPDYrm:
284336
case X86::VMOVUPSYrm:
285-
return ConvertToBroadcast(0, X86::VBROADCASTF128rm, X86::VBROADCASTSDYrm,
286-
X86::VBROADCASTSSYrm, 0, 0, 1);
337+
return FixupConstant(0, X86::VBROADCASTF128rm, X86::VBROADCASTSDYrm,
338+
X86::VBROADCASTSSYrm, 0, 0, 0, 0, 1);
287339
case X86::VMOVAPDZ128rm:
288340
case X86::VMOVAPSZ128rm:
289341
case X86::VMOVUPDZ128rm:
290342
case X86::VMOVUPSZ128rm:
291-
return ConvertToBroadcast(0, 0, X86::VMOVDDUPZ128rm,
292-
X86::VBROADCASTSSZ128rm, 0, 0, 1);
343+
return FixupConstant(0, 0, X86::VMOVDDUPZ128rm, X86::VBROADCASTSSZ128rm, 0,
344+
0, X86::VMOVSDZrm, X86::VMOVSSZrm, 1);
293345
case X86::VMOVAPDZ256rm:
294346
case X86::VMOVAPSZ256rm:
295347
case X86::VMOVUPDZ256rm:
296348
case X86::VMOVUPSZ256rm:
297-
return ConvertToBroadcast(0, X86::VBROADCASTF32X4Z256rm,
298-
X86::VBROADCASTSDZ256rm, X86::VBROADCASTSSZ256rm,
299-
0, 0, 1);
349+
return FixupConstant(0, X86::VBROADCASTF32X4Z256rm, X86::VBROADCASTSDZ256rm,
350+
X86::VBROADCASTSSZ256rm, 0, 0, 0, 0, 1);
300351
case X86::VMOVAPDZrm:
301352
case X86::VMOVAPSZrm:
302353
case X86::VMOVUPDZrm:
303354
case X86::VMOVUPSZrm:
304-
return ConvertToBroadcast(X86::VBROADCASTF64X4rm, X86::VBROADCASTF32X4rm,
305-
X86::VBROADCASTSDZrm, X86::VBROADCASTSSZrm, 0, 0,
306-
1);
355+
return FixupConstant(X86::VBROADCASTF64X4rm, X86::VBROADCASTF32X4rm,
356+
X86::VBROADCASTSDZrm, X86::VBROADCASTSSZrm, 0, 0, 0, 0,
357+
1);
307358
/* Integer Loads */
359+
case X86::MOVDQArm:
360+
case X86::MOVDQUrm:
361+
return FixupConstant(0, 0, 0, 0, 0, 0, X86::MOVQI2PQIrm, X86::MOVDI2PDIrm,
362+
1);
308363
case X86::VMOVDQArm:
309364
case X86::VMOVDQUrm:
310-
return ConvertToBroadcast(
311-
0, 0, HasAVX2 ? X86::VPBROADCASTQrm : X86::VMOVDDUPrm,
312-
HasAVX2 ? X86::VPBROADCASTDrm : X86::VBROADCASTSSrm,
313-
HasAVX2 ? X86::VPBROADCASTWrm : 0, HasAVX2 ? X86::VPBROADCASTBrm : 0,
314-
1);
365+
return FixupConstant(0, 0, HasAVX2 ? X86::VPBROADCASTQrm : X86::VMOVDDUPrm,
366+
HasAVX2 ? X86::VPBROADCASTDrm : X86::VBROADCASTSSrm,
367+
HasAVX2 ? X86::VPBROADCASTWrm : 0,
368+
HasAVX2 ? X86::VPBROADCASTBrm : 0, X86::VMOVQI2PQIrm,
369+
X86::VMOVDI2PDIrm, 1);
315370
case X86::VMOVDQAYrm:
316371
case X86::VMOVDQUYrm:
317-
return ConvertToBroadcast(
372+
return FixupConstant(
318373
0, HasAVX2 ? X86::VBROADCASTI128rm : X86::VBROADCASTF128rm,
319374
HasAVX2 ? X86::VPBROADCASTQYrm : X86::VBROADCASTSDYrm,
320375
HasAVX2 ? X86::VPBROADCASTDYrm : X86::VBROADCASTSSYrm,
321376
HasAVX2 ? X86::VPBROADCASTWYrm : 0, HasAVX2 ? X86::VPBROADCASTBYrm : 0,
322-
1);
377+
0, 0, 1);
323378
case X86::VMOVDQA32Z128rm:
324379
case X86::VMOVDQA64Z128rm:
325380
case X86::VMOVDQU32Z128rm:
326381
case X86::VMOVDQU64Z128rm:
327-
return ConvertToBroadcast(0, 0, X86::VPBROADCASTQZ128rm,
328-
X86::VPBROADCASTDZ128rm,
329-
HasBWI ? X86::VPBROADCASTWZ128rm : 0,
330-
HasBWI ? X86::VPBROADCASTBZ128rm : 0, 1);
382+
return FixupConstant(0, 0, X86::VPBROADCASTQZ128rm, X86::VPBROADCASTDZ128rm,
383+
HasBWI ? X86::VPBROADCASTWZ128rm : 0,
384+
HasBWI ? X86::VPBROADCASTBZ128rm : 0,
385+
X86::VMOVQI2PQIZrm, X86::VMOVDI2PDIZrm, 1);
331386
case X86::VMOVDQA32Z256rm:
332387
case X86::VMOVDQA64Z256rm:
333388
case X86::VMOVDQU32Z256rm:
334389
case X86::VMOVDQU64Z256rm:
335-
return ConvertToBroadcast(0, X86::VBROADCASTI32X4Z256rm,
336-
X86::VPBROADCASTQZ256rm, X86::VPBROADCASTDZ256rm,
337-
HasBWI ? X86::VPBROADCASTWZ256rm : 0,
338-
HasBWI ? X86::VPBROADCASTBZ256rm : 0, 1);
390+
return FixupConstant(0, X86::VBROADCASTI32X4Z256rm, X86::VPBROADCASTQZ256rm,
391+
X86::VPBROADCASTDZ256rm,
392+
HasBWI ? X86::VPBROADCASTWZ256rm : 0,
393+
HasBWI ? X86::VPBROADCASTBZ256rm : 0, 0, 0, 1);
339394
case X86::VMOVDQA32Zrm:
340395
case X86::VMOVDQA64Zrm:
341396
case X86::VMOVDQU32Zrm:
342397
case X86::VMOVDQU64Zrm:
343-
return ConvertToBroadcast(X86::VBROADCASTI64X4rm, X86::VBROADCASTI32X4rm,
344-
X86::VPBROADCASTQZrm, X86::VPBROADCASTDZrm,
345-
HasBWI ? X86::VPBROADCASTWZrm : 0,
346-
HasBWI ? X86::VPBROADCASTBZrm : 0, 1);
398+
return FixupConstant(X86::VBROADCASTI64X4rm, X86::VBROADCASTI32X4rm,
399+
X86::VPBROADCASTQZrm, X86::VPBROADCASTDZrm,
400+
HasBWI ? X86::VPBROADCASTWZrm : 0,
401+
HasBWI ? X86::VPBROADCASTBZrm : 0, 0, 0, 1);
347402
}
348403

349404
auto ConvertToBroadcastAVX512 = [&](unsigned OpSrc32, unsigned OpSrc64) {
@@ -368,7 +423,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
368423

369424
if (OpBcst32 || OpBcst64) {
370425
unsigned OpNo = OpBcst32 == 0 ? OpNoBcst64 : OpNoBcst32;
371-
return ConvertToBroadcast(0, 0, OpBcst64, OpBcst32, 0, 0, OpNo);
426+
return FixupConstant(0, 0, OpBcst64, OpBcst32, 0, 0, 0, 0, OpNo);
372427
}
373428
return false;
374429
};

llvm/test/CodeGen/X86/2011-20-21-zext-ui2fp.ll

+1-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
define void @ui_to_fp_conv(ptr nocapture %aFOO, ptr nocapture %RET) nounwind {
88
; CHECK-LABEL: ui_to_fp_conv:
99
; CHECK: # %bb.0: # %allocas
10-
; CHECK-NEXT: movaps {{.*#+}} xmm0 = [1.0E+0,1.0E+0,0.0E+0,0.0E+0]
10+
; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.0E+0,1.0E+0,0.0E+0,0.0E+0]
1111
; CHECK-NEXT: xorps %xmm1, %xmm1
1212
; CHECK-NEXT: movups %xmm1, 16(%rsi)
1313
; CHECK-NEXT: movups %xmm0, (%rsi)

llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll

+2-3
Original file line numberDiff line numberDiff line change
@@ -1053,7 +1053,7 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in
10531053
; SSE42-NEXT: paddb 48(%rsi), %xmm2
10541054
; SSE42-NEXT: paddb (%rsi), %xmm0
10551055
; SSE42-NEXT: paddb 32(%rsi), %xmm1
1056-
; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
1056+
; SSE42-NEXT: movq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
10571057
; SSE42-NEXT: pshufb %xmm3, %xmm1
10581058
; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
10591059
; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
@@ -1075,8 +1075,7 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in
10751075
; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
10761076
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
10771077
; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1
1078-
; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
1079-
; AVX-NEXT: # xmm3 = mem[0,0]
1078+
; AVX-NEXT: vmovq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
10801079
; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
10811080
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
10821081
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]

llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll

+2-3
Original file line numberDiff line numberDiff line change
@@ -875,7 +875,7 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in
875875
; SSE42-NEXT: movdqa (%rdi), %xmm0
876876
; SSE42-NEXT: movdqa 32(%rdi), %xmm1
877877
; SSE42-NEXT: movdqa 48(%rdi), %xmm2
878-
; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
878+
; SSE42-NEXT: movq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
879879
; SSE42-NEXT: pshufb %xmm3, %xmm1
880880
; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
881881
; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
@@ -894,8 +894,7 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in
894894
; AVX-NEXT: vmovdqa (%rdi), %xmm0
895895
; AVX-NEXT: vmovdqa 32(%rdi), %xmm1
896896
; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
897-
; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
898-
; AVX-NEXT: # xmm3 = mem[0,0]
897+
; AVX-NEXT: vmovq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
899898
; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
900899
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
901900
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]

llvm/test/CodeGen/X86/avx-load-store.ll

+1-1
Original file line numberDiff line numberDiff line change
@@ -220,7 +220,7 @@ define void @f_f() nounwind {
220220
; CHECK-NEXT: testb %al, %al
221221
; CHECK-NEXT: jne .LBB9_4
222222
; CHECK-NEXT: # %bb.3: # %cif_mixed_test_all
223-
; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,0,0,0]
223+
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = [4294967295,0,0,0]
224224
; CHECK-NEXT: vmaskmovps %ymm0, %ymm0, (%rax)
225225
; CHECK-NEXT: .LBB9_4: # %cif_mixed_test_any_check
226226
;

llvm/test/CodeGen/X86/avx2-arith.ll

+1-1
Original file line numberDiff line numberDiff line change
@@ -234,7 +234,7 @@ define <8 x i16> @mul_const8(<8 x i16> %x) {
234234
define <8 x i32> @mul_const9(<8 x i32> %x) {
235235
; CHECK-LABEL: mul_const9:
236236
; CHECK: # %bb.0:
237-
; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [2,0,0,0]
237+
; CHECK-NEXT: vmovd {{.*#+}} xmm1 = [2,0,0,0]
238238
; CHECK-NEXT: vpmulld %ymm1, %ymm0, %ymm0
239239
; CHECK-NEXT: ret{{[l|q]}}
240240
%y = mul <8 x i32> %x, <i32 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>

llvm/test/CodeGen/X86/avx512-shuffles/shuffle-chained-bf16.ll

+1-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ define <2 x bfloat> @shuffle_chained_v32bf16_v2bf16(<32 x bfloat> %a) {
1313
; CHECK-NEXT: .cfi_def_cfa_register %rbp
1414
; CHECK-NEXT: andq $-64, %rsp
1515
; CHECK-NEXT: subq $128, %rsp
16-
; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,16,0,16,0,16,0,16]
16+
; CHECK-NEXT: vmovd {{.*#+}} xmm1 = [0,16,0,0,0,0,0,0]
1717
; CHECK-NEXT: vpermw %zmm0, %zmm1, %zmm0
1818
; CHECK-NEXT: vmovdqa64 %zmm0, (%rsp)
1919
; CHECK-NEXT: vmovaps (%rsp), %xmm0

llvm/test/CodeGen/X86/bitreverse.ll

+3-3
Original file line numberDiff line numberDiff line change
@@ -587,17 +587,17 @@ define <2 x i16> @fold_v2i16() {
587587
;
588588
; X64-LABEL: fold_v2i16:
589589
; X64: # %bb.0:
590-
; X64-NEXT: movaps {{.*#+}} xmm0 = [61440,240,u,u,u,u,u,u]
590+
; X64-NEXT: movss {{.*#+}} xmm0 = [61440,240,0,0,0,0,0,0]
591591
; X64-NEXT: retq
592592
;
593593
; X86XOP-LABEL: fold_v2i16:
594594
; X86XOP: # %bb.0:
595-
; X86XOP-NEXT: vbroadcastss {{.*#+}} xmm0 = [61440,240,61440,240,61440,240,61440,240]
595+
; X86XOP-NEXT: vmovss {{.*#+}} xmm0 = [61440,240,0,0,0,0,0,0]
596596
; X86XOP-NEXT: retl
597597
;
598598
; GFNI-LABEL: fold_v2i16:
599599
; GFNI: # %bb.0:
600-
; GFNI-NEXT: vbroadcastss {{.*#+}} xmm0 = [61440,240,61440,240,61440,240,61440,240]
600+
; GFNI-NEXT: vmovss {{.*#+}} xmm0 = [61440,240,0,0,0,0,0,0]
601601
; GFNI-NEXT: retq
602602
%b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> <i16 15, i16 3840>)
603603
ret <2 x i16> %b

llvm/test/CodeGen/X86/combine-srl.ll

+2-2
Original file line numberDiff line numberDiff line change
@@ -356,7 +356,7 @@ define <4 x i32> @combine_vec_lshr_lzcnt_bit1(<4 x i32> %x) {
356356
; SSE-LABEL: combine_vec_lshr_lzcnt_bit1:
357357
; SSE: # %bb.0:
358358
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
359-
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
359+
; SSE-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
360360
; SSE-NEXT: movdqa %xmm1, %xmm2
361361
; SSE-NEXT: pshufb %xmm0, %xmm2
362362
; SSE-NEXT: psrlw $4, %xmm0
@@ -378,7 +378,7 @@ define <4 x i32> @combine_vec_lshr_lzcnt_bit1(<4 x i32> %x) {
378378
; AVX-LABEL: combine_vec_lshr_lzcnt_bit1:
379379
; AVX: # %bb.0:
380380
; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
381-
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
381+
; AVX-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
382382
; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm2
383383
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
384384
; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3

llvm/test/CodeGen/X86/combine-subo.ll

+2-2
Original file line numberDiff line numberDiff line change
@@ -217,13 +217,13 @@ define { <4 x i8>, <4 x i1> } @always_usub_const_vector() nounwind {
217217
define { <4 x i8>, <4 x i1> } @never_usub_const_vector() nounwind {
218218
; SSE-LABEL: never_usub_const_vector:
219219
; SSE: # %bb.0:
220-
; SSE-NEXT: movaps {{.*#+}} xmm0 = [127,255,0,254,u,u,u,u,u,u,u,u,u,u,u,u]
220+
; SSE-NEXT: movss {{.*#+}} xmm0 = [127,255,0,254,0,0,0,0,0,0,0,0,0,0,0,0]
221221
; SSE-NEXT: xorps %xmm1, %xmm1
222222
; SSE-NEXT: retq
223223
;
224224
; AVX-LABEL: never_usub_const_vector:
225225
; AVX: # %bb.0:
226-
; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [127,255,0,254,127,255,0,254,127,255,0,254,127,255,0,254]
226+
; AVX-NEXT: vmovss {{.*#+}} xmm0 = [127,255,0,254,0,0,0,0,0,0,0,0,0,0,0,0]
227227
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
228228
; AVX-NEXT: retq
229229
%x = call { <4 x i8>, <4 x i1> } @llvm.usub.with.overflow.v4i8(<4 x i8> <i8 255, i8 255, i8 255, i8 255>, <4 x i8> <i8 128, i8 0, i8 255, i8 1>)

0 commit comments

Comments
 (0)