Skip to content

Commit 8b43c1b

Browse files
authored
[X86] X86FixupVectorConstants - shrink vector load to movsd/movss/movd/movq 'zero upper' instructions (#79000)
If we're loading a vector constant that is known to be zero in the upper elements, then attempt to shrink the constant and just scalar load the lower 32/64 bits. Always choose the vzload/broadcast with the smallest constant load, and prefer vzload over broadcasts for same bitwidth to avoid domain flips (mainly an AVX1 issue). Fixes #73783
1 parent 182ab1c commit 8b43c1b

File tree

103 files changed

+959
-1135
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

103 files changed

+959
-1135
lines changed

llvm/lib/Target/X86/X86FixupVectorConstants.cpp

+116-61
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,9 @@ FunctionPass *llvm::createX86FixupVectorConstants() {
6767
static std::optional<APInt> extractConstantBits(const Constant *C) {
6868
unsigned NumBits = C->getType()->getPrimitiveSizeInBits();
6969

70+
if (auto *CUndef = dyn_cast<UndefValue>(C))
71+
return APInt::getZero(NumBits);
72+
7073
if (auto *CInt = dyn_cast<ConstantInt>(C))
7174
return CInt->getValue();
7275

@@ -80,6 +83,18 @@ static std::optional<APInt> extractConstantBits(const Constant *C) {
8083
return APInt::getSplat(NumBits, *Bits);
8184
}
8285
}
86+
87+
APInt Bits = APInt::getZero(NumBits);
88+
for (unsigned I = 0, E = CV->getNumOperands(); I != E; ++I) {
89+
Constant *Elt = CV->getOperand(I);
90+
std::optional<APInt> SubBits = extractConstantBits(Elt);
91+
if (!SubBits)
92+
return std::nullopt;
93+
assert(NumBits == (E * SubBits->getBitWidth()) &&
94+
"Illegal vector element size");
95+
Bits.insertBits(*SubBits, I * SubBits->getBitWidth());
96+
}
97+
return Bits;
8398
}
8499

85100
if (auto *CDS = dyn_cast<ConstantDataSequential>(C)) {
@@ -223,6 +238,35 @@ static Constant *rebuildSplatableConstant(const Constant *C,
223238
return rebuildConstant(OriginalType->getContext(), SclTy, *Splat, NumSclBits);
224239
}
225240

241+
static Constant *rebuildZeroUpperConstant(const Constant *C,
242+
unsigned ScalarBitWidth) {
243+
Type *Ty = C->getType();
244+
Type *SclTy = Ty->getScalarType();
245+
unsigned NumBits = Ty->getPrimitiveSizeInBits();
246+
unsigned NumSclBits = SclTy->getPrimitiveSizeInBits();
247+
LLVMContext &Ctx = C->getContext();
248+
249+
if (NumBits > ScalarBitWidth) {
250+
// Determine if the upper bits are all zero.
251+
if (std::optional<APInt> Bits = extractConstantBits(C)) {
252+
if (Bits->countLeadingZeros() >= (NumBits - ScalarBitWidth)) {
253+
// If the original constant was made of smaller elements, try to retain
254+
// those types.
255+
if (ScalarBitWidth > NumSclBits && (ScalarBitWidth % NumSclBits) == 0)
256+
return rebuildConstant(Ctx, SclTy, *Bits, NumSclBits);
257+
258+
// Fallback to raw integer bits.
259+
APInt RawBits = Bits->zextOrTrunc(ScalarBitWidth);
260+
return ConstantInt::get(Ctx, RawBits);
261+
}
262+
}
263+
}
264+
265+
return nullptr;
266+
}
267+
268+
typedef std::function<Constant *(const Constant *, unsigned)> RebuildFn;
269+
226270
bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
227271
MachineBasicBlock &MBB,
228272
MachineInstr &MI) {
@@ -233,117 +277,128 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
233277
bool HasBWI = ST->hasBWI();
234278
bool HasVLX = ST->hasVLX();
235279

236-
auto ConvertToBroadcast = [&](unsigned OpBcst256, unsigned OpBcst128,
237-
unsigned OpBcst64, unsigned OpBcst32,
238-
unsigned OpBcst16, unsigned OpBcst8,
239-
unsigned OperandNo) {
240-
assert(MI.getNumOperands() >= (OperandNo + X86::AddrNumOperands) &&
241-
"Unexpected number of operands!");
242-
243-
if (auto *C = X86::getConstantFromPool(MI, OperandNo)) {
244-
// Attempt to detect a suitable splat from increasing splat widths.
245-
std::pair<unsigned, unsigned> Broadcasts[] = {
246-
{8, OpBcst8}, {16, OpBcst16}, {32, OpBcst32},
247-
{64, OpBcst64}, {128, OpBcst128}, {256, OpBcst256},
248-
};
249-
for (auto [BitWidth, OpBcst] : Broadcasts) {
250-
if (OpBcst) {
251-
// Construct a suitable splat constant and adjust the MI to
252-
// use the new constant pool entry.
253-
if (Constant *NewCst = rebuildSplatableConstant(C, BitWidth)) {
254-
unsigned NewCPI =
255-
CP->getConstantPoolIndex(NewCst, Align(BitWidth / 8));
256-
MI.setDesc(TII->get(OpBcst));
257-
MI.getOperand(OperandNo + X86::AddrDisp).setIndex(NewCPI);
258-
return true;
280+
auto FixupConstant =
281+
[&](unsigned OpBcst256, unsigned OpBcst128, unsigned OpBcst64,
282+
unsigned OpBcst32, unsigned OpBcst16, unsigned OpBcst8,
283+
unsigned OpUpper64, unsigned OpUpper32, unsigned OperandNo) {
284+
assert(MI.getNumOperands() >= (OperandNo + X86::AddrNumOperands) &&
285+
"Unexpected number of operands!");
286+
287+
if (auto *C = X86::getConstantFromPool(MI, OperandNo)) {
288+
// Attempt to detect a suitable splat/vzload from increasing constant
289+
// bitwidths.
290+
// Prefer vzload vs broadcast for same bitwidth to avoid domain flips.
291+
std::tuple<unsigned, unsigned, RebuildFn> FixupLoad[] = {
292+
{8, OpBcst8, rebuildSplatableConstant},
293+
{16, OpBcst16, rebuildSplatableConstant},
294+
{32, OpUpper32, rebuildZeroUpperConstant},
295+
{32, OpBcst32, rebuildSplatableConstant},
296+
{64, OpUpper64, rebuildZeroUpperConstant},
297+
{64, OpBcst64, rebuildSplatableConstant},
298+
{128, OpBcst128, rebuildSplatableConstant},
299+
{256, OpBcst256, rebuildSplatableConstant},
300+
};
301+
for (auto [BitWidth, Op, RebuildConstant] : FixupLoad) {
302+
if (Op) {
303+
// Construct a suitable constant and adjust the MI to use the new
304+
// constant pool entry.
305+
if (Constant *NewCst = RebuildConstant(C, BitWidth)) {
306+
unsigned NewCPI =
307+
CP->getConstantPoolIndex(NewCst, Align(BitWidth / 8));
308+
MI.setDesc(TII->get(Op));
309+
MI.getOperand(OperandNo + X86::AddrDisp).setIndex(NewCPI);
310+
return true;
311+
}
312+
}
259313
}
260314
}
261-
}
262-
}
263-
return false;
264-
};
315+
return false;
316+
};
265317

266-
// Attempt to convert full width vector loads into broadcast loads.
318+
// Attempt to convert full width vector loads into broadcast/vzload loads.
267319
switch (Opc) {
268320
/* FP Loads */
269321
case X86::MOVAPDrm:
270322
case X86::MOVAPSrm:
271323
case X86::MOVUPDrm:
272324
case X86::MOVUPSrm:
273325
// TODO: SSE3 MOVDDUP Handling
274-
return false;
326+
return FixupConstant(0, 0, 0, 0, 0, 0, X86::MOVSDrm, X86::MOVSSrm, 1);
275327
case X86::VMOVAPDrm:
276328
case X86::VMOVAPSrm:
277329
case X86::VMOVUPDrm:
278330
case X86::VMOVUPSrm:
279-
return ConvertToBroadcast(0, 0, X86::VMOVDDUPrm, X86::VBROADCASTSSrm, 0, 0,
280-
1);
331+
return FixupConstant(0, 0, X86::VMOVDDUPrm, X86::VBROADCASTSSrm, 0, 0,
332+
X86::VMOVSDrm, X86::VMOVSSrm, 1);
281333
case X86::VMOVAPDYrm:
282334
case X86::VMOVAPSYrm:
283335
case X86::VMOVUPDYrm:
284336
case X86::VMOVUPSYrm:
285-
return ConvertToBroadcast(0, X86::VBROADCASTF128rm, X86::VBROADCASTSDYrm,
286-
X86::VBROADCASTSSYrm, 0, 0, 1);
337+
return FixupConstant(0, X86::VBROADCASTF128rm, X86::VBROADCASTSDYrm,
338+
X86::VBROADCASTSSYrm, 0, 0, 0, 0, 1);
287339
case X86::VMOVAPDZ128rm:
288340
case X86::VMOVAPSZ128rm:
289341
case X86::VMOVUPDZ128rm:
290342
case X86::VMOVUPSZ128rm:
291-
return ConvertToBroadcast(0, 0, X86::VMOVDDUPZ128rm,
292-
X86::VBROADCASTSSZ128rm, 0, 0, 1);
343+
return FixupConstant(0, 0, X86::VMOVDDUPZ128rm, X86::VBROADCASTSSZ128rm, 0,
344+
0, X86::VMOVSDZrm, X86::VMOVSSZrm, 1);
293345
case X86::VMOVAPDZ256rm:
294346
case X86::VMOVAPSZ256rm:
295347
case X86::VMOVUPDZ256rm:
296348
case X86::VMOVUPSZ256rm:
297-
return ConvertToBroadcast(0, X86::VBROADCASTF32X4Z256rm,
298-
X86::VBROADCASTSDZ256rm, X86::VBROADCASTSSZ256rm,
299-
0, 0, 1);
349+
return FixupConstant(0, X86::VBROADCASTF32X4Z256rm, X86::VBROADCASTSDZ256rm,
350+
X86::VBROADCASTSSZ256rm, 0, 0, 0, 0, 1);
300351
case X86::VMOVAPDZrm:
301352
case X86::VMOVAPSZrm:
302353
case X86::VMOVUPDZrm:
303354
case X86::VMOVUPSZrm:
304-
return ConvertToBroadcast(X86::VBROADCASTF64X4rm, X86::VBROADCASTF32X4rm,
305-
X86::VBROADCASTSDZrm, X86::VBROADCASTSSZrm, 0, 0,
306-
1);
355+
return FixupConstant(X86::VBROADCASTF64X4rm, X86::VBROADCASTF32X4rm,
356+
X86::VBROADCASTSDZrm, X86::VBROADCASTSSZrm, 0, 0, 0, 0,
357+
1);
307358
/* Integer Loads */
359+
case X86::MOVDQArm:
360+
case X86::MOVDQUrm:
361+
return FixupConstant(0, 0, 0, 0, 0, 0, X86::MOVQI2PQIrm, X86::MOVDI2PDIrm,
362+
1);
308363
case X86::VMOVDQArm:
309364
case X86::VMOVDQUrm:
310-
return ConvertToBroadcast(
311-
0, 0, HasAVX2 ? X86::VPBROADCASTQrm : X86::VMOVDDUPrm,
312-
HasAVX2 ? X86::VPBROADCASTDrm : X86::VBROADCASTSSrm,
313-
HasAVX2 ? X86::VPBROADCASTWrm : 0, HasAVX2 ? X86::VPBROADCASTBrm : 0,
314-
1);
365+
return FixupConstant(0, 0, HasAVX2 ? X86::VPBROADCASTQrm : X86::VMOVDDUPrm,
366+
HasAVX2 ? X86::VPBROADCASTDrm : X86::VBROADCASTSSrm,
367+
HasAVX2 ? X86::VPBROADCASTWrm : 0,
368+
HasAVX2 ? X86::VPBROADCASTBrm : 0, X86::VMOVQI2PQIrm,
369+
X86::VMOVDI2PDIrm, 1);
315370
case X86::VMOVDQAYrm:
316371
case X86::VMOVDQUYrm:
317-
return ConvertToBroadcast(
372+
return FixupConstant(
318373
0, HasAVX2 ? X86::VBROADCASTI128rm : X86::VBROADCASTF128rm,
319374
HasAVX2 ? X86::VPBROADCASTQYrm : X86::VBROADCASTSDYrm,
320375
HasAVX2 ? X86::VPBROADCASTDYrm : X86::VBROADCASTSSYrm,
321376
HasAVX2 ? X86::VPBROADCASTWYrm : 0, HasAVX2 ? X86::VPBROADCASTBYrm : 0,
322-
1);
377+
0, 0, 1);
323378
case X86::VMOVDQA32Z128rm:
324379
case X86::VMOVDQA64Z128rm:
325380
case X86::VMOVDQU32Z128rm:
326381
case X86::VMOVDQU64Z128rm:
327-
return ConvertToBroadcast(0, 0, X86::VPBROADCASTQZ128rm,
328-
X86::VPBROADCASTDZ128rm,
329-
HasBWI ? X86::VPBROADCASTWZ128rm : 0,
330-
HasBWI ? X86::VPBROADCASTBZ128rm : 0, 1);
382+
return FixupConstant(0, 0, X86::VPBROADCASTQZ128rm, X86::VPBROADCASTDZ128rm,
383+
HasBWI ? X86::VPBROADCASTWZ128rm : 0,
384+
HasBWI ? X86::VPBROADCASTBZ128rm : 0,
385+
X86::VMOVQI2PQIZrm, X86::VMOVDI2PDIZrm, 1);
331386
case X86::VMOVDQA32Z256rm:
332387
case X86::VMOVDQA64Z256rm:
333388
case X86::VMOVDQU32Z256rm:
334389
case X86::VMOVDQU64Z256rm:
335-
return ConvertToBroadcast(0, X86::VBROADCASTI32X4Z256rm,
336-
X86::VPBROADCASTQZ256rm, X86::VPBROADCASTDZ256rm,
337-
HasBWI ? X86::VPBROADCASTWZ256rm : 0,
338-
HasBWI ? X86::VPBROADCASTBZ256rm : 0, 1);
390+
return FixupConstant(0, X86::VBROADCASTI32X4Z256rm, X86::VPBROADCASTQZ256rm,
391+
X86::VPBROADCASTDZ256rm,
392+
HasBWI ? X86::VPBROADCASTWZ256rm : 0,
393+
HasBWI ? X86::VPBROADCASTBZ256rm : 0, 0, 0, 1);
339394
case X86::VMOVDQA32Zrm:
340395
case X86::VMOVDQA64Zrm:
341396
case X86::VMOVDQU32Zrm:
342397
case X86::VMOVDQU64Zrm:
343-
return ConvertToBroadcast(X86::VBROADCASTI64X4rm, X86::VBROADCASTI32X4rm,
344-
X86::VPBROADCASTQZrm, X86::VPBROADCASTDZrm,
345-
HasBWI ? X86::VPBROADCASTWZrm : 0,
346-
HasBWI ? X86::VPBROADCASTBZrm : 0, 1);
398+
return FixupConstant(X86::VBROADCASTI64X4rm, X86::VBROADCASTI32X4rm,
399+
X86::VPBROADCASTQZrm, X86::VPBROADCASTDZrm,
400+
HasBWI ? X86::VPBROADCASTWZrm : 0,
401+
HasBWI ? X86::VPBROADCASTBZrm : 0, 0, 0, 1);
347402
}
348403

349404
auto ConvertToBroadcastAVX512 = [&](unsigned OpSrc32, unsigned OpSrc64) {
@@ -368,7 +423,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
368423

369424
if (OpBcst32 || OpBcst64) {
370425
unsigned OpNo = OpBcst32 == 0 ? OpNoBcst64 : OpNoBcst32;
371-
return ConvertToBroadcast(0, 0, OpBcst64, OpBcst32, 0, 0, OpNo);
426+
return FixupConstant(0, 0, OpBcst64, OpBcst32, 0, 0, 0, 0, OpNo);
372427
}
373428
return false;
374429
};

llvm/test/CodeGen/X86/2011-20-21-zext-ui2fp.ll

+1-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
define void @ui_to_fp_conv(ptr nocapture %aFOO, ptr nocapture %RET) nounwind {
88
; CHECK-LABEL: ui_to_fp_conv:
99
; CHECK: # %bb.0: # %allocas
10-
; CHECK-NEXT: movaps {{.*#+}} xmm0 = [1.0E+0,1.0E+0,0.0E+0,0.0E+0]
10+
; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.0E+0,1.0E+0,0.0E+0,0.0E+0]
1111
; CHECK-NEXT: xorps %xmm1, %xmm1
1212
; CHECK-NEXT: movups %xmm1, 16(%rsi)
1313
; CHECK-NEXT: movups %xmm0, (%rsi)

llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll

+2-3
Original file line numberDiff line numberDiff line change
@@ -1053,7 +1053,7 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in
10531053
; SSE42-NEXT: paddb 48(%rsi), %xmm2
10541054
; SSE42-NEXT: paddb (%rsi), %xmm0
10551055
; SSE42-NEXT: paddb 32(%rsi), %xmm1
1056-
; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
1056+
; SSE42-NEXT: movq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
10571057
; SSE42-NEXT: pshufb %xmm3, %xmm1
10581058
; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
10591059
; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
@@ -1075,8 +1075,7 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in
10751075
; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2
10761076
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
10771077
; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1
1078-
; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
1079-
; AVX-NEXT: # xmm3 = mem[0,0]
1078+
; AVX-NEXT: vmovq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
10801079
; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
10811080
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
10821081
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]

llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll

+2-3
Original file line numberDiff line numberDiff line change
@@ -875,7 +875,7 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in
875875
; SSE42-NEXT: movdqa (%rdi), %xmm0
876876
; SSE42-NEXT: movdqa 32(%rdi), %xmm1
877877
; SSE42-NEXT: movdqa 48(%rdi), %xmm2
878-
; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
878+
; SSE42-NEXT: movq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
879879
; SSE42-NEXT: pshufb %xmm3, %xmm1
880880
; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
881881
; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
@@ -894,8 +894,7 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in
894894
; AVX-NEXT: vmovdqa (%rdi), %xmm0
895895
; AVX-NEXT: vmovdqa 32(%rdi), %xmm1
896896
; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
897-
; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
898-
; AVX-NEXT: # xmm3 = mem[0,0]
897+
; AVX-NEXT: vmovq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
899898
; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1
900899
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
901900
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]

llvm/test/CodeGen/X86/avx-load-store.ll

+1-1
Original file line numberDiff line numberDiff line change
@@ -220,7 +220,7 @@ define void @f_f() nounwind {
220220
; CHECK-NEXT: testb %al, %al
221221
; CHECK-NEXT: jne .LBB9_4
222222
; CHECK-NEXT: # %bb.3: # %cif_mixed_test_all
223-
; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,0,0,0]
223+
; CHECK-NEXT: vmovss {{.*#+}} xmm0 = [4294967295,0,0,0]
224224
; CHECK-NEXT: vmaskmovps %ymm0, %ymm0, (%rax)
225225
; CHECK-NEXT: .LBB9_4: # %cif_mixed_test_any_check
226226
;

llvm/test/CodeGen/X86/avx2-arith.ll

+1-1
Original file line numberDiff line numberDiff line change
@@ -234,7 +234,7 @@ define <8 x i16> @mul_const8(<8 x i16> %x) {
234234
define <8 x i32> @mul_const9(<8 x i32> %x) {
235235
; CHECK-LABEL: mul_const9:
236236
; CHECK: # %bb.0:
237-
; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [2,0,0,0]
237+
; CHECK-NEXT: vmovd {{.*#+}} xmm1 = [2,0,0,0]
238238
; CHECK-NEXT: vpmulld %ymm1, %ymm0, %ymm0
239239
; CHECK-NEXT: ret{{[l|q]}}
240240
%y = mul <8 x i32> %x, <i32 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>

llvm/test/CodeGen/X86/avx512-shuffles/shuffle-chained-bf16.ll

+1-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ define <2 x bfloat> @shuffle_chained_v32bf16_v2bf16(<32 x bfloat> %a) {
1313
; CHECK-NEXT: .cfi_def_cfa_register %rbp
1414
; CHECK-NEXT: andq $-64, %rsp
1515
; CHECK-NEXT: subq $128, %rsp
16-
; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,16,0,16,0,16,0,16]
16+
; CHECK-NEXT: vmovd {{.*#+}} xmm1 = [0,16,0,0,0,0,0,0]
1717
; CHECK-NEXT: vpermw %zmm0, %zmm1, %zmm0
1818
; CHECK-NEXT: vmovdqa64 %zmm0, (%rsp)
1919
; CHECK-NEXT: vmovaps (%rsp), %xmm0

llvm/test/CodeGen/X86/bitreverse.ll

+3-3
Original file line numberDiff line numberDiff line change
@@ -587,17 +587,17 @@ define <2 x i16> @fold_v2i16() {
587587
;
588588
; X64-LABEL: fold_v2i16:
589589
; X64: # %bb.0:
590-
; X64-NEXT: movaps {{.*#+}} xmm0 = [61440,240,u,u,u,u,u,u]
590+
; X64-NEXT: movss {{.*#+}} xmm0 = [61440,240,0,0,0,0,0,0]
591591
; X64-NEXT: retq
592592
;
593593
; X86XOP-LABEL: fold_v2i16:
594594
; X86XOP: # %bb.0:
595-
; X86XOP-NEXT: vbroadcastss {{.*#+}} xmm0 = [61440,240,61440,240,61440,240,61440,240]
595+
; X86XOP-NEXT: vmovss {{.*#+}} xmm0 = [61440,240,0,0,0,0,0,0]
596596
; X86XOP-NEXT: retl
597597
;
598598
; GFNI-LABEL: fold_v2i16:
599599
; GFNI: # %bb.0:
600-
; GFNI-NEXT: vbroadcastss {{.*#+}} xmm0 = [61440,240,61440,240,61440,240,61440,240]
600+
; GFNI-NEXT: vmovss {{.*#+}} xmm0 = [61440,240,0,0,0,0,0,0]
601601
; GFNI-NEXT: retq
602602
%b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> <i16 15, i16 3840>)
603603
ret <2 x i16> %b

llvm/test/CodeGen/X86/combine-srl.ll

+2-2
Original file line numberDiff line numberDiff line change
@@ -356,7 +356,7 @@ define <4 x i32> @combine_vec_lshr_lzcnt_bit1(<4 x i32> %x) {
356356
; SSE-LABEL: combine_vec_lshr_lzcnt_bit1:
357357
; SSE: # %bb.0:
358358
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
359-
; SSE-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
359+
; SSE-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
360360
; SSE-NEXT: movdqa %xmm1, %xmm2
361361
; SSE-NEXT: pshufb %xmm0, %xmm2
362362
; SSE-NEXT: psrlw $4, %xmm0
@@ -378,7 +378,7 @@ define <4 x i32> @combine_vec_lshr_lzcnt_bit1(<4 x i32> %x) {
378378
; AVX-LABEL: combine_vec_lshr_lzcnt_bit1:
379379
; AVX: # %bb.0:
380380
; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
381-
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
381+
; AVX-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
382382
; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm2
383383
; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0
384384
; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3

llvm/test/CodeGen/X86/combine-subo.ll

+2-2
Original file line numberDiff line numberDiff line change
@@ -217,13 +217,13 @@ define { <4 x i8>, <4 x i1> } @always_usub_const_vector() nounwind {
217217
define { <4 x i8>, <4 x i1> } @never_usub_const_vector() nounwind {
218218
; SSE-LABEL: never_usub_const_vector:
219219
; SSE: # %bb.0:
220-
; SSE-NEXT: movaps {{.*#+}} xmm0 = [127,255,0,254,u,u,u,u,u,u,u,u,u,u,u,u]
220+
; SSE-NEXT: movss {{.*#+}} xmm0 = [127,255,0,254,0,0,0,0,0,0,0,0,0,0,0,0]
221221
; SSE-NEXT: xorps %xmm1, %xmm1
222222
; SSE-NEXT: retq
223223
;
224224
; AVX-LABEL: never_usub_const_vector:
225225
; AVX: # %bb.0:
226-
; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [127,255,0,254,127,255,0,254,127,255,0,254,127,255,0,254]
226+
; AVX-NEXT: vmovss {{.*#+}} xmm0 = [127,255,0,254,0,0,0,0,0,0,0,0,0,0,0,0]
227227
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
228228
; AVX-NEXT: retq
229229
%x = call { <4 x i8>, <4 x i1> } @llvm.usub.with.overflow.v4i8(<4 x i8> <i8 255, i8 255, i8 255, i8 255>, <4 x i8> <i8 128, i8 0, i8 255, i8 1>)

0 commit comments

Comments
 (0)