 ; RUN: llc < %s -mtriple=i686-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=X86-AVX512
 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=X64-AVX512

-; FIXME: PR78897 - Don't vectorize a mul if we still need the extract
+; PR78897 - Don't vectorize a mul of extracted values if we'd still need the extract.
+; TODO: We should vectorize on 32-bit targets.
 define <16 x i8> @produceShuffleVectorForByte(i8 zeroext %0) nounwind {
 ; X86-SSE2-LABEL: produceShuffleVectorForByte:
 ; X86-SSE2: # %bb.0: # %entry
@@ -70,21 +71,13 @@ define <16 x i8> @produceShuffleVectorForByte(i8 zeroext %0) nounwind {
 ; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [17,17,17,17,17,17,17,17,u,u,u,u,u,u,u,u]
 ; X64-SSE2-NEXT: pand %xmm0, %xmm1
 ; X64-SSE2-NEXT: movq %xmm1, %rax
-; X64-SSE2-NEXT: movdqa %xmm1, %xmm2
-; X64-SSE2-NEXT: psrlq $32, %xmm2
-; X64-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1229782938247303440,1229782938247303440]
-; X64-SSE2-NEXT: pmuludq %xmm3, %xmm2
-; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [286331153,286331153]
-; X64-SSE2-NEXT: pmuludq %xmm1, %xmm4
-; X64-SSE2-NEXT: paddq %xmm2, %xmm4
-; X64-SSE2-NEXT: psllq $32, %xmm4
-; X64-SSE2-NEXT: pmuludq %xmm3, %xmm1
-; X64-SSE2-NEXT: paddq %xmm4, %xmm1
-; X64-SSE2-NEXT: movabsq $76861433640456465, %rcx # imm = 0x111111111111111
-; X64-SSE2-NEXT: xorq %rax, %rcx
-; X64-SSE2-NEXT: movabsq $1229782938247303440, %rax # imm = 0x1111111111111110
+; X64-SSE2-NEXT: movabsq $1229782938247303440, %rcx # imm = 0x1111111111111110
+; X64-SSE2-NEXT: movabsq $76861433640456465, %rdx # imm = 0x111111111111111
+; X64-SSE2-NEXT: xorq %rax, %rdx
 ; X64-SSE2-NEXT: imulq %rcx, %rax
-; X64-SSE2-NEXT: movq %rax, %xmm2
+; X64-SSE2-NEXT: movq %rax, %xmm1
+; X64-SSE2-NEXT: imulq %rcx, %rdx
+; X64-SSE2-NEXT: movq %rdx, %xmm2
 ; X64-SSE2-NEXT: pand %xmm0, %xmm1
 ; X64-SSE2-NEXT: pandn %xmm2, %xmm0
 ; X64-SSE2-NEXT: por %xmm1, %xmm0
@@ -147,24 +140,16 @@ define <16 x i8> @produceShuffleVectorForByte(i8 zeroext %0) nounwind {
 ; X64-SSE42-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; X64-SSE42-NEXT: pxor %xmm0, %xmm0
 ; X64-SSE42-NEXT: pcmpeqb %xmm1, %xmm0
-; X64-SSE42-NEXT: movdqa {{.*#+}} xmm2 = [17,17,17,17,17,17,17,17,u,u,u,u,u,u,u,u]
-; X64-SSE42-NEXT: pand %xmm0, %xmm2
-; X64-SSE42-NEXT: movq %xmm2, %rax
-; X64-SSE42-NEXT: movdqa %xmm2, %xmm1
-; X64-SSE42-NEXT: psrlq $32, %xmm1
-; X64-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [1229782938247303440,1229782938247303440]
-; X64-SSE42-NEXT: pmuludq %xmm3, %xmm1
-; X64-SSE42-NEXT: movdqa {{.*#+}} xmm4 = [286331153,286331153]
-; X64-SSE42-NEXT: pmuludq %xmm2, %xmm4
-; X64-SSE42-NEXT: paddq %xmm1, %xmm4
-; X64-SSE42-NEXT: psllq $32, %xmm4
-; X64-SSE42-NEXT: pmuludq %xmm3, %xmm2
-; X64-SSE42-NEXT: paddq %xmm4, %xmm2
-; X64-SSE42-NEXT: movabsq $76861433640456465, %rcx # imm = 0x111111111111111
-; X64-SSE42-NEXT: xorq %rax, %rcx
-; X64-SSE42-NEXT: movabsq $1229782938247303440, %rax # imm = 0x1111111111111110
+; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [17,17,17,17,17,17,17,17,u,u,u,u,u,u,u,u]
+; X64-SSE42-NEXT: pand %xmm0, %xmm1
+; X64-SSE42-NEXT: movq %xmm1, %rax
+; X64-SSE42-NEXT: movabsq $1229782938247303440, %rcx # imm = 0x1111111111111110
+; X64-SSE42-NEXT: movabsq $76861433640456465, %rdx # imm = 0x111111111111111
+; X64-SSE42-NEXT: xorq %rax, %rdx
 ; X64-SSE42-NEXT: imulq %rcx, %rax
-; X64-SSE42-NEXT: movq %rax, %xmm1
+; X64-SSE42-NEXT: movq %rax, %xmm2
+; X64-SSE42-NEXT: imulq %rcx, %rdx
+; X64-SSE42-NEXT: movq %rdx, %xmm1
 ; X64-SSE42-NEXT: pblendvb %xmm0, %xmm2, %xmm1
 ; X64-SSE42-NEXT: movdqa %xmm1, %xmm0
 ; X64-SSE42-NEXT: psrlw $4, %xmm0
@@ -220,19 +205,13 @@ define <16 x i8> @produceShuffleVectorForByte(i8 zeroext %0) nounwind {
 ; X64-AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
 ; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
 ; X64-AVX2-NEXT: vmovq %xmm1, %rax
-; X64-AVX2-NEXT: vpsrlq $32, %xmm1, %xmm2
-; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1229782938247303440,1229782938247303440]
-; X64-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm4
-; X64-AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
-; X64-AVX2-NEXT: vpaddq %xmm2, %xmm4, %xmm2
-; X64-AVX2-NEXT: vpsllq $32, %xmm2, %xmm2
-; X64-AVX2-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
-; X64-AVX2-NEXT: vpaddq %xmm2, %xmm1, %xmm1
-; X64-AVX2-NEXT: movabsq $76861433640456465, %rcx # imm = 0x111111111111111
-; X64-AVX2-NEXT: xorq %rax, %rcx
-; X64-AVX2-NEXT: movabsq $1229782938247303440, %rax # imm = 0x1111111111111110
+; X64-AVX2-NEXT: movabsq $1229782938247303440, %rcx # imm = 0x1111111111111110
+; X64-AVX2-NEXT: movabsq $76861433640456465, %rdx # imm = 0x111111111111111
+; X64-AVX2-NEXT: xorq %rax, %rdx
 ; X64-AVX2-NEXT: imulq %rcx, %rax
-; X64-AVX2-NEXT: vmovq %rax, %xmm2
+; X64-AVX2-NEXT: vmovq %rax, %xmm1
+; X64-AVX2-NEXT: imulq %rcx, %rdx
+; X64-AVX2-NEXT: vmovq %rdx, %xmm2
 ; X64-AVX2-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0
 ; X64-AVX2-NEXT: vpsrlw $4, %xmm0, %xmm1
 ; X64-AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
@@ -280,16 +259,17 @@ define <16 x i8> @produceShuffleVectorForByte(i8 zeroext %0) nounwind {
 ; X64-AVX512-NEXT: vpbroadcastb %edi, %xmm0
 ; X64-AVX512-NEXT: vptestnmb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
 ; X64-AVX512-NEXT: vmovdqu8 {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k1} {z}
-; X64-AVX512-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm1
 ; X64-AVX512-NEXT: vmovq %xmm0, %rax
-; X64-AVX512-NEXT: movabsq $76861433640456465, %rcx # imm = 0x111111111111111
-; X64-AVX512-NEXT: xorq %rax, %rcx
-; X64-AVX512-NEXT: movabsq $1229782938247303440, %rax # imm = 0x1111111111111110
+; X64-AVX512-NEXT: movabsq $1229782938247303440, %rcx # imm = 0x1111111111111110
+; X64-AVX512-NEXT: movabsq $76861433640456465, %rdx # imm = 0x111111111111111
+; X64-AVX512-NEXT: xorq %rax, %rdx
 ; X64-AVX512-NEXT: imulq %rcx, %rax
 ; X64-AVX512-NEXT: vmovq %rax, %xmm0
-; X64-AVX512-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
-; X64-AVX512-NEXT: vpsrlw $4, %xmm0, %xmm1
-; X64-AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X64-AVX512-NEXT: imulq %rcx, %rdx
+; X64-AVX512-NEXT: vmovq %rdx, %xmm1
+; X64-AVX512-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
+; X64-AVX512-NEXT: vpsrlw $4, %xmm1, %xmm0
+; X64-AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; X64-AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
 ; X64-AVX512-NEXT: retq
 entry:
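For context, a minimal LLVM IR sketch of the kind of pattern the updated comment describes; this is a hypothetical reduced case, not the actual produceShuffleVectorForByte body, with the constants borrowed from the checks above. The 64-bit lane is extracted for scalar use either way, so keeping the multiplies by the splat constant scalar (imulq) avoids the 64-bit vector multiply expansion (the pmuludq/psllq/paddq sequence) that the old SSE2/SSE4.2 output shows.

; Hypothetical reduced pattern (assumed for illustration only).
define <2 x i64> @mul_of_extracted(<2 x i64> %v) {
  %x  = extractelement <2 x i64> %v, i32 0      ; the extract is needed regardless
  %y  = xor i64 %x, 76861433640456465            ; 0x111111111111111
  %mx = mul i64 %x, 1229782938247303440          ; 0x1111111111111110
  %my = mul i64 %y, 1229782938247303440
  %r0 = insertelement <2 x i64> poison, i64 %mx, i32 0
  %r1 = insertelement <2 x i64> %r0, i64 %my, i32 1
  ret <2 x i64> %r1
}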