@@ -1365,6 +1365,282 @@ define <32 x i8> @blend_mask_cond_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %z
  ret <32 x i8> %r
}
+ define void @store_blend_load_v4i64(ptr %a0, ptr %a1, ptr %a2) {
+ ; AVX1-LABEL: store_blend_load_v4i64:
+ ; AVX1: # %bb.0:
+ ; AVX1-NEXT: vmovapd (%rsi), %ymm0
+ ; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
+ ; AVX1-NEXT: # xmm1 = mem[0,0]
+ ; AVX1-NEXT: vpxor 16(%rdi), %xmm1, %xmm2
+ ; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775815,9223372036854775815]
+ ; AVX1-NEXT: # xmm3 = mem[0,0]
+ ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
+ ; AVX1-NEXT: vpxor (%rdi), %xmm1, %xmm1
+ ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1
+ ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+ ; AVX1-NEXT: vblendvpd %ymm1, (%rdi), %ymm0, %ymm0
+ ; AVX1-NEXT: vmovapd %ymm0, (%rdx)
+ ; AVX1-NEXT: vzeroupper
+ ; AVX1-NEXT: retq
+ ;
+ ; AVX2-LABEL: store_blend_load_v4i64:
+ ; AVX2: # %bb.0:
+ ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+ ; AVX2-NEXT: vmovapd (%rsi), %ymm1
+ ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+ ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2
+ ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775815,9223372036854775815,9223372036854775815,9223372036854775815]
+ ; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
+ ; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+ ; AVX2-NEXT: vmovapd %ymm0, (%rdx)
+ ; AVX2-NEXT: vzeroupper
+ ; AVX2-NEXT: retq
+ ;
+ ; AVX512F-LABEL: store_blend_load_v4i64:
+ ; AVX512F: # %bb.0:
+ ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+ ; AVX512F-NEXT: vmovdqa (%rsi), %ymm1
+ ; AVX512F-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %k1
+ ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+ ; AVX512F-NEXT: vmovdqa %ymm1, (%rdx)
+ ; AVX512F-NEXT: vzeroupper
+ ; AVX512F-NEXT: retq
+ ;
+ ; AVX512VL-LABEL: store_blend_load_v4i64:
+ ; AVX512VL: # %bb.0:
+ ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+ ; AVX512VL-NEXT: vmovdqa (%rsi), %ymm1
+ ; AVX512VL-NEXT: vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %k1
+ ; AVX512VL-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1}
+ ; AVX512VL-NEXT: vmovdqa %ymm1, (%rdx)
+ ; AVX512VL-NEXT: vzeroupper
+ ; AVX512VL-NEXT: retq
+ ;
+ ; XOP-LABEL: store_blend_load_v4i64:
+ ; XOP: # %bb.0:
+ ; XOP-NEXT: vmovapd (%rsi), %ymm0
+ ; XOP-NEXT: vpmovsxbq {{.*#+}} xmm1 = [7,7]
+ ; XOP-NEXT: vpcomltuq 16(%rdi), %xmm1, %xmm2
+ ; XOP-NEXT: vpcomltuq (%rdi), %xmm1, %xmm1
+ ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+ ; XOP-NEXT: vblendvpd %ymm1, (%rdi), %ymm0, %ymm0
+ ; XOP-NEXT: vmovapd %ymm0, (%rdx)
+ ; XOP-NEXT: vzeroupper
+ ; XOP-NEXT: retq
+   %v0 = load <4 x i64>, ptr %a0
+   %v1 = load <4 x i64>, ptr %a1
+   %cmp = icmp ugt <4 x i64> %v0, <i64 7, i64 7, i64 7, i64 7>
+   %res = select <4 x i1> %cmp, <4 x i64> %v0, <4 x i64> %v1
+   store <4 x i64> %res, ptr %a2
+   ret void
+ }
+
+ define void @store_blend_load_v8i32(ptr %a0, ptr %a1, ptr %a2) {
+ ; AVX1-LABEL: store_blend_load_v8i32:
+ ; AVX1: # %bb.0:
+ ; AVX1-NEXT: vmovaps (%rsi), %ymm0
+ ; AVX1-NEXT: vmovdqa (%rdi), %xmm1
+ ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2
+ ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [8,8,8,8]
+ ; AVX1-NEXT: vpmaxud %xmm3, %xmm2, %xmm4
+ ; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2
+ ; AVX1-NEXT: vpmaxud %xmm3, %xmm1, %xmm3
+ ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
+ ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+ ; AVX1-NEXT: vblendvps %ymm1, (%rdi), %ymm0, %ymm0
+ ; AVX1-NEXT: vmovaps %ymm0, (%rdx)
+ ; AVX1-NEXT: vzeroupper
+ ; AVX1-NEXT: retq
+ ;
+ ; AVX2-LABEL: store_blend_load_v8i32:
+ ; AVX2: # %bb.0:
+ ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+ ; AVX2-NEXT: vmovaps (%rsi), %ymm1
+ ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8]
+ ; AVX2-NEXT: vpmaxud %ymm2, %ymm0, %ymm2
+ ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm0, %ymm2
+ ; AVX2-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+ ; AVX2-NEXT: vmovaps %ymm0, (%rdx)
+ ; AVX2-NEXT: vzeroupper
+ ; AVX2-NEXT: retq
+ ;
+ ; AVX512F-LABEL: store_blend_load_v8i32:
+ ; AVX512F: # %bb.0:
+ ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+ ; AVX512F-NEXT: vmovdqa (%rsi), %ymm1
+ ; AVX512F-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %k1
+ ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
+ ; AVX512F-NEXT: vmovdqa %ymm1, (%rdx)
+ ; AVX512F-NEXT: vzeroupper
+ ; AVX512F-NEXT: retq
+ ;
+ ; AVX512VL-LABEL: store_blend_load_v8i32:
+ ; AVX512VL: # %bb.0:
+ ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+ ; AVX512VL-NEXT: vmovdqa (%rsi), %ymm1
+ ; AVX512VL-NEXT: vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %k1
+ ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
+ ; AVX512VL-NEXT: vmovdqa %ymm1, (%rdx)
+ ; AVX512VL-NEXT: vzeroupper
+ ; AVX512VL-NEXT: retq
+ ;
+ ; XOP-LABEL: store_blend_load_v8i32:
+ ; XOP: # %bb.0:
+ ; XOP-NEXT: vmovaps (%rsi), %ymm0
+ ; XOP-NEXT: vbroadcastss {{.*#+}} xmm1 = [7,7,7,7]
+ ; XOP-NEXT: vpcomltud 16(%rdi), %xmm1, %xmm2
+ ; XOP-NEXT: vpcomltud (%rdi), %xmm1, %xmm1
+ ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+ ; XOP-NEXT: vblendvps %ymm1, (%rdi), %ymm0, %ymm0
+ ; XOP-NEXT: vmovaps %ymm0, (%rdx)
+ ; XOP-NEXT: vzeroupper
+ ; XOP-NEXT: retq
+   %v0 = load <8 x i32>, ptr %a0
+   %v1 = load <8 x i32>, ptr %a1
+   %cmp = icmp ugt <8 x i32> %v0, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+   %res = select <8 x i1> %cmp, <8 x i32> %v0, <8 x i32> %v1
+   store <8 x i32> %res, ptr %a2
+   ret void
+ }
+
+ define void @store_blend_load_v16i16(ptr %a0, ptr %a1, ptr %a2) {
+ ; AVX1-LABEL: store_blend_load_v16i16:
+ ; AVX1: # %bb.0:
+ ; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+ ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
+ ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8]
+ ; AVX1-NEXT: vpmaxuw %xmm2, %xmm1, %xmm3
+ ; AVX1-NEXT: vpcmpeqw %xmm3, %xmm1, %xmm1
+ ; AVX1-NEXT: vpmaxuw %xmm2, %xmm0, %xmm2
+ ; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0
+ ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+ ; AVX1-NEXT: vandnps (%rsi), %ymm0, %ymm1
+ ; AVX1-NEXT: vandps (%rdi), %ymm0, %ymm0
+ ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
+ ; AVX1-NEXT: vmovaps %ymm0, (%rdx)
+ ; AVX1-NEXT: vzeroupper
+ ; AVX1-NEXT: retq
+ ;
+ ; AVX2-LABEL: store_blend_load_v16i16:
+ ; AVX2: # %bb.0:
+ ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+ ; AVX2-NEXT: vmovdqa (%rsi), %ymm1
+ ; AVX2-NEXT: vpmaxuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
+ ; AVX2-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm2
+ ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+ ; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
+ ; AVX2-NEXT: vzeroupper
+ ; AVX2-NEXT: retq
+ ;
+ ; AVX512F-LABEL: store_blend_load_v16i16:
+ ; AVX512F: # %bb.0:
+ ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+ ; AVX512F-NEXT: vmovdqa (%rsi), %ymm1
+ ; AVX512F-NEXT: vpmaxuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
+ ; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm2
+ ; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+ ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
+ ; AVX512F-NEXT: vzeroupper
+ ; AVX512F-NEXT: retq
+ ;
+ ; AVX512VL-LABEL: store_blend_load_v16i16:
+ ; AVX512VL: # %bb.0:
+ ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+ ; AVX512VL-NEXT: vpmaxuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
+ ; AVX512VL-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm1
+ ; AVX512VL-NEXT: vpternlogq $202, (%rsi), %ymm0, %ymm1
+ ; AVX512VL-NEXT: vmovdqa %ymm1, (%rdx)
+ ; AVX512VL-NEXT: vzeroupper
+ ; AVX512VL-NEXT: retq
+ ;
+ ; XOP-LABEL: store_blend_load_v16i16:
+ ; XOP: # %bb.0:
+ ; XOP-NEXT: vmovdqa (%rdi), %ymm0
+ ; XOP-NEXT: vbroadcastss {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7]
+ ; XOP-NEXT: vpcomltuw 16(%rdi), %xmm1, %xmm2
+ ; XOP-NEXT: vpcomltuw (%rdi), %xmm1, %xmm1
+ ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+ ; XOP-NEXT: vpcmov %ymm1, (%rsi), %ymm0, %ymm0
+ ; XOP-NEXT: vmovdqa %ymm0, (%rdx)
+ ; XOP-NEXT: vzeroupper
+ ; XOP-NEXT: retq
+   %v0 = load <16 x i16>, ptr %a0
+   %v1 = load <16 x i16>, ptr %a1
+   %cmp = icmp ugt <16 x i16> %v0, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+   %res = select <16 x i1> %cmp, <16 x i16> %v0, <16 x i16> %v1
+   store <16 x i16> %res, ptr %a2
+   ret void
+ }
+
+ define void @store_blend_load_v32i8(ptr %a0, ptr %a1, ptr %a2) {
+ ; AVX1-LABEL: store_blend_load_v32i8:
+ ; AVX1: # %bb.0:
+ ; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+ ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
+ ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+ ; AVX1-NEXT: vpmaxub %xmm2, %xmm1, %xmm3
+ ; AVX1-NEXT: vpcmpeqb %xmm3, %xmm1, %xmm1
+ ; AVX1-NEXT: vpmaxub %xmm2, %xmm0, %xmm2
+ ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
+ ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+ ; AVX1-NEXT: vandnps (%rsi), %ymm0, %ymm1
+ ; AVX1-NEXT: vandps (%rdi), %ymm0, %ymm0
+ ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
+ ; AVX1-NEXT: vmovaps %ymm0, (%rdx)
+ ; AVX1-NEXT: vzeroupper
+ ; AVX1-NEXT: retq
+ ;
+ ; AVX2-LABEL: store_blend_load_v32i8:
+ ; AVX2: # %bb.0:
+ ; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+ ; AVX2-NEXT: vmovdqa (%rsi), %ymm1
+ ; AVX2-NEXT: vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
+ ; AVX2-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm2
+ ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+ ; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
+ ; AVX2-NEXT: vzeroupper
+ ; AVX2-NEXT: retq
+ ;
+ ; AVX512F-LABEL: store_blend_load_v32i8:
+ ; AVX512F: # %bb.0:
+ ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
+ ; AVX512F-NEXT: vmovdqa (%rsi), %ymm1
+ ; AVX512F-NEXT: vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
+ ; AVX512F-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm2
+ ; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+ ; AVX512F-NEXT: vmovdqa %ymm0, (%rdx)
+ ; AVX512F-NEXT: vzeroupper
+ ; AVX512F-NEXT: retq
+ ;
+ ; AVX512VL-LABEL: store_blend_load_v32i8:
+ ; AVX512VL: # %bb.0:
+ ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+ ; AVX512VL-NEXT: vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
+ ; AVX512VL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1
+ ; AVX512VL-NEXT: vpternlogq $202, (%rsi), %ymm0, %ymm1
+ ; AVX512VL-NEXT: vmovdqa %ymm1, (%rdx)
+ ; AVX512VL-NEXT: vzeroupper
+ ; AVX512VL-NEXT: retq
+ ;
+ ; XOP-LABEL: store_blend_load_v32i8:
+ ; XOP: # %bb.0:
+ ; XOP-NEXT: vmovdqa (%rdi), %ymm0
+ ; XOP-NEXT: vbroadcastss {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
+ ; XOP-NEXT: vpcomltub 16(%rdi), %xmm1, %xmm2
+ ; XOP-NEXT: vpcomltub (%rdi), %xmm1, %xmm1
+ ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+ ; XOP-NEXT: vpcmov %ymm1, (%rsi), %ymm0, %ymm0
+ ; XOP-NEXT: vmovdqa %ymm0, (%rdx)
+ ; XOP-NEXT: vzeroupper
+ ; XOP-NEXT: retq
+   %v0 = load <32 x i8>, ptr %a0
+   %v1 = load <32 x i8>, ptr %a1
+   %cmp = icmp ugt <32 x i8> %v0, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
+   %res = select <32 x i1> %cmp, <32 x i8> %v0, <32 x i8> %v1
+   store <32 x i8> %res, ptr %a2
+   ret void
+ }
+
define void @PR46531(ptr %x, ptr %y, ptr %z) {
; AVX12-LABEL: PR46531:
; AVX12: # %bb.0: