sx-aurora-dev · kaz7 · Jan 4, 2022 · Dec 21, 2021 · Dec 21, 2021 · Dec 21, 2021
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -708,7 +708,8 @@ Constant *llvm::ConstantFoldLoadFromConstPtr(Constant *C, Type *Ty,
   // is all undef or zero, we know what it loads.
   if (auto *GV = dyn_cast<GlobalVariable>(getUnderlyingObject(C))) {
     if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
-      if (GV->getInitializer()->isNullValue())
+      if (GV->getInitializer()->isNullValue() && !Ty->isX86_MMXTy() &&
+          !Ty->isX86_AMXTy())
         return Constant::getNullValue(Ty);
       if (isa<UndefValue>(GV->getInitializer()))
         return UndefValue::get(Ty);

diff --git a/llvm/lib/Target/VE/VVPInstrInfo.td b/llvm/lib/Target/VE/VVPInstrInfo.td
@@ -21,8 +21,14 @@
 ///// V(E) - VP internal nodes
 // fp node types
 
-def SDTFPBinOpVVP : SDTypeProfile<1, 4, [   // vvp_fadd, etc.
-  SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisFP<0>, SDTCisInt<3>, SDTCisSameNumEltsAs<0, 3>, IsVLVT<4>
+// BinaryFPOp(x,y,mask,vl): result=0, x=1, y=2, mask=3, vl=4
+def SDTFPBinOpVVP : SDTypeProfile<1, 4, [      // vvp_fadd, etc.
+  SDTCisSameAs<0, 1>,          // result and x have the same type
+  SDTCisSameAs<0, 2>,          // result and y have the same type
+  SDTCisFP<0>,                 // result is floating point
+  SDTCisInt<3>,                // mask is an integer (vector) type
+  SDTCisSameNumEltsAs<0, 3>,   // mask has as many elements as the result
+  IsVLVT<4>                    // vl; NOTE(review): presumably pins the VL type - confirm
 ]>;
 
 def SDTFPTernaryOpVVP : SDTypeProfile<1, 5, [  // vvp_ffma
@@ -207,10 +213,12 @@ def vvp_reduce_umax         : SDNode<"VEISD::VVP_REDUCE_UMAX", SDTReduceVVP>;
 // math funcs
 def vvp_fsqrt                : SDNode<"VEISD::VVP_FSQRT", SDTFPUnaryOpVVP>;
 
+// Commutative binary operator: matches RootOp with $lhs/$rhs in either order.
 class vvp_commutative<SDNode RootOp> :
-  PatFrags<(ops node:$lhs, node:$rhs, node:$mask, node:$vlen),
-                       [(RootOp node:$lhs, node:$rhs, node:$mask, node:$vlen),
-                        (RootOp node:$rhs, node:$lhs, node:$mask, node:$vlen)]>;
+  PatFrags<
+  (ops node:$lhs, node:$rhs, node:$mask, node:$vlen),
+  [(RootOp node:$lhs, node:$rhs, node:$mask, node:$vlen),
+   (RootOp node:$rhs, node:$lhs, node:$mask, node:$vlen)]>;
 
 class vvp_fma_commutative<SDNode RootOp> :
   PatFrags<(ops node:$X, node:$Y, node:$Z, node:$mask, node:$vlen),

diff --git a/llvm/lib/Target/VE/VVPInstrPatternsVec.td b/llvm/lib/Target/VE/VVPInstrPatternsVec.td
@@ -229,6 +229,14 @@ multiclass Binary_rv_vv<
   defm : Binary_vv<OpNode, DataVT, MaskVT, OpBaseName>;
 }
 
+// Binary operators that support broadcasts on LHS (rv) and RHS (vr).
+multiclass Binary_rv_vr_vv<
+    SDPatternOperator OpNode, ValueType ScalarVT, ValueType DataVT,
+    ValueType MaskVT, string OpBaseName> {
+  defm : Binary_rv<OpNode, ScalarVT, DataVT, MaskVT, OpBaseName>;
+  defm : Binary_vr_vv<OpNode, ScalarVT, DataVT, MaskVT, OpBaseName>;
+}
+
 // Expand both 64bit and 32 bit variant (256 elements)
 multiclass Binary_rv_vv_ShortLong<
     SDPatternOperator OpNode,
@@ -254,23 +262,6 @@ multiclass Binary_vr_vv_ShortLong<
                       ShortOpBaseName>;
 }
 
-// Binary operators that support broadcasts on LHS and RHS.
-multiclass Binary_all<
-    SDPatternOperator OpNode,
-    ValueType ScalarVT, ValueType DataVT,
-    ValueType MaskVT, string OpBaseName> {
-  defm : Binary_rv<OpNode, ScalarVT, DataVT, MaskVT, OpBaseName>;
-  defm : Binary_vr_vv<OpNode, ScalarVT, DataVT, MaskVT, OpBaseName>;
-}
-
-multiclass Binary_ShortLong<
-    SDPatternOperator OpNode,
-    ValueType LongScalarVT, ValueType LongDataVT, string LongOpBaseName,
-    ValueType ShortScalarVT, ValueType ShortDataVT, string ShortOpBaseName> {
-  defm : Binary_all<OpNode, LongScalarVT, LongDataVT, v256i1, LongOpBaseName>;
-  defm : Binary_all<OpNode, ShortScalarVT, ShortDataVT, v256i1, ShortOpBaseName>;
-}
-
 multiclass Ternary<
     SDPatternOperator OpNode,
     ValueType ScalarVT, ValueType DataVT,
@@ -332,6 +323,18 @@ multiclass Ternary_ShortLong<
 // Integer arithmetic (256 elements)
 defm : Unary_ShortLong<vvp_ctpop, i64, v256i64, "VPCNT", i32, v256i32, "PVPCNTLO">;
 
+// Expand both 64bit and 32 bit variant (256 elements) of a binary operator
+// that supports broadcasts on LHS and RHS.
+multiclass Binary_rv_vr_vv_ShortLong<
+    SDPatternOperator OpNode,
+    ValueType LongScalarVT, ValueType LongDataVT, string LongOpBaseName,
+    ValueType ShortScalarVT, ValueType ShortDataVT, string ShortOpBaseName> {
+  defm : Binary_rv_vr_vv<OpNode, LongScalarVT, LongDataVT, v256i1,
+                         LongOpBaseName>;
+  defm : Binary_rv_vr_vv<OpNode, ShortScalarVT, ShortDataVT, v256i1,
+                         ShortOpBaseName>;
+}
+
 defm : Binary_rv_vv_ShortLong<c_vvp_add,
                               i64, v256i64, "VADDSL",
                               i32, v256i32, "VADDSWSX">;
@@ -341,6 +344,12 @@ defm : Binary_rv_vv_ShortLong<vvp_sub,
 defm : Binary_rv_vv_ShortLong<c_vvp_mul,
                               i64, v256i64, "VMULSL",
                               i32, v256i32, "VMULSWSX">;
+defm : Binary_rv_vr_vv_ShortLong<vvp_sdiv,
+                                 i64, v256i64, "VDIVSL",
+                                 i32, v256i32, "VDIVSWSX">;
+defm : Binary_rv_vr_vv_ShortLong<vvp_udiv,
+                                 i64, v256i64, "VDIVUL",
+                                 i32, v256i32, "VDIVUW">;
 defm : Binary_rv_vv_ShortLong<c_vvp_and,
                               i64, v256i64, "VAND",
                               i32, v256i32, "PVANDLO">;
@@ -351,9 +360,6 @@ defm : Binary_rv_vv_ShortLong<c_vvp_xor,
                               i64, v256i64, "VXOR",
                               i32, v256i32, "PVXORLO">;
 
-defm : Binary_ShortLong<vvp_sdiv, i64, v256i64, "VDIVSL", i32, v256i32, "VDIVSWSX">;
-defm : Binary_ShortLong<vvp_udiv, i64, v256i64, "VDIVUL", i32, v256i32, "VDIVUW">;
-
 defm : Binary_vr_vv_ShortLong<vvp_shl,
                               i64, v256i64, "VSLL",
                               i32, v256i32, "PVSLLLO">;
@@ -365,19 +371,36 @@ defm : Binary_vr_vv_ShortLong<vvp_srl,
                               i32, v256i32, "PVSRLLO">;
 
 // Floating-point arithmetic (256 elements)
-defm : Unary_ShortLong<vvp_frcp, f64, v256f64, "VRCPD", f32, v256f32, "VRCPS">; 
-defm : Unary_ShortLong<vvp_fsqrt, f64, v256f64, "VFSQRTD", f32, v256f32, "VFSQRTS">; 
-defm : Binary_rv_vv_ShortLong<c_vvp_fadd, f64, v256f64, "VFADDD", f32, v256f32, "PVFADDUP">;
-defm : Binary_rv_vv_ShortLong<vvp_fsub,  f64, v256f64, "VFSUBD", f32, v256f32, "PVFSUBUP">;
-defm : Binary_rv_vv_ShortLong<c_vvp_fmul, f64, v256f64, "VFMULD", f32, v256f32, "PVFMULUP">;
-defm : Binary_ShortLong<vvp_fdiv,  f64, v256f64, "VFDIVD", f32, v256f32, "VFDIVS">;
-
-defm : Binary_rv_vv_ShortLong<c_vvp_fminnum, f64, v256f64, "VFMIND", f32, v256f32, "VFMINS">;
-defm : Binary_rv_vv_ShortLong<c_vvp_fmaxnum, f64, v256f64, "VFMAXD", f32, v256f32, "VFMAXS">;
-
-defm : Ternary_ShortLong<c_vvp_ffma, f64, v256f64, "VFMADD", f32, v256f32, "VFMADS">;
-defm : Ternary_ShortLong<c_vvp_ffms, f64, v256f64, "VFMSBD", f32, v256f32, "VFMSBS">;
-defm : Ternary_ShortLong<c_vvp_ffmsn, f64, v256f64, "VFNMSBD", f32, v256f32, "VFNMSBS">;
+defm : Unary_ShortLong<vvp_frcp,
+                       f64, v256f64, "VRCPD", f32, v256f32, "VRCPS">;
+defm : Unary_ShortLong<vvp_fsqrt,
+                       f64, v256f64, "VFSQRTD", f32, v256f32, "VFSQRTS">;
+defm : Binary_rv_vv_ShortLong<c_vvp_fadd,
+                              f64, v256f64, "VFADDD",
+                              f32, v256f32, "PVFADDUP">;
+defm : Binary_rv_vv_ShortLong<c_vvp_fmul,
+                              f64, v256f64, "VFMULD",
+                              f32, v256f32, "PVFMULUP">;
+defm : Binary_rv_vv_ShortLong<vvp_fsub,
+                              f64, v256f64, "VFSUBD",
+                              f32, v256f32, "PVFSUBUP">;
+defm : Binary_rv_vr_vv_ShortLong<vvp_fdiv,
+                                 f64, v256f64, "VFDIVD",
+                                 f32, v256f32, "VFDIVS">;
+
+defm : Binary_rv_vv_ShortLong<c_vvp_fminnum,
+                              f64, v256f64, "VFMIND",
+                              f32, v256f32, "VFMINS">;
+defm : Binary_rv_vv_ShortLong<c_vvp_fmaxnum,
+                              f64, v256f64, "VFMAXD",
+                              f32, v256f32, "VFMAXS">;
+
+defm : Ternary_ShortLong<c_vvp_ffma,
+                         f64, v256f64, "VFMADD", f32, v256f32, "VFMADS">;
+defm : Ternary_ShortLong<c_vvp_ffms,
+                         f64, v256f64, "VFMSBD", f32, v256f32, "VFMSBS">;
+defm : Ternary_ShortLong<c_vvp_ffmsn,
+                         f64, v256f64, "VFNMSBD", f32, v256f32, "VFNMSBS">;
 // TODO: vvp_ffman
 
 ///// Selection /////

diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -305,8 +305,9 @@ static bool CleanupConstantGlobalUsers(GlobalVariable *GV,
     else if (auto *LI = dyn_cast<LoadInst>(U)) {
       // A load from zeroinitializer is always zeroinitializer, regardless of
       // any applied offset.
-      if (Init->isNullValue()) {
-        LI->replaceAllUsesWith(Constant::getNullValue(LI->getType()));
+      Type *Ty = LI->getType();
+      if (Init->isNullValue() && !Ty->isX86_MMXTy() && !Ty->isX86_AMXTy()) {
+        LI->replaceAllUsesWith(Constant::getNullValue(Ty));
         EraseFromParent(LI);
         continue;
       }
@@ -316,8 +317,7 @@ static bool CleanupConstantGlobalUsers(GlobalVariable *GV,
       PtrOp = PtrOp->stripAndAccumulateConstantOffsets(
           DL, Offset, /* AllowNonInbounds */ true);
       if (PtrOp == GV) {
-        if (auto *Value = ConstantFoldLoadFromConst(Init, LI->getType(),
-                                                    Offset, DL)) {
+        if (auto *Value = ConstantFoldLoadFromConst(Init, Ty, Offset, DL)) {
           LI->replaceAllUsesWith(Value);
           EraseFromParent(LI);
         }

diff --git a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
@@ -664,10 +664,7 @@ Instruction *InstCombinerImpl::foldPHIArgLoadIntoPHI(PHINode &PN) {
     return nullptr;
 
   // When processing loads, we need to propagate two bits of information to the
-  // sunk load: whether it is volatile, and what its alignment is.  We currently
-  // don't sink loads when some have their alignment specified and some don't.
-  // visitLoadInst will propagate an alignment onto the load when TD is around,
-  // and if TD isn't around, we can't handle the mixed case.
+  // sunk load: whether it is volatile, and what its alignment is.
   bool isVolatile = FirstLI->isVolatile();
   Align LoadAlignment = FirstLI->getAlign();
   unsigned LoadAddrSpace = FirstLI->getPointerAddressSpace();
@@ -699,7 +696,7 @@ Instruction *InstCombinerImpl::foldPHIArgLoadIntoPHI(PHINode &PN) {
         !isSafeAndProfitableToSinkLoad(LI))
       return nullptr;
 
-    LoadAlignment = std::min(LoadAlignment, Align(LI->getAlign()));
+    LoadAlignment = std::min(LoadAlignment, LI->getAlign());
 
     // If the PHI is of volatile loads and the load block has multiple
     // successors, sinking it would remove a load of the volatile value from

diff --git a/llvm/test/CodeGen/VE/Vector/vp_fadd.ll b/llvm/test/CodeGen/VE/Vector/vp_fadd.ll
@@ -1,8 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -march=ve -mattr=+vpu | FileCheck %s
 
-define fastcc <256 x float> @test_vp_fadd_256f32(<256 x float> %i0, <256 x float> %i1, <256 x i1> %m, i32 %n) {
-; CHECK-LABEL: test_vp_fadd_256f32:
+declare <256 x float> @llvm.vp.fadd.v256f32(<256 x float>, <256 x float>, <256 x i1>, i32)
+
+define fastcc <256 x float> @test_vp_fadd_v256f32_vv(<256 x float> %i0, <256 x float> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fadd_v256f32_vv:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    and %s0, %s0, (32)0
 ; CHECK-NEXT:    lvl %s0
@@ -12,5 +14,68 @@ define fastcc <256 x float> @test_vp_fadd_256f32(<256 x float> %i0, <256 x float
   ret <256 x float> %r0
 }
 
-; integer arith
-declare <256 x float> @llvm.vp.fadd.v256f32(<256 x float>, <256 x float>, <256 x i1>, i32)
+define fastcc <256 x float> @test_vp_fadd_v256f32_rv(float %s0, <256 x float> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fadd_v256f32_rv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    pvfadd.up %v0, %s0, %v0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <256 x float> undef, float %s0, i32 0
+  %i0 = shufflevector <256 x float> %xins, <256 x float> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x float> @llvm.vp.fadd.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x i1> %m, i32 %n)
+  ret <256 x float> %r0
+}
+
+define fastcc <256 x float> @test_vp_fadd_v256f32_vr(<256 x float> %i0, float %s1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fadd_v256f32_vr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    pvfadd.up %v0, %s0, %v0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %yins = insertelement <256 x float> undef, float %s1, i32 0
+  %i1 = shufflevector <256 x float> %yins, <256 x float> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x float> @llvm.vp.fadd.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x i1> %m, i32 %n)
+  ret <256 x float> %r0
+}
+
+
+declare <256 x double> @llvm.vp.fadd.v256f64(<256 x double>, <256 x double>, <256 x i1>, i32)
+
+define fastcc <256 x double> @test_vp_fadd_v256f64_vv(<256 x double> %i0, <256 x double> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fadd_v256f64_vv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vfadd.d %v0, %v0, %v1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r0 = call <256 x double> @llvm.vp.fadd.v256f64(<256 x double> %i0, <256 x double> %i1, <256 x i1> %m, i32 %n)
+  ret <256 x double> %r0
+}
+
+define fastcc <256 x double> @test_vp_fadd_v256f64_rv(double %s0, <256 x double> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fadd_v256f64_rv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfadd.d %v0, %s0, %v0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <256 x double> undef, double %s0, i32 0
+  %i0 = shufflevector <256 x double> %xins, <256 x double> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x double> @llvm.vp.fadd.v256f64(<256 x double> %i0, <256 x double> %i1, <256 x i1> %m, i32 %n)
+  ret <256 x double> %r0
+}
+
+define fastcc <256 x double> @test_vp_fadd_v256f64_vr(<256 x double> %i0, double %s1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fadd_v256f64_vr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfadd.d %v0, %s0, %v0
+; CHECK-NEXT:    b.l.t (, %s10)
+  %yins = insertelement <256 x double> undef, double %s1, i32 0
+  %i1 = shufflevector <256 x double> %yins, <256 x double> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x double> @llvm.vp.fadd.v256f64(<256 x double> %i0, <256 x double> %i1, <256 x i1> %m, i32 %n)
+  ret <256 x double> %r0
+}
diff --git a/llvm/test/CodeGen/VE/Vector/vp_fdiv.ll b/llvm/test/CodeGen/VE/Vector/vp_fdiv.ll
@@ -0,0 +1,81 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -march=ve -mattr=+vpu | FileCheck %s
+
+declare <256 x float> @llvm.vp.fdiv.v256f32(<256 x float>, <256 x float>, <256 x i1>, i32)
+
+define fastcc <256 x float> @test_vp_fdiv_v256f32_vv(<256 x float> %i0, <256 x float> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fdiv_v256f32_vv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vfdiv.s %v0, %v0, %v1, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r0 = call <256 x float> @llvm.vp.fdiv.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x i1> %m, i32 %n)
+  ret <256 x float> %r0
+}
+
+define fastcc <256 x float> @test_vp_fdiv_v256f32_rv(float %s0, <256 x float> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fdiv_v256f32_rv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfdiv.s %v0, %s0, %v0, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <256 x float> undef, float %s0, i32 0
+  %i0 = shufflevector <256 x float> %xins, <256 x float> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x float> @llvm.vp.fdiv.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x i1> %m, i32 %n)
+  ret <256 x float> %r0
+}
+
+define fastcc <256 x float> @test_vp_fdiv_v256f32_vr(<256 x float> %i0, float %s1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fdiv_v256f32_vr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfdiv.s %v0, %v0, %s0, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %yins = insertelement <256 x float> undef, float %s1, i32 0
+  %i1 = shufflevector <256 x float> %yins, <256 x float> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x float> @llvm.vp.fdiv.v256f32(<256 x float> %i0, <256 x float> %i1, <256 x i1> %m, i32 %n)
+  ret <256 x float> %r0
+}
+
+
+declare <256 x double> @llvm.vp.fdiv.v256f64(<256 x double>, <256 x double>, <256 x i1>, i32)
+
+define fastcc <256 x double> @test_vp_fdiv_v256f64_vv(<256 x double> %i0, <256 x double> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fdiv_v256f64_vv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s0, %s0, (32)0
+; CHECK-NEXT:    lvl %s0
+; CHECK-NEXT:    vfdiv.d %v0, %v0, %v1, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %r0 = call <256 x double> @llvm.vp.fdiv.v256f64(<256 x double> %i0, <256 x double> %i1, <256 x i1> %m, i32 %n)
+  ret <256 x double> %r0
+}
+
+define fastcc <256 x double> @test_vp_fdiv_v256f64_rv(double %s0, <256 x double> %i1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fdiv_v256f64_rv:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfdiv.d %v0, %s0, %v0, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %xins = insertelement <256 x double> undef, double %s0, i32 0
+  %i0 = shufflevector <256 x double> %xins, <256 x double> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x double> @llvm.vp.fdiv.v256f64(<256 x double> %i0, <256 x double> %i1, <256 x i1> %m, i32 %n)
+  ret <256 x double> %r0
+}
+
+define fastcc <256 x double> @test_vp_fdiv_v256f64_vr(<256 x double> %i0, double %s1, <256 x i1> %m, i32 %n) {
+; CHECK-LABEL: test_vp_fdiv_v256f64_vr:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and %s1, %s1, (32)0
+; CHECK-NEXT:    lvl %s1
+; CHECK-NEXT:    vfdiv.d %v0, %v0, %s0, %vm1
+; CHECK-NEXT:    b.l.t (, %s10)
+  %yins = insertelement <256 x double> undef, double %s1, i32 0
+  %i1 = shufflevector <256 x double> %yins, <256 x double> undef, <256 x i32> zeroinitializer
+  %r0 = call <256 x double> @llvm.vp.fdiv.v256f64(<256 x double> %i0, <256 x double> %i1, <256 x i1> %m, i32 %n)
+  ret <256 x double> %r0
+}