sx-aurora-dev
diff --git a/‎.travis.yml
Lines changed: 10 additions & 0 deletions b/‎.travis.yml
Lines changed: 10 additions & 0 deletions
diff --git a/‎README.md
Lines changed: 2 additions & 0 deletions b/‎README.md
Lines changed: 2 additions & 0 deletions
diff --git a/‎llvm/include/llvm/CodeGen/ISDOpcodes.h
Lines changed: 2 additions & 73 deletions b/‎llvm/include/llvm/CodeGen/ISDOpcodes.h
Lines changed: 2 additions & 73 deletions
diff --git a/‎llvm/include/llvm/IR/Intrinsics.td
Lines changed: 5 additions & 0 deletions b/‎llvm/include/llvm/IR/Intrinsics.td
Lines changed: 5 additions & 0 deletions
diff --git a/‎llvm/include/llvm/IR/VPIntrinsics.def
Lines changed: 24 additions & 0 deletions b/‎llvm/include/llvm/IR/VPIntrinsics.def
Lines changed: 24 additions & 0 deletions
diff --git a/‎llvm/lib/Target/VE/VVPInstrInfo.td
Lines changed: 8 additions & 0 deletions b/‎llvm/lib/Target/VE/VVPInstrInfo.td
Lines changed: 8 additions & 0 deletions
diff --git a/‎llvm/lib/Target/VE/VVPInstrPatterns.td
Lines changed: 2 additions & 0 deletions b/‎llvm/lib/Target/VE/VVPInstrPatterns.td
Lines changed: 2 additions & 0 deletions
diff --git a/‎llvm/lib/Target/VE/VVPNodes.inc
Lines changed: 4 additions & 0 deletions b/‎llvm/lib/Target/VE/VVPNodes.inc
Lines changed: 4 additions & 0 deletions
diff --git a/‎llvm/test/CodeGen/VE/Vector/extract_insert_vector_elt.ll
Lines changed: 12 additions & 12 deletions b/‎llvm/test/CodeGen/VE/Vector/extract_insert_vector_elt.ll
Lines changed: 12 additions & 12 deletions
@@ -0,0 +1,10 @@
+language: cpp
+dist: focal
+compiler: gcc
+before_install:
+- sudo apt-get -y install ninja-build
+script:
+  - mkdir -p build
+  - cd build/
+  - cmake ../llvm -DLLVM_TARGETS_TO_BUILD=VE -DCMAKE_BUILD_TYPE=Release -G Ninja -DBUILD_SHARED_LIBS=on -DCMAKE_CXX_FLAGS_RELEASE="-O0 -DNDEBUG" -DLLVM_ENABLE_ASSERTIONS=OFF
+  - ninja lib/libLLVMVEAsmParser.so lib/libLLVMVEDesc.so lib/libLLVMVEInfo.so lib/libLLVMVECodeGen.so lib/libLLVMVEDisassembler.so
@@ -1,5 +1,7 @@
 # LLVM for NEC SX-Aurora VE (llvm-ve-rv 1.8-dev)
 
+[![Build Status](https://travis-ci.com/sx-aurora-dev/llvm-project.svg?branch=hpce%2Fdevelop)](https://travis-ci.com/sx-aurora-dev/llvm-project)
+
 This is a fork of the LLVM repository with support for the NEC
 SX-Aurora TSUBASA Vector Engine (VE).
 
 
@@ -228,14 +228,8 @@ enum NodeType {
   SREM,
   UREM,
 
-  // Vector-predicated integer binary arithmetic
-  VP_ADD,
-  VP_SUB,
-  VP_MUL,
-  VP_SDIV,
-  VP_UDIV,
-  VP_SREM,
-  VP_UREM,
+#define BEGIN_REGISTER_VP_SDNODE(VPSDNAME, ...) VPSDNAME,
+#include "llvm/IR/VPIntrinsics.def"
 
   /// SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing
   /// a signed/unsigned value of type i[2*N], and return the full value as
@@ -354,9 +348,6 @@ enum NodeType {
   FDIV,
   FREM,
 
-  // Vector predicated floating point ops.
-  VP_FADD, VP_FSUB, VP_FMUL, VP_FDIV, VP_FREM,
-
   /// Constrained versions of the binary floating point operators.
   /// These will be lowered to the simple operators before final selection.
   /// They are used to limit optimizations while the DAG is being
@@ -441,7 +432,6 @@ enum NodeType {
 
   /// FMA - Perform a * b + c with no intermediate rounding step.
   FMA,
-  VP_FMA,
 
   /// FMAD - Perform a * b + c, while getting the same result as the
   /// separately rounded operations.
@@ -532,19 +522,6 @@ enum NodeType {
   /// in terms of the element size of VEC1/VEC2, not in terms of bytes.
   VECTOR_SHUFFLE,
 
-  /// VP_VSHIFT(VEC1, AMOUNT, MASK, VLEN) - Returns a vector, of the same type as
-  /// VEC1. AMOUNT is an integer value. The returned vector is equivalent
-  /// to VEC1 shifted by AMOUNT (RETURNED_VEC[idx] = VEC1[idx + AMOUNT]).
-  VP_VSHIFT,
-
-  /// VP_COMPRESS(VEC1, MASK, VLEN) - Returns a vector, of the same type as
-  /// VEC1.
-  VP_COMPRESS,
-
-  /// VP_EXPAND(VEC1, MASK, VLEN) - Returns a vector, of the same type as
-  /// VEC1.
-  VP_EXPAND,
-
   /// SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a
   /// scalar value into element 0 of the resultant vector type.  The top
   /// elements 1 to N-1 of the N-element vector are undefined.  The type
@@ -578,9 +555,6 @@ enum NodeType {
   OR,
   XOR,
 
-  // Vector-predicated bitwise operators
-  VP_AND, VP_OR, VP_XOR,
-
   /// ABS - Determine the unsigned absolute value of a signed integer value of
   /// the same bitwidth.
   /// Note: A value of INT_MIN will return INT_MIN, no saturation or overflow
@@ -609,7 +583,6 @@ enum NodeType {
   ROTR,
   FSHL,
   FSHR,
-  VP_SHL, VP_SRA, VP_SRL,
 
   /// Byte Swap and Counting operators.
   BSWAP,
@@ -634,7 +607,6 @@ enum NodeType {
   /// change the condition type in order to match the VSELECT node using a
   /// pattern. The condition follows the BooleanContent format of the target.
   VSELECT,
-  VP_SELECT,
 
   /// Select with condition operator - This selects between a true value and
   /// a false value (ops #2 and #3) based on the boolean result of comparing
@@ -649,7 +621,6 @@ enum NodeType {
   /// them with (op #2) as a CondCodeSDNode. If the operands are vector types
   /// then the result type must also be a vector type.
   SETCC,
-  VP_SETCC,
 
   /// Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but
   /// op #2 is a boolean indicating if there is an incoming carry. This
@@ -688,8 +659,6 @@ enum NodeType {
   /// depends on the first letter) to floating point.
   SINT_TO_FP,
   UINT_TO_FP,
-  VP_SINT_TO_FP,
-  VP_UINT_TO_FP,
 
   /// SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to
   /// sign extend a small value in a large integer register (e.g. sign
@@ -736,8 +705,6 @@ enum NodeType {
   /// the FP value cannot fit in the integer type, the results are undefined.
   FP_TO_SINT,
   FP_TO_UINT,
-  VP_FP_TO_SINT,
-  VP_FP_TO_UINT,
 
   /// X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type
   /// down to the precision of the destination VT.  TRUNC is a flag, which is
@@ -763,7 +730,6 @@ enum NodeType {
 
   /// X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
   FP_EXTEND,
-  VP_FP_EXTEND,
 
   /// BITCAST - This operator converts between integer, vector and FP
   /// values, as if the value was stored to memory with one type and loaded
@@ -821,12 +787,6 @@ enum NodeType {
   LRINT,
   LLRINT,
 
-  // Vector-predicated unary floating-point ops
-  VP_FNEG, VP_FABS, VP_FSQRT, VP_FCBRT, VP_FSIN, VP_FCOS, VP_FPOWI, VP_FPOW,
-  VP_FLOG, VP_FLOG2, VP_FLOG10, VP_FEXP, VP_FEXP2,
-  VP_FCEIL, VP_FTRUNC, VP_FRINT, VP_FNEARBYINT, VP_FROUND, VP_FFLOOR,
-  VP_LROUND, VP_LLROUND, VP_LRINT, VP_LLRINT,
-
   /// FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two
   /// values.
   //
@@ -836,7 +796,6 @@ enum NodeType {
   /// The return value of (FMINNUM 0.0, -0.0) could be either 0.0 or -0.0.
   FMINNUM,
   FMAXNUM,
-  VP_FMINNUM, VP_FMAXNUM,
 
   /// FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimum or maximum on
   /// two values, following the IEEE-754 2008 definition. This differs from
@@ -1086,7 +1045,6 @@ enum NodeType {
   // OutChain = MSTORE(Value, BasePtr, Mask)
   MLOAD,
   MSTORE,
-  VP_LOAD, VP_STORE,
 
   // Masked gather and scatter - load and store operations for a vector of
   // random addresses with additional mask operand that prevents memory
@@ -1100,17 +1058,6 @@ enum NodeType {
   MGATHER,
   MSCATTER,
 
-  // VP gather and scatter - load and store operations for a vector of
-  // random addresses with additional mask and vector length operand that
-  // prevents memory accesses to the masked-off lanes.
-  //
-  // Val, OutChain = VP_GATHER(InChain, BasePtr, Index, Scale, Mask, EVL)
-  // OutChain = VP_SCATTER(InChain, Value, BasePtr, Index, Scale, Mask, EVL)
-  //
-  // The Index operand can have more vector elements than the other operands
-  // due to type legalization. The extra elements are ignored.
-  VP_GATHER, VP_SCATTER,
-
   /// This corresponds to the llvm.lifetime.* intrinsics. The first operand
   /// is the chain and the second operand is the alloca pointer.
   LIFETIME_START,
@@ -1143,7 +1090,6 @@ enum NodeType {
   /// is the vector to reduce.
   VECREDUCE_STRICT_FADD,
   VECREDUCE_STRICT_FMUL,
-  VP_REDUCE_STRICT_FADD, VP_REDUCE_STRICT_FMUL,
 
   /// These reductions are non-strict, and have a single vector operand.
   VECREDUCE_FADD,
@@ -1164,23 +1110,6 @@ enum NodeType {
   VECREDUCE_UMAX,
   VECREDUCE_UMIN,
 
-  // Vector-predicated reduction operators
-  VP_REDUCE_FADD,
-  VP_REDUCE_FMUL,
-  VP_REDUCE_ADD,
-  VP_REDUCE_MUL,
-  VP_REDUCE_AND,
-  VP_REDUCE_OR,
-  VP_REDUCE_XOR,
-  VP_REDUCE_SMAX,
-  VP_REDUCE_SMIN,
-  VP_REDUCE_UMAX,
-  VP_REDUCE_UMIN,
-
-  /// FMIN/FMAX nodes can have flags, for NaN/NoNaN variants.
-  VP_REDUCE_FMAX,
-  VP_REDUCE_FMIN,
-
   /// BUILTIN_OP_END - This must be the last enum value in this list.
   /// The target-specific pre-isel opcode values start here.
   BUILTIN_OP_END
 
@@ -1375,6 +1375,11 @@ let IntrProperties = [IntrNoMem, IntrNoSync, IntrWillReturn, Mask<3>, VectorLeng
                                 LLVMMatchType<0>,
                                 LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
                                 llvm_i32_ty]>;
+// Element-wise bitops
+  def int_vp_ctpop : Intrinsic<[ llvm_anyvector_ty ],
+                               [ LLVMMatchType<0>,
+                                 LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                                 llvm_i32_ty]>;
 
 // Logical operators
   def int_vp_ashr : Intrinsic<[ llvm_anyvector_ty ],
 
@@ -111,6 +111,13 @@
 
 ///// Integer Arithmetic /////
 
+// llvm.vp.ctpop(x,mask,vlen)
+BEGIN_REGISTER_VP_INTRINSIC(vp_ctpop, 1, 2)
+BEGIN_REGISTER_VP_SDNODE(VP_CTPOP, -1, vp_ctpop, 1, 2)
+HANDLE_VP_TO_INTRIN(ctpop)
+HANDLE_VP_IS_UNARY
+END_REGISTER_CASES(vp_ctpop, VP_CTPOP)
+
 // llvm.vp.add(x,y,mask,vlen)
 BEGIN_REGISTER_VP_INTRINSIC(vp_add, 2, 3)
 BEGIN_REGISTER_VP_SDNODE(VP_ADD, -1, vp_add, 2, 3)
@@ -490,6 +497,16 @@ HANDLE_VP_IS_MEMOP(1, 0)
 END_REGISTER_CASES(vp_store, VP_STORE)
 
 // llvm.vp.scatter(ptr,val,mask,vlen)
+// VP gather and scatter - load and store operations for a vector of
+// random addresses with additional mask and vector length operand that
+// prevents memory accesses to the masked-off lanes.
+//
+// Val, OutChain = VP_GATHER(InChain, BasePtr, Index, Scale, Mask, EVL)
+// OutChain = VP_SCATTER(InChain, Value, BasePtr, Index, Scale, Mask, EVL)
+//
+// The Index operand can have more vector elements than the other operands
+// due to type legalization. The extra elements are ignored.
+
 BEGIN_REGISTER_VP_INTRINSIC(vp_scatter, 2, 3)
 BEGIN_REGISTER_VP_SDNODE(VP_SCATTER, 1, vp_scatter, 3, 4)
 HANDLE_VP_TO_INTRIN(masked_scatter)
@@ -514,16 +531,23 @@ END_REGISTER_CASES(vp_gather, VP_GATHER)
 ///// Shuffle & Blend /////
 
 // llvm.vp.compress(x,mask,vlen)
+/// VP_COMPRESS(VEC1, MASK, VLEN) - Returns a vector, of the same type as
+/// VEC1.
 BEGIN_REGISTER_VP_INTRINSIC(vp_compress, 1, 2)
 BEGIN_REGISTER_VP_SDNODE(VP_COMPRESS, -1, vp_compress, 1, 2)
 END_REGISTER_CASES(vp_compress, VP_COMPRESS)
 
 // llvm.vp.expand(x,mask,vlen)
+/// VP_EXPAND(VEC1, MASK, VLEN) - Returns a vector, of the same type as
+/// VEC1.
 BEGIN_REGISTER_VP_INTRINSIC(vp_expand, 1, 2)
 BEGIN_REGISTER_VP_SDNODE(VP_EXPAND, -1, vp_expand, 1, 2)
 END_REGISTER_CASES(vp_expand, VP_EXPAND)
 
 // llvm.vp.vshift(x,amount,mask,vlen)
+/// VP_VSHIFT(VEC1, AMOUNT, MASK, VLEN) - Returns a vector, of the same type as
+/// VEC1. AMOUNT is an integer value. The returned vector is equivalent
+/// to VEC1 shifted by AMOUNT (RETURNED_VEC[idx] = VEC1[idx + AMOUNT]).
 BEGIN_REGISTER_VP_INTRINSIC(vp_vshift, 2, 3)
 BEGIN_REGISTER_VP_SDNODE(VP_VSHIFT, -1, vp_vshift, 2, 3)
 END_REGISTER_CASES(vp_vshift, VP_VSHIFT)
 
@@ -68,6 +68,11 @@ def SDTFPUnaryOpVVP  : SDTypeProfile<1, 3, [   // fneg, fsqrt, etc
   SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisSameNumEltsAs<0, 2>, IsVLVT<3>
 ]>;
 
+// Unary integer op: (x, mask, vlen) -> integer vector of the same type as x
+def SDTUnaryOpVVP  : SDTypeProfile<1, 3, [   // ctpop
+  SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisSameNumEltsAs<0, 2>, IsVLVT<3>
+]>;
+
 // gather scatter
 def vvp_scatter : SDNode<"VEISD::VVP_SCATTER",  SDTScatterVVP,
                         [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
@@ -145,6 +150,9 @@ def vvp_sext       : SDNode<"VEISD::VVP_SEXT", SDTIntExtendOpVVP>;
 def vvp_zext       : SDNode<"VEISD::VVP_ZEXT", SDTIntExtendOpVVP>;
 def vvp_trunc      : SDNode<"VEISD::VVP_TRUNC", SDTIntTruncOpVVP>;
 
+// element-wise bitops
+def vvp_ctpop :  SDNode<"VEISD::VVP_CTPOP", SDTUnaryOpVVP>;
+
 // reductions
 def vvp_reduce_fadd         : SDNode<"VEISD::VVP_REDUCE_FADD", SDTReduceVVP>;
 def vvp_reduce_strict_fadd  : SDNode<"VEISD::VVP_REDUCE_STRICT_FADD", SDTReduceStartVVP>;
 
@@ -241,6 +241,8 @@ multiclass VectorTernaryArith_ShortLong<SDPatternOperator OpNode, ValueType Long
 
 
 // Integer arithmetic (256 elements)
+defm : VectorUnaryArith_ShortLong<vvp_ctpop, i64, v256i64, "VPCNT", i32, v256i32, "PVPCNTLO">;
+
 defm : VectorBinaryArith_ShortLong<c_vv_add, i64, v256i64, "VADDSL", i32, v256i32, "VADDSWSX">;
 
 defm : VectorBinaryArith_ShortLong<vvp_sub,  i64, v256i64, "VSUBSL", i32, v256i32, "VSUBSWSX">;
 
@@ -95,6 +95,10 @@ ADD_VVP_OP(VVP_ZEXT)      REGISTER_ICONV_VVP_OP(VVP_ZEXT,ZERO_EXTEND)    // HAND
 ADD_VVP_OP(VVP_FPEXT)     REGISTER_FPCONV_VVP_OP(VVP_FPEXT,FP_EXTEND)    HANDLE_VP_TO_VVP(VP_FP_EXTEND,VVP_FPEXT)
 ADD_VVP_OP(VVP_FPROUND)   REGISTER_FPCONV_VVP_OP(VVP_FPROUND,FP_ROUND)   HANDLE_VP_TO_VVP(VP_FROUND,VVP_FPROUND)
 
+// element-wise bitops
+ADD_VVP_OP(VVP_CTPOP)     REGISTER_UNARY_VVP_OP(VVP_CTPOP,CTPOP)   // HANDLE_VP_TO_VVP(VP_CTPOP,VVP_CTPOP) // TODO as VP -opt
+
+
 #if 0
 // Disabled, this gets expanded instead
 ADD_VVP_OP(VVP_FFLOOR)    REGISTER_UNARY_VVP_OP(VVP_FFLOOR, FFLOOR)      HANDLE_VP_TO_VVP(VP_FFLOOR, VVP_FFLOOR)
 
@@ -101,14 +101,14 @@ define x86_regcallcc <512 x i32> @__regcall3__insert_v512i32r(<512 x i32>, i32)
 ; CHECK-NEXT:    lea %s2, 1024(, %s1)
 ; CHECK-NEXT:    lea %s3, 256
 ; CHECK-NEXT:    lvl %s3
-; CHECK-NEXT:    vstl %v1,4,%s2
-; CHECK-NEXT:    vstl %v0,4,%s1
+; CHECK-NEXT:    vstl %v1, 4, %s2
+; CHECK-NEXT:    vstl %v0, 4, %s1
 ; CHECK-NEXT:    and %s0, %s0, (55)0
 ; CHECK-NEXT:    sll %s0, %s0, 2
 ; CHECK-NEXT:    or %s4, 2, (0)1
 ; CHECK-NEXT:    stl %s4, 176(%s0, %s11)
-; CHECK-NEXT:    vldl.zx %v0,4,%s1
-; CHECK-NEXT:    vldl.zx %v1,4,%s2
+; CHECK-NEXT:    vldl.zx %v0, 4, %s1
+; CHECK-NEXT:    vldl.zx %v1, 4, %s2
 ; CHECK-NEXT:    or %s11, 0, %s9
   %3 = insertelement <512 x i32> %0, i32 2, i32 %1
   ret <512 x i32> %3
@@ -122,8 +122,8 @@ define x86_regcallcc i32 @__regcall3__extract_v512i32r(<512 x i32>, i32) {
 ; CHECK-NEXT:    lea %s2, 1024(, %s1)
 ; CHECK-NEXT:    lea %s3, 256
 ; CHECK-NEXT:    lvl %s3
-; CHECK-NEXT:    vstl %v1,4,%s2
-; CHECK-NEXT:    vstl %v0,4,%s1
+; CHECK-NEXT:    vstl %v1, 4, %s2
+; CHECK-NEXT:    vstl %v0, 4, %s1
 ; CHECK-NEXT:    and %s0, %s0, (55)0
 ; CHECK-NEXT:    sll %s0, %s0, 2
 ; CHECK-NEXT:    ldl.sx %s0, 176(%s0, %s11)
@@ -140,14 +140,14 @@ define x86_regcallcc <512 x float> @__regcall3__insert_v512f32r(<512 x float>, i
 ; CHECK-NEXT:    lea %s2, 1024(, %s1)
 ; CHECK-NEXT:    lea %s3, 256
 ; CHECK-NEXT:    lvl %s3
-; CHECK-NEXT:    vstu %v1,4,%s2
-; CHECK-NEXT:    vstu %v0,4,%s1
+; CHECK-NEXT:    vstu %v1, 4, %s2
+; CHECK-NEXT:    vstu %v0, 4, %s1
 ; CHECK-NEXT:    and %s0, %s0, (55)0
 ; CHECK-NEXT:    sll %s0, %s0, 2
 ; CHECK-NEXT:    lea %s4, 1065353216
 ; CHECK-NEXT:    stl %s4, 176(%s0, %s11)
-; CHECK-NEXT:    vldu %v0,4,%s1
-; CHECK-NEXT:    vldu %v1,4,%s2
+; CHECK-NEXT:    vldu %v0, 4, %s1
+; CHECK-NEXT:    vldu %v1, 4, %s2
 ; CHECK-NEXT:    or %s11, 0, %s9
   %3 = insertelement <512 x float> %0, float 1.0, i32 %1
   ret <512 x float> %3
@@ -161,8 +161,8 @@ define x86_regcallcc float @__regcall3__extract_v512f32r(<512 x float>, i32) {
 ; CHECK-NEXT:    lea %s2, 1024(, %s1)
 ; CHECK-NEXT:    lea %s3, 256
 ; CHECK-NEXT:    lvl %s3
-; CHECK-NEXT:    vstu %v1,4,%s2
-; CHECK-NEXT:    vstu %v0,4,%s1
+; CHECK-NEXT:    vstu %v1, 4, %s2
+; CHECK-NEXT:    vstu %v0, 4, %s1
 ; CHECK-NEXT:    and %s0, %s0, (55)0
 ; CHECK-NEXT:    sll %s0, %s0, 2
 ; CHECK-NEXT:    ldu %s0, 176(%s0, %s11)