diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 000000000000..b48339d8328d
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,10 @@
+language: cpp
+dist: focal
+compiler: gcc
+before_install:
+  - sudo apt-get -y install ninja-build
+script:
+  - mkdir -p build
+  - cd build/
+  - cmake ../llvm -DLLVM_TARGETS_TO_BUILD=VE -DCMAKE_BUILD_TYPE=Release -G Ninja -DBUILD_SHARED_LIBS=on -DCMAKE_CXX_FLAGS_RELEASE="-O0 -DNDEBUG" -DLLVM_ENABLE_ASSERTIONS=OFF
+  - ninja lib/libLLVMVEAsmParser.so lib/libLLVMVEDesc.so lib/libLLVMVEInfo.so lib/libLLVMVECodeGen.so lib/libLLVMVEDisassembler.so
diff --git a/README.md b/README.md
index bab402248440..9135698a315b 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,7 @@
 # LLVM for NEC SX-Aurora VE (llvm-ve-rv 1.8-dev)
 
+[![Build Status](https://travis-ci.com/sx-aurora-dev/llvm-project.svg?branch=hpce%2Fdevelop)](https://travis-ci.com/sx-aurora-dev/llvm-project)
+
 This is a fork of the LLVM repository with support for the NEC SX-Aurora
 TSUBASA Vector Engine (VE).
diff --git a/libunwind/src/UnwindRegistersSave.S b/libunwind/src/UnwindRegistersSave.S
index 51bb9b0688fd..3dd7be40cd00 100644
--- a/libunwind/src/UnwindRegistersSave.S
+++ b/libunwind/src/UnwindRegistersSave.S
@@ -1100,6 +1100,87 @@ DEFINE_LIBUNWIND_FUNCTION(__unw_getcontext)
   li    a0, 0   // return UNW_ESUCCESS
   ret           // jump to ra
+
+#elif defined(__ve__)
+
+#
+# extern int __unw_getcontext(unw_context_t* thread_state)
+#
+# On entry:
+#  thread_state pointer is in %s0
+#
+DEFINE_LIBUNWIND_FUNCTION(__unw_getcontext)
+  st %s1, 8(,%s0)    # save %s1 first; it is used as scratch below
+  sic %s1            # save instruction counter
+  st %s1, 512(,%s0)
+  st %s0, 0(,%s0)
+  st %s2, 16(,%s0)
+  st %s3, 24(,%s0)
+  st %s4, 32(,%s0)
+  st %s5, 40(,%s0)
+  st %s6, 48(,%s0)
+  st %s7, 56(,%s0)
+  st %s8, 64(,%s0)
+  st %s9, 72(,%s0)
+  st %s10, 80(,%s0)
+  st %s11, 88(,%s0)
+  st %s12, 96(,%s0)
+  st %s13, 104(,%s0)
+  st %s14, 112(,%s0)
+  st %s15, 120(,%s0)
+  st %s16, 128(,%s0)
+  st %s17, 136(,%s0)
+  st %s18, 144(,%s0)
+  st %s19, 152(,%s0)
+  st %s20, 160(,%s0)
+  st %s21, 168(,%s0)
+  st %s22, 176(,%s0)
+  st %s23, 184(,%s0)
+  st %s24, 192(,%s0)
+  st %s25, 200(,%s0)
+  st %s26, 208(,%s0)
+  st %s27, 216(,%s0)
+  st %s28, 224(,%s0)
+  st %s29, 232(,%s0)
+  st %s30, 240(,%s0)
+  st %s31, 248(,%s0)
+  st %s32, 256(,%s0)
+  st %s33, 264(,%s0)
+  st %s34, 272(,%s0)
+  st %s35, 280(,%s0)
+  st %s36, 288(,%s0)
+  st %s37, 296(,%s0)
+  st %s38, 304(,%s0)
+  st %s39, 312(,%s0)
+  st %s40, 320(,%s0)
+  st %s41, 328(,%s0)
+  st %s42, 336(,%s0)
+  st %s43, 344(,%s0)
+  st %s44, 352(,%s0)
+  st %s45, 360(,%s0)
+  st %s46, 368(,%s0)
+  st %s47, 376(,%s0)
+  st %s48, 384(,%s0)
+  st %s49, 392(,%s0)
+  st %s50, 400(,%s0)
+  st %s51, 408(,%s0)
+  st %s52, 416(,%s0)
+  st %s53, 424(,%s0)
+  st %s54, 432(,%s0)
+  st %s55, 440(,%s0)
+  st %s56, 448(,%s0)
+  st %s57, 456(,%s0)
+  st %s58, 464(,%s0)
+  st %s59, 472(,%s0)
+  st %s60, 480(,%s0)
+  st %s61, 488(,%s0)
+  st %s62, 496(,%s0)
+  st %s63, 504(,%s0)
+  svl %s1            # save vector length
+  st %s1, 520(,%s0)
+  ld %s1, 8(,%s0)    # restore %s1
+  or %s0, 0, (0)1    # return UNW_ESUCCESS
+  b.l (,%lr)
 #endif
 
 WEAK_ALIAS(__unw_getcontext, unw_getcontext)
diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake
index 90e5d327c757..b2e9dfdd6feb 100644
--- a/llvm/cmake/config-ix.cmake
+++ b/llvm/cmake/config-ix.cmake
@@ -435,6 +435,8 @@ elseif (LLVM_NATIVE_ARCH MATCHES "riscv32")
   set(LLVM_NATIVE_ARCH RISCV)
 elseif (LLVM_NATIVE_ARCH MATCHES "riscv64")
   set(LLVM_NATIVE_ARCH RISCV)
+elseif (LLVM_NATIVE_ARCH MATCHES "ve")
+  set(LLVM_NATIVE_ARCH VE)
 else ()
   message(FATAL_ERROR "Unknown architecture ${LLVM_NATIVE_ARCH}")
 endif ()
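A note on the libunwind addition above: the VE `__unw_getcontext` is reached through the standard libunwind entry points via the `WEAK_ALIAS` at the end of the file. For orientation, this is how the routine is typically exercised (standard libunwind API, not part of this patch):

```cpp
#include <libunwind.h>
#include <cstdio>

// Minimal stack walk. unw_getcontext() resolves to the VE
// __unw_getcontext above, which snapshots %s0-%s63, the instruction
// counter (sic) and the vector length register (svl).
void print_backtrace() {
  unw_context_t context;
  unw_cursor_t cursor;
  unw_getcontext(&context);
  unw_init_local(&cursor, &context);
  while (unw_step(&cursor) > 0) {
    unw_word_t ip = 0;
    unw_get_reg(&cursor, UNW_REG_IP, &ip);
    std::printf("ip = %#lx\n", static_cast<unsigned long>(ip));
  }
}
```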
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index a252071c9871..6cc34345b6dc 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -228,14 +228,8 @@ enum NodeType {
   SREM,
   UREM,
 
-  // Vector-predicated integer binary arithmetic
-  VP_ADD,
-  VP_SUB,
-  VP_MUL,
-  VP_SDIV,
-  VP_UDIV,
-  VP_SREM,
-  VP_UREM,
+#define BEGIN_REGISTER_VP_SDNODE(VPSDNAME, ...) VPSDNAME,
+#include "llvm/IR/VPIntrinsics.def"
 
   /// SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing
   /// a signed/unsigned value of type i[2*N], and return the full value as
@@ -354,9 +348,6 @@ enum NodeType {
   FDIV,
   FREM,
 
-  // Vector predicated floating point ops.
-  VP_FADD, VP_FSUB, VP_FMUL, VP_FDIV, VP_FREM,
-
   /// Constrained versions of the binary floating point operators.
   /// These will be lowered to the simple operators before final selection.
   /// They are used to limit optimizations while the DAG is being
@@ -441,7 +432,6 @@ enum NodeType {
   /// FMA - Perform a * b + c with no intermediate rounding step.
   FMA,
-  VP_FMA,
 
   /// FMAD - Perform a * b + c, while getting the same result as the
   /// separately rounded operations.
@@ -532,19 +522,6 @@ enum NodeType {
   /// in terms of the element size of VEC1/VEC2, not in terms of bytes.
   VECTOR_SHUFFLE,
 
-  /// VP_VSHIFT(VEC1, AMOUNT, MASK, VLEN) - Returns a vector, of the same type as
-  /// VEC1. AMOUNT is an integer value. The returned vector is equivalent
-  /// to VEC1 shifted by AMOUNT (RETURNED_VEC[idx] = VEC1[idx + AMOUNT]).
-  VP_VSHIFT,
-
-  /// VP_COMPRESS(VEC1, MASK, VLEN) - Returns a vector, of the same type as
-  /// VEC1.
-  VP_COMPRESS,
-
-  /// VP_EXPAND(VEC1, MASK, VLEN) - Returns a vector, of the same type as
-  /// VEC1.
-  VP_EXPAND,
-
   /// SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a
   /// scalar value into element 0 of the resultant vector type. The top
   /// elements 1 to N-1 of the N-element vector are undefined. The type
@@ -578,9 +555,6 @@ enum NodeType {
   OR,
   XOR,
 
-  // Vector-predicated bitwise operators
-  VP_AND, VP_OR, VP_XOR,
-
   /// ABS - Determine the unsigned absolute value of a signed integer value of
   /// the same bitwidth.
   /// Note: A value of INT_MIN will return INT_MIN, no saturation or overflow
@@ -609,7 +583,6 @@ enum NodeType {
   ROTR,
   FSHL,
   FSHR,
-  VP_SHL, VP_SRA, VP_SRL,
 
   /// Byte Swap and Counting operators.
   BSWAP,
@@ -634,7 +607,6 @@ enum NodeType {
   /// change the condition type in order to match the VSELECT node using a
   /// pattern. The condition follows the BooleanContent format of the target.
   VSELECT,
-  VP_SELECT,
 
   /// Select with condition operator - This selects between a true value and
   /// a false value (ops #2 and #3) based on the boolean result of comparing
@@ -649,7 +621,6 @@ enum NodeType {
   /// them with (op #2) as a CondCodeSDNode. If the operands are vector types
   /// then the result type must also be a vector type.
   SETCC,
-  VP_SETCC,
 
   /// Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but
   /// op #2 is a boolean indicating if there is an incoming carry. This
@@ -688,8 +659,6 @@ enum NodeType {
   /// depends on the first letter) to floating point.
   SINT_TO_FP,
   UINT_TO_FP,
-  VP_SINT_TO_FP,
-  VP_UINT_TO_FP,
 
   /// SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to
   /// sign extend a small value in a large integer register (e.g. sign
@@ -736,8 +705,6 @@ enum NodeType {
   /// the FP value cannot fit in the integer type, the results are undefined.
   FP_TO_SINT,
   FP_TO_UINT,
-  VP_FP_TO_SINT,
-  VP_FP_TO_UINT,
 
   /// X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type
   /// down to the precision of the destination VT. TRUNC is a flag, which is
@@ -763,7 +730,6 @@ enum NodeType {
   /// X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
   FP_EXTEND,
-  VP_FP_EXTEND,
 
   /// BITCAST - This operator converts between integer, vector and FP
   /// values, as if the value was stored to memory with one type and loaded
@@ -821,12 +787,6 @@ enum NodeType {
   LRINT,
   LLRINT,
 
-  // Vector-predicated unary floating-point ops
-  VP_FNEG, VP_FABS, VP_FSQRT, VP_FCBRT, VP_FSIN, VP_FCOS, VP_FPOWI, VP_FPOW,
-  VP_FLOG, VP_FLOG2, VP_FLOG10, VP_FEXP, VP_FEXP2,
-  VP_FCEIL, VP_FTRUNC, VP_FRINT, VP_FNEARBYINT, VP_FROUND, VP_FFLOOR,
-  VP_LROUND, VP_LLROUND, VP_LRINT, VP_LLRINT,
-
   /// FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two
   /// values.
   //
@@ -836,7 +796,6 @@ enum NodeType {
   /// The return value of (FMINNUM 0.0, -0.0) could be either 0.0 or -0.0.
   FMINNUM,
   FMAXNUM,
-  VP_FMINNUM, VP_FMAXNUM,
 
   /// FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimum or maximum on
   /// two values, following the IEEE-754 2008 definition. This differs from
@@ -1086,7 +1045,6 @@ enum NodeType {
   // OutChain = MSTORE(Value, BasePtr, Mask)
   MLOAD,
   MSTORE,
-  VP_LOAD, VP_STORE,
 
   // Masked gather and scatter - load and store operations for a vector of
   // random addresses with additional mask operand that prevents memory
@@ -1100,17 +1058,6 @@ enum NodeType {
   MGATHER,
   MSCATTER,
 
-  // VP gather and scatter - load and store operations for a vector of
-  // random addresses with additional mask and vector length operand that
-  // prevents memory accesses to the masked-off lanes.
-  //
-  // Val, OutChain = VP_GATHER(InChain, BasePtr, Index, Scale, Mask, EVL)
-  // OutChain = VP_SCATTER(InChain, Value, BasePtr, Index, Scale, Mask, EVL)
-  //
-  // The Index operand can have more vector elements than the other operands
-  // due to type legalization. The extra elements are ignored.
-  VP_GATHER, VP_SCATTER,
-
   /// This corresponds to the llvm.lifetime.* intrinsics. The first operand
   /// is the chain and the second operand is the alloca pointer.
   LIFETIME_START,
@@ -1143,7 +1090,6 @@ enum NodeType {
   /// is the vector to reduce.
   VECREDUCE_STRICT_FADD,
   VECREDUCE_STRICT_FMUL,
-  VP_REDUCE_STRICT_FADD, VP_REDUCE_STRICT_FMUL,
 
   /// These reductions are non-strict, and have a single vector operand.
   VECREDUCE_FADD,
@@ -1164,23 +1110,6 @@ enum NodeType {
   VECREDUCE_UMAX,
   VECREDUCE_UMIN,
 
-  // Vector-predicated reduction operators
-  VP_REDUCE_FADD,
-  VP_REDUCE_FMUL,
-  VP_REDUCE_ADD,
-  VP_REDUCE_MUL,
-  VP_REDUCE_AND,
-  VP_REDUCE_OR,
-  VP_REDUCE_XOR,
-  VP_REDUCE_SMAX,
-  VP_REDUCE_SMIN,
-  VP_REDUCE_UMAX,
-  VP_REDUCE_UMIN,
-
-  /// FMIN/FMAX nodes can have flags, for NaN/NoNaN variants.
-  VP_REDUCE_FMAX,
-  VP_REDUCE_FMIN,
-
   /// BUILTIN_OP_END - This must be the last enum value in this list.
   /// The target-specific pre-isel opcode values start here.
   BUILTIN_OP_END
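A note on the mechanism that replaces the hand-written `VP_*` opcodes above: `ISDOpcodes.h` now defines `BEGIN_REGISTER_VP_SDNODE` to expand to just the node name and then includes `VPIntrinsics.def`, so every VP SDNode registered in the .def file becomes an enumerator automatically. A simplified, self-contained sketch of this X-macro pattern (the file name `MyNodes.def` and its entries are invented for illustration):

```cpp
// Suppose "MyNodes.def" contains only macro invocations:
//   BEGIN_REGISTER_VP_SDNODE(VP_ADD, -1, vp_add, 2, 3)
//   BEGIN_REGISTER_VP_SDNODE(VP_CTPOP, -1, vp_ctpop, 1, 2)
// Each includer defines the macro to extract the fields it needs;
// here we keep only the first argument, the enumerator name.
#define BEGIN_REGISTER_VP_SDNODE(VPSDNAME, ...) VPSDNAME,
enum NodeType {
#include "MyNodes.def" // expands to: VP_ADD, VP_CTPOP,
  BUILTIN_OP_END
};
#undef BEGIN_REGISTER_VP_SDNODE
```

(The real `VPIntrinsics.def` `#undef`s its macros itself, which is why the include site above can stay this small.)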
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index a3ff98055632..ff5fd9547c2b 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1375,6 +1375,11 @@ let IntrProperties = [IntrNoMem, IntrNoSync, IntrWillReturn, Mask<3>, VectorLeng
                               LLVMMatchType<0>,
                               LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
                               llvm_i32_ty]>;
+  // Element-wise bitops
+  def int_vp_ctpop : Intrinsic<[ llvm_anyvector_ty ],
+                               [ LLVMMatchType<0>,
+                                 LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                                 llvm_i32_ty]>;
 
   // Logical operators
   def int_vp_ashr : Intrinsic<[ llvm_anyvector_ty ],
diff --git a/llvm/include/llvm/IR/VPIntrinsics.def b/llvm/include/llvm/IR/VPIntrinsics.def
index 72987e9d1076..696f02c254eb 100644
--- a/llvm/include/llvm/IR/VPIntrinsics.def
+++ b/llvm/include/llvm/IR/VPIntrinsics.def
@@ -111,6 +111,13 @@
 
 ///// Integer Arithmetic /////
 
+// llvm.vp.ctpop(x,mask,vlen)
+BEGIN_REGISTER_VP_INTRINSIC(vp_ctpop, 1, 2)
+BEGIN_REGISTER_VP_SDNODE(VP_CTPOP, -1, vp_ctpop, 1, 2)
+HANDLE_VP_TO_INTRIN(ctpop)
+HANDLE_VP_IS_UNARY
+END_REGISTER_CASES(vp_ctpop, VP_CTPOP)
+
 // llvm.vp.add(x,y,mask,vlen)
 BEGIN_REGISTER_VP_INTRINSIC(vp_add, 2, 3)
 BEGIN_REGISTER_VP_SDNODE(VP_ADD, -1, vp_add, 2, 3)
@@ -490,6 +497,16 @@
 HANDLE_VP_IS_MEMOP(1, 0)
 END_REGISTER_CASES(vp_store, VP_STORE)
 
 // llvm.vp.scatter(ptr,val,mask,vlen)
+// VP gather and scatter - load and store operations for a vector of
+// random addresses with additional mask and vector length operand that
+// prevents memory accesses to the masked-off lanes.
+//
+// Val, OutChain = VP_GATHER(InChain, BasePtr, Index, Scale, Mask, EVL)
+// OutChain = VP_SCATTER(InChain, Value, BasePtr, Index, Scale, Mask, EVL)
+//
+// The Index operand can have more vector elements than the other operands
+// due to type legalization. The extra elements are ignored.
+
 BEGIN_REGISTER_VP_INTRINSIC(vp_scatter, 2, 3)
 BEGIN_REGISTER_VP_SDNODE(VP_SCATTER, 1, vp_scatter, 3, 4)
 HANDLE_VP_TO_INTRIN(masked_scatter)
@@ -514,16 +531,23 @@
 END_REGISTER_CASES(vp_gather, VP_GATHER)
 
 ///// Shuffle & Blend /////
 
 // llvm.vp.compress(x,mask,vlen)
+/// VP_COMPRESS(VEC1, MASK, VLEN) - Returns a vector, of the same type as
+/// VEC1.
 BEGIN_REGISTER_VP_INTRINSIC(vp_compress, 1, 2)
 BEGIN_REGISTER_VP_SDNODE(VP_COMPRESS, -1, vp_compress, 1, 2)
 END_REGISTER_CASES(vp_compress, VP_COMPRESS)
 
 // llvm.vp.expand(x,mask,vlen)
+/// VP_EXPAND(VEC1, MASK, VLEN) - Returns a vector, of the same type as
+/// VEC1.
 BEGIN_REGISTER_VP_INTRINSIC(vp_expand, 1, 2)
 BEGIN_REGISTER_VP_SDNODE(VP_EXPAND, -1, vp_expand, 1, 2)
 END_REGISTER_CASES(vp_expand, VP_EXPAND)
 
 // llvm.vp.vshift(x,amount,mask,vlen)
+/// VP_VSHIFT(VEC1, AMOUNT, MASK, VLEN) - Returns a vector, of the same type as
+/// VEC1. AMOUNT is an integer value. The returned vector is equivalent
+/// to VEC1 shifted by AMOUNT (RETURNED_VEC[idx] = VEC1[idx + AMOUNT]).
 BEGIN_REGISTER_VP_INTRINSIC(vp_vshift, 2, 3)
 BEGIN_REGISTER_VP_SDNODE(VP_VSHIFT, -1, vp_vshift, 2, 3)
 END_REGISTER_CASES(vp_vshift, VP_VSHIFT)
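With `int_vp_ctpop` declared, frontends can emit the new intrinsic like any other VP operation. A hedged sketch using `IRBuilder` (the helper `emitVPCtpop` is ours; it assumes the `Intrinsic::vp_ctpop` ID generated from the .td entry above):

```cpp
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Emits: %r = call <N x iM> @llvm.vp.ctpop.*(<N x iM> %vec, <N x i1> %mask, i32 %evl)
// The intrinsic is overloaded on the vector type only; mask and vector
// length sit at positions 1 and 2, matching the VPIntrinsics.def entry.
Value *emitVPCtpop(IRBuilder<> &Builder, Value *Vec, Value *Mask, Value *EVL) {
  Module *M = Builder.GetInsertBlock()->getModule();
  Function *Decl =
      Intrinsic::getDeclaration(M, Intrinsic::vp_ctpop, {Vec->getType()});
  return Builder.CreateCall(Decl, {Vec, Mask, EVL});
}
```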
diff --git a/llvm/lib/Target/VE/VVPInstrInfo.td b/llvm/lib/Target/VE/VVPInstrInfo.td
index 5ddca4732003..9f594f15f259 100644
--- a/llvm/lib/Target/VE/VVPInstrInfo.td
+++ b/llvm/lib/Target/VE/VVPInstrInfo.td
@@ -68,6 +68,11 @@ def SDTFPUnaryOpVVP : SDTypeProfile<1, 3, [ // fneg, fsqrt, etc
   SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisSameNumEltsAs<0, 2>, IsVLVT<3>
 ]>;
 
+// unary int
+def SDTUnaryOpVVP : SDTypeProfile<1, 3, [ // ctpop
+  SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisSameNumEltsAs<0, 2>, IsVLVT<3>
+]>;
+
 // gather scatter
 def vvp_scatter : SDNode<"VEISD::VVP_SCATTER", SDTScatterVVP,
                          [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
@@ -145,6 +150,9 @@
 def vvp_sext : SDNode<"VEISD::VVP_SEXT", SDTIntExtendOpVVP>;
 def vvp_zext : SDNode<"VEISD::VVP_ZEXT", SDTIntExtendOpVVP>;
 def vvp_trunc : SDNode<"VEISD::VVP_TRUNC", SDTIntTruncOpVVP>;
 
+// element-wise bitops
+def vvp_ctpop : SDNode<"VEISD::VVP_CTPOP", SDTUnaryOpVVP>;
+
 // reductions
 def vvp_reduce_fadd : SDNode<"VEISD::VVP_REDUCE_FADD", SDTReduceVVP>;
 def vvp_reduce_strict_fadd : SDNode<"VEISD::VVP_REDUCE_STRICT_FADD", SDTReduceStartVVP>;
diff --git a/llvm/lib/Target/VE/VVPInstrPatterns.td b/llvm/lib/Target/VE/VVPInstrPatterns.td
index d21d6039efaa..866f7480529d 100644
--- a/llvm/lib/Target/VE/VVPInstrPatterns.td
+++ b/llvm/lib/Target/VE/VVPInstrPatterns.td
@@ -241,6 +241,8 @@ multiclass VectorTernaryArith_ShortLong
+defm : VectorBinaryArith_ShortLong;
 defm : VectorBinaryArith_ShortLong;
 defm : VectorBinaryArith_ShortLong;
diff --git a/llvm/lib/Target/VE/VVPNodes.inc b/llvm/lib/Target/VE/VVPNodes.inc
index 799637a9bb75..2a289dde4b24 100644
--- a/llvm/lib/Target/VE/VVPNodes.inc
+++ b/llvm/lib/Target/VE/VVPNodes.inc
@@ -95,6 +95,10 @@
 ADD_VVP_OP(VVP_ZEXT)    REGISTER_ICONV_VVP_OP(VVP_ZEXT,ZERO_EXTEND) // HAND
 ADD_VVP_OP(VVP_FPEXT)   REGISTER_FPCONV_VVP_OP(VVP_FPEXT,FP_EXTEND)  HANDLE_VP_TO_VVP(VP_FP_EXTEND,VVP_FPEXT)
 ADD_VVP_OP(VVP_FPROUND) REGISTER_FPCONV_VVP_OP(VVP_FPROUND,FP_ROUND) HANDLE_VP_TO_VVP(VP_FROUND,VVP_FPROUND)
+// element-wise bitops
+ADD_VVP_OP(VVP_CTPOP) REGISTER_UNARY_VVP_OP(VVP_CTPOP,CTPOP) // TODO: HANDLE_VP_TO_VVP(VP_CTPOP,VVP_CTPOP) once VP lowering of vp.ctpop is wired up
+
+
 #if 0
 // Disabled, this gets expanded instead
 ADD_VVP_OP(VVP_FFLOOR) REGISTER_UNARY_VVP_OP(VVP_FFLOOR, FFLOOR)  HANDLE_VP_TO_VVP(VP_FFLOOR, VVP_FFLOOR)
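For context on how `VVPNodes.inc` is consumed: the VE backend includes it with the `ADD_VVP_OP`/`REGISTER_*` macros defined to generate mapping code, much like the `ISDOpcodes.h` include earlier in this patch. A hypothetical sketch of one such consumer (macro arity inferred from the usage above; it assumes the .inc supplies empty defaults for macros the includer leaves undefined, as is conventional for .inc/.def files):

```cpp
// Hypothetical consumer: map a generic ISD opcode to its VVP twin.
static unsigned getVVPOpcode(unsigned ISDOpc) {
  switch (ISDOpc) {
#define REGISTER_UNARY_VVP_OP(VVPNAME, NATIVE_ISD)                             \
  case ISD::NATIVE_ISD:                                                        \
    return VEISD::VVPNAME; // e.g. ISD::CTPOP -> VEISD::VVP_CTPOP
#include "VVPNodes.inc"
  default:
    return 0; // no VVP equivalent registered
  }
}
```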
diff --git a/llvm/test/CodeGen/VE/Vector/extract_insert_vector_elt.ll b/llvm/test/CodeGen/VE/Vector/extract_insert_vector_elt.ll
index c44a3738cd85..d7df1db83055 100644
--- a/llvm/test/CodeGen/VE/Vector/extract_insert_vector_elt.ll
+++ b/llvm/test/CodeGen/VE/Vector/extract_insert_vector_elt.ll
@@ -101,14 +101,14 @@ define x86_regcallcc <512 x i32> @__regcall3__insert_v512i32r(<512 x i32>, i32)
 ; CHECK-NEXT: lea %s2, 1024(, %s1)
 ; CHECK-NEXT: lea %s3, 256
 ; CHECK-NEXT: lvl %s3
-; CHECK-NEXT: vstl %v1,4,%s2
-; CHECK-NEXT: vstl %v0,4,%s1
+; CHECK-NEXT: vstl %v1, 4, %s2
+; CHECK-NEXT: vstl %v0, 4, %s1
 ; CHECK-NEXT: and %s0, %s0, (55)0
 ; CHECK-NEXT: sll %s0, %s0, 2
 ; CHECK-NEXT: or %s4, 2, (0)1
 ; CHECK-NEXT: stl %s4, 176(%s0, %s11)
-; CHECK-NEXT: vldl.zx %v0,4,%s1
-; CHECK-NEXT: vldl.zx %v1,4,%s2
+; CHECK-NEXT: vldl.zx %v0, 4, %s1
+; CHECK-NEXT: vldl.zx %v1, 4, %s2
 ; CHECK-NEXT: or %s11, 0, %s9
   %3 = insertelement <512 x i32> %0, i32 2, i32 %1
   ret <512 x i32> %3
@@ -122,8 +122,8 @@ define x86_regcallcc i32 @__regcall3__extract_v512i32r(<512 x i32>, i32) {
 ; CHECK-NEXT: lea %s2, 1024(, %s1)
 ; CHECK-NEXT: lea %s3, 256
 ; CHECK-NEXT: lvl %s3
-; CHECK-NEXT: vstl %v1,4,%s2
-; CHECK-NEXT: vstl %v0,4,%s1
+; CHECK-NEXT: vstl %v1, 4, %s2
+; CHECK-NEXT: vstl %v0, 4, %s1
 ; CHECK-NEXT: and %s0, %s0, (55)0
 ; CHECK-NEXT: sll %s0, %s0, 2
 ; CHECK-NEXT: ldl.sx %s0, 176(%s0, %s11)
@@ -140,14 +140,14 @@ define x86_regcallcc <512 x float> @__regcall3__insert_v512f32r(<512 x float>, i
 ; CHECK-NEXT: lea %s2, 1024(, %s1)
 ; CHECK-NEXT: lea %s3, 256
 ; CHECK-NEXT: lvl %s3
-; CHECK-NEXT: vstu %v1,4,%s2
-; CHECK-NEXT: vstu %v0,4,%s1
+; CHECK-NEXT: vstu %v1, 4, %s2
+; CHECK-NEXT: vstu %v0, 4, %s1
 ; CHECK-NEXT: and %s0, %s0, (55)0
 ; CHECK-NEXT: sll %s0, %s0, 2
 ; CHECK-NEXT: lea %s4, 1065353216
 ; CHECK-NEXT: stl %s4, 176(%s0, %s11)
-; CHECK-NEXT: vldu %v0,4,%s1
-; CHECK-NEXT: vldu %v1,4,%s2
+; CHECK-NEXT: vldu %v0, 4, %s1
+; CHECK-NEXT: vldu %v1, 4, %s2
 ; CHECK-NEXT: or %s11, 0, %s9
   %3 = insertelement <512 x float> %0, float 1.0, i32 %1
   ret <512 x float> %3
@@ -161,8 +161,8 @@ define x86_regcallcc float @__regcall3__extract_v512f32r(<512 x float>, i32) {
 ; CHECK-NEXT: lea %s2, 1024(, %s1)
 ; CHECK-NEXT: lea %s3, 256
 ; CHECK-NEXT: lvl %s3
-; CHECK-NEXT: vstu %v1,4,%s2
-; CHECK-NEXT: vstu %v0,4,%s1
+; CHECK-NEXT: vstu %v1, 4, %s2
+; CHECK-NEXT: vstu %v0, 4, %s1
 ; CHECK-NEXT: and %s0, %s0, (55)0
 ; CHECK-NEXT: sll %s0, %s0, 2
 ; CHECK-NEXT: ldu %s0, 176(%s0, %s11)
diff --git a/llvm/test/CodeGen/VE/Vector/shufflevector_rand.ll b/llvm/test/CodeGen/VE/Vector/shufflevector_rand.ll
index f176695f8e0e..57e546416b8a 100644
--- a/llvm/test/CodeGen/VE/Vector/shufflevector_rand.ll
+++ b/llvm/test/CodeGen/VE/Vector/shufflevector_rand.ll
@@ -8,13 +8,20 @@ define <256 x i32> @shuffle256_rand_ab(<256 x i32> %A, <256 x i32> %B) {
 ; CHECK-NEXT: lea %s0, 256
 ; CHECK-NEXT: lea %s1, 176(, %s11)
 ; CHECK-NEXT: lvl %s0
-; CHECK-NEXT: vstl %v1,4,%s1
+; CHECK-NEXT: vstl %v1, 4, %s1
 ; CHECK-NEXT: lea %s2, 1200(, %s11)
-; CHECK-NEXT: vstl %v0,4,%s2
+; CHECK-NEXT: vstl %v0, 4, %s2
+; CHECK-NEXT: xorm %vm1,%vm0,%vm0
+; CHECK-NEXT: lea %s3, 1958668075
+; CHECK-NEXT: lea.sl %s3, 1098834656(, %s3)
+; CHECK-NEXT: lvm %vm1,0,%s3
 ; CHECK-NEXT: lea %s3, .LCPI0_0@lo
 ; CHECK-NEXT: and %s3, %s3, (32)0
 ; CHECK-NEXT: lea.sl %s3, .LCPI0_0@hi(, %s3)
-; CHECK-NEXT: vld %v0,8,%s3
+; CHECK-NEXT: vld %v0, 8, %s3
+; CHECK-NEXT: lea %s3, 1817067682
+; CHECK-NEXT: lea.sl %s3, 2043872357(, %s3)
+; CHECK-NEXT: lvm %vm1,1,%s3
 ; CHECK-NEXT: vadds.l %v0, %s1, %v0
 ; CHECK-NEXT: vgtl.zx %v1, %v0, 0, 0
 ; CHECK-NEXT: lea %s1, 252
@@ -22,19 +29,17 @@ define <256 x i32> @shuffle256_rand_ab(<256 x i32> %A, <256 x i32> %B) {
 ; CHECK-NEXT: and %s3, %s3, (32)0
 ; CHECK-NEXT: lea.sl %s3, .LCPI0_1@hi(, %s3)
 ; CHECK-NEXT: lvl %s1
-; CHECK-NEXT: vld %v0,8,%s3
+; CHECK-NEXT: vld %v0, 8, %s3
+; CHECK-NEXT: lea %s3, -117302645
+; CHECK-NEXT: and %s3, %s3, (32)0
+; CHECK-NEXT: lea.sl %s3, 1208662709(, %s3)
+; CHECK-NEXT: lvm %vm1,2,%s3
 ; CHECK-NEXT: lvl %s0
 ; CHECK-NEXT: vadds.l %v0, %s2, %v0
-; CHECK-NEXT: xorm %vm1,%vm0,%vm0
-; CHECK-NEXT: lea %s2, 1979639787
-; CHECK-NEXT: lvm %vm1,0,%s2
-; CHECK-NEXT: lea %s2, 2111784167
-; CHECK-NEXT: lvm %vm1,1,%s2
-; CHECK-NEXT: lea %s2, -116737345
 ; CHECK-NEXT: lvl %s1
 ; CHECK-NEXT: vgtl.zx %v0, %v0, 0, 0
-; CHECK-NEXT: lvm %vm1,2,%s2
-; CHECK-NEXT: lea %s1, -51384867
+; CHECK-NEXT: lea %s1, 1548708317
+; CHECK-NEXT: lea.sl %s1, -257309616(, %s1)
 ; CHECK-NEXT: lvm %vm1,3,%s1
 ; CHECK-NEXT: lvl %s0
 ; CHECK-NEXT: vmrg %v0,%v0,%v1,%vm1
diff --git a/llvm/test/CodeGen/VE/Vector/vec_ctpop.ll b/llvm/test/CodeGen/VE/Vector/vec_ctpop.ll
new file mode 100644
index 000000000000..0a7511863687
--- /dev/null
+++ b/llvm/test/CodeGen/VE/Vector/vec_ctpop.ll
@@ -0,0 +1,43 @@
+; RUN: llc < %s -mtriple=ve-unknown-unknown -mattr=-packed | FileCheck %s
+
+declare <256 x i64> @llvm.ctpop.v256i64(<256 x i64>)
+declare <256 x i32> @llvm.ctpop.v256i32(<256 x i32>)
+declare <256 x i16> @llvm.ctpop.v256i16(<256 x i16>)
+
+define <256 x i64> @vec_ctpopv256i64(<256 x i64> %a) {
+; CHECK-LABEL: vec_ctpopv256i64:
+; CHECK: .LBB{{[0-9]+}}_2:
+; CHECK-NEXT: lea %s0, 256
+; CHECK-NEXT: lvl %s0
+; CHECK-NEXT: vpcnt %v0, %v0
+; CHECK-NEXT: or %s11, 0, %s9
+  %r = call <256 x i64> @llvm.ctpop.v256i64(<256 x i64> %a)
+  ret <256 x i64> %r
+}
+
+define <256 x i32> @vec_ctpopv256i32(<256 x i32> %a) {
+; CHECK-LABEL: vec_ctpopv256i32:
+; CHECK: .LBB{{[0-9]+}}_2:
+; CHECK-NEXT: lea %s0, 256
+; CHECK-NEXT: lvl %s0
+; CHECK-NEXT: pvpcnt.lo %v0, %v0
+; CHECK-NEXT: or %s11, 0, %s9
+  %r = call <256 x i32> @llvm.ctpop.v256i32(<256 x i32> %a)
+  ret <256 x i32> %r
+}
+
+define <256 x i16> @vec_ctpopv256i16(<256 x i16> %a) {
+; CHECK-LABEL: vec_ctpopv256i16:
+; CHECK: .LBB{{[0-9]+}}_2:
+; CHECK-NEXT: lea %s0, 256
+; CHECK-NEXT: lea %s1, 65535
+; CHECK-NEXT: lvl %s0
+; CHECK-NEXT: vbrdl %v1,%s1
+; CHECK-NEXT: pvand.lo %v0, %v0, %v1
+; CHECK-NEXT: pvpcnt.lo %v0, %v0
+; CHECK-NEXT: or %s11, 0, %s9
+  %r = call <256 x i16> @llvm.ctpop.v256i16(<256 x i16> %a)
+  ret <256 x i16> %r
+}
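The three test cases above document the lowering strategy: 64-bit elements map directly to `vpcnt`, 32-bit elements to the packed `pvpcnt.lo`, and 16-bit elements are first ANDed with 0xFFFF per lane because `pvpcnt.lo` counts a whole 32-bit subword. The per-lane semantics being matched are plain popcount; for reference (plain C++, illustration only):

```cpp
#include <cstdint>

// Reference per-lane semantics of ctpop/vpcnt: count the set bits.
// Kernighan's loop clears the lowest set bit on each iteration.
inline int popcount64(uint64_t x) {
  int count = 0;
  for (; x != 0; x &= x - 1)
    ++count;
  return count;
}

// The <256 x i16> lowering holds each lane in a 32-bit subword, so the
// pvand.lo with 0xFFFF keeps stray upper bits out of the count.
inline int popcount16_in32(uint16_t x) {
  return popcount64(uint32_t{x} & 0xFFFFu);
}
```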
diff --git a/llvm/test/CodeGen/VE/Vector/vp-v256i32-mask-avl-isel.ll b/llvm/test/CodeGen/VE/Vector/vp-v256i32-mask-avl-isel.ll
index eb8b8fb33acd..b11bc42cc9c2 100644
--- a/llvm/test/CodeGen/VE/Vector/vp-v256i32-mask-avl-isel.ll
+++ b/llvm/test/CodeGen/VE/Vector/vp-v256i32-mask-avl-isel.ll
@@ -6,7 +6,7 @@ define void @test_vp_harness(<256 x i32>* %Out, <256 x i32> %i0) {
 ; CHECK-NEXT: lea %s1, 256
 ; CHECK-NEXT: # kill: def $sw1 killed $sw1 killed $sx1
 ; CHECK-NEXT: lvl %s1
-; CHECK-NEXT: vstl %v0,4,%s0
+; CHECK-NEXT: vstl %v0, 4, %s0
 ; CHECK-NEXT: or %s11, 0, %s9
   store <256 x i32> %i0, <256 x i32>* %Out
   ret void
@@ -24,7 +24,7 @@ define void @test_vp_add_sub_mul(<256 x i32>* %Out, <256 x i32> %i0, <256 x i32>
 ; CHECK-NEXT: lea %s1, 256
 ; CHECK-NEXT: # kill: def $sw1 killed $sw1 killed $sx1
 ; CHECK-NEXT: lvl %s1
-; CHECK-NEXT: vstl %v0,4,%s0
+; CHECK-NEXT: vstl %v0, 4, %s0
 ; CHECK-NEXT: or %s11, 0, %s9
   %r0 = call <256 x i32> @llvm.vp.add.v256i32(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n)
   %r1 = call <256 x i32> @llvm.vp.sub.v256i32(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n)
@@ -44,7 +44,7 @@ define void @test_vp_su_div(<256 x i32>* %Out, <256 x i32> %i0, <256 x i32> %i1,
 ; CHECK-NEXT: lea %s1, 256
 ; CHECK-NEXT: # kill: def $sw1 killed $sw1 killed $sx1
 ; CHECK-NEXT: lvl %s1
-; CHECK-NEXT: vstl %v0,4,%s0
+; CHECK-NEXT: vstl %v0, 4, %s0
 ; CHECK-NEXT: or %s11, 0, %s9
   %r0 = call <256 x i32> @llvm.vp.sdiv.v256i32(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n)
   %r1 = call <256 x i32> @llvm.vp.udiv.v256i32(<256 x i32> %r0, <256 x i32> %i1, <256 x i1> %m, i32 %n)
@@ -68,7 +68,7 @@ define void @test_vp_bitarith(<256 x i32>* %Out, <256 x i32> %i0, <256 x i32> %i
 ; CHECK-NEXT: lea %s1, 256
 ; CHECK-NEXT: # kill: def $sw1 killed $sw1 killed $sx1
 ; CHECK-NEXT: lvl %s1
-; CHECK-NEXT: vstl %v0,4,%s0
+; CHECK-NEXT: vstl %v0, 4, %s0
 ; CHECK-NEXT: or %s11, 0, %s9
   %r0 = call <256 x i32> @llvm.vp.and.v256i32(<256 x i32> %i0, <256 x i32> %i1, <256 x i1> %m, i32 %n)
   %r1 = call <256 x i32> @llvm.vp.or.v256i32(<256 x i32> %r0, <256 x i32> %i1, <256 x i1> %m, i32 %n)
@@ -87,14 +87,12 @@ define void @test_vp_memory(<256 x i32>* %VecPtr, <256 x i32*> %PtrVec, <256 x i
 ; CHECK-NEXT: # kill: def $sw1 killed $sw1 killed $sx1
 ; CHECK-NEXT: lvl %s1
 ; CHECK-NEXT: vseq %v1
-; CHECK-NEXT: # implicit-def: $v2
-; CHECK-NEXT: vmulu.l %v2,4,%v1,%vm1
-; CHECK-NEXT: # implicit-def: $v1
-; CHECK-NEXT: vaddu.l %v1,%s0,%v2,%vm1
-; CHECK-NEXT: vgtl.zx %v1,%v1,0,0,%vm1
+; CHECK-NEXT: vmulu.l %v1, 4, %v1, %vm1
+; CHECK-NEXT: vaddu.l %v1, %s0, %v1, %vm1
+; CHECK-NEXT: vgtl.zx %v1, %v1, 0, 0, %vm1
 ; CHECK-NEXT: vgtl.zx %v2, %v0, 0, 0, %vm1
 ; CHECK-NEXT: vscl %v0, %v1, 0, 0, %vm1
-; CHECK-NEXT: vstl %v2,4,%s0,%vm1
+; CHECK-NEXT: vstl %v2, 4, %s0, %vm1
 ; CHECK-NEXT: or %s11, 0, %s9
   %r0 = call <256 x i32> @llvm.vp.load.v256i32.p0v256i32(<256 x i32>* %VecPtr, <256 x i1> %m, i32 %n)
   %r1 = call <256 x i32> @llvm.vp.gather.v256i32.v256p0i32(<256 x i32*> %PtrVec, <256 x i1> %m, i32 %n)
diff --git a/llvm/test/CodeGen/VE/Vector/vr_call.ll b/llvm/test/CodeGen/VE/Vector/vr_call.ll
index 12efda6b6a62..477b27fdcaba 100644
--- a/llvm/test/CodeGen/VE/Vector/vr_call.ll
+++ b/llvm/test/CodeGen/VE/Vector/vr_call.ll
@@ -28,7 +28,7 @@ define x86_regcallcc <256 x i32> @__regcall3__calc3(<256 x i32>, <256 x i32>, <2
 ; CHECK-NEXT: lea %s0, 256
 ; CHECK-NEXT: lea %s1, 416(, %s11)
 ; CHECK-NEXT: lvl %s0
-; CHECK-NEXT: vldl.zx %v8,4,%s1
+; CHECK-NEXT: vldl.zx %v8, 4, %s1
 ; CHECK-NEXT: vadds.w.sx %v0, %v1, %v0
 ; CHECK-NEXT: vadds.w.sx %v1, %v3, %v2
 ; CHECK-NEXT: vadds.w.sx %v2, %v5, %v4
diff --git a/llvm/test/CodeGen/VE/loadvm.ll b/llvm/test/CodeGen/VE/loadvm.ll
deleted file mode 100644
index a4cb23f2de82..000000000000
--- a/llvm/test/CodeGen/VE/loadvm.ll
+++ /dev/null
@@ -1,139 +0,0 @@
-; RUN: llc < %s -mtriple=ve -mattr=+vec | FileCheck %s
-
-@v256i1 = common dso_local local_unnamed_addr global <256 x i1> zeroinitializer, align 4
-@v512i1 = common dso_local local_unnamed_addr global <512 x i1> zeroinitializer, align 4
-
-; Function Attrs: norecurse nounwind readonly
-define x86_regcallcc <256 x i1> @loadv256i1(<256 x i1>* nocapture readonly) {
-; CHECK-LABEL: loadv256i1:
-; CHECK: .LBB{{[0-9]+}}_2:
-; CHECK-NEXT: ld %s1, (, %s0)
-; CHECK-NEXT: ld %s2, 8(, %s0)
-; CHECK-NEXT: ld %s3, 16(, %s0)
-; CHECK-NEXT: ld %s0, 24(, %s0)
-; CHECK-NEXT: lvm %vm1, 0, %s1
-; CHECK-NEXT: lvm %vm1, 1, %s2
-; CHECK-NEXT: lvm %vm1, 2, %s3
-; CHECK-NEXT: lvm %vm1, 3, %s0
-; CHECK-NEXT: or %s11, 0, %s9
-  %2 = load <256 x i1>, <256 x i1>* %0, align 16
-  ret <256 x i1> %2
-}
-
-; Function Attrs: norecurse nounwind readonly
-define x86_regcallcc <256 x i1> @loadv256i1stk() {
-; CHECK-LABEL: loadv256i1stk:
-; CHECK: .LBB{{[0-9]+}}_2:
-; CHECK-NEXT: ld %s16, 176(, %s11)
-; CHECK-NEXT: lvm %vm1, 0, %s16
-; CHECK-NEXT: ld %s16, 184(, %s11)
-; CHECK-NEXT: lvm %vm1, 1, %s16
-; CHECK-NEXT: ld %s16, 192(, %s11)
-; CHECK-NEXT: lvm %vm1, 2, %s16
-; CHECK-NEXT: ld %s16, 200(, %s11)
-; CHECK-NEXT: lvm %vm1, 3, %s16
-; CHECK-NEXT: or %s11, 0, %s9
-  %addr = alloca <256 x i1>, align 16
-  %1 = load <256 x i1>, <256 x i1>* %addr, align 16
-  ret <256 x i1> %1
-}
-
-; Function Attrs: norecurse nounwind readonly
-define x86_regcallcc <256 x i1> @loadv256i1com() {
-; CHECK-LABEL: loadv256i1com:
-; CHECK: .LBB{{[0-9]+}}_2:
-; CHECK-NEXT: lea %s0, v256i1@lo
-; CHECK-NEXT: and %s0, %s0, (32)0
-; CHECK-NEXT: lea.sl %s0, v256i1@hi(, %s0)
-; CHECK-NEXT: ld %s1, (, %s0)
-; CHECK-NEXT: ld %s2, 8(, %s0)
-; CHECK-NEXT: ld %s3, 16(, %s0)
-; CHECK-NEXT: ld %s0, 24(, %s0)
-; CHECK-NEXT: lvm %vm1, 0, %s1
-; CHECK-NEXT: lvm %vm1, 1, %s2
-; CHECK-NEXT: lvm %vm1, 2, %s3
-; CHECK-NEXT: lvm %vm1, 3, %s0
-; CHECK-NEXT: or %s11, 0, %s9
-  %1 = load <256 x i1>, <256 x i1>* @v256i1, align 16
-  ret <256 x i1> %1
-}
-
-; Function Attrs: norecurse nounwind readonly
-define x86_regcallcc <512 x i1> @loadv512i1(<512 x i1>* nocapture readonly) {
-; CHECK-LABEL: loadv512i1:
-; CHECK: .LBB{{[0-9]+}}_2:
-; CHECK-NEXT: ld %s1, (, %s0)
-; CHECK-NEXT: ld %s2, 8(, %s0)
-; CHECK-NEXT: ld %s3, 16(, %s0)
-; CHECK-NEXT: ld %s4, 24(, %s0)
-; CHECK-NEXT: lvm %vm3, 0, %s1
-; CHECK-NEXT: lvm %vm3, 1, %s2
-; CHECK-NEXT: lvm %vm3, 2, %s3
-; CHECK-NEXT: lvm %vm3, 3, %s4
-; CHECK-NEXT: ld %s1, 32(, %s0)
-; CHECK-NEXT: ld %s2, 40(, %s0)
-; CHECK-NEXT: ld %s3, 48(, %s0)
-; CHECK-NEXT: ld %s0, 56(, %s0)
-; CHECK-NEXT: lvm %vm2, 0, %s1
-; CHECK-NEXT: lvm %vm2, 1, %s2
-; CHECK-NEXT: lvm %vm2, 2, %s3
-; CHECK-NEXT: lvm %vm2, 3, %s0
-; CHECK-NEXT: or %s11, 0, %s9
-  %2 = load <512 x i1>, <512 x i1>* %0, align 16
-  ret <512 x i1> %2
-}
-
-; Function Attrs: norecurse nounwind readonly
-define x86_regcallcc <512 x i1> @loadv512i1stk() {
-; CHECK-LABEL: loadv512i1stk:
-; CHECK: .LBB{{[0-9]+}}_2:
-; CHECK-NEXT: # implicit-def: $vmp1
-; CHECK-NEXT: ld %s16, 176(, %s11)
-; CHECK-NEXT: lvm %vm3, 0, %s16
-; CHECK-NEXT: ld %s16, 184(, %s11)
-; CHECK-NEXT: lvm %vm3, 1, %s16
-; CHECK-NEXT: ld %s16, 192(, %s11)
-; CHECK-NEXT: lvm %vm3, 2, %s16
-; CHECK-NEXT: ld %s16, 200(, %s11)
-; CHECK-NEXT: lvm %vm3, 3, %s16
-; CHECK-NEXT: ld %s16, 208(, %s11)
-; CHECK-NEXT: lvm %vm2, 0, %s16
-; CHECK-NEXT: ld %s16, 216(, %s11)
-; CHECK-NEXT: lvm %vm2, 1, %s16
-; CHECK-NEXT: ld %s16, 224(, %s11)
-; CHECK-NEXT: lvm %vm2, 2, %s16
-; CHECK-NEXT: ld %s16, 232(, %s11)
-; CHECK-NEXT: lvm %vm2, 3, %s16
-; CHECK-NEXT: or %s11, 0, %s9
-  %addr = alloca <512 x i1>, align 16
-  %1 = load <512 x i1>, <512 x i1>* %addr, align 16
-  ret <512 x i1> %1
-}
-
-; Function Attrs: norecurse nounwind readonly
-define x86_regcallcc <512 x i1> @loadv512i1com() {
-; CHECK-LABEL: loadv512i1com:
-; CHECK: .LBB{{[0-9]+}}_2:
-; CHECK-NEXT: lea %s0, v512i1@lo
-; CHECK-NEXT: and %s0, %s0, (32)0
-; CHECK-NEXT: lea.sl %s0, v512i1@hi(, %s0)
-; CHECK-NEXT: ld %s1, (, %s0)
-; CHECK-NEXT: ld %s2, 8(, %s0)
-; CHECK-NEXT: ld %s3, 16(, %s0)
-; CHECK-NEXT: ld %s4, 24(, %s0)
-; CHECK-NEXT: lvm %vm3, 0, %s1
-; CHECK-NEXT: lvm %vm3, 1, %s2
-; CHECK-NEXT: lvm %vm3, 2, %s3
-; CHECK-NEXT: lvm %vm3, 3, %s4
-; CHECK-NEXT: ld %s1, 32(, %s0)
-; CHECK-NEXT: ld %s2, 40(, %s0)
-; CHECK-NEXT: ld %s3, 48(, %s0)
-; CHECK-NEXT: ld %s0, 56(, %s0)
-; CHECK-NEXT: lvm %vm2, 0, %s1
-; CHECK-NEXT: lvm %vm2, 1, %s2
-; CHECK-NEXT: lvm %vm2, 2, %s3
-; CHECK-NEXT: lvm %vm2, 3, %s0
-; CHECK-NEXT: or %s11, 0, %s9
-  %1 = load <512 x i1>, <512 x i1>* @v512i1, align 16
-  ret <512 x i1> %1
-}
diff --git a/llvm/test/CodeGen/VE/shufflevector.ll b/llvm/test/CodeGen/VE/shufflevector.ll
deleted file mode 100644
index 47e5fcd444cd..000000000000
--- a/llvm/test/CodeGen/VE/shufflevector.ll
+++ /dev/null
@@ -1,194 +0,0 @@
-; RUN: llc < %s -mtriple=ve -mattr=+vec | FileCheck %s
-
-; Function Attrs: norecurse nounwind readnone
-define x86_regcallcc <8 x i32> @__regcall3__svv512i32(<4 x i32>) {
-; CHECK-LABEL: __regcall3__svv512i32:
-; CHECK: .LBB{{[0-9]+}}_2:
-; CHECK-NEXT: lvs %s0, %v0(1)
-; CHECK-NEXT: lvs %s1, %v0(0)
-; CHECK-NEXT: lsv %v0(0), %s1
-; CHECK-NEXT: lsv %v0(1), %s0
-; CHECK-NEXT: lsv %v0(2), %s0
-; CHECK-NEXT: lsv %v0(3), %s0
-; CHECK-NEXT: lsv %v0(4), %s0
-; CHECK-NEXT: lsv %v0(5), %s0
-; CHECK-NEXT: lsv %v0(6), %s0
-; CHECK-NEXT: lsv %v0(7), %s0
-; CHECK-NEXT: or %s11, 0, %s9
-  %2 = shufflevector <4 x i32> %0, <4 x i32> zeroinitializer, <8 x i32> 
-  ret <8 x i32> %2
-}
-
-; Function Attrs: norecurse nounwind readnone
-define x86_regcallcc <8 x i32> @__regcall3__svv512i32_brd2(<4 x i32>) {
-; CHECK-LABEL: __regcall3__svv512i32_brd2:
-; CHECK: .LBB{{[0-9]+}}_2:
-; CHECK-NEXT: lea %s0, 8
-; CHECK-NEXT: lvs %s1, %v0(2)
-; CHECK-NEXT: lvl %s0
-; CHECK-NEXT: vbrdl %v0, %s1
-; CHECK-NEXT: or %s11, 0, %s9
-  %2 = shufflevector <4 x i32> %0, <4 x i32> zeroinitializer, <8 x i32> 
-  ret <8 x i32> %2
-}
-
-; Function Attrs: norecurse nounwind readnone
-define x86_regcallcc <8 x i32> @__regcall3__svv512i32_brd7(<4 x i32>, <4 x i32>) {
-; CHECK-LABEL: __regcall3__svv512i32_brd7:
-; CHECK: .LBB{{[0-9]+}}_2:
-; CHECK-NEXT: lea %s0, 8
-; CHECK-NEXT: lvs %s1, %v1(3)
-; CHECK-NEXT: lvl %s0
-; CHECK-NEXT: vbrdl %v0, %s1
-; CHECK-NEXT: or %s11, 0, %s9
-  %3 = shufflevector <4 x i32> %0, <4 x i32> %1, <8 x i32> 
-  ret <8 x i32> %3
-}
-
-; Function Attrs: norecurse nounwind readnone
-define x86_regcallcc <256 x i32> @__regcall3__svv512i32_vmrg(<256 x i32>, <256 x i32>) {
-; CHECK-LABEL: __regcall3__svv512i32_vmrg:
-; CHECK: .LBB{{[0-9]+}}_2:
-; CHECK-NEXT: or %s11, 0, %s9
-  %3 = shufflevector <256 x i32> %0, <256 x i32> %1, <256 x i32> 
-
-  ret <256 x i32> %3
-}
-
-; Function Attrs: norecurse nounwind readnone
-define x86_regcallcc <256 x i32> @__regcall3__svv512i32_vmrg1(<256 x i32>, <256 x i32>) {
-; CHECK-LABEL: __regcall3__svv512i32_vmrg1:
-; CHECK: .LBB{{[0-9]+}}_2:
-; CHECK-NEXT: lea %s0, 256
-; CHECK-NEXT: or %s1, 16, (0)1
-; CHECK-NEXT: lvl %s0
-; CHECK-NEXT: vmv %v1,%s1,%v1
-; CHECK-NEXT: or %s1, 0, (0)1
-; CHECK-NEXT: lvm %vm1, 0, %s1
-; CHECK-NEXT: lvm %vm1, 1, %s1
-; CHECK-NEXT: lvm %vm1, 2, %s1
-; CHECK-NEXT: lea %s1, 255
-; CHECK-NEXT: lvm %vm1, 3, %s1
-; CHECK-NEXT: vmrg %v0,%v0,%v1,%vm1
-; CHECK-NEXT: or %s11, 0, %s9
-  %3 = shufflevector <256 x i32> %0, <256 x i32> %1, <256 x i32> 
-
-  ret <256 x i32> %3
-}
-
-; Function Attrs: norecurse nounwind readnone
-define x86_regcallcc <256 x i32> @__regcall3__svv512i32_vmrg2(<256 x i32>, <256 x i32>) {
-; CHECK-LABEL: __regcall3__svv512i32_vmrg2:
-; CHECK: .LBB{{[0-9]+}}_2:
-; CHECK-NEXT: lea %s0, 256
-; CHECK-NEXT: or %s1, 16, (0)1
-; CHECK-NEXT: lvl %s0
-; CHECK-NEXT: vmv %v1,%s1,%v1
-; CHECK-NEXT: or %s1, 8, (0)1
-; CHECK-NEXT: vmv %v0,%s1,%v0
-; CHECK-NEXT: or %s1, 0, (0)1
-; CHECK-NEXT: lvm %vm1, 0, %s1
-; CHECK-NEXT: lvm %vm1, 1, %s1
-; CHECK-NEXT: lvm %vm1, 2, %s1
-; CHECK-NEXT: lea %s1, 255
-; CHECK-NEXT: lvm %vm1, 3, %s1
-; CHECK-NEXT: vmrg %v0,%v0,%v1,%vm1
-; CHECK-NEXT: or %s11, 0, %s9
-  %3 = shufflevector <256 x i32> %0, <256 x i32> %1, <256 x i32> 
-
-  ret <256 x i32> %3
-}
-
diff --git a/llvm/test/CodeGen/VE/storevm.ll b/llvm/test/CodeGen/VE/storevm.ll
deleted file mode 100644
index 514cfcad83be..000000000000
--- a/llvm/test/CodeGen/VE/storevm.ll
+++ /dev/null
@@ -1,139 +0,0 @@
-; RUN: llc < %s -mtriple=ve-unknown-unknown | FileCheck %s
-
-@v256i1 = common dso_local local_unnamed_addr global <256 x i1> zeroinitializer, align 4
-@v512i1 = common dso_local local_unnamed_addr global <512 x i1> zeroinitializer, align 4
-
-; Function Attrs: norecurse nounwind readonly
-define x86_regcallcc void @storev256i1(<256 x i1>* nocapture, <256 x i1>) {
-; CHECK-LABEL: storev256i1:
-; CHECK: .LBB{{[0-9]+}}_2:
-; CHECK-NEXT: svm %s1,%vm1,3
-; CHECK-NEXT: st %s1, 24(, %s0)
-; CHECK-NEXT: svm %s1,%vm1,2
-; CHECK-NEXT: st %s1, 16(, %s0)
-; CHECK-NEXT: svm %s1,%vm1,1
-; CHECK-NEXT: st %s1, 8(, %s0)
-; CHECK-NEXT: svm %s1,%vm1,0
-; CHECK-NEXT: st %s1, (, %s0)
-; CHECK-NEXT: or %s11, 0, %s9
-  store <256 x i1> %1, <256 x i1>* %0, align 16
-  ret void
-}
-
-; Function Attrs: norecurse nounwind readonly
-define x86_regcallcc void @storev256i1stk(<256 x i1>) {
-; CHECK-LABEL: storev256i1stk:
-; CHECK: .LBB{{[0-9]+}}_2:
-; CHECK-NEXT: svm %s16,%vm1,0
-; CHECK-NEXT: st %s16, 176(, %s11)
-; CHECK-NEXT: svm %s16,%vm1,1
-; CHECK-NEXT: st %s16, 184(, %s11)
-; CHECK-NEXT: svm %s16,%vm1,2
-; CHECK-NEXT: st %s16, 192(, %s11)
-; CHECK-NEXT: svm %s16,%vm1,3
-; CHECK-NEXT: st %s16, 200(, %s11)
-; CHECK-NEXT: or %s11, 0, %s9
-  %addr = alloca <256 x i1>, align 16
-  store <256 x i1> %0, <256 x i1>* %addr, align 16
-  ret void
-}
-
-; Function Attrs: norecurse nounwind readonly
-define x86_regcallcc void @storev256i1com(<256 x i1>) {
-; CHECK-LABEL: storev256i1com:
-; CHECK: .LBB{{[0-9]+}}_2:
-; CHECK-NEXT: svm %s0,%vm1,3
-; CHECK-NEXT: lea %s1, v256i1@lo
-; CHECK-NEXT: and %s1, %s1, (32)0
-; CHECK-NEXT: lea.sl %s1, v256i1@hi(, %s1)
-; CHECK-NEXT: st %s0, 24(, %s1)
-; CHECK-NEXT: svm %s0,%vm1,2
-; CHECK-NEXT: st %s0, 16(, %s1)
-; CHECK-NEXT: svm %s0,%vm1,1
-; CHECK-NEXT: st %s0, 8(, %s1)
-; CHECK-NEXT: svm %s0,%vm1,0
-; CHECK-NEXT: st %s0, (, %s1)
-; CHECK-NEXT: or %s11, 0, %s9
-  store <256 x i1> %0, <256 x i1>* @v256i1, align 16
-  ret void
-}
-
-; Function Attrs: norecurse nounwind readonly
-define x86_regcallcc void @storev512i1(<512 x i1>* nocapture, <512 x i1>) {
-; CHECK-LABEL: storev512i1:
-; CHECK: .LBB{{[0-9]+}}_2:
-; CHECK-NEXT: svm %s1,%vm2,3
-; CHECK-NEXT: st %s1, 56(, %s0)
-; CHECK-NEXT: svm %s1,%vm2,2
-; CHECK-NEXT: st %s1, 48(, %s0)
-; CHECK-NEXT: svm %s1,%vm2,1
-; CHECK-NEXT: st %s1, 40(, %s0)
-; CHECK-NEXT: svm %s1,%vm2,0
-; CHECK-NEXT: st %s1, 32(, %s0)
-; CHECK-NEXT: svm %s1,%vm1,3
-; CHECK-NEXT: st %s1, 24(, %s0)
-; CHECK-NEXT: svm %s1,%vm1,2
-; CHECK-NEXT: st %s1, 16(, %s0)
-; CHECK-NEXT: svm %s1,%vm1,1
-; CHECK-NEXT: st %s1, 8(, %s0)
-; CHECK-NEXT: svm %s1,%vm1,0
-; CHECK-NEXT: st %s1, (, %s0)
-; CHECK-NEXT: or %s11, 0, %s9
-  store <512 x i1> %1, <512 x i1>* %0, align 16
-  ret void
-}
-
-; Function Attrs: norecurse nounwind readonly
-define x86_regcallcc void @storev512i1stk(<512 x i1>) {
-; CHECK-LABEL: storev512i1stk:
-; CHECK: .LBB{{[0-9]+}}_2:
-; CHECK-NEXT: svm %s16,%vm1,0
-; CHECK-NEXT: st %s16, 176(, %s11)
-; CHECK-NEXT: svm %s16,%vm1,1
-; CHECK-NEXT: st %s16, 184(, %s11)
-; CHECK-NEXT: svm %s16,%vm1,2
-; CHECK-NEXT: st %s16, 192(, %s11)
-; CHECK-NEXT: svm %s16,%vm1,3
-; CHECK-NEXT: st %s16, 200(, %s11)
-; CHECK-NEXT: svm %s0,%vm2,3
-; CHECK-NEXT: st %s0, 232(, %s11)
-; CHECK-NEXT: svm %s0,%vm2,2
-; CHECK-NEXT: st %s0, 224(, %s11)
-; CHECK-NEXT: svm %s0,%vm2,1
-; CHECK-NEXT: st %s0, 216(, %s11)
-; CHECK-NEXT: svm %s0,%vm2,0
-; CHECK-NEXT: st %s0, 208(, %s11)
-; CHECK-NEXT: or %s11, 0, %s9
-  %addr = alloca <512 x i1>, align 16
-  store <512 x i1> %0, <512 x i1>* %addr, align 16
-  ret void
-}
-
-; Function Attrs: norecurse nounwind readonly
-define x86_regcallcc void @storev512i1com(<512 x i1>) {
-; CHECK-LABEL: storev512i1com:
-; CHECK: .LBB{{[0-9]+}}_2:
-; CHECK-NEXT: svm %s0,%vm2,3
-; CHECK-NEXT: lea %s1, v512i1@lo
-; CHECK-NEXT: and %s1, %s1, (32)0
-; CHECK-NEXT: lea.sl %s1, v512i1@hi(, %s1)
-; CHECK-NEXT: st %s0, 56(, %s1)
-; CHECK-NEXT: svm %s0,%vm2,2
-; CHECK-NEXT: st %s0, 48(, %s1)
-; CHECK-NEXT: svm %s0,%vm2,1
-; CHECK-NEXT: st %s0, 40(, %s1)
-; CHECK-NEXT: svm %s0,%vm2,0
-; CHECK-NEXT: st %s0, 32(, %s1)
-; CHECK-NEXT: svm %s0,%vm1,3
-; CHECK-NEXT: st %s0, 24(, %s1)
-; CHECK-NEXT: svm %s0,%vm1,2
-; CHECK-NEXT: st %s0, 16(, %s1)
-; CHECK-NEXT: svm %s0,%vm1,1
-; CHECK-NEXT: st %s0, 8(, %s1)
-; CHECK-NEXT: svm %s0,%vm1,0
-; CHECK-NEXT: st %s0, (, %s1)
-; CHECK-NEXT: or %s11, 0, %s9
-  store <512 x i1> %0, <512 x i1>* @v512i1, align 16
-  ret void
-}
-
diff --git a/llvm/utils/benchmark/src/cycleclock.h b/llvm/utils/benchmark/src/cycleclock.h
index 1b0f09359c9b..d179185e51b2 100644
--- a/llvm/utils/benchmark/src/cycleclock.h
+++ b/llvm/utils/benchmark/src/cycleclock.h
@@ -193,6 +193,12 @@ inline BENCHMARK_ALWAYS_INLINE int64_t Now() {
   asm volatile("rdcycle %0" : "=r"(cycles));
   return cycles;
 #endif
+#elif defined(__ve__)  // SX-Aurora Vector Engine
+  // Alternatively, we could read the USRCC register for per-process user
+  // cycles, but gettimeofday() is also a fast call.
+  struct timeval tv;
+  gettimeofday(&tv, nullptr);
+  return static_cast<int64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
 #else
 // The soft failover to a generic implementation is automatic only for ARM.
 // For other platforms the developer is expected to make an attempt to create