Skip to content

[VE][isel] Map EVT vectors to vector registers. #79

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Aug 26, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions llvm/include/llvm/CodeGen/TargetLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -1050,6 +1050,14 @@ class TargetLoweringBase {
return Legal;
}

/// Use this hook to bypass the builtin type-legalization decisions for EVTs.
/// The builtin scheme may lead to undesirable results (e.g. power-of-two
/// padding or scalarization) for EVT-typed nodes (e.g. v7f16). Targets
/// override this to supply their own LegalizeKind for such types; returning
/// None defers to the builtin legalization logic.
virtual Optional<LegalizeKind> getCustomTypeConversion(LLVMContext &Context,
EVT VT) const {
return None;
}

/// Return how this operation should be treated: either it is legal, needs to
/// be promoted to a larger size, needs to be expanded to some other code
/// sequence, or the target has a custom expander for it.
Expand Down
49 changes: 43 additions & 6 deletions llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -422,6 +422,27 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL,
return Val;
}

// Vector/Vector bitcast.
if (ValueVT.getSizeInBits() == PartEVT.getSizeInBits())
return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);

if (ValueVT.isScalableVector()) {
assert(PartEVT.getVectorElementCount() ==
ValueVT.getVectorElementCount());
// Promote or truncate.
return DAG.getAnyExtOrTrunc(Val, DL, ValueVT);
}

// Shorten and promote.
assert(PartEVT.getVectorNumElements() >= ValueVT.getVectorNumElements());
if (PartEVT.getVectorNumElements() > ValueVT.getVectorNumElements()) {
EVT ClippedVT =
EVT::getVectorVT(*DAG.getContext(), PartEVT.getVectorElementType(),
ValueVT.getVectorNumElements());
Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ClippedVT, Val,
DAG.getVectorIdxConstant(0, DL));
}

// Promoted vector extract
return DAG.getAnyExtOrTrunc(Val, DL, ValueVT);
}
Expand Down Expand Up @@ -611,26 +632,42 @@ static void getCopyToParts(SelectionDAG &DAG, const SDLoc &DL, SDValue Val,

static SDValue widenVectorToPartType(SelectionDAG &DAG, SDValue Val,
const SDLoc &DL, EVT PartVT) {

if (!PartVT.isVector())
return SDValue();

EVT ValueVT = Val.getValueType();
ElementCount PartNumElts = PartVT.getVectorElementCount();
ElementCount ValueNumElts = ValueVT.getVectorElementCount();

// Widening a scalable vector to another scalable vector is done by inserting
// the vector into a larger undef one.
if (PartVT.isFixedLengthVector() &&
(PartNumElts.getFixedValue() > ValueNumElts.getFixedValue())) {
// Promote first?
if (PartVT.getVectorElementType() != ValueVT.getVectorElementType()) {
if (PartVT.getVectorElementType().getScalarSizeInBits() <
ValueVT.getVectorElementType().getScalarSizeInBits()) {
return SDValue();
}

// Promote, then extract.
EVT PromotedVT =
EVT::getVectorVT(*DAG.getContext(), PartVT.getVectorElementType(),
ValueVT.getVectorNumElements());
Val = DAG.getAnyExtOrTrunc(Val, DL, PromotedVT);
}
} else if (PartNumElts.isScalable())
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, PartVT, DAG.getUNDEF(PartVT),
Val, DAG.getVectorIdxConstant(0, DL));
// We only support widening vectors with equivalent element types and
// fixed/scalable properties. If a target needs to widen a fixed-length type
// to a scalable one, it should be possible to use INSERT_SUBVECTOR below.
if (ElementCount::isKnownLE(PartNumElts, ValueNumElts) ||
else if (ElementCount::isKnownLE(PartNumElts, ValueNumElts) ||
PartNumElts.isScalable() != ValueNumElts.isScalable() ||
PartVT.getVectorElementType() != ValueVT.getVectorElementType())
return SDValue();

// Widening a scalable vector to another scalable vector is done by inserting
// the vector into a larger undef one.
if (PartNumElts.isScalable())
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, PartVT, DAG.getUNDEF(PartVT),
Val, DAG.getVectorIdxConstant(0, DL));

EVT ElementVT = PartVT.getVectorElementType();
// Vector widening case, e.g. <2 x float> -> <4 x float>. Shuffle in
Expand Down
5 changes: 5 additions & 0 deletions llvm/lib/CodeGen/TargetLoweringBase.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -956,6 +956,11 @@ void TargetLoweringBase::setJumpIsExpensive(bool isExpensive) {

TargetLoweringBase::LegalizeKind
TargetLoweringBase::getTypeConversion(LLVMContext &Context, EVT VT) const {
// Fully customized legalization.
Optional<LegalizeKind> CustomLK = getCustomTypeConversion(Context, VT);
if (CustomLK)
return *CustomLK;

// If this is a simple type, use the ComputeRegisterProp mechanism.
if (VT.isSimple()) {
MVT SVT = VT.getSimpleVT();
Expand Down
19 changes: 10 additions & 9 deletions llvm/lib/Target/VE/VECallingConv.td
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@ class CCIfNotSubtarget<string F, CCAction A>
class CCIfVPU<CCAction A> : CCIfSubtarget<"enableVPU()",A>;
class CCIfNotVPU<CCAction A> : CCIfNotSubtarget<"enableVPU()",A>;

class CCIfPacked<CCAction A> : CCIfVPU<CCIfSubtarget<"hasPackedMode()",A>>;
class CCIfNotPacked<CCAction A> : CCIfNotSubtarget<"hasPackedMode()",A>;

def CC_VE_C_Stack: CallingConv<[
// F128 are assigned to the stack in 16-byte aligned units
CCIfType<[f128], CCAssignToStackWithShadow<16, 16, [SX7]>>,
Expand Down Expand Up @@ -176,24 +179,22 @@ def RetCC_VE_C : CallingConv<[
///// fastcc - fast vreg passing /////
def CC_VE_Fast : CallingConv<[
// Virtual packed registers.
CCIfVPU<CCIfType<[v512f64, v512i64],
CCIfPacked<CCIfType<[v512f64, v512i64],
CCAssignToRegWithShadow<[VP0, VP1, VP2, VP3],
[V0, V2, V4, V6]>>>,

// vector --> generic vector registers
CCIfVPU<CCIfType<[v256i32, v256f32, v256i64, v256f64,
v512i32, v512f32],
CCIfVPU<CCIfType<[v256i32, v256f32, v256i64, v256f64],
CCAssignToReg<[V0, V1, V2, V3, V4, V5, V6, V7]>>>,

CCIfVPU<CCIfType<[v512i32, v512f32],
CCIfPacked<CCIfType<[v512i32, v512f32],
CCAssignToReg<[V0, V1, V2, V3, V4, V5, V6, V7]>>>,

// vector mask --> generic vector mask registers
CCIfVPU<CCIfType<[v256i1],
CCAssignToReg<[VM1, VM2, VM3, VM4, VM5, VM6, VM7]>>>,

// pair of vector mask --> generic vector mask registers
CCIfVPU<CCIfType<[v512i1],
CCIfPacked<CCIfType<[v512i1],
CCAssignToRegWithShadow<[VMP1, VMP2, VMP3], [VM1, VM3, VM5]>>>,

// Default to the standard cc
Expand All @@ -202,23 +203,23 @@ def CC_VE_Fast : CallingConv<[

def RetCC_VE_Fast : CallingConv<[
// Virtual packed registers.
CCIfVPU<CCIfType<[v512f64, v512i64],
CCIfPacked<CCIfType<[v512f64, v512i64],
CCAssignToRegWithShadow<[VP0, VP1, VP2, VP3],
[V0, V2, V4, V6]>>>,

// vector --> generic vector registers
CCIfVPU<CCIfType<[v256i32, v256f32, v256i64, v256f64],
CCAssignToReg<[V0, V1, V2, V3, V4, V5, V6, V7]>>>,

CCIfVPU<CCIfType<[v512i32, v512f32],
CCIfPacked<CCIfType<[v512i32, v512f32],
CCAssignToReg<[V0, V1, V2, V3, V4, V5, V6, V7]>>>,

// vector mask --> generic vector mask registers
CCIfVPU<CCIfType<[v256i1],
CCAssignToReg<[VM1, VM2, VM3, VM4, VM5, VM6, VM7]>>>,

// pair of vector mask --> generic vector mask registers
CCIfVPU<CCIfType<[v512i1],
CCIfPacked<CCIfType<[v512i1],
CCAssignToRegWithShadow<[VMP1, VMP2, VMP3],
[VM1, VM3, VM5]>>>,

Expand Down
198 changes: 198 additions & 0 deletions llvm/lib/Target/VE/VEISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3777,3 +3777,201 @@ SDValue VETargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return lowerVAARG(Op, DAG);
}
}

/// Whether \p VT is a scalar type narrow enough (<= 32 bits) to occupy one
/// half of a packed-mode vector element.
static bool isPackableElemVT(EVT VT) {
  return !VT.isVector() && VT.getScalarSizeInBits() <= 32;
}

/// Whether \p VT maps directly onto a VE vector (or vector-mask) register.
static bool isVectorRegisterVT(EVT VT) {
  // Only fixed-length vectors can live in VE vector registers.
  if (!VT.isVector() || VT.isScalableVector())
    return false;

  switch (VT.getVectorNumElements()) {
  case 256:
    // Regular-mode registers hold 256 elements of any supported element type.
    return true;
  case 512: {
    // 512 elements are only representable in packed mode (i1/i32/f32).
    EVT ElemVT = VT.getVectorElementType();
    return ElemVT == MVT::i1 || ElemVT == MVT::i32 || ElemVT == MVT::f32;
  }
  default:
    // Any other element count is not a legal register type.
    return false;
  }
}

/// Build the LegalizeKind that promotes the elements of a short-element
/// vector to the 32-bit type of the same domain (f32 for FP, i32 for int),
/// keeping the element count \p NumElems unchanged.
static TargetLoweringBase::LegalizeKind
getPromoteElementConversion(LLVMContext &Context, EVT ElemVT,
                            unsigned NumElems) {
  using LegalizeKind = TargetLoweringBase::LegalizeKind;
  using LegalizeTypeAction = TargetLoweringBase::LegalizeTypeAction;

  const bool IsFP = ElemVT.isFloatingPoint();
  assert(IsFP || ElemVT.isInteger());
  // FP elements promote to f32, integer elements to i32.
  const MVT WideElemVT = IsFP ? MVT::f32 : MVT::i32;
  const LegalizeTypeAction LTA = IsFP ? LegalizeTypeAction::TypePromoteFloat
                                      : LegalizeTypeAction::TypePromoteInteger;
  return LegalizeKind(LTA, EVT::getVectorVT(Context, WideElemVT, NumElems));
}

/// Build the LegalizeKind that widens a short vector of \p ElemVT elements up
/// to the legal register element count \p LegalNumElems.
static TargetLoweringBase::LegalizeKind
getWidenVectorConversion(LLVMContext &Context, EVT ElemVT,
                         unsigned LegalNumElems) {
  using LegalizeKind = TargetLoweringBase::LegalizeKind;
  using LegalizeTypeAction = TargetLoweringBase::LegalizeTypeAction;

  EVT WidenedVT = EVT::getVectorVT(Context, ElemVT, LegalNumElems);
  return LegalizeKind(LegalizeTypeAction::TypeWidenVector, WidenedVT);
}

/// Build the LegalizeKind that splits an over-long vector of \p ElemVT
/// elements into two halves, rounding the half length up for odd counts.
static TargetLoweringBase::LegalizeKind
getSplitVectorConversion(LLVMContext &Context, EVT ElemVT, unsigned NumElems) {
  using LegalizeKind = TargetLoweringBase::LegalizeKind;
  using LegalizeTypeAction = TargetLoweringBase::LegalizeTypeAction;

  const unsigned HalfNumElems = (NumElems + 1) / 2;
  EVT HalfVT = EVT::getVectorVT(Context, ElemVT, HalfNumElems);
  return LegalizeKind(LegalizeTypeAction::TypeSplitVector, HalfVT);
}

/// Custom type-legalization for EVT vectors on VE: promote narrow elements to
/// 32 bits, then widen or split to the native vector register length (256
/// elements; 512 in packed mode). Returns None to defer to the builtin rules.
Optional<TargetLoweringBase::LegalizeKind>
VETargetLowering::getCustomTypeConversion(LLVMContext &Context, EVT VT) const {
  // Do not interfere with SPU legalization: non-vectors, VPU-disabled
  // subtargets, and single-element vectors go through the builtin path.
  if (!VT.isVector() || !Subtarget->enableVPU() ||
      VT.getVectorNumElements() == 1)
    return None;

  EVT ElemVT = VT.getVectorElementType();
  unsigned NumElems = VT.getVectorNumElements();
  auto ElemBits = ElemVT.getScalarSizeInBits();

  // Packed registers are needed either for over-packed element types or when
  // a packable (<=32-bit) element count exceeds the regular 256 elements.
  const bool RequiresPackedRegister =
      isOverPackedType(VT) || (isPackableElemVT(ElemVT) && NumElems > 256);

  // Already a legal type - nothing to do.
  if (isVectorRegisterVT(VT) &&
      (!RequiresPackedRegister || Subtarget->hasPackedMode()))
    return None;

  // Promote small elements to i/f32 (i1 is left alone - it maps to mask
  // registers).
  if (1 < ElemBits && ElemBits < 32)
    return getPromoteElementConversion(Context, ElemVT, NumElems);

  // Excessive element size.
  if (ElemBits > 64)
    return None; // Defer to builtin expansion for oversized vectors.

  // Only use packed mode when the subtarget supports it AND the type actually
  // surpasses the regular (256 elements) vector size.
  const bool UsePackedRegister =
      Subtarget->hasPackedMode() && RequiresPackedRegister;

  // Widen short vectors to the full register width.
  const unsigned RegisterNumElems = UsePackedRegister ? 512 : 256;
  if (NumElems < RegisterNumElems)
    return getWidenVectorConversion(Context, ElemVT, RegisterNumElems);

  // Split over-long vectors in half.
  // TODO: Teach isel to split non-power-of-two vectors.
  if (NumElems > RegisterNumElems && (NumElems % 2 == 0))
    return getSplitVectorConversion(Context, ElemVT, NumElems);

  // Type is either legal or not custom converted.
  return None;
}

/// Report the register type and count used to pass \p VT for calling
/// convention \p CC, or None to fall back to the default lowering.
Optional<VETargetLowering::RegisterCountPair>
VETargetLowering::getRegistersForCallingConv(LLVMContext &Context,
                                             CallingConv::ID CC, EVT VT) const {
  // Only fastcc with fixed-length vector types is handled here.
  if (CC != CallingConv::Fast || !VT.isVector() || VT.isScalableVector())
    return None;

  // Derive the register type/count from the vector type breakdown.
  MVT RegisterVT;
  EVT IntermediateVT;
  unsigned NumIntermediates;
  const unsigned NumRegs = getVectorTypeBreakdownForCallingConv(
      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
  return RegisterCountPair{RegisterVT, NumRegs};
}

/// Compute how a vector argument/return of type \p VT is broken into
/// registers for calling convention \p CC. For fastcc, iterates the custom
/// type-conversion rules to a fixed point so vectors have the same layout
/// inside functions and across call boundaries; everything else defers to
/// the base-class implementation.
/// NOTE(review): \p VT is dereferenced as a vector (getVectorElementType)
/// before any non-vector guard — presumably callers only pass vector types;
/// verify against the framework call sites.
unsigned VETargetLowering::getVectorTypeBreakdownForCallingConv(
    LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
    unsigned &NumIntermediates, MVT &RegisterVT) const {
  // Fallback to the builtin breakdown logic.
  auto DefaultImpl = [&]() {
    return TargetLoweringBase::getVectorTypeBreakdownForCallingConv(
        Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
  };

  auto ElemVT = VT.getVectorElementType();
  unsigned NumElems = VT.isScalableVector() ? 0 : VT.getVectorNumElements();
  // Whether this type can only be held in a packed-mode register.
  const bool RequiresPackedRegister =
      !VT.isScalableVector() &&
      (isOverPackedType(VT) || (isPackableElemVT(ElemVT) && NumElems > 256));

  // Use the default for anything that is not fastcc, is scalable, or is
  // already a legal register type (unless packed mode must reshape it).
  if (CC != CallingConv::Fast || VT.isScalableVector() ||
      (isVectorRegisterVT(VT) &&
       !(Subtarget->hasPackedMode() && RequiresPackedRegister)))
    return DefaultImpl();

  // fastcc - map everything to vregs.
  auto LK = getCustomTypeConversion(Context, VT);
  // Non-custom converted type - back to builtin logic.
  if (!LK.hasValue())
    return DefaultImpl();

  // Compute the fixed point of the custom type conversion rules.
  // We want to have the same vector layout inside functions as well as across
  // function boundaries.

  // IntermediateVT : used to copy the parts.
  IntermediateVT = VT;
  NumIntermediates = 1;

  EVT NextVT;
  do {
    NextVT = LK->second;
    auto LTA = LK->first;

    switch (LTA) {
    default:
      // Any other conversion step is not modeled here - use the builtin.
      return DefaultImpl();

    case LegalizeTypeAction::TypePromoteFloat:
    case LegalizeTypeAction::TypePromoteInteger:
      // Promote elements across call boundaries.
      IntermediateVT = NextVT;
      break;

    case LegalizeTypeAction::TypeWidenVector:
      // Retain all information about the original vector length.
      // That is, keep the IntermediateVT at the original vector length if
      // possible
      break;

    case LegalizeTypeAction::TypeSplitVector:
      // The last split results in the intermediate VT used for copying vectors
      // at calls. Each split doubles the number of intermediate parts.
      IntermediateVT = NextVT;
      NumIntermediates *= 2;
      break;
    }

    // Continue converting until the type is no longer custom converted.
    LK = getCustomTypeConversion(Context, NextVT);
  } while (LK.hasValue());

  RegisterVT = NextVT.getSimpleVT();

  // Must converge in a valid RegisterVT.
  return NumIntermediates;
}
Loading