Skip to content

Commit 27c8a1e

Browse files
Generate explicit bitcasts in NeonEmitter
1 parent 67ba481 commit 27c8a1e

13 files changed

+1018
-962
lines changed

clang/include/clang/Basic/TargetBuiltins.h

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -214,6 +214,10 @@ namespace clang {
214214
EltType ET = getEltType();
215215
return ET == Poly8 || ET == Poly16 || ET == Poly64;
216216
}
217+
bool isFloatingPoint() const {
218+
EltType ET = getEltType();
219+
return ET == Float16 || ET == Float32 || ET == Float64 || ET == BFloat16;
220+
}
217221
bool isUnsigned() const { return (Flags & UnsignedFlag) != 0; }
218222
bool isQuad() const { return (Flags & QuadFlag) != 0; }
219223
unsigned getEltSizeInBits() const {

clang/include/clang/Basic/arm_neon.td

Lines changed: 11 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -129,7 +129,7 @@ def OP_VCVT_NA_HI_F32 : Op<(call "vcombine", $p0, (call "vcvt_f32_f64", $p1))>;
129129
def OP_VCVT_EX_HI_F32 : Op<(call "vcvt_f32_f16", (call "vget_high", $p0))>;
130130
def OP_VCVT_EX_HI_F64 : Op<(call "vcvt_f64_f32", (call "vget_high", $p0))>;
131131
def OP_VCVTX_HI : Op<(call "vcombine", $p0, (call "vcvtx_f32", $p1))>;
132-
def OP_REINT : Op<(cast "R", $p0)>;
132+
def OP_REINT : Op<(bitcast "R", $p0)>;
133133
def OP_ADDHNHi : Op<(call "vcombine", $p0, (call "vaddhn", $p1, $p2))>;
134134
def OP_RADDHNHi : Op<(call "vcombine", $p0, (call "vraddhn", $p1, $p2))>;
135135
def OP_SUBHNHi : Op<(call "vcombine", $p0, (call "vsubhn", $p1, $p2))>;
@@ -929,12 +929,12 @@ def CFMLE : SOpInst<"vcle", "U..", "lUldQdQlQUl", OP_LE>;
929929
def CFMGT : SOpInst<"vcgt", "U..", "lUldQdQlQUl", OP_GT>;
930930
def CFMLT : SOpInst<"vclt", "U..", "lUldQdQlQUl", OP_LT>;
931931

932-
def CMEQ : SInst<"vceqz", "U.",
932+
def CMEQ : SInst<"vceqz", "U(.!)",
933933
"csilfUcUsUiUlPcPlQcQsQiQlQfQUcQUsQUiQUlQPcdQdQPl">;
934-
def CMGE : SInst<"vcgez", "U.", "csilfdQcQsQiQlQfQd">;
935-
def CMLE : SInst<"vclez", "U.", "csilfdQcQsQiQlQfQd">;
936-
def CMGT : SInst<"vcgtz", "U.", "csilfdQcQsQiQlQfQd">;
937-
def CMLT : SInst<"vcltz", "U.", "csilfdQcQsQiQlQfQd">;
934+
def CMGE : SInst<"vcgez", "U(.!)", "csilfdQcQsQiQlQfQd">;
935+
def CMLE : SInst<"vclez", "U(.!)", "csilfdQcQsQiQlQfQd">;
936+
def CMGT : SInst<"vcgtz", "U(.!)", "csilfdQcQsQiQlQfQd">;
937+
def CMLT : SInst<"vcltz", "U(.!)", "csilfdQcQsQiQlQfQd">;
938938

939939
////////////////////////////////////////////////////////////////////////////////
940940
// Max/Min Integer
@@ -1672,11 +1672,11 @@ let TargetGuard = "fullfp16,neon" in {
16721672
// ARMv8.2-A FP16 one-operand vector intrinsics.
16731673

16741674
// Comparison
1675-
def CMEQH : SInst<"vceqz", "U.", "hQh">;
1676-
def CMGEH : SInst<"vcgez", "U.", "hQh">;
1677-
def CMGTH : SInst<"vcgtz", "U.", "hQh">;
1678-
def CMLEH : SInst<"vclez", "U.", "hQh">;
1679-
def CMLTH : SInst<"vcltz", "U.", "hQh">;
1675+
def CMEQH : SInst<"vceqz", "U(.!)", "hQh">;
1676+
def CMGEH : SInst<"vcgez", "U(.!)", "hQh">;
1677+
def CMGTH : SInst<"vcgtz", "U(.!)", "hQh">;
1678+
def CMLEH : SInst<"vclez", "U(.!)", "hQh">;
1679+
def CMLTH : SInst<"vcltz", "U(.!)", "hQh">;
16801680

16811681
// Vector conversion
16821682
def VCVT_F16 : SInst<"vcvt_f16", "F(.!)", "sUsQsQUs">;

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 66 additions & 36 deletions
Original file line number | Diff line number | Diff line change
@@ -8158,8 +8158,9 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
81588158

81598159
// Determine the type of this overloaded NEON intrinsic.
81608160
NeonTypeFlags Type(NeonTypeConst->getZExtValue());
8161-
bool Usgn = Type.isUnsigned();
8162-
bool Quad = Type.isQuad();
8161+
const bool Usgn = Type.isUnsigned();
8162+
const bool Quad = Type.isQuad();
8163+
const bool Floating = Type.isFloatingPoint();
81638164
const bool HasLegalHalfType = getTarget().hasLegalHalfType();
81648165
const bool AllowBFloatArgsAndRet =
81658166
getTargetHooks().getABIInfo().allowBFloatArgsAndRet();
@@ -8260,24 +8261,28 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
82608261
}
82618262
case NEON::BI__builtin_neon_vceqz_v:
82628263
case NEON::BI__builtin_neon_vceqzq_v:
8263-
return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OEQ,
8264-
ICmpInst::ICMP_EQ, "vceqz");
8264+
return EmitAArch64CompareBuiltinExpr(
8265+
Ops[0], Ty, Floating ? ICmpInst::FCMP_OEQ : ICmpInst::ICMP_EQ, "vceqz");
82658266
case NEON::BI__builtin_neon_vcgez_v:
82668267
case NEON::BI__builtin_neon_vcgezq_v:
8267-
return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OGE,
8268-
ICmpInst::ICMP_SGE, "vcgez");
8268+
return EmitAArch64CompareBuiltinExpr(
8269+
Ops[0], Ty, Floating ? ICmpInst::FCMP_OGE : ICmpInst::ICMP_SGE,
8270+
"vcgez");
82698271
case NEON::BI__builtin_neon_vclez_v:
82708272
case NEON::BI__builtin_neon_vclezq_v:
8271-
return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OLE,
8272-
ICmpInst::ICMP_SLE, "vclez");
8273+
return EmitAArch64CompareBuiltinExpr(
8274+
Ops[0], Ty, Floating ? ICmpInst::FCMP_OLE : ICmpInst::ICMP_SLE,
8275+
"vclez");
82738276
case NEON::BI__builtin_neon_vcgtz_v:
82748277
case NEON::BI__builtin_neon_vcgtzq_v:
8275-
return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OGT,
8276-
ICmpInst::ICMP_SGT, "vcgtz");
8278+
return EmitAArch64CompareBuiltinExpr(
8279+
Ops[0], Ty, Floating ? ICmpInst::FCMP_OGT : ICmpInst::ICMP_SGT,
8280+
"vcgtz");
82778281
case NEON::BI__builtin_neon_vcltz_v:
82788282
case NEON::BI__builtin_neon_vcltzq_v:
8279-
return EmitAArch64CompareBuiltinExpr(Ops[0], Ty, ICmpInst::FCMP_OLT,
8280-
ICmpInst::ICMP_SLT, "vcltz");
8283+
return EmitAArch64CompareBuiltinExpr(
8284+
Ops[0], Ty, Floating ? ICmpInst::FCMP_OLT : ICmpInst::ICMP_SLT,
8285+
"vcltz");
82818286
case NEON::BI__builtin_neon_vclz_v:
82828287
case NEON::BI__builtin_neon_vclzq_v:
82838288
// We generate target-independent intrinsic, which needs a second argument
@@ -8840,28 +8845,32 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
88408845
return Builder.CreateBitCast(Result, ResultType, NameHint);
88418846
}
88428847

8843-
Value *CodeGenFunction::EmitAArch64CompareBuiltinExpr(
8844-
Value *Op, llvm::Type *Ty, const CmpInst::Predicate Fp,
8845-
const CmpInst::Predicate Ip, const Twine &Name) {
8846-
llvm::Type *OTy = Op->getType();
8847-
8848-
// FIXME: this is utterly horrific. We should not be looking at previous
8849-
// codegen context to find out what needs doing. Unfortunately TableGen
8850-
// currently gives us exactly the same calls for vceqz_f32 and vceqz_s32
8851-
// (etc).
8852-
if (BitCastInst *BI = dyn_cast<BitCastInst>(Op))
8853-
OTy = BI->getOperand(0)->getType();
8854-
8855-
Op = Builder.CreateBitCast(Op, OTy);
8856-
if (OTy->getScalarType()->isFloatingPointTy()) {
8857-
if (Fp == CmpInst::FCMP_OEQ)
8858-
Op = Builder.CreateFCmp(Fp, Op, Constant::getNullValue(OTy));
8848+
Value *
8849+
CodeGenFunction::EmitAArch64CompareBuiltinExpr(Value *Op, llvm::Type *Ty,
8850+
const CmpInst::Predicate Pred,
8851+
const Twine &Name) {
8852+
8853+
if (isa<FixedVectorType>(Ty)) {
8854+
// Vector types are cast to i8 vectors. Recover original type.
8855+
Op = Builder.CreateBitCast(Op, Ty);
8856+
}
8857+
8858+
if (CmpInst::isFPPredicate(Pred)) {
8859+
if (Pred == CmpInst::FCMP_OEQ)
8860+
Op = Builder.CreateFCmp(Pred, Op, Constant::getNullValue(Op->getType()));
88598861
else
8860-
Op = Builder.CreateFCmpS(Fp, Op, Constant::getNullValue(OTy));
8862+
Op = Builder.CreateFCmpS(Pred, Op, Constant::getNullValue(Op->getType()));
88618863
} else {
8862-
Op = Builder.CreateICmp(Ip, Op, Constant::getNullValue(OTy));
8864+
Op = Builder.CreateICmp(Pred, Op, Constant::getNullValue(Op->getType()));
88638865
}
8864-
return Builder.CreateSExt(Op, Ty, Name);
8866+
8867+
llvm::Type *ResTy = Ty;
8868+
if (auto *VTy = dyn_cast<FixedVectorType>(Ty))
8869+
ResTy = FixedVectorType::get(
8870+
IntegerType::get(getLLVMContext(), VTy->getScalarSizeInBits()),
8871+
VTy->getNumElements());
8872+
8873+
return Builder.CreateSExt(Op, ResTy, Name);
88658874
}
88668875

88678876
static Value *packTBLDVectorList(CodeGenFunction &CGF, ArrayRef<Value *> Ops,
@@ -12350,45 +12359,66 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
1235012359
return Builder.CreateFAdd(Op0, Op1, "vpaddd");
1235112360
}
1235212361
case NEON::BI__builtin_neon_vceqzd_s64:
12362+
Ops.push_back(EmitScalarExpr(E->getArg(0)));
12363+
return EmitAArch64CompareBuiltinExpr(
12364+
Ops[0], ConvertType(E->getCallReturnType(getContext())),
12365+
ICmpInst::ICMP_EQ, "vceqz");
1235312366
case NEON::BI__builtin_neon_vceqzd_f64:
1235412367
case NEON::BI__builtin_neon_vceqzs_f32:
1235512368
case NEON::BI__builtin_neon_vceqzh_f16:
1235612369
Ops.push_back(EmitScalarExpr(E->getArg(0)));
1235712370
return EmitAArch64CompareBuiltinExpr(
1235812371
Ops[0], ConvertType(E->getCallReturnType(getContext())),
12359-
ICmpInst::FCMP_OEQ, ICmpInst::ICMP_EQ, "vceqz");
12372+
ICmpInst::FCMP_OEQ, "vceqz");
1236012373
case NEON::BI__builtin_neon_vcgezd_s64:
12374+
Ops.push_back(EmitScalarExpr(E->getArg(0)));
12375+
return EmitAArch64CompareBuiltinExpr(
12376+
Ops[0], ConvertType(E->getCallReturnType(getContext())),
12377+
ICmpInst::ICMP_SGE, "vcgez");
1236112378
case NEON::BI__builtin_neon_vcgezd_f64:
1236212379
case NEON::BI__builtin_neon_vcgezs_f32:
1236312380
case NEON::BI__builtin_neon_vcgezh_f16:
1236412381
Ops.push_back(EmitScalarExpr(E->getArg(0)));
1236512382
return EmitAArch64CompareBuiltinExpr(
1236612383
Ops[0], ConvertType(E->getCallReturnType(getContext())),
12367-
ICmpInst::FCMP_OGE, ICmpInst::ICMP_SGE, "vcgez");
12384+
ICmpInst::FCMP_OGE, "vcgez");
1236812385
case NEON::BI__builtin_neon_vclezd_s64:
12386+
Ops.push_back(EmitScalarExpr(E->getArg(0)));
12387+
return EmitAArch64CompareBuiltinExpr(
12388+
Ops[0], ConvertType(E->getCallReturnType(getContext())),
12389+
ICmpInst::ICMP_SLE, "vclez");
1236912390
case NEON::BI__builtin_neon_vclezd_f64:
1237012391
case NEON::BI__builtin_neon_vclezs_f32:
1237112392
case NEON::BI__builtin_neon_vclezh_f16:
1237212393
Ops.push_back(EmitScalarExpr(E->getArg(0)));
1237312394
return EmitAArch64CompareBuiltinExpr(
1237412395
Ops[0], ConvertType(E->getCallReturnType(getContext())),
12375-
ICmpInst::FCMP_OLE, ICmpInst::ICMP_SLE, "vclez");
12396+
ICmpInst::FCMP_OLE, "vclez");
1237612397
case NEON::BI__builtin_neon_vcgtzd_s64:
12398+
Ops.push_back(EmitScalarExpr(E->getArg(0)));
12399+
return EmitAArch64CompareBuiltinExpr(
12400+
Ops[0], ConvertType(E->getCallReturnType(getContext())),
12401+
ICmpInst::ICMP_SGT, "vcgtz");
1237712402
case NEON::BI__builtin_neon_vcgtzd_f64:
1237812403
case NEON::BI__builtin_neon_vcgtzs_f32:
1237912404
case NEON::BI__builtin_neon_vcgtzh_f16:
1238012405
Ops.push_back(EmitScalarExpr(E->getArg(0)));
1238112406
return EmitAArch64CompareBuiltinExpr(
1238212407
Ops[0], ConvertType(E->getCallReturnType(getContext())),
12383-
ICmpInst::FCMP_OGT, ICmpInst::ICMP_SGT, "vcgtz");
12408+
ICmpInst::FCMP_OGT, "vcgtz");
1238412409
case NEON::BI__builtin_neon_vcltzd_s64:
12410+
Ops.push_back(EmitScalarExpr(E->getArg(0)));
12411+
return EmitAArch64CompareBuiltinExpr(
12412+
Ops[0], ConvertType(E->getCallReturnType(getContext())),
12413+
ICmpInst::ICMP_SLT, "vcltz");
12414+
1238512415
case NEON::BI__builtin_neon_vcltzd_f64:
1238612416
case NEON::BI__builtin_neon_vcltzs_f32:
1238712417
case NEON::BI__builtin_neon_vcltzh_f16:
1238812418
Ops.push_back(EmitScalarExpr(E->getArg(0)));
1238912419
return EmitAArch64CompareBuiltinExpr(
1239012420
Ops[0], ConvertType(E->getCallReturnType(getContext())),
12391-
ICmpInst::FCMP_OLT, ICmpInst::ICMP_SLT, "vcltz");
12421+
ICmpInst::FCMP_OLT, "vcltz");
1239212422

1239312423
case NEON::BI__builtin_neon_vceqzd_u64: {
1239412424
Ops.push_back(EmitScalarExpr(E->getArg(0)));

clang/lib/CodeGen/CodeGenFunction.h

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -4628,10 +4628,10 @@ class CodeGenFunction : public CodeGenTypeCache {
46284628
llvm::Value *EmitTargetBuiltinExpr(unsigned BuiltinID, const CallExpr *E,
46294629
ReturnValueSlot ReturnValue);
46304630

4631-
llvm::Value *EmitAArch64CompareBuiltinExpr(llvm::Value *Op, llvm::Type *Ty,
4632-
const llvm::CmpInst::Predicate Fp,
4633-
const llvm::CmpInst::Predicate Ip,
4634-
const llvm::Twine &Name = "");
4631+
llvm::Value *
4632+
EmitAArch64CompareBuiltinExpr(llvm::Value *Op, llvm::Type *Ty,
4633+
const llvm::CmpInst::Predicate Pred,
4634+
const llvm::Twine &Name = "");
46354635
llvm::Value *EmitARMBuiltinExpr(unsigned BuiltinID, const CallExpr *E,
46364636
ReturnValueSlot ReturnValue,
46374637
llvm::Triple::ArchType Arch);

clang/test/CodeGen/AArch64/bf16-dotprod-intrinsics.c

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -28,8 +28,8 @@ float32x4_t test_vbfdotq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b){
2828
// CHECK-NEXT: entry:
2929
// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <2 x float>
3030
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[DOTCAST]], <2 x float> poison, <2 x i32> zeroinitializer
31-
// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <4 x bfloat>
32-
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[DOTCAST1]])
31+
// CHECK-NEXT: [[DOTCAST2:%.*]] = bitcast <2 x float> [[LANE]] to <4 x bfloat>
32+
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[DOTCAST2]])
3333
// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]]
3434
//
3535
float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){
@@ -40,8 +40,8 @@ float32x2_t test_vbfdot_lane_f32(float32x2_t r, bfloat16x4_t a, bfloat16x4_t b){
4040
// CHECK-NEXT: entry:
4141
// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <4 x float>
4242
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[DOTCAST]], <4 x float> poison, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
43-
// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <8 x bfloat>
44-
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[DOTCAST1]])
43+
// CHECK-NEXT: [[DOTCAST2:%.*]] = bitcast <4 x float> [[LANE]] to <8 x bfloat>
44+
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[DOTCAST2]])
4545
// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]]
4646
//
4747
float32x4_t test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b) {
@@ -52,8 +52,8 @@ float32x4_t test_vbfdotq_laneq_f32(float32x4_t r, bfloat16x8_t a, bfloat16x8_t b
5252
// CHECK-NEXT: entry:
5353
// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <8 x bfloat> [[B:%.*]] to <4 x float>
5454
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[DOTCAST]], <4 x float> poison, <2 x i32> <i32 3, i32 3>
55-
// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <2 x float> [[LANE]] to <4 x bfloat>
56-
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[DOTCAST1]])
55+
// CHECK-NEXT: [[DOTCAST2:%.*]] = bitcast <2 x float> [[LANE]] to <4 x bfloat>
56+
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v4bf16(<2 x float> [[R:%.*]], <4 x bfloat> [[A:%.*]], <4 x bfloat> [[DOTCAST2]])
5757
// CHECK-NEXT: ret <2 x float> [[VBFDOT3_I]]
5858
//
5959
float32x2_t test_vbfdot_laneq_f32(float32x2_t r, bfloat16x4_t a, bfloat16x8_t b) {
@@ -64,8 +64,8 @@ float32x2_t test_vbfdot_laneq_f32(float32x2_t r, bfloat16x4_t a, bfloat16x8_t b)
6464
// CHECK-NEXT: entry:
6565
// CHECK-NEXT: [[DOTCAST:%.*]] = bitcast <4 x bfloat> [[B:%.*]] to <2 x float>
6666
// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[DOTCAST]], <2 x float> poison, <4 x i32> zeroinitializer
67-
// CHECK-NEXT: [[DOTCAST1:%.*]] = bitcast <4 x float> [[LANE]] to <8 x bfloat>
68-
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[DOTCAST1]])
67+
// CHECK-NEXT: [[DOTCAST2:%.*]] = bitcast <4 x float> [[LANE]] to <8 x bfloat>
68+
// CHECK-NEXT: [[VBFDOT3_I:%.*]] = call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v8bf16(<4 x float> [[R:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[DOTCAST2]])
6969
// CHECK-NEXT: ret <4 x float> [[VBFDOT3_I]]
7070
//
7171
float32x4_t test_vbfdotq_lane_f32(float32x4_t r, bfloat16x8_t a, bfloat16x4_t b) {

0 commit comments

Comments
 (0)