From c647c580cac763f4105be1e6ed10924266518fd7 Mon Sep 17 00:00:00 2001 From: Jon Roelofs Date: Tue, 27 May 2025 18:39:56 -0700 Subject: [PATCH 1/2] [Matrix] Propagate shape information through fdiv insts --- .../Scalar/LowerMatrixIntrinsics.cpp | 3 +++ .../Transforms/LowerMatrixIntrinsics/binop.ll | 26 +++++++++++++++++++ 2 files changed, 29 insertions(+) create mode 100644 llvm/test/Transforms/LowerMatrixIntrinsics/binop.ll diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp index 56d4be513ea6f..259148124c701 100644 --- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp +++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp @@ -233,6 +233,7 @@ static bool isUniformShape(Value *V) { case Instruction::FAdd: case Instruction::FSub: case Instruction::FMul: // Scalar multiply. + case Instruction::FDiv: case Instruction::FNeg: case Instruction::Add: case Instruction::Mul: @@ -2167,6 +2168,8 @@ class LowerMatrixIntrinsics { return Builder.CreateFAdd(LHS, RHS); case Instruction::FMul: return Builder.CreateFMul(LHS, RHS); + case Instruction::FDiv: + return Builder.CreateFDiv(LHS, RHS); case Instruction::FSub: return Builder.CreateFSub(LHS, RHS); default: diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/binop.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/binop.ll new file mode 100644 index 0000000000000..fd3e440d779ea --- /dev/null +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/binop.ll @@ -0,0 +1,26 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes='lower-matrix-intrinsics' -S < %s | FileCheck %s + +define void @fdiv_2x2(ptr %num, ptr %denom, ptr %out) { +; CHECK-LABEL: @fdiv_2x2( +; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[NUM:%.*]], align 32 +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[NUM]], i64 2 +; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 16 +; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x double>, ptr [[DENOM:%.*]], align 32 +; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, ptr [[DENOM]], i64 2 +; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = fdiv <2 x double> [[COL_LOAD]], [[COL_LOAD2]] +; CHECK-NEXT: [[TMP2:%.*]] = fdiv <2 x double> [[COL_LOAD1]], [[COL_LOAD4]] +; CHECK-NEXT: store <2 x double> [[TMP1]], ptr [[OUT:%.*]], align 32 +; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[OUT]], i64 2 +; CHECK-NEXT: store <2 x double> [[TMP2]], ptr [[VEC_GEP5]], align 16 +; CHECK-NEXT: ret void +; + %numv = load <4 x double>, ptr %num + %denomv = load <4 x double>, ptr %denom + %div = fdiv <4 x double> %numv, %denomv + %divt = call <4 x double> @llvm.matrix.transpose(<4 x double> %div, i32 2, i32 2) + %divtt = call <4 x double> @llvm.matrix.transpose(<4 x double> %divt, i32 2, i32 2) + store <4 x double> %divtt, ptr %out + ret void +} From 39c585c34d6775f0d2b141d945a4f7668a9aa9b3 Mon Sep 17 00:00:00 2001 From: Jon Roelofs Date: Wed, 28 May 2025 07:43:21 -0700 Subject: [PATCH 2/2] Extend split behavior to any binop --- .../Scalar/LowerMatrixIntrinsics.cpp | 35 +- .../Transforms/LowerMatrixIntrinsics/binop.ll | 414 +++++++++++++++++- 2 files changed, 416 insertions(+), 33 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp index 259148124c701..756a72e6d97bc 100644 --- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp +++ 
b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp @@ -229,15 +229,11 @@ static bool isUniformShape(Value *V) { if (!I) return true; + if (I->isBinaryOp()) + return true; + switch (I->getOpcode()) { - case Instruction::FAdd: - case Instruction::FSub: - case Instruction::FMul: // Scalar multiply. - case Instruction::FDiv: case Instruction::FNeg: - case Instruction::Add: - case Instruction::Mul: - case Instruction::Sub: return true; default: return false; @@ -2155,30 +2151,9 @@ class LowerMatrixIntrinsics { Builder.setFastMathFlags(getFastMathFlags(Inst)); - // Helper to perform binary op on vectors. - auto BuildVectorOp = [&Builder, Inst](Value *LHS, Value *RHS) { - switch (Inst->getOpcode()) { - case Instruction::Add: - return Builder.CreateAdd(LHS, RHS); - case Instruction::Mul: - return Builder.CreateMul(LHS, RHS); - case Instruction::Sub: - return Builder.CreateSub(LHS, RHS); - case Instruction::FAdd: - return Builder.CreateFAdd(LHS, RHS); - case Instruction::FMul: - return Builder.CreateFMul(LHS, RHS); - case Instruction::FDiv: - return Builder.CreateFDiv(LHS, RHS); - case Instruction::FSub: - return Builder.CreateFSub(LHS, RHS); - default: - llvm_unreachable("Unsupported binary operator for matrix"); - } - }; - for (unsigned I = 0; I < Shape.getNumVectors(); ++I) - Result.addVector(BuildVectorOp(A.getVector(I), B.getVector(I))); + Result.addVector(Builder.CreateBinOp(Inst->getOpcode(), A.getVector(I), + B.getVector(I))); finalizeLowering(Inst, Result.addNumComputeOps(getNumOps(Result.getVectorTy()) * diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/binop.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/binop.ll index fd3e440d779ea..9160ced2715aa 100644 --- a/llvm/test/Transforms/LowerMatrixIntrinsics/binop.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/binop.ll @@ -1,6 +1,198 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -passes='lower-matrix-intrinsics' -S < %s | FileCheck %s +define void @add_2x2(ptr %lhs, ptr %rhs, ptr %out) { +; CHECK-LABEL: @add_2x2( +; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x i32>, ptr [[LHS:%.*]], align 16 +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i32, ptr [[LHS]], i64 2 +; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x i32>, ptr [[VEC_GEP]], align 8 +; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x i32>, ptr [[RHS:%.*]], align 16 +; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr i32, ptr [[RHS]], i64 2 +; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x i32>, ptr [[VEC_GEP3]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i32> [[COL_LOAD]], [[COL_LOAD2]] +; CHECK-NEXT: [[TMP2:%.*]] = add <2 x i32> [[COL_LOAD1]], [[COL_LOAD4]] +; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 16 +; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr i32, ptr [[OUT]], i64 2 +; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[VEC_GEP5]], align 8 +; CHECK-NEXT: ret void +; + %lhsv = load <4 x i32>, ptr %lhs + %rhsv = load <4 x i32>, ptr %rhs + %op = add <4 x i32> %lhsv, %rhsv + %opt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %op, i32 2, i32 2) + %optt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %opt, i32 2, i32 2) + store <4 x i32> %optt, ptr %out + ret void +} + +define void @fadd_2x2(ptr %lhs, ptr %rhs, ptr %out) { +; CHECK-LABEL: @fadd_2x2( +; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x float>, ptr [[LHS:%.*]], align 16 +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr float, ptr [[LHS]], i64 2 +; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x float>, ptr [[VEC_GEP]], align 8 +; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x float>, 
ptr [[RHS:%.*]], align 16 +; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr float, ptr [[RHS]], i64 2 +; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x float>, ptr [[VEC_GEP3]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x float> [[COL_LOAD]], [[COL_LOAD2]] +; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x float> [[COL_LOAD1]], [[COL_LOAD4]] +; CHECK-NEXT: store <2 x float> [[TMP1]], ptr [[OUT:%.*]], align 16 +; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr float, ptr [[OUT]], i64 2 +; CHECK-NEXT: store <2 x float> [[TMP2]], ptr [[VEC_GEP5]], align 8 +; CHECK-NEXT: ret void +; + %lhsv = load <4 x float>, ptr %lhs + %rhsv = load <4 x float>, ptr %rhs + %op = fadd <4 x float> %lhsv, %rhsv + %opt = call <4 x float> @llvm.matrix.transpose(<4 x float> %op, i32 2, i32 2) + %optt = call <4 x float> @llvm.matrix.transpose(<4 x float> %opt, i32 2, i32 2) + store <4 x float> %optt, ptr %out + ret void +} + +define void @sub_2x2(ptr %lhs, ptr %rhs, ptr %out) { +; CHECK-LABEL: @sub_2x2( +; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x i32>, ptr [[LHS:%.*]], align 16 +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i32, ptr [[LHS]], i64 2 +; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x i32>, ptr [[VEC_GEP]], align 8 +; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x i32>, ptr [[RHS:%.*]], align 16 +; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr i32, ptr [[RHS]], i64 2 +; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x i32>, ptr [[VEC_GEP3]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = sub <2 x i32> [[COL_LOAD]], [[COL_LOAD2]] +; CHECK-NEXT: [[TMP2:%.*]] = sub <2 x i32> [[COL_LOAD1]], [[COL_LOAD4]] +; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 16 +; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr i32, ptr [[OUT]], i64 2 +; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[VEC_GEP5]], align 8 +; CHECK-NEXT: ret void +; + %lhsv = load <4 x i32>, ptr %lhs + %rhsv = load <4 x i32>, ptr %rhs + %op = sub <4 x i32> %lhsv, %rhsv + %opt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %op, i32 2, i32 2) + %optt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %opt, i32 2, i32 2) + store <4 x i32> %optt, ptr %out + ret void +} + +define void @fsub_2x2(ptr %lhs, ptr %rhs, ptr %out) { +; CHECK-LABEL: @fsub_2x2( +; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x float>, ptr [[LHS:%.*]], align 16 +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr float, ptr [[LHS]], i64 2 +; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x float>, ptr [[VEC_GEP]], align 8 +; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x float>, ptr [[RHS:%.*]], align 16 +; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr float, ptr [[RHS]], i64 2 +; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x float>, ptr [[VEC_GEP3]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = fsub nnan <2 x float> [[COL_LOAD]], [[COL_LOAD2]] +; CHECK-NEXT: [[TMP2:%.*]] = fsub nnan <2 x float> [[COL_LOAD1]], [[COL_LOAD4]] +; CHECK-NEXT: store <2 x float> [[TMP1]], ptr [[OUT:%.*]], align 16 +; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr float, ptr [[OUT]], i64 2 +; CHECK-NEXT: store <2 x float> [[TMP2]], ptr [[VEC_GEP5]], align 8 +; CHECK-NEXT: ret void +; + %lhsv = load <4 x float>, ptr %lhs + %rhsv = load <4 x float>, ptr %rhs + %op = fsub nnan <4 x float> %lhsv, %rhsv + %opt = call <4 x float> @llvm.matrix.transpose(<4 x float> %op, i32 2, i32 2) + %optt = call <4 x float> @llvm.matrix.transpose(<4 x float> %opt, i32 2, i32 2) + store <4 x float> %optt, ptr %out + ret void +} + +define void @mul_2x2(ptr %lhs, ptr %rhs, ptr %out) { +; CHECK-LABEL: @mul_2x2( +; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x i32>, ptr [[LHS:%.*]], align 16 +; 
CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i32, ptr [[LHS]], i64 2 +; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x i32>, ptr [[VEC_GEP]], align 8 +; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x i32>, ptr [[RHS:%.*]], align 16 +; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr i32, ptr [[RHS]], i64 2 +; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x i32>, ptr [[VEC_GEP3]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = mul <2 x i32> [[COL_LOAD]], [[COL_LOAD2]] +; CHECK-NEXT: [[TMP2:%.*]] = mul <2 x i32> [[COL_LOAD1]], [[COL_LOAD4]] +; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 16 +; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr i32, ptr [[OUT]], i64 2 +; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[VEC_GEP5]], align 8 +; CHECK-NEXT: ret void +; + %lhsv = load <4 x i32>, ptr %lhs + %rhsv = load <4 x i32>, ptr %rhs + %op = mul <4 x i32> %lhsv, %rhsv + %opt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %op, i32 2, i32 2) + %optt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %opt, i32 2, i32 2) + store <4 x i32> %optt, ptr %out + ret void +} + +define void @fmul_2x2(ptr %lhs, ptr %rhs, ptr %out) { +; CHECK-LABEL: @fmul_2x2( +; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x float>, ptr [[LHS:%.*]], align 16 +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr float, ptr [[LHS]], i64 2 +; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x float>, ptr [[VEC_GEP]], align 8 +; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x float>, ptr [[RHS:%.*]], align 16 +; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr float, ptr [[RHS]], i64 2 +; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x float>, ptr [[VEC_GEP3]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = fmul contract <2 x float> [[COL_LOAD]], [[COL_LOAD2]] +; CHECK-NEXT: [[TMP2:%.*]] = fmul contract <2 x float> [[COL_LOAD1]], [[COL_LOAD4]] +; CHECK-NEXT: store <2 x float> [[TMP1]], ptr [[OUT:%.*]], align 16 +; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr float, ptr [[OUT]], i64 2 +; CHECK-NEXT: store <2 x float> [[TMP2]], ptr [[VEC_GEP5]], align 8 +; CHECK-NEXT: ret void +; + %lhsv = load <4 x float>, ptr %lhs + %rhsv = load <4 x float>, ptr %rhs + %op = fmul contract <4 x float> %lhsv, %rhsv + %opt = call <4 x float> @llvm.matrix.transpose(<4 x float> %op, i32 2, i32 2) + %optt = call <4 x float> @llvm.matrix.transpose(<4 x float> %opt, i32 2, i32 2) + store <4 x float> %optt, ptr %out + ret void +} + +define void @udiv_2x2(ptr %lhs, ptr %rhs, ptr %out) { +; CHECK-LABEL: @udiv_2x2( +; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x i32>, ptr [[LHS:%.*]], align 16 +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i32, ptr [[LHS]], i64 2 +; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x i32>, ptr [[VEC_GEP]], align 8 +; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x i32>, ptr [[RHS:%.*]], align 16 +; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr i32, ptr [[RHS]], i64 2 +; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x i32>, ptr [[VEC_GEP3]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = udiv <2 x i32> [[COL_LOAD]], [[COL_LOAD2]] +; CHECK-NEXT: [[TMP2:%.*]] = udiv <2 x i32> [[COL_LOAD1]], [[COL_LOAD4]] +; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 16 +; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr i32, ptr [[OUT]], i64 2 +; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[VEC_GEP5]], align 8 +; CHECK-NEXT: ret void +; + %lhsv = load <4 x i32>, ptr %lhs + %rhsv = load <4 x i32>, ptr %rhs + %op = udiv <4 x i32> %lhsv, %rhsv + %opt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %op, i32 2, i32 2) + %optt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %opt, i32 2, i32 2) + store <4 x i32> %optt, ptr %out + ret 
void +} + +define void @sdiv_2x2(ptr %lhs, ptr %rhs, ptr %out) { +; CHECK-LABEL: @sdiv_2x2( +; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x i32>, ptr [[LHS:%.*]], align 16 +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i32, ptr [[LHS]], i64 2 +; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x i32>, ptr [[VEC_GEP]], align 8 +; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x i32>, ptr [[RHS:%.*]], align 16 +; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr i32, ptr [[RHS]], i64 2 +; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x i32>, ptr [[VEC_GEP3]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = sdiv <2 x i32> [[COL_LOAD]], [[COL_LOAD2]] +; CHECK-NEXT: [[TMP2:%.*]] = sdiv <2 x i32> [[COL_LOAD1]], [[COL_LOAD4]] +; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 16 +; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr i32, ptr [[OUT]], i64 2 +; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[VEC_GEP5]], align 8 +; CHECK-NEXT: ret void +; + %lhsv = load <4 x i32>, ptr %lhs + %rhsv = load <4 x i32>, ptr %rhs + %op = sdiv <4 x i32> %lhsv, %rhsv + %opt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %op, i32 2, i32 2) + %optt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %opt, i32 2, i32 2) + store <4 x i32> %optt, ptr %out + ret void +} + define void @fdiv_2x2(ptr %num, ptr %denom, ptr %out) { ; CHECK-LABEL: @fdiv_2x2( ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[NUM:%.*]], align 32 @@ -9,8 +201,8 @@ define void @fdiv_2x2(ptr %num, ptr %denom, ptr %out) { ; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x double>, ptr [[DENOM:%.*]], align 32 ; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, ptr [[DENOM]], i64 2 ; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 16 -; CHECK-NEXT: [[TMP1:%.*]] = fdiv <2 x double> [[COL_LOAD]], [[COL_LOAD2]] -; CHECK-NEXT: [[TMP2:%.*]] = fdiv <2 x double> [[COL_LOAD1]], [[COL_LOAD4]] +; CHECK-NEXT: [[TMP1:%.*]] = fdiv nnan <2 x double> [[COL_LOAD]], [[COL_LOAD2]] +; CHECK-NEXT: [[TMP2:%.*]] = fdiv nnan <2 x double> [[COL_LOAD1]], [[COL_LOAD4]] ; CHECK-NEXT: store <2 x double> [[TMP1]], ptr [[OUT:%.*]], align 32 ; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr double, ptr [[OUT]], i64 2 ; CHECK-NEXT: store <2 x double> [[TMP2]], ptr [[VEC_GEP5]], align 16 @@ -18,9 +210,225 @@ define void @fdiv_2x2(ptr %num, ptr %denom, ptr %out) { ; %numv = load <4 x double>, ptr %num %denomv = load <4 x double>, ptr %denom - %div = fdiv <4 x double> %numv, %denomv + %div = fdiv nnan <4 x double> %numv, %denomv %divt = call <4 x double> @llvm.matrix.transpose(<4 x double> %div, i32 2, i32 2) %divtt = call <4 x double> @llvm.matrix.transpose(<4 x double> %divt, i32 2, i32 2) store <4 x double> %divtt, ptr %out ret void } + +define void @urem_2x2(ptr %lhs, ptr %rhs, ptr %out) { +; CHECK-LABEL: @urem_2x2( +; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x i32>, ptr [[LHS:%.*]], align 16 +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i32, ptr [[LHS]], i64 2 +; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x i32>, ptr [[VEC_GEP]], align 8 +; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x i32>, ptr [[RHS:%.*]], align 16 +; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr i32, ptr [[RHS]], i64 2 +; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x i32>, ptr [[VEC_GEP3]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = urem <2 x i32> [[COL_LOAD]], [[COL_LOAD2]] +; CHECK-NEXT: [[TMP2:%.*]] = urem <2 x i32> [[COL_LOAD1]], [[COL_LOAD4]] +; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 16 +; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr i32, ptr [[OUT]], i64 2 +; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr 
[[VEC_GEP5]], align 8 +; CHECK-NEXT: ret void +; + %lhsv = load <4 x i32>, ptr %lhs + %rhsv = load <4 x i32>, ptr %rhs + %op = urem <4 x i32> %lhsv, %rhsv + %opt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %op, i32 2, i32 2) + %optt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %opt, i32 2, i32 2) + store <4 x i32> %optt, ptr %out + ret void +} + +define void @srem_2x2(ptr %lhs, ptr %rhs, ptr %out) { +; CHECK-LABEL: @srem_2x2( +; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x i32>, ptr [[LHS:%.*]], align 16 +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i32, ptr [[LHS]], i64 2 +; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x i32>, ptr [[VEC_GEP]], align 8 +; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x i32>, ptr [[RHS:%.*]], align 16 +; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr i32, ptr [[RHS]], i64 2 +; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x i32>, ptr [[VEC_GEP3]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = srem <2 x i32> [[COL_LOAD]], [[COL_LOAD2]] +; CHECK-NEXT: [[TMP2:%.*]] = srem <2 x i32> [[COL_LOAD1]], [[COL_LOAD4]] +; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 16 +; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr i32, ptr [[OUT]], i64 2 +; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[VEC_GEP5]], align 8 +; CHECK-NEXT: ret void +; + %lhsv = load <4 x i32>, ptr %lhs + %rhsv = load <4 x i32>, ptr %rhs + %op = srem <4 x i32> %lhsv, %rhsv + %opt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %op, i32 2, i32 2) + %optt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %opt, i32 2, i32 2) + store <4 x i32> %optt, ptr %out + ret void +} + +define void @frem_2x2(ptr %lhs, ptr %rhs, ptr %out) { +; CHECK-LABEL: @frem_2x2( +; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x float>, ptr [[LHS:%.*]], align 16 +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr float, ptr [[LHS]], i64 2 +; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x float>, ptr [[VEC_GEP]], align 8 +; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x float>, ptr [[RHS:%.*]], align 16 +; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr float, ptr [[RHS]], i64 2 +; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x float>, ptr [[VEC_GEP3]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = frem fast <2 x float> [[COL_LOAD]], [[COL_LOAD2]] +; CHECK-NEXT: [[TMP2:%.*]] = frem fast <2 x float> [[COL_LOAD1]], [[COL_LOAD4]] +; CHECK-NEXT: store <2 x float> [[TMP1]], ptr [[OUT:%.*]], align 16 +; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr float, ptr [[OUT]], i64 2 +; CHECK-NEXT: store <2 x float> [[TMP2]], ptr [[VEC_GEP5]], align 8 +; CHECK-NEXT: ret void +; + %lhsv = load <4 x float>, ptr %lhs + %rhsv = load <4 x float>, ptr %rhs + %op = frem fast <4 x float> %lhsv, %rhsv + %opt = call <4 x float> @llvm.matrix.transpose(<4 x float> %op, i32 2, i32 2) + %optt = call <4 x float> @llvm.matrix.transpose(<4 x float> %opt, i32 2, i32 2) + store <4 x float> %optt, ptr %out + ret void +} + +define void @shl_2x2(ptr %lhs, ptr %rhs, ptr %out) { +; CHECK-LABEL: @shl_2x2( +; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x i32>, ptr [[LHS:%.*]], align 16 +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i32, ptr [[LHS]], i64 2 +; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x i32>, ptr [[VEC_GEP]], align 8 +; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x i32>, ptr [[RHS:%.*]], align 16 +; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr i32, ptr [[RHS]], i64 2 +; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x i32>, ptr [[VEC_GEP3]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = shl <2 x i32> [[COL_LOAD]], [[COL_LOAD2]] +; CHECK-NEXT: [[TMP2:%.*]] = shl <2 x i32> [[COL_LOAD1]], [[COL_LOAD4]] +; CHECK-NEXT: store <2 x 
i32> [[TMP1]], ptr [[OUT:%.*]], align 16 +; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr i32, ptr [[OUT]], i64 2 +; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[VEC_GEP5]], align 8 +; CHECK-NEXT: ret void +; + %lhsv = load <4 x i32>, ptr %lhs + %rhsv = load <4 x i32>, ptr %rhs + %op = shl <4 x i32> %lhsv, %rhsv + %opt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %op, i32 2, i32 2) + %optt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %opt, i32 2, i32 2) + store <4 x i32> %optt, ptr %out + ret void +} + +define void @lshr_2x2(ptr %lhs, ptr %rhs, ptr %out) { +; CHECK-LABEL: @lshr_2x2( +; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x i32>, ptr [[LHS:%.*]], align 16 +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i32, ptr [[LHS]], i64 2 +; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x i32>, ptr [[VEC_GEP]], align 8 +; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x i32>, ptr [[RHS:%.*]], align 16 +; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr i32, ptr [[RHS]], i64 2 +; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x i32>, ptr [[VEC_GEP3]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = lshr <2 x i32> [[COL_LOAD]], [[COL_LOAD2]] +; CHECK-NEXT: [[TMP2:%.*]] = lshr <2 x i32> [[COL_LOAD1]], [[COL_LOAD4]] +; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 16 +; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr i32, ptr [[OUT]], i64 2 +; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[VEC_GEP5]], align 8 +; CHECK-NEXT: ret void +; + %lhsv = load <4 x i32>, ptr %lhs + %rhsv = load <4 x i32>, ptr %rhs + %op = lshr <4 x i32> %lhsv, %rhsv + %opt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %op, i32 2, i32 2) + %optt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %opt, i32 2, i32 2) + store <4 x i32> %optt, ptr %out + ret void +} + +define void @ashr_2x2(ptr %lhs, ptr %rhs, ptr %out) { +; CHECK-LABEL: @ashr_2x2( +; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x i32>, ptr [[LHS:%.*]], align 16 +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i32, ptr [[LHS]], i64 2 +; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x i32>, ptr [[VEC_GEP]], align 8 +; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x i32>, ptr [[RHS:%.*]], align 16 +; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr i32, ptr [[RHS]], i64 2 +; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x i32>, ptr [[VEC_GEP3]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = ashr <2 x i32> [[COL_LOAD]], [[COL_LOAD2]] +; CHECK-NEXT: [[TMP2:%.*]] = ashr <2 x i32> [[COL_LOAD1]], [[COL_LOAD4]] +; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 16 +; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr i32, ptr [[OUT]], i64 2 +; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[VEC_GEP5]], align 8 +; CHECK-NEXT: ret void +; + %lhsv = load <4 x i32>, ptr %lhs + %rhsv = load <4 x i32>, ptr %rhs + %op = ashr <4 x i32> %lhsv, %rhsv + %opt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %op, i32 2, i32 2) + %optt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %opt, i32 2, i32 2) + store <4 x i32> %optt, ptr %out + ret void +} + +define void @and_2x2(ptr %lhs, ptr %rhs, ptr %out) { +; CHECK-LABEL: @and_2x2( +; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x i32>, ptr [[LHS:%.*]], align 16 +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i32, ptr [[LHS]], i64 2 +; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x i32>, ptr [[VEC_GEP]], align 8 +; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x i32>, ptr [[RHS:%.*]], align 16 +; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr i32, ptr [[RHS]], i64 2 +; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x i32>, ptr [[VEC_GEP3]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i32> [[COL_LOAD]], 
[[COL_LOAD2]] +; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i32> [[COL_LOAD1]], [[COL_LOAD4]] +; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 16 +; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr i32, ptr [[OUT]], i64 2 +; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[VEC_GEP5]], align 8 +; CHECK-NEXT: ret void +; + %lhsv = load <4 x i32>, ptr %lhs + %rhsv = load <4 x i32>, ptr %rhs + %op = and <4 x i32> %lhsv, %rhsv + %opt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %op, i32 2, i32 2) + %optt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %opt, i32 2, i32 2) + store <4 x i32> %optt, ptr %out + ret void +} + +define void @or_2x2(ptr %lhs, ptr %rhs, ptr %out) { +; CHECK-LABEL: @or_2x2( +; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x i32>, ptr [[LHS:%.*]], align 16 +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i32, ptr [[LHS]], i64 2 +; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x i32>, ptr [[VEC_GEP]], align 8 +; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x i32>, ptr [[RHS:%.*]], align 16 +; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr i32, ptr [[RHS]], i64 2 +; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x i32>, ptr [[VEC_GEP3]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = or <2 x i32> [[COL_LOAD]], [[COL_LOAD2]] +; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i32> [[COL_LOAD1]], [[COL_LOAD4]] +; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 16 +; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr i32, ptr [[OUT]], i64 2 +; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[VEC_GEP5]], align 8 +; CHECK-NEXT: ret void +; + %lhsv = load <4 x i32>, ptr %lhs + %rhsv = load <4 x i32>, ptr %rhs + %op = or <4 x i32> %lhsv, %rhsv + %opt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %op, i32 2, i32 2) + %optt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %opt, i32 2, i32 2) + store <4 x i32> %optt, ptr %out + ret void +} + +define void @xor_2x2(ptr %lhs, ptr %rhs, ptr %out) { +; CHECK-LABEL: @xor_2x2( +; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x i32>, ptr [[LHS:%.*]], align 16 +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i32, ptr [[LHS]], i64 2 +; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x i32>, ptr [[VEC_GEP]], align 8 +; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x i32>, ptr [[RHS:%.*]], align 16 +; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr i32, ptr [[RHS]], i64 2 +; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x i32>, ptr [[VEC_GEP3]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i32> [[COL_LOAD]], [[COL_LOAD2]] +; CHECK-NEXT: [[TMP2:%.*]] = xor <2 x i32> [[COL_LOAD1]], [[COL_LOAD4]] +; CHECK-NEXT: store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 16 +; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr i32, ptr [[OUT]], i64 2 +; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[VEC_GEP5]], align 8 +; CHECK-NEXT: ret void +; + %lhsv = load <4 x i32>, ptr %lhs + %rhsv = load <4 x i32>, ptr %rhs + %op = xor <4 x i32> %lhsv, %rhsv + %opt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %op, i32 2, i32 2) + %optt = call <4 x i32> @llvm.matrix.transpose(<4 x i32> %opt, i32 2, i32 2) + store <4 x i32> %optt, ptr %out + ret void +}
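
Note (reviewer addition, not part of the patches above): after the second patch, FNeg is the only opcode left in isUniformShape's switch, since Instruction::isBinaryOp() covers every two-operand arithmetic, shift, and bitwise opcode but not unary operators. Shape information still propagates through FNeg via the retained case, and as the Builder.setFastMathFlags(getFastMathFlags(Inst)) call kept in the second hunk shows, the fast-math flags of the original instruction (the nnan, contract, and fast flags checked in the tests above) are applied to each per-column op the builder creates. Below is a minimal sketch of a test, in the style of the ones above, that exercises the remaining unary case; the function name is illustrative and the CHECK lines are omitted since update_test_checks.py would regenerate them:

; fneg is unary, so it relies on the remaining switch case rather than
; isBinaryOp(); the transpose round-trip is what gives %op a 2x2 shape.
define void @fneg_2x2(ptr %in, ptr %out) {
  %inv = load <4 x float>, ptr %in
  %op = fneg <4 x float> %inv
  %opt = call <4 x float> @llvm.matrix.transpose(<4 x float> %op, i32 2, i32 2)
  %optt = call <4 x float> @llvm.matrix.transpose(<4 x float> %opt, i32 2, i32 2)
  store <4 x float> %optt, ptr %out
  ret void
}

Conversely, a binop (or fneg) whose operands never reach a matrix intrinsic has no shape to propagate, so the pass leaves the flat vector instruction untouched.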