-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[Clang][RISCV] Handle RVV tuple types correctly as OutputOperands for inline asm #67018
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
1e02945
to
565785d
Compare
@llvm/pr-subscribers-llvm-ir @llvm/pr-subscribers-clang-codegen Changes: This commit fixes a compilation issue for RVV tuple types as OutputOperands for inline asm. The LLVM IR generated by the added test case is currently not handled successfully in the backend, which also needs a fix. Full diff: https://github.com/llvm/llvm-project/pull/67018.diff 2 Files Affected:
diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp
index 6674aa2409a5947..0dd79903802f328 100644
--- a/clang/lib/CodeGen/CGStmt.cpp
+++ b/clang/lib/CodeGen/CGStmt.cpp
@@ -19,6 +19,7 @@
#include "clang/AST/Expr.h"
#include "clang/AST/Stmt.h"
#include "clang/AST/StmtVisitor.h"
+#include "clang/AST/Type.h"
#include "clang/Basic/Builtins.h"
#include "clang/Basic/DiagnosticSema.h"
#include "clang/Basic/PrettyStackTrace.h"
@@ -29,10 +30,13 @@
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/IR/Assumptions.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/MDBuilder.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/SaveAndRestore.h"
#include <optional>
@@ -2392,6 +2396,27 @@ EmitAsmStores(CodeGenFunction &CGF, const AsmStmt &S,
Tmp = Builder.CreateZExtOrTrunc(Tmp, TruncTy);
} else if (TruncTy->isVectorTy()) {
Tmp = Builder.CreateBitCast(Tmp, TruncTy);
+ } else if (TruncTy->isStructTy()) {
+ auto *STy = cast<llvm::StructType>(TruncTy);
+ auto *VTy = cast<llvm::ScalableVectorType>(STy->getElementType(0));
+
+ assert(ResultRegQualTys[i]->isRVVType() &&
+ STy->containsHomogeneousScalableVectorTypes() &&
+ "Must be dealing with RVV tuple type");
+
+ unsigned MinElts = VTy->getElementCount().getKnownMinValue();
+ llvm::Value *StructValue = llvm::PoisonValue::get(STy);
+
+ for (unsigned Idx = 0, TupleSize = STy->getNumElements();
+ Idx != TupleSize; ++Idx) {
+ llvm::Value *IdxValue =
+ llvm::ConstantInt::get(CGM.Int64Ty, Idx * MinElts);
+ llvm::Value *SubVec = Builder.CreateExtractVector(VTy, Tmp, IdxValue);
+
+ StructValue = Builder.CreateInsertValue(StructValue, SubVec, Idx);
+ }
+
+ Tmp = StructValue;
}
}
@@ -2399,7 +2424,13 @@ EmitAsmStores(CodeGenFunction &CGF, const AsmStmt &S,
// ResultTypeRequiresCast elements correspond to the first
// ResultTypeRequiresCast.size() elements of RegResults.
if ((i < ResultTypeRequiresCast.size()) && ResultTypeRequiresCast[i]) {
- unsigned Size = CGF.getContext().getTypeSize(ResultRegQualTys[i]);
+ unsigned Size;
+ if (ResultRegQualTys[i]->isRVVType() && TruncTy->isStructTy()) {
+ Size = cast<llvm::ScalableVectorType>(
+ cast<llvm::StructType>(TruncTy)->getElementType(0))
+ ->getScalarSizeInBits();
+ } else
+ Size = CGF.getContext().getTypeSize(ResultRegQualTys[i]);
Address A = Dest.getAddress(CGF).withElementType(ResultRegTypes[i]);
if (CGF.getTargetHooks().isScalarizableAsmOperand(CGF, TruncTy)) {
Builder.CreateStore(Tmp, A);
@@ -2524,11 +2555,32 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) {
ResultRegIsFlagReg.push_back(IsFlagReg);
llvm::Type *Ty = ConvertTypeForMem(QTy);
+ ResultTruncRegTypes.push_back(Ty);
+
+ // Expressing the type as a structure in inline asm calls will complicate
+ // the current code base, so instead, the return type is set to be a
+ // single scalable vector, then reconstructed with `vector.extract` and
+ // `insertvalue`. The type is derived here, and the reconstruction is done
+ // under EmitAsmStores.
+ if (QTy->isRVVType() && isa<llvm::StructType>(Ty)) {
+ // Flatten the structure into a single ScalableVectorType
+ auto *STy = cast<llvm::StructType>(Ty);
+ assert(STy->containsHomogeneousScalableVectorTypes() &&
+ isa<llvm::ScalableVectorType>(STy->getElementType(0)) &&
+ "Dealing with RVV tuple (aggregate with homogeneous scalable "
+ "vectors");
+
+ auto *VecTy = cast<llvm::ScalableVectorType>(STy->getElementType(0));
+
+ Ty = llvm::ScalableVectorType::get(VecTy->getScalarType(),
+ STy->getNumElements() *
+ VecTy->getMinNumElements());
+ }
+
const bool RequiresCast = Info.allowsRegister() &&
(getTargetHooks().isScalarizableAsmOperand(*this, Ty) ||
Ty->isAggregateType());
- ResultTruncRegTypes.push_back(Ty);
ResultTypeRequiresCast.push_back(RequiresCast);
if (RequiresCast) {
@@ -2551,6 +2603,13 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) {
QualType InputTy = S.getInputExpr(InputNo)->getType();
QualType OutputType = OutExpr->getType();
+ if ((InputTy->isRVVType() &&
+ isa<llvm::StructType>(ConvertType(InputTy))) ||
+ (OutputType->isRVVType() &&
+ isa<llvm::StructType>(ConvertType(OutputType)))) {
+ llvm_unreachable("FIXME: Deal with RVV type matching.");
+ }
+
uint64_t InputSize = getContext().getTypeSize(InputTy);
if (getContext().getTypeSize(OutputType) < InputSize) {
// Form the asm to return the value as a larger integer or fp type.
@@ -2671,6 +2730,13 @@ void CodeGenFunction::EmitAsmStmt(const AsmStmt &S) {
QualType OutputType = S.getOutputExpr(Output)->getType();
QualType InputTy = InputExpr->getType();
+ if ((InputTy->isRVVType() &&
+ isa<llvm::StructType>(ConvertType(InputTy))) ||
+ (OutputType->isRVVType() &&
+ isa<llvm::StructType>(ConvertType(OutputType)))) {
+ llvm_unreachable("FIXME: Deal with RVV type matching.");
+ }
+
if (getContext().getTypeSize(OutputType) >
getContext().getTypeSize(InputTy)) {
// Use ptrtoint as appropriate so that we can do our extension.
diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-inline-asm.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-inline-asm.c
new file mode 100644
index 000000000000000..cad4f8ed5dcbd48
--- /dev/null
+++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-handcrafted/rvv-inline-asm.c
@@ -0,0 +1,41 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 3
+#include <riscv_vector.h>
+
+// RUN: %clang_cc1 -triple riscv64 -target-feature +zve32x -disable-O0-optnone \
+// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | FileCheck %s
+
+// CHECK-LABEL: define dso_local void @foo(
+// CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> asm "#NOP", "=^vr"() #[[ATTR2:[0-9]+]], !srcloc !4
+// CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i32> @llvm.vector.extract.nxv2i32.nxv4i32(<vscale x 4 x i32> [[TMP0]], i64 0)
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } poison, <vscale x 2 x i32> [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 2 x i32> @llvm.vector.extract.nxv2i32.nxv4i32(<vscale x 4 x i32> [[TMP0]], i64 2)
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP2]], <vscale x 2 x i32> [[TMP3]], 1
+// CHECK-NEXT: ret void
+//
+void foo() {
+ vint32m1x2_t v0;
+ asm ("#NOP" : "=vr" (v0));
+}
+
+// CHECK-LABEL: define dso_local void @bar(
+// CHECK-SAME: ) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } asm "#NOP", "=^vr,=^vr"() #[[ATTR2]], !srcloc !5
+// CHECK-NEXT: [[ASMRESULT:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 0
+// CHECK-NEXT: [[ASMRESULT1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[TMP0]], 1
+// CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i32> @llvm.vector.extract.nxv2i32.nxv4i32(<vscale x 4 x i32> [[ASMRESULT]], i64 0)
+// CHECK-NEXT: [[TMP2:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } poison, <vscale x 2 x i32> [[TMP1]], 0
+// CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 2 x i32> @llvm.vector.extract.nxv2i32.nxv4i32(<vscale x 4 x i32> [[ASMRESULT]], i64 2)
+// CHECK-NEXT: [[TMP4:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP2]], <vscale x 2 x i32> [[TMP3]], 1
+// CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i32> @llvm.vector.extract.nxv2i32.nxv4i32(<vscale x 4 x i32> [[ASMRESULT1]], i64 0)
+// CHECK-NEXT: [[TMP6:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } poison, <vscale x 2 x i32> [[TMP5]], 0
+// CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i32> @llvm.vector.extract.nxv2i32.nxv4i32(<vscale x 4 x i32> [[ASMRESULT1]], i64 2)
+// CHECK-NEXT: [[TMP8:%.*]] = insertvalue { <vscale x 2 x i32>, <vscale x 2 x i32> } [[TMP6]], <vscale x 2 x i32> [[TMP7]], 1
+// CHECK-NEXT: ret void
+//
+void bar() {
+ vint32m1x2_t v0, v2;
+ asm ("#NOP" : "=vr" (v0), "=vr" (v2));
+}
|
565785d
to
a26eb9f
Compare
a26eb9f
to
dc77e5c
Compare
Does this mean the backend register allocation will pick a large LMUL register the same size as the whole tuple and force the register to be overly aligned? For example an lmul=1 seg2 tuple can use v0+v1, or v1+v2, or v2+v3, etc. But lmul=2 can only use v0+v1, v2+v3, v4+v5, etc. |
Yes, you are correct. The current approach imposes restrictions, since the back end will allocate the registers at a multiple of the LMUL. Let me try to work through how I can break them up. |
…inline asm The RVV tuple type maps to an aggregate type with homogeneous scalable vectors. EmitAsmStmt does not handle this correctly and this commit attempts to fix it. Get past the validation check for homogeneous scalable vector types in InlineAsm::verify. Handle RVV tuple types correctly under CGStmt.cpp:EmitAsmStores, since we can allow a direct store for the tuple types. A follow-up commit will deal with the details when associated with InputOperands.
dc77e5c
to
6f232cc
Compare
Change: Updated the approach to get past inline asm verification and handle them correctly under |
This PR fixes a compilation issue for RVV tuple types as OutputOperands for inline asm.
The RVV tuple type maps to an aggregate type with homogeneous scalable
vectors. EmitAsmStmt does not handle this correctly and this commit
attempts to fix it.
Expressing the type as a structure in inline asm calls will complicate
the current code base, so instead, the return type is set to be
a single scalable vector, then reconstructed with
`vector.extract` and `insertvalue`.
A follow-up PR will handle InputOperands correctly.