You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
[RISCV] Lower fixed-length mgather/mscatter for zvfhmin/zvfbfmin (#114945)
In preparation for allowing zvfhmin and zvfbfmin in
isLegalElementTypeForRVV, this lowers fixed-length masked gathers and
scatters
We need to mark f16 and bf16 as legal in isLegalMaskedGatherScatter
otherwise ScalarizeMaskedMemIntrin will just scalarize them, but we can
move this back into isLegalElementTypeForRVV afterwards.
The scalarized codegen required #114938, #114927 and #114915 to not
crash.
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 4, <4 x i1> undef, <4 x float> undef)
15
16
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 4, <2 x i1> undef, <2 x float> undef)
16
17
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1F32 = call <1 x float> @llvm.masked.gather.v1f32.v1p0(<1 x ptr> undef, i32 4, <1 x i1> undef, <1 x float> undef)
18
+
; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32BF16 = call <32 x bfloat> @llvm.masked.gather.v32bf16.v32p0(<32 x ptr> undef, i32 2, <32 x i1> undef, <32 x bfloat> undef)
19
+
; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16BF16 = call <16 x bfloat> @llvm.masked.gather.v16bf16.v16p0(<16 x ptr> undef, i32 2, <16 x i1> undef, <16 x bfloat> undef)
20
+
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8BF16 = call <8 x bfloat> @llvm.masked.gather.v8bf16.v8p0(<8 x ptr> undef, i32 2, <8 x i1> undef, <8 x bfloat> undef)
21
+
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4BF16 = call <4 x bfloat> @llvm.masked.gather.v4bf16.v4p0(<4 x ptr> undef, i32 2, <4 x i1> undef, <4 x bfloat> undef)
22
+
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2BF16 = call <2 x bfloat> @llvm.masked.gather.v2bf16.v2p0(<2 x ptr> undef, i32 2, <2 x i1> undef, <2 x bfloat> undef)
23
+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1BF16 = call <1 x bfloat> @llvm.masked.gather.v1bf16.v1p0(<1 x ptr> undef, i32 2, <1 x i1> undef, <1 x bfloat> undef)
17
24
; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V32F16 = call <32 x half> @llvm.masked.gather.v32f16.v32p0(<32 x ptr> undef, i32 2, <32 x i1> undef, <32 x half> undef)
18
25
; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16F16 = call <16 x half> @llvm.masked.gather.v16f16.v16p0(<16 x ptr> undef, i32 2, <16 x i1> undef, <16 x half> undef)
19
26
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8F16 = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> undef, i32 2, <8 x i1> undef, <8 x half> undef)
@@ -51,6 +58,12 @@ define i32 @masked_gather() {
51
58
; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F32.u = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> undef, i32 2, <4 x i1> undef, <4 x float> undef)
52
59
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2F32.u = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i32 2, <2 x i1> undef, <2 x float> undef)
53
60
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32.u = call <1 x float> @llvm.masked.gather.v1f32.v1p0(<1 x ptr> undef, i32 2, <1 x i1> undef, <1 x float> undef)
61
+
; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V32BF16.u = call <32 x bfloat> @llvm.masked.gather.v32bf16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x bfloat> undef)
62
+
; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16BF16.u = call <16 x bfloat> @llvm.masked.gather.v16bf16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x bfloat> undef)
63
+
; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8BF16.u = call <8 x bfloat> @llvm.masked.gather.v8bf16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x bfloat> undef)
64
+
; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4BF16.u = call <4 x bfloat> @llvm.masked.gather.v4bf16.v4p0(<4 x ptr> undef, i32 1, <4 x i1> undef, <4 x bfloat> undef)
65
+
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2BF16.u = call <2 x bfloat> @llvm.masked.gather.v2bf16.v2p0(<2 x ptr> undef, i32 1, <2 x i1> undef, <2 x bfloat> undef)
66
+
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1BF16.u = call <1 x bfloat> @llvm.masked.gather.v1bf16.v1p0(<1 x ptr> undef, i32 1, <1 x i1> undef, <1 x bfloat> undef)
54
67
; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V32F16.u = call <32 x half> @llvm.masked.gather.v32f16.v32p0(<32 x ptr> undef, i32 1, <32 x i1> undef, <32 x half> undef)
55
68
; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16F16.u = call <16 x half> @llvm.masked.gather.v16f16.v16p0(<16 x ptr> undef, i32 1, <16 x i1> undef, <16 x half> undef)
56
69
; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8F16.u = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> undef, i32 1, <8 x i1> undef, <8 x half> undef)
@@ -85,6 +98,13 @@ define i32 @masked_gather() {
85
98
%V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> undef, i324, <2 x i1> undef, <2 x float> undef)
86
99
%V1F32 = call <1 x float> @llvm.masked.gather.v1f32.v1p0(<1 x ptr> undef, i324, <1 x i1> undef, <1 x float> undef)
87
100
101
+
%V32BF16 = call <32 x bfloat> @llvm.masked.gather.v32bf16.v32p0(<32 x ptr> undef, i322, <32 x i1> undef, <32 x bfloat> undef)
102
+
%V16BF16 = call <16 x bfloat> @llvm.masked.gather.v16bf16.v16p0(<16 x ptr> undef, i322, <16 x i1> undef, <16 x bfloat> undef)
103
+
%V8BF16 = call <8 x bfloat> @llvm.masked.gather.v8bf16.v8p0(<8 x ptr> undef, i322, <8 x i1> undef, <8 x bfloat> undef)
104
+
%V4BF16 = call <4 x bfloat> @llvm.masked.gather.v4bf16.v4p0(<4 x ptr> undef, i322, <4 x i1> undef, <4 x bfloat> undef)
105
+
%V2BF16 = call <2 x bfloat> @llvm.masked.gather.v2bf16.v2p0(<2 x ptr> undef, i322, <2 x i1> undef, <2 x bfloat> undef)
106
+
%V1BF16 = call <1 x bfloat> @llvm.masked.gather.v1bf16.v1p0(<1 x ptr> undef, i322, <1 x i1> undef, <1 x bfloat> undef)
107
+
88
108
%V32F16 = call <32 x half> @llvm.masked.gather.v32f16.v32p0(<32 x ptr> undef, i322, <32 x i1> undef, <32 x half> undef)
89
109
%V16F16 = call <16 x half> @llvm.masked.gather.v16f16.v16p0(<16 x ptr> undef, i322, <16 x i1> undef, <16 x half> undef)
90
110
%V8F16 = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> undef, i322, <8 x i1> undef, <8 x half> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 4, <4 x i1> undef)
15
16
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 4, <2 x i1> undef)
16
17
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1f32.v1p0(<1 x float> undef, <1 x ptr> undef, i32 4, <1 x i1> undef)
18
+
; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v32bf16.v32p0(<32 x bfloat> undef, <32 x ptr> undef, i32 2, <32 x i1> undef)
19
+
; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v16bf16.v16p0(<16 x bfloat> undef, <16 x ptr> undef, i32 2, <16 x i1> undef)
20
+
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8bf16.v8p0(<8 x bfloat> undef, <8 x ptr> undef, i32 2, <8 x i1> undef)
21
+
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4bf16.v4p0(<4 x bfloat> undef, <4 x ptr> undef, i32 2, <4 x i1> undef)
22
+
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.scatter.v2bf16.v2p0(<2 x bfloat> undef, <2 x ptr> undef, i32 2, <2 x i1> undef)
23
+
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: call void @llvm.masked.scatter.v1bf16.v1p0(<1 x bfloat> undef, <1 x ptr> undef, i32 2, <1 x i1> undef)
17
24
; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v32f16.v32p0(<32 x half> undef, <32 x ptr> undef, i32 2, <32 x i1> undef)
18
25
; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v16f16.v16p0(<16 x half> undef, <16 x ptr> undef, i32 2, <16 x i1> undef)
19
26
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> undef, <8 x ptr> undef, i32 2, <8 x i1> undef)
@@ -51,6 +58,12 @@ define i32 @masked_scatter() {
51
58
; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> undef, <4 x ptr> undef, i32 2, <4 x i1> undef)
52
59
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i32 2, <2 x i1> undef)
53
60
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1f32.v1p0(<1 x float> undef, <1 x ptr> undef, i32 2, <1 x i1> undef)
61
+
; CHECK-NEXT: Cost Model: Found an estimated cost of 140 for instruction: call void @llvm.masked.scatter.v32bf16.v32p0(<32 x bfloat> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
62
+
; CHECK-NEXT: Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.scatter.v16bf16.v16p0(<16 x bfloat> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
63
+
; CHECK-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8bf16.v8p0(<8 x bfloat> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
64
+
; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: call void @llvm.masked.scatter.v4bf16.v4p0(<4 x bfloat> undef, <4 x ptr> undef, i32 1, <4 x i1> undef)
65
+
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v2bf16.v2p0(<2 x bfloat> undef, <2 x ptr> undef, i32 1, <2 x i1> undef)
66
+
; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: call void @llvm.masked.scatter.v1bf16.v1p0(<1 x bfloat> undef, <1 x ptr> undef, i32 1, <1 x i1> undef)
54
67
; CHECK-NEXT: Cost Model: Found an estimated cost of 140 for instruction: call void @llvm.masked.scatter.v32f16.v32p0(<32 x half> undef, <32 x ptr> undef, i32 1, <32 x i1> undef)
55
68
; CHECK-NEXT: Cost Model: Found an estimated cost of 70 for instruction: call void @llvm.masked.scatter.v16f16.v16p0(<16 x half> undef, <16 x ptr> undef, i32 1, <16 x i1> undef)
56
69
; CHECK-NEXT: Cost Model: Found an estimated cost of 35 for instruction: call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> undef, <8 x ptr> undef, i32 1, <8 x i1> undef)
@@ -85,6 +98,13 @@ define i32 @masked_scatter() {
85
98
callvoid@llvm.masked.scatter.v2f32.v2p0(<2 x float> undef, <2 x ptr> undef, i324, <2 x i1> undef)
86
99
callvoid@llvm.masked.scatter.v1f32.v1p0(<1 x float> undef, <1 x ptr> undef, i324, <1 x i1> undef)
87
100
101
+
callvoid@llvm.masked.scatter.v32bf16.v32p0(<32 x bfloat> undef, <32 x ptr> undef, i322, <32 x i1> undef)
102
+
callvoid@llvm.masked.scatter.v16bf16.v16p0(<16 x bfloat> undef, <16 x ptr> undef, i322, <16 x i1> undef)
103
+
callvoid@llvm.masked.scatter.v8bf16.v8p0(<8 x bfloat> undef, <8 x ptr> undef, i322, <8 x i1> undef)
104
+
callvoid@llvm.masked.scatter.v4bf16.v4p0(<4 x bfloat> undef, <4 x ptr> undef, i322, <4 x i1> undef)
105
+
callvoid@llvm.masked.scatter.v2bf16.v2p0(<2 x bfloat> undef, <2 x ptr> undef, i322, <2 x i1> undef)
106
+
callvoid@llvm.masked.scatter.v1bf16.v1p0(<1 x bfloat> undef, <1 x ptr> undef, i322, <1 x i1> undef)
107
+
88
108
callvoid@llvm.masked.scatter.v32f16.v32p0(<32 x half> undef, <32 x ptr> undef, i322, <32 x i1> undef)
89
109
callvoid@llvm.masked.scatter.v16f16.v16p0(<16 x half> undef, <16 x ptr> undef, i322, <16 x i1> undef)
90
110
callvoid@llvm.masked.scatter.v8f16.v8p0(<8 x half> undef, <8 x ptr> undef, i322, <8 x i1> undef)
0 commit comments