From b887c4380b8d54b3ce436c81aed229064d309682 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Thu, 7 Nov 2024 12:19:12 +0800 Subject: [PATCH 1/2] Precommit tests --- .../RISCV/strided-loads-vectorized.ll | 356 +++++++++++++++++- 1 file changed, 355 insertions(+), 1 deletion(-) diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll index f1619c9dd034d..21bb0b5a91419 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll @@ -1,5 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=slp-vectorizer -S -mtriple=riscv64-unknown-linux-gnu -mattr=+v < %s | FileCheck %s +; RUN: opt -passes=slp-vectorizer -S -mtriple=riscv64-unknown-linux-gnu -mattr=+v < %s | FileCheck %s --check-prefixes=CHECK,NO-ZVFHMIN-ZVFBFMIN +; RUN: opt -passes=slp-vectorizer -S -mtriple=riscv64-unknown-linux-gnu -mattr=+v,+zvfhmin,+zvfbfmin < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN-ZVFBFMIN + define void @test(ptr %p, ptr noalias %s) { ; CHECK-LABEL: @test( @@ -308,3 +310,355 @@ entry: ret void } + +define void @test_bf16(ptr %p, ptr noalias %s) { +; NO-ZVFHMIN-ZVFBFMIN-LABEL: @test_bf16( +; NO-ZVFHMIN-ZVFBFMIN-NEXT: entry: +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P:%.*]], i64 0, i64 0 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[I:%.*]] = load bfloat, ptr [[ARRAYIDX]], align 4 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 30 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[I1:%.*]] = load bfloat, ptr [[ARRAYIDX1]], align 4 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ADD:%.*]] = fsub fast bfloat [[I1]], [[I]] +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds bfloat, ptr [[S:%.*]], i64 0 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: store bfloat [[ADD]], ptr [[ARRAYIDX2]], align 4 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 4 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[I2:%.*]] = load bfloat, ptr [[ARRAYIDX4]], align 4 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 26 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[I3:%.*]] = load bfloat, ptr [[ARRAYIDX6]], align 4 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ADD7:%.*]] = fsub fast bfloat [[I3]], [[I2]] +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds bfloat, ptr [[S]], i64 1 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: store bfloat [[ADD7]], ptr [[ARRAYIDX9]], align 4 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 8 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[I4:%.*]] = load bfloat, ptr [[ARRAYIDX11]], align 4 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 22 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[I5:%.*]] = load bfloat, ptr [[ARRAYIDX13]], align 4 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ADD14:%.*]] = fsub fast bfloat [[I5]], [[I4]] +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds bfloat, ptr [[S]], i64 2 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: store bfloat [[ADD14]], ptr [[ARRAYIDX16]], align 4 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 12 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[I6:%.*]] = load bfloat, ptr [[ARRAYIDX18]], align 4 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 18 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[I7:%.*]] = load bfloat, ptr [[ARRAYIDX20]], align 4 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ADD21:%.*]] = fsub fast bfloat [[I7]], [[I6]] +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds bfloat, ptr [[S]], i64 3 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: store bfloat [[ADD21]], ptr [[ARRAYIDX23]], align 4 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 16 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[I8:%.*]] = load bfloat, ptr [[ARRAYIDX25]], align 4 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 14 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[I9:%.*]] = load bfloat, ptr [[ARRAYIDX27]], align 4 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ADD28:%.*]] = fsub fast bfloat [[I9]], [[I8]] +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds bfloat, ptr [[S]], i64 4 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: store bfloat [[ADD28]], ptr [[ARRAYIDX30]], align 4 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 20 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[I10:%.*]] = load bfloat, ptr [[ARRAYIDX32]], align 4 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 10 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[I11:%.*]] = load bfloat, ptr [[ARRAYIDX34]], align 4 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ADD35:%.*]] = fsub fast bfloat [[I11]], [[I10]] +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds bfloat, ptr [[S]], i64 5 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: store bfloat [[ADD35]], ptr [[ARRAYIDX37]], align 4 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 24 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[I12:%.*]] = load bfloat, ptr [[ARRAYIDX39]], align 4 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 6 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[I13:%.*]] = load bfloat, ptr [[ARRAYIDX41]], align 4 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ADD42:%.*]] = fsub fast bfloat [[I13]], [[I12]] +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds bfloat, ptr [[S]], i64 6 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: store bfloat [[ADD42]], ptr [[ARRAYIDX44]], align 4 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 28 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[I14:%.*]] = load bfloat, ptr [[ARRAYIDX46]], align 4 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 2 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[I15:%.*]] = load bfloat, ptr [[ARRAYIDX48]], align 4 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ADD49:%.*]] = fsub fast bfloat [[I15]], [[I14]] +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX51:%.*]] = getelementptr inbounds bfloat, ptr [[S]], i64 7 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: store bfloat [[ADD49]], ptr [[ARRAYIDX51]], align 4 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: ret void +; +; ZVFHMIN-ZVFBFMIN-LABEL: @test_bf16( +; ZVFHMIN-ZVFBFMIN-NEXT: entry: +; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P:%.*]], i64 0, i64 0 +; ZVFHMIN-ZVFBFMIN-NEXT: [[I:%.*]] = load bfloat, ptr [[ARRAYIDX]], align 4 +; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 30 +; ZVFHMIN-ZVFBFMIN-NEXT: [[I1:%.*]] = load bfloat, ptr [[ARRAYIDX1]], align 4 +; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds bfloat, ptr [[S:%.*]], i64 0 +; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 4 +; ZVFHMIN-ZVFBFMIN-NEXT: [[I2:%.*]] = load bfloat, ptr [[ARRAYIDX4]], align 4 +; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 26 +; ZVFHMIN-ZVFBFMIN-NEXT: [[I3:%.*]] = load bfloat, ptr [[ARRAYIDX6]], align 4 +; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 8 +; ZVFHMIN-ZVFBFMIN-NEXT: [[I4:%.*]] = load bfloat, ptr [[ARRAYIDX11]], align 4 +; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 22 +; ZVFHMIN-ZVFBFMIN-NEXT: [[I5:%.*]] = load bfloat, ptr [[ARRAYIDX13]], align 4 +; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 12 +; ZVFHMIN-ZVFBFMIN-NEXT: [[I6:%.*]] = load bfloat, ptr [[ARRAYIDX18]], align 4 +; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 18 +; ZVFHMIN-ZVFBFMIN-NEXT: [[I7:%.*]] = load bfloat, ptr [[ARRAYIDX20]], align 4 +; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 16 +; ZVFHMIN-ZVFBFMIN-NEXT: [[I8:%.*]] = load bfloat, ptr [[ARRAYIDX25]], align 4 +; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 14 +; ZVFHMIN-ZVFBFMIN-NEXT: [[I9:%.*]] = load bfloat, ptr [[ARRAYIDX27]], align 4 +; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 20 +; ZVFHMIN-ZVFBFMIN-NEXT: [[I10:%.*]] = load bfloat, ptr [[ARRAYIDX32]], align 4 +; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 10 +; ZVFHMIN-ZVFBFMIN-NEXT: [[I11:%.*]] = load bfloat, ptr [[ARRAYIDX34]], align 4 +; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 24 +; ZVFHMIN-ZVFBFMIN-NEXT: [[I12:%.*]] = load bfloat, ptr [[ARRAYIDX39]], align 4 +; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 6 +; ZVFHMIN-ZVFBFMIN-NEXT: [[I13:%.*]] = load bfloat, ptr [[ARRAYIDX41]], align 4 +; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 28 +; ZVFHMIN-ZVFBFMIN-NEXT: [[I14:%.*]] = load bfloat, ptr [[ARRAYIDX46]], align 4 +; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 2 +; ZVFHMIN-ZVFBFMIN-NEXT: [[I15:%.*]] = load bfloat, ptr [[ARRAYIDX48]], align 4 +; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP0:%.*]] = insertelement <8 x bfloat> poison, bfloat [[I1]], i32 0 +; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP1:%.*]] = insertelement <8 x bfloat> [[TMP0]], bfloat [[I3]], i32 1 +; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP2:%.*]] = insertelement <8 x bfloat> [[TMP1]], bfloat [[I5]], i32 2 +; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP3:%.*]] = insertelement <8 x bfloat> [[TMP2]], bfloat [[I7]], i32 3 +; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP4:%.*]] = insertelement <8 x bfloat> [[TMP3]], bfloat [[I9]], i32 4 +; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP5:%.*]] = insertelement <8 x bfloat> [[TMP4]], bfloat [[I11]], i32 5 +; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP6:%.*]] = insertelement <8 x bfloat> [[TMP5]], bfloat [[I13]], i32 6 +; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP7:%.*]] = insertelement <8 x bfloat> [[TMP6]], bfloat [[I15]], i32 7 +; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP8:%.*]] = insertelement <8 x bfloat> poison, bfloat [[I]], i32 0 +; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP9:%.*]] = insertelement <8 x bfloat> [[TMP8]], bfloat [[I2]], i32 1 +; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP10:%.*]] = insertelement <8 x bfloat> [[TMP9]], bfloat [[I4]], i32 2 +; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP11:%.*]] = insertelement <8 x bfloat> [[TMP10]], bfloat [[I6]], i32 3 +; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP12:%.*]] = insertelement <8 x bfloat> [[TMP11]], bfloat [[I8]], i32 4 +; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP13:%.*]] = insertelement <8 x bfloat> [[TMP12]], bfloat [[I10]], i32 5 +; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP14:%.*]] = insertelement <8 x bfloat> [[TMP13]], bfloat [[I12]], i32 6 +; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP15:%.*]] = insertelement <8 x bfloat> [[TMP14]], bfloat [[I14]], i32 7 +; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP16:%.*]] = fsub fast <8 x bfloat> [[TMP7]], [[TMP15]] +; ZVFHMIN-ZVFBFMIN-NEXT: store <8 x bfloat> [[TMP16]], ptr [[ARRAYIDX2]], align 4 +; ZVFHMIN-ZVFBFMIN-NEXT: ret void +; +entry: + %arrayidx = getelementptr inbounds [48 x bfloat], ptr %p, i64 0, i64 0 + %i = load bfloat, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds [48 x bfloat], ptr %p, i64 0, i64 30 + %i1 = load bfloat, ptr %arrayidx1, align 4 + %add = fsub fast bfloat %i1, %i + %arrayidx2 = getelementptr inbounds bfloat, ptr %s, i64 0 + store bfloat %add, ptr %arrayidx2, align 4 + %arrayidx4 = getelementptr inbounds [48 x bfloat], ptr %p, i64 0, i64 4 + %i2 = load bfloat, ptr %arrayidx4, align 4 + %arrayidx6 = getelementptr inbounds [48 x bfloat], ptr %p, i64 0, i64 26 + %i3 = load bfloat, ptr %arrayidx6, align 4 + %add7 = fsub fast bfloat %i3, %i2 + %arrayidx9 = getelementptr inbounds bfloat, ptr %s, i64 1 + store bfloat %add7, ptr %arrayidx9, align 4 + %arrayidx11 = getelementptr inbounds [48 x bfloat], ptr %p, i64 0, i64 8 + %i4 = load bfloat, ptr %arrayidx11, align 4 + %arrayidx13 = getelementptr inbounds [48 x bfloat], ptr %p, i64 0, i64 22 + %i5 = load bfloat, ptr %arrayidx13, align 4 + %add14 = fsub fast bfloat %i5, %i4 + %arrayidx16 = getelementptr inbounds bfloat, ptr %s, i64 2 + store bfloat %add14, ptr %arrayidx16, align 4 + %arrayidx18 = getelementptr inbounds [48 x bfloat], ptr %p, i64 0, i64 12 + %i6 = load bfloat, ptr %arrayidx18, align 4 + %arrayidx20 = getelementptr inbounds [48 x bfloat], ptr %p, i64 0, i64 18 + %i7 = load bfloat, ptr %arrayidx20, align 4 + %add21 = fsub fast bfloat %i7, %i6 + %arrayidx23 = getelementptr inbounds bfloat, ptr %s, i64 3 + store bfloat %add21, ptr %arrayidx23, align 4 + %arrayidx25 = getelementptr inbounds [48 x bfloat], ptr %p, i64 0, i64 16 + %i8 = load bfloat, ptr %arrayidx25, align 4 + %arrayidx27 = getelementptr inbounds [48 x bfloat], ptr %p, i64 0, i64 14 + %i9 = load bfloat, ptr %arrayidx27, align 4 + %add28 = fsub fast bfloat %i9, %i8 + %arrayidx30 = getelementptr inbounds bfloat, ptr %s, i64 4 + store bfloat %add28, ptr %arrayidx30, align 4 + %arrayidx32 = getelementptr inbounds [48 x bfloat], ptr %p, i64 0, i64 20 + %i10 = load bfloat, ptr %arrayidx32, align 4 + %arrayidx34 = getelementptr inbounds [48 x bfloat], ptr %p, i64 0, i64 10 + %i11 = load bfloat, ptr %arrayidx34, align 4 + %add35 = fsub fast bfloat %i11, %i10 + %arrayidx37 = getelementptr inbounds bfloat, ptr %s, i64 5 + store bfloat %add35, ptr %arrayidx37, align 4 + %arrayidx39 = getelementptr inbounds [48 x bfloat], ptr %p, i64 0, i64 24 + %i12 = load bfloat, ptr %arrayidx39, align 4 + %arrayidx41 = getelementptr inbounds [48 x bfloat], ptr %p, i64 0, i64 6 + %i13 = load bfloat, ptr %arrayidx41, align 4 + %add42 = fsub fast bfloat %i13, %i12 + %arrayidx44 = getelementptr inbounds bfloat, ptr %s, i64 6 + store bfloat %add42, ptr %arrayidx44, align 4 + %arrayidx46 = getelementptr inbounds [48 x bfloat], ptr %p, i64 0, i64 28 + %i14 = load bfloat, ptr %arrayidx46, align 4 + %arrayidx48 = getelementptr inbounds [48 x bfloat], ptr %p, i64 0, i64 2 + %i15 = load bfloat, ptr %arrayidx48, align 4 + %add49 = fsub fast bfloat %i15, %i14 + %arrayidx51 = getelementptr inbounds bfloat, ptr %s, i64 7 + store bfloat %add49, ptr %arrayidx51, align 4 + ret void +} + +define void @test_f16(ptr %p, ptr noalias %s) { +; NO-ZVFHMIN-ZVFBFMIN-LABEL: @test_f16( +; NO-ZVFHMIN-ZVFBFMIN-NEXT: entry: +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [48 x half], ptr [[P:%.*]], i64 0, i64 0 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[I:%.*]] = load half, ptr [[ARRAYIDX]], align 4 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 30 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[I1:%.*]] = load half, ptr [[ARRAYIDX1]], align 4 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ADD:%.*]] = fsub fast half [[I1]], [[I]] +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds half, ptr [[S:%.*]], i64 0 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: store half [[ADD]], ptr [[ARRAYIDX2]], align 4 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 4 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[I2:%.*]] = load half, ptr [[ARRAYIDX4]], align 4 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 26 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[I3:%.*]] = load half, ptr [[ARRAYIDX6]], align 4 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ADD7:%.*]] = fsub fast half [[I3]], [[I2]] +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds half, ptr [[S]], i64 1 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: store half [[ADD7]], ptr [[ARRAYIDX9]], align 4 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 8 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[I4:%.*]] = load half, ptr [[ARRAYIDX11]], align 4 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 22 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[I5:%.*]] = load half, ptr [[ARRAYIDX13]], align 4 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ADD14:%.*]] = fsub fast half [[I5]], [[I4]] +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds half, ptr [[S]], i64 2 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: store half [[ADD14]], ptr [[ARRAYIDX16]], align 4 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 12 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[I6:%.*]] = load half, ptr [[ARRAYIDX18]], align 4 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 18 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[I7:%.*]] = load half, ptr [[ARRAYIDX20]], align 4 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ADD21:%.*]] = fsub fast half [[I7]], [[I6]] +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX23:%.*]] = getelementptr inbounds half, ptr [[S]], i64 3 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: store half [[ADD21]], ptr [[ARRAYIDX23]], align 4 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 16 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[I8:%.*]] = load half, ptr [[ARRAYIDX25]], align 4 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 14 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[I9:%.*]] = load half, ptr [[ARRAYIDX27]], align 4 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ADD28:%.*]] = fsub fast half [[I9]], [[I8]] +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds half, ptr [[S]], i64 4 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: store half [[ADD28]], ptr [[ARRAYIDX30]], align 4 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 20 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[I10:%.*]] = load half, ptr [[ARRAYIDX32]], align 4 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 10 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[I11:%.*]] = load half, ptr [[ARRAYIDX34]], align 4 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ADD35:%.*]] = fsub fast half [[I11]], [[I10]] +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX37:%.*]] = getelementptr inbounds half, ptr [[S]], i64 5 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: store half [[ADD35]], ptr [[ARRAYIDX37]], align 4 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 24 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[I12:%.*]] = load half, ptr [[ARRAYIDX39]], align 4 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 6 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[I13:%.*]] = load half, ptr [[ARRAYIDX41]], align 4 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ADD42:%.*]] = fsub fast half [[I13]], [[I12]] +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds half, ptr [[S]], i64 6 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: store half [[ADD42]], ptr [[ARRAYIDX44]], align 4 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 28 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[I14:%.*]] = load half, ptr [[ARRAYIDX46]], align 4 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 2 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[I15:%.*]] = load half, ptr [[ARRAYIDX48]], align 4 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ADD49:%.*]] = fsub fast half [[I15]], [[I14]] +; NO-ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX51:%.*]] = getelementptr inbounds half, ptr [[S]], i64 7 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: store half [[ADD49]], ptr [[ARRAYIDX51]], align 4 +; NO-ZVFHMIN-ZVFBFMIN-NEXT: ret void +; +; ZVFHMIN-ZVFBFMIN-LABEL: @test_f16( +; ZVFHMIN-ZVFBFMIN-NEXT: entry: +; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [48 x half], ptr [[P:%.*]], i64 0, i64 0 +; ZVFHMIN-ZVFBFMIN-NEXT: [[I:%.*]] = load half, ptr [[ARRAYIDX]], align 4 +; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 30 +; ZVFHMIN-ZVFBFMIN-NEXT: [[I1:%.*]] = load half, ptr [[ARRAYIDX1]], align 4 +; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds half, ptr [[S:%.*]], i64 0 +; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 4 +; ZVFHMIN-ZVFBFMIN-NEXT: [[I2:%.*]] = load half, ptr [[ARRAYIDX4]], align 4 +; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 26 +; ZVFHMIN-ZVFBFMIN-NEXT: [[I3:%.*]] = load half, ptr [[ARRAYIDX6]], align 4 +; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 8 +; ZVFHMIN-ZVFBFMIN-NEXT: [[I4:%.*]] = load half, ptr [[ARRAYIDX11]], align 4 +; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 22 +; ZVFHMIN-ZVFBFMIN-NEXT: [[I5:%.*]] = load half, ptr [[ARRAYIDX13]], align 4 +; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 12 +; ZVFHMIN-ZVFBFMIN-NEXT: [[I6:%.*]] = load half, ptr [[ARRAYIDX18]], align 4 +; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 18 +; ZVFHMIN-ZVFBFMIN-NEXT: [[I7:%.*]] = load half, ptr [[ARRAYIDX20]], align 4 +; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 16 +; ZVFHMIN-ZVFBFMIN-NEXT: [[I8:%.*]] = load half, ptr [[ARRAYIDX25]], align 4 +; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 14 +; ZVFHMIN-ZVFBFMIN-NEXT: [[I9:%.*]] = load half, ptr [[ARRAYIDX27]], align 4 +; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 20 +; ZVFHMIN-ZVFBFMIN-NEXT: [[I10:%.*]] = load half, ptr [[ARRAYIDX32]], align 4 +; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 10 +; ZVFHMIN-ZVFBFMIN-NEXT: [[I11:%.*]] = load half, ptr [[ARRAYIDX34]], align 4 +; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 24 +; ZVFHMIN-ZVFBFMIN-NEXT: [[I12:%.*]] = load half, ptr [[ARRAYIDX39]], align 4 +; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 6 +; ZVFHMIN-ZVFBFMIN-NEXT: [[I13:%.*]] = load half, ptr [[ARRAYIDX41]], align 4 +; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 28 +; ZVFHMIN-ZVFBFMIN-NEXT: [[I14:%.*]] = load half, ptr [[ARRAYIDX46]], align 4 +; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 2 +; ZVFHMIN-ZVFBFMIN-NEXT: [[I15:%.*]] = load half, ptr [[ARRAYIDX48]], align 4 +; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP0:%.*]] = insertelement <8 x half> poison, half [[I1]], i32 0 +; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP1:%.*]] = insertelement <8 x half> [[TMP0]], half [[I3]], i32 1 +; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[I5]], i32 2 +; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half [[I7]], i32 3 +; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half [[I9]], i32 4 +; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP5:%.*]] = insertelement <8 x half> [[TMP4]], half [[I11]], i32 5 +; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP6:%.*]] = insertelement <8 x half> [[TMP5]], half [[I13]], i32 6 +; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP7:%.*]] = insertelement <8 x half> [[TMP6]], half [[I15]], i32 7 +; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP8:%.*]] = insertelement <8 x half> poison, half [[I]], i32 0 +; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP9:%.*]] = insertelement <8 x half> [[TMP8]], half [[I2]], i32 1 +; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP10:%.*]] = insertelement <8 x half> [[TMP9]], half [[I4]], i32 2 +; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP11:%.*]] = insertelement <8 x half> [[TMP10]], half [[I6]], i32 3 +; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP12:%.*]] = insertelement <8 x half> [[TMP11]], half [[I8]], i32 4 +; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP13:%.*]] = insertelement <8 x half> [[TMP12]], half [[I10]], i32 5 +; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP14:%.*]] = insertelement <8 x half> [[TMP13]], half [[I12]], i32 6 +; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP15:%.*]] = insertelement <8 x half> [[TMP14]], half [[I14]], i32 7 +; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP16:%.*]] = fsub fast <8 x half> [[TMP7]], [[TMP15]] +; ZVFHMIN-ZVFBFMIN-NEXT: store <8 x half> [[TMP16]], ptr [[ARRAYIDX2]], align 4 +; ZVFHMIN-ZVFBFMIN-NEXT: ret void +; +entry: + %arrayidx = getelementptr inbounds [48 x half], ptr %p, i64 0, i64 0 + %i = load half, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds [48 x half], ptr %p, i64 0, i64 30 + %i1 = load half, ptr %arrayidx1, align 4 + %add = fsub fast half %i1, %i + %arrayidx2 = getelementptr inbounds half, ptr %s, i64 0 + store half %add, ptr %arrayidx2, align 4 + %arrayidx4 = getelementptr inbounds [48 x half], ptr %p, i64 0, i64 4 + %i2 = load half, ptr %arrayidx4, align 4 + %arrayidx6 = getelementptr inbounds [48 x half], ptr %p, i64 0, i64 26 + %i3 = load half, ptr %arrayidx6, align 4 + %add7 = fsub fast half %i3, %i2 + %arrayidx9 = getelementptr inbounds half, ptr %s, i64 1 + store half %add7, ptr %arrayidx9, align 4 + %arrayidx11 = getelementptr inbounds [48 x half], ptr %p, i64 0, i64 8 + %i4 = load half, ptr %arrayidx11, align 4 + %arrayidx13 = getelementptr inbounds [48 x half], ptr %p, i64 0, i64 22 + %i5 = load half, ptr %arrayidx13, align 4 + %add14 = fsub fast half %i5, %i4 + %arrayidx16 = getelementptr inbounds half, ptr %s, i64 2 + store half %add14, ptr %arrayidx16, align 4 + %arrayidx18 = getelementptr inbounds [48 x half], ptr %p, i64 0, i64 12 + %i6 = load half, ptr %arrayidx18, align 4 + %arrayidx20 = getelementptr inbounds [48 x half], ptr %p, i64 0, i64 18 + %i7 = load half, ptr %arrayidx20, align 4 + %add21 = fsub fast half %i7, %i6 + %arrayidx23 = getelementptr inbounds half, ptr %s, i64 3 + store half %add21, ptr %arrayidx23, align 4 + %arrayidx25 = getelementptr inbounds [48 x half], ptr %p, i64 0, i64 16 + %i8 = load half, ptr %arrayidx25, align 4 + %arrayidx27 = getelementptr inbounds [48 x half], ptr %p, i64 0, i64 14 + %i9 = load half, ptr %arrayidx27, align 4 + %add28 = fsub fast half %i9, %i8 + %arrayidx30 = getelementptr inbounds half, ptr %s, i64 4 + store half %add28, ptr %arrayidx30, align 4 + %arrayidx32 = getelementptr inbounds [48 x half], ptr %p, i64 0, i64 20 + %i10 = load half, ptr %arrayidx32, align 4 + %arrayidx34 = getelementptr inbounds [48 x half], ptr %p, i64 0, i64 10 + %i11 = load half, ptr %arrayidx34, align 4 + %add35 = fsub fast half %i11, %i10 + %arrayidx37 = getelementptr inbounds half, ptr %s, i64 5 + store half %add35, ptr %arrayidx37, align 4 + %arrayidx39 = getelementptr inbounds [48 x half], ptr %p, i64 0, i64 24 + %i12 = load half, ptr %arrayidx39, align 4 + %arrayidx41 = getelementptr inbounds [48 x half], ptr %p, i64 0, i64 6 + %i13 = load half, ptr %arrayidx41, align 4 + %add42 = fsub fast half %i13, %i12 + %arrayidx44 = getelementptr inbounds half, ptr %s, i64 6 + store half %add42, ptr %arrayidx44, align 4 + %arrayidx46 = getelementptr inbounds [48 x half], ptr %p, i64 0, i64 28 + %i14 = load half, ptr %arrayidx46, align 4 + %arrayidx48 = getelementptr inbounds [48 x half], ptr %p, i64 0, i64 2 + %i15 = load half, ptr %arrayidx48, align 4 + %add49 = fsub fast half %i15, %i14 + %arrayidx51 = getelementptr inbounds half, ptr %s, i64 7 + store half %add49, ptr %arrayidx51, align 4 + ret void +} From cd1503e4580579f476c5675e69d30ff1cfab4bf0 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Thu, 7 Nov 2024 12:22:55 +0800 Subject: [PATCH 2/2] [RISCV] Allow f16/bf16 with zvfhmin/zvfbfmin as legal strided access This is also split off from the zvfhmin/zvfbfmin isLegalElementTypeForRVV work. Enabling this will cause SLP and RISCVGatherScatterLowering to emit @llvm.experimental.vp.strided.{load,store} intrinsics, and support for this was added in #109387 and #114750. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 5 +- .../RISCV/strided-loads-vectorized.ll | 96 +------------------ 2 files changed, 8 insertions(+), 93 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index a625e9d5efeb5..b0b504887006f 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -21552,7 +21552,10 @@ bool RISCVTargetLowering::isLegalStridedLoadStore(EVT DataType, return false; EVT ScalarType = DataType.getScalarType(); - if (!isLegalElementTypeForRVV(ScalarType)) + // TODO: Move bf16/f16 support into isLegalElementTypeForRVV + if (!(isLegalElementTypeForRVV(ScalarType) || + (ScalarType == MVT::bf16 && Subtarget.hasVInstructionsBF16Minimal()) || + (ScalarType == MVT::f16 && Subtarget.hasVInstructionsF16Minimal()))) return false; if (!Subtarget.enableUnalignedVectorMem() && diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll index 21bb0b5a91419..4556df3b31687 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/strided-loads-vectorized.ll @@ -375,54 +375,10 @@ define void @test_bf16(ptr %p, ptr noalias %s) { ; ZVFHMIN-ZVFBFMIN-LABEL: @test_bf16( ; ZVFHMIN-ZVFBFMIN-NEXT: entry: ; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P:%.*]], i64 0, i64 0 -; ZVFHMIN-ZVFBFMIN-NEXT: [[I:%.*]] = load bfloat, ptr [[ARRAYIDX]], align 4 ; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 30 -; ZVFHMIN-ZVFBFMIN-NEXT: [[I1:%.*]] = load bfloat, ptr [[ARRAYIDX1]], align 4 ; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds bfloat, ptr [[S:%.*]], i64 0 -; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 4 -; ZVFHMIN-ZVFBFMIN-NEXT: [[I2:%.*]] = load bfloat, ptr [[ARRAYIDX4]], align 4 -; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 26 -; ZVFHMIN-ZVFBFMIN-NEXT: [[I3:%.*]] = load bfloat, ptr [[ARRAYIDX6]], align 4 -; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 8 -; ZVFHMIN-ZVFBFMIN-NEXT: [[I4:%.*]] = load bfloat, ptr [[ARRAYIDX11]], align 4 -; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 22 -; ZVFHMIN-ZVFBFMIN-NEXT: [[I5:%.*]] = load bfloat, ptr [[ARRAYIDX13]], align 4 -; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 12 -; ZVFHMIN-ZVFBFMIN-NEXT: [[I6:%.*]] = load bfloat, ptr [[ARRAYIDX18]], align 4 -; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 18 -; ZVFHMIN-ZVFBFMIN-NEXT: [[I7:%.*]] = load bfloat, ptr [[ARRAYIDX20]], align 4 -; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 16 -; ZVFHMIN-ZVFBFMIN-NEXT: [[I8:%.*]] = load bfloat, ptr [[ARRAYIDX25]], align 4 -; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 14 -; ZVFHMIN-ZVFBFMIN-NEXT: [[I9:%.*]] = load bfloat, ptr [[ARRAYIDX27]], align 4 -; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 20 -; ZVFHMIN-ZVFBFMIN-NEXT: [[I10:%.*]] = load bfloat, ptr [[ARRAYIDX32]], align 4 -; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 10 -; ZVFHMIN-ZVFBFMIN-NEXT: [[I11:%.*]] = load bfloat, ptr [[ARRAYIDX34]], align 4 -; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 24 -; ZVFHMIN-ZVFBFMIN-NEXT: [[I12:%.*]] = load bfloat, ptr [[ARRAYIDX39]], align 4 -; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 6 -; ZVFHMIN-ZVFBFMIN-NEXT: [[I13:%.*]] = load bfloat, ptr [[ARRAYIDX41]], align 4 -; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 28 -; ZVFHMIN-ZVFBFMIN-NEXT: [[I14:%.*]] = load bfloat, ptr [[ARRAYIDX46]], align 4 -; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [48 x bfloat], ptr [[P]], i64 0, i64 2 -; ZVFHMIN-ZVFBFMIN-NEXT: [[I15:%.*]] = load bfloat, ptr [[ARRAYIDX48]], align 4 -; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP0:%.*]] = insertelement <8 x bfloat> poison, bfloat [[I1]], i32 0 -; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP1:%.*]] = insertelement <8 x bfloat> [[TMP0]], bfloat [[I3]], i32 1 -; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP2:%.*]] = insertelement <8 x bfloat> [[TMP1]], bfloat [[I5]], i32 2 -; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP3:%.*]] = insertelement <8 x bfloat> [[TMP2]], bfloat [[I7]], i32 3 -; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP4:%.*]] = insertelement <8 x bfloat> [[TMP3]], bfloat [[I9]], i32 4 -; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP5:%.*]] = insertelement <8 x bfloat> [[TMP4]], bfloat [[I11]], i32 5 -; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP6:%.*]] = insertelement <8 x bfloat> [[TMP5]], bfloat [[I13]], i32 6 -; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP7:%.*]] = insertelement <8 x bfloat> [[TMP6]], bfloat [[I15]], i32 7 -; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP8:%.*]] = insertelement <8 x bfloat> poison, bfloat [[I]], i32 0 -; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP9:%.*]] = insertelement <8 x bfloat> [[TMP8]], bfloat [[I2]], i32 1 -; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP10:%.*]] = insertelement <8 x bfloat> [[TMP9]], bfloat [[I4]], i32 2 -; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP11:%.*]] = insertelement <8 x bfloat> [[TMP10]], bfloat [[I6]], i32 3 -; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP12:%.*]] = insertelement <8 x bfloat> [[TMP11]], bfloat [[I8]], i32 4 -; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP13:%.*]] = insertelement <8 x bfloat> [[TMP12]], bfloat [[I10]], i32 5 -; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP14:%.*]] = insertelement <8 x bfloat> [[TMP13]], bfloat [[I12]], i32 6 -; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP15:%.*]] = insertelement <8 x bfloat> [[TMP14]], bfloat [[I14]], i32 7 +; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP15:%.*]] = call <8 x bfloat> @llvm.experimental.vp.strided.load.v8bf16.p0.i64(ptr align 4 [[ARRAYIDX]], i64 8, <8 x i1> splat (i1 true), i32 8) +; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP7:%.*]] = call <8 x bfloat> @llvm.experimental.vp.strided.load.v8bf16.p0.i64(ptr align 4 [[ARRAYIDX1]], i64 -8, <8 x i1> splat (i1 true), i32 8) ; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP16:%.*]] = fsub fast <8 x bfloat> [[TMP7]], [[TMP15]] ; ZVFHMIN-ZVFBFMIN-NEXT: store <8 x bfloat> [[TMP16]], ptr [[ARRAYIDX2]], align 4 ; ZVFHMIN-ZVFBFMIN-NEXT: ret void @@ -551,54 +507,10 @@ define void @test_f16(ptr %p, ptr noalias %s) { ; ZVFHMIN-ZVFBFMIN-LABEL: @test_f16( ; ZVFHMIN-ZVFBFMIN-NEXT: entry: ; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [48 x half], ptr [[P:%.*]], i64 0, i64 0 -; ZVFHMIN-ZVFBFMIN-NEXT: [[I:%.*]] = load half, ptr [[ARRAYIDX]], align 4 ; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 30 -; ZVFHMIN-ZVFBFMIN-NEXT: [[I1:%.*]] = load half, ptr [[ARRAYIDX1]], align 4 ; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds half, ptr [[S:%.*]], i64 0 -; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 4 -; ZVFHMIN-ZVFBFMIN-NEXT: [[I2:%.*]] = load half, ptr [[ARRAYIDX4]], align 4 -; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 26 -; ZVFHMIN-ZVFBFMIN-NEXT: [[I3:%.*]] = load half, ptr [[ARRAYIDX6]], align 4 -; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 8 -; ZVFHMIN-ZVFBFMIN-NEXT: [[I4:%.*]] = load half, ptr [[ARRAYIDX11]], align 4 -; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 22 -; ZVFHMIN-ZVFBFMIN-NEXT: [[I5:%.*]] = load half, ptr [[ARRAYIDX13]], align 4 -; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 12 -; ZVFHMIN-ZVFBFMIN-NEXT: [[I6:%.*]] = load half, ptr [[ARRAYIDX18]], align 4 -; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 18 -; ZVFHMIN-ZVFBFMIN-NEXT: [[I7:%.*]] = load half, ptr [[ARRAYIDX20]], align 4 -; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX25:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 16 -; ZVFHMIN-ZVFBFMIN-NEXT: [[I8:%.*]] = load half, ptr [[ARRAYIDX25]], align 4 -; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 14 -; ZVFHMIN-ZVFBFMIN-NEXT: [[I9:%.*]] = load half, ptr [[ARRAYIDX27]], align 4 -; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 20 -; ZVFHMIN-ZVFBFMIN-NEXT: [[I10:%.*]] = load half, ptr [[ARRAYIDX32]], align 4 -; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 10 -; ZVFHMIN-ZVFBFMIN-NEXT: [[I11:%.*]] = load half, ptr [[ARRAYIDX34]], align 4 -; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 24 -; ZVFHMIN-ZVFBFMIN-NEXT: [[I12:%.*]] = load half, ptr [[ARRAYIDX39]], align 4 -; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 6 -; ZVFHMIN-ZVFBFMIN-NEXT: [[I13:%.*]] = load half, ptr [[ARRAYIDX41]], align 4 -; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX46:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 28 -; ZVFHMIN-ZVFBFMIN-NEXT: [[I14:%.*]] = load half, ptr [[ARRAYIDX46]], align 4 -; ZVFHMIN-ZVFBFMIN-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [48 x half], ptr [[P]], i64 0, i64 2 -; ZVFHMIN-ZVFBFMIN-NEXT: [[I15:%.*]] = load half, ptr [[ARRAYIDX48]], align 4 -; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP0:%.*]] = insertelement <8 x half> poison, half [[I1]], i32 0 -; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP1:%.*]] = insertelement <8 x half> [[TMP0]], half [[I3]], i32 1 -; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[I5]], i32 2 -; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half [[I7]], i32 3 -; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP4:%.*]] = insertelement <8 x half> [[TMP3]], half [[I9]], i32 4 -; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP5:%.*]] = insertelement <8 x half> [[TMP4]], half [[I11]], i32 5 -; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP6:%.*]] = insertelement <8 x half> [[TMP5]], half [[I13]], i32 6 -; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP7:%.*]] = insertelement <8 x half> [[TMP6]], half [[I15]], i32 7 -; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP8:%.*]] = insertelement <8 x half> poison, half [[I]], i32 0 -; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP9:%.*]] = insertelement <8 x half> [[TMP8]], half [[I2]], i32 1 -; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP10:%.*]] = insertelement <8 x half> [[TMP9]], half [[I4]], i32 2 -; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP11:%.*]] = insertelement <8 x half> [[TMP10]], half [[I6]], i32 3 -; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP12:%.*]] = insertelement <8 x half> [[TMP11]], half [[I8]], i32 4 -; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP13:%.*]] = insertelement <8 x half> [[TMP12]], half [[I10]], i32 5 -; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP14:%.*]] = insertelement <8 x half> [[TMP13]], half [[I12]], i32 6 -; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP15:%.*]] = insertelement <8 x half> [[TMP14]], half [[I14]], i32 7 +; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP15:%.*]] = call <8 x half> @llvm.experimental.vp.strided.load.v8f16.p0.i64(ptr align 4 [[ARRAYIDX]], i64 8, <8 x i1> splat (i1 true), i32 8) +; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP7:%.*]] = call <8 x half> @llvm.experimental.vp.strided.load.v8f16.p0.i64(ptr align 4 [[ARRAYIDX1]], i64 -8, <8 x i1> splat (i1 true), i32 8) ; ZVFHMIN-ZVFBFMIN-NEXT: [[TMP16:%.*]] = fsub fast <8 x half> [[TMP7]], [[TMP15]] ; ZVFHMIN-ZVFBFMIN-NEXT: store <8 x half> [[TMP16]], ptr [[ARRAYIDX2]], align 4 ; ZVFHMIN-ZVFBFMIN-NEXT: ret void