From 16f895c4fc79cd7fcb27ef995352100e7ea980d6 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Thu, 31 Oct 2024 23:04:06 +0800 Subject: [PATCH 1/2] [RISCV] Add +optimized-nf{3,4}-segment-load-store This is a follow up to #111511, where after benchmarking we learnt that the Banana Pi F3 has fast segmented loads for not just NF=2, but also NF=3 and NF=4: https://github.com/preames/bp3-microarch#vlseg_lmul_x_sew_throughput This adds a tuning feature to allow these segment loads and stores to be costed cheaper and enables it for the spacemit-x60. --- llvm/lib/Target/RISCV/RISCVFeatures.td | 7 ++ llvm/lib/Target/RISCV/RISCVProcessors.td | 4 +- .../Target/RISCV/RISCVTargetTransformInfo.cpp | 7 +- .../LoopVectorize/RISCV/interleaved-cost.ll | 70 +++++++++++++------ 4 files changed, 62 insertions(+), 26 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 1e4bf1b8830bc..719a39944c683 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1386,6 +1386,13 @@ def TuneOptimizedZeroStrideLoad "true", "Optimized (perform fewer memory operations)" "zero-stride vector load">; +foreach nf = {3-4} in + def TuneOptimizedNF#nf#SegmentLoadStore : + SubtargetFeature<"optimized-nf"#nf#"-segment-load-store", + "HasOptimizedNF"#nf#"SegmentLoadStore", + "true", "vlseg"#nf#"eN.v and vsseg"#nf#"eN.v are " + "implemented as a wide memory op and shuffle">; + def Experimental : SubtargetFeature<"experimental", "HasExperimental", "true", "Experimental intrinsics">; diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td index 364aa35f09453..c73bd5e6a1f7c 100644 --- a/llvm/lib/Target/RISCV/RISCVProcessors.td +++ b/llvm/lib/Target/RISCV/RISCVProcessors.td @@ -472,7 +472,9 @@ def SPACEMIT_X60 : RISCVProcessorModel<"spacemit-x60", FeatureStdExtZvfh, FeatureStdExtZvkt, FeatureStdExtZvl256b]), - [TuneDLenFactor2]>; + [TuneDLenFactor2, 
TuneOptimizedNF3SegmentLoadStore, + TuneOptimizedNF4SegmentLoadStore]>; def RP2350_HAZARD3 : RISCVProcessorModel<"rp2350-hazard3", NoSchedModel, diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index f050fb569946d..936e0a9bfdffc 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -738,8 +738,11 @@ InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost( AddressSpace, DL)) { // Most available hardware today optimizes NF=2 as as one wide memory op - // + Factor * LMUL shuffle ops. - if (Factor == 2) { + // + Factor * LMUL shuffle ops. Some processors may also optimize NF=3 + // and NF=4. + if (Factor == 2 || + (Factor == 3 && ST->hasOptimizedNF3SegmentLoadStore()) || + (Factor == 4 && ST->hasOptimizedNF4SegmentLoadStore())) { InstructionCost Cost = getMemoryOpCost(Opcode, VTy, Alignment, AddressSpace, CostKind); MVT SubVecVT = getTLI()->getValueType(DL, SubVecTy).getSimpleVT(); diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-cost.ll index 6477f14e3c698..b5f26dd001f88 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-cost.ll @@ -1,5 +1,7 @@ ; REQUIRES: asserts -; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s +; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,NO-OPT +; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v,+optimized-nf3-segment-load-store -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,OPT-NF3 +; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v,+optimized-nf4-segment-load-store -debug-only=loop-vectorize 
-disable-output < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,OPT-NF4 %i8.2 = type {i8, i8} define void @i8_factor_2(ptr %data, i64 %n) { @@ -48,17 +50,28 @@ for.end: define void @i8_factor_3(ptr %data, i64 %n) { entry: br label %for.body -; CHECK-LABEL: Checking a loop in 'i8_factor_3' -; CHECK: Cost of 6 for VF 2: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0> -; CHECK: Cost of 6 for VF 2: INTERLEAVE-GROUP with factor 3 at , ir<%p0> -; CHECK: Cost of 12 for VF 4: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0> -; CHECK: Cost of 12 for VF 4: INTERLEAVE-GROUP with factor 3 at , ir<%p0> -; CHECK: Cost of 24 for VF 8: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0> -; CHECK: Cost of 24 for VF 8: INTERLEAVE-GROUP with factor 3 at , ir<%p0> -; CHECK: Cost of 48 for VF 16: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0> -; CHECK: Cost of 48 for VF 16: INTERLEAVE-GROUP with factor 3 at , ir<%p0> -; CHECK: Cost of 96 for VF 32: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0> -; CHECK: Cost of 96 for VF 32: INTERLEAVE-GROUP with factor 3 at , ir<%p0> +; OPT-NF3-LABEL: Checking a loop in 'i8_factor_3' +; OPT-NF3: Cost of 4 for VF 2: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0> +; OPT-NF3: Cost of 4 for VF 2: INTERLEAVE-GROUP with factor 3 at , ir<%p0> +; OPT-NF3: Cost of 4 for VF 4: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0> +; OPT-NF3: Cost of 4 for VF 4: INTERLEAVE-GROUP with factor 3 at , ir<%p0> +; OPT-NF3: Cost of 5 for VF 8: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0> +; OPT-NF3: Cost of 5 for VF 8: INTERLEAVE-GROUP with factor 3 at , ir<%p0> +; OPT-NF3: Cost of 7 for VF 16: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0> +; OPT-NF3: Cost of 7 for VF 16: INTERLEAVE-GROUP with factor 3 at , ir<%p0> +; OPT-NF3: Cost of 14 for VF 32: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0> +; OPT-NF3: Cost of 14 for VF 32: INTERLEAVE-GROUP with factor 3 at , ir<%p0> +; NO-OPT-LABEL: Checking a loop in 'i8_factor_3' +; NO-OPT: Cost of 6 for VF 2: INTERLEAVE-GROUP with factor 
3 at %l0, ir<%p0> +; NO-OPT: Cost of 6 for VF 2: INTERLEAVE-GROUP with factor 3 at , ir<%p0> +; NO-OPT: Cost of 12 for VF 4: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0> +; NO-OPT: Cost of 12 for VF 4: INTERLEAVE-GROUP with factor 3 at , ir<%p0> +; NO-OPT: Cost of 24 for VF 8: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0> +; NO-OPT: Cost of 24 for VF 8: INTERLEAVE-GROUP with factor 3 at , ir<%p0> +; NO-OPT: Cost of 48 for VF 16: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0> +; NO-OPT: Cost of 48 for VF 16: INTERLEAVE-GROUP with factor 3 at , ir<%p0> +; NO-OPT: Cost of 96 for VF 32: INTERLEAVE-GROUP with factor 3 at %l0, ir<%p0> +; NO-OPT: Cost of 96 for VF 32: INTERLEAVE-GROUP with factor 3 at , ir<%p0> for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %p0 = getelementptr inbounds %i8.3, ptr %data, i64 %i, i32 0 @@ -85,17 +98,28 @@ for.end: define void @i8_factor_4(ptr %data, i64 %n) { entry: br label %for.body -; CHECK-LABEL: Checking a loop in 'i8_factor_4' -; CHECK: Cost of 8 for VF 2: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0> -; CHECK: Cost of 8 for VF 2: INTERLEAVE-GROUP with factor 4 at , ir<%p0> -; CHECK: Cost of 16 for VF 4: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0> -; CHECK: Cost of 16 for VF 4: INTERLEAVE-GROUP with factor 4 at , ir<%p0> -; CHECK: Cost of 32 for VF 8: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0> -; CHECK: Cost of 32 for VF 8: INTERLEAVE-GROUP with factor 4 at , ir<%p0> -; CHECK: Cost of 64 for VF 16: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0> -; CHECK: Cost of 64 for VF 16: INTERLEAVE-GROUP with factor 4 at , ir<%p0> -; CHECK: Cost of 128 for VF 32: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0> -; CHECK: Cost of 128 for VF 32: INTERLEAVE-GROUP with factor 4 at , ir<%p0> +; OPT-NF4-LABEL: Checking a loop in 'i8_factor_4' +; OPT-NF4: Cost of 5 for VF 2: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0> +; OPT-NF4: Cost of 5 for VF 2: INTERLEAVE-GROUP with factor 4 at , ir<%p0> +; OPT-NF4: Cost of 5 for VF 4: 
INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0> +; OPT-NF4: Cost of 5 for VF 4: INTERLEAVE-GROUP with factor 4 at , ir<%p0> +; OPT-NF4: Cost of 6 for VF 8: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0> +; OPT-NF4: Cost of 6 for VF 8: INTERLEAVE-GROUP with factor 4 at , ir<%p0> +; OPT-NF4: Cost of 8 for VF 16: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0> +; OPT-NF4: Cost of 8 for VF 16: INTERLEAVE-GROUP with factor 4 at , ir<%p0> +; OPT-NF4: Cost of 16 for VF 32: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0> +; OPT-NF4: Cost of 16 for VF 32: INTERLEAVE-GROUP with factor 4 at , ir<%p0> +; NO-OPT-LABEL: Checking a loop in 'i8_factor_4' +; NO-OPT: Cost of 8 for VF 2: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0> +; NO-OPT: Cost of 8 for VF 2: INTERLEAVE-GROUP with factor 4 at , ir<%p0> +; NO-OPT: Cost of 16 for VF 4: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0> +; NO-OPT: Cost of 16 for VF 4: INTERLEAVE-GROUP with factor 4 at , ir<%p0> +; NO-OPT: Cost of 32 for VF 8: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0> +; NO-OPT: Cost of 32 for VF 8: INTERLEAVE-GROUP with factor 4 at , ir<%p0> +; NO-OPT: Cost of 64 for VF 16: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0> +; NO-OPT: Cost of 64 for VF 16: INTERLEAVE-GROUP with factor 4 at , ir<%p0> +; NO-OPT: Cost of 128 for VF 32: INTERLEAVE-GROUP with factor 4 at %l0, ir<%p0> +; NO-OPT: Cost of 128 for VF 32: INTERLEAVE-GROUP with factor 4 at , ir<%p0> for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %p0 = getelementptr inbounds %i8.4, ptr %data, i64 %i, i32 0 From 3984711dbd773682e1afcf000ff13ea327bb3434 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Fri, 1 Nov 2024 01:02:25 +0800 Subject: [PATCH 2/2] Add other NFs, including NF2. 
Add NF2 to the generic processor model --- llvm/lib/Target/RISCV/RISCVFeatures.td | 2 +- llvm/lib/Target/RISCV/RISCVProcessors.td | 12 +- .../Target/RISCV/RISCVTargetTransformInfo.cpp | 31 ++- .../LoopVectorize/RISCV/interleaved-cost.ll | 182 ++++++++++++------ 4 files changed, 157 insertions(+), 70 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 719a39944c683..f2e661f007d11 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1386,7 +1386,7 @@ def TuneOptimizedZeroStrideLoad "true", "Optimized (perform fewer memory operations)" "zero-stride vector load">; -foreach nf = {3-4} in +foreach nf = {2-8} in def TuneOptimizedNF#nf#SegmentLoadStore : SubtargetFeature<"optimized-nf"#nf#"-segment-load-store", "HasOptimizedNF"#nf#"SegmentLoadStore", diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td index c73bd5e6a1f7c..5277752a38ad9 100644 --- a/llvm/lib/Target/RISCV/RISCVProcessors.td +++ b/llvm/lib/Target/RISCV/RISCVProcessors.td @@ -57,15 +57,19 @@ class RISCVTuneProcessorModel f = []> : ProcessorModel; +defvar GenericTuneFeatures = [TuneOptimizedNF2SegmentLoadStore]; + def GENERIC_RV32 : RISCVProcessorModel<"generic-rv32", NoSchedModel, [Feature32Bit, - FeatureStdExtI]>, + FeatureStdExtI], + GenericTuneFeatures>, GenericTuneInfo; def GENERIC_RV64 : RISCVProcessorModel<"generic-rv64", NoSchedModel, [Feature64Bit, - FeatureStdExtI]>, + FeatureStdExtI], + GenericTuneFeatures>, GenericTuneInfo; // Support generic for compatibility with other targets. The triple will be used // to change to the appropriate rv32/rv64 version. 
@@ -221,7 +225,8 @@ def SIFIVE_U74 : RISCVProcessorModel<"sifive-u74", defvar SiFiveX280TuneFeatures = !listconcat(SiFive7TuneFeatures, [TuneDLenFactor2, - TuneOptimizedZeroStrideLoad]); + TuneOptimizedZeroStrideLoad, + TuneOptimizedNF2SegmentLoadStore]); def SIFIVE_X280 : RISCVProcessorModel<"sifive-x280", SiFive7Model, [Feature64Bit, FeatureStdExtI, @@ -473,6 +478,7 @@ def SPACEMIT_X60 : RISCVProcessorModel<"spacemit-x60", FeatureStdExtZvkt, FeatureStdExtZvl256b]), [TuneDLenFactor2, + TuneOptimizedNF2SegmentLoadStore, TuneOptimizedNF3SegmentLoadStore, TuneOptimizedNF4SegmentLoadStore]>; diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 936e0a9bfdffc..b7a559b8ba39a 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -716,6 +716,28 @@ RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind); } +static bool hasOptimizedSegmentLoadStore(unsigned NF, + const RISCVSubtarget *ST) { + switch (NF) { + case 2: + return ST->hasOptimizedNF2SegmentLoadStore(); + case 3: + return ST->hasOptimizedNF3SegmentLoadStore(); + case 4: + return ST->hasOptimizedNF4SegmentLoadStore(); + case 5: + return ST->hasOptimizedNF5SegmentLoadStore(); + case 6: + return ST->hasOptimizedNF6SegmentLoadStore(); + case 7: + return ST->hasOptimizedNF7SegmentLoadStore(); + case 8: + return ST->hasOptimizedNF8SegmentLoadStore(); + default: + llvm_unreachable("Unexpected NF"); + } +} + InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost( unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, @@ -737,12 +759,9 @@ InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost( TLI->isLegalInterleavedAccessType(SubVecTy, Factor, Alignment, AddressSpace, DL)) { - // Most available hardware today 
optimizes NF=2 as as one wide memory op - // + Factor * LMUL shuffle ops. Some processors may also optimize NF=3 - // and NF=4. - if (Factor == 2 || - (Factor == 3 && ST->hasOptimizedNF3SegmentLoadStore()) || - (Factor == 4 && ST->hasOptimizedNF4SegmentLoadStore())) { + // Some processors optimize segment loads/stores as one wide memory op + + // Factor * LMUL shuffle ops. + if (hasOptimizedSegmentLoadStore(Factor, ST)) { InstructionCost Cost = getMemoryOpCost(Opcode, VTy, Alignment, AddressSpace, CostKind); MVT SubVecVT = getTLI()->getValueType(DL, SubVecTy).getSimpleVT(); diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-cost.ll index b5f26dd001f88..d6f16bfcba1af 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-cost.ll @@ -1,33 +1,59 @@ ; REQUIRES: asserts -; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,NO-OPT -; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v,+optimized-nf3-segment-load-store -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,OPT-NF3 -; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v,+optimized-nf4-segment-load-store -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,OPT-NF4 +; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v,-optimized-nf2-segment-load-store -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=NO-OPT +; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=OPT-NF2 +; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v,+optimized-nf3-segment-load-store -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s 
--check-prefix=OPT-NF3 +; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v,+optimized-nf4-segment-load-store -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=OPT-NF4 +; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v,+optimized-nf5-segment-load-store -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=OPT-NF5 +; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v,+optimized-nf6-segment-load-store -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=OPT-NF6 +; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v,+optimized-nf7-segment-load-store -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=OPT-NF7 +; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v,+optimized-nf8-segment-load-store -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=OPT-NF8 %i8.2 = type {i8, i8} define void @i8_factor_2(ptr %data, i64 %n) { entry: br label %for.body -; CHECK-LABEL: Checking a loop in 'i8_factor_2' -; CHECK: Cost of 3 for VF 2: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> -; CHECK: Cost of 3 for VF 2: INTERLEAVE-GROUP with factor 2 at , ir<%p0> -; CHECK: Cost of 3 for VF 4: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> -; CHECK: Cost of 3 for VF 4: INTERLEAVE-GROUP with factor 2 at , ir<%p0> -; CHECK: Cost of 3 for VF 8: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> -; CHECK: Cost of 3 for VF 8: INTERLEAVE-GROUP with factor 2 at , ir<%p0> -; CHECK: Cost of 4 for VF 16: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> -; CHECK: Cost of 4 for VF 16: INTERLEAVE-GROUP with factor 2 at , ir<%p0> -; CHECK: Cost of 8 for VF 32: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> -; CHECK: Cost of 8 for VF 32: INTERLEAVE-GROUP with factor 2 at , ir<%p0> -; CHECK: Cost of 3 for VF vscale x 1: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> -; CHECK: Cost of 3 for VF vscale x 1: 
INTERLEAVE-GROUP with factor 2 at , ir<%p0> -; CHECK: Cost of 3 for VF vscale x 2: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> -; CHECK: Cost of 3 for VF vscale x 2: INTERLEAVE-GROUP with factor 2 at , ir<%p0> -; CHECK: Cost of 3 for VF vscale x 4: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> -; CHECK: Cost of 3 for VF vscale x 4: INTERLEAVE-GROUP with factor 2 at , ir<%p0> -; CHECK: Cost of 4 for VF vscale x 8: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> -; CHECK: Cost of 4 for VF vscale x 8: INTERLEAVE-GROUP with factor 2 at , ir<%p0> -; CHECK: Cost of 8 for VF vscale x 16: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> -; CHECK: Cost of 8 for VF vscale x 16: INTERLEAVE-GROUP with factor 2 at , ir<%p0> +; OPT-NF2-LABEL: Checking a loop in 'i8_factor_2' +; OPT-NF2: Cost of 3 for VF 2: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> +; OPT-NF2: Cost of 3 for VF 2: INTERLEAVE-GROUP with factor 2 at , ir<%p0> +; OPT-NF2: Cost of 3 for VF 4: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> +; OPT-NF2: Cost of 3 for VF 4: INTERLEAVE-GROUP with factor 2 at , ir<%p0> +; OPT-NF2: Cost of 3 for VF 8: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> +; OPT-NF2: Cost of 3 for VF 8: INTERLEAVE-GROUP with factor 2 at , ir<%p0> +; OPT-NF2: Cost of 4 for VF 16: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> +; OPT-NF2: Cost of 4 for VF 16: INTERLEAVE-GROUP with factor 2 at , ir<%p0> +; OPT-NF2: Cost of 8 for VF 32: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> +; OPT-NF2: Cost of 8 for VF 32: INTERLEAVE-GROUP with factor 2 at , ir<%p0> +; OPT-NF2: Cost of 3 for VF vscale x 1: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> +; OPT-NF2: Cost of 3 for VF vscale x 1: INTERLEAVE-GROUP with factor 2 at , ir<%p0> +; OPT-NF2: Cost of 3 for VF vscale x 2: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> +; OPT-NF2: Cost of 3 for VF vscale x 2: INTERLEAVE-GROUP with factor 2 at , ir<%p0> +; OPT-NF2: Cost of 3 for VF vscale x 4: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> +; OPT-NF2: 
Cost of 3 for VF vscale x 4: INTERLEAVE-GROUP with factor 2 at , ir<%p0> +; OPT-NF2: Cost of 4 for VF vscale x 8: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> +; OPT-NF2: Cost of 4 for VF vscale x 8: INTERLEAVE-GROUP with factor 2 at , ir<%p0> +; OPT-NF2: Cost of 8 for VF vscale x 16: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> +; OPT-NF2: Cost of 8 for VF vscale x 16: INTERLEAVE-GROUP with factor 2 at , ir<%p0> +; NO-OPT-LABEL: Checking a loop in 'i8_factor_2' +; NO-OPT: Cost of 4 for VF 2: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> +; NO-OPT: Cost of 4 for VF 2: INTERLEAVE-GROUP with factor 2 at , ir<%p0> +; NO-OPT: Cost of 8 for VF 4: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> +; NO-OPT: Cost of 8 for VF 4: INTERLEAVE-GROUP with factor 2 at , ir<%p0> +; NO-OPT: Cost of 16 for VF 8: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> +; NO-OPT: Cost of 16 for VF 8: INTERLEAVE-GROUP with factor 2 at , ir<%p0> +; NO-OPT: Cost of 32 for VF 16: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> +; NO-OPT: Cost of 32 for VF 16: INTERLEAVE-GROUP with factor 2 at , ir<%p0> +; NO-OPT: Cost of 64 for VF 32: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> +; NO-OPT: Cost of 64 for VF 32: INTERLEAVE-GROUP with factor 2 at , ir<%p0> +; NO-OPT: Cost of 4 for VF vscale x 1: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> +; NO-OPT: Cost of 4 for VF vscale x 1: INTERLEAVE-GROUP with factor 2 at , ir<%p0> +; NO-OPT: Cost of 8 for VF vscale x 2: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> +; NO-OPT: Cost of 8 for VF vscale x 2: INTERLEAVE-GROUP with factor 2 at , ir<%p0> +; NO-OPT: Cost of 16 for VF vscale x 4: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> +; NO-OPT: Cost of 16 for VF vscale x 4: INTERLEAVE-GROUP with factor 2 at , ir<%p0> +; NO-OPT: Cost of 32 for VF vscale x 8: INTERLEAVE-GROUP with factor 2 at %l0, ir<%p0> +; NO-OPT: Cost of 32 for VF vscale x 8: INTERLEAVE-GROUP with factor 2 at , ir<%p0> +; NO-OPT: Cost of 64 for VF vscale x 16: INTERLEAVE-GROUP with 
factor 2 at %l0, ir<%p0> +; NO-OPT: Cost of 64 for VF vscale x 16: INTERLEAVE-GROUP with factor 2 at , ir<%p0> for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %p0 = getelementptr inbounds %i8.2, ptr %data, i64 %i, i32 0 @@ -150,15 +176,24 @@ for.end: define void @i8_factor_5(ptr %data, i64 %n) { entry: br label %for.body -; CHECK-LABEL: Checking a loop in 'i8_factor_5' -; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 5 at %l0, ir<%p0> -; CHECK: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 5 at , ir<%p0> -; CHECK: Cost of 20 for VF 4: INTERLEAVE-GROUP with factor 5 at %l0, ir<%p0> -; CHECK: Cost of 20 for VF 4: INTERLEAVE-GROUP with factor 5 at , ir<%p0> -; CHECK: Cost of 40 for VF 8: INTERLEAVE-GROUP with factor 5 at %l0, ir<%p0> -; CHECK: Cost of 40 for VF 8: INTERLEAVE-GROUP with factor 5 at , ir<%p0> -; CHECK: Cost of 80 for VF 16: INTERLEAVE-GROUP with factor 5 at %l0, ir<%p0> -; CHECK: Cost of 80 for VF 16: INTERLEAVE-GROUP with factor 5 at , ir<%p0> +; OPT-NF5-LABEL: Checking a loop in 'i8_factor_5' +; OPT-NF5: Cost of 6 for VF 2: INTERLEAVE-GROUP with factor 5 at %l0, ir<%p0> +; OPT-NF5: Cost of 6 for VF 2: INTERLEAVE-GROUP with factor 5 at , ir<%p0> +; OPT-NF5: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 5 at %l0, ir<%p0> +; OPT-NF5: Cost of 7 for VF 4: INTERLEAVE-GROUP with factor 5 at , ir<%p0> +; OPT-NF5: Cost of 9 for VF 8: INTERLEAVE-GROUP with factor 5 at %l0, ir<%p0> +; OPT-NF5: Cost of 9 for VF 8: INTERLEAVE-GROUP with factor 5 at , ir<%p0> +; OPT-NF5: Cost of 13 for VF 16: INTERLEAVE-GROUP with factor 5 at %l0, ir<%p0> +; OPT-NF5: Cost of 13 for VF 16: INTERLEAVE-GROUP with factor 5 at , ir<%p0> +; NO-OPT-LABEL: Checking a loop in 'i8_factor_5' +; NO-OPT: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 5 at %l0, ir<%p0> +; NO-OPT: Cost of 10 for VF 2: INTERLEAVE-GROUP with factor 5 at , ir<%p0> +; NO-OPT: Cost of 20 for VF 4: INTERLEAVE-GROUP with factor 5 at %l0, ir<%p0> +; NO-OPT: Cost of 20 for VF 4: 
INTERLEAVE-GROUP with factor 5 at , ir<%p0> +; NO-OPT: Cost of 40 for VF 8: INTERLEAVE-GROUP with factor 5 at %l0, ir<%p0> +; NO-OPT: Cost of 40 for VF 8: INTERLEAVE-GROUP with factor 5 at , ir<%p0> +; NO-OPT: Cost of 80 for VF 16: INTERLEAVE-GROUP with factor 5 at %l0, ir<%p0> +; NO-OPT: Cost of 80 for VF 16: INTERLEAVE-GROUP with factor 5 at , ir<%p0> for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %p0 = getelementptr inbounds %i8.5, ptr %data, i64 %i, i32 0 @@ -193,15 +228,24 @@ for.end: define void @i8_factor_6(ptr %data, i64 %n) { entry: br label %for.body -; CHECK-LABEL: Checking a loop in 'i8_factor_6' -; CHECK: Cost of 12 for VF 2: INTERLEAVE-GROUP with factor 6 at %l0, ir<%p0> -; CHECK: Cost of 12 for VF 2: INTERLEAVE-GROUP with factor 6 at , ir<%p0> -; CHECK: Cost of 24 for VF 4: INTERLEAVE-GROUP with factor 6 at %l0, ir<%p0> -; CHECK: Cost of 24 for VF 4: INTERLEAVE-GROUP with factor 6 at , ir<%p0> -; CHECK: Cost of 48 for VF 8: INTERLEAVE-GROUP with factor 6 at %l0, ir<%p0> -; CHECK: Cost of 48 for VF 8: INTERLEAVE-GROUP with factor 6 at , ir<%p0> -; CHECK: Cost of 96 for VF 16: INTERLEAVE-GROUP with factor 6 at %l0, ir<%p0> -; CHECK: Cost of 96 for VF 16: INTERLEAVE-GROUP with factor 6 at , ir<%p0> +; OPT-NF6-LABEL: Checking a loop in 'i8_factor_6' +; OPT-NF6: Cost of 7 for VF 2: INTERLEAVE-GROUP with factor 6 at %l0, ir<%p0> +; OPT-NF6: Cost of 7 for VF 2: INTERLEAVE-GROUP with factor 6 at , ir<%p0> +; OPT-NF6: Cost of 8 for VF 4: INTERLEAVE-GROUP with factor 6 at %l0, ir<%p0> +; OPT-NF6: Cost of 8 for VF 4: INTERLEAVE-GROUP with factor 6 at , ir<%p0> +; OPT-NF6: Cost of 10 for VF 8: INTERLEAVE-GROUP with factor 6 at %l0, ir<%p0> +; OPT-NF6: Cost of 10 for VF 8: INTERLEAVE-GROUP with factor 6 at , ir<%p0> +; OPT-NF6: Cost of 14 for VF 16: INTERLEAVE-GROUP with factor 6 at %l0, ir<%p0> +; OPT-NF6: Cost of 14 for VF 16: INTERLEAVE-GROUP with factor 6 at , ir<%p0> +; NO-OPT-LABEL: Checking a loop in 'i8_factor_6' +; NO-OPT: Cost of 12 for VF 
2: INTERLEAVE-GROUP with factor 6 at %l0, ir<%p0> +; NO-OPT: Cost of 12 for VF 2: INTERLEAVE-GROUP with factor 6 at , ir<%p0> +; NO-OPT: Cost of 24 for VF 4: INTERLEAVE-GROUP with factor 6 at %l0, ir<%p0> +; NO-OPT: Cost of 24 for VF 4: INTERLEAVE-GROUP with factor 6 at , ir<%p0> +; NO-OPT: Cost of 48 for VF 8: INTERLEAVE-GROUP with factor 6 at %l0, ir<%p0> +; NO-OPT: Cost of 48 for VF 8: INTERLEAVE-GROUP with factor 6 at , ir<%p0> +; NO-OPT: Cost of 96 for VF 16: INTERLEAVE-GROUP with factor 6 at %l0, ir<%p0> +; NO-OPT: Cost of 96 for VF 16: INTERLEAVE-GROUP with factor 6 at , ir<%p0> for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %p0 = getelementptr inbounds %i8.6, ptr %data, i64 %i, i32 0 @@ -240,15 +284,24 @@ for.end: define void @i8_factor_7(ptr %data, i64 %n) { entry: br label %for.body -; CHECK-LABEL: Checking a loop in 'i8_factor_7' -; CHECK: Cost of 14 for VF 2: INTERLEAVE-GROUP with factor 7 at %l0, ir<%p0> -; CHECK: Cost of 14 for VF 2: INTERLEAVE-GROUP with factor 7 at , ir<%p0> -; CHECK: Cost of 28 for VF 4: INTERLEAVE-GROUP with factor 7 at %l0, ir<%p0> -; CHECK: Cost of 28 for VF 4: INTERLEAVE-GROUP with factor 7 at , ir<%p0> -; CHECK: Cost of 56 for VF 8: INTERLEAVE-GROUP with factor 7 at %l0, ir<%p0> -; CHECK: Cost of 56 for VF 8: INTERLEAVE-GROUP with factor 7 at , ir<%p0> -; CHECK: Cost of 112 for VF 16: INTERLEAVE-GROUP with factor 7 at %l0, ir<%p0> -; CHECK: Cost of 112 for VF 16: INTERLEAVE-GROUP with factor 7 at , ir<%p0> +; OPT-NF7-LABEL: Checking a loop in 'i8_factor_7' +; OPT-NF7: Cost of 8 for VF 2: INTERLEAVE-GROUP with factor 7 at %l0, ir<%p0> +; OPT-NF7: Cost of 8 for VF 2: INTERLEAVE-GROUP with factor 7 at , ir<%p0> +; OPT-NF7: Cost of 9 for VF 4: INTERLEAVE-GROUP with factor 7 at %l0, ir<%p0> +; OPT-NF7: Cost of 9 for VF 4: INTERLEAVE-GROUP with factor 7 at , ir<%p0> +; OPT-NF7: Cost of 11 for VF 8: INTERLEAVE-GROUP with factor 7 at %l0, ir<%p0> +; OPT-NF7: Cost of 11 for VF 8: INTERLEAVE-GROUP with factor 7 at , 
ir<%p0> +; OPT-NF7: Cost of 15 for VF 16: INTERLEAVE-GROUP with factor 7 at %l0, ir<%p0> +; OPT-NF7: Cost of 15 for VF 16: INTERLEAVE-GROUP with factor 7 at , ir<%p0> +; NO-OPT-LABEL: Checking a loop in 'i8_factor_7' +; NO-OPT: Cost of 14 for VF 2: INTERLEAVE-GROUP with factor 7 at %l0, ir<%p0> +; NO-OPT: Cost of 14 for VF 2: INTERLEAVE-GROUP with factor 7 at , ir<%p0> +; NO-OPT: Cost of 28 for VF 4: INTERLEAVE-GROUP with factor 7 at %l0, ir<%p0> +; NO-OPT: Cost of 28 for VF 4: INTERLEAVE-GROUP with factor 7 at , ir<%p0> +; NO-OPT: Cost of 56 for VF 8: INTERLEAVE-GROUP with factor 7 at %l0, ir<%p0> +; NO-OPT: Cost of 56 for VF 8: INTERLEAVE-GROUP with factor 7 at , ir<%p0> +; NO-OPT: Cost of 112 for VF 16: INTERLEAVE-GROUP with factor 7 at %l0, ir<%p0> +; NO-OPT: Cost of 112 for VF 16: INTERLEAVE-GROUP with factor 7 at , ir<%p0> for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %p0 = getelementptr inbounds %i8.7, ptr %data, i64 %i, i32 0 @@ -291,15 +344,24 @@ for.end: define void @i8_factor_8(ptr %data, i64 %n) { entry: br label %for.body -; CHECK-LABEL: Checking a loop in 'i8_factor_8' -; CHECK: Cost of 16 for VF 2: INTERLEAVE-GROUP with factor 8 at %l0, ir<%p0> -; CHECK: Cost of 16 for VF 2: INTERLEAVE-GROUP with factor 8 at , ir<%p0> -; CHECK: Cost of 32 for VF 4: INTERLEAVE-GROUP with factor 8 at %l0, ir<%p0> -; CHECK: Cost of 32 for VF 4: INTERLEAVE-GROUP with factor 8 at , ir<%p0> -; CHECK: Cost of 64 for VF 8: INTERLEAVE-GROUP with factor 8 at %l0, ir<%p0> -; CHECK: Cost of 64 for VF 8: INTERLEAVE-GROUP with factor 8 at , ir<%p0> -; CHECK: Cost of 128 for VF 16: INTERLEAVE-GROUP with factor 8 at %l0, ir<%p0> -; CHECK: Cost of 128 for VF 16: INTERLEAVE-GROUP with factor 8 at , ir<%p0> +; OPT-NF8-LABEL: Checking a loop in 'i8_factor_8' +; OPT-NF8: Cost of 9 for VF 2: INTERLEAVE-GROUP with factor 8 at %l0, ir<%p0> +; OPT-NF8: Cost of 9 for VF 2: INTERLEAVE-GROUP with factor 8 at , ir<%p0> +; OPT-NF8: Cost of 10 for VF 4: INTERLEAVE-GROUP with factor 
8 at %l0, ir<%p0> +; OPT-NF8: Cost of 10 for VF 4: INTERLEAVE-GROUP with factor 8 at , ir<%p0> +; OPT-NF8: Cost of 12 for VF 8: INTERLEAVE-GROUP with factor 8 at %l0, ir<%p0> +; OPT-NF8: Cost of 12 for VF 8: INTERLEAVE-GROUP with factor 8 at , ir<%p0> +; OPT-NF8: Cost of 16 for VF 16: INTERLEAVE-GROUP with factor 8 at %l0, ir<%p0> +; OPT-NF8: Cost of 16 for VF 16: INTERLEAVE-GROUP with factor 8 at , ir<%p0> +; NO-OPT-LABEL: Checking a loop in 'i8_factor_8' +; NO-OPT: Cost of 16 for VF 2: INTERLEAVE-GROUP with factor 8 at %l0, ir<%p0> +; NO-OPT: Cost of 16 for VF 2: INTERLEAVE-GROUP with factor 8 at , ir<%p0> +; NO-OPT: Cost of 32 for VF 4: INTERLEAVE-GROUP with factor 8 at %l0, ir<%p0> +; NO-OPT: Cost of 32 for VF 4: INTERLEAVE-GROUP with factor 8 at , ir<%p0> +; NO-OPT: Cost of 64 for VF 8: INTERLEAVE-GROUP with factor 8 at %l0, ir<%p0> +; NO-OPT: Cost of 64 for VF 8: INTERLEAVE-GROUP with factor 8 at , ir<%p0> +; NO-OPT: Cost of 128 for VF 16: INTERLEAVE-GROUP with factor 8 at %l0, ir<%p0> +; NO-OPT: Cost of 128 for VF 16: INTERLEAVE-GROUP with factor 8 at , ir<%p0> for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %p0 = getelementptr inbounds %i8.8, ptr %data, i64 %i, i32 0