diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index d3c923a76d074..cdc2ce752743c 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -4329,9 +4329,15 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, { ISD::ABS, MVT::i64, { 1, 2, 3, 3 } }, // SUB+CMOV { ISD::BITREVERSE, MVT::i64, { 10, 12, 20, 22 } }, { ISD::BSWAP, MVT::i64, { 1, 2, 1, 2 } }, - { ISD::CTLZ, MVT::i64, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV + { ISD::CTLZ, MVT::i64, { 1, 2, 3, 3 } }, // MOV+BSR+XOR + { ISD::CTLZ, MVT::i32, { 1, 2, 3, 3 } }, // MOV+BSR+XOR + { ISD::CTLZ, MVT::i16, { 2, 2, 3, 3 } }, // MOV+BSR+XOR + { ISD::CTLZ, MVT::i8, { 2, 2, 4, 3 } }, // MOV+BSR+XOR { ISD::CTLZ_ZERO_UNDEF, MVT::i64,{ 1, 2, 2, 2 } }, // BSR+XOR - { ISD::CTTZ, MVT::i64, { 2, 2, 3, 4 } }, // TEST+BSF+CMOV/BRANCH + { ISD::CTTZ, MVT::i64, { 1, 2, 2, 2 } }, // MOV+BSF + { ISD::CTTZ, MVT::i32, { 1, 2, 2, 2 } }, // MOV+BSF + { ISD::CTTZ, MVT::i16, { 2, 2, 2, 2 } }, // MOV+BSF + { ISD::CTTZ, MVT::i8, { 2, 2, 2, 2 } }, // MOV+BSF { ISD::CTTZ_ZERO_UNDEF, MVT::i64,{ 1, 2, 1, 2 } }, // BSF { ISD::CTPOP, MVT::i64, { 10, 6, 19, 19 } }, { ISD::ROTL, MVT::i64, { 2, 3, 1, 3 } }, diff --git a/llvm/test/Analysis/CostModel/X86/ctlz-codesize.ll b/llvm/test/Analysis/CostModel/X86/ctlz-codesize.ll index da0f71c63ef80..9f8e4edf7a0fc 100644 --- a/llvm/test/Analysis/CostModel/X86/ctlz-codesize.ll +++ b/llvm/test/Analysis/CostModel/X86/ctlz-codesize.ll @@ -17,7 +17,7 @@ declare i8 @llvm.ctlz.i8(i8, i1) define i64 @var_ctlz_i64(i64 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i64' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i64' @@ -43,7 +43,7 @@ define i64 @var_ctlz_i64u(i64 %a) { define i32 @var_ctlz_i32(i32 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i32' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i32' @@ -69,7 +69,7 @@ define i32 @var_ctlz_i32u(i32 %a) { define i16 @var_ctlz_i16(i16 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i16' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i16' @@ -95,7 +95,7 @@ define i16 @var_ctlz_i16u(i16 %a) { define i8 @var_ctlz_i8(i8 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i8' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i8' diff --git a/llvm/test/Analysis/CostModel/X86/ctlz-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/ctlz-sizelatency.ll index 2425e7286265b..fc3516695852a 100644 --- a/llvm/test/Analysis/CostModel/X86/ctlz-sizelatency.ll +++ b/llvm/test/Analysis/CostModel/X86/ctlz-sizelatency.ll @@ -17,7 +17,7 @@ declare i8 @llvm.ctlz.i8(i8, i1) define i64 @var_ctlz_i64(i64 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i64' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i64' @@ -43,7 +43,7 @@ define i64 @var_ctlz_i64u(i64 %a) { define i32 @var_ctlz_i32(i32 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i32' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i32' @@ -69,7 +69,7 @@ define i32 @var_ctlz_i32u(i32 %a) { define i16 @var_ctlz_i16(i16 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i16' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i16' @@ -95,7 +95,7 @@ define i16 @var_ctlz_i16u(i16 %a) { define i8 @var_ctlz_i8(i8 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i8' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i8' diff --git a/llvm/test/Analysis/CostModel/X86/ctlz.ll b/llvm/test/Analysis/CostModel/X86/ctlz.ll index fa7982ce09e9c..d9d04de12467d 100644 --- a/llvm/test/Analysis/CostModel/X86/ctlz.ll +++ b/llvm/test/Analysis/CostModel/X86/ctlz.ll @@ -17,7 +17,7 @@ declare i8 @llvm.ctlz.i8(i8, i1) define i64 @var_ctlz_i64(i64 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i64' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i64' @@ -43,7 +43,7 @@ define i64 @var_ctlz_i64u(i64 %a) { define i32 @var_ctlz_i32(i32 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i32' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i32' diff --git a/llvm/test/Analysis/CostModel/X86/cttz-codesize.ll b/llvm/test/Analysis/CostModel/X86/cttz-codesize.ll index 07bf1dd7a2ff6..621c1b9320fc8 100644 --- a/llvm/test/Analysis/CostModel/X86/cttz-codesize.ll +++ b/llvm/test/Analysis/CostModel/X86/cttz-codesize.ll @@ -18,7 +18,7 @@ declare i8 @llvm.cttz.i8(i8, i1) define i64 @var_cttz_i64(i64 %a) { ; NOBMI-LABEL: 'var_cttz_i64' -; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false) +; NOBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false) ; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %cttz ; ; BMI-LABEL: 'var_cttz_i64' @@ -40,7 +40,7 @@ define i64 @var_cttz_i64u(i64 %a) { define i32 @var_cttz_i32(i32 %a) { ; NOBMI-LABEL: 'var_cttz_i32' -; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false) +; NOBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false) ; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %cttz ; ; BMI-LABEL: 'var_cttz_i32' diff --git a/llvm/test/Analysis/CostModel/X86/cttz-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/cttz-sizelatency.ll index afe5cb8c55fe6..34d363ce00879 100644 --- a/llvm/test/Analysis/CostModel/X86/cttz-sizelatency.ll +++ b/llvm/test/Analysis/CostModel/X86/cttz-sizelatency.ll @@ -18,7 +18,7 @@ declare i8 @llvm.cttz.i8(i8, i1) define i64 @var_cttz_i64(i64 %a) { ; NOBMI-LABEL: 'var_cttz_i64' -; NOBMI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false) +; NOBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false) ; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %cttz ; ; BMI-LABEL: 'var_cttz_i64' @@ -44,7 +44,7 @@ define i64 @var_cttz_i64u(i64 %a) { define i32 @var_cttz_i32(i32 %a) { ; NOBMI-LABEL: 'var_cttz_i32' -; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false) +; NOBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false) ; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %cttz ; ; BMI-LABEL: 'var_cttz_i32' @@ -70,7 +70,7 @@ define i32 @var_cttz_i32u(i32 %a) { define i16 @var_cttz_i16(i16 %a) { ; NOBMI-LABEL: 'var_cttz_i16' -; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 false) +; NOBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 false) ; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %cttz ; ; BMI-LABEL: 'var_cttz_i16' @@ -96,7 +96,7 @@ define i16 @var_cttz_i16u(i16 %a) { define i8 @var_cttz_i8(i8 %a) { ; NOBMI-LABEL: 'var_cttz_i8' -; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i8 @llvm.cttz.i8(i8 %a, i1 false) +; NOBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i8 @llvm.cttz.i8(i8 %a, i1 false) ; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %cttz ; ; BMI-LABEL: 'var_cttz_i8' diff --git a/llvm/test/Analysis/CostModel/X86/cttz.ll b/llvm/test/Analysis/CostModel/X86/cttz.ll index fa0f10f886f63..3f5a731b27d9b 100644 --- a/llvm/test/Analysis/CostModel/X86/cttz.ll +++ b/llvm/test/Analysis/CostModel/X86/cttz.ll @@ -18,7 +18,7 @@ declare i8 @llvm.cttz.i8(i8, i1) define i64 @var_cttz_i64(i64 %a) { ; NOBMI-LABEL: 'var_cttz_i64' -; NOBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false) +; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false) ; NOBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %cttz ; ; BMI-LABEL: 'var_cttz_i64' @@ -44,7 +44,7 @@ define i64 @var_cttz_i64u(i64 %a) { define i32 @var_cttz_i32(i32 %a) { ; NOBMI-LABEL: 'var_cttz_i32' -; NOBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false) +; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false) ; NOBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %cttz ; ; BMI-LABEL: 'var_cttz_i32' diff --git a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll index 062e5f157bae2..bcef47ee9e056 100644 --- a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll +++ b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll @@ -232,7 +232,7 @@ define void @fmaximum(float %a, float %b, <16 x float> %va, <16 x float> %vb) { define void @cttz(i32 %a, <16 x i32> %va) { ; THRU-LABEL: 'cttz' -; THRU-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false) +; THRU-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false) ; THRU-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false) ; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; @@ -242,12 +242,12 @@ define void @cttz(i32 %a, <16 x i32> %va) { ; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SIZE-LABEL: 'cttz' -; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false) ; SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false) ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SIZE_LATE-LABEL: 'cttz' -; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false) +; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false) ; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false) ; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll b/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll index 8a22e45fe1ca5..9bf2ade3176d6 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE,SSE4 +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=icelake-server -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512 @@ -136,32 +136,47 @@ define void @ctlz_4i64() #0 { } define void @ctlz_4i32() #0 { -; SSE2-LABEL: @ctlz_4i32( -; SSE2-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4 -; SSE2-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false) -; SSE2-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 4 -; SSE2-NEXT: ret void +; SSE-LABEL: @ctlz_4i32( +; SSE-NEXT: [[LD0:%.*]] = load i32, ptr @src32, align 4 +; SSE-NEXT: [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4 +; SSE-NEXT: [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 4 +; SSE-NEXT: [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 4 +; SSE-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false) +; SSE-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false) +; SSE-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false) +; SSE-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false) +; SSE-NEXT: store i32 [[CTLZ0]], ptr @dst32, align 4 +; SSE-NEXT: store i32 [[CTLZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 4 +; SSE-NEXT: store i32 [[CTLZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 4 +; SSE-NEXT: store i32 [[CTLZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 4 +; SSE-NEXT: ret void +; +; AVX1-LABEL: @ctlz_4i32( +; AVX1-NEXT: [[LD0:%.*]] = load i32, ptr @src32, align 4 +; AVX1-NEXT: [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4 +; AVX1-NEXT: [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 4 +; AVX1-NEXT: [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 4 +; AVX1-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false) +; AVX1-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false) +; AVX1-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false) +; AVX1-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false) +; AVX1-NEXT: store i32 [[CTLZ0]], ptr @dst32, align 4 +; AVX1-NEXT: store i32 [[CTLZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 4 +; AVX1-NEXT: store i32 [[CTLZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 4 +; AVX1-NEXT: store i32 [[CTLZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 4 +; AVX1-NEXT: ret void ; -; SSE4-LABEL: @ctlz_4i32( -; SSE4-NEXT: [[LD0:%.*]] = load i32, ptr @src32, align 4 -; SSE4-NEXT: [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4 -; SSE4-NEXT: [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 4 -; SSE4-NEXT: [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 4 -; SSE4-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false) -; SSE4-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false) -; SSE4-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false) -; SSE4-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false) -; SSE4-NEXT: store i32 [[CTLZ0]], ptr @dst32, align 4 -; SSE4-NEXT: store i32 [[CTLZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 4 -; SSE4-NEXT: store i32 [[CTLZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 4 -; SSE4-NEXT: store i32 [[CTLZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 4 -; SSE4-NEXT: ret void +; AVX2-LABEL: @ctlz_4i32( +; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4 +; AVX2-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false) +; AVX2-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 4 +; AVX2-NEXT: ret void ; -; AVX-LABEL: @ctlz_4i32( -; AVX-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4 -; AVX-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false) -; AVX-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 4 -; AVX-NEXT: ret void +; AVX512-LABEL: @ctlz_4i32( +; AVX512-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4 +; AVX512-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false) +; AVX512-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 4 +; AVX512-NEXT: ret void ; %ld0 = load i32, ptr @src32, align 4 %ld1 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4 @@ -179,47 +194,71 @@ define void @ctlz_4i32() #0 { } define void @ctlz_8i32() #0 { -; SSE2-LABEL: @ctlz_8i32( -; SSE2-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 2 -; SSE2-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false) -; SSE2-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 2 -; SSE2-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 4), align 2 -; SSE2-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP3]], i1 false) -; SSE2-NEXT: store <4 x i32> [[TMP4]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 4), align 2 -; SSE2-NEXT: ret void +; SSE-LABEL: @ctlz_8i32( +; SSE-NEXT: [[LD0:%.*]] = load i32, ptr @src32, align 2 +; SSE-NEXT: [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 2 +; SSE-NEXT: [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 2 +; SSE-NEXT: [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 2 +; SSE-NEXT: [[LD4:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 4), align 2 +; SSE-NEXT: [[LD5:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 5), align 2 +; SSE-NEXT: [[LD6:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 6), align 2 +; SSE-NEXT: [[LD7:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 7), align 2 +; SSE-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false) +; SSE-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false) +; SSE-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false) +; SSE-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false) +; SSE-NEXT: [[CTLZ4:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD4]], i1 false) +; SSE-NEXT: [[CTLZ5:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD5]], i1 false) +; SSE-NEXT: [[CTLZ6:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD6]], i1 false) +; SSE-NEXT: [[CTLZ7:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD7]], i1 false) +; SSE-NEXT: store i32 [[CTLZ0]], ptr @dst32, align 2 +; SSE-NEXT: store i32 [[CTLZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 2 +; SSE-NEXT: store i32 [[CTLZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 2 +; SSE-NEXT: store i32 [[CTLZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 2 +; SSE-NEXT: store i32 [[CTLZ4]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 4), align 2 +; SSE-NEXT: store i32 [[CTLZ5]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 5), align 2 +; SSE-NEXT: store i32 [[CTLZ6]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 6), align 2 +; SSE-NEXT: store i32 [[CTLZ7]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 7), align 2 +; SSE-NEXT: ret void +; +; AVX1-LABEL: @ctlz_8i32( +; AVX1-NEXT: [[LD0:%.*]] = load i32, ptr @src32, align 2 +; AVX1-NEXT: [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 2 +; AVX1-NEXT: [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 2 +; AVX1-NEXT: [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 2 +; AVX1-NEXT: [[LD4:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 4), align 2 +; AVX1-NEXT: [[LD5:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 5), align 2 +; AVX1-NEXT: [[LD6:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 6), align 2 +; AVX1-NEXT: [[LD7:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 7), align 2 +; AVX1-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false) +; AVX1-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false) +; AVX1-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false) +; AVX1-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false) +; AVX1-NEXT: [[CTLZ4:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD4]], i1 false) +; AVX1-NEXT: [[CTLZ5:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD5]], i1 false) +; AVX1-NEXT: [[CTLZ6:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD6]], i1 false) +; AVX1-NEXT: [[CTLZ7:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD7]], i1 false) +; AVX1-NEXT: store i32 [[CTLZ0]], ptr @dst32, align 2 +; AVX1-NEXT: store i32 [[CTLZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 2 +; AVX1-NEXT: store i32 [[CTLZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 2 +; AVX1-NEXT: store i32 [[CTLZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 2 +; AVX1-NEXT: store i32 [[CTLZ4]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 4), align 2 +; AVX1-NEXT: store i32 [[CTLZ5]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 5), align 2 +; AVX1-NEXT: store i32 [[CTLZ6]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 6), align 2 +; AVX1-NEXT: store i32 [[CTLZ7]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 7), align 2 +; AVX1-NEXT: ret void ; -; SSE4-LABEL: @ctlz_8i32( -; SSE4-NEXT: [[LD0:%.*]] = load i32, ptr @src32, align 2 -; SSE4-NEXT: [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 2 -; SSE4-NEXT: [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 2 -; SSE4-NEXT: [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 2 -; SSE4-NEXT: [[LD4:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 4), align 2 -; SSE4-NEXT: [[LD5:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 5), align 2 -; SSE4-NEXT: [[LD6:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 6), align 2 -; SSE4-NEXT: [[LD7:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 7), align 2 -; SSE4-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false) -; SSE4-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false) -; SSE4-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false) -; SSE4-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false) -; SSE4-NEXT: [[CTLZ4:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD4]], i1 false) -; SSE4-NEXT: [[CTLZ5:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD5]], i1 false) -; SSE4-NEXT: [[CTLZ6:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD6]], i1 false) -; SSE4-NEXT: [[CTLZ7:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD7]], i1 false) -; SSE4-NEXT: store i32 [[CTLZ0]], ptr @dst32, align 2 -; SSE4-NEXT: store i32 [[CTLZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 2 -; SSE4-NEXT: store i32 [[CTLZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 2 -; SSE4-NEXT: store i32 [[CTLZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 2 -; SSE4-NEXT: store i32 [[CTLZ4]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 4), align 2 -; SSE4-NEXT: store i32 [[CTLZ5]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 5), align 2 -; SSE4-NEXT: store i32 [[CTLZ6]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 6), align 2 -; SSE4-NEXT: store i32 [[CTLZ7]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 7), align 2 -; SSE4-NEXT: ret void +; AVX2-LABEL: @ctlz_8i32( +; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @src32, align 2 +; AVX2-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> [[TMP1]], i1 false) +; AVX2-NEXT: store <8 x i32> [[TMP2]], ptr @dst32, align 2 +; AVX2-NEXT: ret void ; -; AVX-LABEL: @ctlz_8i32( -; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @src32, align 2 -; AVX-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> [[TMP1]], i1 false) -; AVX-NEXT: store <8 x i32> [[TMP2]], ptr @dst32, align 2 -; AVX-NEXT: ret void +; AVX512-LABEL: @ctlz_8i32( +; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @src32, align 2 +; AVX512-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> [[TMP1]], i1 false) +; AVX512-NEXT: store <8 x i32> [[TMP2]], ptr @dst32, align 2 +; AVX512-NEXT: ret void ; %ld0 = load i32, ptr @src32, align 2 %ld1 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 2 @@ -1063,3 +1102,6 @@ define void @ctlz_undef_32i8() #0 { } attributes #0 = { nounwind } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; SSE2: {{.*}} +; SSE4: {{.*}} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cttz.ll b/llvm/test/Transforms/SLPVectorizer/X86/cttz.ll index 22f0c3f936509..896be6f2fe213 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/cttz.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/cttz.ll @@ -142,11 +142,32 @@ define void @cttz_4i32() #0 { ; SSE-NEXT: store i32 [[CTTZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 4 ; SSE-NEXT: ret void ; -; AVX-LABEL: @cttz_4i32( -; AVX-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4 -; AVX-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP1]], i1 false) -; AVX-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 4 -; AVX-NEXT: ret void +; AVX1-LABEL: @cttz_4i32( +; AVX1-NEXT: [[LD0:%.*]] = load i32, ptr @src32, align 4 +; AVX1-NEXT: [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4 +; AVX1-NEXT: [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 4 +; AVX1-NEXT: [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 4 +; AVX1-NEXT: [[CTTZ0:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD0]], i1 false) +; AVX1-NEXT: [[CTTZ1:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD1]], i1 false) +; AVX1-NEXT: [[CTTZ2:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD2]], i1 false) +; AVX1-NEXT: [[CTTZ3:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD3]], i1 false) +; AVX1-NEXT: store i32 [[CTTZ0]], ptr @dst32, align 4 +; AVX1-NEXT: store i32 [[CTTZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 4 +; AVX1-NEXT: store i32 [[CTTZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 4 +; AVX1-NEXT: store i32 [[CTTZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 4 +; AVX1-NEXT: ret void +; +; AVX2-LABEL: @cttz_4i32( +; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4 +; AVX2-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP1]], i1 false) +; AVX2-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 4 +; AVX2-NEXT: ret void +; +; AVX512-LABEL: @cttz_4i32( +; AVX512-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4 +; AVX512-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP1]], i1 false) +; AVX512-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 4 +; AVX512-NEXT: ret void ; %ld0 = load i32, ptr @src32, align 4 %ld1 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4 @@ -191,11 +212,44 @@ define void @cttz_8i32() #0 { ; SSE-NEXT: store i32 [[CTTZ7]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 7), align 2 ; SSE-NEXT: ret void ; -; AVX-LABEL: @cttz_8i32( -; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @src32, align 2 -; AVX-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> [[TMP1]], i1 false) -; AVX-NEXT: store <8 x i32> [[TMP2]], ptr @dst32, align 2 -; AVX-NEXT: ret void +; AVX1-LABEL: @cttz_8i32( +; AVX1-NEXT: [[LD0:%.*]] = load i32, ptr @src32, align 2 +; AVX1-NEXT: [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 2 +; AVX1-NEXT: [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 2 +; AVX1-NEXT: [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 2 +; AVX1-NEXT: [[LD4:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 4), align 2 +; AVX1-NEXT: [[LD5:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 5), align 2 +; AVX1-NEXT: [[LD6:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 6), align 2 +; AVX1-NEXT: [[LD7:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 7), align 2 +; AVX1-NEXT: [[CTTZ0:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD0]], i1 false) +; AVX1-NEXT: [[CTTZ1:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD1]], i1 false) +; AVX1-NEXT: [[CTTZ2:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD2]], i1 false) +; AVX1-NEXT: [[CTTZ3:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD3]], i1 false) +; AVX1-NEXT: [[CTTZ4:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD4]], i1 false) +; AVX1-NEXT: [[CTTZ5:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD5]], i1 false) +; AVX1-NEXT: [[CTTZ6:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD6]], i1 false) +; AVX1-NEXT: [[CTTZ7:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD7]], i1 false) +; AVX1-NEXT: store i32 [[CTTZ0]], ptr @dst32, align 2 +; AVX1-NEXT: store i32 [[CTTZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 2 +; AVX1-NEXT: store i32 [[CTTZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 2 +; AVX1-NEXT: store i32 [[CTTZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 2 +; AVX1-NEXT: store i32 [[CTTZ4]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 4), align 2 +; AVX1-NEXT: store i32 [[CTTZ5]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 5), align 2 +; AVX1-NEXT: store i32 [[CTTZ6]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 6), align 2 +; AVX1-NEXT: store i32 [[CTTZ7]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 7), align 2 +; AVX1-NEXT: ret void +; +; AVX2-LABEL: @cttz_8i32( +; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @src32, align 2 +; AVX2-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> [[TMP1]], i1 false) +; AVX2-NEXT: store <8 x i32> [[TMP2]], ptr @dst32, align 2 +; AVX2-NEXT: ret void +; +; AVX512-LABEL: @cttz_8i32( +; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @src32, align 2 +; AVX512-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> [[TMP1]], i1 false) +; AVX512-NEXT: store <8 x i32> [[TMP2]], ptr @dst32, align 2 +; AVX512-NEXT: ret void ; %ld0 = load i32, ptr @src32, align 2 %ld1 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 2