Skip to content

Commit 72efe7b

Browse files
committed
Add atan2 to the Accelerate vector library mappings and to hasOptimizedCodeGen
1 parent 139d5f9 commit 72efe7b

File tree

4 files changed

+203
-0
lines changed

4 files changed

+203
-0
lines changed

llvm/include/llvm/Analysis/TargetLibraryInfo.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -410,6 +410,7 @@ class TargetLibraryInfo {
410410
// clang-format off
411411
case LibFunc_acos: case LibFunc_acosf: case LibFunc_acosl:
412412
case LibFunc_asin: case LibFunc_asinf: case LibFunc_asinl:
413+
case LibFunc_atan2: case LibFunc_atan2f: case LibFunc_atan2l:
413414
case LibFunc_atan: case LibFunc_atanf: case LibFunc_atanl:
414415
case LibFunc_ceil: case LibFunc_ceilf: case LibFunc_ceill:
415416
case LibFunc_copysign: case LibFunc_copysignf: case LibFunc_copysignl:

llvm/include/llvm/Analysis/VecFuncs.def

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,8 @@ TLI_DEFINE_VECFUNC("acosf", "vacosf", FIXED(4), "_ZGV_LLVM_N4v")
5656
TLI_DEFINE_VECFUNC("llvm.acos.f32", "vacosf", FIXED(4), "_ZGV_LLVM_N4v")
5757
TLI_DEFINE_VECFUNC("atanf", "vatanf", FIXED(4), "_ZGV_LLVM_N4v")
5858
TLI_DEFINE_VECFUNC("llvm.atan.f32", "vatanf", FIXED(4), "_ZGV_LLVM_N4v")
59+
// atan2: both the libm name and the LLVM intrinsic map to the 4-lane vatan2f.
// The mangled name carries "vv" where the unary entries nearby carry a single
// "v" -- presumably one "v" per vector parameter, since atan2f takes two
// arguments; confirm against the vector function ABI mangling rules.
TLI_DEFINE_VECFUNC("atan2f", "vatan2f", FIXED(4), "_ZGV_LLVM_N4vv")
60+
TLI_DEFINE_VECFUNC("llvm.atan2.f32", "vatan2f", FIXED(4), "_ZGV_LLVM_N4vv")
5961

6062
// Hyperbolic Functions
6163
TLI_DEFINE_VECFUNC("sinhf", "vsinhf", FIXED(4), "_ZGV_LLVM_N4v")

llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -801,6 +801,106 @@ entry:
801801
%vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
802802
ret <4 x float> %vecins.3
803803
}
804+
declare float @atan2f(float,float) readonly nounwind willreturn
805+
; Four lane-wise scalar calls to @atan2f whose results are re-inserted into a
; <4 x float> that starts from poison. Under the CHECK prefix the calls are
; expected to collapse into one @vatan2f call on the two loaded vectors; under
; the NOACCELERATE prefix all four scalar calls are expected to remain.
; NOTE(review): the RUN lines that bind each prefix to a configuration are not
; visible in this hunk -- confirm against the top of the test file.
define <4 x float> @atan2_4x(ptr %a, ptr %b) {
806+
; CHECK-LABEL: @atan2_4x(
807+
; CHECK-NEXT: entry:
808+
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
809+
; CHECK-NEXT: [[BB:%.*]] = load <4 x float>, ptr [[B:%.*]], align 16
810+
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vatan2f(<4 x float> [[TMP0]], <4 x float> [[BB]])
811+
; CHECK-NEXT: ret <4 x float> [[TMP1]]
812+
;
813+
; NOACCELERATE-LABEL: @atan2_4x(
814+
; NOACCELERATE-NEXT: entry:
815+
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
816+
; NOACCELERATE-NEXT: [[BB:%.*]] = load <4 x float>, ptr [[B:%.*]], align 16
817+
; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
818+
; NOACCELERATE-NEXT: [[VECEXTB:%.*]] = extractelement <4 x float> [[BB]], i32 0
819+
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @atan2f(float [[VECEXT]], float [[VECEXTB]])
820+
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
821+
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
822+
; NOACCELERATE-NEXT: [[VECEXTB_1:%.*]] = extractelement <4 x float> [[BB]], i32 1
823+
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @atan2f(float [[VECEXT_1]], float [[VECEXTB_1]])
824+
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
825+
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
826+
; NOACCELERATE-NEXT: [[VECEXTB_2:%.*]] = extractelement <4 x float> [[BB]], i32 2
827+
; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @atan2f(float [[VECEXT_2]], float [[VECEXTB_2]])
828+
; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
829+
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
830+
; NOACCELERATE-NEXT: [[VECEXTB_3:%.*]] = extractelement <4 x float> [[BB]], i32 3
831+
; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @atan2f(float [[VECEXT_3]], float [[VECEXTB_3]])
832+
; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
833+
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
834+
;
835+
entry:
836+
%0 = load <4 x float>, ptr %a, align 16
837+
%bb = load <4 x float>, ptr %b, align 16
838+
%vecext = extractelement <4 x float> %0, i32 0
839+
%vecextb = extractelement <4 x float> %bb, i32 0
840+
%1 = tail call fast float @atan2f(float %vecext, float %vecextb)
841+
%vecins = insertelement <4 x float> poison, float %1, i32 0
842+
%vecext.1 = extractelement <4 x float> %0, i32 1
843+
%vecextb.1 = extractelement <4 x float> %bb, i32 1
844+
%2 = tail call fast float @atan2f(float %vecext.1, float %vecextb.1)
845+
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
846+
%vecext.2 = extractelement <4 x float> %0, i32 2
847+
%vecextb.2 = extractelement <4 x float> %bb, i32 2
848+
%3 = tail call fast float @atan2f(float %vecext.2, float %vecextb.2)
849+
%vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
850+
%vecext.3 = extractelement <4 x float> %0, i32 3
851+
%vecextb.3 = extractelement <4 x float> %bb, i32 3
852+
%4 = tail call fast float @atan2f(float %vecext.3, float %vecextb.3)
853+
%vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
854+
ret <4 x float> %vecins.3
855+
}
856+
; Four lane-wise scalar calls to the @llvm.atan2.f32 intrinsic, re-inserted
; into a <4 x float> that starts from poison. Under the CHECK prefix they are
; expected to become one @vatan2f call. Under the NOACCELERATE prefix lanes 0
; and 1 stay scalar while lanes 2 and 3 are expected to be combined into one
; <2 x float> @llvm.atan2.v2f32 call and shuffled back into the result.
; NOTE(review): the RUN lines that bind each prefix to a configuration are not
; visible in this hunk -- confirm against the top of the test file.
define <4 x float> @int_atan2_4x(ptr %a, ptr %b) {
857+
; CHECK-LABEL: @int_atan2_4x(
858+
; CHECK-NEXT: entry:
859+
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
860+
; CHECK-NEXT: [[BB:%.*]] = load <4 x float>, ptr [[B:%.*]], align 16
861+
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vatan2f(<4 x float> [[TMP0]], <4 x float> [[BB]])
862+
; CHECK-NEXT: ret <4 x float> [[TMP1]]
863+
;
864+
; NOACCELERATE-LABEL: @int_atan2_4x(
865+
; NOACCELERATE-NEXT: entry:
866+
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
867+
; NOACCELERATE-NEXT: [[BB:%.*]] = load <4 x float>, ptr [[B:%.*]], align 16
868+
; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
869+
; NOACCELERATE-NEXT: [[VECEXTB:%.*]] = extractelement <4 x float> [[BB]], i32 0
870+
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.atan2.f32(float [[VECEXT]], float [[VECEXTB]])
871+
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0
872+
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
873+
; NOACCELERATE-NEXT: [[VECEXTB_1:%.*]] = extractelement <4 x float> [[BB]], i32 1
874+
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.atan2.f32(float [[VECEXT_1]], float [[VECEXTB_1]])
875+
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
876+
; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
877+
; NOACCELERATE-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[BB]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
878+
; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.atan2.v2f32(<2 x float> [[TMP3]], <2 x float> [[TMP4]])
879+
; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
880+
; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
881+
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
882+
;
883+
entry:
884+
%0 = load <4 x float>, ptr %a, align 16
885+
%bb = load <4 x float>, ptr %b, align 16
886+
%vecext = extractelement <4 x float> %0, i32 0
887+
%vecextb = extractelement <4 x float> %bb, i32 0
888+
%1 = tail call fast float @llvm.atan2.f32(float %vecext, float %vecextb)
889+
%vecins = insertelement <4 x float> poison, float %1, i32 0
890+
%vecext.1 = extractelement <4 x float> %0, i32 1
891+
%vecextb.1 = extractelement <4 x float> %bb, i32 1
892+
%2 = tail call fast float @llvm.atan2.f32(float %vecext.1, float %vecextb.1)
893+
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
894+
%vecext.2 = extractelement <4 x float> %0, i32 2
895+
%vecextb.2 = extractelement <4 x float> %bb, i32 2
896+
%3 = tail call fast float @llvm.atan2.f32(float %vecext.2, float %vecextb.2)
897+
%vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
898+
%vecext.3 = extractelement <4 x float> %0, i32 3
899+
%vecextb.3 = extractelement <4 x float> %bb, i32 3
900+
%4 = tail call fast float @llvm.atan2.f32(float %vecext.3, float %vecextb.3)
901+
%vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
902+
ret <4 x float> %vecins.3
903+
}
804904
declare float @sinhf(float) readonly nounwind willreturn
805905
define <4 x float> @sinh_4x(ptr %a) {
806906
; CHECK-LABEL: @sinh_4x(

llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -801,6 +801,106 @@ entry:
801801
%vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
802802
ret <4 x float> %vecins.3
803803
}
804+
declare float @atan2f(float,float) readonly nounwind willreturn
805+
; Four lane-wise scalar calls to @atan2f whose results are re-inserted into a
; <4 x float> that starts from undef. Under the CHECK prefix the calls are
; expected to collapse into one @vatan2f call on the two loaded vectors; under
; the NOACCELERATE prefix all four scalar calls are expected to remain.
; NOTE(review): the RUN lines that bind each prefix to a configuration are not
; visible in this hunk -- confirm against the top of the test file.
define <4 x float> @atan2_4x(ptr %a, ptr %b) {
806+
; CHECK-LABEL: @atan2_4x(
807+
; CHECK-NEXT: entry:
808+
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
809+
; CHECK-NEXT: [[BB:%.*]] = load <4 x float>, ptr [[B:%.*]], align 16
810+
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vatan2f(<4 x float> [[TMP0]], <4 x float> [[BB]])
811+
; CHECK-NEXT: ret <4 x float> [[TMP1]]
812+
;
813+
; NOACCELERATE-LABEL: @atan2_4x(
814+
; NOACCELERATE-NEXT: entry:
815+
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
816+
; NOACCELERATE-NEXT: [[BB:%.*]] = load <4 x float>, ptr [[B:%.*]], align 16
817+
; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
818+
; NOACCELERATE-NEXT: [[VECEXTB:%.*]] = extractelement <4 x float> [[BB]], i32 0
819+
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @atan2f(float [[VECEXT]], float [[VECEXTB]])
820+
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
821+
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
822+
; NOACCELERATE-NEXT: [[VECEXTB_1:%.*]] = extractelement <4 x float> [[BB]], i32 1
823+
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @atan2f(float [[VECEXT_1]], float [[VECEXTB_1]])
824+
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
825+
; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
826+
; NOACCELERATE-NEXT: [[VECEXTB_2:%.*]] = extractelement <4 x float> [[BB]], i32 2
827+
; NOACCELERATE-NEXT: [[TMP3:%.*]] = tail call fast float @atan2f(float [[VECEXT_2]], float [[VECEXTB_2]])
828+
; NOACCELERATE-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
829+
; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
830+
; NOACCELERATE-NEXT: [[VECEXTB_3:%.*]] = extractelement <4 x float> [[BB]], i32 3
831+
; NOACCELERATE-NEXT: [[TMP4:%.*]] = tail call fast float @atan2f(float [[VECEXT_3]], float [[VECEXTB_3]])
832+
; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
833+
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]]
834+
;
835+
entry:
836+
%0 = load <4 x float>, ptr %a, align 16
837+
%bb = load <4 x float>, ptr %b, align 16
838+
%vecext = extractelement <4 x float> %0, i32 0
839+
%vecextb = extractelement <4 x float> %bb, i32 0
840+
%1 = tail call fast float @atan2f(float %vecext, float %vecextb)
841+
%vecins = insertelement <4 x float> undef, float %1, i32 0
842+
%vecext.1 = extractelement <4 x float> %0, i32 1
843+
%vecextb.1 = extractelement <4 x float> %bb, i32 1
844+
%2 = tail call fast float @atan2f(float %vecext.1, float %vecextb.1)
845+
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
846+
%vecext.2 = extractelement <4 x float> %0, i32 2
847+
%vecextb.2 = extractelement <4 x float> %bb, i32 2
848+
%3 = tail call fast float @atan2f(float %vecext.2, float %vecextb.2)
849+
%vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
850+
%vecext.3 = extractelement <4 x float> %0, i32 3
851+
%vecextb.3 = extractelement <4 x float> %bb, i32 3
852+
%4 = tail call fast float @atan2f(float %vecext.3, float %vecextb.3)
853+
%vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
854+
ret <4 x float> %vecins.3
855+
}
856+
; Four lane-wise scalar calls to the @llvm.atan2.f32 intrinsic, re-inserted
; into a <4 x float> that starts from undef. Under the CHECK prefix they are
; expected to become one @vatan2f call. Under the NOACCELERATE prefix lanes 0
; and 1 stay scalar while lanes 2 and 3 are expected to be combined into one
; <2 x float> @llvm.atan2.v2f32 call and shuffled back into the result.
; NOTE(review): the RUN lines that bind each prefix to a configuration are not
; visible in this hunk -- confirm against the top of the test file.
define <4 x float> @int_atan2_4x(ptr %a, ptr %b) {
857+
; CHECK-LABEL: @int_atan2_4x(
858+
; CHECK-NEXT: entry:
859+
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
860+
; CHECK-NEXT: [[BB:%.*]] = load <4 x float>, ptr [[B:%.*]], align 16
861+
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vatan2f(<4 x float> [[TMP0]], <4 x float> [[BB]])
862+
; CHECK-NEXT: ret <4 x float> [[TMP1]]
863+
;
864+
; NOACCELERATE-LABEL: @int_atan2_4x(
865+
; NOACCELERATE-NEXT: entry:
866+
; NOACCELERATE-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 16
867+
; NOACCELERATE-NEXT: [[BB:%.*]] = load <4 x float>, ptr [[B:%.*]], align 16
868+
; NOACCELERATE-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
869+
; NOACCELERATE-NEXT: [[VECEXTB:%.*]] = extractelement <4 x float> [[BB]], i32 0
870+
; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.atan2.f32(float [[VECEXT]], float [[VECEXTB]])
871+
; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
872+
; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
873+
; NOACCELERATE-NEXT: [[VECEXTB_1:%.*]] = extractelement <4 x float> [[BB]], i32 1
874+
; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.atan2.f32(float [[VECEXT_1]], float [[VECEXTB_1]])
875+
; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
876+
; NOACCELERATE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
877+
; NOACCELERATE-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[BB]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
878+
; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.atan2.v2f32(<2 x float> [[TMP3]], <2 x float> [[TMP4]])
879+
; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
880+
; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
881+
; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]]
882+
;
883+
entry:
884+
%0 = load <4 x float>, ptr %a, align 16
885+
%bb = load <4 x float>, ptr %b, align 16
886+
%vecext = extractelement <4 x float> %0, i32 0
887+
%vecextb = extractelement <4 x float> %bb, i32 0
888+
%1 = tail call fast float @llvm.atan2.f32(float %vecext, float %vecextb)
889+
%vecins = insertelement <4 x float> undef, float %1, i32 0
890+
%vecext.1 = extractelement <4 x float> %0, i32 1
891+
%vecextb.1 = extractelement <4 x float> %bb, i32 1
892+
%2 = tail call fast float @llvm.atan2.f32(float %vecext.1, float %vecextb.1)
893+
%vecins.1 = insertelement <4 x float> %vecins, float %2, i32 1
894+
%vecext.2 = extractelement <4 x float> %0, i32 2
895+
%vecextb.2 = extractelement <4 x float> %bb, i32 2
896+
%3 = tail call fast float @llvm.atan2.f32(float %vecext.2, float %vecextb.2)
897+
%vecins.2 = insertelement <4 x float> %vecins.1, float %3, i32 2
898+
%vecext.3 = extractelement <4 x float> %0, i32 3
899+
%vecextb.3 = extractelement <4 x float> %bb, i32 3
900+
%4 = tail call fast float @llvm.atan2.f32(float %vecext.3, float %vecextb.3)
901+
%vecins.3 = insertelement <4 x float> %vecins.2, float %4, i32 3
902+
ret <4 x float> %vecins.3
903+
}
804904
declare float @sinhf(float) readonly nounwind willreturn
805905
define <4 x float> @sinh_4x(ptr %a) {
806906
; CHECK-LABEL: @sinh_4x(

0 commit comments

Comments
 (0)