[Clang] Replace some x86 sqrt builtins with the generic __builtin_elementwise_sqrt versions #165682

philnik777 · 2025-10-30T09:26:28Z

No description provided.

[Clang] Replace some x86 sqrt builtins with the generic __builtin_elementwise_sqrt versions

llvmbot · 2025-10-30T15:40:07Z

@llvm/pr-subscribers-backend-x86

Author: Nikolas Klauser (philnik777)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/165682.diff

8 Files Affected:

(modified) clang/include/clang/Basic/BuiltinsX86.td (-26)
(modified) clang/lib/CodeGen/TargetBuiltins/X86.cpp (+19-45)
(modified) clang/lib/Headers/avx10_2_512bf16intrin.h (+1-1)
(modified) clang/lib/Headers/avx10_2bf16intrin.h (+2-2)
(modified) clang/lib/Headers/avx512vlfp16intrin.h (+2-2)
(modified) clang/lib/Headers/avxintrin.h (+4-8)
(modified) clang/lib/Headers/emmintrin.h (+2-3)
(modified) clang/lib/Headers/xmmintrin.h (+5-8)

diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index acd8f70c4a5f2..23808056296bb 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -199,8 +199,6 @@ let Features = "sse", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in
   def rcpss : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
   def rsqrtps : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
   def rsqrtss : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
-  def sqrtps : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
-  def sqrtss : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
   def shufps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">;
 }
 
@@ -222,8 +220,6 @@ let Features = "sse2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] i
   def pshuflw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Constant int)">;
   def pshufhw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Constant int)">;
   def psadbw128 : X86Builtin<"_Vector<2, long long int>(_Vector<16, char>, _Vector<16, char>)">;
-  def sqrtpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>)">;
-  def sqrtsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>)">;
   def shufpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">;
   def cvtpd2dq : X86Builtin<"_Vector<2, long long int>(_Vector<2, double>)">;
   def cvtpd2ps : X86Builtin<"_Vector<4, float>(_Vector<2, double>)">;
@@ -501,8 +497,6 @@ let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in
   def vinsertf128_pd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<2, double>, _Constant int)">;
   def vinsertf128_ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<4, float>, _Constant int)">;
   def vinsertf128_si256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>, _Constant int)">;
-  def sqrtpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>)">;
-  def sqrtps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>)">;
   def rsqrtps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>)">;
   def rcpps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>)">;
   def roundpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int)">;
@@ -3603,14 +3597,6 @@ let Features = "avx512fp16", Attributes = [NoThrow, Const, RequiredVectorWidth<1
   def reducesh_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int, _Constant int)">;
 }
 
-let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
-  def sqrtph : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>)">;
-}
-
-let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
-  def sqrtph256 : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>)">;
-}
-
 let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
   def sqrtph512 : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Constant int)">;
 }
@@ -5129,15 +5115,3 @@ let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<
 let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
   def vgetmantbf16512_mask : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Constant int, _Vector<32, __bf16>, unsigned int)">;
 }
-
-let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
-  def vsqrtbf16 : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>)">;
-}
-
-let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
-  def vsqrtbf16256 : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>)">;
-}
-
-let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
-  def vsqrtbf16512 : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>)">;
-}
diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
index a4974e45caf10..f96c0bab0fd34 100644
--- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
@@ -2183,21 +2183,6 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
     return Builder.CreateBitCast(Res, Ops[0]->getType());
   }
 
-  case X86::BI__builtin_ia32_sqrtss:
-  case X86::BI__builtin_ia32_sqrtsd: {
-    Value *A = Builder.CreateExtractElement(Ops[0], (uint64_t)0);
-    Function *F;
-    if (Builder.getIsFPConstrained()) {
-      CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
-      F = CGM.getIntrinsic(Intrinsic::experimental_constrained_sqrt,
-                           A->getType());
-      A = Builder.CreateConstrainedFPCall(F, {A});
-    } else {
-      F = CGM.getIntrinsic(Intrinsic::sqrt, A->getType());
-      A = Builder.CreateCall(F, {A});
-    }
-    return Builder.CreateInsertElement(Ops[0], A, (uint64_t)0);
-  }
   case X86::BI__builtin_ia32_sqrtsh_round_mask:
   case X86::BI__builtin_ia32_sqrtsd_round_mask:
   case X86::BI__builtin_ia32_sqrtss_round_mask: {
@@ -2237,40 +2222,29 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
     A = EmitX86ScalarSelect(*this, Ops[3], A, Src);
     return Builder.CreateInsertElement(Ops[0], A, (uint64_t)0);
   }
-  case X86::BI__builtin_ia32_sqrtpd256:
-  case X86::BI__builtin_ia32_sqrtpd:
-  case X86::BI__builtin_ia32_sqrtps256:
-  case X86::BI__builtin_ia32_sqrtps:
-  case X86::BI__builtin_ia32_sqrtph256:
-  case X86::BI__builtin_ia32_sqrtph:
   case X86::BI__builtin_ia32_sqrtph512:
-  case X86::BI__builtin_ia32_vsqrtbf16256:
-  case X86::BI__builtin_ia32_vsqrtbf16:
-  case X86::BI__builtin_ia32_vsqrtbf16512:
   case X86::BI__builtin_ia32_sqrtps512:
   case X86::BI__builtin_ia32_sqrtpd512: {
-    if (Ops.size() == 2) {
-      unsigned CC = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
-      // Support only if the rounding mode is 4 (AKA CUR_DIRECTION),
-      // otherwise keep the intrinsic.
-      if (CC != 4) {
-        Intrinsic::ID IID;
-
-        switch (BuiltinID) {
-        default:
-          llvm_unreachable("Unsupported intrinsic!");
-        case X86::BI__builtin_ia32_sqrtph512:
-          IID = Intrinsic::x86_avx512fp16_sqrt_ph_512;
-          break;
-        case X86::BI__builtin_ia32_sqrtps512:
-          IID = Intrinsic::x86_avx512_sqrt_ps_512;
-          break;
-        case X86::BI__builtin_ia32_sqrtpd512:
-          IID = Intrinsic::x86_avx512_sqrt_pd_512;
-          break;
-        }
-        return Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
+    unsigned CC = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
+    // Support only if the rounding mode is 4 (AKA CUR_DIRECTION),
+    // otherwise keep the intrinsic.
+    if (CC != 4) {
+      Intrinsic::ID IID;
+
+      switch (BuiltinID) {
+      default:
+        llvm_unreachable("Unsupported intrinsic!");
+      case X86::BI__builtin_ia32_sqrtph512:
+        IID = Intrinsic::x86_avx512fp16_sqrt_ph_512;
+        break;
+      case X86::BI__builtin_ia32_sqrtps512:
+        IID = Intrinsic::x86_avx512_sqrt_ps_512;
+        break;
+      case X86::BI__builtin_ia32_sqrtpd512:
+        IID = Intrinsic::x86_avx512_sqrt_pd_512;
+        break;
       }
+      return Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
     }
     if (Builder.getIsFPConstrained()) {
       CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
diff --git a/clang/lib/Headers/avx10_2_512bf16intrin.h b/clang/lib/Headers/avx10_2_512bf16intrin.h
index 95e9bd7a36f9b..84075a6cabeeb 100644
--- a/clang/lib/Headers/avx10_2_512bf16intrin.h
+++ b/clang/lib/Headers/avx10_2_512bf16intrin.h
@@ -423,7 +423,7 @@ _mm512_maskz_rsqrt_pbh(__mmask32 __U, __m512bh __A) {
       (__v32bf)_mm512_setzero_pbh(), (__mmask32)(__U)))
 
 static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_sqrt_pbh(__m512bh __A) {
-  return (__m512bh)__builtin_ia32_vsqrtbf16512((__v32bf)__A);
+  return __builtin_elementwise_sqrt(__A);
 }
 
 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
diff --git a/clang/lib/Headers/avx10_2bf16intrin.h b/clang/lib/Headers/avx10_2bf16intrin.h
index 0c7f381f04fa5..ca1eca8fb5db6 100644
--- a/clang/lib/Headers/avx10_2bf16intrin.h
+++ b/clang/lib/Headers/avx10_2bf16intrin.h
@@ -818,7 +818,7 @@ _mm_maskz_rsqrt_pbh(__mmask8 __U, __m128bh __A) {
       (__v8bf)_mm_setzero_pbh(), (__mmask8)(__U)))
 
 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_sqrt_pbh(__m256bh __A) {
-  return (__m256bh)__builtin_ia32_vsqrtbf16256((__v16bf)__A);
+  return __builtin_elementwise_sqrt(__A);
 }
 
 static __inline__ __m256bh __DEFAULT_FN_ATTRS256
@@ -835,7 +835,7 @@ _mm256_maskz_sqrt_pbh(__mmask16 __U, __m256bh __A) {
 }
 
 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_sqrt_pbh(__m128bh __A) {
-  return (__m128bh)__builtin_ia32_vsqrtbf16((__v8bf)__A);
+  return __builtin_elementwise_sqrt(__A);
 }
 
 static __inline__ __m128bh __DEFAULT_FN_ATTRS128
diff --git a/clang/lib/Headers/avx512vlfp16intrin.h b/clang/lib/Headers/avx512vlfp16intrin.h
index 98ad9b54eef39..99bb6c52208bd 100644
--- a/clang/lib/Headers/avx512vlfp16intrin.h
+++ b/clang/lib/Headers/avx512vlfp16intrin.h
@@ -611,7 +611,7 @@ _mm256_maskz_scalef_ph(__mmask16 __U, __m256h __A, __m256h __B) {
                                             (__mmask16)(U)))
 
 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_ph(__m128h __a) {
-  return __builtin_ia32_sqrtph((__v8hf)__a);
+  return __builtin_elementwise_sqrt(__a);
 }
 
 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_ph(__m128h __W,
@@ -628,7 +628,7 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_ph(__mmask8 __U,
 }
 
 static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_sqrt_ph(__m256h __a) {
-  return (__m256h)__builtin_ia32_sqrtph256((__v16hf)__a);
+  return __builtin_elementwise_sqrt(__a);
 }
 
 static __inline__ __m256h __DEFAULT_FN_ATTRS256
diff --git a/clang/lib/Headers/avxintrin.h b/clang/lib/Headers/avxintrin.h
index b8cfaee7cfb46..435157e00fb3b 100644
--- a/clang/lib/Headers/avxintrin.h
+++ b/clang/lib/Headers/avxintrin.h
@@ -344,10 +344,8 @@ static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_mul_ps(__m256 __a,
 ///    A 256-bit vector of [4 x double].
 /// \returns A 256-bit vector of [4 x double] containing the square roots of the
 ///    values in the operand.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_sqrt_pd(__m256d __a)
-{
-  return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
+static __inline __m256d __DEFAULT_FN_ATTRS _mm256_sqrt_pd(__m256d __a) {
+  return __builtin_elementwise_sqrt(__a);
 }
 
 /// Calculates the square roots of the values in a 256-bit vector of
@@ -361,10 +359,8 @@ _mm256_sqrt_pd(__m256d __a)
 ///    A 256-bit vector of [8 x float].
 /// \returns A 256-bit vector of [8 x float] containing the square roots of the
 ///    values in the operand.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_sqrt_ps(__m256 __a)
-{
-  return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
+static __inline __m256 __DEFAULT_FN_ATTRS _mm256_sqrt_ps(__m256 __a) {
+  return __builtin_elementwise_sqrt(__a);
 }
 
 /// Calculates the reciprocal square roots of the values in a 256-bit
diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h
index e15a260514f2d..dd2e57d9b91b9 100644
--- a/clang/lib/Headers/emmintrin.h
+++ b/clang/lib/Headers/emmintrin.h
@@ -248,8 +248,7 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_div_pd(__m128d __a,
 ///    bits are copied from the upper 64 bits of operand \a __a.
 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a,
                                                          __m128d __b) {
-  __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
-  return __extension__(__m128d){__c[0], __a[1]};
+  return __extension__(__m128d){__builtin_elementwise_sqrt(__b[0]), __a[1]};
 }
 
 /// Calculates the square root of the each of two values stored in a
@@ -264,7 +263,7 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a,
 /// \returns A 128-bit vector of [2 x double] containing the square roots of the
 ///    values in the operand.
 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a) {
-  return __builtin_ia32_sqrtpd((__v2df)__a);
+  return __builtin_elementwise_sqrt(__a);
 }
 
 /// Compares lower 64-bit double-precision values of both operands, and
diff --git a/clang/lib/Headers/xmmintrin.h b/clang/lib/Headers/xmmintrin.h
index 6d44cff46661f..cb1665b6b1b71 100644
--- a/clang/lib/Headers/xmmintrin.h
+++ b/clang/lib/Headers/xmmintrin.h
@@ -239,10 +239,9 @@ _mm_div_ps(__m128 __a, __m128 __b) {
 ///    used in the calculation.
 /// \returns A 128-bit vector of [4 x float] containing the square root of the
 ///    value in the low-order bits of the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_sqrt_ss(__m128 __a)
-{
-  return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
+static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sqrt_ss(__m128 __a) {
+  __a[0] = __builtin_elementwise_sqrt(__a[0]);
+  return __a;
 }
 
 /// Calculates the square roots of the values stored in a 128-bit vector
@@ -256,10 +255,8 @@ _mm_sqrt_ss(__m128 __a)
 ///    A 128-bit vector of [4 x float].
 /// \returns A 128-bit vector of [4 x float] containing the square roots of the
 ///    values in the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_sqrt_ps(__m128 __a)
-{
-  return __builtin_ia32_sqrtps((__v4sf)__a);
+static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sqrt_ps(__m128 __a) {
+  return __builtin_elementwise_sqrt(__a);
 }
 
 /// Calculates the approximate reciprocal of the value stored in the

llvmbot · 2025-10-30T15:40:08Z

@llvm/pr-subscribers-clang-codegen

Author: Nikolas Klauser (philnik777)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/165682.diff

8 Files Affected:

(modified) clang/include/clang/Basic/BuiltinsX86.td (-26)
(modified) clang/lib/CodeGen/TargetBuiltins/X86.cpp (+19-45)
(modified) clang/lib/Headers/avx10_2_512bf16intrin.h (+1-1)
(modified) clang/lib/Headers/avx10_2bf16intrin.h (+2-2)
(modified) clang/lib/Headers/avx512vlfp16intrin.h (+2-2)
(modified) clang/lib/Headers/avxintrin.h (+4-8)
(modified) clang/lib/Headers/emmintrin.h (+2-3)
(modified) clang/lib/Headers/xmmintrin.h (+5-8)

diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index acd8f70c4a5f2..23808056296bb 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -199,8 +199,6 @@ let Features = "sse", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in
   def rcpss : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
   def rsqrtps : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
   def rsqrtss : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
-  def sqrtps : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
-  def sqrtss : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
   def shufps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">;
 }
 
@@ -222,8 +220,6 @@ let Features = "sse2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] i
   def pshuflw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Constant int)">;
   def pshufhw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Constant int)">;
   def psadbw128 : X86Builtin<"_Vector<2, long long int>(_Vector<16, char>, _Vector<16, char>)">;
-  def sqrtpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>)">;
-  def sqrtsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>)">;
   def shufpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">;
   def cvtpd2dq : X86Builtin<"_Vector<2, long long int>(_Vector<2, double>)">;
   def cvtpd2ps : X86Builtin<"_Vector<4, float>(_Vector<2, double>)">;
@@ -501,8 +497,6 @@ let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in
   def vinsertf128_pd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<2, double>, _Constant int)">;
   def vinsertf128_ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<4, float>, _Constant int)">;
   def vinsertf128_si256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>, _Constant int)">;
-  def sqrtpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>)">;
-  def sqrtps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>)">;
   def rsqrtps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>)">;
   def rcpps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>)">;
   def roundpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int)">;
@@ -3603,14 +3597,6 @@ let Features = "avx512fp16", Attributes = [NoThrow, Const, RequiredVectorWidth<1
   def reducesh_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int, _Constant int)">;
 }
 
-let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
-  def sqrtph : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>)">;
-}
-
-let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
-  def sqrtph256 : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>)">;
-}
-
 let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
   def sqrtph512 : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Constant int)">;
 }
@@ -5129,15 +5115,3 @@ let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<
 let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
   def vgetmantbf16512_mask : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Constant int, _Vector<32, __bf16>, unsigned int)">;
 }
-
-let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
-  def vsqrtbf16 : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>)">;
-}
-
-let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
-  def vsqrtbf16256 : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>)">;
-}
-
-let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
-  def vsqrtbf16512 : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>)">;
-}
diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
index a4974e45caf10..f96c0bab0fd34 100644
--- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
@@ -2183,21 +2183,6 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
     return Builder.CreateBitCast(Res, Ops[0]->getType());
   }
 
-  case X86::BI__builtin_ia32_sqrtss:
-  case X86::BI__builtin_ia32_sqrtsd: {
-    Value *A = Builder.CreateExtractElement(Ops[0], (uint64_t)0);
-    Function *F;
-    if (Builder.getIsFPConstrained()) {
-      CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
-      F = CGM.getIntrinsic(Intrinsic::experimental_constrained_sqrt,
-                           A->getType());
-      A = Builder.CreateConstrainedFPCall(F, {A});
-    } else {
-      F = CGM.getIntrinsic(Intrinsic::sqrt, A->getType());
-      A = Builder.CreateCall(F, {A});
-    }
-    return Builder.CreateInsertElement(Ops[0], A, (uint64_t)0);
-  }
   case X86::BI__builtin_ia32_sqrtsh_round_mask:
   case X86::BI__builtin_ia32_sqrtsd_round_mask:
   case X86::BI__builtin_ia32_sqrtss_round_mask: {
@@ -2237,40 +2222,29 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
     A = EmitX86ScalarSelect(*this, Ops[3], A, Src);
     return Builder.CreateInsertElement(Ops[0], A, (uint64_t)0);
   }
-  case X86::BI__builtin_ia32_sqrtpd256:
-  case X86::BI__builtin_ia32_sqrtpd:
-  case X86::BI__builtin_ia32_sqrtps256:
-  case X86::BI__builtin_ia32_sqrtps:
-  case X86::BI__builtin_ia32_sqrtph256:
-  case X86::BI__builtin_ia32_sqrtph:
   case X86::BI__builtin_ia32_sqrtph512:
-  case X86::BI__builtin_ia32_vsqrtbf16256:
-  case X86::BI__builtin_ia32_vsqrtbf16:
-  case X86::BI__builtin_ia32_vsqrtbf16512:
   case X86::BI__builtin_ia32_sqrtps512:
   case X86::BI__builtin_ia32_sqrtpd512: {
-    if (Ops.size() == 2) {
-      unsigned CC = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
-      // Support only if the rounding mode is 4 (AKA CUR_DIRECTION),
-      // otherwise keep the intrinsic.
-      if (CC != 4) {
-        Intrinsic::ID IID;
-
-        switch (BuiltinID) {
-        default:
-          llvm_unreachable("Unsupported intrinsic!");
-        case X86::BI__builtin_ia32_sqrtph512:
-          IID = Intrinsic::x86_avx512fp16_sqrt_ph_512;
-          break;
-        case X86::BI__builtin_ia32_sqrtps512:
-          IID = Intrinsic::x86_avx512_sqrt_ps_512;
-          break;
-        case X86::BI__builtin_ia32_sqrtpd512:
-          IID = Intrinsic::x86_avx512_sqrt_pd_512;
-          break;
-        }
-        return Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
+    unsigned CC = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
+    // Support only if the rounding mode is 4 (AKA CUR_DIRECTION),
+    // otherwise keep the intrinsic.
+    if (CC != 4) {
+      Intrinsic::ID IID;
+
+      switch (BuiltinID) {
+      default:
+        llvm_unreachable("Unsupported intrinsic!");
+      case X86::BI__builtin_ia32_sqrtph512:
+        IID = Intrinsic::x86_avx512fp16_sqrt_ph_512;
+        break;
+      case X86::BI__builtin_ia32_sqrtps512:
+        IID = Intrinsic::x86_avx512_sqrt_ps_512;
+        break;
+      case X86::BI__builtin_ia32_sqrtpd512:
+        IID = Intrinsic::x86_avx512_sqrt_pd_512;
+        break;
       }
+      return Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
     }
     if (Builder.getIsFPConstrained()) {
       CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
diff --git a/clang/lib/Headers/avx10_2_512bf16intrin.h b/clang/lib/Headers/avx10_2_512bf16intrin.h
index 95e9bd7a36f9b..84075a6cabeeb 100644
--- a/clang/lib/Headers/avx10_2_512bf16intrin.h
+++ b/clang/lib/Headers/avx10_2_512bf16intrin.h
@@ -423,7 +423,7 @@ _mm512_maskz_rsqrt_pbh(__mmask32 __U, __m512bh __A) {
       (__v32bf)_mm512_setzero_pbh(), (__mmask32)(__U)))
 
 static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_sqrt_pbh(__m512bh __A) {
-  return (__m512bh)__builtin_ia32_vsqrtbf16512((__v32bf)__A);
+  return __builtin_elementwise_sqrt(__A);
 }
 
 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
diff --git a/clang/lib/Headers/avx10_2bf16intrin.h b/clang/lib/Headers/avx10_2bf16intrin.h
index 0c7f381f04fa5..ca1eca8fb5db6 100644
--- a/clang/lib/Headers/avx10_2bf16intrin.h
+++ b/clang/lib/Headers/avx10_2bf16intrin.h
@@ -818,7 +818,7 @@ _mm_maskz_rsqrt_pbh(__mmask8 __U, __m128bh __A) {
       (__v8bf)_mm_setzero_pbh(), (__mmask8)(__U)))
 
 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_sqrt_pbh(__m256bh __A) {
-  return (__m256bh)__builtin_ia32_vsqrtbf16256((__v16bf)__A);
+  return __builtin_elementwise_sqrt(__A);
 }
 
 static __inline__ __m256bh __DEFAULT_FN_ATTRS256
@@ -835,7 +835,7 @@ _mm256_maskz_sqrt_pbh(__mmask16 __U, __m256bh __A) {
 }
 
 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_sqrt_pbh(__m128bh __A) {
-  return (__m128bh)__builtin_ia32_vsqrtbf16((__v8bf)__A);
+  return __builtin_elementwise_sqrt(__A);
 }
 
 static __inline__ __m128bh __DEFAULT_FN_ATTRS128
diff --git a/clang/lib/Headers/avx512vlfp16intrin.h b/clang/lib/Headers/avx512vlfp16intrin.h
index 98ad9b54eef39..99bb6c52208bd 100644
--- a/clang/lib/Headers/avx512vlfp16intrin.h
+++ b/clang/lib/Headers/avx512vlfp16intrin.h
@@ -611,7 +611,7 @@ _mm256_maskz_scalef_ph(__mmask16 __U, __m256h __A, __m256h __B) {
                                             (__mmask16)(U)))
 
 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_ph(__m128h __a) {
-  return __builtin_ia32_sqrtph((__v8hf)__a);
+  return __builtin_elementwise_sqrt(__a);
 }
 
 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_ph(__m128h __W,
@@ -628,7 +628,7 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_ph(__mmask8 __U,
 }
 
 static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_sqrt_ph(__m256h __a) {
-  return (__m256h)__builtin_ia32_sqrtph256((__v16hf)__a);
+  return __builtin_elementwise_sqrt(__a);
 }
 
 static __inline__ __m256h __DEFAULT_FN_ATTRS256
diff --git a/clang/lib/Headers/avxintrin.h b/clang/lib/Headers/avxintrin.h
index b8cfaee7cfb46..435157e00fb3b 100644
--- a/clang/lib/Headers/avxintrin.h
+++ b/clang/lib/Headers/avxintrin.h
@@ -344,10 +344,8 @@ static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_mul_ps(__m256 __a,
 ///    A 256-bit vector of [4 x double].
 /// \returns A 256-bit vector of [4 x double] containing the square roots of the
 ///    values in the operand.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_sqrt_pd(__m256d __a)
-{
-  return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
+static __inline __m256d __DEFAULT_FN_ATTRS _mm256_sqrt_pd(__m256d __a) {
+  return __builtin_elementwise_sqrt(__a);
 }
 
 /// Calculates the square roots of the values in a 256-bit vector of
@@ -361,10 +359,8 @@ _mm256_sqrt_pd(__m256d __a)
 ///    A 256-bit vector of [8 x float].
 /// \returns A 256-bit vector of [8 x float] containing the square roots of the
 ///    values in the operand.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_sqrt_ps(__m256 __a)
-{
-  return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
+static __inline __m256 __DEFAULT_FN_ATTRS _mm256_sqrt_ps(__m256 __a) {
+  return __builtin_elementwise_sqrt(__a);
 }
 
 /// Calculates the reciprocal square roots of the values in a 256-bit
diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h
index e15a260514f2d..dd2e57d9b91b9 100644
--- a/clang/lib/Headers/emmintrin.h
+++ b/clang/lib/Headers/emmintrin.h
@@ -248,8 +248,7 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_div_pd(__m128d __a,
 ///    bits are copied from the upper 64 bits of operand \a __a.
 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a,
                                                          __m128d __b) {
-  __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
-  return __extension__(__m128d){__c[0], __a[1]};
+  return __extension__(__m128d){__builtin_elementwise_sqrt(__b[0]), __a[1]};
 }
 
 /// Calculates the square root of the each of two values stored in a
@@ -264,7 +263,7 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a,
 /// \returns A 128-bit vector of [2 x double] containing the square roots of the
 ///    values in the operand.
 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a) {
-  return __builtin_ia32_sqrtpd((__v2df)__a);
+  return __builtin_elementwise_sqrt(__a);
 }
 
 /// Compares lower 64-bit double-precision values of both operands, and
diff --git a/clang/lib/Headers/xmmintrin.h b/clang/lib/Headers/xmmintrin.h
index 6d44cff46661f..cb1665b6b1b71 100644
--- a/clang/lib/Headers/xmmintrin.h
+++ b/clang/lib/Headers/xmmintrin.h
@@ -239,10 +239,9 @@ _mm_div_ps(__m128 __a, __m128 __b) {
 ///    used in the calculation.
 /// \returns A 128-bit vector of [4 x float] containing the square root of the
 ///    value in the low-order bits of the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_sqrt_ss(__m128 __a)
-{
-  return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
+static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sqrt_ss(__m128 __a) {
+  __a[0] = __builtin_elementwise_sqrt(__a[0]);
+  return __a;
 }
 
 /// Calculates the square roots of the values stored in a 128-bit vector
@@ -256,10 +255,8 @@ _mm_sqrt_ss(__m128 __a)
 ///    A 128-bit vector of [4 x float].
 /// \returns A 128-bit vector of [4 x float] containing the square roots of the
 ///    values in the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_sqrt_ps(__m128 __a)
-{
-  return __builtin_ia32_sqrtps((__v4sf)__a);
+static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sqrt_ps(__m128 __a) {
+  return __builtin_elementwise_sqrt(__a);
 }
 
 /// Calculates the approximate reciprocal of the value stored in the

llvmbot · 2025-10-30T15:40:08Z

@llvm/pr-subscribers-clang

Author: Nikolas Klauser (philnik777)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/165682.diff

8 Files Affected:

(modified) clang/include/clang/Basic/BuiltinsX86.td (-26)
(modified) clang/lib/CodeGen/TargetBuiltins/X86.cpp (+19-45)
(modified) clang/lib/Headers/avx10_2_512bf16intrin.h (+1-1)
(modified) clang/lib/Headers/avx10_2bf16intrin.h (+2-2)
(modified) clang/lib/Headers/avx512vlfp16intrin.h (+2-2)
(modified) clang/lib/Headers/avxintrin.h (+4-8)
(modified) clang/lib/Headers/emmintrin.h (+2-3)
(modified) clang/lib/Headers/xmmintrin.h (+5-8)

diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index acd8f70c4a5f2..23808056296bb 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -199,8 +199,6 @@ let Features = "sse", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in
   def rcpss : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
   def rsqrtps : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
   def rsqrtss : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
-  def sqrtps : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
-  def sqrtss : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">;
   def shufps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">;
 }
 
@@ -222,8 +220,6 @@ let Features = "sse2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] i
   def pshuflw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Constant int)">;
   def pshufhw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Constant int)">;
   def psadbw128 : X86Builtin<"_Vector<2, long long int>(_Vector<16, char>, _Vector<16, char>)">;
-  def sqrtpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>)">;
-  def sqrtsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>)">;
   def shufpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">;
   def cvtpd2dq : X86Builtin<"_Vector<2, long long int>(_Vector<2, double>)">;
   def cvtpd2ps : X86Builtin<"_Vector<4, float>(_Vector<2, double>)">;
@@ -501,8 +497,6 @@ let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in
   def vinsertf128_pd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<2, double>, _Constant int)">;
   def vinsertf128_ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<4, float>, _Constant int)">;
   def vinsertf128_si256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>, _Constant int)">;
-  def sqrtpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>)">;
-  def sqrtps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>)">;
   def rsqrtps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>)">;
   def rcpps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>)">;
   def roundpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int)">;
@@ -3603,14 +3597,6 @@ let Features = "avx512fp16", Attributes = [NoThrow, Const, RequiredVectorWidth<1
   def reducesh_mask : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>, _Vector<8, _Float16>, _Vector<8, _Float16>, unsigned char, _Constant int, _Constant int)">;
 }
 
-let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
-  def sqrtph : X86Builtin<"_Vector<8, _Float16>(_Vector<8, _Float16>)">;
-}
-
-let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
-  def sqrtph256 : X86Builtin<"_Vector<16, _Float16>(_Vector<16, _Float16>)">;
-}
-
 let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
   def sqrtph512 : X86Builtin<"_Vector<32, _Float16>(_Vector<32, _Float16>, _Constant int)">;
 }
@@ -5129,15 +5115,3 @@ let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<
 let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
   def vgetmantbf16512_mask : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>, _Constant int, _Vector<32, __bf16>, unsigned int)">;
 }
-
-let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
-  def vsqrtbf16 : X86Builtin<"_Vector<8, __bf16>(_Vector<8, __bf16>)">;
-}
-
-let Features = "avx10.2-256", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
-  def vsqrtbf16256 : X86Builtin<"_Vector<16, __bf16>(_Vector<16, __bf16>)">;
-}
-
-let Features = "avx10.2-512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
-  def vsqrtbf16512 : X86Builtin<"_Vector<32, __bf16>(_Vector<32, __bf16>)">;
-}
diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
index a4974e45caf10..f96c0bab0fd34 100644
--- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
@@ -2183,21 +2183,6 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
     return Builder.CreateBitCast(Res, Ops[0]->getType());
   }
 
-  case X86::BI__builtin_ia32_sqrtss:
-  case X86::BI__builtin_ia32_sqrtsd: {
-    Value *A = Builder.CreateExtractElement(Ops[0], (uint64_t)0);
-    Function *F;
-    if (Builder.getIsFPConstrained()) {
-      CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
-      F = CGM.getIntrinsic(Intrinsic::experimental_constrained_sqrt,
-                           A->getType());
-      A = Builder.CreateConstrainedFPCall(F, {A});
-    } else {
-      F = CGM.getIntrinsic(Intrinsic::sqrt, A->getType());
-      A = Builder.CreateCall(F, {A});
-    }
-    return Builder.CreateInsertElement(Ops[0], A, (uint64_t)0);
-  }
   case X86::BI__builtin_ia32_sqrtsh_round_mask:
   case X86::BI__builtin_ia32_sqrtsd_round_mask:
   case X86::BI__builtin_ia32_sqrtss_round_mask: {
@@ -2237,40 +2222,29 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
     A = EmitX86ScalarSelect(*this, Ops[3], A, Src);
     return Builder.CreateInsertElement(Ops[0], A, (uint64_t)0);
   }
-  case X86::BI__builtin_ia32_sqrtpd256:
-  case X86::BI__builtin_ia32_sqrtpd:
-  case X86::BI__builtin_ia32_sqrtps256:
-  case X86::BI__builtin_ia32_sqrtps:
-  case X86::BI__builtin_ia32_sqrtph256:
-  case X86::BI__builtin_ia32_sqrtph:
   case X86::BI__builtin_ia32_sqrtph512:
-  case X86::BI__builtin_ia32_vsqrtbf16256:
-  case X86::BI__builtin_ia32_vsqrtbf16:
-  case X86::BI__builtin_ia32_vsqrtbf16512:
   case X86::BI__builtin_ia32_sqrtps512:
   case X86::BI__builtin_ia32_sqrtpd512: {
-    if (Ops.size() == 2) {
-      unsigned CC = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
-      // Support only if the rounding mode is 4 (AKA CUR_DIRECTION),
-      // otherwise keep the intrinsic.
-      if (CC != 4) {
-        Intrinsic::ID IID;
-
-        switch (BuiltinID) {
-        default:
-          llvm_unreachable("Unsupported intrinsic!");
-        case X86::BI__builtin_ia32_sqrtph512:
-          IID = Intrinsic::x86_avx512fp16_sqrt_ph_512;
-          break;
-        case X86::BI__builtin_ia32_sqrtps512:
-          IID = Intrinsic::x86_avx512_sqrt_ps_512;
-          break;
-        case X86::BI__builtin_ia32_sqrtpd512:
-          IID = Intrinsic::x86_avx512_sqrt_pd_512;
-          break;
-        }
-        return Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
+    unsigned CC = cast<llvm::ConstantInt>(Ops[1])->getZExtValue();
+    // Support only if the rounding mode is 4 (AKA CUR_DIRECTION),
+    // otherwise keep the intrinsic.
+    if (CC != 4) {
+      Intrinsic::ID IID;
+
+      switch (BuiltinID) {
+      default:
+        llvm_unreachable("Unsupported intrinsic!");
+      case X86::BI__builtin_ia32_sqrtph512:
+        IID = Intrinsic::x86_avx512fp16_sqrt_ph_512;
+        break;
+      case X86::BI__builtin_ia32_sqrtps512:
+        IID = Intrinsic::x86_avx512_sqrt_ps_512;
+        break;
+      case X86::BI__builtin_ia32_sqrtpd512:
+        IID = Intrinsic::x86_avx512_sqrt_pd_512;
+        break;
       }
+      return Builder.CreateCall(CGM.getIntrinsic(IID), Ops);
     }
     if (Builder.getIsFPConstrained()) {
       CodeGenFunction::CGFPOptionsRAII FPOptsRAII(*this, E);
diff --git a/clang/lib/Headers/avx10_2_512bf16intrin.h b/clang/lib/Headers/avx10_2_512bf16intrin.h
index 95e9bd7a36f9b..84075a6cabeeb 100644
--- a/clang/lib/Headers/avx10_2_512bf16intrin.h
+++ b/clang/lib/Headers/avx10_2_512bf16intrin.h
@@ -423,7 +423,7 @@ _mm512_maskz_rsqrt_pbh(__mmask32 __U, __m512bh __A) {
       (__v32bf)_mm512_setzero_pbh(), (__mmask32)(__U)))
 
 static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_sqrt_pbh(__m512bh __A) {
-  return (__m512bh)__builtin_ia32_vsqrtbf16512((__v32bf)__A);
+  return __builtin_elementwise_sqrt(__A);
 }
 
 static __inline__ __m512bh __DEFAULT_FN_ATTRS512
diff --git a/clang/lib/Headers/avx10_2bf16intrin.h b/clang/lib/Headers/avx10_2bf16intrin.h
index 0c7f381f04fa5..ca1eca8fb5db6 100644
--- a/clang/lib/Headers/avx10_2bf16intrin.h
+++ b/clang/lib/Headers/avx10_2bf16intrin.h
@@ -818,7 +818,7 @@ _mm_maskz_rsqrt_pbh(__mmask8 __U, __m128bh __A) {
       (__v8bf)_mm_setzero_pbh(), (__mmask8)(__U)))
 
 static __inline__ __m256bh __DEFAULT_FN_ATTRS256 _mm256_sqrt_pbh(__m256bh __A) {
-  return (__m256bh)__builtin_ia32_vsqrtbf16256((__v16bf)__A);
+  return __builtin_elementwise_sqrt(__A);
 }
 
 static __inline__ __m256bh __DEFAULT_FN_ATTRS256
@@ -835,7 +835,7 @@ _mm256_maskz_sqrt_pbh(__mmask16 __U, __m256bh __A) {
 }
 
 static __inline__ __m128bh __DEFAULT_FN_ATTRS128 _mm_sqrt_pbh(__m128bh __A) {
-  return (__m128bh)__builtin_ia32_vsqrtbf16((__v8bf)__A);
+  return __builtin_elementwise_sqrt(__A);
 }
 
 static __inline__ __m128bh __DEFAULT_FN_ATTRS128
diff --git a/clang/lib/Headers/avx512vlfp16intrin.h b/clang/lib/Headers/avx512vlfp16intrin.h
index 98ad9b54eef39..99bb6c52208bd 100644
--- a/clang/lib/Headers/avx512vlfp16intrin.h
+++ b/clang/lib/Headers/avx512vlfp16intrin.h
@@ -611,7 +611,7 @@ _mm256_maskz_scalef_ph(__mmask16 __U, __m256h __A, __m256h __B) {
                                             (__mmask16)(U)))
 
 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_ph(__m128h __a) {
-  return __builtin_ia32_sqrtph((__v8hf)__a);
+  return __builtin_elementwise_sqrt(__a);
 }
 
 static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_ph(__m128h __W,
@@ -628,7 +628,7 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_ph(__mmask8 __U,
 }
 
 static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_sqrt_ph(__m256h __a) {
-  return (__m256h)__builtin_ia32_sqrtph256((__v16hf)__a);
+  return __builtin_elementwise_sqrt(__a);
 }
 
 static __inline__ __m256h __DEFAULT_FN_ATTRS256
diff --git a/clang/lib/Headers/avxintrin.h b/clang/lib/Headers/avxintrin.h
index b8cfaee7cfb46..435157e00fb3b 100644
--- a/clang/lib/Headers/avxintrin.h
+++ b/clang/lib/Headers/avxintrin.h
@@ -344,10 +344,8 @@ static __inline __m256 __DEFAULT_FN_ATTRS_CONSTEXPR _mm256_mul_ps(__m256 __a,
 ///    A 256-bit vector of [4 x double].
 /// \returns A 256-bit vector of [4 x double] containing the square roots of the
 ///    values in the operand.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_sqrt_pd(__m256d __a)
-{
-  return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
+static __inline __m256d __DEFAULT_FN_ATTRS _mm256_sqrt_pd(__m256d __a) {
+  return __builtin_elementwise_sqrt(__a);
 }
 
 /// Calculates the square roots of the values in a 256-bit vector of
@@ -361,10 +359,8 @@ _mm256_sqrt_pd(__m256d __a)
 ///    A 256-bit vector of [8 x float].
 /// \returns A 256-bit vector of [8 x float] containing the square roots of the
 ///    values in the operand.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_sqrt_ps(__m256 __a)
-{
-  return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
+static __inline __m256 __DEFAULT_FN_ATTRS _mm256_sqrt_ps(__m256 __a) {
+  return __builtin_elementwise_sqrt(__a);
 }
 
 /// Calculates the reciprocal square roots of the values in a 256-bit
diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h
index e15a260514f2d..dd2e57d9b91b9 100644
--- a/clang/lib/Headers/emmintrin.h
+++ b/clang/lib/Headers/emmintrin.h
@@ -248,8 +248,7 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS_CONSTEXPR _mm_div_pd(__m128d __a,
 ///    bits are copied from the upper 64 bits of operand \a __a.
 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a,
                                                          __m128d __b) {
-  __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
-  return __extension__(__m128d){__c[0], __a[1]};
+  return __extension__(__m128d){__builtin_elementwise_sqrt(__b[0]), __a[1]};
 }
 
 /// Calculates the square root of the each of two values stored in a
@@ -264,7 +263,7 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a,
 /// \returns A 128-bit vector of [2 x double] containing the square roots of the
 ///    values in the operand.
 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a) {
-  return __builtin_ia32_sqrtpd((__v2df)__a);
+  return __builtin_elementwise_sqrt(__a);
 }
 
 /// Compares lower 64-bit double-precision values of both operands, and
diff --git a/clang/lib/Headers/xmmintrin.h b/clang/lib/Headers/xmmintrin.h
index 6d44cff46661f..cb1665b6b1b71 100644
--- a/clang/lib/Headers/xmmintrin.h
+++ b/clang/lib/Headers/xmmintrin.h
@@ -239,10 +239,9 @@ _mm_div_ps(__m128 __a, __m128 __b) {
 ///    used in the calculation.
 /// \returns A 128-bit vector of [4 x float] containing the square root of the
 ///    value in the low-order bits of the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_sqrt_ss(__m128 __a)
-{
-  return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
+static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sqrt_ss(__m128 __a) {
+  __a[0] = __builtin_elementwise_sqrt(__a[0]);
+  return __a;
 }
 
 /// Calculates the square roots of the values stored in a 128-bit vector
@@ -256,10 +255,8 @@ _mm_sqrt_ss(__m128 __a)
 ///    A 128-bit vector of [4 x float].
 /// \returns A 128-bit vector of [4 x float] containing the square roots of the
 ///    values in the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_sqrt_ps(__m128 __a)
-{
-  return __builtin_ia32_sqrtps((__v4sf)__a);
+static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sqrt_ps(__m128 __a) {
+  return __builtin_elementwise_sqrt(__a);
 }
 
 /// Calculates the approximate reciprocal of the value stored in the

clang/lib/Headers/emmintrin.h

phoebewang

LGTM.

RKSimon

(minor) add basic constrained test file coverage

Help to unblock llvm#165682. I have the avx10_2 bf16 test coverage as well, but it's currently breaking as we're missing bf16 strict_fsqrt lowering in the backend.

…#167692) Help to unblock #165682. I have the avx10_2 bf16 test coverage as well, but it's currently breaking as we're missing bf16 strict_fsqrt lowering in the backend.

RKSimon

Please can you resolve the merge conflicts? I think you're almost there now.

…c __builtin_elementwise_sqrt Followup to llvm#165682

… __builtin_elementwise_sqrt (#168057) Followup to #165682

github-actions · 2025-11-25T12:19:03Z

🐧 Linux x64 Test Results

84744 tests passed
1117 tests skipped
5 tests failed

Failed Tests

(click on a test name to see its output)

Clang

Clang.CodeGen/X86/sse-builtins-constrained.c

Exit Code: 1

Command Output (stdout):
--
# RUN: at line 2
/home/gha/actions-runner/_work/llvm-project/llvm-project/build/bin/clang -cc1 -internal-isystem /home/gha/actions-runner/_work/llvm-project/llvm-project/build/lib/clang/22/include -nostdsysteminc -x c -ffreestanding /home/gha/actions-runner/_work/llvm-project/llvm-project/clang/test/CodeGen/X86/sse-builtins-constrained.c -triple=x86_64-unknown-linux-gnu -target-feature +sse -emit-llvm -o - -Wall -Werror | /home/gha/actions-runner/_work/llvm-project/llvm-project/build/bin/FileCheck /home/gha/actions-runner/_work/llvm-project/llvm-project/clang/test/CodeGen/X86/sse-builtins-constrained.c --check-prefix=UNCONSTRAINED --check-prefix=COMMON --check-prefix=COMMONIR
# executed command: /home/gha/actions-runner/_work/llvm-project/llvm-project/build/bin/clang -cc1 -internal-isystem /home/gha/actions-runner/_work/llvm-project/llvm-project/build/lib/clang/22/include -nostdsysteminc -x c -ffreestanding /home/gha/actions-runner/_work/llvm-project/llvm-project/clang/test/CodeGen/X86/sse-builtins-constrained.c -triple=x86_64-unknown-linux-gnu -target-feature +sse -emit-llvm -o - -Wall -Werror
# note: command had no output on stdout or stderr
# executed command: /home/gha/actions-runner/_work/llvm-project/llvm-project/build/bin/FileCheck /home/gha/actions-runner/_work/llvm-project/llvm-project/clang/test/CodeGen/X86/sse-builtins-constrained.c --check-prefix=UNCONSTRAINED --check-prefix=COMMON --check-prefix=COMMONIR
# .---command stderr------------
# | /home/gha/actions-runner/_work/llvm-project/llvm-project/clang/test/CodeGen/X86/sse-builtins-constrained.c:31:15: error: COMMONIR: expected string not found in input
# |  // COMMONIR: extractelement <4 x float> {{.*}}, i64 0
# |               ^
# | <stdin>:20:43: note: scanning from here
# | define dso_local <4 x float> @test_sqrt_ss(<4 x float> noundef %x) #0 {
# |                                           ^
# | <stdin>:28:14: note: possible intended match here
# |  %vecext.i = extractelement <4 x float> %1, i32 0
# |              ^
# | 
# | Input file: <stdin>
# | Check file: /home/gha/actions-runner/_work/llvm-project/llvm-project/clang/test/CodeGen/X86/sse-builtins-constrained.c
# | 
# | -dump-input=help explains the following input dump.
# | 
# | Input was:
# | <<<<<<
# |             .
# |             .
# |             .
# |            15:  %2 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %1) 
# |            16:  ret <4 x float> %2 
# |            17: } 
# |            18:  
# |            19: ; Function Attrs: noinline nounwind optnone 
# |            20: define dso_local <4 x float> @test_sqrt_ss(<4 x float> noundef %x) #0 { 
# | check:31'0                                               X~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ error: no match found
# |            21: entry: 
# | check:31'0     ~~~~~~~
# |            22:  %__a.addr.i = alloca <4 x float>, align 16 
# | check:31'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |            23:  %x.addr = alloca <4 x float>, align 16 
# | check:31'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |            24:  store <4 x float> %x, ptr %x.addr, align 16 
# | check:31'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |            25:  %0 = load <4 x float>, ptr %x.addr, align 16 
# | check:31'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |            26:  store <4 x float> %0, ptr %__a.addr.i, align 16 
# | check:31'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |            27:  %1 = load <4 x float>, ptr %__a.addr.i, align 16 
# | check:31'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |            28:  %vecext.i = extractelement <4 x float> %1, i32 0 
# | check:31'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# | check:31'1                  ?                                     possible intended match
# |            29:  %2 = call float @llvm.sqrt.f32(float %vecext.i) 
# | check:31'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |            30:  %3 = load <4 x float>, ptr %__a.addr.i, align 16 
# | check:31'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |            31:  %vecins.i = insertelement <4 x float> %3, float %2, i32 0 
# | check:31'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |            32:  store <4 x float> %vecins.i, ptr %__a.addr.i, align 16 
# | check:31'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |            33:  %4 = load <4 x float>, ptr %__a.addr.i, align 16 
# | check:31'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |             .
# |             .
# |             .
# | >>>>>>
# `-----------------------------
# error: command failed with exit status: 1

--

Clang.CodeGen/X86/sse-builtins.c

Exit Code: 1

Command Output (stdout):
--
# RUN: at line 1
/home/gha/actions-runner/_work/llvm-project/llvm-project/build/bin/clang -cc1 -internal-isystem /home/gha/actions-runner/_work/llvm-project/llvm-project/build/lib/clang/22/include -nostdsysteminc -x c -flax-vector-conversions=none -ffreestanding /home/gha/actions-runner/_work/llvm-project/llvm-project/clang/test/CodeGen/X86/sse-builtins.c -triple=x86_64-apple-darwin -target-feature +sse -emit-llvm -o - -Wall -Werror | /home/gha/actions-runner/_work/llvm-project/llvm-project/build/bin/FileCheck /home/gha/actions-runner/_work/llvm-project/llvm-project/clang/test/CodeGen/X86/sse-builtins.c
# executed command: /home/gha/actions-runner/_work/llvm-project/llvm-project/build/bin/clang -cc1 -internal-isystem /home/gha/actions-runner/_work/llvm-project/llvm-project/build/lib/clang/22/include -nostdsysteminc -x c -flax-vector-conversions=none -ffreestanding /home/gha/actions-runner/_work/llvm-project/llvm-project/clang/test/CodeGen/X86/sse-builtins.c -triple=x86_64-apple-darwin -target-feature +sse -emit-llvm -o - -Wall -Werror
# note: command had no output on stdout or stderr
# executed command: /home/gha/actions-runner/_work/llvm-project/llvm-project/build/bin/FileCheck /home/gha/actions-runner/_work/llvm-project/llvm-project/clang/test/CodeGen/X86/sse-builtins.c
# .---command stderr------------
# | /home/gha/actions-runner/_work/llvm-project/llvm-project/clang/test/CodeGen/X86/sse-builtins.c:754:12: error: CHECK: expected string not found in input
# |  // CHECK: extractelement <4 x float> {{.*}}, i64 0
# |            ^
# | <stdin>:1825:36: note: scanning from here
# | define <4 x float> @test_mm_sqrt_ss(<4 x float> noundef %x) #0 {
# |                                    ^
# | <stdin>:1833:14: note: possible intended match here
# |  %vecext.i = extractelement <4 x float> %1, i32 0
# |              ^
# | 
# | Input file: <stdin>
# | Check file: /home/gha/actions-runner/_work/llvm-project/llvm-project/clang/test/CodeGen/X86/sse-builtins.c
# | 
# | -dump-input=help explains the following input dump.
# | 
# | Input was:
# | <<<<<<
# |              .
# |              .
# |              .
# |           1820:  %2 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %1) 
# |           1821:  ret <4 x float> %2 
# |           1822: } 
# |           1823:  
# |           1824: ; Function Attrs: noinline nounwind optnone 
# |           1825: define <4 x float> @test_mm_sqrt_ss(<4 x float> noundef %x) #0 { 
# | check:754'0                                        X~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ error: no match found
# |           1826: entry: 
# | check:754'0     ~~~~~~~
# |           1827:  %__a.addr.i = alloca <4 x float>, align 16 
# | check:754'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |           1828:  %x.addr = alloca <4 x float>, align 16 
# | check:754'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |           1829:  store <4 x float> %x, ptr %x.addr, align 16 
# | check:754'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |           1830:  %0 = load <4 x float>, ptr %x.addr, align 16 
# | check:754'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |           1831:  store <4 x float> %0, ptr %__a.addr.i, align 16 
# | check:754'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |           1832:  %1 = load <4 x float>, ptr %__a.addr.i, align 16 
# | check:754'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |           1833:  %vecext.i = extractelement <4 x float> %1, i32 0 
# | check:754'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# | check:754'1                  ?                                     possible intended match
# |           1834:  %2 = call float @llvm.sqrt.f32(float %vecext.i) 
# | check:754'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |           1835:  %3 = load <4 x float>, ptr %__a.addr.i, align 16 
# | check:754'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |           1836:  %vecins.i = insertelement <4 x float> %3, float %2, i32 0 
# | check:754'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |           1837:  store <4 x float> %vecins.i, ptr %__a.addr.i, align 16 
# | check:754'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |           1838:  %4 = load <4 x float>, ptr %__a.addr.i, align 16 
# | check:754'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |              .
# |              .
# |              .
# | >>>>>>
# `-----------------------------
# error: command failed with exit status: 1

--

Clang.CodeGen/X86/sse2-builtins-constrained.c

Exit Code: 1

Command Output (stdout):
--
# RUN: at line 2
/home/gha/actions-runner/_work/llvm-project/llvm-project/build/bin/clang -cc1 -internal-isystem /home/gha/actions-runner/_work/llvm-project/llvm-project/build/lib/clang/22/include -nostdsysteminc -x c -ffreestanding /home/gha/actions-runner/_work/llvm-project/llvm-project/clang/test/CodeGen/X86/sse2-builtins-constrained.c -triple=x86_64-unknown-linux-gnu -target-feature +sse2 -emit-llvm -o - -Wall -Werror | /home/gha/actions-runner/_work/llvm-project/llvm-project/build/bin/FileCheck /home/gha/actions-runner/_work/llvm-project/llvm-project/clang/test/CodeGen/X86/sse2-builtins-constrained.c --check-prefix=UNCONSTRAINED --check-prefix=COMMON --check-prefix=COMMONIR
# executed command: /home/gha/actions-runner/_work/llvm-project/llvm-project/build/bin/clang -cc1 -internal-isystem /home/gha/actions-runner/_work/llvm-project/llvm-project/build/lib/clang/22/include -nostdsysteminc -x c -ffreestanding /home/gha/actions-runner/_work/llvm-project/llvm-project/clang/test/CodeGen/X86/sse2-builtins-constrained.c -triple=x86_64-unknown-linux-gnu -target-feature +sse2 -emit-llvm -o - -Wall -Werror
# note: command had no output on stdout or stderr
# executed command: /home/gha/actions-runner/_work/llvm-project/llvm-project/build/bin/FileCheck /home/gha/actions-runner/_work/llvm-project/llvm-project/clang/test/CodeGen/X86/sse2-builtins-constrained.c --check-prefix=UNCONSTRAINED --check-prefix=COMMON --check-prefix=COMMONIR
# .---command stderr------------
# | /home/gha/actions-runner/_work/llvm-project/llvm-project/clang/test/CodeGen/X86/sse2-builtins-constrained.c:31:15: error: COMMONIR: expected string not found in input
# |  // COMMONIR: extractelement <2 x double> {{.*}}, i64 0
# |               ^
# | <stdin>:20:44: note: scanning from here
# | define dso_local <2 x double> @test_sqrt_sd(<2 x double> noundef %x, <2 x double> noundef %y) #0 {
# |                                            ^
# | <stdin>:34:14: note: possible intended match here
# |  %vecext.i = extractelement <2 x double> %2, i32 0
# |              ^
# | 
# | Input file: <stdin>
# | Check file: /home/gha/actions-runner/_work/llvm-project/llvm-project/clang/test/CodeGen/X86/sse2-builtins-constrained.c
# | 
# | -dump-input=help explains the following input dump.
# | 
# | Input was:
# | <<<<<<
# |             .
# |             .
# |             .
# |            15:  %2 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %1) 
# |            16:  ret <2 x double> %2 
# |            17: } 
# |            18:  
# |            19: ; Function Attrs: noinline nounwind optnone 
# |            20: define dso_local <2 x double> @test_sqrt_sd(<2 x double> noundef %x, <2 x double> noundef %y) #0 { 
# | check:31'0                                                X~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ error: no match found
# |            21: entry: 
# | check:31'0     ~~~~~~~
# |            22:  %__a.addr.i = alloca <2 x double>, align 16 
# | check:31'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |            23:  %__b.addr.i = alloca <2 x double>, align 16 
# | check:31'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |            24:  %.compoundliteral.i = alloca <2 x double>, align 16 
# | check:31'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |            25:  %x.addr = alloca <2 x double>, align 16 
# | check:31'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |             .
# |             .
# |             .
# |            29:  %0 = load <2 x double>, ptr %x.addr, align 16 
# | check:31'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |            30:  %1 = load <2 x double>, ptr %y.addr, align 16 
# | check:31'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |            31:  store <2 x double> %0, ptr %__a.addr.i, align 16 
# | check:31'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |            32:  store <2 x double> %1, ptr %__b.addr.i, align 16 
# | check:31'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |            33:  %2 = load <2 x double>, ptr %__b.addr.i, align 16 
# | check:31'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |            34:  %vecext.i = extractelement <2 x double> %2, i32 0 
# | check:31'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# | check:31'1                  ?                                      possible intended match
# |            35:  %3 = call double @llvm.sqrt.f64(double %vecext.i) 
# | check:31'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |            36:  %vecinit.i = insertelement <2 x double> poison, double %3, i32 0 
# | check:31'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |            37:  %4 = load <2 x double>, ptr %__a.addr.i, align 16 
# | check:31'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |            38:  %vecext1.i = extractelement <2 x double> %4, i32 1 
# | check:31'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |            39:  %vecinit2.i = insertelement <2 x double> %vecinit.i, double %vecext1.i, i32 1 
# | check:31'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |             .
# |             .
# |             .
# | >>>>>>
# `-----------------------------
# error: command failed with exit status: 1

--

Clang.CodeGen/X86/sse2-builtins.c

Exit Code: 1

Command Output (stdout):
--
# RUN: at line 1
/home/gha/actions-runner/_work/llvm-project/llvm-project/build/bin/clang -cc1 -internal-isystem /home/gha/actions-runner/_work/llvm-project/llvm-project/build/lib/clang/22/include -nostdsysteminc -x c -flax-vector-conversions=none -ffreestanding /home/gha/actions-runner/_work/llvm-project/llvm-project/clang/test/CodeGen/X86/sse2-builtins.c -triple=x86_64-apple-darwin -target-feature +sse2 -emit-llvm -o - -Wall -Werror | /home/gha/actions-runner/_work/llvm-project/llvm-project/build/bin/FileCheck /home/gha/actions-runner/_work/llvm-project/llvm-project/clang/test/CodeGen/X86/sse2-builtins.c --check-prefixes=CHECK,X64
# executed command: /home/gha/actions-runner/_work/llvm-project/llvm-project/build/bin/clang -cc1 -internal-isystem /home/gha/actions-runner/_work/llvm-project/llvm-project/build/lib/clang/22/include -nostdsysteminc -x c -flax-vector-conversions=none -ffreestanding /home/gha/actions-runner/_work/llvm-project/llvm-project/clang/test/CodeGen/X86/sse2-builtins.c -triple=x86_64-apple-darwin -target-feature +sse2 -emit-llvm -o - -Wall -Werror
# note: command had no output on stdout or stderr
# executed command: /home/gha/actions-runner/_work/llvm-project/llvm-project/build/bin/FileCheck /home/gha/actions-runner/_work/llvm-project/llvm-project/clang/test/CodeGen/X86/sse2-builtins.c --check-prefixes=CHECK,X64
# .---command stderr------------
# | /home/gha/actions-runner/_work/llvm-project/llvm-project/clang/test/CodeGen/X86/sse2-builtins.c:1443:12: error: CHECK: expected string not found in input
# |  // CHECK: extractelement <2 x double> %{{.*}}, i64 0
# |            ^
# | <stdin>:4145:37: note: scanning from here
# | define <2 x double> @test_mm_sqrt_sd(<2 x double> noundef %A, <2 x double> noundef %B) #0 {
# |                                     ^
# | <stdin>:4159:14: note: possible intended match here
# |  %vecext.i = extractelement <2 x double> %2, i32 0
# |              ^
# | 
# | Input file: <stdin>
# | Check file: /home/gha/actions-runner/_work/llvm-project/llvm-project/clang/test/CodeGen/X86/sse2-builtins.c
# | 
# | -dump-input=help explains the following input dump.
# | 
# | Input was:
# | <<<<<<
# |               .
# |               .
# |               .
# |            4140:  %2 = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %1) 
# |            4141:  ret <2 x double> %2 
# |            4142: } 
# |            4143:  
# |            4144: ; Function Attrs: noinline nounwind optnone 
# |            4145: define <2 x double> @test_mm_sqrt_sd(<2 x double> noundef %A, <2 x double> noundef %B) #0 { 
# | check:1443'0                                         X~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ error: no match found
# |            4146: entry: 
# | check:1443'0     ~~~~~~~
# |            4147:  %__a.addr.i = alloca <2 x double>, align 16 
# | check:1443'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |            4148:  %__b.addr.i = alloca <2 x double>, align 16 
# | check:1443'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |            4149:  %.compoundliteral.i = alloca <2 x double>, align 16 
# | check:1443'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |            4150:  %A.addr = alloca <2 x double>, align 16 
# | check:1443'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |               .
# |               .
# |               .
# |            4154:  %0 = load <2 x double>, ptr %A.addr, align 16 
# | check:1443'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |            4155:  %1 = load <2 x double>, ptr %B.addr, align 16 
# | check:1443'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |            4156:  store <2 x double> %0, ptr %__a.addr.i, align 16 
# | check:1443'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |            4157:  store <2 x double> %1, ptr %__b.addr.i, align 16 
# | check:1443'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |            4158:  %2 = load <2 x double>, ptr %__b.addr.i, align 16 
# | check:1443'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |            4159:  %vecext.i = extractelement <2 x double> %2, i32 0 
# | check:1443'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# | check:1443'1                  ?                                      possible intended match
# |            4160:  %3 = call double @llvm.sqrt.f64(double %vecext.i) 
# | check:1443'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |            4161:  %vecinit.i = insertelement <2 x double> poison, double %3, i32 0 
# | check:1443'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |            4162:  %4 = load <2 x double>, ptr %__a.addr.i, align 16 
# | check:1443'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |            4163:  %vecext1.i = extractelement <2 x double> %4, i32 1 
# | check:1443'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |            4164:  %vecinit2.i = insertelement <2 x double> %vecinit.i, double %vecext1.i, i32 1 
# | check:1443'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |               .
# |               .
# |               .
# | >>>>>>
# `-----------------------------
# error: command failed with exit status: 1

--

Clang.CodeGen/builtins-x86.c

Exit Code: 1

Command Output (stdout):
--
# RUN: at line 1
/home/gha/actions-runner/_work/llvm-project/llvm-project/build/bin/clang -cc1 -internal-isystem /home/gha/actions-runner/_work/llvm-project/llvm-project/build/lib/clang/22/include -nostdsysteminc -DUSE_64 -triple x86_64-unknown-unknown -target-feature +fxsr -target-feature +avx -target-feature +xsaveopt -target-feature +xsaves -target-feature +xsavec -target-feature +mwaitx -target-feature +clzero -target-feature +shstk -target-feature +wbnoinvd -target-feature +cldemote -Wno-implicit-function-declaration -emit-llvm -o /home/gha/actions-runner/_work/llvm-project/llvm-project/build/tools/clang/test/CodeGen/Output/builtins-x86.c.tmp /home/gha/actions-runner/_work/llvm-project/llvm-project/clang/test/CodeGen/builtins-x86.c
# executed command: /home/gha/actions-runner/_work/llvm-project/llvm-project/build/bin/clang -cc1 -internal-isystem /home/gha/actions-runner/_work/llvm-project/llvm-project/build/lib/clang/22/include -nostdsysteminc -DUSE_64 -triple x86_64-unknown-unknown -target-feature +fxsr -target-feature +avx -target-feature +xsaveopt -target-feature +xsaves -target-feature +xsavec -target-feature +mwaitx -target-feature +clzero -target-feature +shstk -target-feature +wbnoinvd -target-feature +cldemote -Wno-implicit-function-declaration -emit-llvm -o /home/gha/actions-runner/_work/llvm-project/llvm-project/build/tools/clang/test/CodeGen/Output/builtins-x86.c.tmp /home/gha/actions-runner/_work/llvm-project/llvm-project/clang/test/CodeGen/builtins-x86.c
# .---command stderr------------
# | /home/gha/actions-runner/_work/llvm-project/llvm-project/clang/test/CodeGen/builtins-x86.c:285:11: error: assigning to 'V4f' (vector of 4 'float' values) from incompatible type 'int'
# |   285 |   tmp_V4f = __builtin_ia32_sqrtps(tmp_V4f);
# |       |           ^ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# | /home/gha/actions-runner/_work/llvm-project/llvm-project/clang/test/CodeGen/builtins-x86.c:286:11: error: assigning to 'V4f' (vector of 4 'float' values) from incompatible type 'int'
# |   286 |   tmp_V4f = __builtin_ia32_sqrtss(tmp_V4f);
# |       |           ^ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# | /home/gha/actions-runner/_work/llvm-project/llvm-project/clang/test/CodeGen/builtins-x86.c:295:11: error: assigning to 'V2d' (vector of 2 'double' values) from incompatible type 'int'
# |   295 |   tmp_V2d = __builtin_ia32_sqrtpd(tmp_V2d);
# |       |           ^ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# | /home/gha/actions-runner/_work/llvm-project/llvm-project/clang/test/CodeGen/builtins-x86.c:296:11: error: assigning to 'V2d' (vector of 2 'double' values) from incompatible type 'int'
# |   296 |   tmp_V2d = __builtin_ia32_sqrtsd(tmp_V2d);
# |       |           ^ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# | /home/gha/actions-runner/_work/llvm-project/llvm-project/clang/test/CodeGen/builtins-x86.c:403:11: error: assigning to 'V4d' (vector of 4 'double' values) from incompatible type 'int'
# |   403 |   tmp_V4d = __builtin_ia32_sqrtpd256(tmp_V4d);
# |       |           ^ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# | /home/gha/actions-runner/_work/llvm-project/llvm-project/clang/test/CodeGen/builtins-x86.c:404:11: error: assigning to 'V8f' (vector of 8 'float' values) from incompatible type 'int'
# |   404 |   tmp_V8f = __builtin_ia32_sqrtps256(tmp_V8f);
# |       |           ^ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# | 6 errors generated.
# `-----------------------------
# error: command failed with exit status: 1

--

If these failures are unrelated to your changes (for example tests are broken or flaky at HEAD), please open an issue at https://github.com/llvm/llvm-project/issues and add the infrastructure label.

RKSimon

Remove out of date builtins from builtins-x86.c and update scalar test checks to use i32 instead of i64 for the element index

RKSimon

LGTM - cheers

RKSimon · 2025-11-27T11:29:15Z

@philnik777 Please can you get this committed soon after the CI has passed? We have a lot of churn on BuiltinsX86.td at the moment with the constexpr work so this will bitrot if we're not quick :)

Niram7777 · 2025-11-30T10:06:07Z

Hello,
With the latest version of this project, we don't have __builtin_ia32_sqrtsd anymore, which was handy because it has the same name as the GCC one.
It would have been nice to mark it as deprecated before removing it suddenly.
https://godbolt.org/z/6fo7zchx4 (ignore MSVC; clang 21.1.0 works)
Can you say what I have to use instead, please?

phoebewang · 2025-11-30T11:31:53Z

Hello, with the latest version of this project, we don't have __builtin_ia32_sqrtsd anymore, which was handy because it has the same name as the GCC one. It would have been nice to mark it as deprecated before removing it suddenly. https://godbolt.org/z/6fo7zchx4 (ignore MSVC; clang 21.1.0 works) Can you say what I have to use instead, please?

You can either use the target intrinsic _mm_sqrt_sd or use the common __builtin_sqrt instead. Note that you need -fno-math-errno for the latter: https://godbolt.org/z/sGWEzqYWE. We don't guarantee compatibility of target-specific builtins.

Latest LLVM 22.x has removed __builtin_ia32_sqrtsd llvm/llvm-project#165682

…mentwise_sqrt versions (llvm#165682)

On clang 22 main with this PR [1], __builtin_ia32_sqrtps was removed in favor of the __builtin_elementwise_sqrt. As that builtin is not supported by gcc, this commit uses the underlying intrinsic instead. [1] llvm/llvm-project#165682

On clang 22 main with this PR [1], __builtin_ia32_sqrtps was removed in favor of the __builtin_elementwise_sqrt. As that builtin is not supported by GCC, we only use it on clang. Using the underlying _mm_sqrt_ps intrinsic was also an option; however, including xmmintrin.h is best avoided as it is a large header. This change doesn't increase our clang version requirement, as this builtin has been available since 18.1.0. [1] llvm/llvm-project#165682

[Clang] Replaec some x86 builtins with the generic __builtin_elementw…

f1b0206

…ise versions

philnik777 changed the title ~~[Clang] Replaec some x86 builtins with the generic __builtin_elementwise versions~~ [Clang] Replace some x86 builtins with the generic __builtin_elementwise versions Oct 30, 2025

philnik777 marked this pull request as ready for review October 30, 2025 15:39

llvmbot added clang Clang issues not falling into any other category backend:X86 clang:frontend Language frontend issues, e.g. anything involving "Sema" clang:headers Headers provided by Clang, e.g. for intrinsics clang:codegen IR generation bugs: mangling, exceptions, etc. labels Oct 30, 2025

RKSimon requested review from RKSimon and phoebewang October 30, 2025 15:48

phoebewang reviewed Oct 31, 2025

View reviewed changes

clang/lib/Headers/emmintrin.h Show resolved Hide resolved

RKSimon changed the title ~~[Clang] Replace some x86 builtins with the generic __builtin_elementwise versions~~ [Clang] Replace some x86 sqrt builtins with the generic __builtin_elementwise_sqrt versions Oct 31, 2025

phoebewang approved these changes Oct 31, 2025

View reviewed changes

RKSimon requested changes Nov 3, 2025

View reviewed changes

RKSimon mentioned this pull request Nov 4, 2025

[Clang] VectorExprEvaluator::VisitCallExpr / InterpretBuiltin - allow SSE/AVX/AVX512 square root intrinsics to be used in constexpr #160516

Open

This was referenced Nov 12, 2025

[clang][x86] Add SSE2/AVX/AVX512VLFP16 constrained sqrt test coverage #167692

Merged

[X86] Failure to handle bf16 strict_fsqrt #167694

Closed

RKSimon requested changes Nov 13, 2025

View reviewed changes

RKSimon added a commit to RKSimon/llvm-project that referenced this pull request Nov 14, 2025

[X86] Replace default _mm512_sqrt_pd/s/h imeplementations with generi…

f12c986

…c __builtin_elementwise_sqrt Followup to llvm#165682

RKSimon mentioned this pull request Nov 14, 2025

[X86] Replace default _mm512_sqrt_pd/s/h implementations with generic __builtin_elementwise_sqrt #168057

Merged

RKSimon added a commit that referenced this pull request Nov 15, 2025

[X86] Replace default _mm512_sqrt_pd/s/h implementations with generic…

4cd8c11

… __builtin_elementwise_sqrt (#168057) Followup to #165682

Merge branch 'main' into use_elementwise_builtins

6941324

RKSimon requested changes Nov 25, 2025

View reviewed changes

philnik777 mentioned this pull request Nov 27, 2025

[CIR][X86] Implement lowering for sqrt builtins #169310

Merged

Fix CI

12e60d9

RKSimon self-requested a review November 27, 2025 11:15

RKSimon approved these changes Nov 27, 2025

View reviewed changes

philnik777 enabled auto-merge (squash) November 27, 2025 11:30

philnik777 merged commit 0dbedd1 into llvm:main Nov 27, 2025
9 of 10 checks passed

FantasqueX mentioned this pull request Nov 27, 2025

[CIR][NFC] Fix build problem due to OG changes #169818

Closed

Niram7777 added a commit to Niram7777/darling that referenced this pull request Nov 30, 2025

Use built-in sqrt functions

fe2a93d

Latest LLVM 22.x has removed __builtin_ia32_sqrtsd llvm/llvm-project#165682

Niram7777 mentioned this pull request Nov 30, 2025

Use built-in sqrt functions darlinghq/darling#1636

Merged

augusto2112 pushed a commit to augusto2112/llvm-project that referenced this pull request Dec 3, 2025

[Clang] Replace some x86 sqrt builtins with the generic __builtin_ele…

9638c25

…mentwise_sqrt versions (llvm#165682)

kcloudy0717 pushed a commit to kcloudy0717/llvm-project that referenced this pull request Dec 4, 2025

[Clang] Replace some x86 sqrt builtins with the generic __builtin_ele…

26533f0

…mentwise_sqrt versions (llvm#165682)

R-Goc mentioned this pull request Dec 17, 2025

AK: Remove SIMDMath.h LadybirdBrowser/ladybird#7162

Merged

[Clang] Replace some x86 sqrt builtins with the generic __builtin_elementwise_sqrt versions #165682

[Clang] Replace some x86 sqrt builtins with the generic __builtin_elementwise_sqrt versions #165682

Uh oh!

Conversation

philnik777 commented Oct 30, 2025

Uh oh!

llvmbot commented Oct 30, 2025

Uh oh!

llvmbot commented Oct 30, 2025

Uh oh!

llvmbot commented Oct 30, 2025

Uh oh!

Uh oh!

phoebewang left a comment

Choose a reason for hiding this comment

Uh oh!

RKSimon left a comment

Choose a reason for hiding this comment

Uh oh!

RKSimon left a comment

Choose a reason for hiding this comment

Uh oh!

github-actions bot commented Nov 25, 2025

🐧 Linux x64 Test Results

Failed Tests

Clang

Uh oh!

RKSimon left a comment

Choose a reason for hiding this comment

Uh oh!

RKSimon left a comment

Choose a reason for hiding this comment

Uh oh!

RKSimon commented Nov 27, 2025

Uh oh!

Uh oh!

Niram7777 commented Nov 30, 2025

Uh oh!

phoebewang commented Nov 30, 2025

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

5 participants