[CIR] Implement builtin reduce fadd/fmul/fmax/fmin (#171633)

badumbatish · web-flow · commit 80ec43d455a5 · 2025-12-12T10:50:30.000Z
New files are created to match the structure over at OGs
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -1552,26 +1552,52 @@ CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, const CallExpr *expr) {
   case X86::BI__builtin_ia32_vpshrdw128:
   case X86::BI__builtin_ia32_vpshrdw256:
   case X86::BI__builtin_ia32_vpshrdw512:
+    cgm.errorNYI(expr->getSourceRange(),
+                 std::string("unimplemented X86 builtin call: ") +
+                     getContext().BuiltinInfo.getName(builtinID));
+    return {};
   case X86::BI__builtin_ia32_reduce_fadd_pd512:
   case X86::BI__builtin_ia32_reduce_fadd_ps512:
   case X86::BI__builtin_ia32_reduce_fadd_ph512:
   case X86::BI__builtin_ia32_reduce_fadd_ph256:
-  case X86::BI__builtin_ia32_reduce_fadd_ph128:
+  case X86::BI__builtin_ia32_reduce_fadd_ph128: {
+    assert(!cir::MissingFeatures::fastMathFlags());
+    return emitIntrinsicCallOp(builder, getLoc(expr->getExprLoc()),
+                               "vector.reduce.fadd", ops[0].getType(),
+                               mlir::ValueRange{ops[0], ops[1]});
+  }
   case X86::BI__builtin_ia32_reduce_fmul_pd512:
   case X86::BI__builtin_ia32_reduce_fmul_ps512:
   case X86::BI__builtin_ia32_reduce_fmul_ph512:
   case X86::BI__builtin_ia32_reduce_fmul_ph256:
-  case X86::BI__builtin_ia32_reduce_fmul_ph128:
+  case X86::BI__builtin_ia32_reduce_fmul_ph128: {
+    assert(!cir::MissingFeatures::fastMathFlags());
+    return emitIntrinsicCallOp(builder, getLoc(expr->getExprLoc()),
+                               "vector.reduce.fmul", ops[0].getType(),
+                               mlir::ValueRange{ops[0], ops[1]});
+  }
   case X86::BI__builtin_ia32_reduce_fmax_pd512:
   case X86::BI__builtin_ia32_reduce_fmax_ps512:
   case X86::BI__builtin_ia32_reduce_fmax_ph512:
   case X86::BI__builtin_ia32_reduce_fmax_ph256:
-  case X86::BI__builtin_ia32_reduce_fmax_ph128:
+  case X86::BI__builtin_ia32_reduce_fmax_ph128: {
+    assert(!cir::MissingFeatures::fastMathFlags());
+    cir::VectorType vecTy = cast<cir::VectorType>(ops[0].getType());
+    return emitIntrinsicCallOp(builder, getLoc(expr->getExprLoc()),
+                               "vector.reduce.fmax", vecTy.getElementType(),
+                               mlir::ValueRange{ops[0]});
+  }
   case X86::BI__builtin_ia32_reduce_fmin_pd512:
   case X86::BI__builtin_ia32_reduce_fmin_ps512:
   case X86::BI__builtin_ia32_reduce_fmin_ph512:
   case X86::BI__builtin_ia32_reduce_fmin_ph256:
-  case X86::BI__builtin_ia32_reduce_fmin_ph128:
+  case X86::BI__builtin_ia32_reduce_fmin_ph128: {
+    assert(!cir::MissingFeatures::fastMathFlags());
+    cir::VectorType vecTy = cast<cir::VectorType>(ops[0].getType());
+    return emitIntrinsicCallOp(builder, getLoc(expr->getExprLoc()),
+                               "vector.reduce.fmin", vecTy.getElementType(),
+                               mlir::ValueRange{ops[0]});
+  }
   case X86::BI__builtin_ia32_rdrand16_step:
   case X86::BI__builtin_ia32_rdrand32_step:
   case X86::BI__builtin_ia32_rdrand64_step:
diff --git a/clang/test/CIR/CodeGen/X86/avx512-reduceIntrin.c b/clang/test/CIR/CodeGen/X86/avx512-reduceIntrin.c
@@ -0,0 +1,71 @@
+// RUN: %clang_cc1 -x c -ffreestanding %s -O2 -triple=x86_64-apple-darwin -target-cpu skylake-avx512 -fclangir -emit-cir -o - -Wall -Werror | FileCheck %s --check-prefixes=CIR
+// RUN: %clang_cc1 -x c -ffreestanding %s -O2 -triple=x86_64-apple-darwin -target-cpu skylake-avx512 -fclangir -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=LLVM
+// RUN: %clang_cc1 -x c -ffreestanding %s -O2 -triple=x86_64-apple-darwin -target-cpu skylake-avx512 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=OGCG
+
+#include <immintrin.h>
+
+double test_mm512_reduce_add_pd(__m512d __W, double ExtraAddOp){
+
+  // CIR-LABEL: _mm512_reduce_add_pd
+  // CIR: cir.call_llvm_intrinsic "vector.reduce.fadd" %[[R:.*]], %[[V:.*]] : (!cir.double, !cir.vector<8 x !cir.double>) -> !cir.double
+
+  // CIR-LABEL: test_mm512_reduce_add_pd
+  // CIR: cir.call @_mm512_reduce_add_pd(%[[VEC:.*]]) : (!cir.vector<8 x !cir.double>) -> !cir.double
+
+  // LLVM-LABEL: test_mm512_reduce_add_pd
+  // LLVM: call double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> %{{.*}})
+
+  // OGCG-LABEL: test_mm512_reduce_add_pd
+  // OGCG-NOT: reassoc
+  // OGCG: call reassoc {{.*}}double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> %{{.*}})
+  // OGCG-NOT: reassoc
+  return _mm512_reduce_add_pd(__W) + ExtraAddOp;
+}
+
+double test_mm512_reduce_mul_pd(__m512d __W, double ExtraMulOp){
+  // CIR-LABEL: _mm512_reduce_mul_pd
+  // CIR: cir.call_llvm_intrinsic "vector.reduce.fmul" %[[R:.*]], %[[V:.*]] : (!cir.double, !cir.vector<8 x !cir.double>) -> !cir.double
+
+  // CIR-LABEL: test_mm512_reduce_mul_pd
+  // CIR: cir.call @_mm512_reduce_mul_pd(%[[VEC:.*]]) : (!cir.vector<8 x !cir.double>) -> !cir.double
+
+  // LLVM-LABEL: test_mm512_reduce_mul_pd
+  // LLVM: call double @llvm.vector.reduce.fmul.v8f64(double 1.000000e+00, <8 x double> %{{.*}})
+
+  // OGCG-LABEL: test_mm512_reduce_mul_pd
+  // OGCG-NOT: reassoc
+  // OGCG:    call reassoc {{.*}}double @llvm.vector.reduce.fmul.v8f64(double 1.000000e+00, <8 x double> %{{.*}})
+  // OGCG-NOT: reassoc
+  return _mm512_reduce_mul_pd(__W) * ExtraMulOp;
+}
+
+
+float test_mm512_reduce_add_ps(__m512 __W){
+  // CIR-LABEL: _mm512_reduce_add_ps
+  // CIR: cir.call_llvm_intrinsic "vector.reduce.fadd" %[[R:.*]], %[[V:.*]] : (!cir.float, !cir.vector<16 x !cir.float>) -> !cir.float
+
+  // CIR-LABEL: test_mm512_reduce_add_ps
+  // CIR: cir.call @_mm512_reduce_add_ps(%[[VEC:.*]]) : (!cir.vector<16 x !cir.float>) -> !cir.float
+
+  // LLVM-LABEL: test_mm512_reduce_add_ps
+  // LLVM: call float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> %{{.*}})
+
+  // OGCG-LABEL: test_mm512_reduce_add_ps
+  // OGCG: call reassoc {{.*}}float @llvm.vector.reduce.fadd.v16f32(float -0.000000e+00, <16 x float> %{{.*}})
+  return _mm512_reduce_add_ps(__W);
+}
+
+float test_mm512_reduce_mul_ps(__m512 __W){
+  // CIR-LABEL: _mm512_reduce_mul_ps
+  // CIR: cir.call_llvm_intrinsic "vector.reduce.fmul" %[[R:.*]], %[[V:.*]] : (!cir.float, !cir.vector<16 x !cir.float>) -> !cir.float
+
+  // CIR-LABEL: test_mm512_reduce_mul_ps
+  // CIR: cir.call @_mm512_reduce_mul_ps(%[[VEC:.*]]) : (!cir.vector<16 x !cir.float>) -> !cir.float
+
+  // LLVM-LABEL: test_mm512_reduce_mul_ps
+  // LLVM: call float @llvm.vector.reduce.fmul.v16f32(float 1.000000e+00, <16 x float> %{{.*}})
+
+  // OGCG-LABEL: test_mm512_reduce_mul_ps
+  // OGCG:    call reassoc {{.*}}float @llvm.vector.reduce.fmul.v16f32(float 1.000000e+00, <16 x float> %{{.*}})
+  return _mm512_reduce_mul_ps(__W);
+}
diff --git a/clang/test/CIR/CodeGen/X86/avx512-reduceMinMaxIntrin.c b/clang/test/CIR/CodeGen/X86/avx512-reduceMinMaxIntrin.c
@@ -0,0 +1,69 @@
+// RUN: %clang_cc1 -x c -ffreestanding %s -O0 -triple=x86_64-apple-darwin -target-cpu skylake-avx512 -fclangir -emit-cir -o - -Wall -Werror | FileCheck %s --check-prefixes=CIR
+// RUN: %clang_cc1 -x c -ffreestanding %s -O0 -triple=x86_64-apple-darwin -target-cpu skylake-avx512 -fclangir -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=LLVM
+// RUN: %clang_cc1 -x c -ffreestanding %s -O0 -triple=x86_64-apple-darwin -target-cpu skylake-avx512 -emit-llvm -o - -Wall -Werror | FileCheck %s --check-prefixes=OGCG
+
+#include <immintrin.h>
+
+double test_mm512_reduce_max_pd(__m512d __W, double ExtraAddOp){
+  // CIR-LABEL: _mm512_reduce_max_pd
+  // CIR: cir.call_llvm_intrinsic "vector.reduce.fmax" %[[V:.*]] : (!cir.vector<8 x !cir.double>) -> !cir.double
+
+  // CIR-LABEL: test_mm512_reduce_max_pd
+  // CIR: cir.call @_mm512_reduce_max_pd(%[[VEC:.*]]) : (!cir.vector<8 x !cir.double>) -> !cir.double
+
+  // LLVM-LABEL: test_mm512_reduce_max_pd
+  // LLVM: call double @llvm.vector.reduce.fmax.v8f64(<8 x double> %{{.*}})
+
+  // OGCG-LABEL: test_mm512_reduce_max_pd
+  // OGCG-NOT: nnan
+  // OGCG: call nnan {{.*}}double @llvm.vector.reduce.fmax.v8f64(<8 x double> %{{.*}})
+  // OGCG-NOT: nnan
+  return _mm512_reduce_max_pd(__W) + ExtraAddOp;
+}
+
+double test_mm512_reduce_min_pd(__m512d __W, double ExtraMulOp){
+  // CIR-LABEL: _mm512_reduce_min_pd
+  // CIR: cir.call_llvm_intrinsic "vector.reduce.fmin" %[[V:.*]] : (!cir.vector<8 x !cir.double>) -> !cir.double
+
+  // CIR-LABEL: test_mm512_reduce_min_pd
+  // CIR: cir.call @_mm512_reduce_min_pd(%[[VEC:.*]]) : (!cir.vector<8 x !cir.double>) -> !cir.double
+
+  // LLVM-LABEL: test_mm512_reduce_min_pd
+  // LLVM: call double @llvm.vector.reduce.fmin.v8f64(<8 x double> %{{.*}})
+
+  // OGCG-LABEL: test_mm512_reduce_min_pd
+  // OGCG-NOT: nnan
+  // OGCG:    call nnan {{.*}}double @llvm.vector.reduce.fmin.v8f64(<8 x double> %{{.*}})
+  // OGCG-NOT: nnan
+  return _mm512_reduce_min_pd(__W) * ExtraMulOp;
+}
+
+float test_mm512_reduce_max_ps(__m512 __W){
+  // CIR-LABEL: _mm512_reduce_max_ps
+  // CIR: cir.call_llvm_intrinsic "vector.reduce.fmax" %[[V:.*]] : (!cir.vector<16 x !cir.float>) -> !cir.float
+
+  // CIR-LABEL: test_mm512_reduce_max_ps
+  // CIR: cir.call @_mm512_reduce_max_ps(%[[VEC:.*]]) : (!cir.vector<16 x !cir.float>) -> !cir.float
+
+  // LLVM-LABEL: test_mm512_reduce_max_ps
+  // LLVM: call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %{{.*}})
+
+  // OGCG-LABEL: test_mm512_reduce_max_ps
+  // OGCG: call nnan {{.*}}float @llvm.vector.reduce.fmax.v16f32(<16 x float> %{{.*}})
+  return _mm512_reduce_max_ps(__W);
+}
+
+float test_mm512_reduce_min_ps(__m512 __W){
+  // CIR-LABEL: _mm512_reduce_min_ps
+  // CIR: cir.call_llvm_intrinsic "vector.reduce.fmin" %[[V:.*]] : (!cir.vector<16 x !cir.float>) -> !cir.float
+
+  // CIR-LABEL: test_mm512_reduce_min_ps
+  // CIR: cir.call @_mm512_reduce_min_ps(%[[VEC:.*]]) : (!cir.vector<16 x !cir.float>) -> !cir.float
+
+  // LLVM-LABEL: test_mm512_reduce_min_ps
+  // LLVM: call float @llvm.vector.reduce.fmin.v16f32(<16 x float> %{{.*}})
+
+  // OGCG-LABEL: test_mm512_reduce_min_ps
+  // OGCG: call nnan {{.*}}float @llvm.vector.reduce.fmin.v16f32(<16 x float> %{{.*}})
+  return _mm512_reduce_min_ps(__W);
+}
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/avx512fp16-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/avx512fp16-builtins.c
@@ -63,4 +63,64 @@ __m512h test_mm512_undefined_ph(void) {
   // OGCG-LABEL: test_mm512_undefined_ph
   // OGCG: ret <32 x half> zeroinitializer
   return _mm512_undefined_ph();
-}
+}
+
+_Float16 test_mm512_reduce_add_ph(__m512h __W) {
+  // CIR-LABEL: _mm512_reduce_add_ph
+  // CIR: cir.call_llvm_intrinsic "vector.reduce.fadd" %[[R:.*]], %[[V:.*]] : (!cir.f16, !cir.vector<32 x !cir.f16>) -> !cir.f16
+
+  // CIR-LABEL: test_mm512_reduce_add_ph
+  // CIR: cir.call @_mm512_reduce_add_ph(%[[VEC:.*]]) : (!cir.vector<32 x !cir.f16>) -> !cir.f16
+
+  // LLVM-LABEL: test_mm512_reduce_add_ph
+  // LLVM: call half @llvm.vector.reduce.fadd.v32f16(half 0xH8000, <32 x half> %{{.*}})
+
+  // OGCG-LABEL: test_mm512_reduce_add_ph
+  // OGCG: call reassoc {{.*}}half @llvm.vector.reduce.fadd.v32f16(half 0xH8000, <32 x half> %{{.*}})
+  return _mm512_reduce_add_ph(__W);
+}
+
+_Float16 test_mm512_reduce_mul_ph(__m512h __W) {
+  // CIR-LABEL: _mm512_reduce_mul_ph
+  // CIR: cir.call_llvm_intrinsic "vector.reduce.fmul" %[[R:.*]], %[[V:.*]] : (!cir.f16, !cir.vector<32 x !cir.f16>) -> !cir.f16
+
+  // CIR-LABEL: test_mm512_reduce_mul_ph
+  // CIR: cir.call @_mm512_reduce_mul_ph(%[[VEC:.*]]) : (!cir.vector<32 x !cir.f16>) -> !cir.f16
+
+  // LLVM-LABEL: test_mm512_reduce_mul_ph
+  // LLVM: call half @llvm.vector.reduce.fmul.v32f16(half 0xH3C00, <32 x half> %{{.*}})
+
+  // OGCG-LABEL: test_mm512_reduce_mul_ph
+  // OGCG: call reassoc {{.*}}half @llvm.vector.reduce.fmul.v32f16(half 0xH3C00, <32 x half> %{{.*}})
+  return _mm512_reduce_mul_ph(__W);
+}
+
+_Float16 test_mm512_reduce_max_ph(__m512h __W) {
+  // CIR-LABEL: _mm512_reduce_max_ph
+  // CIR: cir.call_llvm_intrinsic "vector.reduce.fmax" %[[V:.*]] (!cir.vector<32 x !cir.f16>) -> !cir.f16 
+
+  // CIR-LABEL: test_mm512_reduce_max_ph
+  // CIR: cir.call @_mm512_reduce_max_ph(%[[VEC:.*]]) : (!cir.vector<32 x !cir.f16>) -> !cir.f16
+
+  // LLVM-LABEL: test_mm512_reduce_max_ph
+  // LLVM: call half @llvm.vector.reduce.fmax.v32f16(<32 x half> %{{.*}})
+
+  // OGCG-LABEL: test_mm512_reduce_max_ph
+  // OGCG: call nnan {{.*}}half @llvm.vector.reduce.fmax.v32f16(<32 x half> %{{.*}})
+  return _mm512_reduce_max_ph(__W);
+}
+
+_Float16 test_mm512_reduce_min_ph(__m512h __W) {
+  // CIR-LABEL: _mm512_reduce_min_ph
+  // CIR: cir.call_llvm_intrinsic "vector.reduce.fmin" %[[V:.*]] (!cir.vector<32 x !cir.f16>) -> !cir.f16 
+
+  // CIR-LABEL: test_mm512_reduce_min_ph
+  // CIR: cir.call @_mm512_reduce_min_ph(%[[VEC:.*]]) : (!cir.vector<32 x !cir.f16>) -> !cir.f16
+
+  // LLVM-LABEL: test_mm512_reduce_min_ph
+  // LLVM: call half @llvm.vector.reduce.fmin.v32f16(<32 x half> %{{.*}})
+
+  // OGCG-LABEL: test_mm512_reduce_min_ph
+  // OGCG: call nnan {{.*}}half @llvm.vector.reduce.fmin.v32f16(<32 x half> %{{.*}})
+  return _mm512_reduce_min_ph(__W);
+}
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/avx512vlfp16-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/avx512vlfp16-builtins.c
@@ -0,0 +1,129 @@
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512vl -target-feature +avx512fp16 -fclangir -emit-cir -o %t.cir -Wall -Werror
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512vl -target-feature +avx512fp16 -fclangir -emit-llvm -o %t.ll  -Wall -Werror
+// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512vl -target-feature +avx512fp16 -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=OGCG
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512vl -target-feature +avx512fp16 -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=OGCG
+#include <immintrin.h>
+
+_Float16 test_mm256_reduce_add_ph(__m256h __W) {
+  // CIR-LABEL: _mm256_reduce_add_ph
+  // CIR: cir.call_llvm_intrinsic "vector.reduce.fadd" %[[R:.*]], %[[V:.*]] : (!cir.f16, !cir.vector<16 x !cir.f16>) -> !cir.f16
+
+  // CIR-LABEL: test_mm256_reduce_add_ph
+  // CIR: cir.call @_mm256_reduce_add_ph(%[[VEC:.*]]) : (!cir.vector<16 x !cir.f16>) -> !cir.f16
+
+  // LLVM-LABEL: test_mm256_reduce_add_ph
+  // LLVM: call half @llvm.vector.reduce.fadd.v16f16(half 0xH8000, <16 x half> %{{.*}})
+
+  // OGCG-LABEL: test_mm256_reduce_add_ph
+  // OGCG: call reassoc {{.*}}@llvm.vector.reduce.fadd.v16f16(half 0xH8000, <16 x half> %{{.*}})
+  return _mm256_reduce_add_ph(__W);
+}
+
+_Float16 test_mm256_reduce_mul_ph(__m256h __W) {
+  // CIR-LABEL: _mm256_reduce_mul_ph
+  // CIR: cir.call_llvm_intrinsic "vector.reduce.fmul" %[[R:.*]], %[[V:.*]] : (!cir.f16, !cir.vector<16 x !cir.f16>) -> !cir.f16
+
+  // CIR-LABEL: test_mm256_reduce_mul_ph
+  // CIR: cir.call @_mm256_reduce_mul_ph(%[[VEC:.*]]) : (!cir.vector<16 x !cir.f16>) -> !cir.f16
+
+  // LLVM-LABEL: test_mm256_reduce_mul_ph
+  // LLVM: call half @llvm.vector.reduce.fmul.v16f16(half 0xH3C00, <16 x half> %{{.*}})
+
+  // OGCG-LABEL: test_mm256_reduce_mul_ph
+  // OGCG: call reassoc {{.*}}@llvm.vector.reduce.fmul.v16f16(half 0xH3C00, <16 x half> %{{.*}})
+  return _mm256_reduce_mul_ph(__W);
+}
+
+_Float16 test_mm256_reduce_max_ph(__m256h __W) {
+  // CIR-LABEL: _mm256_reduce_max_ph
+  // CIR: cir.call_llvm_intrinsic "vector.reduce.fmax" %[[V:.*]] (!cir.vector<16 x !cir.f16>) -> !cir.f16 
+
+  // CIR-LABEL: test_mm256_reduce_max_ph
+  // CIR: cir.call @_mm256_reduce_max_ph(%[[VEC:.*]]) : (!cir.vector<16 x !cir.f16>) -> !cir.f16
+
+  // LLVM-LABEL: test_mm256_reduce_max_ph
+  // LLVM: call half @llvm.vector.reduce.fmax.v16f16(<16 x half> %{{.*}})
+
+  // OGCG-LABEL: test_mm256_reduce_max_ph
+  // OGCG: call nnan {{.*}}@llvm.vector.reduce.fmax.v16f16(<16 x half> %{{.*}})
+  return _mm256_reduce_max_ph(__W);
+}
+
+_Float16 test_mm256_reduce_min_ph(__m256h __W) {
+  // CIR-LABEL: _mm256_reduce_min_ph
+  // CIR: cir.call_llvm_intrinsic "vector.reduce.fmin" %[[V:.*]] : (!cir.vector<16 x !cir.f16>) -> !cir.f16
+
+  // CIR-LABEL: test_mm256_reduce_min_ph
+  // CIR: cir.call @_mm256_reduce_min_ph(%[[VEC:.*]]) : (!cir.vector<16 x !cir.f16>) -> !cir.f16
+
+  // LLVM-LABEL: test_mm256_reduce_min_ph
+  // LLVM: call half @llvm.vector.reduce.fmin.v16f16(<16 x half> %{{.*}})
+
+  // OGCG-LABEL: test_mm256_reduce_min_ph
+  // OGCG: call nnan {{.*}}@llvm.vector.reduce.fmin.v16f16(<16 x half> %{{.*}})
+  return _mm256_reduce_min_ph(__W);
+}
+
+_Float16 test_mm_reduce_add_ph(__m128h __W) {
+  // CIR-LABEL: _mm_reduce_add_ph
+  // CIR: cir.call_llvm_intrinsic "vector.reduce.fadd" %[[R:.*]], %[[V:.*]] : (!cir.f16, !cir.vector<8 x !cir.f16>) -> !cir.f16
+
+  // CIR-LABEL: test_mm_reduce_add_ph
+  // CIR: cir.call @_mm_reduce_add_ph(%[[VEC:.*]]) : (!cir.vector<8 x !cir.f16>) -> !cir.f16
+
+  // LLVM-LABEL: test_mm_reduce_add_ph
+  // LLVM: call half @llvm.vector.reduce.fadd.v8f16(half 0xH8000, <8 x half> %{{.*}})
+
+  // OGCG-LABEL: test_mm_reduce_add_ph
+  // OGCG: call reassoc {{.*}}@llvm.vector.reduce.fadd.v8f16(half 0xH8000, <8 x half> %{{.*}})
+  return _mm_reduce_add_ph(__W);
+}
+
+_Float16 test_mm_reduce_mul_ph(__m128h __W) {
+  // CIR-LABEL: _mm_reduce_mul_ph
+  // CIR: cir.call_llvm_intrinsic "vector.reduce.fmul" %[[R:.*]], %[[V:.*]] : (!cir.f16, !cir.vector<8 x !cir.f16>) -> !cir.f16
+
+  // CIR-LABEL: test_mm_reduce_mul_ph
+  // CIR: cir.call @_mm_reduce_mul_ph(%[[VEC:.*]]) : (!cir.vector<8 x !cir.f16>) -> !cir.f16
+
+  // LLVM-LABEL: test_mm_reduce_mul_ph
+  // LLVM: call half @llvm.vector.reduce.fmul.v8f16(half 0xH3C00, <8 x half> %{{.*}})
+
+  // OGCG-LABEL: test_mm_reduce_mul_ph
+  // OGCG: call reassoc {{.*}}@llvm.vector.reduce.fmul.v8f16(half 0xH3C00, <8 x half> %{{.*}})
+  return _mm_reduce_mul_ph(__W);
+}
+
+_Float16 test_mm_reduce_max_ph(__m128h __W) {
+  // CIR-LABEL: _mm_reduce_max_ph
+  // CIR: cir.call_llvm_intrinsic "vector.reduce.fmax" %[[V:.*]] (!cir.vector<8 x !cir.f16>) -> !cir.f16 
+
+  // CIR-LABEL: test_mm_reduce_max_ph
+  // CIR: cir.call @_mm_reduce_max_ph(%[[VEC:.*]]) : (!cir.vector<8 x !cir.f16>) -> !cir.f16
+
+  // LLVM-LABEL: test_mm_reduce_max_ph
+  // LLVM: call half @llvm.vector.reduce.fmax.v8f16(<8 x half> %{{.*}})
+
+  // OGCG-LABEL: test_mm_reduce_max_ph
+  // OGCG: call nnan {{.*}}@llvm.vector.reduce.fmax.v8f16(<8 x half> %{{.*}})
+  return _mm_reduce_max_ph(__W);
+}
+
+_Float16 test_mm_reduce_min_ph(__m128h __W) {
+  // CIR-LABEL: _mm_reduce_min_ph
+  // CIR: cir.call_llvm_intrinsic "vector.reduce.fmin" %[[V:.*]] : (!cir.vector<8 x !cir.f16>) -> !cir.f16
+
+  // CIR-LABEL: test_mm_reduce_min_ph
+  // CIR: cir.call @_mm_reduce_min_ph(%[[VEC:.*]]) : (!cir.vector<8 x !cir.f16>) -> !cir.f16
+
+  // LLVM-LABEL: test_mm_reduce_min_ph
+  // LLVM: call half @llvm.vector.reduce.fmin.v8f16(<8 x half> %{{.*}})
+
+  // OGCG-LABEL: test_mm_reduce_min_ph
+  // OGCG: call nnan {{.*}}@llvm.vector.reduce.fmin.v8f16(<8 x half> %{{.*}})
+  return _mm_reduce_min_ph(__W);
+}
+