From 4e2532e5b6f0f9d42bf2e96d30210bb1acd10a64 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Thu, 14 Sep 2023 11:15:26 +0300
Subject: [PATCH] clang/AMDGPU: Fix accidental behavior change for
 __builtin_amdgcn_ldexph

eece6ba283bd763e6d7109ae9e155e81cfee0651 swapped this to use the generic intrinsic
which doesn't have the same behavior for exponent values outside of the range of
short. The instruction will implicitly truncate, but the generic instruction clamps
the value. The builtin should probably have used a short argument type to begin with.
---
 clang/lib/CodeGen/CGBuiltin.cpp                | 12 ++++++++++--
 clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl |  3 ++-
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 037a2f9f7b153..52868ca260290 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -17392,14 +17392,22 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_log_clampf:
     return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_log_clamp);
   case AMDGPU::BI__builtin_amdgcn_ldexp:
-  case AMDGPU::BI__builtin_amdgcn_ldexpf:
-  case AMDGPU::BI__builtin_amdgcn_ldexph: {
+  case AMDGPU::BI__builtin_amdgcn_ldexpf: {
     llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
     llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
     llvm::Function *F =
         CGM.getIntrinsic(Intrinsic::ldexp, {Src0->getType(), Src1->getType()});
     return Builder.CreateCall(F, {Src0, Src1});
   }
+  case AMDGPU::BI__builtin_amdgcn_ldexph: {
+    // The raw instruction has a different behavior for out of bounds exponent
+    // values (implicit truncation instead of saturate to short_min/short_max).
+    llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
+    llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
+    llvm::Function *F =
+        CGM.getIntrinsic(Intrinsic::ldexp, {Src0->getType(), Int16Ty});
+    return Builder.CreateCall(F, {Src0, Builder.CreateTrunc(Src1, Int16Ty)});
+  }
   case AMDGPU::BI__builtin_amdgcn_frexp_mant:
   case AMDGPU::BI__builtin_amdgcn_frexp_mantf:
   case AMDGPU::BI__builtin_amdgcn_frexp_manth:
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
index 756f90b282a9a..ff8618a5c727e 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
@@ -52,7 +52,8 @@ void test_cos_f16(global half* out, half a)
 }
 
 // CHECK-LABEL: @test_ldexp_f16
-// CHECK: call half @llvm.ldexp.f16.i32
+// CHECK: [[TRUNC:%[0-9a-z]+]] = trunc i32
+// CHECK: call half @llvm.ldexp.f16.i16(half %a, i16 [[TRUNC]])
 void test_ldexp_f16(global half* out, half a, int b)
 {
   *out = __builtin_amdgcn_ldexph(a, b);