From a63e05d2e090edf7834fb62296bccd071a8e38b8 Mon Sep 17 00:00:00 2001
From: Ashley Coleman <ascoleman@microsoft.com>
Date: Thu, 14 Nov 2024 11:53:39 -0700
Subject: [PATCH 01/17] [HLSL] Implement elementwise firstbitlow builtin

---
 clang/include/clang/Basic/Builtins.td         |   6 +
 clang/lib/CodeGen/CGBuiltin.cpp               |   9 +-
 clang/lib/CodeGen/CGHLSLRuntime.h             |   1 +
 clang/lib/Headers/hlsl/hlsl_intrinsics.h      |  72 ++++++++
 clang/lib/Sema/SemaHLSL.cpp                   |   3 +-
 .../CodeGenHLSL/builtins/firstbitlow.hlsl     | 153 ++++++++++++++++
 .../BuiltIns/firstbithigh-errors.hlsl         |   6 +-
 .../SemaHLSL/BuiltIns/firstbitlow-errors.hlsl |  26 +++
 llvm/include/llvm/IR/IntrinsicsDirectX.td     |   1 +
 llvm/include/llvm/IR/IntrinsicsSPIRV.td       |   1 +
 llvm/lib/Target/DirectX/DXIL.td               |  13 ++
 .../DirectX/DirectXTargetTransformInfo.cpp    |   1 +
 .../Target/SPIRV/SPIRVInstructionSelector.cpp | 169 ++++++++++++++++++
 llvm/test/CodeGen/DirectX/firstbitlow.ll      |  47 +++++
 .../test/CodeGen/DirectX/firstbitlow_error.ll |  10 ++
 .../SPIRV/hlsl-intrinsics/firstbitlow.ll      | 104 +++++++++++
 16 files changed, 616 insertions(+), 6 deletions(-)
 create mode 100644 clang/test/CodeGenHLSL/builtins/firstbitlow.hlsl
 create mode 100644 clang/test/SemaHLSL/BuiltIns/firstbitlow-errors.hlsl
 create mode 100644 llvm/test/CodeGen/DirectX/firstbitlow.ll
 create mode 100644 llvm/test/CodeGen/DirectX/firstbitlow_error.ll
 create mode 100644 llvm/test/CodeGen/SPIRV/hlsl-intrinsics/firstbitlow.ll
diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
index 32a09e2ceb385..a4fb671e47930 100644
--- a/clang/include/clang/Basic/Builtins.td
+++ b/clang/include/clang/Basic/Builtins.td
@@ -4834,6 +4834,12 @@ def HLSLFirstBitHigh : LangBuiltin<"HLSL_LANG"> {
   let Prototype = "void(...)";
 }
 
+def HLSLFirstBitLow : LangBuiltin<"HLSL_LANG"> {
+  let Spellings = ["__builtin_hlsl_elementwise_firstbitlow"];
+  let Attributes = [NoThrow, Const];
+  let Prototype = "void(...)";
+}
+
 def HLSLFrac : LangBuiltin<"HLSL_LANG"> {
   let Spellings = ["__builtin_hlsl_elementwise_frac"];
   let Attributes = [NoThrow, Const];
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index c2e983eebebc1..cbd4c931b05b0 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -19255,7 +19255,6 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
         "hlsl.dot4add.u8packed");
   }
   case Builtin::BI__builtin_hlsl_elementwise_firstbithigh: {
-
     Value *X = EmitScalarExpr(E->getArg(0));
 
     return Builder.CreateIntrinsic(
@@ -19263,6 +19262,14 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID,
         getFirstBitHighIntrinsic(CGM.getHLSLRuntime(), E->getArg(0)->getType()),
         ArrayRef<Value *>{X}, nullptr, "hlsl.firstbithigh");
   }
+  case Builtin::BI__builtin_hlsl_elementwise_firstbitlow: {
+    Value *X = EmitScalarExpr(E->getArg(0));
+
+    return Builder.CreateIntrinsic(
+        /*ReturnType=*/ConvertType(E->getType()),
+        CGM.getHLSLRuntime().getFirstBitLowIntrinsic(), ArrayRef<Value *>{X},
+        nullptr, "hlsl.firstbitlow");
+  }
   case Builtin::BI__builtin_hlsl_lerp: {
     Value *X = EmitScalarExpr(E->getArg(0));
     Value *Y = EmitScalarExpr(E->getArg(1));
diff --git a/clang/lib/CodeGen/CGHLSLRuntime.h b/clang/lib/CodeGen/CGHLSLRuntime.h
index bb120c8b5e9e6..df285e185173d 100644
--- a/clang/lib/CodeGen/CGHLSLRuntime.h
+++ b/clang/lib/CodeGen/CGHLSLRuntime.h
@@ -97,6 +97,7 @@ class CGHLSLRuntime {
   GENERATE_HLSL_INTRINSIC_FUNCTION(WaveReadLaneAt, wave_readlane)
   GENERATE_HLSL_INTRINSIC_FUNCTION(FirstBitUHigh, firstbituhigh)
   GENERATE_HLSL_INTRINSIC_FUNCTION(FirstBitSHigh, firstbitshigh)
+  GENERATE_HLSL_INTRINSIC_FUNCTION(FirstBitLow, firstbitlow)
   GENERATE_HLSL_INTRINSIC_FUNCTION(NClamp, nclamp)
   GENERATE_HLSL_INTRINSIC_FUNCTION(SClamp, sclamp)
   GENERATE_HLSL_INTRINSIC_FUNCTION(UClamp, uclamp)
diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
index 1126e13600f8a..c132c300da27a 100644
--- a/clang/lib/Headers/hlsl/hlsl_intrinsics.h
+++ b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
@@ -1121,6 +1121,78 @@ uint3 firstbithigh(uint64_t3);
 _HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbithigh)
 uint4 firstbithigh(uint64_t4);
 
+//===----------------------------------------------------------------------===//
+// firstbitlow builtins
+//===----------------------------------------------------------------------===//
+
+/// \fn T firstbitlow(T Val)
+/// \brief Returns the location of the first set bit starting from the lowest
+/// order bit and working upward, per component.
+/// \param Val the input value.
+
+#ifdef __HLSL_ENABLE_16_BIT
+_HLSL_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbitlow)
+uint firstbitlow(int16_t);
+_HLSL_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbitlow)
+uint2 firstbitlow(int16_t2);
+_HLSL_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbitlow)
+uint3 firstbitlow(int16_t3);
+_HLSL_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbitlow)
+uint4 firstbitlow(int16_t4);
+_HLSL_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbitlow)
+uint firstbitlow(uint16_t);
+_HLSL_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbitlow)
+uint2 firstbitlow(uint16_t2);
+_HLSL_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbitlow)
+uint3 firstbitlow(uint16_t3);
+_HLSL_AVAILABILITY(shadermodel, 6.2)
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbitlow)
+uint4 firstbitlow(uint16_t4);
+#endif
+
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbitlow)
+uint firstbitlow(int);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbitlow)
+uint2 firstbitlow(int2);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbitlow)
+uint3 firstbitlow(int3);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbitlow)
+uint4 firstbitlow(int4);
+
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbitlow)
+uint firstbitlow(uint);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbitlow)
+uint2 firstbitlow(uint2);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbitlow)
+uint3 firstbitlow(uint3);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbitlow)
+uint4 firstbitlow(uint4);
+
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbitlow)
+uint firstbitlow(int64_t);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbitlow)
+uint2 firstbitlow(int64_t2);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbitlow)
+uint3 firstbitlow(int64_t3);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbitlow)
+uint4 firstbitlow(int64_t4);
+
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbitlow)
+uint firstbitlow(uint64_t);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbitlow)
+uint2 firstbitlow(uint64_t2);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbitlow)
+uint3 firstbitlow(uint64_t3);
+_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_firstbitlow)
+uint4 firstbitlow(uint64_t4);
+
 //===----------------------------------------------------------------------===//
 // floor builtins
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index 88db3e1254119..bf74c62aa8f50 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -2014,7 +2014,8 @@ bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
       return true;
     break;
   }
-  case Builtin::BI__builtin_hlsl_elementwise_firstbithigh: {
+  case Builtin::BI__builtin_hlsl_elementwise_firstbithigh:
+  case Builtin::BI__builtin_hlsl_elementwise_firstbitlow: {
     if (SemaRef.PrepareBuiltinElementwiseMathOneArgCall(TheCall))
       return true;
 
diff --git a/clang/test/CodeGenHLSL/builtins/firstbitlow.hlsl b/clang/test/CodeGenHLSL/builtins/firstbitlow.hlsl
new file mode 100644
index 0000000000000..5d490fabc5bc8
--- /dev/null
+++ b/clang/test/CodeGenHLSL/builtins/firstbitlow.hlsl
@@ -0,0 +1,153 @@
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s -DTARGET=dx
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN:   spirv-unknown-vulkan-compute %s -fnative-half-type \
+// RUN: -emit-llvm -disable-llvm-passes \
+// RUN:   -o - | FileCheck %s -DTARGET=spv
+
+#ifdef __HLSL_ENABLE_16_BIT
+// CHECK-LABEL: test_firstbitlow_ushort
+// CHECK: call i32 @llvm.[[TARGET]].firstbitlow.i16
+uint test_firstbitlow_ushort(uint16_t p0) {
+  return firstbitlow(p0);
+}
+
+// CHECK-LABEL: test_firstbitlow_ushort2
+// CHECK: call <2 x i32> @llvm.[[TARGET]].firstbitlow.v2i16
+uint2 test_firstbitlow_ushort2(uint16_t2 p0) {
+  return firstbitlow(p0);
+}
+
+// CHECK-LABEL: test_firstbitlow_ushort3
+// CHECK: call <3 x i32> @llvm.[[TARGET]].firstbitlow.v3i16
+uint3 test_firstbitlow_ushort3(uint16_t3 p0) {
+  return firstbitlow(p0);
+}
+
+// CHECK-LABEL: test_firstbitlow_ushort4
+// CHECK: call <4 x i32> @llvm.[[TARGET]].firstbitlow.v4i16
+uint4 test_firstbitlow_ushort4(uint16_t4 p0) {
+  return firstbitlow(p0);
+}
+
+// CHECK-LABEL: test_firstbitlow_short
+// CHECK: call i32 @llvm.[[TARGET]].firstbitlow.i16
+uint test_firstbitlow_short(int16_t p0) {
+  return firstbitlow(p0);
+}
+
+// CHECK-LABEL: test_firstbitlow_short2
+// CHECK: call <2 x i32> @llvm.[[TARGET]].firstbitlow.v2i16
+uint2 test_firstbitlow_short2(int16_t2 p0) {
+  return firstbitlow(p0);
+}
+
+// CHECK-LABEL: test_firstbitlow_short3
+// CHECK: call <3 x i32> @llvm.[[TARGET]].firstbitlow.v3i16
+uint3 test_firstbitlow_short3(int16_t3 p0) {
+  return firstbitlow(p0);
+}
+
+// CHECK-LABEL: test_firstbitlow_short4
+// CHECK: call <4 x i32> @llvm.[[TARGET]].firstbitlow.v4i16
+uint4 test_firstbitlow_short4(int16_t4 p0) {
+  return firstbitlow(p0);
+}
+#endif // __HLSL_ENABLE_16_BIT
+
+// CHECK-LABEL: test_firstbitlow_uint
+// CHECK: call i32 @llvm.[[TARGET]].firstbitlow.i32
+uint test_firstbitlow_uint(uint p0) {
+  return firstbitlow(p0);
+}
+
+// CHECK-LABEL: test_firstbitlow_uint2
+// CHECK: call <2 x i32> @llvm.[[TARGET]].firstbitlow.v2i32
+uint2 test_firstbitlow_uint2(uint2 p0) {
+  return firstbitlow(p0);
+}
+
+// CHECK-LABEL: test_firstbitlow_uint3
+// CHECK: call <3 x i32> @llvm.[[TARGET]].firstbitlow.v3i32
+uint3 test_firstbitlow_uint3(uint3 p0) {
+  return firstbitlow(p0);
+}
+
+// CHECK-LABEL: test_firstbitlow_uint4
+// CHECK: call <4 x i32> @llvm.[[TARGET]].firstbitlow.v4i32
+uint4 test_firstbitlow_uint4(uint4 p0) {
+  return firstbitlow(p0);
+}
+
+// CHECK-LABEL: test_firstbitlow_ulong
+// CHECK: call i32 @llvm.[[TARGET]].firstbitlow.i64
+uint test_firstbitlow_ulong(uint64_t p0) {
+  return firstbitlow(p0);
+}
+
+// CHECK-LABEL: test_firstbitlow_ulong2
+// CHECK: call <2 x i32> @llvm.[[TARGET]].firstbitlow.v2i64
+uint2 test_firstbitlow_ulong2(uint64_t2 p0) {
+  return firstbitlow(p0);
+}
+
+// CHECK-LABEL: test_firstbitlow_ulong3
+// CHECK: call <3 x i32> @llvm.[[TARGET]].firstbitlow.v3i64
+uint3 test_firstbitlow_ulong3(uint64_t3 p0) {
+  return firstbitlow(p0);
+}
+
+// CHECK-LABEL: test_firstbitlow_ulong4
+// CHECK: call <4 x i32> @llvm.[[TARGET]].firstbitlow.v4i64
+uint4 test_firstbitlow_ulong4(uint64_t4 p0) {
+  return firstbitlow(p0);
+}
+
+// CHECK-LABEL: test_firstbitlow_int
+// CHECK: call i32 @llvm.[[TARGET]].firstbitlow.i32
+uint test_firstbitlow_int(int p0) {
+  return firstbitlow(p0);
+}
+
+// CHECK-LABEL: test_firstbitlow_int2
+// CHECK: call <2 x i32> @llvm.[[TARGET]].firstbitlow.v2i32
+uint2 test_firstbitlow_int2(int2 p0) {
+  return firstbitlow(p0);
+}
+
+// CHECK-LABEL: test_firstbitlow_int3
+// CHECK: call <3 x i32> @llvm.[[TARGET]].firstbitlow.v3i32
+uint3 test_firstbitlow_int3(int3 p0) {
+  return firstbitlow(p0);
+}
+
+// CHECK-LABEL: test_firstbitlow_int4
+// CHECK: call <4 x i32> @llvm.[[TARGET]].firstbitlow.v4i32
+uint4 test_firstbitlow_int4(int4 p0) {
+  return firstbitlow(p0);
+}
+
+// CHECK-LABEL: test_firstbitlow_long
+// CHECK: call i32 @llvm.[[TARGET]].firstbitlow.i64
+uint test_firstbitlow_long(int64_t p0) {
+  return firstbitlow(p0);
+}
+
+// CHECK-LABEL: test_firstbitlow_long2
+// CHECK: call <2 x i32> @llvm.[[TARGET]].firstbitlow.v2i64
+uint2 test_firstbitlow_long2(int64_t2 p0) {
+  return firstbitlow(p0);
+}
+
+// CHECK-LABEL: test_firstbitlow_long3
+// CHECK: call <3 x i32> @llvm.[[TARGET]].firstbitlow.v3i64
+uint3 test_firstbitlow_long3(int64_t3 p0) {
+  return firstbitlow(p0);
+}
+
+// CHECK-LABEL: test_firstbitlow_long4
+// CHECK: call <4 x i32> @llvm.[[TARGET]].firstbitlow.v4i64
+uint4 test_firstbitlow_long4(int64_t4 p0) {
+  return firstbitlow(p0);
+}
diff --git a/clang/test/SemaHLSL/BuiltIns/firstbithigh-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/firstbithigh-errors.hlsl
index 1912ab3ae806b..b4024418dbba4 100644
--- a/clang/test/SemaHLSL/BuiltIns/firstbithigh-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/firstbithigh-errors.hlsl
@@ -17,12 +17,10 @@ double test_int_builtin(double p0) {
 
 double2 test_int_builtin_2(double2 p0) {
   return __builtin_hlsl_elementwise_firstbithigh(p0);
-  // expected-error@-1 {{1st argument must be a vector of integers
-  // (was 'double2' (aka 'vector<double, 2>'))}}
+  // expected-error@-1 {{1st argument must be a vector of integers (was 'double2' (aka 'vector<double, 2>'))}}
 }
 
 float test_int_builtin_3(float p0) {
   return __builtin_hlsl_elementwise_firstbithigh(p0);
-  // expected-error@-1 {{1st argument must be a vector of integers
-  // (was 'float')}}
+  // expected-error@-1 {{1st argument must be a vector of integers (was 'double')}}
 }
diff --git a/clang/test/SemaHLSL/BuiltIns/firstbitlow-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/firstbitlow-errors.hlsl
new file mode 100644
index 0000000000000..95c25e9e2fb60
--- /dev/null
+++ b/clang/test/SemaHLSL/BuiltIns/firstbitlow-errors.hlsl
@@ -0,0 +1,26 @@
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -verify-ignore-unexpected
+
+int test_too_few_arg() {
+  return firstbitlow();
+  // expected-error@-1 {{no matching function for call to 'firstbitlow'}}
+}
+
+int test_too_many_arg(int p0) {
+  return firstbitlow(p0, p0);
+  // expected-error@-1 {{no matching function for call to 'firstbitlow'}}
+}
+
+double test_int_builtin(double p0) {
+  return firstbitlow(p0);
+  // expected-error@-1 {{call to 'firstbitlow' is ambiguous}}
+}
+
+double2 test_int_builtin_2(double2 p0) {
+  return __builtin_hlsl_elementwise_firstbitlow(p0);
+  // expected-error@-1 {{1st argument must be a vector of integers (was 'double2' (aka 'vector<double, 2>'))}}
+}
+
+float test_int_builtin_3(float p0) {
+  return __builtin_hlsl_elementwise_firstbitlow(p0);
+  // expected-error@-1 {{1st argument must be a vector of integers (was 'double')}}
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td
index 5696345a617fe..1a182250b610b 100644
--- a/llvm/include/llvm/IR/IntrinsicsDirectX.td
+++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td
@@ -110,6 +110,7 @@ def int_dx_radians : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>
 def int_dx_discard : DefaultAttrsIntrinsic<[], [llvm_i1_ty], []>;
 def int_dx_firstbituhigh : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>], [llvm_anyint_ty], [IntrNoMem]>;
 def int_dx_firstbitshigh : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>], [llvm_anyint_ty], [IntrNoMem]>;
+def int_dx_firstbitlow : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>], [llvm_anyint_ty], [IntrNoMem]>;
 
 def int_dx_group_memory_barrier_with_group_sync : DefaultAttrsIntrinsic<[], [], []>;
 }
diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
index 1ae3129774e50..1b8dfc416441a 100644
--- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td
+++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
@@ -109,6 +109,7 @@ let TargetPrefix = "spv" in {
 
   def int_spv_firstbituhigh : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>], [llvm_anyint_ty], [IntrNoMem]>;
   def int_spv_firstbitshigh : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>], [llvm_anyint_ty], [IntrNoMem]>;
+  def int_spv_firstbitlow : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>], [llvm_anyint_ty], [IntrNoMem]>;
 
   def int_spv_bufferUpdateCounter
       : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_any_ty, llvm_i8_ty],
diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td
index cff6cdce813de..a208ba7663a3b 100644
--- a/llvm/lib/Target/DirectX/DXIL.td
+++ b/llvm/lib/Target/DirectX/DXIL.td
@@ -618,6 +618,19 @@ def CountBits :  DXILOp<31, unaryBits> {
   let attributes = [Attributes<DXIL1_0, [ReadNone]>];
 }
 
+def FirstbitLo : DXILOp<32, unaryBits> {
+  let Doc = "Returns the location of the first set bit starting from "
+            "the lowest order bit and working upward.";
+  let LLVMIntrinsic = int_dx_firstbitlow;
+  let arguments = [OverloadTy];
+  let result = Int32Ty;
+  let overloads =
+      [Overloads<DXIL1_0, [Int16Ty, Int32Ty, Int64Ty]>];
+  let stages = [Stages<DXIL1_0, [all_stages]>];
+  // TODO: check these
+  let attributes = [Attributes<DXIL1_0, [ReadNone]>];
+}
+
 def FirstbitHi :  DXILOp<33, unaryBits> {
   let Doc = "Returns the location of the first set bit starting from "
             "the highest order bit and working downward.";
diff --git a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp
index 2ca4e23594d56..0c0d324b21cdd 100644
--- a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp
+++ b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp
@@ -45,6 +45,7 @@ bool DirectXTTIImpl::isTargetIntrinsicTriviallyScalarizable(
   case Intrinsic::dx_splitdouble:
   case Intrinsic::dx_firstbituhigh:
   case Intrinsic::dx_firstbitshigh:
+  case Intrinsic::dx_firstbitlow:
     return true;
   default:
     return false;
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index 3a98b74b3d675..fe8879a699104 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -106,6 +106,18 @@ class SPIRVInstructionSelector : public InstructionSelector {
   bool selectFirstBitHigh64(Register ResVReg, const SPIRVType *ResType,
                             MachineInstr &I, bool IsSigned) const;
 
+  bool selectFirstBitLow(Register ResVReg, const SPIRVType *ResType,
+                         MachineInstr &I) const;
+
+  bool selectFirstBitLow16(Register ResVReg, const SPIRVType *ResType,
+                           MachineInstr &I) const;
+
+  bool selectFirstBitLow32(Register ResVReg, const SPIRVType *ResType,
+                           MachineInstr &I, Register SrcReg) const;
+
+  bool selectFirstBitLow64(Register ResVReg, const SPIRVType *ResType,
+                           MachineInstr &I) const;
+
   bool selectGlobalValue(Register ResVReg, MachineInstr &I,
                          const MachineInstr *Init = nullptr) const;
 
@@ -2895,6 +2907,9 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg,
     return selectFirstBitHigh(ResVReg, ResType, I, /*IsSigned=*/false);
   case Intrinsic::spv_firstbitshigh: // There is no CL equivalent of FindSMsb
     return selectFirstBitHigh(ResVReg, ResType, I, /*IsSigned=*/true);
+  case Intrinsic::spv_firstbitlow: // There is no CL equivalent of FindILsb
+                                   // (TODO: confirm)
+    return selectFirstBitLow(ResVReg, ResType, I);
   case Intrinsic::spv_group_memory_barrier_with_group_sync: {
     bool Result = true;
     auto MemSemConstant =
@@ -3292,6 +3307,160 @@ bool SPIRVInstructionSelector::selectFirstBitHigh(Register ResVReg,
   }
 }
 
+bool SPIRVInstructionSelector::selectFirstBitLow16(Register ResVReg,
+                                                   const SPIRVType *ResType,
+                                                   MachineInstr &I) const {
+  // Widen to ResType with OpUConvert before the 32 bit lowering. The zero
+  // extension keeps every original bit in place, so the lowest set bit is kept.
+  Register WideReg = MRI->createVirtualRegister(GR.getRegClass(ResType));
+  if (!selectNAryOpWithSrcs(WideReg, ResType, I, {I.getOperand(2).getReg()},
+                            SPIRV::OpUConvert))
+    return false;
+  return selectFirstBitLow32(ResVReg, ResType, I, WideReg);
+}
+
+bool SPIRVInstructionSelector::selectFirstBitLow32(Register ResVReg,
+                                                   const SPIRVType *ResType,
+                                                   MachineInstr &I,
+                                                   Register SrcReg) const {
+  // Emit FindILsb from the GLSL.std.450 extended instruction set on SrcReg.
+  const auto GLSLSet =
+      static_cast<uint32_t>(SPIRV::InstructionSet::GLSL_std_450);
+  auto MIB =
+      BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpExtInst));
+  MIB.addDef(ResVReg).addUse(GR.getSPIRVTypeID(ResType)).addImm(GLSLSet);
+  return MIB.addImm(GL::FindILsb).addUse(SrcReg).constrainAllUses(TII, TRI, RBI);
+}
+
+// Lowers firstbitlow for 64 bit integers. FindILsb only accepts 32 bit values,
+// so it is applied to both 32 bit words and the two answers are merged.
+bool SPIRVInstructionSelector::selectFirstBitLow64(Register ResVReg,
+                                                   const SPIRVType *ResType,
+                                                   MachineInstr &I) const {
+  Register OpReg = I.getOperand(2).getReg();
+
+  // 1. Split int64 into 2 pieces using a bitcast
+  unsigned ComponentCount = GR.getScalarOrVectorComponentCount(ResType);
+  SPIRVType *BaseType = GR.retrieveScalarOrVectorIntType(ResType);
+  MachineIRBuilder MIRBuilder(I);
+  SPIRVType *PostCastType =
+      GR.getOrCreateSPIRVVectorType(BaseType, 2 * ComponentCount, MIRBuilder);
+  Register BitcastReg = MRI->createVirtualRegister(GR.getRegClass(PostCastType));
+  bool Result =
+      selectUnOpWithSrc(BitcastReg, PostCastType, I, OpReg, SPIRV::OpBitcast);
+
+  // 2. Find the first set bit from the LSB side for all the pieces in #1
+  Register FBLReg = MRI->createVirtualRegister(GR.getRegClass(PostCastType));
+  Result = Result && selectFirstBitLow32(FBLReg, PostCastType, I, BitcastReg);
+
+  // 3. Split result vector into low and high bits. OpBitcast maps the lower
+  // ordered bits of an int64 to the lower numbered component: even = low.
+  Register HighReg = MRI->createVirtualRegister(GR.getRegClass(ResType));
+  Register LowReg = MRI->createVirtualRegister(GR.getRegClass(ResType));
+  bool ZeroAsNull = STI.isOpenCLEnv();
+  bool IsScalarRes = ResType->getOpcode() != SPIRV::OpTypeVector;
+  if (IsScalarRes) {
+    // if scalar do a vector extract
+    Result = Result && selectNAryOpWithSrcs(
+        LowReg, ResType, I,
+        {FBLReg, GR.getOrCreateConstInt(0, I, ResType, TII, ZeroAsNull)},
+        SPIRV::OpVectorExtractDynamic);
+    Result = Result && selectNAryOpWithSrcs(
+        HighReg, ResType, I,
+        {FBLReg, GR.getOrCreateConstInt(1, I, ResType, TII, ZeroAsNull)},
+        SPIRV::OpVectorExtractDynamic);
+  } else {
+    // if vector do a shufflevector
+    auto MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(),
+                       TII.get(SPIRV::OpVectorShuffle))
+                   .addDef(LowReg)
+                   .addUse(GR.getSPIRVTypeID(ResType))
+                   .addUse(FBLReg)
+                   // Per the spec, repeat the vector if only one vec is needed
+                   .addUse(FBLReg);
+    // low bits are stored in even indexes. Extract them from FBLReg
+    for (unsigned j = 0; j < ComponentCount * 2; j += 2)
+      MIB.addImm(j);
+    Result = Result && MIB.constrainAllUses(TII, TRI, RBI);
+
+    MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(),
+                  TII.get(SPIRV::OpVectorShuffle))
+              .addDef(HighReg)
+              .addUse(GR.getSPIRVTypeID(ResType))
+              .addUse(FBLReg)
+              .addUse(FBLReg);
+    // high bits are stored in odd indexes. Extract them from FBLReg
+    for (unsigned j = 1; j < ComponentCount * 2; j += 2)
+      MIB.addImm(j);
+    Result = Result && MIB.constrainAllUses(TII, TRI, RBI);
+  }
+
+  // 4. Check if result of each bottom 32 bits is == -1, i.e. FindILsb found
+  // no set bit there; only then does the high word decide the answer.
+  SPIRVType *BoolType = GR.getOrCreateSPIRVBoolType(I, TII);
+  Register NegOneReg;
+  Register Reg0;
+  Register Reg32;
+  unsigned SelectOp;
+  unsigned OrOp;
+
+  if (IsScalarRes) {
+    NegOneReg =
+        GR.getOrCreateConstInt((unsigned)-1, I, ResType, TII, ZeroAsNull);
+    Reg0 = GR.getOrCreateConstInt(0, I, ResType, TII, ZeroAsNull);
+    Reg32 = GR.getOrCreateConstInt(32, I, ResType, TII, ZeroAsNull);
+    SelectOp = SPIRV::OpSelectSISCond;
+    OrOp = SPIRV::OpBitwiseOrS;
+  } else {
+    BoolType = GR.getOrCreateSPIRVVectorType(BoolType, ComponentCount, MIRBuilder);
+    NegOneReg =
+        GR.getOrCreateConstVector((unsigned)-1, I, ResType, TII, ZeroAsNull);
+    Reg0 = GR.getOrCreateConstVector(0, I, ResType, TII, ZeroAsNull);
+    Reg32 = GR.getOrCreateConstVector(32, I, ResType, TII, ZeroAsNull);
+    SelectOp = SPIRV::OpSelectVIVCond;
+    OrOp = SPIRV::OpBitwiseOrV;
+  }
+
+  // Check if the low bits are == -1; true if -1
+  Register BReg = MRI->createVirtualRegister(GR.getRegClass(BoolType));
+  Result = Result && selectNAryOpWithSrcs(BReg, BoolType, I, {LowReg, NegOneReg},
+                                 SPIRV::OpIEqual);
+
+  // Select high bits if true in BReg, otherwise low bits
+  Register TmpReg = MRI->createVirtualRegister(GR.getRegClass(ResType));
+  Result = Result && selectNAryOpWithSrcs(TmpReg, ResType, I, {BReg, HighReg, LowReg},
+                                 SelectOp);
+
+  // Offset of the selected word: 32 for high bits, 0 for low bits
+  Register ValReg = MRI->createVirtualRegister(GR.getRegClass(ResType));
+  Result = Result &&
+      selectNAryOpWithSrcs(ValReg, ResType, I, {BReg, Reg32, Reg0}, SelectOp);
+
+  // OR the offset in instead of adding it: an index in 0..31 never has bit 5
+  // set, so OR equals add, while -1 (no bit set in either word) stays -1.
+  return Result &&
+         selectNAryOpWithSrcs(ResVReg, ResType, I, {ValReg, TmpReg}, OrOp);
+}
+
+bool SPIRVInstructionSelector::selectFirstBitLow(Register ResVReg,
+                                                 const SPIRVType *ResType,
+                                                 MachineInstr &I) const {
+  // FindILsb intrinsic only supports 32 bit integers, so pick the lowering
+  // that matches the bit width of the operand's scalar (or element) type.
+  const Register SrcReg = I.getOperand(2).getReg();
+  const unsigned Width =
+      GR.getScalarOrVectorBitWidth(GR.getSPIRVTypeForVReg(SrcReg));
+
+  if (Width == 16)
+    return selectFirstBitLow16(ResVReg, ResType, I);
+  if (Width == 32)
+    return selectFirstBitLow32(ResVReg, ResType, I, SrcReg);
+  if (Width == 64)
+    return selectFirstBitLow64(ResVReg, ResType, I);
+
+  report_fatal_error("spv_firstbitlow only supports 16,32,64 bits.");
+}
+
 bool SPIRVInstructionSelector::selectAllocaArray(Register ResVReg,
                                                  const SPIRVType *ResType,
                                                  MachineInstr &I) const {
diff --git a/llvm/test/CodeGen/DirectX/firstbitlow.ll b/llvm/test/CodeGen/DirectX/firstbitlow.ll
new file mode 100644
index 0000000000000..884ec1164fc99
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/firstbitlow.ll
@@ -0,0 +1,47 @@
+; RUN: opt -S -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s
+
+; Make sure dxil operation function calls for firstbitlow are generated for all integer types.
+
+define noundef i32 @test_firstbitlow_short(i16 noundef %a) {
+entry:
+; CHECK: call i32 @dx.op.unaryBits.i16(i32 32, i16 %{{.*}})
+  %elt.firstbitlow = call i32 @llvm.dx.firstbitlow.i16(i16 %a)
+  ret i32 %elt.firstbitlow
+}
+
+define noundef i32 @test_firstbitlow_int(i32 noundef %a) {
+entry:
+; CHECK: call i32 @dx.op.unaryBits.i32(i32 32, i32 %{{.*}})
+  %elt.firstbitlow = call i32 @llvm.dx.firstbitlow.i32(i32 %a)
+  ret i32 %elt.firstbitlow
+}
+
+define noundef i32 @test_firstbitlow_long(i64 noundef %a) {
+entry:
+; CHECK: call i32 @dx.op.unaryBits.i64(i32 32, i64 %{{.*}})
+  %elt.firstbitlow = call i32 @llvm.dx.firstbitlow.i64(i64 %a)
+  ret i32 %elt.firstbitlow
+}
+
+define noundef <4 x i32> @test_firstbitlow_vec4_i32(<4 x i32> noundef %a)  {
+entry:
+  ; CHECK: [[ee0:%.*]] = extractelement <4 x i32> %a, i64 0
+  ; CHECK: [[ie0:%.*]] = call i32 @dx.op.unaryBits.i32(i32 32, i32 [[ee0]])
+  ; CHECK: [[ee1:%.*]] = extractelement <4 x i32> %a, i64 1
+  ; CHECK: [[ie1:%.*]] = call i32 @dx.op.unaryBits.i32(i32 32, i32 [[ee1]])
+  ; CHECK: [[ee2:%.*]] = extractelement <4 x i32> %a, i64 2
+  ; CHECK: [[ie2:%.*]] = call i32 @dx.op.unaryBits.i32(i32 32, i32 [[ee2]])
+  ; CHECK: [[ee3:%.*]] = extractelement <4 x i32> %a, i64 3
+  ; CHECK: [[ie3:%.*]] = call i32 @dx.op.unaryBits.i32(i32 32, i32 [[ee3]])
+  ; CHECK: insertelement <4 x i32> poison, i32 [[ie0]], i64 0
+  ; CHECK: insertelement <4 x i32> %{{.*}}, i32 [[ie1]], i64 1
+  ; CHECK: insertelement <4 x i32> %{{.*}}, i32 [[ie2]], i64 2
+  ; CHECK: insertelement <4 x i32> %{{.*}}, i32 [[ie3]], i64 3
+  %2 = call <4 x i32> @llvm.dx.firstbitlow.v4i32(<4 x i32> %a)
+  ret <4 x i32> %2
+}
+
+declare i32 @llvm.dx.firstbitlow.i16(i16)
+declare i32 @llvm.dx.firstbitlow.i32(i32)
+declare i32 @llvm.dx.firstbitlow.i64(i64)
+declare <4 x i32> @llvm.dx.firstbitlow.v4i32(<4 x i32>)
diff --git a/llvm/test/CodeGen/DirectX/firstbitlow_error.ll b/llvm/test/CodeGen/DirectX/firstbitlow_error.ll
new file mode 100644
index 0000000000000..d8b9333067f4a
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/firstbitlow_error.ll
@@ -0,0 +1,10 @@
+; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s
+
+; DXIL operation firstbitlow does not support double overload type
+; CHECK: invalid intrinsic signature
+
+define noundef double @firstbitlow_double(double noundef %a) {
+entry:
+  %1 = call double @llvm.dx.firstbitlow.f64(double %a)
+  ret double %1
+}
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/firstbitlow.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/firstbitlow.ll
new file mode 100644
index 0000000000000..9ebd8cc511eb6
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/firstbitlow.ll
@@ -0,0 +1,104 @@
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-DAG: [[glsl_450_ext:%.+]] = OpExtInstImport "GLSL.std.450"
+; CHECK-DAG: OpMemoryModel Logical GLSL450
+; CHECK-DAG: [[u32_t:%.+]] = OpTypeInt 32 0
+; CHECK-DAG: [[u32x2_t:%.+]] = OpTypeVector [[u32_t]] 2
+; CHECK-DAG: [[u32x4_t:%.+]] = OpTypeVector [[u32_t]] 4
+; CHECK-DAG: [[const_zero:%.*]] = OpConstant [[u32_t]] 0
+; CHECK-DAG: [[const_zerox2:%.*]] = OpConstantComposite [[u32x2_t]] [[const_zero]] [[const_zero]]
+; CHECK-DAG: [[const_one:%.*]] = OpConstant [[u32_t]] 1
+; CHECK-DAG: [[const_thirty_two:%.*]] = OpConstant [[u32_t]] 32
+; CHECK-DAG: [[const_thirty_twox2:%.*]] = OpConstantComposite [[u32x2_t]] [[const_thirty_two]] [[const_thirty_two]]
+; CHECK-DAG: [[const_neg_one:%.*]] = OpConstant [[u32_t]] 4294967295
+; CHECK-DAG: [[const_neg_onex2:%.*]] = OpConstantComposite [[u32x2_t]] [[const_neg_one]] [[const_neg_one]]
+; CHECK-DAG: [[u16_t:%.+]] = OpTypeInt 16 0
+; CHECK-DAG: [[u16x2_t:%.+]] = OpTypeVector [[u16_t]] 2
+; CHECK-DAG: [[u64_t:%.+]] = OpTypeInt 64 0
+; CHECK-DAG: [[u64x2_t:%.+]] = OpTypeVector [[u64_t]] 2
+; CHECK-DAG: [[bool_t:%.+]] = OpTypeBool
+; CHECK-DAG: [[boolx2_t:%.+]] = OpTypeVector [[bool_t]] 2
+
+; CHECK-LABEL: Begin function firstbitlow_i32
+define noundef i32 @firstbitlow_i32(i32 noundef %a) {
+entry:
+; CHECK: [[a:%.+]] = OpFunctionParameter [[u32_t]]
+; CHECK: [[ret:%.+]] = OpExtInst [[u32_t]] [[glsl_450_ext]] FindILsb [[a]]
+; CHECK: OpReturnValue [[ret]]
+  %elt.firstbitlow = call i32 @llvm.spv.firstbitlow.i32(i32 %a)
+  ret i32 %elt.firstbitlow
+}
+
+; CHECK-LABEL: Begin function firstbitlow_2xi32
+define noundef <2 x i32> @firstbitlow_2xi32(<2 x i32> noundef %a) {
+entry:
+; CHECK: [[a:%.+]] = OpFunctionParameter [[u32x2_t]]
+; CHECK: [[ret:%.+]] = OpExtInst [[u32x2_t]] [[glsl_450_ext]] FindILsb [[a]]
+; CHECK: OpReturnValue [[ret]]
+  %elt.firstbitlow = call <2 x i32> @llvm.spv.firstbitlow.v2i32(<2 x i32> %a)
+  ret <2 x i32> %elt.firstbitlow
+}
+
+; CHECK-LABEL: Begin function firstbitlow_i16
+define noundef i32 @firstbitlow_i16(i16 noundef %a) {
+entry:
+; CHECK: [[a16:%.+]] = OpFunctionParameter [[u16_t]]
+; CHECK: [[a32:%.+]] = OpUConvert [[u32_t]] [[a16]]
+; CHECK: [[ret:%.+]] = OpExtInst [[u32_t]] [[glsl_450_ext]] FindILsb [[a32]]
+; CHECK: OpReturnValue [[ret]]
+  %elt.firstbitlow = call i32 @llvm.spv.firstbitlow.i16(i16 %a)
+  ret i32 %elt.firstbitlow
+}
+
+; CHECK-LABEL: Begin function firstbitlow_v2i16
+define noundef <2 x i32> @firstbitlow_v2i16(<2 x i16> noundef %a) {
+entry:
+; CHECK: [[a16:%.+]] = OpFunctionParameter [[u16x2_t]]
+; CHECK: [[a32:%.+]] = OpUConvert [[u32x2_t]] [[a16]]
+; CHECK: [[ret:%.+]] = OpExtInst [[u32x2_t]] [[glsl_450_ext]] FindILsb [[a32]]
+; CHECK: OpReturnValue [[ret]]
+  %elt.firstbitlow = call <2 x i32> @llvm.spv.firstbitlow.v2i16(<2 x i16> %a)
+  ret <2 x i32> %elt.firstbitlow
+}
+
+; CHECK-LABEL: Begin function firstbitlow_i64
+define noundef i32 @firstbitlow_i64(i64 noundef %a) {
+entry:
+; CHECK: [[a64:%.+]] = OpFunctionParameter [[u64_t]]
+; CHECK: [[a32x2:%.+]] = OpBitcast [[u32x2_t]] [[a64]]
+; CHECK: [[lsb_bits:%.+]] = OpExtInst [[u32x2_t]] [[glsl_450_ext]] FindILsb [[a32x2]]
+; CHECK: [[high_bits:%.+]] = OpVectorExtractDynamic [[u32_t]] [[lsb_bits]] [[const_zero]]
+; CHECK: [[low_bits:%.+]] = OpVectorExtractDynamic [[u32_t]] [[lsb_bits]] [[const_one]]
+; CHECK: [[should_use_high:%.+]] = OpIEqual [[bool_t]] [[low_bits]] [[const_neg_one]]
+; CHECK: [[ans_bits:%.+]] = OpSelect [[u32_t]] [[should_use_high]] [[high_bits]] [[low_bits]]
+; CHECK: [[ans_offset:%.+]] = OpSelect [[u32_t]] [[should_use_high]] [[const_thirty_two]] [[const_zero]]
+; CHECK: [[ret:%.+]] = OpIAdd [[u32_t]] [[ans_offset]] [[ans_bits]]
+; CHECK: OpReturnValue [[ret]]
+  %elt.firstbitlow = call i32 @llvm.spv.firstbitlow.i64(i64 %a)
+  ret i32 %elt.firstbitlow
+}
+
+; CHECK-LABEL: Begin function firstbitlow_v2i64
+define noundef <2 x i32> @firstbitlow_v2i64(<2 x i64> noundef %a) {
+entry:
+; CHECK: [[a64x2:%.+]] = OpFunctionParameter [[u64x2_t]]
+; CHECK: [[a32x4:%.+]] = OpBitcast [[u32x4_t]] [[a64x2]]
+; CHECK: [[lsb_bits:%.+]] = OpExtInst [[u32x4_t]] [[glsl_450_ext]] FindILsb [[a32x4]]
+; CHECK: [[high_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[lsb_bits]] [[lsb_bits]] 0 2
+; CHECK: [[low_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[lsb_bits]] [[lsb_bits]] 1 3
+; CHECK: [[should_use_high:%.+]] = OpIEqual [[boolx2_t]] [[low_bits]] [[const_neg_onex2]]
+; CHECK: [[ans_bits:%.+]] = OpSelect [[u32x2_t]] [[should_use_high]] [[high_bits]] [[low_bits]]
+; CHECK: [[ans_offset:%.+]] = OpSelect [[u32x2_t]] [[should_use_high]] [[const_thirty_twox2]] [[const_zerox2]]
+; CHECK: [[ret:%.+]] = OpIAdd [[u32x2_t]] [[ans_offset]] [[ans_bits]]
+; CHECK: OpReturnValue [[ret]]
+  %elt.firstbitlow = call <2 x i32> @llvm.spv.firstbitlow.v2i64(<2 x i64> %a)
+  ret <2 x i32> %elt.firstbitlow
+}
+
+;declare i16 @llvm.spv.firstbitlow.i16(i16)
+;declare i32 @llvm.spv.firstbitlow.i32(i32)
+;declare i64 @llvm.spv.firstbitlow.i64(i64)
+;declare i16 @llvm.spv.firstbitlow.v2i16(<2 x i16>)
+;declare i32 @llvm.spv.firstbitlow.v2i32(<2 x i32>)
+;declare i64 @llvm.spv.firstbitlow.v2i64(<2 x i64>)

From 72f1999234cfa5de5bf3e46da46225a5b1e87924 Mon Sep 17 00:00:00 2001
From: Ashley Coleman <ascoleman@microsoft.com>
Date: Tue, 19 Nov 2024 10:35:52 -0700
Subject: [PATCH 02/17] format

---
 .../Target/SPIRV/SPIRVInstructionSelector.cpp | 42 +++++++++++--------
 1 file changed, 24 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index fe8879a699104..dd00947f98549 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -3314,8 +3314,8 @@ bool SPIRVInstructionSelector::selectFirstBitLow16(Register ResVReg,
   // to an unsigned i32. As this leaves all the least significant bits unchanged
   // the first set bit from the LSB side doesn't change.
   Register ExtReg = MRI->createVirtualRegister(GR.getRegClass(ResType));
-  bool Result = selectNAryOpWithSrcs(ExtReg, ResType, I, {I.getOperand(2).getReg()},
-                                  SPIRV::OpUConvert);
+  bool Result = selectNAryOpWithSrcs(
+      ExtReg, ResType, I, {I.getOperand(2).getReg()}, SPIRV::OpUConvert);
   return Result && selectFirstBitLow32(ResVReg, ResType, I, ExtReg);
 }
 
@@ -3343,7 +3343,8 @@ bool SPIRVInstructionSelector::selectFirstBitLow64(Register ResVReg,
   MachineIRBuilder MIRBuilder(I);
   SPIRVType *PostCastType =
       GR.getOrCreateSPIRVVectorType(BaseType, 2 * ComponentCount, MIRBuilder);
-  Register BitcastReg = MRI->createVirtualRegister(GR.getRegClass(PostCastType));
+  Register BitcastReg =
+      MRI->createVirtualRegister(GR.getRegClass(PostCastType));
   bool Result =
       selectUnOpWithSrc(BitcastReg, PostCastType, I, OpReg, SPIRV::OpBitcast);
 
@@ -3359,14 +3360,18 @@ bool SPIRVInstructionSelector::selectFirstBitLow64(Register ResVReg,
   bool IsScalarRes = ResType->getOpcode() != SPIRV::OpTypeVector;
   if (IsScalarRes) {
     // if scalar do a vector extract
-    Result = Result && selectNAryOpWithSrcs(
-        HighReg, ResType, I,
-        {FBLReg, GR.getOrCreateConstInt(0, I, ResType, TII, ZeroAsNull)},
-        SPIRV::OpVectorExtractDynamic);
-    Result = Result && selectNAryOpWithSrcs(
-        LowReg, ResType, I,
-        {FBLReg, GR.getOrCreateConstInt(1, I, ResType, TII, ZeroAsNull)},
-        SPIRV::OpVectorExtractDynamic);
+    Result =
+        Result &&
+        selectNAryOpWithSrcs(
+            HighReg, ResType, I,
+            {FBLReg, GR.getOrCreateConstInt(0, I, ResType, TII, ZeroAsNull)},
+            SPIRV::OpVectorExtractDynamic);
+    Result =
+        Result &&
+        selectNAryOpWithSrcs(
+            LowReg, ResType, I,
+            {FBLReg, GR.getOrCreateConstInt(1, I, ResType, TII, ZeroAsNull)},
+            SPIRV::OpVectorExtractDynamic);
   } else {
     // if vector do a shufflevector
     auto MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(),
@@ -3414,7 +3419,8 @@ bool SPIRVInstructionSelector::selectFirstBitLow64(Register ResVReg,
     SelectOp = SPIRV::OpSelectSISCond;
     AddOp = SPIRV::OpIAddS;
   } else {
-    BoolType = GR.getOrCreateSPIRVVectorType(BoolType, ComponentCount, MIRBuilder);
+    BoolType =
+        GR.getOrCreateSPIRVVectorType(BoolType, ComponentCount, MIRBuilder);
     NegOneReg =
         GR.getOrCreateConstVector((unsigned)-1, I, ResType, TII, ZeroAsNull);
     Reg0 = GR.getOrCreateConstVector(0, I, ResType, TII, ZeroAsNull);
@@ -3425,18 +3431,18 @@ bool SPIRVInstructionSelector::selectFirstBitLow64(Register ResVReg,
 
   // Check if the low bits are == -1; true if -1
   Register BReg = MRI->createVirtualRegister(GR.getRegClass(BoolType));
-  Result = Result && selectNAryOpWithSrcs(BReg, BoolType, I, {LowReg, NegOneReg},
-                                 SPIRV::OpIEqual);
+  Result = Result && selectNAryOpWithSrcs(BReg, BoolType, I,
+                                          {LowReg, NegOneReg}, SPIRV::OpIEqual);
 
   // Select high bits if true in BReg, otherwise low bits
   Register TmpReg = MRI->createVirtualRegister(GR.getRegClass(ResType));
-  Result = Result && selectNAryOpWithSrcs(TmpReg, ResType, I, {BReg, HighReg, LowReg},
-                                 SelectOp);
+  Result = Result && selectNAryOpWithSrcs(TmpReg, ResType, I,
+                                          {BReg, HighReg, LowReg}, SelectOp);
 
   // Add 32 for high bits, 0 for low bits
   Register ValReg = MRI->createVirtualRegister(GR.getRegClass(ResType));
-  Result = Result &&
-      selectNAryOpWithSrcs(ValReg, ResType, I, {BReg, Reg32, Reg0}, SelectOp);
+  Result = Result && selectNAryOpWithSrcs(ValReg, ResType, I,
+                                          {BReg, Reg32, Reg0}, SelectOp);
 
   return Result &&
          selectNAryOpWithSrcs(ResVReg, ResType, I, {ValReg, TmpReg}, AddOp);

From 8434e6ad8590baa3848192728433a0ad9fe02f4b Mon Sep 17 00:00:00 2001
From: Ashley Coleman <ascoleman@microsoft.com>
Date: Tue, 19 Nov 2024 11:56:07 -0700
Subject: [PATCH 03/17] cleanup

---
 llvm/lib/Target/DirectX/DXIL.td               |  1 -
 .../Target/SPIRV/SPIRVInstructionSelector.cpp | 11 ++++----
 .../SPIRV/hlsl-intrinsics/firstbitlow.ll      | 26 +++++++++----------
 3 files changed, 18 insertions(+), 20 deletions(-)

diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td
index a208ba7663a3b..d6d78581bafbf 100644
--- a/llvm/lib/Target/DirectX/DXIL.td
+++ b/llvm/lib/Target/DirectX/DXIL.td
@@ -627,7 +627,6 @@ def FirstbitLo : DXILOp<32, unaryBits> {
   let overloads =
       [Overloads<DXIL1_0, [Int16Ty, Int32Ty, Int64Ty]>];
   let stages = [Stages<DXIL1_0, [all_stages]>];
-  // TODO: check these
   let attributes = [Attributes<DXIL1_0, [ReadNone]>];
 }
 
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index dd00947f98549..e1c58f8578554 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -2908,7 +2908,6 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg,
   case Intrinsic::spv_firstbitshigh: // There is no CL equivalent of FindSMsb
     return selectFirstBitHigh(ResVReg, ResType, I, /*IsSigned=*/true);
   case Intrinsic::spv_firstbitlow: // There is no CL equivlent of FindILsb
-                                   // (true?)
     return selectFirstBitLow(ResVReg, ResType, I);
   case Intrinsic::spv_group_memory_barrier_with_group_sync: {
     bool Result = true;
@@ -3382,7 +3381,7 @@ bool SPIRVInstructionSelector::selectFirstBitLow64(Register ResVReg,
                    // Per the spec, repeat the vector if only one vec is needed
                    .addUse(FBLReg);
 
-    // high bits are store in even indexes. Extract them from FBLReg
+    // high bits are stored in even indexes. Extract them from FBLReg
     for (unsigned j = 0; j < ComponentCount * 2; j += 2) {
       MIB.addImm(j);
     }
@@ -3396,14 +3395,14 @@ bool SPIRVInstructionSelector::selectFirstBitLow64(Register ResVReg,
               // Per the spec, repeat the vector if only one vec is needed
               .addUse(FBLReg);
 
-    // low bits are store in odd indexes. Extract them from FBLReg
+    // low bits are stored in odd indexes. Extract them from FBLReg
     for (unsigned j = 1; j < ComponentCount * 2; j += 2) {
       MIB.addImm(j);
     }
     Result = Result && MIB.constrainAllUses(TII, TRI, RBI);
   }
 
-  // 4. Check if result of each bottom 32 bits is == -1
+  // 4. Check the result. When low bits == -1 use high, otherwise use low
   SPIRVType *BoolType = GR.getOrCreateSPIRVBoolType(I, TII);
   Register NegOneReg;
   Register Reg0;
@@ -3429,7 +3428,7 @@ bool SPIRVInstructionSelector::selectFirstBitLow64(Register ResVReg,
     AddOp = SPIRV::OpIAddV;
   }
 
-  // Check if the low bits are == -1; true if -1
+  // Check if the low bits are == -1
   Register BReg = MRI->createVirtualRegister(GR.getRegClass(BoolType));
   Result = Result && selectNAryOpWithSrcs(BReg, BoolType, I,
                                           {LowReg, NegOneReg}, SPIRV::OpIEqual);
@@ -3439,7 +3438,7 @@ bool SPIRVInstructionSelector::selectFirstBitLow64(Register ResVReg,
   Result = Result && selectNAryOpWithSrcs(TmpReg, ResType, I,
                                           {BReg, HighReg, LowReg}, SelectOp);
 
-  // Add 32 for high bits, 0 for low bits
+  // 5. Add 32 when high bits are used, otherwise 0 for low bits
   Register ValReg = MRI->createVirtualRegister(GR.getRegClass(ResType));
   Result = Result && selectNAryOpWithSrcs(ValReg, ResType, I,
                                           {BReg, Reg32, Reg0}, SelectOp);
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/firstbitlow.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/firstbitlow.ll
index 9ebd8cc511eb6..05488479e5bd0 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/firstbitlow.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/firstbitlow.ll
@@ -6,13 +6,13 @@
 ; CHECK-DAG: [[u32_t:%.+]] = OpTypeInt 32 0
 ; CHECK-DAG: [[u32x2_t:%.+]] = OpTypeVector [[u32_t]] 2
 ; CHECK-DAG: [[u32x4_t:%.+]] = OpTypeVector [[u32_t]] 4
-; CHECK-DAG: [[const_zero:%.*]] = OpConstant [[u32_t]] 0
-; CHECK-DAG: [[const_zerox2:%.*]] = OpConstantComposite [[u32x2_t]] [[const_zero]] [[const_zero]]
-; CHECK-DAG: [[const_one:%.*]] = OpConstant [[u32_t]] 1
-; CHECK-DAG: [[const_thirty_two:%.*]] = OpConstant [[u32_t]] 32
-; CHECK-DAG: [[const_thirty_twox2:%.*]] = OpConstantComposite [[u32x2_t]] [[const_thirty_two]] [[const_thirty_two]]
-; CHECK-DAG: [[const_neg_one:%.*]] = OpConstant [[u32_t]] 4294967295
-; CHECK-DAG: [[const_neg_onex2:%.*]] = OpConstantComposite [[u32x2_t]] [[const_neg_one]] [[const_neg_one]]
+; CHECK-DAG: [[const_0:%.*]] = OpConstant [[u32_t]] 0
+; CHECK-DAG: [[const_0x2:%.*]] = OpConstantComposite [[u32x2_t]] [[const_0]] [[const_0]]
+; CHECK-DAG: [[const_1:%.*]] = OpConstant [[u32_t]] 1
+; CHECK-DAG: [[const_32:%.*]] = OpConstant [[u32_t]] 32
+; CHECK-DAG: [[const_32x2:%.*]] = OpConstantComposite [[u32x2_t]] [[const_32]] [[const_32]]
+; CHECK-DAG: [[const_neg1:%.*]] = OpConstant [[u32_t]] 4294967295
+; CHECK-DAG: [[const_neg1x2:%.*]] = OpConstantComposite [[u32x2_t]] [[const_neg1]] [[const_neg1]]
 ; CHECK-DAG: [[u16_t:%.+]] = OpTypeInt 16 0
 ; CHECK-DAG: [[u16x2_t:%.+]] = OpTypeVector [[u16_t]] 2
 ; CHECK-DAG: [[u64_t:%.+]] = OpTypeInt 64 0
@@ -68,11 +68,11 @@ entry:
 ; CHECK: [[a64:%.+]] = OpFunctionParameter [[u64_t]]
 ; CHECK: [[a32x2:%.+]] = OpBitcast [[u32x2_t]] [[a64]]
 ; CHECK: [[lsb_bits:%.+]] = OpExtInst [[u32x2_t]] [[glsl_450_ext]] FindILsb [[a32x2]]
-; CHECK: [[high_bits:%.+]] = OpVectorExtractDynamic [[u32_t]] [[lsb_bits]] [[const_zero]]
-; CHECK: [[low_bits:%.+]] = OpVectorExtractDynamic [[u32_t]] [[lsb_bits]] [[const_one]]
-; CHECK: [[should_use_high:%.+]] = OpIEqual [[bool_t]] [[low_bits]] [[const_neg_one]]
+; CHECK: [[high_bits:%.+]] = OpVectorExtractDynamic [[u32_t]] [[lsb_bits]] [[const_0]]
+; CHECK: [[low_bits:%.+]] = OpVectorExtractDynamic [[u32_t]] [[lsb_bits]] [[const_1]]
+; CHECK: [[should_use_high:%.+]] = OpIEqual [[bool_t]] [[low_bits]] [[const_neg1]]
 ; CHECK: [[ans_bits:%.+]] = OpSelect [[u32_t]] [[should_use_high]] [[high_bits]] [[low_bits]]
-; CHECK: [[ans_offset:%.+]] = OpSelect [[u32_t]] [[should_use_high]] [[const_thirty_two]] [[const_zero]]
+; CHECK: [[ans_offset:%.+]] = OpSelect [[u32_t]] [[should_use_high]] [[const_32]] [[const_0]]
 ; CHECK: [[ret:%.+]] = OpIAdd [[u32_t]] [[ans_offset]] [[ans_bits]]
 ; CHECK: OpReturnValue [[ret]]
   %elt.firstbitlow = call i32 @llvm.spv.firstbitlow.i64(i64 %a)
@@ -87,9 +87,9 @@ entry:
 ; CHECK: [[lsb_bits:%.+]] = OpExtInst [[u32x4_t]] [[glsl_450_ext]] FindILsb [[a32x4]]
 ; CHECK: [[high_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[lsb_bits]] [[lsb_bits]] 0 2
 ; CHECK: [[low_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[lsb_bits]] [[lsb_bits]] 1 3
-; CHECK: [[should_use_high:%.+]] = OpIEqual [[boolx2_t]] [[low_bits]] [[const_neg_onex2]]
+; CHECK: [[should_use_high:%.+]] = OpIEqual [[boolx2_t]] [[low_bits]] [[const_neg1x2]]
 ; CHECK: [[ans_bits:%.+]] = OpSelect [[u32x2_t]] [[should_use_high]] [[high_bits]] [[low_bits]]
-; CHECK: [[ans_offset:%.+]] = OpSelect [[u32x2_t]] [[should_use_high]] [[const_thirty_twox2]] [[const_zerox2]]
+; CHECK: [[ans_offset:%.+]] = OpSelect [[u32x2_t]] [[should_use_high]] [[const_32x2]] [[const_0x2]]
 ; CHECK: [[ret:%.+]] = OpIAdd [[u32x2_t]] [[ans_offset]] [[ans_bits]]
 ; CHECK: OpReturnValue [[ret]]
   %elt.firstbitlow = call <2 x i32> @llvm.spv.firstbitlow.v2i64(<2 x i64> %a)

From b6bdc0dffb05163dedb5e5e82bf8b9f079298225 Mon Sep 17 00:00:00 2001
From: Ashley Coleman <ascoleman@microsoft.com>
Date: Fri, 22 Nov 2024 13:19:43 -0700
Subject: [PATCH 04/17] Address comments

---
 .../Target/SPIRV/SPIRVInstructionSelector.cpp | 327 ++++++------------
 1 file changed, 108 insertions(+), 219 deletions(-)

diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index e1c58f8578554..bca67585d2858 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -96,27 +96,20 @@ class SPIRVInstructionSelector : public InstructionSelector {
   bool selectFirstBitHigh(Register ResVReg, const SPIRVType *ResType,
                           MachineInstr &I, bool IsSigned) const;
 
-  bool selectFirstBitHigh16(Register ResVReg, const SPIRVType *ResType,
-                            MachineInstr &I, bool IsSigned) const;
-
-  bool selectFirstBitHigh32(Register ResVReg, const SPIRVType *ResType,
-                            MachineInstr &I, Register SrcReg,
-                            bool IsSigned) const;
-
-  bool selectFirstBitHigh64(Register ResVReg, const SPIRVType *ResType,
-                            MachineInstr &I, bool IsSigned) const;
-
   bool selectFirstBitLow(Register ResVReg, const SPIRVType *ResType,
                          MachineInstr &I) const;
 
-  bool selectFirstBitLow16(Register ResVReg, const SPIRVType *ResType,
-                           MachineInstr &I) const;
+  bool selectFirstBitSet16(Register ResVReg, const SPIRVType *ResType,
+                           MachineInstr &I, unsigned ExtendOpcode,
+                           unsigned BitSetOpcode) const;
 
-  bool selectFirstBitLow32(Register ResVReg, const SPIRVType *ResType,
-                           MachineInstr &I, Register SrcReg) const;
+  bool selectFirstBitSet32(Register ResVReg, const SPIRVType *ResType,
+                           MachineInstr &I, Register SrcReg,
+                           unsigned Opcode) const;
 
-  bool selectFirstBitLow64(Register ResVReg, const SPIRVType *ResType,
-                           MachineInstr &I) const;
+  bool selectFirstBitSet64(Register ResVReg, const SPIRVType *ResType,
+                           MachineInstr &I, unsigned ExtendOpcode,
+                           unsigned BitSetOpcode, bool SwapPrimarySide) const;
 
   bool selectGlobalValue(Register ResVReg, MachineInstr &I,
                          const MachineInstr *Init = nullptr) const;
@@ -3153,187 +3146,34 @@ Register SPIRVInstructionSelector::buildPointerToResource(
   return AcReg;
 }
 
-bool SPIRVInstructionSelector::selectFirstBitHigh16(Register ResVReg,
-                                                    const SPIRVType *ResType,
-                                                    MachineInstr &I,
-                                                    bool IsSigned) const {
-  unsigned Opcode = IsSigned ? SPIRV::OpSConvert : SPIRV::OpUConvert;
-  // zero or sign extend
+bool SPIRVInstructionSelector::selectFirstBitSet16(
+    Register ResVReg, const SPIRVType *ResType, MachineInstr &I,
+    unsigned ExtendOpcode, unsigned BitSetOpcode) const {
   Register ExtReg = MRI->createVirtualRegister(GR.getRegClass(ResType));
-  bool Result =
-      selectOpWithSrcs(ExtReg, ResType, I, {I.getOperand(2).getReg()}, Opcode);
-  return Result && selectFirstBitHigh32(ResVReg, ResType, I, ExtReg, IsSigned);
-}
-
-bool SPIRVInstructionSelector::selectFirstBitHigh32(Register ResVReg,
-                                                    const SPIRVType *ResType,
-                                                    MachineInstr &I,
-                                                    Register SrcReg,
-                                                    bool IsSigned) const {
-  unsigned Opcode = IsSigned ? GL::FindSMsb : GL::FindUMsb;
-  return BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpExtInst))
-      .addDef(ResVReg)
-      .addUse(GR.getSPIRVTypeID(ResType))
-      .addImm(static_cast<uint32_t>(SPIRV::InstructionSet::GLSL_std_450))
-      .addImm(Opcode)
-      .addUse(SrcReg)
-      .constrainAllUses(TII, TRI, RBI);
-}
-
-bool SPIRVInstructionSelector::selectFirstBitHigh64(Register ResVReg,
-                                                    const SPIRVType *ResType,
-                                                    MachineInstr &I,
-                                                    bool IsSigned) const {
-  Register OpReg = I.getOperand(2).getReg();
-  // 1. split our int64 into 2 pieces using a bitcast
-  unsigned count = GR.getScalarOrVectorComponentCount(ResType);
-  SPIRVType *baseType = GR.retrieveScalarOrVectorIntType(ResType);
-  MachineIRBuilder MIRBuilder(I);
-  SPIRVType *postCastT =
-      GR.getOrCreateSPIRVVectorType(baseType, 2 * count, MIRBuilder);
-  Register bitcastReg = MRI->createVirtualRegister(GR.getRegClass(postCastT));
-  bool Result =
-      selectOpWithSrcs(bitcastReg, postCastT, I, {OpReg}, SPIRV::OpBitcast);
-
-  // 2. call firstbithigh
-  Register FBHReg = MRI->createVirtualRegister(GR.getRegClass(postCastT));
-  Result &= selectFirstBitHigh32(FBHReg, postCastT, I, bitcastReg, IsSigned);
-
-  // 3. split result vector into high bits and low bits
-  Register HighReg = MRI->createVirtualRegister(GR.getRegClass(ResType));
-  Register LowReg = MRI->createVirtualRegister(GR.getRegClass(ResType));
-
-  bool ZeroAsNull = STI.isOpenCLEnv();
-  bool isScalarRes = ResType->getOpcode() != SPIRV::OpTypeVector;
-  if (isScalarRes) {
-    // if scalar do a vector extract
-    Result &= selectOpWithSrcs(
-        HighReg, ResType, I,
-        {FBHReg, GR.getOrCreateConstInt(0, I, ResType, TII, ZeroAsNull)},
-        SPIRV::OpVectorExtractDynamic);
-    Result &= selectOpWithSrcs(
-        LowReg, ResType, I,
-        {FBHReg, GR.getOrCreateConstInt(1, I, ResType, TII, ZeroAsNull)},
-        SPIRV::OpVectorExtractDynamic);
-  } else { // vector case do a shufflevector
-    auto MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(),
-                       TII.get(SPIRV::OpVectorShuffle))
-                   .addDef(HighReg)
-                   .addUse(GR.getSPIRVTypeID(ResType))
-                   .addUse(FBHReg)
-                   .addUse(FBHReg);
-    // ^^ this vector will not be selected from; could be empty
-    unsigned j;
-    for (j = 0; j < count * 2; j += 2) {
-      MIB.addImm(j);
-    }
-    Result &= MIB.constrainAllUses(TII, TRI, RBI);
-
-    // get low bits
-    MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(),
-                  TII.get(SPIRV::OpVectorShuffle))
-              .addDef(LowReg)
-              .addUse(GR.getSPIRVTypeID(ResType))
-              .addUse(FBHReg)
-              .addUse(FBHReg);
-    // ^^ this vector will not be selected from; could be empty
-    for (j = 1; j < count * 2; j += 2) {
-      MIB.addImm(j);
-    }
-    Result &= MIB.constrainAllUses(TII, TRI, RBI);
-  }
-
-  // 4. check if result of each top 32 bits is == -1
-  SPIRVType *BoolType = GR.getOrCreateSPIRVBoolType(I, TII);
-  Register NegOneReg;
-  Register Reg0;
-  Register Reg32;
-  unsigned selectOp;
-  unsigned addOp;
-  if (isScalarRes) {
-    NegOneReg =
-        GR.getOrCreateConstInt((unsigned)-1, I, ResType, TII, ZeroAsNull);
-    Reg0 = GR.getOrCreateConstInt(0, I, ResType, TII, ZeroAsNull);
-    Reg32 = GR.getOrCreateConstInt(32, I, ResType, TII, ZeroAsNull);
-    selectOp = SPIRV::OpSelectSISCond;
-    addOp = SPIRV::OpIAddS;
-  } else {
-    BoolType = GR.getOrCreateSPIRVVectorType(BoolType, count, MIRBuilder);
-    NegOneReg =
-        GR.getOrCreateConstVector((unsigned)-1, I, ResType, TII, ZeroAsNull);
-    Reg0 = GR.getOrCreateConstVector(0, I, ResType, TII, ZeroAsNull);
-    Reg32 = GR.getOrCreateConstVector(32, I, ResType, TII, ZeroAsNull);
-    selectOp = SPIRV::OpSelectVIVCond;
-    addOp = SPIRV::OpIAddV;
-  }
-
-  // check if the high bits are == -1; true if -1
-  Register BReg = MRI->createVirtualRegister(GR.getRegClass(BoolType));
-  Result &= selectOpWithSrcs(BReg, BoolType, I, {HighReg, NegOneReg},
-                             SPIRV::OpIEqual);
-
-  // Select low bits if true in BReg, otherwise high bits
-  Register TmpReg = MRI->createVirtualRegister(GR.getRegClass(ResType));
-  Result &=
-      selectOpWithSrcs(TmpReg, ResType, I, {BReg, LowReg, HighReg}, selectOp);
-
-  // Add 32 for high bits, 0 for low bits
-  Register ValReg = MRI->createVirtualRegister(GR.getRegClass(ResType));
-  Result &= selectOpWithSrcs(ValReg, ResType, I, {BReg, Reg0, Reg32}, selectOp);
+  bool Result = selectOpWithSrcs(ExtReg, ResType, I, {I.getOperand(2).getReg()},
+                                 ExtendOpcode);
 
   return Result &&
-         selectOpWithSrcs(ResVReg, ResType, I, {ValReg, TmpReg}, addOp);
-}
-
-bool SPIRVInstructionSelector::selectFirstBitHigh(Register ResVReg,
-                                                  const SPIRVType *ResType,
-                                                  MachineInstr &I,
-                                                  bool IsSigned) const {
-  // FindUMsb and FindSMsb intrinsics only support 32 bit integers
-  Register OpReg = I.getOperand(2).getReg();
-  SPIRVType *OpType = GR.getSPIRVTypeForVReg(OpReg);
-
-  switch (GR.getScalarOrVectorBitWidth(OpType)) {
-  case 16:
-    return selectFirstBitHigh16(ResVReg, ResType, I, IsSigned);
-  case 32:
-    return selectFirstBitHigh32(ResVReg, ResType, I, OpReg, IsSigned);
-  case 64:
-    return selectFirstBitHigh64(ResVReg, ResType, I, IsSigned);
-  default:
-    report_fatal_error(
-        "spv_firstbituhigh and spv_firstbitshigh only support 16,32,64 bits.");
-  }
+         selectFirstBitSet32(ResVReg, ResType, I, ExtReg, BitSetOpcode);
 }
 
-bool SPIRVInstructionSelector::selectFirstBitLow16(Register ResVReg,
-                                                   const SPIRVType *ResType,
-                                                   MachineInstr &I) const {
-  // OpUConvert treats the operand bits as an unsigned i16 and zero extends it
-  // to an unsigned i32. As this leaves all the least significant bits unchanged
-  // the first set bit from the LSB side doesn't change.
-  Register ExtReg = MRI->createVirtualRegister(GR.getRegClass(ResType));
-  bool Result = selectNAryOpWithSrcs(
-      ExtReg, ResType, I, {I.getOperand(2).getReg()}, SPIRV::OpUConvert);
-  return Result && selectFirstBitLow32(ResVReg, ResType, I, ExtReg);
-}
-
-bool SPIRVInstructionSelector::selectFirstBitLow32(Register ResVReg,
+bool SPIRVInstructionSelector::selectFirstBitSet32(Register ResVReg,
                                                    const SPIRVType *ResType,
                                                    MachineInstr &I,
-                                                   Register SrcReg) const {
+                                                   Register SrcReg,
+                                                   unsigned Opcode) const {
   return BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpExtInst))
       .addDef(ResVReg)
       .addUse(GR.getSPIRVTypeID(ResType))
       .addImm(static_cast<uint32_t>(SPIRV::InstructionSet::GLSL_std_450))
-      .addImm(GL::FindILsb)
+      .addImm(Opcode)
       .addUse(SrcReg)
       .constrainAllUses(TII, TRI, RBI);
 }
 
-bool SPIRVInstructionSelector::selectFirstBitLow64(Register ResVReg,
-                                                   const SPIRVType *ResType,
-                                                   MachineInstr &I) const {
+bool SPIRVInstructionSelector::selectFirstBitSet64(
+    Register ResVReg, const SPIRVType *ResType, MachineInstr &I,
+    unsigned ExtendOpcode, unsigned BitSetOpcode, bool SwapPrimarySide) const {
   Register OpReg = I.getOperand(2).getReg();
 
   // 1. Split int64 into 2 pieces using a bitcast
@@ -3345,11 +3185,12 @@ bool SPIRVInstructionSelector::selectFirstBitLow64(Register ResVReg,
   Register BitcastReg =
       MRI->createVirtualRegister(GR.getRegClass(PostCastType));
   bool Result =
-      selectUnOpWithSrc(BitcastReg, PostCastType, I, OpReg, SPIRV::OpBitcast);
+      selectOpWithSrcs(BitcastReg, PostCastType, I, {OpReg}, SPIRV::OpBitcast);
 
-  // 2. Find the first set bit from the LSB side for all the pieces in #1
-  Register FBLReg = MRI->createVirtualRegister(GR.getRegClass(PostCastType));
-  Result = Result && selectFirstBitLow32(FBLReg, PostCastType, I, BitcastReg);
+  // 2. Find the first set bit from the primary side for all the pieces in #1
+  Register FBPReg = MRI->createVirtualRegister(GR.getRegClass(PostCastType));
+  Result = Result && selectFirstBitSet32(FBPReg, PostCastType, I, BitcastReg,
+                                         BitSetOpcode);
 
   // 3. Split result vector into high bits and low bits
   Register HighReg = MRI->createVirtualRegister(GR.getRegClass(ResType));
@@ -3359,31 +3200,29 @@ bool SPIRVInstructionSelector::selectFirstBitLow64(Register ResVReg,
   bool IsScalarRes = ResType->getOpcode() != SPIRV::OpTypeVector;
   if (IsScalarRes) {
     // if scalar do a vector extract
-    Result =
-        Result &&
-        selectNAryOpWithSrcs(
-            HighReg, ResType, I,
-            {FBLReg, GR.getOrCreateConstInt(0, I, ResType, TII, ZeroAsNull)},
-            SPIRV::OpVectorExtractDynamic);
-    Result =
-        Result &&
-        selectNAryOpWithSrcs(
-            LowReg, ResType, I,
-            {FBLReg, GR.getOrCreateConstInt(1, I, ResType, TII, ZeroAsNull)},
-            SPIRV::OpVectorExtractDynamic);
+    Result = Result &&
+             selectOpWithSrcs(HighReg, ResType, I,
+                              {FBPReg, GR.getOrCreateConstInt(0, I, ResType,
+                                                              TII, ZeroAsNull)},
+                              SPIRV::OpVectorExtractDynamic);
+    Result = Result &&
+             selectOpWithSrcs(LowReg, ResType, I,
+                              {FBPReg, GR.getOrCreateConstInt(1, I, ResType,
+                                                              TII, ZeroAsNull)},
+                              SPIRV::OpVectorExtractDynamic);
   } else {
     // if vector do a shufflevector
     auto MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(),
                        TII.get(SPIRV::OpVectorShuffle))
                    .addDef(HighReg)
                    .addUse(GR.getSPIRVTypeID(ResType))
-                   .addUse(FBLReg)
+                   .addUse(FBPReg)
                    // Per the spec, repeat the vector if only one vec is needed
-                   .addUse(FBLReg);
+                   .addUse(FBPReg);
 
     // high bits are stored in even indexes. Extract them from FBLReg
-    for (unsigned j = 0; j < ComponentCount * 2; j += 2) {
-      MIB.addImm(j);
+    for (unsigned J = 0; J < ComponentCount * 2; J += 2) {
+      MIB.addImm(J);
     }
     Result = Result && MIB.constrainAllUses(TII, TRI, RBI);
 
@@ -3391,18 +3230,19 @@ bool SPIRVInstructionSelector::selectFirstBitLow64(Register ResVReg,
                   TII.get(SPIRV::OpVectorShuffle))
               .addDef(LowReg)
               .addUse(GR.getSPIRVTypeID(ResType))
-              .addUse(FBLReg)
+              .addUse(FBPReg)
               // Per the spec, repeat the vector if only one vec is needed
-              .addUse(FBLReg);
+              .addUse(FBPReg);
 
     // low bits are stored in odd indexes. Extract them from FBLReg
-    for (unsigned j = 1; j < ComponentCount * 2; j += 2) {
-      MIB.addImm(j);
+    for (unsigned J = 1; J < ComponentCount * 2; J += 2) {
+      MIB.addImm(J);
     }
     Result = Result && MIB.constrainAllUses(TII, TRI, RBI);
   }
 
-  // 4. Check the result. When low bits == -1 use high, otherwise use low
+  // 4. Check the result. When primary bits == -1 use secondary, otherwise use
+  // primary
   SPIRVType *BoolType = GR.getOrCreateSPIRVBoolType(I, TII);
   Register NegOneReg;
   Register Reg0;
@@ -3428,23 +3268,66 @@ bool SPIRVInstructionSelector::selectFirstBitLow64(Register ResVReg,
     AddOp = SPIRV::OpIAddV;
   }
 
-  // Check if the low bits are == -1
+  Register PrimaryReg;
+  Register SecondaryReg;
+  Register PrimaryShiftReg;
+  Register SecondaryShiftReg;
+  if (SwapPrimarySide) {
+    PrimaryReg = LowReg;
+    SecondaryReg = HighReg;
+    PrimaryShiftReg = Reg0;
+    SecondaryShiftReg = Reg32;
+  } else {
+    PrimaryReg = HighReg;
+    SecondaryReg = LowReg;
+    PrimaryShiftReg = Reg32;
+    SecondaryShiftReg = Reg0;
+  }
+
+  // Check if the primary bits are == -1
   Register BReg = MRI->createVirtualRegister(GR.getRegClass(BoolType));
-  Result = Result && selectNAryOpWithSrcs(BReg, BoolType, I,
-                                          {LowReg, NegOneReg}, SPIRV::OpIEqual);
+  Result = Result && selectOpWithSrcs(BReg, BoolType, I,
+                                      {PrimaryReg, NegOneReg}, SPIRV::OpIEqual);
 
-  // Select high bits if true in BReg, otherwise low bits
+  // Select secondary bits if true in BReg, otherwise primary bits
   Register TmpReg = MRI->createVirtualRegister(GR.getRegClass(ResType));
-  Result = Result && selectNAryOpWithSrcs(TmpReg, ResType, I,
-                                          {BReg, HighReg, LowReg}, SelectOp);
+  Result =
+      Result && selectOpWithSrcs(TmpReg, ResType, I,
+                                 {BReg, SecondaryReg, PrimaryReg}, SelectOp);
 
   // 5. Add 32 when high bits are used, otherwise 0 for low bits
   Register ValReg = MRI->createVirtualRegister(GR.getRegClass(ResType));
-  Result = Result && selectNAryOpWithSrcs(ValReg, ResType, I,
-                                          {BReg, Reg32, Reg0}, SelectOp);
+  Result = Result && selectOpWithSrcs(
+                         ValReg, ResType, I,
+                         {BReg, SecondaryShiftReg, PrimaryShiftReg}, SelectOp);
 
   return Result &&
-         selectNAryOpWithSrcs(ResVReg, ResType, I, {ValReg, TmpReg}, AddOp);
+         selectOpWithSrcs(ResVReg, ResType, I, {ValReg, TmpReg}, AddOp);
+}
+
+bool SPIRVInstructionSelector::selectFirstBitHigh(Register ResVReg,
+                                                  const SPIRVType *ResType,
+                                                  MachineInstr &I,
+                                                  bool IsSigned) const {
+  // FindUMsb and FindSMsb intrinsics only support 32 bit integers
+  Register OpReg = I.getOperand(2).getReg();
+  SPIRVType *OpType = GR.getSPIRVTypeForVReg(OpReg);
+  // zero or sign extend
+  unsigned ExtendOpcode = IsSigned ? SPIRV::OpSConvert : SPIRV::OpUConvert;
+  unsigned BitSetOpcode = IsSigned ? GL::FindSMsb : GL::FindUMsb;
+
+  switch (GR.getScalarOrVectorBitWidth(OpType)) {
+  case 16:
+    return selectFirstBitSet16(ResVReg, ResType, I, ExtendOpcode, BitSetOpcode);
+  case 32:
+    return selectFirstBitSet32(ResVReg, ResType, I, OpReg, BitSetOpcode);
+  case 64:
+    return selectFirstBitSet64(ResVReg, ResType, I, ExtendOpcode, BitSetOpcode,
+                               /*SwapPrimarySide=*/false);
+  default:
+    report_fatal_error(
+        "spv_firstbituhigh and spv_firstbitshigh only support 16,32,64 bits.");
+  }
 }
 
 bool SPIRVInstructionSelector::selectFirstBitLow(Register ResVReg,
@@ -3453,14 +3336,20 @@ bool SPIRVInstructionSelector::selectFirstBitLow(Register ResVReg,
   // FindILsb intrinsic only supports 32 bit integers
   Register OpReg = I.getOperand(2).getReg();
   SPIRVType *OpType = GR.getSPIRVTypeForVReg(OpReg);
+  // OpUConvert treats the operand bits as an unsigned i16 and zero extends it
+  // to an unsigned i32. This leaves all the least significant bits unchanged,
+  // so the first set bit from the LSB side doesn't change.
+  unsigned ExtendOpcode = SPIRV::OpUConvert;
+  unsigned BitSetOpcode = GL::FindILsb;
 
   switch (GR.getScalarOrVectorBitWidth(OpType)) {
   case 16:
-    return selectFirstBitLow16(ResVReg, ResType, I);
+    return selectFirstBitSet16(ResVReg, ResType, I, ExtendOpcode, BitSetOpcode);
   case 32:
-    return selectFirstBitLow32(ResVReg, ResType, I, OpReg);
+    return selectFirstBitSet32(ResVReg, ResType, I, OpReg, BitSetOpcode);
   case 64:
-    return selectFirstBitLow64(ResVReg, ResType, I);
+    return selectFirstBitSet64(ResVReg, ResType, I, ExtendOpcode, BitSetOpcode,
+                               /*SwapPrimarySide=*/true);
   default:
     report_fatal_error("spv_firstbitlow only supports 16,32,64 bits.");
   }

From 3c74bfe7d2835aded89dabbd2cc07cda9a987a7a Mon Sep 17 00:00:00 2001
From: Ashley Coleman <ascoleman@microsoft.com>
Date: Fri, 22 Nov 2024 13:40:45 -0700
Subject: [PATCH 05/17] cleanup

---
 .../Target/SPIRV/SPIRVInstructionSelector.cpp | 36 ++++++++++---------
 1 file changed, 19 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index bca67585d2858..cb5e7c6be3573 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -108,8 +108,8 @@ class SPIRVInstructionSelector : public InstructionSelector {
                            unsigned Opcode) const;
 
   bool selectFirstBitSet64(Register ResVReg, const SPIRVType *ResType,
-                           MachineInstr &I, unsigned ExtendOpcode,
-                           unsigned BitSetOpcode, bool SwapPrimarySide) const;
+                           MachineInstr &I, unsigned BitSetOpcode,
+                           bool SwapPrimarySide) const;
 
   bool selectGlobalValue(Register ResVReg, MachineInstr &I,
                          const MachineInstr *Init = nullptr) const;
@@ -3171,9 +3171,11 @@ bool SPIRVInstructionSelector::selectFirstBitSet32(Register ResVReg,
       .constrainAllUses(TII, TRI, RBI);
 }
 
-bool SPIRVInstructionSelector::selectFirstBitSet64(
-    Register ResVReg, const SPIRVType *ResType, MachineInstr &I,
-    unsigned ExtendOpcode, unsigned BitSetOpcode, bool SwapPrimarySide) const {
+bool SPIRVInstructionSelector::selectFirstBitSet64(Register ResVReg,
+                                                   const SPIRVType *ResType,
+                                                   MachineInstr &I,
+                                                   unsigned BitSetOpcode,
+                                                   bool SwapPrimarySide) const {
   Register OpReg = I.getOperand(2).getReg();
 
   // 1. Split int64 into 2 pieces using a bitcast
@@ -3188,8 +3190,8 @@ bool SPIRVInstructionSelector::selectFirstBitSet64(
       selectOpWithSrcs(BitcastReg, PostCastType, I, {OpReg}, SPIRV::OpBitcast);
 
   // 2. Find the first set bit from the primary side for all the pieces in #1
-  Register FBPReg = MRI->createVirtualRegister(GR.getRegClass(PostCastType));
-  Result = Result && selectFirstBitSet32(FBPReg, PostCastType, I, BitcastReg,
+  Register FBSReg = MRI->createVirtualRegister(GR.getRegClass(PostCastType));
+  Result = Result && selectFirstBitSet32(FBSReg, PostCastType, I, BitcastReg,
                                          BitSetOpcode);
 
   // 3. Split result vector into high bits and low bits
@@ -3202,12 +3204,12 @@ bool SPIRVInstructionSelector::selectFirstBitSet64(
     // if scalar do a vector extract
     Result = Result &&
              selectOpWithSrcs(HighReg, ResType, I,
-                              {FBPReg, GR.getOrCreateConstInt(0, I, ResType,
+                              {FBSReg, GR.getOrCreateConstInt(0, I, ResType,
                                                               TII, ZeroAsNull)},
                               SPIRV::OpVectorExtractDynamic);
     Result = Result &&
              selectOpWithSrcs(LowReg, ResType, I,
-                              {FBPReg, GR.getOrCreateConstInt(1, I, ResType,
+                              {FBSReg, GR.getOrCreateConstInt(1, I, ResType,
                                                               TII, ZeroAsNull)},
                               SPIRV::OpVectorExtractDynamic);
   } else {
@@ -3216,11 +3218,11 @@ bool SPIRVInstructionSelector::selectFirstBitSet64(
                        TII.get(SPIRV::OpVectorShuffle))
                    .addDef(HighReg)
                    .addUse(GR.getSPIRVTypeID(ResType))
-                   .addUse(FBPReg)
+                   .addUse(FBSReg)
                    // Per the spec, repeat the vector if only one vec is needed
-                   .addUse(FBPReg);
+                   .addUse(FBSReg);
 
-    // high bits are stored in even indexes. Extract them from FBLReg
+    // high bits are stored in even indexes. Extract them from FBSReg
     for (unsigned J = 0; J < ComponentCount * 2; J += 2) {
       MIB.addImm(J);
     }
@@ -3230,11 +3232,11 @@ bool SPIRVInstructionSelector::selectFirstBitSet64(
                   TII.get(SPIRV::OpVectorShuffle))
               .addDef(LowReg)
               .addUse(GR.getSPIRVTypeID(ResType))
-              .addUse(FBPReg)
+              .addUse(FBSReg)
               // Per the spec, repeat the vector if only one vec is needed
-              .addUse(FBPReg);
+              .addUse(FBSReg);
 
-    // low bits are stored in odd indexes. Extract them from FBLReg
+    // low bits are stored in odd indexes. Extract them from FBSReg
     for (unsigned J = 1; J < ComponentCount * 2; J += 2) {
       MIB.addImm(J);
     }
@@ -3322,7 +3324,7 @@ bool SPIRVInstructionSelector::selectFirstBitHigh(Register ResVReg,
   case 32:
     return selectFirstBitSet32(ResVReg, ResType, I, OpReg, BitSetOpcode);
   case 64:
-    return selectFirstBitSet64(ResVReg, ResType, I, ExtendOpcode, BitSetOpcode,
+    return selectFirstBitSet64(ResVReg, ResType, I, BitSetOpcode,
                                /*SwapPrimarySide=*/false);
   default:
     report_fatal_error(
@@ -3348,7 +3350,7 @@ bool SPIRVInstructionSelector::selectFirstBitLow(Register ResVReg,
   case 32:
     return selectFirstBitSet32(ResVReg, ResType, I, OpReg, BitSetOpcode);
   case 64:
-    return selectFirstBitSet64(ResVReg, ResType, I, ExtendOpcode, BitSetOpcode,
+    return selectFirstBitSet64(ResVReg, ResType, I, BitSetOpcode,
                                /*SwapPrimarySide=*/true);
   default:
     report_fatal_error("spv_firstbitlow only supports 16,32,64 bits.");

From a90026c858f8db3b9f1bcb2b45d764d255672c99 Mon Sep 17 00:00:00 2001
From: Ashley Coleman <ascoleman@microsoft.com>
Date: Fri, 22 Nov 2024 16:58:29 -0700
Subject: [PATCH 06/17] Divide vectors that surpass 4 element limit

---
 llvm/lib/Target/DirectX/DXIL.td               |   2 +-
 .../Target/SPIRV/SPIRVInstructionSelector.cpp | 136 ++++++++++++++----
 .../SPIRV/hlsl-intrinsics/firstbitlow.ll      | 119 ++++++++++++++-
 3 files changed, 230 insertions(+), 27 deletions(-)

diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td
index d6d78581bafbf..367009d7f92e6 100644
--- a/llvm/lib/Target/DirectX/DXIL.td
+++ b/llvm/lib/Target/DirectX/DXIL.td
@@ -621,7 +621,7 @@ def CountBits :  DXILOp<31, unaryBits> {
 def FirstbitLo : DXILOp<32, unaryBits> {
   let Doc = "Returns the location of the first set bit starting from "
             "the lowest order bit and working upward.";
-  let LLVMIntrinsic = int_dx_firstbitlow;
+  let intrinsics = [ IntrinSelect<int_dx_firstbitlow> ];
   let arguments = [OverloadTy];
   let result = Int32Ty;
   let overloads =
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index cb5e7c6be3573..b2115528b8dcb 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -108,8 +108,8 @@ class SPIRVInstructionSelector : public InstructionSelector {
                            unsigned Opcode) const;
 
   bool selectFirstBitSet64(Register ResVReg, const SPIRVType *ResType,
-                           MachineInstr &I, unsigned BitSetOpcode,
-                           bool SwapPrimarySide) const;
+                           MachineInstr &I, Register SrcReg,
+                           unsigned BitSetOpcode, bool SwapPrimarySide) const;
 
   bool selectGlobalValue(Register ResVReg, MachineInstr &I,
                          const MachineInstr *Init = nullptr) const;
@@ -3171,23 +3171,116 @@ bool SPIRVInstructionSelector::selectFirstBitSet32(Register ResVReg,
       .constrainAllUses(TII, TRI, RBI);
 }
 
-bool SPIRVInstructionSelector::selectFirstBitSet64(Register ResVReg,
-                                                   const SPIRVType *ResType,
-                                                   MachineInstr &I,
-                                                   unsigned BitSetOpcode,
-                                                   bool SwapPrimarySide) const {
-  Register OpReg = I.getOperand(2).getReg();
-
-  // 1. Split int64 into 2 pieces using a bitcast
+bool SPIRVInstructionSelector::selectFirstBitSet64(
+    Register ResVReg, const SPIRVType *ResType, MachineInstr &I,
+    Register SrcReg, unsigned BitSetOpcode, bool SwapPrimarySide) const {
   unsigned ComponentCount = GR.getScalarOrVectorComponentCount(ResType);
   SPIRVType *BaseType = GR.retrieveScalarOrVectorIntType(ResType);
+  bool ZeroAsNull = STI.isOpenCLEnv();
+  Register ConstIntZero =
+      GR.getOrCreateConstInt(0, I, BaseType, TII, ZeroAsNull);
+  Register ConstIntOne =
+      GR.getOrCreateConstInt(1, I, BaseType, TII, ZeroAsNull);
+
+  // SPIRV doesn't support vectors with more than 4 components. Since the
+  // algoritm below converts i64 -> i32x2 and i64x4 -> i32x8 it can only
+  // operate on vectors with 2 or less components. When largers vectors are
+  // seen. Split them, recurse, then recombine them.
+  if (ComponentCount > 2) {
+    unsigned LeftComponentCount = ComponentCount / 2;
+    unsigned RightComponentCount = ComponentCount - LeftComponentCount;
+    bool LeftIsVector = LeftComponentCount > 1;
+
+    // Split the SrcReg in half into 2 smaller vec registers
+    // (ie i64x4 -> i64x2, i64x2)
+    MachineIRBuilder MIRBuilder(I);
+    SPIRVType *OpType = GR.getOrCreateSPIRVIntegerType(64, MIRBuilder);
+    SPIRVType *LeftVecOpType;
+    SPIRVType *LeftVecResType;
+    if (LeftIsVector) {
+      LeftVecOpType =
+          GR.getOrCreateSPIRVVectorType(OpType, LeftComponentCount, MIRBuilder);
+      LeftVecResType = GR.getOrCreateSPIRVVectorType(
+          BaseType, LeftComponentCount, MIRBuilder);
+    } else {
+      LeftVecOpType = OpType;
+      LeftVecResType = BaseType;
+    }
+
+    SPIRVType *RightVecOpType =
+        GR.getOrCreateSPIRVVectorType(OpType, RightComponentCount, MIRBuilder);
+    SPIRVType *RightVecResType = GR.getOrCreateSPIRVVectorType(
+        BaseType, RightComponentCount, MIRBuilder);
+
+    Register LeftSideIn =
+        MRI->createVirtualRegister(GR.getRegClass(LeftVecOpType));
+    Register RightSideIn =
+        MRI->createVirtualRegister(GR.getRegClass(RightVecOpType));
+
+    bool Result;
+
+    if (LeftIsVector) {
+      auto MIB =
+          BuildMI(*I.getParent(), I, I.getDebugLoc(),
+                  TII.get(SPIRV::OpVectorShuffle))
+              .addDef(LeftSideIn)
+              .addUse(GR.getSPIRVTypeID(LeftVecOpType))
+              .addUse(SrcReg)
+              // Per the spec, repeat the vector if only one vec is needed
+              .addUse(SrcReg);
+
+      for (unsigned J = 0; J < LeftComponentCount; J++) {
+        MIB.addImm(J);
+      }
+
+      Result = MIB.constrainAllUses(TII, TRI, RBI);
+    } else {
+      Result =
+          selectOpWithSrcs(LeftSideIn, LeftVecOpType, I, {SrcReg, ConstIntZero},
+                           SPIRV::OpVectorExtractDynamic);
+    }
+
+    auto MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(),
+                       TII.get(SPIRV::OpVectorShuffle))
+                   .addDef(RightSideIn)
+                   .addUse(GR.getSPIRVTypeID(RightVecOpType))
+                   .addUse(SrcReg)
+                   // Per the spec, repeat the vector if only one vec is needed
+                   .addUse(SrcReg);
+
+    for (unsigned J = LeftComponentCount; J < ComponentCount; J++) {
+      MIB.addImm(J);
+    }
+
+    Result = Result && MIB.constrainAllUses(TII, TRI, RBI);
+
+    // Recursively call selectFirstBitSet64 on the 2 registers
+    Register LeftSideOut =
+        MRI->createVirtualRegister(GR.getRegClass(LeftVecResType));
+    Register RightSideOut =
+        MRI->createVirtualRegister(GR.getRegClass(RightVecResType));
+    Result = Result &&
+             selectFirstBitSet64(LeftSideOut, LeftVecResType, I, LeftSideIn,
+                                 BitSetOpcode, SwapPrimarySide);
+    Result = Result &&
+             selectFirstBitSet64(RightSideOut, RightVecResType, I, RightSideIn,
+                                 BitSetOpcode, SwapPrimarySide);
+
+    // Join the two resulting registers back into the return type
+    // (ie i32x2, i32x2 -> i32x4)
+    return Result &&
+           selectOpWithSrcs(ResVReg, ResType, I, {LeftSideOut, RightSideOut},
+                            SPIRV::OpCompositeConstruct);
+  }
+
+  // 1. Split int64 into 2 pieces using a bitcast
   MachineIRBuilder MIRBuilder(I);
   SPIRVType *PostCastType =
       GR.getOrCreateSPIRVVectorType(BaseType, 2 * ComponentCount, MIRBuilder);
   Register BitcastReg =
       MRI->createVirtualRegister(GR.getRegClass(PostCastType));
   bool Result =
-      selectOpWithSrcs(BitcastReg, PostCastType, I, {OpReg}, SPIRV::OpBitcast);
+      selectOpWithSrcs(BitcastReg, PostCastType, I, {SrcReg}, SPIRV::OpBitcast);
 
   // 2. Find the first set bit from the primary side for all the pieces in #1
   Register FBSReg = MRI->createVirtualRegister(GR.getRegClass(PostCastType));
@@ -3198,20 +3291,15 @@ bool SPIRVInstructionSelector::selectFirstBitSet64(Register ResVReg,
   Register HighReg = MRI->createVirtualRegister(GR.getRegClass(ResType));
   Register LowReg = MRI->createVirtualRegister(GR.getRegClass(ResType));
 
-  bool ZeroAsNull = STI.isOpenCLEnv();
   bool IsScalarRes = ResType->getOpcode() != SPIRV::OpTypeVector;
   if (IsScalarRes) {
     // if scalar do a vector extract
-    Result = Result &&
-             selectOpWithSrcs(HighReg, ResType, I,
-                              {FBSReg, GR.getOrCreateConstInt(0, I, ResType,
-                                                              TII, ZeroAsNull)},
-                              SPIRV::OpVectorExtractDynamic);
-    Result = Result &&
-             selectOpWithSrcs(LowReg, ResType, I,
-                              {FBSReg, GR.getOrCreateConstInt(1, I, ResType,
-                                                              TII, ZeroAsNull)},
-                              SPIRV::OpVectorExtractDynamic);
+    Result =
+        Result && selectOpWithSrcs(HighReg, ResType, I, {FBSReg, ConstIntZero},
+                                   SPIRV::OpVectorExtractDynamic);
+    Result =
+        Result && selectOpWithSrcs(LowReg, ResType, I, {FBSReg, ConstIntOne},
+                                   SPIRV::OpVectorExtractDynamic);
   } else {
     // if vector do a shufflevector
     auto MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(),
@@ -3324,7 +3412,7 @@ bool SPIRVInstructionSelector::selectFirstBitHigh(Register ResVReg,
   case 32:
     return selectFirstBitSet32(ResVReg, ResType, I, OpReg, BitSetOpcode);
   case 64:
-    return selectFirstBitSet64(ResVReg, ResType, I, BitSetOpcode,
+    return selectFirstBitSet64(ResVReg, ResType, I, OpReg, BitSetOpcode,
                                /*SwapPrimarySide=*/false);
   default:
     report_fatal_error(
@@ -3350,7 +3438,7 @@ bool SPIRVInstructionSelector::selectFirstBitLow(Register ResVReg,
   case 32:
     return selectFirstBitSet32(ResVReg, ResType, I, OpReg, BitSetOpcode);
   case 64:
-    return selectFirstBitSet64(ResVReg, ResType, I, BitSetOpcode,
+    return selectFirstBitSet64(ResVReg, ResType, I, OpReg, BitSetOpcode,
                                /*SwapPrimarySide=*/true);
   default:
     report_fatal_error("spv_firstbitlow only supports 16,32,64 bits.");
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/firstbitlow.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/firstbitlow.ll
index 05488479e5bd0..f3cc73637b136 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/firstbitlow.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/firstbitlow.ll
@@ -5,6 +5,7 @@
 ; CHECK-DAG: OpMemoryModel Logical GLSL450
 ; CHECK-DAG: [[u32_t:%.+]] = OpTypeInt 32 0
 ; CHECK-DAG: [[u32x2_t:%.+]] = OpTypeVector [[u32_t]] 2
+; CHECK-DAG: [[u32x3_t:%.+]] = OpTypeVector [[u32_t]] 3
 ; CHECK-DAG: [[u32x4_t:%.+]] = OpTypeVector [[u32_t]] 4
 ; CHECK-DAG: [[const_0:%.*]] = OpConstant [[u32_t]] 0
 ; CHECK-DAG: [[const_0x2:%.*]] = OpConstantComposite [[u32x2_t]] [[const_0]] [[const_0]]
@@ -15,8 +16,12 @@
 ; CHECK-DAG: [[const_neg1x2:%.*]] = OpConstantComposite [[u32x2_t]] [[const_neg1]] [[const_neg1]]
 ; CHECK-DAG: [[u16_t:%.+]] = OpTypeInt 16 0
 ; CHECK-DAG: [[u16x2_t:%.+]] = OpTypeVector [[u16_t]] 2
+; CHECK-DAG: [[u16x3_t:%.+]] = OpTypeVector [[u16_t]] 3
+; CHECK-DAG: [[u16x4_t:%.+]] = OpTypeVector [[u16_t]] 4
 ; CHECK-DAG: [[u64_t:%.+]] = OpTypeInt 64 0
 ; CHECK-DAG: [[u64x2_t:%.+]] = OpTypeVector [[u64_t]] 2
+; CHECK-DAG: [[u64x3_t:%.+]] = OpTypeVector [[u64_t]] 3
+; CHECK-DAG: [[u64x4_t:%.+]] = OpTypeVector [[u64_t]] 4
 ; CHECK-DAG: [[bool_t:%.+]] = OpTypeBool
 ; CHECK-DAG: [[boolx2_t:%.+]] = OpTypeVector [[bool_t]] 2
 
@@ -30,8 +35,8 @@ entry:
   ret i32 %elt.firstbitlow
 }
 
-; CHECK-LABEL: Begin function firstbitlow_2xi32
-define noundef <2 x i32> @firstbitlow_2xi32(<2 x i32> noundef %a) {
+; CHECK-LABEL: Begin function firstbitlow_v2xi32
+define noundef <2 x i32> @firstbitlow_v2xi32(<2 x i32> noundef %a) {
 entry:
 ; CHECK: [[a:%.+]] = OpFunctionParameter [[u32x2_t]]
 ; CHECK: [[ret:%.+]] = OpExtInst [[u32x2_t]] [[glsl_450_ext]] FindILsb [[a]]
@@ -40,6 +45,26 @@ entry:
   ret <2 x i32> %elt.firstbitlow
 }
 
+; CHECK-LABEL: Begin function firstbitlow_v3xi32
+define noundef <3 x i32> @firstbitlow_v3xi32(<3 x i32> noundef %a) {
+entry:
+; CHECK: [[a:%.+]] = OpFunctionParameter [[u32x3_t]]
+; CHECK: [[ret:%.+]] = OpExtInst [[u32x3_t]] [[glsl_450_ext]] FindILsb [[a]]
+; CHECK: OpReturnValue [[ret]]
+  %elt.firstbitlow = call <3 x i32> @llvm.spv.firstbitlow.v3i32(<3 x i32> %a)
+  ret <3 x i32> %elt.firstbitlow
+}
+
+; CHECK-LABEL: Begin function firstbitlow_v4xi32
+define noundef <4 x i32> @firstbitlow_v4xi32(<4 x i32> noundef %a) {
+entry:
+; CHECK: [[a:%.+]] = OpFunctionParameter [[u32x4_t]]
+; CHECK: [[ret:%.+]] = OpExtInst [[u32x4_t]] [[glsl_450_ext]] FindILsb [[a]]
+; CHECK: OpReturnValue [[ret]]
+  %elt.firstbitlow = call <4 x i32> @llvm.spv.firstbitlow.v4i32(<4 x i32> %a)
+  ret <4 x i32> %elt.firstbitlow
+}
+
 ; CHECK-LABEL: Begin function firstbitlow_i16
 define noundef i32 @firstbitlow_i16(i16 noundef %a) {
 entry:
@@ -62,6 +87,28 @@ entry:
   ret <2 x i32> %elt.firstbitlow
 }
 
+; CHECK-LABEL: Begin function firstbitlow_v3xi16
+define noundef <3 x i32> @firstbitlow_v3xi16(<3 x i16> noundef %a) {
+entry:
+; CHECK: [[a16:%.+]] = OpFunctionParameter [[u16x3_t]]
+; CHECK: [[a32:%.+]] = OpUConvert [[u32x3_t]] [[a16]]
+; CHECK: [[ret:%.+]] = OpExtInst [[u32x3_t]] [[glsl_450_ext]] FindILsb [[a32]]
+; CHECK: OpReturnValue [[ret]]
+  %elt.firstbitlow = call <3 x i32> @llvm.spv.firstbitlow.v3i16(<3 x i16> %a)
+  ret <3 x i32> %elt.firstbitlow
+}
+
+; CHECK-LABEL: Begin function firstbitlow_v4xi16
+define noundef <4 x i32> @firstbitlow_v4xi16(<4 x i16> noundef %a) {
+entry:
+; CHECK: [[a16:%.+]] = OpFunctionParameter [[u16x4_t]]
+; CHECK: [[a32:%.+]] = OpUConvert [[u32x4_t]] [[a16]]
+; CHECK: [[ret:%.+]] = OpExtInst [[u32x4_t]] [[glsl_450_ext]] FindILsb [[a32]]
+; CHECK: OpReturnValue [[ret]]
+  %elt.firstbitlow = call <4 x i32> @llvm.spv.firstbitlow.v4i16(<4 x i16> %a)
+  ret <4 x i32> %elt.firstbitlow
+}
+
 ; CHECK-LABEL: Begin function firstbitlow_i64
 define noundef i32 @firstbitlow_i64(i64 noundef %a) {
 entry:
@@ -96,6 +143,74 @@ entry:
   ret <2 x i32> %elt.firstbitlow
 }
 
+; CHECK-LABEL: Begin function firstbitlow_v3i64
+define noundef <3 x i32> @firstbitlow_v3i64(<3 x i64> noundef %a) {
+entry:
+; Split the i64x3 into i64, i64x2
+; CHECK: [[a:%.+]] = OpFunctionParameter [[u64x3_t]]
+; CHECK: [[left:%.+]] = OpVectorExtractDynamic [[u64_t]] [[a]] [[const_0]]
+; CHECK: [[right:%.+]] = OpVectorShuffle [[u64x2_t]] [[a]] [[a]] 1 2
+
+; Do firstbitlow on i64, i64x2
+; CHECK: [[left_cast:%.+]] = OpBitcast [[u32x2_t]] [[left]]
+; CHECK: [[left_lsb_bits:%.+]] = OpExtInst [[u32x2_t]] [[glsl_450_ext]] FindILsb [[left_cast]]
+; CHECK: [[left_high_bits:%.+]] = OpVectorExtractDynamic [[u32_t]] [[left_lsb_bits]] [[const_0]]
+; CHECK: [[left_low_bits:%.+]] = OpVectorExtractDynamic [[u32_t]] [[left_lsb_bits]] [[const_1]]
+; CHECK: [[left_should_use_high:%.+]] = OpIEqual [[bool_t]] [[left_low_bits]] [[const_neg1]]
+; CHECK: [[left_ans_bits:%.+]] = OpSelect [[u32_t]] [[left_should_use_high]] [[left_high_bits]] [[left_low_bits]]
+; CHECK: [[left_ans_offset:%.+]] = OpSelect [[u32_t]] [[left_should_use_high]] [[const_32]] [[const_0]]
+; CHECK: [[left_res:%.+]] = OpIAdd [[u32_t]] [[left_ans_offset]] [[left_ans_bits]]
+
+; CHECK: [[right_cast:%.+]] = OpBitcast [[u32x4_t]] [[right]]
+; CHECK: [[right_lsb_bits:%.+]] = OpExtInst [[u32x4_t]] [[glsl_450_ext]] FindILsb [[right_cast]]
+; CHECK: [[right_high_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[right_lsb_bits]] [[right_lsb_bits]] 0 2
+; CHECK: [[right_low_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[right_lsb_bits]] [[right_lsb_bits]] 1 3
+; CHECK: [[right_should_use_high:%.+]] = OpIEqual [[boolx2_t]] [[right_low_bits]] [[const_neg1x2]]
+; CHECK: [[right_ans_bits:%.+]] = OpSelect [[u32x2_t]] [[right_should_use_high]] [[right_high_bits]] [[right_low_bits]]
+; CHECK: [[right_ans_offset:%.+]] = OpSelect [[u32x2_t]] [[right_should_use_high]] [[const_32x2]] [[const_0x2]]
+; CHECK: [[right_res:%.+]] = OpIAdd [[u32x2_t]] [[right_ans_offset]] [[right_ans_bits]]
+
+; Merge the resulting i32, i32x2 into the final i32x3 and return it
+; CHECK: [[ret:%.+]] = OpCompositeConstruct [[u32x3_t]] [[left_res]] [[right_res]]
+; CHECK: OpReturnValue [[ret]]
+  %elt.firstbitlow = call <3 x i32> @llvm.spv.firstbitlow.v3i64(<3 x i64> %a)
+  ret <3 x i32> %elt.firstbitlow
+}
+
+; CHECK-LABEL: Begin function firstbitlow_v4i64
+define noundef <4 x i32> @firstbitlow_v4i64(<4 x i64> noundef %a) {
+entry:
+; Split the i64x4 into 2 i64x2
+; CHECK: [[a:%.+]] = OpFunctionParameter [[u64x4_t]]
+; CHECK: [[left:%.+]] = OpVectorShuffle [[u64x2_t]] [[a]] [[a]] 0 1
+; CHECK: [[right:%.+]] = OpVectorShuffle [[u64x2_t]] [[a]] [[a]] 2 3
+
+; Do firstbitlow on the 2 i64x2
+; CHECK: [[left_cast:%.+]] = OpBitcast [[u32x4_t]] [[left]]
+; CHECK: [[left_lsb_bits:%.+]] = OpExtInst [[u32x4_t]] [[glsl_450_ext]] FindILsb [[left_cast]]
+; CHECK: [[left_high_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[left_lsb_bits]] [[left_lsb_bits]] 0 2
+; CHECK: [[left_low_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[left_lsb_bits]] [[left_lsb_bits]] 1 3
+; CHECK: [[left_should_use_high:%.+]] = OpIEqual [[boolx2_t]] [[left_low_bits]] [[const_neg1x2]]
+; CHECK: [[left_ans_bits:%.+]] = OpSelect [[u32x2_t]] [[left_should_use_high]] [[left_high_bits]] [[left_low_bits]]
+; CHECK: [[left_ans_offset:%.+]] = OpSelect [[u32x2_t]] [[left_should_use_high]] [[const_32x2]] [[const_0x2]]
+; CHECK: [[left_res:%.+]] = OpIAdd [[u32x2_t]] [[left_ans_offset]] [[left_ans_bits]]
+
+; CHECK: [[right_cast:%.+]] = OpBitcast [[u32x4_t]] [[right]]
+; CHECK: [[right_lsb_bits:%.+]] = OpExtInst [[u32x4_t]] [[glsl_450_ext]] FindILsb [[right_cast]]
+; CHECK: [[right_high_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[right_lsb_bits]] [[right_lsb_bits]] 0 2
+; CHECK: [[right_low_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[right_lsb_bits]] [[right_lsb_bits]] 1 3
+; CHECK: [[right_should_use_high:%.+]] = OpIEqual [[boolx2_t]] [[right_low_bits]] [[const_neg1x2]]
+; CHECK: [[right_ans_bits:%.+]] = OpSelect [[u32x2_t]] [[right_should_use_high]] [[right_high_bits]] [[right_low_bits]]
+; CHECK: [[right_ans_offset:%.+]] = OpSelect [[u32x2_t]] [[right_should_use_high]] [[const_32x2]] [[const_0x2]]
+; CHECK: [[right_res:%.+]] = OpIAdd [[u32x2_t]] [[right_ans_offset]] [[right_ans_bits]]
+
+; Merge the resulting 2 i32x2 into the final i32x4 and return it
+; CHECK: [[ret:%.+]] = OpCompositeConstruct [[u32x4_t]] [[left_res]] [[right_res]]
+; CHECK: OpReturnValue [[ret]]
+  %elt.firstbitlow = call <4 x i32> @llvm.spv.firstbitlow.v4i64(<4 x i64> %a)
+  ret <4 x i32> %elt.firstbitlow
+}
+
 ;declare i16 @llvm.spv.firstbitlow.i16(i16)
 ;declare i32 @llvm.spv.firstbitlow.i32(i32)
 ;declare i64 @llvm.spv.firstbitlow.i64(i64)

From e67adb99590fcc2fe256ec04e0f31c39ea315ab8 Mon Sep 17 00:00:00 2001
From: Ashley Coleman <ascoleman@microsoft.com>
Date: Mon, 16 Dec 2024 10:59:17 -0700
Subject: [PATCH 07/17] Address comments

---
 .../Target/SPIRV/SPIRVInstructionSelector.cpp | 86 ++++++++++++-------
 1 file changed, 55 insertions(+), 31 deletions(-)

diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index b2115528b8dcb..4588c3bcd2e77 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -105,12 +105,17 @@ class SPIRVInstructionSelector : public InstructionSelector {
 
   bool selectFirstBitSet32(Register ResVReg, const SPIRVType *ResType,
                            MachineInstr &I, Register SrcReg,
-                           unsigned Opcode) const;
+                           unsigned BitSetOpcode) const;
 
   bool selectFirstBitSet64(Register ResVReg, const SPIRVType *ResType,
                            MachineInstr &I, Register SrcReg,
                            unsigned BitSetOpcode, bool SwapPrimarySide) const;
 
+  bool selectFirstBitSet64Overflow(Register ResVReg, const SPIRVType *ResType,
+                                   MachineInstr &I, Register SrcReg,
+                                   unsigned BitSetOpcode,
+                                   bool SwapPrimarySide) const;
+
   bool selectGlobalValue(Register ResVReg, MachineInstr &I,
                          const MachineInstr *Init = nullptr) const;
 
@@ -3157,51 +3162,42 @@ bool SPIRVInstructionSelector::selectFirstBitSet16(
          selectFirstBitSet32(ResVReg, ResType, I, ExtReg, BitSetOpcode);
 }
 
-bool SPIRVInstructionSelector::selectFirstBitSet32(Register ResVReg,
-                                                   const SPIRVType *ResType,
-                                                   MachineInstr &I,
-                                                   Register SrcReg,
-                                                   unsigned Opcode) const {
+bool SPIRVInstructionSelector::selectFirstBitSet32(
+    Register ResVReg, const SPIRVType *ResType, MachineInstr &I,
+    Register SrcReg, unsigned BitSetOpcode) const {
   return BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(SPIRV::OpExtInst))
       .addDef(ResVReg)
       .addUse(GR.getSPIRVTypeID(ResType))
       .addImm(static_cast<uint32_t>(SPIRV::InstructionSet::GLSL_std_450))
-      .addImm(Opcode)
+      .addImm(BitSetOpcode)
       .addUse(SrcReg)
       .constrainAllUses(TII, TRI, RBI);
 }
 
-bool SPIRVInstructionSelector::selectFirstBitSet64(
+bool SPIRVInstructionSelector::selectFirstBitSet64Overflow(
     Register ResVReg, const SPIRVType *ResType, MachineInstr &I,
     Register SrcReg, unsigned BitSetOpcode, bool SwapPrimarySide) const {
+
   unsigned ComponentCount = GR.getScalarOrVectorComponentCount(ResType);
   SPIRVType *BaseType = GR.retrieveScalarOrVectorIntType(ResType);
   bool ZeroAsNull = STI.isOpenCLEnv();
   Register ConstIntZero =
       GR.getOrCreateConstInt(0, I, BaseType, TII, ZeroAsNull);
-  Register ConstIntOne =
-      GR.getOrCreateConstInt(1, I, BaseType, TII, ZeroAsNull);
+  unsigned LeftComponentCount = ComponentCount / 2;
+  unsigned RightComponentCount = ComponentCount - LeftComponentCount;
+  bool LeftIsVector = LeftComponentCount > 1;
 
-  // SPIRV doesn't support vectors with more than 4 components. Since the
-  // algoritm below converts i64 -> i32x2 and i64x4 -> i32x8 it can only
-  // operate on vectors with 2 or less components. When largers vectors are
-  // seen. Split them, recurse, then recombine them.
-  if (ComponentCount > 2) {
-    unsigned LeftComponentCount = ComponentCount / 2;
-    unsigned RightComponentCount = ComponentCount - LeftComponentCount;
-    bool LeftIsVector = LeftComponentCount > 1;
-
-    // Split the SrcReg in half into 2 smaller vec registers
-    // (ie i64x4 -> i64x2, i64x2)
-    MachineIRBuilder MIRBuilder(I);
-    SPIRVType *OpType = GR.getOrCreateSPIRVIntegerType(64, MIRBuilder);
-    SPIRVType *LeftVecOpType;
-    SPIRVType *LeftVecResType;
-    if (LeftIsVector) {
-      LeftVecOpType =
-          GR.getOrCreateSPIRVVectorType(OpType, LeftComponentCount, MIRBuilder);
-      LeftVecResType = GR.getOrCreateSPIRVVectorType(
-          BaseType, LeftComponentCount, MIRBuilder);
+  // Split the SrcReg in half into 2 smaller vec registers
+  // (ie i64x4 -> i64x2, i64x2)
+  MachineIRBuilder MIRBuilder(I);
+  SPIRVType *OpType = GR.getOrCreateSPIRVIntegerType(64, MIRBuilder);
+  SPIRVType *LeftVecOpType;
+  SPIRVType *LeftVecResType;
+  if (LeftIsVector) {
+    LeftVecOpType =
+        GR.getOrCreateSPIRVVectorType(OpType, LeftComponentCount, MIRBuilder);
+    LeftVecResType =
+        GR.getOrCreateSPIRVVectorType(BaseType, LeftComponentCount, MIRBuilder);
     } else {
       LeftVecOpType = OpType;
       LeftVecResType = BaseType;
@@ -3219,6 +3215,8 @@ bool SPIRVInstructionSelector::selectFirstBitSet64(
 
     bool Result;
 
+    // Extract the left half from the SrcReg into LeftSideIn
+    // accounting for the special case when it only has one element
     if (LeftIsVector) {
       auto MIB =
           BuildMI(*I.getParent(), I, I.getDebugLoc(),
@@ -3240,6 +3238,9 @@ bool SPIRVInstructionSelector::selectFirstBitSet64(
                            SPIRV::OpVectorExtractDynamic);
     }
 
+    // Extract the right half from the SrcReg into RightSideIn.
+    // Right will always be a vector since the only time one element is left is
+    // when Component == 3, and in that case Left is one element.
     auto MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(),
                        TII.get(SPIRV::OpVectorShuffle))
                    .addDef(RightSideIn)
@@ -3254,7 +3255,7 @@ bool SPIRVInstructionSelector::selectFirstBitSet64(
 
     Result = Result && MIB.constrainAllUses(TII, TRI, RBI);
 
-    // Recursively call selectFirstBitSet64 on the 2 registers
+    // Recursively call selectFirstBitSet64 on the 2 halves
     Register LeftSideOut =
         MRI->createVirtualRegister(GR.getRegClass(LeftVecResType));
     Register RightSideOut =
@@ -3271,6 +3272,26 @@ bool SPIRVInstructionSelector::selectFirstBitSet64(
     return Result &&
            selectOpWithSrcs(ResVReg, ResType, I, {LeftSideOut, RightSideOut},
                             SPIRV::OpCompositeConstruct);
+}
+
+bool SPIRVInstructionSelector::selectFirstBitSet64(
+    Register ResVReg, const SPIRVType *ResType, MachineInstr &I,
+    Register SrcReg, unsigned BitSetOpcode, bool SwapPrimarySide) const {
+  unsigned ComponentCount = GR.getScalarOrVectorComponentCount(ResType);
+  SPIRVType *BaseType = GR.retrieveScalarOrVectorIntType(ResType);
+  bool ZeroAsNull = STI.isOpenCLEnv();
+  Register ConstIntZero =
+      GR.getOrCreateConstInt(0, I, BaseType, TII, ZeroAsNull);
+  Register ConstIntOne =
+      GR.getOrCreateConstInt(1, I, BaseType, TII, ZeroAsNull);
+
+  // SPIRV doesn't support vectors with more than 4 components. Since the
+  // algorithm below converts i64 -> i32x2 and i64x4 -> i32x8, it can only
+  // operate on vectors with 2 or fewer components. When larger vectors are
+  // seen, split them, recurse, then recombine them.
+  if (ComponentCount > 2) {
+    return selectFirstBitSet64Overflow(ResVReg, ResType, I, SrcReg,
+                                       BitSetOpcode, SwapPrimarySide);
   }
 
   // 1. Split int64 into 2 pieces using a bitcast
@@ -3362,6 +3383,9 @@ bool SPIRVInstructionSelector::selectFirstBitSet64(
   Register SecondaryReg;
   Register PrimaryShiftReg;
   Register SecondaryShiftReg;
+
+  // By default the emitted opcodes check for the set bit from the MSB side.
+  // Setting SwapPrimarySide checks the set bit from the LSB side
   if (SwapPrimarySide) {
     PrimaryReg = LowReg;
     SecondaryReg = HighReg;

From 7b1a8ccb9bef76f39947118e9236231f66ed5712 Mon Sep 17 00:00:00 2001
From: Ashley Coleman <ascoleman@microsoft.com>
Date: Mon, 16 Dec 2024 11:09:19 -0700
Subject: [PATCH 08/17] format

---
 .../Target/SPIRV/SPIRVInstructionSelector.cpp | 119 +++++++++---------
 1 file changed, 59 insertions(+), 60 deletions(-)

diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index 4588c3bcd2e77..9fe14bc415e04 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -3198,80 +3198,79 @@ bool SPIRVInstructionSelector::selectFirstBitSet64Overflow(
         GR.getOrCreateSPIRVVectorType(OpType, LeftComponentCount, MIRBuilder);
     LeftVecResType =
         GR.getOrCreateSPIRVVectorType(BaseType, LeftComponentCount, MIRBuilder);
-    } else {
-      LeftVecOpType = OpType;
-      LeftVecResType = BaseType;
-    }
-
-    SPIRVType *RightVecOpType =
-        GR.getOrCreateSPIRVVectorType(OpType, RightComponentCount, MIRBuilder);
-    SPIRVType *RightVecResType = GR.getOrCreateSPIRVVectorType(
-        BaseType, RightComponentCount, MIRBuilder);
-
-    Register LeftSideIn =
-        MRI->createVirtualRegister(GR.getRegClass(LeftVecOpType));
-    Register RightSideIn =
-        MRI->createVirtualRegister(GR.getRegClass(RightVecOpType));
-
-    bool Result;
+  } else {
+    LeftVecOpType = OpType;
+    LeftVecResType = BaseType;
+  }
 
-    // Extract the left half from the SrcReg into LeftSideIn
-    // accounting for the special case when it only has one element
-    if (LeftIsVector) {
-      auto MIB =
-          BuildMI(*I.getParent(), I, I.getDebugLoc(),
-                  TII.get(SPIRV::OpVectorShuffle))
-              .addDef(LeftSideIn)
-              .addUse(GR.getSPIRVTypeID(LeftVecOpType))
-              .addUse(SrcReg)
-              // Per the spec, repeat the vector if only one vec is needed
-              .addUse(SrcReg);
+  SPIRVType *RightVecOpType =
+      GR.getOrCreateSPIRVVectorType(OpType, RightComponentCount, MIRBuilder);
+  SPIRVType *RightVecResType =
+      GR.getOrCreateSPIRVVectorType(BaseType, RightComponentCount, MIRBuilder);
 
-      for (unsigned J = 0; J < LeftComponentCount; J++) {
-        MIB.addImm(J);
-      }
+  Register LeftSideIn =
+      MRI->createVirtualRegister(GR.getRegClass(LeftVecOpType));
+  Register RightSideIn =
+      MRI->createVirtualRegister(GR.getRegClass(RightVecOpType));
 
-      Result = MIB.constrainAllUses(TII, TRI, RBI);
-    } else {
-      Result =
-          selectOpWithSrcs(LeftSideIn, LeftVecOpType, I, {SrcReg, ConstIntZero},
-                           SPIRV::OpVectorExtractDynamic);
-    }
+  bool Result;
 
-    // Extract the right half from the SrcReg into RightSideIn.
-    // Right will always be a vector since the only time one element is left is
-    // when Component == 3, and in that case Left is one element.
+  // Extract the left half from the SrcReg into LeftSideIn
+  // accounting for the special case when it only has one element
+  if (LeftIsVector) {
     auto MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(),
                        TII.get(SPIRV::OpVectorShuffle))
-                   .addDef(RightSideIn)
-                   .addUse(GR.getSPIRVTypeID(RightVecOpType))
+                   .addDef(LeftSideIn)
+                   .addUse(GR.getSPIRVTypeID(LeftVecOpType))
                    .addUse(SrcReg)
                    // Per the spec, repeat the vector if only one vec is needed
                    .addUse(SrcReg);
 
-    for (unsigned J = LeftComponentCount; J < ComponentCount; J++) {
+    for (unsigned J = 0; J < LeftComponentCount; J++) {
       MIB.addImm(J);
     }
 
-    Result = Result && MIB.constrainAllUses(TII, TRI, RBI);
+    Result = MIB.constrainAllUses(TII, TRI, RBI);
+  } else {
+    Result =
+        selectOpWithSrcs(LeftSideIn, LeftVecOpType, I, {SrcReg, ConstIntZero},
+                         SPIRV::OpVectorExtractDynamic);
+  }
 
-    // Recursively call selectFirstBitSet64 on the 2 halves
-    Register LeftSideOut =
-        MRI->createVirtualRegister(GR.getRegClass(LeftVecResType));
-    Register RightSideOut =
-        MRI->createVirtualRegister(GR.getRegClass(RightVecResType));
-    Result = Result &&
-             selectFirstBitSet64(LeftSideOut, LeftVecResType, I, LeftSideIn,
-                                 BitSetOpcode, SwapPrimarySide);
-    Result = Result &&
-             selectFirstBitSet64(RightSideOut, RightVecResType, I, RightSideIn,
-                                 BitSetOpcode, SwapPrimarySide);
-
-    // Join the two resulting registers back into the return type
-    // (ie i32x2, i32x2 -> i32x4)
-    return Result &&
-           selectOpWithSrcs(ResVReg, ResType, I, {LeftSideOut, RightSideOut},
-                            SPIRV::OpCompositeConstruct);
+  // Extract the right half from the SrcReg into RightSideIn.
+  // Right will always be a vector since the only time one element is left is
+  // when Component == 3, and in that case Left is one element.
+  auto MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(),
+                     TII.get(SPIRV::OpVectorShuffle))
+                 .addDef(RightSideIn)
+                 .addUse(GR.getSPIRVTypeID(RightVecOpType))
+                 .addUse(SrcReg)
+                 // Per the spec, repeat the vector if only one vec is needed
+                 .addUse(SrcReg);
+
+  for (unsigned J = LeftComponentCount; J < ComponentCount; J++) {
+    MIB.addImm(J);
+  }
+
+  Result = Result && MIB.constrainAllUses(TII, TRI, RBI);
+
+  // Recursively call selectFirstBitSet64 on the 2 halves
+  Register LeftSideOut =
+      MRI->createVirtualRegister(GR.getRegClass(LeftVecResType));
+  Register RightSideOut =
+      MRI->createVirtualRegister(GR.getRegClass(RightVecResType));
+  Result =
+      Result && selectFirstBitSet64(LeftSideOut, LeftVecResType, I, LeftSideIn,
+                                    BitSetOpcode, SwapPrimarySide);
+  Result =
+      Result && selectFirstBitSet64(RightSideOut, RightVecResType, I,
+                                    RightSideIn, BitSetOpcode, SwapPrimarySide);
+
+  // Join the two resulting registers back into the return type
+  // (ie i32x2, i32x2 -> i32x4)
+  return Result &&
+         selectOpWithSrcs(ResVReg, ResType, I, {LeftSideOut, RightSideOut},
+                          SPIRV::OpCompositeConstruct);
 }
 
 bool SPIRVInstructionSelector::selectFirstBitSet64(

From 742647b68a4c676b059a67e462d4399677756742 Mon Sep 17 00:00:00 2001
From: Ashley Coleman <ascoleman@microsoft.com>
Date: Mon, 16 Dec 2024 14:02:52 -0700
Subject: [PATCH 09/17] Address comments

---
 .../Target/SPIRV/SPIRVInstructionSelector.cpp | 40 +++++++++----------
 1 file changed, 19 insertions(+), 21 deletions(-)

diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index 9fe14bc415e04..3872409be44c6 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -3191,27 +3191,26 @@ bool SPIRVInstructionSelector::selectFirstBitSet64Overflow(
   // (ie i64x4 -> i64x2, i64x2)
   MachineIRBuilder MIRBuilder(I);
   SPIRVType *OpType = GR.getOrCreateSPIRVIntegerType(64, MIRBuilder);
-  SPIRVType *LeftVecOpType;
-  SPIRVType *LeftVecResType;
+  SPIRVType *LeftOpType;
+  SPIRVType *LeftResType;
   if (LeftIsVector) {
-    LeftVecOpType =
+    LeftOpType =
         GR.getOrCreateSPIRVVectorType(OpType, LeftComponentCount, MIRBuilder);
-    LeftVecResType =
+    LeftResType =
         GR.getOrCreateSPIRVVectorType(BaseType, LeftComponentCount, MIRBuilder);
   } else {
-    LeftVecOpType = OpType;
-    LeftVecResType = BaseType;
+    LeftOpType = OpType;
+    LeftResType = BaseType;
   }
 
-  SPIRVType *RightVecOpType =
+  SPIRVType *RightOpType =
       GR.getOrCreateSPIRVVectorType(OpType, RightComponentCount, MIRBuilder);
-  SPIRVType *RightVecResType =
+  SPIRVType *RightResType =
       GR.getOrCreateSPIRVVectorType(BaseType, RightComponentCount, MIRBuilder);
 
-  Register LeftSideIn =
-      MRI->createVirtualRegister(GR.getRegClass(LeftVecOpType));
+  Register LeftSideIn = MRI->createVirtualRegister(GR.getRegClass(LeftOpType));
   Register RightSideIn =
-      MRI->createVirtualRegister(GR.getRegClass(RightVecOpType));
+      MRI->createVirtualRegister(GR.getRegClass(RightOpType));
 
   bool Result;
 
@@ -3221,7 +3220,7 @@ bool SPIRVInstructionSelector::selectFirstBitSet64Overflow(
     auto MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(),
                        TII.get(SPIRV::OpVectorShuffle))
                    .addDef(LeftSideIn)
-                   .addUse(GR.getSPIRVTypeID(LeftVecOpType))
+                   .addUse(GR.getSPIRVTypeID(LeftOpType))
                    .addUse(SrcReg)
                    // Per the spec, repeat the vector if only one vec is needed
                    .addUse(SrcReg);
@@ -3232,9 +3231,8 @@ bool SPIRVInstructionSelector::selectFirstBitSet64Overflow(
 
     Result = MIB.constrainAllUses(TII, TRI, RBI);
   } else {
-    Result =
-        selectOpWithSrcs(LeftSideIn, LeftVecOpType, I, {SrcReg, ConstIntZero},
-                         SPIRV::OpVectorExtractDynamic);
+    Result = selectOpWithSrcs(LeftSideIn, LeftOpType, I, {SrcReg, ConstIntZero},
+                              SPIRV::OpVectorExtractDynamic);
   }
 
   // Extract the right half from the SrcReg into RightSideIn.
@@ -3243,7 +3241,7 @@ bool SPIRVInstructionSelector::selectFirstBitSet64Overflow(
   auto MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(),
                      TII.get(SPIRV::OpVectorShuffle))
                  .addDef(RightSideIn)
-                 .addUse(GR.getSPIRVTypeID(RightVecOpType))
+                 .addUse(GR.getSPIRVTypeID(RightOpType))
                  .addUse(SrcReg)
                  // Per the spec, repeat the vector if only one vec is needed
                  .addUse(SrcReg);
@@ -3256,15 +3254,15 @@ bool SPIRVInstructionSelector::selectFirstBitSet64Overflow(
 
   // Recursively call selectFirstBitSet64 on the 2 halves
   Register LeftSideOut =
-      MRI->createVirtualRegister(GR.getRegClass(LeftVecResType));
+      MRI->createVirtualRegister(GR.getRegClass(LeftResType));
   Register RightSideOut =
-      MRI->createVirtualRegister(GR.getRegClass(RightVecResType));
+      MRI->createVirtualRegister(GR.getRegClass(RightResType));
   Result =
-      Result && selectFirstBitSet64(LeftSideOut, LeftVecResType, I, LeftSideIn,
+      Result && selectFirstBitSet64(LeftSideOut, LeftResType, I, LeftSideIn,
                                     BitSetOpcode, SwapPrimarySide);
   Result =
-      Result && selectFirstBitSet64(RightSideOut, RightVecResType, I,
-                                    RightSideIn, BitSetOpcode, SwapPrimarySide);
+      Result && selectFirstBitSet64(RightSideOut, RightResType, I, RightSideIn,
+                                    BitSetOpcode, SwapPrimarySide);
 
   // Join the two resulting registers back into the return type
   // (ie i32x2, i32x2 -> i32x4)

From 553335fb8f2e43bee60ec3c8d19e925231d215c1 Mon Sep 17 00:00:00 2001
From: Ashley Coleman <ascoleman@microsoft.com>
Date: Mon, 16 Dec 2024 17:54:47 -0700
Subject: [PATCH 10/17] Update tests

---
 .../SPIRV/hlsl-intrinsics/firstbithigh.ll     | 236 +++++++++++++++---
 .../SPIRV/hlsl-intrinsics/firstbitlow.ll      |  16 +-
 2 files changed, 204 insertions(+), 48 deletions(-)

diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/firstbithigh.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/firstbithigh.ll
index 3d35e102310f5..dee48061d2fe1 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/firstbithigh.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/firstbithigh.ll
@@ -1,94 +1,250 @@
 ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s
 ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
-; CHECK: OpMemoryModel Logical GLSL450
-; CHECK-DAG: [[Z:%.*]] = OpConstant %[[#]] 0
-; CHECK-DAG: [[X:%.*]] = OpConstant %[[#]] 1
+; CHECK-DAG: [[glsl_450_ext:%.+]] = OpExtInstImport "GLSL.std.450"
+; CHECK-DAG: OpMemoryModel Logical GLSL450
+; CHECK-DAG: [[u32_t:%.+]] = OpTypeInt 32 0
+; CHECK-DAG: [[u32x2_t:%.+]] = OpTypeVector [[u32_t]] 2
+; CHECK-DAG: [[u32x3_t:%.+]] = OpTypeVector [[u32_t]] 3
+; CHECK-DAG: [[u32x4_t:%.+]] = OpTypeVector [[u32_t]] 4
+; CHECK-DAG: [[const_0:%.*]] = OpConstant [[u32_t]] 0
+; CHECK-DAG: [[const_0x2:%.*]] = OpConstantComposite [[u32x2_t]] [[const_0]] [[const_0]]
+; CHECK-DAG: [[const_1:%.*]] = OpConstant [[u32_t]] 1
+; CHECK-DAG: [[const_32:%.*]] = OpConstant [[u32_t]] 32
+; CHECK-DAG: [[const_32x2:%.*]] = OpConstantComposite [[u32x2_t]] [[const_32]] [[const_32]]
+; CHECK-DAG: [[const_neg1:%.*]] = OpConstant [[u32_t]] 4294967295
+; CHECK-DAG: [[const_neg1x2:%.*]] = OpConstantComposite [[u32x2_t]] [[const_neg1]] [[const_neg1]]
+; CHECK-DAG: [[u16_t:%.+]] = OpTypeInt 16 0
+; CHECK-DAG: [[u16x2_t:%.+]] = OpTypeVector [[u16_t]] 2
+; CHECK-DAG: [[u16x3_t:%.+]] = OpTypeVector [[u16_t]] 3
+; CHECK-DAG: [[u16x4_t:%.+]] = OpTypeVector [[u16_t]] 4
+; CHECK-DAG: [[u64_t:%.+]] = OpTypeInt 64 0
+; CHECK-DAG: [[u64x2_t:%.+]] = OpTypeVector [[u64_t]] 2
+; CHECK-DAG: [[u64x3_t:%.+]] = OpTypeVector [[u64_t]] 3
+; CHECK-DAG: [[u64x4_t:%.+]] = OpTypeVector [[u64_t]] 4
+; CHECK-DAG: [[bool_t:%.+]] = OpTypeBool
+; CHECK-DAG: [[boolx2_t:%.+]] = OpTypeVector [[bool_t]] 2
 
+; CHECK-LABEL: Begin function firstbituhigh_i32
 define noundef i32 @firstbituhigh_i32(i32 noundef %a) {
 entry:
-; CHECK: %[[#]] = OpExtInst %[[#]] %[[#]] FindUMsb %[[#]]
+; CHECK: [[a:%.+]] = OpFunctionParameter [[u32_t]]
+; CHECK: [[ret:%.+]] = OpExtInst [[u32_t]] [[glsl_450_ext]] FindUMsb [[a]]
+; CHECK: OpReturnValue [[ret]]
   %elt.firstbituhigh = call i32 @llvm.spv.firstbituhigh.i32(i32 %a)
   ret i32 %elt.firstbituhigh
 }
 
-define noundef <2 x i32> @firstbituhigh_2xi32(<2 x i32> noundef %a) {
+; CHECK-LABEL: Begin function firstbituhigh_v2xi32
+define noundef <2 x i32> @firstbituhigh_v2xi32(<2 x i32> noundef %a) {
 entry:
-; CHECK: %[[#]] = OpExtInst %[[#]] %[[#]] FindUMsb %[[#]]
+; CHECK: [[a:%.+]] = OpFunctionParameter [[u32x2_t]]
+; CHECK: [[ret:%.+]] = OpExtInst [[u32x2_t]] [[glsl_450_ext]] FindUMsb [[a]]
+; CHECK: OpReturnValue [[ret]]
   %elt.firstbituhigh = call <2 x i32> @llvm.spv.firstbituhigh.v2i32(<2 x i32> %a)
   ret <2 x i32> %elt.firstbituhigh
 }
 
+; CHECK-LABEL: Begin function firstbituhigh_v3xi32
+define noundef <3 x i32> @firstbituhigh_v3xi32(<3 x i32> noundef %a) {
+entry:
+; CHECK: [[a:%.+]] = OpFunctionParameter [[u32x3_t]]
+; CHECK: [[ret:%.+]] = OpExtInst [[u32x3_t]] [[glsl_450_ext]] FindUMsb [[a]]
+; CHECK: OpReturnValue [[ret]]
+  %elt.firstbituhigh = call <3 x i32> @llvm.spv.firstbituhigh.v3i32(<3 x i32> %a)
+  ret <3 x i32> %elt.firstbituhigh
+}
+
+; CHECK-LABEL: Begin function firstbituhigh_v4xi32
+define noundef <4 x i32> @firstbituhigh_v4xi32(<4 x i32> noundef %a) {
+entry:
+; CHECK: [[a:%.+]] = OpFunctionParameter [[u32x4_t]]
+; CHECK: [[ret:%.+]] = OpExtInst [[u32x4_t]] [[glsl_450_ext]] FindUMsb [[a]]
+; CHECK: OpReturnValue [[ret]]
+  %elt.firstbituhigh = call <4 x i32> @llvm.spv.firstbituhigh.v4i32(<4 x i32> %a)
+  ret <4 x i32> %elt.firstbituhigh
+}
+
+; CHECK-LABEL: Begin function firstbituhigh_i16
 define noundef i32 @firstbituhigh_i16(i16 noundef %a) {
 entry:
-; CHECK: [[A:%.*]] = OpUConvert %[[#]]
-; CHECK: %[[#]] = OpExtInst %[[#]] %[[#]] FindUMsb [[A]]
+; CHECK: [[a16:%.+]] = OpFunctionParameter [[u16_t]]
+; CHECK: [[a32:%.+]] = OpUConvert [[u32_t]] [[a16]]
+; CHECK: [[ret:%.+]] = OpExtInst [[u32_t]] [[glsl_450_ext]] FindUMsb [[a32]]
+; CHECK: OpReturnValue [[ret]]
   %elt.firstbituhigh = call i32 @llvm.spv.firstbituhigh.i16(i16 %a)
   ret i32 %elt.firstbituhigh
 }
 
-define noundef <2 x i32> @firstbituhigh_v2i16(<2 x i16> noundef %a) {
+; CHECK-LABEL: Begin function firstbituhigh_v2xi16
+define noundef <2 x i32> @firstbituhigh_v2xi16(<2 x i16> noundef %a) {
 entry:
-; CHECK: [[A:%.*]] = OpUConvert %[[#]]
-; CHECK: %[[#]] = OpExtInst %[[#]] %[[#]] FindUMsb [[A]]
+; CHECK: [[a16:%.+]] = OpFunctionParameter [[u16x2_t]]
+; CHECK: [[a32:%.+]] = OpUConvert [[u32x2_t]] [[a16]]
+; CHECK: [[ret:%.+]] = OpExtInst [[u32x2_t]] [[glsl_450_ext]] FindUMsb [[a32]]
+; CHECK: OpReturnValue [[ret]]
   %elt.firstbituhigh = call <2 x i32> @llvm.spv.firstbituhigh.v2i16(<2 x i16> %a)
   ret <2 x i32> %elt.firstbituhigh
 }
 
+; CHECK-LABEL: Begin function firstbituhigh_v3xi16
+define noundef <3 x i32> @firstbituhigh_v3xi16(<3 x i16> noundef %a) {
+entry:
+; CHECK: [[a16:%.+]] = OpFunctionParameter [[u16x3_t]]
+; CHECK: [[a32:%.+]] = OpUConvert [[u32x3_t]] [[a16]]
+; CHECK: [[ret:%.+]] = OpExtInst [[u32x3_t]] [[glsl_450_ext]] FindUMsb [[a32]]
+; CHECK: OpReturnValue [[ret]]
+  %elt.firstbituhigh = call <3 x i32> @llvm.spv.firstbituhigh.v3i16(<3 x i16> %a)
+  ret <3 x i32> %elt.firstbituhigh
+}
+
+; CHECK-LABEL: Begin function firstbituhigh_v4xi16
+define noundef <4 x i32> @firstbituhigh_v4xi16(<4 x i16> noundef %a) {
+entry:
+; CHECK: [[a16:%.+]] = OpFunctionParameter [[u16x4_t]]
+; CHECK: [[a32:%.+]] = OpUConvert [[u32x4_t]] [[a16]]
+; CHECK: [[ret:%.+]] = OpExtInst [[u32x4_t]] [[glsl_450_ext]] FindUMsb [[a32]]
+; CHECK: OpReturnValue [[ret]]
+  %elt.firstbituhigh = call <4 x i32> @llvm.spv.firstbituhigh.v4i16(<4 x i16> %a)
+  ret <4 x i32> %elt.firstbituhigh
+}
+
+; CHECK-LABEL: Begin function firstbituhigh_i64
 define noundef i32 @firstbituhigh_i64(i64 noundef %a) {
 entry:
-; CHECK: [[O:%.*]] = OpBitcast %[[#]] %[[#]]
-; CHECK: [[N:%.*]] = OpExtInst %[[#]] %[[#]] FindUMsb [[O]]
-; CHECK: [[M:%.*]] = OpVectorExtractDynamic %[[#]] [[N]] [[Z]]
-; CHECK: [[L:%.*]] = OpVectorExtractDynamic %[[#]] [[N]] [[X]]
-; CHECK: [[I:%.*]] = OpIEqual %[[#]] [[M]] %[[#]]
-; CHECK: [[H:%.*]] = OpSelect %[[#]] [[I]] [[L]] [[M]]
-; CHECK: [[C:%.*]] = OpSelect %[[#]] [[I]] %[[#]] %[[#]]
-; CHECK: [[B:%.*]] = OpIAdd %[[#]] [[C]] [[H]]
+; CHECK: [[a64:%.+]] = OpFunctionParameter [[u64_t]]
+; CHECK: [[a32x2:%.+]] = OpBitcast [[u32x2_t]] [[a64]]
+; CHECK: [[lsb_bits:%.+]] = OpExtInst [[u32x2_t]] [[glsl_450_ext]] FindUMsb [[a32x2]]
+; CHECK: [[high_bits:%.+]] = OpVectorExtractDynamic [[u32_t]] [[lsb_bits]] [[const_0]]
+; CHECK: [[low_bits:%.+]] = OpVectorExtractDynamic [[u32_t]] [[lsb_bits]] [[const_1]]
+; CHECK: [[should_use_low:%.+]] = OpIEqual [[bool_t]] [[high_bits]] [[const_neg1]]
+; CHECK: [[ans_bits:%.+]] = OpSelect [[u32_t]] [[should_use_low]] [[low_bits]] [[high_bits]]
+; CHECK: [[ans_offset:%.+]] = OpSelect [[u32_t]] [[should_use_low]] [[const_0]] [[const_32]]
+; CHECK: [[ret:%.+]] = OpIAdd [[u32_t]] [[ans_offset]] [[ans_bits]]
+; CHECK: OpReturnValue [[ret]]
   %elt.firstbituhigh = call i32 @llvm.spv.firstbituhigh.i64(i64 %a)
   ret i32 %elt.firstbituhigh
 }
 
-define noundef <2 x i32> @firstbituhigh_v2i64(<2 x i64> noundef %a) {
+; CHECK-LABEL: Begin function firstbituhigh_v2xi64
+define noundef <2 x i32> @firstbituhigh_v2xi64(<2 x i64> noundef %a) {
 entry:
-; CHECK: [[O:%.*]] = OpBitcast %[[#]] %[[#]]
-; CHECK: [[N:%.*]] = OpExtInst %[[#]] %[[#]] FindUMsb [[O]]
-; CHECK: [[M:%.*]] = OpVectorShuffle %[[#]] [[N]] [[N]] 0
-; CHECK: [[L:%.*]] = OpVectorShuffle %[[#]] [[N]] [[N]] 1
-; CHECK: [[I:%.*]] = OpIEqual %[[#]] [[M]] %[[#]]
-; CHECK: [[H:%.*]] = OpSelect %[[#]] [[I]] [[L]] [[M]]
-; CHECK: [[C:%.*]] = OpSelect %[[#]] [[I]] %[[#]] %[[#]]
-; CHECK: [[B:%.*]] = OpIAdd %[[#]] [[C]] [[H]]
-; CHECK: OpReturnValue [[B]]
+; CHECK: [[a64x2:%.+]] = OpFunctionParameter [[u64x2_t]]
+; CHECK: [[a32x4:%.+]] = OpBitcast [[u32x4_t]] [[a64x2]]
+; CHECK: [[lsb_bits:%.+]] = OpExtInst [[u32x4_t]] [[glsl_450_ext]] FindUMsb [[a32x4]]
+; CHECK: [[high_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[lsb_bits]] [[lsb_bits]] 0 2
+; CHECK: [[low_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[lsb_bits]] [[lsb_bits]] 1 3
+; CHECK: [[should_use_low:%.+]] = OpIEqual [[boolx2_t]] [[high_bits]] [[const_neg1x2]]
+; CHECK: [[ans_bits:%.+]] = OpSelect [[u32x2_t]] [[should_use_low]] [[low_bits]] [[high_bits]]
+; CHECK: [[ans_offset:%.+]] = OpSelect [[u32x2_t]] [[should_use_low]] [[const_0x2]] [[const_32x2]]
+; CHECK: [[ret:%.+]] = OpIAdd [[u32x2_t]] [[ans_offset]] [[ans_bits]]
+; CHECK: OpReturnValue [[ret]]
   %elt.firstbituhigh = call <2 x i32> @llvm.spv.firstbituhigh.v2i64(<2 x i64> %a)
   ret <2 x i32> %elt.firstbituhigh
 }
 
+; CHECK-LABEL: Begin function firstbituhigh_v3xi64
+define noundef <3 x i32> @firstbituhigh_v3xi64(<3 x i64> noundef %a) {
+entry:
+; Split the i64x3 into i64, i64x2
+; CHECK: [[a:%.+]] = OpFunctionParameter [[u64x3_t]]
+; CHECK: [[left:%.+]] = OpVectorExtractDynamic [[u64_t]] [[a]] [[const_0]]
+; CHECK: [[right:%.+]] = OpVectorShuffle [[u64x2_t]] [[a]] [[a]] 1 2
+
+; Do firstbituhigh on i64, i64x2
+; CHECK: [[left_cast:%.+]] = OpBitcast [[u32x2_t]] [[left]]
+; CHECK: [[left_lsb_bits:%.+]] = OpExtInst [[u32x2_t]] [[glsl_450_ext]] FindUMsb [[left_cast]]
+; CHECK: [[left_high_bits:%.+]] = OpVectorExtractDynamic [[u32_t]] [[left_lsb_bits]] [[const_0]]
+; CHECK: [[left_low_bits:%.+]] = OpVectorExtractDynamic [[u32_t]] [[left_lsb_bits]] [[const_1]]
+; CHECK: [[left_should_use_low:%.+]] = OpIEqual [[bool_t]] [[left_high_bits]] [[const_neg1]]
+; CHECK: [[left_ans_bits:%.+]] = OpSelect [[u32_t]] [[left_should_use_low]] [[left_low_bits]] [[left_high_bits]]
+; CHECK: [[left_ans_offset:%.+]] = OpSelect [[u32_t]] [[left_should_use_low]] [[const_0]] [[const_32]]
+; CHECK: [[left_res:%.+]] = OpIAdd [[u32_t]] [[left_ans_offset]] [[left_ans_bits]]
+
+; CHECK: [[right_cast:%.+]] = OpBitcast [[u32x4_t]] [[right]]
+; CHECK: [[right_lsb_bits:%.+]] = OpExtInst [[u32x4_t]] [[glsl_450_ext]] FindUMsb [[right_cast]]
+; CHECK: [[right_high_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[right_lsb_bits]] [[right_lsb_bits]] 0 2
+; CHECK: [[right_low_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[right_lsb_bits]] [[right_lsb_bits]] 1 3
+; CHECK: [[right_should_use_low:%.+]] = OpIEqual [[boolx2_t]] [[right_high_bits]] [[const_neg1x2]]
+; CHECK: [[right_ans_bits:%.+]] = OpSelect [[u32x2_t]] [[right_should_use_low]] [[right_low_bits]] [[right_high_bits]]
+; CHECK: [[right_ans_offset:%.+]] = OpSelect [[u32x2_t]] [[right_should_use_low]] [[const_0x2]] [[const_32x2]]
+; CHECK: [[right_res:%.+]] = OpIAdd [[u32x2_t]] [[right_ans_offset]] [[right_ans_bits]]
+
+; Merge the resulting i32, i32x2 into the final i32x3 and return it
+; CHECK: [[ret:%.+]] = OpCompositeConstruct [[u32x3_t]] [[left_res]] [[right_res]]
+; CHECK: OpReturnValue [[ret]]
+  %elt.firstbituhigh = call <3 x i32> @llvm.spv.firstbituhigh.v3i64(<3 x i64> %a)
+  ret <3 x i32> %elt.firstbituhigh
+}
+
+; CHECK-LABEL: Begin function firstbituhigh_v4xi64
+define noundef <4 x i32> @firstbituhigh_v4xi64(<4 x i64> noundef %a) {
+entry:
+; Split the i64x4 into 2 i64x2
+; CHECK: [[a:%.+]] = OpFunctionParameter [[u64x4_t]]
+; CHECK: [[left:%.+]] = OpVectorShuffle [[u64x2_t]] [[a]] [[a]] 0 1
+; CHECK: [[right:%.+]] = OpVectorShuffle [[u64x2_t]] [[a]] [[a]] 2 3
+
+; Do firstbithigh on the 2 i64x2
+; CHECK: [[left_cast:%.+]] = OpBitcast [[u32x4_t]] [[left]]
+; CHECK: [[left_lsb_bits:%.+]] = OpExtInst [[u32x4_t]] [[glsl_450_ext]] FindUMsb [[left_cast]]
+; CHECK: [[left_high_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[left_lsb_bits]] [[left_lsb_bits]] 0 2
+; CHECK: [[left_low_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[left_lsb_bits]] [[left_lsb_bits]] 1 3
+; CHECK: [[left_should_use_low:%.+]] = OpIEqual [[boolx2_t]] [[left_high_bits]] [[const_neg1x2]]
+; CHECK: [[left_ans_bits:%.+]] = OpSelect [[u32x2_t]] [[left_should_use_low]] [[left_low_bits]] [[left_high_bits]]
+; CHECK: [[left_ans_offset:%.+]] = OpSelect [[u32x2_t]] [[left_should_use_low]] [[const_0x2]] [[const_32x2]]
+; CHECK: [[left_res:%.+]] = OpIAdd [[u32x2_t]] [[left_ans_offset]] [[left_ans_bits]]
+
+; CHECK: [[right_cast:%.+]] = OpBitcast [[u32x4_t]] [[right]]
+; CHECK: [[right_lsb_bits:%.+]] = OpExtInst [[u32x4_t]] [[glsl_450_ext]] FindUMsb [[right_cast]]
+; CHECK: [[right_high_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[right_lsb_bits]] [[right_lsb_bits]] 0 2
+; CHECK: [[right_low_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[right_lsb_bits]] [[right_lsb_bits]] 1 3
+; CHECK: [[right_should_use_low:%.+]] = OpIEqual [[boolx2_t]] [[right_high_bits]] [[const_neg1x2]]
+; CHECK: [[right_ans_bits:%.+]] = OpSelect [[u32x2_t]] [[right_should_use_low]] [[right_low_bits]] [[right_high_bits]]
+; CHECK: [[right_ans_offset:%.+]] = OpSelect [[u32x2_t]] [[right_should_use_low]] [[const_0x2]] [[const_32x2]]
+; CHECK: [[right_res:%.+]] = OpIAdd [[u32x2_t]] [[right_ans_offset]] [[right_ans_bits]]
+
+; Merge the resulting 2 i32x2 into the final i32x4 and return it
+; CHECK: [[ret:%.+]] = OpCompositeConstruct [[u32x4_t]] [[left_res]] [[right_res]]
+; CHECK: OpReturnValue [[ret]]
+  %elt.firstbituhigh = call <4 x i32> @llvm.spv.firstbituhigh.v4i64(<4 x i64> %a)
+  ret <4 x i32> %elt.firstbituhigh
+}
+
+; CHECK-LABEL: Begin function firstbitshigh_i32
 define noundef i32 @firstbitshigh_i32(i32 noundef %a) {
 entry:
-; CHECK: %[[#]] = OpExtInst %[[#]] %[[#]] FindSMsb %[[#]]
+; CHECK: [[a:%.+]] = OpFunctionParameter [[u32_t]]
+; CHECK: [[ret:%.+]] = OpExtInst [[u32_t]] [[glsl_450_ext]] FindSMsb [[a]]
+; CHECK: OpReturnValue [[ret]]
   %elt.firstbitshigh = call i32 @llvm.spv.firstbitshigh.i32(i32 %a)
   ret i32 %elt.firstbitshigh
 }
 
+; CHECK-LABEL: Begin function firstbitshigh_i16
 define noundef i32 @firstbitshigh_i16(i16 noundef %a) {
 entry:
-; CHECK: [[A:%.*]] = OpSConvert %[[#]]
-; CHECK: %[[#]] = OpExtInst %[[#]] %[[#]] FindSMsb %[[#]]
+; CHECK: [[a16:%.+]] = OpFunctionParameter [[u16_t]]
+; CHECK: [[a32:%.+]] = OpSConvert [[u32_t]] [[a16]]
+; CHECK: [[ret:%.+]] = OpExtInst [[u32_t]] [[glsl_450_ext]] FindSMsb [[a32]]
+; CHECK: OpReturnValue [[ret]]
   %elt.firstbitshigh = call i32 @llvm.spv.firstbitshigh.i16(i16 %a)
   ret i32 %elt.firstbitshigh
 }
 
+; CHECK-LABEL: Begin function firstbitshigh_i64
 define noundef i32 @firstbitshigh_i64(i64 noundef %a) {
 entry:
-; CHECK: [[O:%.*]] = OpBitcast %[[#]] %[[#]]
-; CHECK: [[N:%.*]] = OpExtInst %[[#]] %[[#]] FindSMsb [[O]]
-; CHECK: [[M:%.*]] = OpVectorExtractDynamic %[[#]] [[N]] [[Z]]
-; CHECK: [[L:%.*]] = OpVectorExtractDynamic %[[#]] [[N]] [[X]]
-; CHECK: [[I:%.*]] = OpIEqual %[[#]] [[M]] %[[#]]
-; CHECK: [[H:%.*]] = OpSelect %[[#]] [[I]] [[L]] [[M]]
-; CHECK: [[C:%.*]] = OpSelect %[[#]] [[I]] %[[#]] %[[#]]
-; CHECK: [[B:%.*]] = OpIAdd %[[#]] [[C]] [[H]]
+; CHECK: [[a64:%.+]] = OpFunctionParameter [[u64_t]]
+; CHECK: [[a32x2:%.+]] = OpBitcast [[u32x2_t]] [[a64]]
+; CHECK: [[lsb_bits:%.+]] = OpExtInst [[u32x2_t]] [[glsl_450_ext]] FindSMsb [[a32x2]]
+; CHECK: [[high_bits:%.+]] = OpVectorExtractDynamic [[u32_t]] [[lsb_bits]] [[const_0]]
+; CHECK: [[low_bits:%.+]] = OpVectorExtractDynamic [[u32_t]] [[lsb_bits]] [[const_1]]
+; CHECK: [[should_use_low:%.+]] = OpIEqual [[bool_t]] [[high_bits]] [[const_neg1]]
+; CHECK: [[ans_bits:%.+]] = OpSelect [[u32_t]] [[should_use_low]] [[low_bits]] [[high_bits]]
+; CHECK: [[ans_offset:%.+]] = OpSelect [[u32_t]] [[should_use_low]] [[const_0]] [[const_32]]
+; CHECK: [[ret:%.+]] = OpIAdd [[u32_t]] [[ans_offset]] [[ans_bits]]
+; CHECK: OpReturnValue [[ret]]
   %elt.firstbitshigh = call i32 @llvm.spv.firstbitshigh.i64(i64 %a)
   ret i32 %elt.firstbitshigh
 }
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/firstbitlow.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/firstbitlow.ll
index f3cc73637b136..262cc2610600f 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/firstbitlow.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/firstbitlow.ll
@@ -76,8 +76,8 @@ entry:
   ret i32 %elt.firstbitlow
 }
 
-; CHECK-LABEL: Begin function firstbitlow_v2i16
-define noundef <2 x i32> @firstbitlow_v2i16(<2 x i16> noundef %a) {
+; CHECK-LABEL: Begin function firstbitlow_v2xi16
+define noundef <2 x i32> @firstbitlow_v2xi16(<2 x i16> noundef %a) {
 entry:
 ; CHECK: [[a16:%.+]] = OpFunctionParameter [[u16x2_t]]
 ; CHECK: [[a32:%.+]] = OpUConvert [[u32x2_t]] [[a16]]
@@ -126,8 +126,8 @@ entry:
   ret i32 %elt.firstbitlow
 }
 
-; CHECK-LABEL: Begin function firstbitlow_v2i64
-define noundef <2 x i32> @firstbitlow_v2i64(<2 x i64> noundef %a) {
+; CHECK-LABEL: Begin function firstbitlow_v2xi64
+define noundef <2 x i32> @firstbitlow_v2xi64(<2 x i64> noundef %a) {
 entry:
 ; CHECK: [[a64x2:%.+]] = OpFunctionParameter [[u64x2_t]]
 ; CHECK: [[a32x4:%.+]] = OpBitcast [[u32x4_t]] [[a64x2]]
@@ -143,8 +143,8 @@ entry:
   ret <2 x i32> %elt.firstbitlow
 }
 
-; CHECK-LABEL: Begin function firstbitlow_v3i64
-define noundef <3 x i32> @firstbitlow_v3i64(<3 x i64> noundef %a) {
+; CHECK-LABEL: Begin function firstbitlow_v3xi64
+define noundef <3 x i32> @firstbitlow_v3xi64(<3 x i64> noundef %a) {
 entry:
 ; Split the i64x3 into i64, i64x2
 ; CHECK: [[a:%.+]] = OpFunctionParameter [[u64x3_t]]
@@ -177,8 +177,8 @@ entry:
   ret <3 x i32> %elt.firstbitlow
 }
 
-; CHECK-LABEL: Begin function firstbitlow_v4i64
-define noundef <4 x i32> @firstbitlow_v4i64(<4 x i64> noundef %a) {
+; CHECK-LABEL: Begin function firstbitlow_v4xi64
+define noundef <4 x i32> @firstbitlow_v4xi64(<4 x i64> noundef %a) {
 entry:
 ; Split the i64x4 into 2 i64x2
 ; CHECK: [[a:%.+]] = OpFunctionParameter [[u64x4_t]]

From 525a6620435a26cc3eb2cc7bc25262d898780f90 Mon Sep 17 00:00:00 2001
From: Ashley Coleman <ascoleman@microsoft.com>
Date: Tue, 17 Dec 2024 15:18:05 -0700
Subject: [PATCH 11/17] Address review comments: use early returns in firstbit selection

---
 .../Target/SPIRV/SPIRVInstructionSelector.cpp | 105 +++++++++---------
 1 file changed, 51 insertions(+), 54 deletions(-)

diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index 3872409be44c6..86d44705f0982 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -3191,16 +3191,13 @@ bool SPIRVInstructionSelector::selectFirstBitSet64Overflow(
   // (ie i64x4 -> i64x2, i64x2)
   MachineIRBuilder MIRBuilder(I);
   SPIRVType *OpType = GR.getOrCreateSPIRVIntegerType(64, MIRBuilder);
-  SPIRVType *LeftOpType;
-  SPIRVType *LeftResType;
+  SPIRVType *LeftOpType = OpType;
+  SPIRVType *LeftResType = BaseType;
   if (LeftIsVector) {
     LeftOpType =
         GR.getOrCreateSPIRVVectorType(OpType, LeftComponentCount, MIRBuilder);
     LeftResType =
         GR.getOrCreateSPIRVVectorType(BaseType, LeftComponentCount, MIRBuilder);
-  } else {
-    LeftOpType = OpType;
-    LeftResType = BaseType;
   }
 
   SPIRVType *RightOpType =
@@ -3212,8 +3209,6 @@ bool SPIRVInstructionSelector::selectFirstBitSet64Overflow(
   Register RightSideIn =
       MRI->createVirtualRegister(GR.getRegClass(RightOpType));
 
-  bool Result;
-
   // Extract the left half from the SrcReg into LeftSideIn
   // accounting for the special case when it only has one element
   if (LeftIsVector) {
@@ -3225,14 +3220,16 @@ bool SPIRVInstructionSelector::selectFirstBitSet64Overflow(
                    // Per the spec, repeat the vector if only one vec is needed
                    .addUse(SrcReg);
 
-    for (unsigned J = 0; J < LeftComponentCount; J++) {
+    for (unsigned J = 0; J < LeftComponentCount; J++)
       MIB.addImm(J);
-    }
 
-    Result = MIB.constrainAllUses(TII, TRI, RBI);
+    if (!MIB.constrainAllUses(TII, TRI, RBI))
+      return false;
+
   } else {
-    Result = selectOpWithSrcs(LeftSideIn, LeftOpType, I, {SrcReg, ConstIntZero},
-                              SPIRV::OpVectorExtractDynamic);
+    if (!selectOpWithSrcs(LeftSideIn, LeftOpType, I, {SrcReg, ConstIntZero},
+                          SPIRV::OpVectorExtractDynamic))
+      return false;
   }
 
   // Extract the right half from the SrcReg into RightSideIn.
@@ -3246,28 +3243,28 @@ bool SPIRVInstructionSelector::selectFirstBitSet64Overflow(
                  // Per the spec, repeat the vector if only one vec is needed
                  .addUse(SrcReg);
 
-  for (unsigned J = LeftComponentCount; J < ComponentCount; J++) {
+  for (unsigned J = LeftComponentCount; J < ComponentCount; J++)
     MIB.addImm(J);
-  }
 
-  Result = Result && MIB.constrainAllUses(TII, TRI, RBI);
+  if (!MIB.constrainAllUses(TII, TRI, RBI))
+    return false;
 
   // Recursively call selectFirstBitSet64 on the 2 halves
   Register LeftSideOut =
       MRI->createVirtualRegister(GR.getRegClass(LeftResType));
   Register RightSideOut =
       MRI->createVirtualRegister(GR.getRegClass(RightResType));
-  Result =
-      Result && selectFirstBitSet64(LeftSideOut, LeftResType, I, LeftSideIn,
-                                    BitSetOpcode, SwapPrimarySide);
-  Result =
-      Result && selectFirstBitSet64(RightSideOut, RightResType, I, RightSideIn,
-                                    BitSetOpcode, SwapPrimarySide);
+
+  if (!selectFirstBitSet64(LeftSideOut, LeftResType, I, LeftSideIn,
+                           BitSetOpcode, SwapPrimarySide))
+    return false;
+  if (!selectFirstBitSet64(RightSideOut, RightResType, I, RightSideIn,
+                           BitSetOpcode, SwapPrimarySide))
+    return false;
 
   // Join the two resulting registers back into the return type
   // (ie i32x2, i32x2 -> i32x4)
-  return Result &&
-         selectOpWithSrcs(ResVReg, ResType, I, {LeftSideOut, RightSideOut},
+  return selectOpWithSrcs(ResVReg, ResType, I, {LeftSideOut, RightSideOut},
                           SPIRV::OpCompositeConstruct);
 }
 
@@ -3297,13 +3294,15 @@ bool SPIRVInstructionSelector::selectFirstBitSet64(
       GR.getOrCreateSPIRVVectorType(BaseType, 2 * ComponentCount, MIRBuilder);
   Register BitcastReg =
       MRI->createVirtualRegister(GR.getRegClass(PostCastType));
-  bool Result =
-      selectOpWithSrcs(BitcastReg, PostCastType, I, {SrcReg}, SPIRV::OpBitcast);
+
+  if (!selectOpWithSrcs(BitcastReg, PostCastType, I, {SrcReg},
+                        SPIRV::OpBitcast))
+    return false;
 
   // 2. Find the first set bit from the primary side for all the pieces in #1
   Register FBSReg = MRI->createVirtualRegister(GR.getRegClass(PostCastType));
-  Result = Result && selectFirstBitSet32(FBSReg, PostCastType, I, BitcastReg,
-                                         BitSetOpcode);
+  if (!selectFirstBitSet32(FBSReg, PostCastType, I, BitcastReg, BitSetOpcode))
+    return false;
 
   // 3. Split result vector into high bits and low bits
   Register HighReg = MRI->createVirtualRegister(GR.getRegClass(ResType));
@@ -3312,12 +3311,12 @@ bool SPIRVInstructionSelector::selectFirstBitSet64(
   bool IsScalarRes = ResType->getOpcode() != SPIRV::OpTypeVector;
   if (IsScalarRes) {
     // if scalar do a vector extract
-    Result =
-        Result && selectOpWithSrcs(HighReg, ResType, I, {FBSReg, ConstIntZero},
-                                   SPIRV::OpVectorExtractDynamic);
-    Result =
-        Result && selectOpWithSrcs(LowReg, ResType, I, {FBSReg, ConstIntOne},
-                                   SPIRV::OpVectorExtractDynamic);
+    if (!selectOpWithSrcs(HighReg, ResType, I, {FBSReg, ConstIntZero},
+                          SPIRV::OpVectorExtractDynamic))
+      return false;
+    if (!selectOpWithSrcs(LowReg, ResType, I, {FBSReg, ConstIntOne},
+                          SPIRV::OpVectorExtractDynamic))
+      return false;
   } else {
     // if vector do a shufflevector
     auto MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(),
@@ -3332,7 +3331,9 @@ bool SPIRVInstructionSelector::selectFirstBitSet64(
     for (unsigned J = 0; J < ComponentCount * 2; J += 2) {
       MIB.addImm(J);
     }
-    Result = Result && MIB.constrainAllUses(TII, TRI, RBI);
+
+    if (!MIB.constrainAllUses(TII, TRI, RBI))
+      return false;
 
     MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(),
                   TII.get(SPIRV::OpVectorShuffle))
@@ -3346,7 +3347,8 @@ bool SPIRVInstructionSelector::selectFirstBitSet64(
     for (unsigned J = 1; J < ComponentCount * 2; J += 2) {
       MIB.addImm(J);
     }
-    Result = Result && MIB.constrainAllUses(TII, TRI, RBI);
+    if (!MIB.constrainAllUses(TII, TRI, RBI))
+      return false;
   }
 
   // 4. Check the result. When primary bits == -1 use secondary, otherwise use
@@ -3376,10 +3378,10 @@ bool SPIRVInstructionSelector::selectFirstBitSet64(
     AddOp = SPIRV::OpIAddV;
   }
 
-  Register PrimaryReg;
-  Register SecondaryReg;
-  Register PrimaryShiftReg;
-  Register SecondaryShiftReg;
+  Register PrimaryReg = HighReg;
+  Register SecondaryReg = LowReg;
+  Register PrimaryShiftReg = Reg32;
+  Register SecondaryShiftReg = Reg0;
 
   // By default the emitted opcodes check for the set bit from the MSB side.
   // Setting SwapPrimarySide checks the set bit from the LSB side
@@ -3388,32 +3390,27 @@ bool SPIRVInstructionSelector::selectFirstBitSet64(
     SecondaryReg = HighReg;
     PrimaryShiftReg = Reg0;
     SecondaryShiftReg = Reg32;
-  } else {
-    PrimaryReg = HighReg;
-    SecondaryReg = LowReg;
-    PrimaryShiftReg = Reg32;
-    SecondaryShiftReg = Reg0;
   }
 
   // Check if the primary bits are == -1
   Register BReg = MRI->createVirtualRegister(GR.getRegClass(BoolType));
-  Result = Result && selectOpWithSrcs(BReg, BoolType, I,
-                                      {PrimaryReg, NegOneReg}, SPIRV::OpIEqual);
+  if (!selectOpWithSrcs(BReg, BoolType, I, {PrimaryReg, NegOneReg},
+                        SPIRV::OpIEqual))
+    return false;
 
   // Select secondary bits if true in BReg, otherwise primary bits
   Register TmpReg = MRI->createVirtualRegister(GR.getRegClass(ResType));
-  Result =
-      Result && selectOpWithSrcs(TmpReg, ResType, I,
-                                 {BReg, SecondaryReg, PrimaryReg}, SelectOp);
+  if (!selectOpWithSrcs(TmpReg, ResType, I, {BReg, SecondaryReg, PrimaryReg},
+                        SelectOp))
+    return false;
 
   // 5. Add 32 when high bits are used, otherwise 0 for low bits
   Register ValReg = MRI->createVirtualRegister(GR.getRegClass(ResType));
-  Result = Result && selectOpWithSrcs(
-                         ValReg, ResType, I,
-                         {BReg, SecondaryShiftReg, PrimaryShiftReg}, SelectOp);
+  if (!selectOpWithSrcs(ValReg, ResType, I,
+                        {BReg, SecondaryShiftReg, PrimaryShiftReg}, SelectOp))
+    return false;
 
-  return Result &&
-         selectOpWithSrcs(ResVReg, ResType, I, {ValReg, TmpReg}, AddOp);
+  return selectOpWithSrcs(ResVReg, ResType, I, {ValReg, TmpReg}, AddOp);
 }
 
 bool SPIRVInstructionSelector::selectFirstBitHigh(Register ResVReg,

From 08a732a4e38df2da04b79e197ff422144e4e5cf2 Mon Sep 17 00:00:00 2001
From: Ashley Coleman <ascoleman@microsoft.com>
Date: Wed, 18 Dec 2024 09:47:59 -0700
Subject: [PATCH 12/17] Cleanup: use pre-increment in vector shuffle index loops

---
 llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index 86d44705f0982..a85f6dcd9382e 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -3220,7 +3220,7 @@ bool SPIRVInstructionSelector::selectFirstBitSet64Overflow(
                    // Per the spec, repeat the vector if only one vec is needed
                    .addUse(SrcReg);
 
-    for (unsigned J = 0; J < LeftComponentCount; J++)
+    for (unsigned J = 0; J < LeftComponentCount; ++J)
       MIB.addImm(J);
 
     if (!MIB.constrainAllUses(TII, TRI, RBI))
@@ -3243,7 +3243,7 @@ bool SPIRVInstructionSelector::selectFirstBitSet64Overflow(
                  // Per the spec, repeat the vector if only one vec is needed
                  .addUse(SrcReg);
 
-  for (unsigned J = LeftComponentCount; J < ComponentCount; J++)
+  for (unsigned J = LeftComponentCount; J < ComponentCount; ++J)
     MIB.addImm(J);
 
   if (!MIB.constrainAllUses(TII, TRI, RBI))

From 2dfcd279cf18d26b64366873181460bc480d7f1f Mon Sep 17 00:00:00 2001
From: Ashley Coleman <ascoleman@microsoft.com>
Date: Wed, 18 Dec 2024 10:07:33 -0700
Subject: [PATCH 13/17] Add assert rejecting vectors with more than 4 components

---
 llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index a85f6dcd9382e..ab80aa9bdef64 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -3179,6 +3179,13 @@ bool SPIRVInstructionSelector::selectFirstBitSet64Overflow(
     Register SrcReg, unsigned BitSetOpcode, bool SwapPrimarySide) const {
 
   unsigned ComponentCount = GR.getScalarOrVectorComponentCount(ResType);
+  // SPIR-V only allow vecs of size 2,3,4. Calling with a larger vec requires
+  // creating a return type with an invalid vec size. If that is resolved
+  // then this function is valid up to vec8 as the intermediate splitting
+  // would create 2 vec4.
+  assert(ComponentCount < 5 && "Vec 5+ will generate invalid SPIR-V ops");
+
+
   SPIRVType *BaseType = GR.retrieveScalarOrVectorIntType(ResType);
   bool ZeroAsNull = STI.isOpenCLEnv();
   Register ConstIntZero =

From 15eaf6e97c9306129e85165a633d7b90ce89e15e Mon Sep 17 00:00:00 2001
From: Ashley Coleman <ascoleman@microsoft.com>
Date: Wed, 18 Dec 2024 13:42:37 -0700
Subject: [PATCH 14/17] Use iterative approach in selectFirstBitSet64Overflow

---
 .../Target/SPIRV/SPIRVInstructionSelector.cpp | 116 +++++++-----------
 .../SPIRV/hlsl-intrinsics/firstbithigh.ll     |  99 ++++++++-------
 .../SPIRV/hlsl-intrinsics/firstbitlow.ll      | 111 +++++++++--------
 3 files changed, 161 insertions(+), 165 deletions(-)

diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index ab80aa9bdef64..285068caac1c3 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -3178,100 +3178,74 @@ bool SPIRVInstructionSelector::selectFirstBitSet64Overflow(
     Register ResVReg, const SPIRVType *ResType, MachineInstr &I,
     Register SrcReg, unsigned BitSetOpcode, bool SwapPrimarySide) const {
 
-  unsigned ComponentCount = GR.getScalarOrVectorComponentCount(ResType);
   // SPIR-V only allow vecs of size 2,3,4. Calling with a larger vec requires
-  // creating a return type with an invalid vec size. If that is resolved
-  // then this function is valid up to vec8 as the intermediate splitting
-  // would create 2 vec4.
+  // creating a param reg and return reg with an invalid vec size. If that is
+  // resolved then this function is valid for vectors of any component size.
+  unsigned ComponentCount = GR.getScalarOrVectorComponentCount(ResType);
   assert(ComponentCount < 5 && "Vec 5+ will generate invalid SPIR-V ops");
 
-
-  SPIRVType *BaseType = GR.retrieveScalarOrVectorIntType(ResType);
   bool ZeroAsNull = STI.isOpenCLEnv();
-  Register ConstIntZero =
-      GR.getOrCreateConstInt(0, I, BaseType, TII, ZeroAsNull);
-  unsigned LeftComponentCount = ComponentCount / 2;
-  unsigned RightComponentCount = ComponentCount - LeftComponentCount;
-  bool LeftIsVector = LeftComponentCount > 1;
-
-  // Split the SrcReg in half into 2 smaller vec registers
-  // (ie i64x4 -> i64x2, i64x2)
   MachineIRBuilder MIRBuilder(I);
-  SPIRVType *OpType = GR.getOrCreateSPIRVIntegerType(64, MIRBuilder);
-  SPIRVType *LeftOpType = OpType;
-  SPIRVType *LeftResType = BaseType;
-  if (LeftIsVector) {
-    LeftOpType =
-        GR.getOrCreateSPIRVVectorType(OpType, LeftComponentCount, MIRBuilder);
-    LeftResType =
-        GR.getOrCreateSPIRVVectorType(BaseType, LeftComponentCount, MIRBuilder);
-  }
-
-  SPIRVType *RightOpType =
-      GR.getOrCreateSPIRVVectorType(OpType, RightComponentCount, MIRBuilder);
-  SPIRVType *RightResType =
-      GR.getOrCreateSPIRVVectorType(BaseType, RightComponentCount, MIRBuilder);
-
-  Register LeftSideIn = MRI->createVirtualRegister(GR.getRegClass(LeftOpType));
-  Register RightSideIn =
-      MRI->createVirtualRegister(GR.getRegClass(RightOpType));
-
-  // Extract the left half from the SrcReg into LeftSideIn
-  // accounting for the special case when it only has one element
-  if (LeftIsVector) {
+  SPIRVType *BaseType = GR.retrieveScalarOrVectorIntType(ResType);
+  SPIRVType *I64Type = GR.getOrCreateSPIRVIntegerType(64, MIRBuilder);
+  SPIRVType *I64x2Type = GR.getOrCreateSPIRVVectorType(I64Type, 2, MIRBuilder);
+  SPIRVType *Vec2ResType =
+      GR.getOrCreateSPIRVVectorType(BaseType, 2, MIRBuilder);
+
+  std::vector<Register> PartialRegs;
+
+  // Loops 0, 2, 4, ... but stops one loop early when ComponentCount is odd
+  unsigned CurrentComponent = 0;
+  for (; CurrentComponent + 1 < ComponentCount; CurrentComponent += 2) {
+    Register SubVecReg = MRI->createVirtualRegister(GR.getRegClass(I64x2Type));
+
     auto MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(),
                        TII.get(SPIRV::OpVectorShuffle))
-                   .addDef(LeftSideIn)
-                   .addUse(GR.getSPIRVTypeID(LeftOpType))
+                   .addDef(SubVecReg)
+                   .addUse(GR.getSPIRVTypeID(I64x2Type))
                    .addUse(SrcReg)
                    // Per the spec, repeat the vector if only one vec is needed
                    .addUse(SrcReg);
 
-    for (unsigned J = 0; J < LeftComponentCount; ++J)
-      MIB.addImm(J);
+    MIB.addImm(CurrentComponent);
+    MIB.addImm(CurrentComponent + 1);
 
     if (!MIB.constrainAllUses(TII, TRI, RBI))
       return false;
 
-  } else {
-    if (!selectOpWithSrcs(LeftSideIn, LeftOpType, I, {SrcReg, ConstIntZero},
-                          SPIRV::OpVectorExtractDynamic))
+    Register SubVecBitSetReg =
+        MRI->createVirtualRegister(GR.getRegClass(Vec2ResType));
+
+    if (!selectFirstBitSet64(SubVecBitSetReg, Vec2ResType, I, SubVecReg,
+                             BitSetOpcode, SwapPrimarySide))
       return false;
+
+    PartialRegs.push_back(SubVecBitSetReg);
   }
 
-  // Extract the right half from the SrcReg into RightSideIn.
-  // Right will always be a vector since the only time one element is left is
-  // when Component == 3, and in that case Left is one element.
-  auto MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(),
-                     TII.get(SPIRV::OpVectorShuffle))
-                 .addDef(RightSideIn)
-                 .addUse(GR.getSPIRVTypeID(RightOpType))
-                 .addUse(SrcReg)
-                 // Per the spec, repeat the vector if only one vec is needed
-                 .addUse(SrcReg);
+  // On odd component counts we need to handle one more component
+  if (CurrentComponent != ComponentCount) {
+    Register FinalElemReg = MRI->createVirtualRegister(GR.getRegClass(I64Type));
+    Register ConstIntLastIdx = GR.getOrCreateConstInt(
+        ComponentCount - 1, I, BaseType, TII, ZeroAsNull);
 
-  for (unsigned J = LeftComponentCount; J < ComponentCount; ++J)
-    MIB.addImm(J);
+    if (!selectOpWithSrcs(FinalElemReg, I64Type, I, {SrcReg, ConstIntLastIdx},
+                          SPIRV::OpVectorExtractDynamic))
+      return false;
 
-  if (!MIB.constrainAllUses(TII, TRI, RBI))
-    return false;
+    Register FinalElemBitSetReg =
+        MRI->createVirtualRegister(GR.getRegClass(BaseType));
 
-  // Recursively call selectFirstBitSet64 on the 2 halves
-  Register LeftSideOut =
-      MRI->createVirtualRegister(GR.getRegClass(LeftResType));
-  Register RightSideOut =
-      MRI->createVirtualRegister(GR.getRegClass(RightResType));
+    if (!selectFirstBitSet64(FinalElemBitSetReg, BaseType, I, FinalElemReg,
+                             BitSetOpcode, SwapPrimarySide))
+      return false;
 
-  if (!selectFirstBitSet64(LeftSideOut, LeftResType, I, LeftSideIn,
-                           BitSetOpcode, SwapPrimarySide))
-    return false;
-  if (!selectFirstBitSet64(RightSideOut, RightResType, I, RightSideIn,
-                           BitSetOpcode, SwapPrimarySide))
-    return false;
+    PartialRegs.push_back(FinalElemBitSetReg);
+  }
 
-  // Join the two resulting registers back into the return type
-  // (ie i32x2, i32x2 -> i32x4)
-  return selectOpWithSrcs(ResVReg, ResType, I, {LeftSideOut, RightSideOut},
+  // Join all the resulting registers back into the return type in order
+  // (ie i32x2, i32x2, i32x1 -> i32x5)
+  return selectOpWithSrcs(ResVReg, ResType, I, PartialRegs,
                           SPIRV::OpCompositeConstruct);
 }
 
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/firstbithigh.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/firstbithigh.ll
index dee48061d2fe1..a4dd09d84d996 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/firstbithigh.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/firstbithigh.ll
@@ -8,6 +8,7 @@
 ; CHECK-DAG: [[u32x3_t:%.+]] = OpTypeVector [[u32_t]] 3
 ; CHECK-DAG: [[u32x4_t:%.+]] = OpTypeVector [[u32_t]] 4
 ; CHECK-DAG: [[const_0:%.*]] = OpConstant [[u32_t]] 0
+; CHECK-DAG: [[const_2:%.*]] = OpConstant [[u32_t]] 2
 ; CHECK-DAG: [[const_0x2:%.*]] = OpConstantComposite [[u32x2_t]] [[const_0]] [[const_0]]
 ; CHECK-DAG: [[const_1:%.*]] = OpConstant [[u32_t]] 1
 ; CHECK-DAG: [[const_32:%.*]] = OpConstant [[u32_t]] 32
@@ -146,32 +147,37 @@ entry:
 ; CHECK-LABEL: Begin function firstbituhigh_v3xi64
 define noundef <3 x i32> @firstbituhigh_v3xi64(<3 x i64> noundef %a) {
 entry:
-; Split the i64x3 into i64, i64x2
+; Preamble
 ; CHECK: [[a:%.+]] = OpFunctionParameter [[u64x3_t]]
-; CHECK: [[left:%.+]] = OpVectorExtractDynamic [[u64_t]] [[a]] [[const_0]]
-; CHECK: [[right:%.+]] = OpVectorShuffle [[u64x2_t]] [[a]] [[a]] 1 2
 
-; Do firstbituhigh on i64, i64x2
-; CHECK: [[left_cast:%.+]] = OpBitcast [[u32x2_t]] [[left]]
-; CHECK: [[left_lsb_bits:%.+]] = OpExtInst [[u32x2_t]] [[glsl_450_ext]] FindUMsb [[left_cast]]
-; CHECK: [[left_high_bits:%.+]] = OpVectorExtractDynamic [[u32_t]] [[left_lsb_bits]] [[const_0]]
-; CHECK: [[left_low_bits:%.+]] = OpVectorExtractDynamic [[u32_t]] [[left_lsb_bits]] [[const_1]]
-; CHECK: [[left_should_use_low:%.+]] = OpIEqual [[bool_t]] [[left_high_bits]] [[const_neg1]]
-; CHECK: [[left_ans_bits:%.+]] = OpSelect [[u32_t]] [[left_should_use_low]] [[left_low_bits]] [[left_high_bits]]
-; CHECK: [[left_ans_offset:%.+]] = OpSelect [[u32_t]] [[left_should_use_low]] [[const_0]] [[const_32]]
-; CHECK: [[left_res:%.+]] = OpIAdd [[u32_t]] [[left_ans_offset]] [[left_ans_bits]]
+; Extract first 2 components from %a
+; CHECK: [[pt1:%.+]] = OpVectorShuffle [[u64x2_t]] [[a]] [[a]] 0 1
 
-; CHECK: [[right_cast:%.+]] = OpBitcast [[u32x4_t]] [[right]]
-; CHECK: [[right_lsb_bits:%.+]] = OpExtInst [[u32x4_t]] [[glsl_450_ext]] FindUMsb [[right_cast]]
-; CHECK: [[right_high_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[right_lsb_bits]] [[right_lsb_bits]] 0 2
-; CHECK: [[right_low_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[right_lsb_bits]] [[right_lsb_bits]] 1 3
-; CHECK: [[right_should_use_low:%.+]] = OpIEqual [[boolx2_t]] [[right_high_bits]] [[const_neg1x2]]
-; CHECK: [[right_ans_bits:%.+]] = OpSelect [[u32x2_t]] [[right_should_use_low]] [[right_low_bits]] [[right_high_bits]]
-; CHECK: [[right_ans_offset:%.+]] = OpSelect [[u32x2_t]] [[right_should_use_low]] [[const_0x2]] [[const_32x2]]
-; CHECK: [[right_res:%.+]] = OpIAdd [[u32x2_t]] [[right_ans_offset]] [[right_ans_bits]]
+; Do firstbituhigh on the first 2 components
+; CHECK: [[pt1_cast:%.+]] = OpBitcast [[u32x4_t]] [[pt1]]
+; CHECK: [[pt1_lsb_bits:%.+]] = OpExtInst [[u32x4_t]] [[glsl_450_ext]] FindUMsb [[pt1_cast]]
+; CHECK: [[pt1_high_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[pt1_lsb_bits]] [[pt1_lsb_bits]] 0 2
+; CHECK: [[pt1_low_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[pt1_lsb_bits]] [[pt1_lsb_bits]] 1 3
+; CHECK: [[pt1_should_use_low:%.+]] = OpIEqual [[boolx2_t]] [[pt1_high_bits]] [[const_neg1x2]]
+; CHECK: [[pt1_ans_bits:%.+]] = OpSelect [[u32x2_t]] [[pt1_should_use_low]] [[pt1_low_bits]] [[pt1_high_bits]]
+; CHECK: [[pt1_ans_offset:%.+]] = OpSelect [[u32x2_t]] [[pt1_should_use_low]] [[const_0x2]] [[const_32x2]]
+; CHECK: [[pt1_res:%.+]] = OpIAdd [[u32x2_t]] [[pt1_ans_offset]] [[pt1_ans_bits]]
 
-; Merge the resulting i32, i32x2 into the final i32x3 and return it
-; CHECK: [[ret:%.+]] = OpCompositeConstruct [[u32x3_t]] [[left_res]] [[right_res]]
+; Extract the last component from %a
+; CHECK: [[pt2:%.+]] = OpVectorExtractDynamic [[u64_t]] [[a]] [[const_2]]
+
+; Do firstbituhigh on the last component
+; CHECK: [[pt2_cast:%.+]] = OpBitcast [[u32x2_t]] [[pt2]]
+; CHECK: [[pt2_lsb_bits:%.+]] = OpExtInst [[u32x2_t]] [[glsl_450_ext]] FindUMsb [[pt2_cast]]
+; CHECK: [[pt2_high_bits:%.+]] = OpVectorExtractDynamic [[u32_t]] [[pt2_lsb_bits]] [[const_0]]
+; CHECK: [[pt2_low_bits:%.+]] = OpVectorExtractDynamic [[u32_t]] [[pt2_lsb_bits]] [[const_1]]
+; CHECK: [[pt2_should_use_low:%.+]] = OpIEqual [[bool_t]] [[pt2_high_bits]] [[const_neg1]]
+; CHECK: [[pt2_ans_bits:%.+]] = OpSelect [[u32_t]] [[pt2_should_use_low]] [[pt2_low_bits]] [[pt2_high_bits]]
+; CHECK: [[pt2_ans_offset:%.+]] = OpSelect [[u32_t]] [[pt2_should_use_low]] [[const_0]] [[const_32]]
+; CHECK: [[pt2_res:%.+]] = OpIAdd [[u32_t]] [[pt2_ans_offset]] [[pt2_ans_bits]]
+
+; Merge the parts into the final i32x3 and return it
+; CHECK: [[ret:%.+]] = OpCompositeConstruct [[u32x3_t]] [[pt1_res]] [[pt2_res]]
 ; CHECK: OpReturnValue [[ret]]
   %elt.firstbituhigh = call <3 x i32> @llvm.spv.firstbituhigh.v3i64(<3 x i64> %a)
   ret <3 x i32> %elt.firstbituhigh
@@ -180,32 +186,37 @@ entry:
 ; CHECK-LABEL: Begin function firstbituhigh_v4xi64
 define noundef <4 x i32> @firstbituhigh_v4xi64(<4 x i64> noundef %a) {
 entry:
-; Split the i64x4 into 2 i64x2
+; Preamble
 ; CHECK: [[a:%.+]] = OpFunctionParameter [[u64x4_t]]
-; CHECK: [[left:%.+]] = OpVectorShuffle [[u64x2_t]] [[a]] [[a]] 0 1
-; CHECK: [[right:%.+]] = OpVectorShuffle [[u64x2_t]] [[a]] [[a]] 2 3
 
-; Do firstbithigh on the 2 i64x2
-; CHECK: [[left_cast:%.+]] = OpBitcast [[u32x4_t]] [[left]]
-; CHECK: [[left_lsb_bits:%.+]] = OpExtInst [[u32x4_t]] [[glsl_450_ext]] FindUMsb [[left_cast]]
-; CHECK: [[left_high_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[left_lsb_bits]] [[left_lsb_bits]] 0 2
-; CHECK: [[left_low_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[left_lsb_bits]] [[left_lsb_bits]] 1 3
-; CHECK: [[left_should_use_low:%.+]] = OpIEqual [[boolx2_t]] [[left_high_bits]] [[const_neg1x2]]
-; CHECK: [[left_ans_bits:%.+]] = OpSelect [[u32x2_t]] [[left_should_use_low]] [[left_low_bits]] [[left_high_bits]]
-; CHECK: [[left_ans_offset:%.+]] = OpSelect [[u32x2_t]] [[left_should_use_low]] [[const_0x2]] [[const_32x2]]
-; CHECK: [[left_res:%.+]] = OpIAdd [[u32x2_t]] [[left_ans_offset]] [[left_ans_bits]]
+; Extract first 2 components from %a
+; CHECK: [[pt1:%.+]] = OpVectorShuffle [[u64x2_t]] [[a]] [[a]] 0 1
+
+; Do firstbituhigh on the first 2 components
+; CHECK: [[pt1_cast:%.+]] = OpBitcast [[u32x4_t]] [[pt1]]
+; CHECK: [[pt1_lsb_bits:%.+]] = OpExtInst [[u32x4_t]] [[glsl_450_ext]] FindUMsb [[pt1_cast]]
+; CHECK: [[pt1_high_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[pt1_lsb_bits]] [[pt1_lsb_bits]] 0 2
+; CHECK: [[pt1_low_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[pt1_lsb_bits]] [[pt1_lsb_bits]] 1 3
+; CHECK: [[pt1_should_use_low:%.+]] = OpIEqual [[boolx2_t]] [[pt1_high_bits]] [[const_neg1x2]]
+; CHECK: [[pt1_ans_bits:%.+]] = OpSelect [[u32x2_t]] [[pt1_should_use_low]] [[pt1_low_bits]] [[pt1_high_bits]]
+; CHECK: [[pt1_ans_offset:%.+]] = OpSelect [[u32x2_t]] [[pt1_should_use_low]] [[const_0x2]] [[const_32x2]]
+; CHECK: [[pt1_res:%.+]] = OpIAdd [[u32x2_t]] [[pt1_ans_offset]] [[pt1_ans_bits]]
+
+; Extract last 2 components from %a
+; CHECK: [[pt2:%.+]] = OpVectorShuffle [[u64x2_t]] [[a]] [[a]] 2 3
 
-; CHECK: [[right_cast:%.+]] = OpBitcast [[u32x4_t]] [[right]]
-; CHECK: [[right_lsb_bits:%.+]] = OpExtInst [[u32x4_t]] [[glsl_450_ext]] FindUMsb [[right_cast]]
-; CHECK: [[right_high_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[right_lsb_bits]] [[right_lsb_bits]] 0 2
-; CHECK: [[right_low_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[right_lsb_bits]] [[right_lsb_bits]] 1 3
-; CHECK: [[right_should_use_low:%.+]] = OpIEqual [[boolx2_t]] [[right_high_bits]] [[const_neg1x2]]
-; CHECK: [[right_ans_bits:%.+]] = OpSelect [[u32x2_t]] [[right_should_use_low]] [[right_low_bits]] [[right_high_bits]]
-; CHECK: [[right_ans_offset:%.+]] = OpSelect [[u32x2_t]] [[right_should_use_low]] [[const_0x2]] [[const_32x2]]
-; CHECK: [[right_res:%.+]] = OpIAdd [[u32x2_t]] [[right_ans_offset]] [[right_ans_bits]]
+; Do firstbituhigh on the last 2 components
+; CHECK: [[pt2_cast:%.+]] = OpBitcast [[u32x4_t]] [[pt2]]
+; CHECK: [[pt2_lsb_bits:%.+]] = OpExtInst [[u32x4_t]] [[glsl_450_ext]] FindUMsb [[pt2_cast]]
+; CHECK: [[pt2_high_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[pt2_lsb_bits]] [[pt2_lsb_bits]] 0 2
+; CHECK: [[pt2_low_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[pt2_lsb_bits]] [[pt2_lsb_bits]] 1 3
+; CHECK: [[pt2_should_use_low:%.+]] = OpIEqual [[boolx2_t]] [[pt2_high_bits]] [[const_neg1x2]]
+; CHECK: [[pt2_ans_bits:%.+]] = OpSelect [[u32x2_t]] [[pt2_should_use_low]] [[pt2_low_bits]] [[pt2_high_bits]]
+; CHECK: [[pt2_ans_offset:%.+]] = OpSelect [[u32x2_t]] [[pt2_should_use_low]] [[const_0x2]] [[const_32x2]]
+; CHECK: [[pt2_res:%.+]] = OpIAdd [[u32x2_t]] [[pt2_ans_offset]] [[pt2_ans_bits]]
 
-; Merge the resulting 2 i32x2 into the final i32x4 and return it
-; CHECK: [[ret:%.+]] = OpCompositeConstruct [[u32x4_t]] [[left_res]] [[right_res]]
+; Merge the parts into the final i32x4 and return it
+; CHECK: [[ret:%.+]] = OpCompositeConstruct [[u32x4_t]] [[pt1_res]] [[pt2_res]]
 ; CHECK: OpReturnValue [[ret]]
   %elt.firstbituhigh = call <4 x i32> @llvm.spv.firstbituhigh.v4i64(<4 x i64> %a)
   ret <4 x i32> %elt.firstbituhigh
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/firstbitlow.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/firstbitlow.ll
index 262cc2610600f..6de6cdc60ea9c 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/firstbitlow.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/firstbitlow.ll
@@ -10,6 +10,7 @@
 ; CHECK-DAG: [[const_0:%.*]] = OpConstant [[u32_t]] 0
 ; CHECK-DAG: [[const_0x2:%.*]] = OpConstantComposite [[u32x2_t]] [[const_0]] [[const_0]]
 ; CHECK-DAG: [[const_1:%.*]] = OpConstant [[u32_t]] 1
+; CHECK-DAG: [[const_2:%.*]] = OpConstant [[u32_t]] 2
 ; CHECK-DAG: [[const_32:%.*]] = OpConstant [[u32_t]] 32
 ; CHECK-DAG: [[const_32x2:%.*]] = OpConstantComposite [[u32x2_t]] [[const_32]] [[const_32]]
 ; CHECK-DAG: [[const_neg1:%.*]] = OpConstant [[u32_t]] 4294967295
@@ -146,32 +147,37 @@ entry:
 ; CHECK-LABEL: Begin function firstbitlow_v3xi64
 define noundef <3 x i32> @firstbitlow_v3xi64(<3 x i64> noundef %a) {
 entry:
-; Split the i64x3 into i64, i64x2
+; Preamble
 ; CHECK: [[a:%.+]] = OpFunctionParameter [[u64x3_t]]
-; CHECK: [[left:%.+]] = OpVectorExtractDynamic [[u64_t]] [[a]] [[const_0]]
-; CHECK: [[right:%.+]] = OpVectorShuffle [[u64x2_t]] [[a]] [[a]] 1 2
-
-; Do firstbitlow on i64, i64x2
-; CHECK: [[left_cast:%.+]] = OpBitcast [[u32x2_t]] [[left]]
-; CHECK: [[left_lsb_bits:%.+]] = OpExtInst [[u32x2_t]] [[glsl_450_ext]] FindILsb [[left_cast]]
-; CHECK: [[left_high_bits:%.+]] = OpVectorExtractDynamic [[u32_t]] [[left_lsb_bits]] [[const_0]]
-; CHECK: [[left_low_bits:%.+]] = OpVectorExtractDynamic [[u32_t]] [[left_lsb_bits]] [[const_1]]
-; CHECK: [[left_should_use_high:%.+]] = OpIEqual [[bool_t]] [[left_low_bits]] [[const_neg1]]
-; CHECK: [[left_ans_bits:%.+]] = OpSelect [[u32_t]] [[left_should_use_high]] [[left_high_bits]] [[left_low_bits]]
-; CHECK: [[left_ans_offset:%.+]] = OpSelect [[u32_t]] [[left_should_use_high]] [[const_32]] [[const_0]]
-; CHECK: [[left_res:%.+]] = OpIAdd [[u32_t]] [[left_ans_offset]] [[left_ans_bits]]
-
-; CHECK: [[right_cast:%.+]] = OpBitcast [[u32x4_t]] [[right]]
-; CHECK: [[right_lsb_bits:%.+]] = OpExtInst [[u32x4_t]] [[glsl_450_ext]] FindILsb [[right_cast]]
-; CHECK: [[right_high_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[right_lsb_bits]] [[right_lsb_bits]] 0 2
-; CHECK: [[right_low_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[right_lsb_bits]] [[right_lsb_bits]] 1 3
-; CHECK: [[right_should_use_high:%.+]] = OpIEqual [[boolx2_t]] [[right_low_bits]] [[const_neg1x2]]
-; CHECK: [[right_ans_bits:%.+]] = OpSelect [[u32x2_t]] [[right_should_use_high]] [[right_high_bits]] [[right_low_bits]]
-; CHECK: [[right_ans_offset:%.+]] = OpSelect [[u32x2_t]] [[right_should_use_high]] [[const_32x2]] [[const_0x2]]
-; CHECK: [[right_res:%.+]] = OpIAdd [[u32x2_t]] [[right_ans_offset]] [[right_ans_bits]]
-
-; Merge the resulting i32, i32x2 into the final i32x3 and return it
-; CHECK: [[ret:%.+]] = OpCompositeConstruct [[u32x3_t]] [[left_res]] [[right_res]]
+
+; Extract first 2 components from %a
+; CHECK: [[pt1:%.+]] = OpVectorShuffle [[u64x2_t]] [[a]] [[a]] 0 1
+
+; Do firstbitlow on the first 2 components
+; CHECK: [[pt1_cast:%.+]] = OpBitcast [[u32x4_t]] [[pt1]]
+; CHECK: [[pt1_lsb_bits:%.+]] = OpExtInst [[u32x4_t]] [[glsl_450_ext]] FindILsb [[pt1_cast]]
+; CHECK: [[pt1_high_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[pt1_lsb_bits]] [[pt1_lsb_bits]] 0 2
+; CHECK: [[pt1_low_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[pt1_lsb_bits]] [[pt1_lsb_bits]] 1 3
+; CHECK: [[pt1_should_use_high:%.+]] = OpIEqual [[boolx2_t]] [[pt1_low_bits]] [[const_neg1x2]]
+; CHECK: [[pt1_ans_bits:%.+]] = OpSelect [[u32x2_t]] [[pt1_should_use_high]] [[pt1_high_bits]] [[pt1_low_bits]]
+; CHECK: [[pt1_ans_offset:%.+]] = OpSelect [[u32x2_t]] [[pt1_should_use_high]] [[const_32x2]] [[const_0x2]]
+; CHECK: [[pt1_res:%.+]] = OpIAdd [[u32x2_t]] [[pt1_ans_offset]] [[pt1_ans_bits]]
+
+; Extract the last component from %a
+; CHECK: [[pt2:%.+]] = OpVectorExtractDynamic [[u64_t]] [[a]] [[const_2]]
+
+; Do firstbitlow on the last component
+; CHECK: [[pt2_cast:%.+]] = OpBitcast [[u32x2_t]] [[pt2]]
+; CHECK: [[pt2_lsb_bits:%.+]] = OpExtInst [[u32x2_t]] [[glsl_450_ext]] FindILsb [[pt2_cast]]
+; CHECK: [[pt2_high_bits:%.+]] = OpVectorExtractDynamic [[u32_t]] [[pt2_lsb_bits]] [[const_0]]
+; CHECK: [[pt2_low_bits:%.+]] = OpVectorExtractDynamic [[u32_t]] [[pt2_lsb_bits]] [[const_1]]
+; CHECK: [[pt2_should_use_high:%.+]] = OpIEqual [[bool_t]] [[pt2_low_bits]] [[const_neg1]]
+; CHECK: [[pt2_ans_bits:%.+]] = OpSelect [[u32_t]] [[pt2_should_use_high]] [[pt2_high_bits]] [[pt2_low_bits]]
+; CHECK: [[pt2_ans_offset:%.+]] = OpSelect [[u32_t]] [[pt2_should_use_high]] [[const_32]] [[const_0]]
+; CHECK: [[pt2_res:%.+]] = OpIAdd [[u32_t]] [[pt2_ans_offset]] [[pt2_ans_bits]]
+
+; Merge the parts into the final i32x3 and return it
+; CHECK: [[ret:%.+]] = OpCompositeConstruct [[u32x3_t]] [[pt1_res]] [[pt2_res]]
 ; CHECK: OpReturnValue [[ret]]
   %elt.firstbitlow = call <3 x i32> @llvm.spv.firstbitlow.v3i64(<3 x i64> %a)
   ret <3 x i32> %elt.firstbitlow
@@ -180,32 +186,37 @@ entry:
 ; CHECK-LABEL: Begin function firstbitlow_v4xi64
 define noundef <4 x i32> @firstbitlow_v4xi64(<4 x i64> noundef %a) {
 entry:
-; Split the i64x4 into 2 i64x2
+; Preamble
 ; CHECK: [[a:%.+]] = OpFunctionParameter [[u64x4_t]]
-; CHECK: [[left:%.+]] = OpVectorShuffle [[u64x2_t]] [[a]] [[a]] 0 1
-; CHECK: [[right:%.+]] = OpVectorShuffle [[u64x2_t]] [[a]] [[a]] 2 3
-
-; Do firstbitlow on the 2 i64x2
-; CHECK: [[left_cast:%.+]] = OpBitcast [[u32x4_t]] [[left]]
-; CHECK: [[left_lsb_bits:%.+]] = OpExtInst [[u32x4_t]] [[glsl_450_ext]] FindILsb [[left_cast]]
-; CHECK: [[left_high_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[left_lsb_bits]] [[left_lsb_bits]] 0 2
-; CHECK: [[left_low_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[left_lsb_bits]] [[left_lsb_bits]] 1 3
-; CHECK: [[left_should_use_high:%.+]] = OpIEqual [[boolx2_t]] [[left_low_bits]] [[const_neg1x2]]
-; CHECK: [[left_ans_bits:%.+]] = OpSelect [[u32x2_t]] [[left_should_use_high]] [[left_high_bits]] [[left_low_bits]]
-; CHECK: [[left_ans_offset:%.+]] = OpSelect [[u32x2_t]] [[left_should_use_high]] [[const_32x2]] [[const_0x2]]
-; CHECK: [[left_res:%.+]] = OpIAdd [[u32x2_t]] [[left_ans_offset]] [[left_ans_bits]]
-
-; CHECK: [[right_cast:%.+]] = OpBitcast [[u32x4_t]] [[right]]
-; CHECK: [[right_lsb_bits:%.+]] = OpExtInst [[u32x4_t]] [[glsl_450_ext]] FindILsb [[right_cast]]
-; CHECK: [[right_high_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[right_lsb_bits]] [[right_lsb_bits]] 0 2
-; CHECK: [[right_low_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[right_lsb_bits]] [[right_lsb_bits]] 1 3
-; CHECK: [[right_should_use_high:%.+]] = OpIEqual [[boolx2_t]] [[right_low_bits]] [[const_neg1x2]]
-; CHECK: [[right_ans_bits:%.+]] = OpSelect [[u32x2_t]] [[right_should_use_high]] [[right_high_bits]] [[right_low_bits]]
-; CHECK: [[right_ans_offset:%.+]] = OpSelect [[u32x2_t]] [[right_should_use_high]] [[const_32x2]] [[const_0x2]]
-; CHECK: [[right_res:%.+]] = OpIAdd [[u32x2_t]] [[right_ans_offset]] [[right_ans_bits]]
-
-; Merge the resulting 2 i32x2 into the final i32x4 and return it
-; CHECK: [[ret:%.+]] = OpCompositeConstruct [[u32x4_t]] [[left_res]] [[right_res]]
+
+; Extract first 2 components from %a
+; CHECK: [[pt1:%.+]] = OpVectorShuffle [[u64x2_t]] [[a]] [[a]] 0 1
+
+; Do firstbitlow on the first 2 components
+; CHECK: [[pt1_cast:%.+]] = OpBitcast [[u32x4_t]] [[pt1]]
+; CHECK: [[pt1_lsb_bits:%.+]] = OpExtInst [[u32x4_t]] [[glsl_450_ext]] FindILsb [[pt1_cast]]
+; CHECK: [[pt1_high_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[pt1_lsb_bits]] [[pt1_lsb_bits]] 0 2
+; CHECK: [[pt1_low_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[pt1_lsb_bits]] [[pt1_lsb_bits]] 1 3
+; CHECK: [[pt1_should_use_high:%.+]] = OpIEqual [[boolx2_t]] [[pt1_low_bits]] [[const_neg1x2]]
+; CHECK: [[pt1_ans_bits:%.+]] = OpSelect [[u32x2_t]] [[pt1_should_use_high]] [[pt1_high_bits]] [[pt1_low_bits]]
+; CHECK: [[pt1_ans_offset:%.+]] = OpSelect [[u32x2_t]] [[pt1_should_use_high]] [[const_32x2]] [[const_0x2]]
+; CHECK: [[pt1_res:%.+]] = OpIAdd [[u32x2_t]] [[pt1_ans_offset]] [[pt1_ans_bits]]
+
+; Extract last 2 components from %a
+; CHECK: [[pt2:%.+]] = OpVectorShuffle [[u64x2_t]] [[a]] [[a]] 2 3
+
+; Do firstbitlow on the last 2 components
+; CHECK: [[pt2_cast:%.+]] = OpBitcast [[u32x4_t]] [[pt2]]
+; CHECK: [[pt2_lsb_bits:%.+]] = OpExtInst [[u32x4_t]] [[glsl_450_ext]] FindILsb [[pt2_cast]]
+; CHECK: [[pt2_high_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[pt2_lsb_bits]] [[pt2_lsb_bits]] 0 2
+; CHECK: [[pt2_low_bits:%.+]] = OpVectorShuffle [[u32x2_t]] [[pt2_lsb_bits]] [[pt2_lsb_bits]] 1 3
+; CHECK: [[pt2_should_use_high:%.+]] = OpIEqual [[boolx2_t]] [[pt2_low_bits]] [[const_neg1x2]]
+; CHECK: [[pt2_ans_bits:%.+]] = OpSelect [[u32x2_t]] [[pt2_should_use_high]] [[pt2_high_bits]] [[pt2_low_bits]]
+; CHECK: [[pt2_ans_offset:%.+]] = OpSelect [[u32x2_t]] [[pt2_should_use_high]] [[const_32x2]] [[const_0x2]]
+; CHECK: [[pt2_res:%.+]] = OpIAdd [[u32x2_t]] [[pt2_ans_offset]] [[pt2_ans_bits]]
+
+; Merge the parts into the final i32x4 and return it
+; CHECK: [[ret:%.+]] = OpCompositeConstruct [[u32x4_t]] [[pt1_res]] [[pt2_res]]
 ; CHECK: OpReturnValue [[ret]]
   %elt.firstbitlow = call <4 x i32> @llvm.spv.firstbitlow.v4i64(<4 x i64> %a)
   ret <4 x i32> %elt.firstbitlow

From 2b8d7f0f98f81a00bc7bd6534ec41a1d6dbe25fc Mon Sep 17 00:00:00 2001
From: Ashley Coleman <ascoleman@microsoft.com>
Date: Thu, 9 Jan 2025 13:08:36 -0700
Subject: [PATCH 15/17] Address comments

---
 llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index 6441435fca5f8..7d8182fc2d421 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -3239,11 +3239,14 @@ bool SPIRVInstructionSelector::selectFirstBitSet64Overflow(
   // Loops 0, 2, 4, ... but stops one loop early when ComponentCount is odd
   unsigned CurrentComponent = 0;
   for (; CurrentComponent + 1 < ComponentCount; CurrentComponent += 2) {
-    Register SubVecReg = MRI->createVirtualRegister(GR.getRegClass(I64x2Type));
+    // This register holds the i64x2 sub-vector extracted from SrcReg; it is the
+    // source operand for the firstbitX computation, not its result
+    Register BitSetResult =
+        MRI->createVirtualRegister(GR.getRegClass(I64x2Type));
 
     auto MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(),
                        TII.get(SPIRV::OpVectorShuffle))
-                   .addDef(SubVecReg)
+                   .addDef(BitSetResult)
                    .addUse(GR.getSPIRVTypeID(I64x2Type))
                    .addUse(SrcReg)
                    // Per the spec, repeat the vector if only one vec is needed
@@ -3258,7 +3261,7 @@ bool SPIRVInstructionSelector::selectFirstBitSet64Overflow(
     Register SubVecBitSetReg =
         MRI->createVirtualRegister(GR.getRegClass(Vec2ResType));
 
-    if (!selectFirstBitSet64(SubVecBitSetReg, Vec2ResType, I, SubVecReg,
+    if (!selectFirstBitSet64(SubVecBitSetReg, Vec2ResType, I, BitSetResult,
                              BitSetOpcode, SwapPrimarySide))
       return false;
 

From 2c0e21658dc00321f0cc5d05740fe883b7441520 Mon Sep 17 00:00:00 2001
From: Ashley Coleman <me@ashleycoleman.me>
Date: Fri, 10 Jan 2025 10:55:43 -0700
Subject: [PATCH 16/17] Update
 llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp

Co-authored-by: Steven Perron <stevenperron@google.com>
---
 llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index 7d8182fc2d421..c1506651605fb 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -3220,9 +3220,9 @@ bool SPIRVInstructionSelector::selectFirstBitSet64Overflow(
     Register ResVReg, const SPIRVType *ResType, MachineInstr &I,
     Register SrcReg, unsigned BitSetOpcode, bool SwapPrimarySide) const {
 
-  // SPIR-V only allow vecs of size 2,3,4. Calling with a larger vec requires
-  // creating a param reg and return reg with an invalid vec size. If that is
-  // resolved then this function is valid for vectors of any component size.
+  // SPIR-V allow vectors of size 2,3,4 only. Calling with a larger vectors requires
+  // creating a param register and return register with an invalid vector size. If that is
+  // resolved, then this function can be used for vectors of any component size.
   unsigned ComponentCount = GR.getScalarOrVectorComponentCount(ResType);
   assert(ComponentCount < 5 && "Vec 5+ will generate invalid SPIR-V ops");
 

From c1b7fadadaee8c43234daa87ed2c3489ecbea06e Mon Sep 17 00:00:00 2001
From: Ashley Coleman <ascoleman@microsoft.com>
Date: Fri, 10 Jan 2025 12:54:40 -0700
Subject: [PATCH 17/17] Address comments

---
 .../Target/SPIRV/SPIRVInstructionSelector.cpp   | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index c1506651605fb..64690d16d5c41 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -3220,13 +3220,13 @@ bool SPIRVInstructionSelector::selectFirstBitSet64Overflow(
     Register ResVReg, const SPIRVType *ResType, MachineInstr &I,
     Register SrcReg, unsigned BitSetOpcode, bool SwapPrimarySide) const {
 
-  // SPIR-V allow vectors of size 2,3,4 only. Calling with a larger vectors requires
-  // creating a param register and return register with an invalid vector size. If that is
-  // resolved, then this function can be used for vectors of any component size.
+  // SPIR-V allows vectors of size 2,3,4 only. Calling with a larger vector
+  // requires creating a param register and return register with an invalid
+  // vector size. If that is resolved, then this function can be used for
+  // vectors of any component size.
   unsigned ComponentCount = GR.getScalarOrVectorComponentCount(ResType);
   assert(ComponentCount < 5 && "Vec 5+ will generate invalid SPIR-V ops");
 
-  bool ZeroAsNull = STI.isOpenCLEnv();
   MachineIRBuilder MIRBuilder(I);
   SPIRVType *BaseType = GR.retrieveScalarOrVectorIntType(ResType);
   SPIRVType *I64Type = GR.getOrCreateSPIRVIntegerType(64, MIRBuilder);
@@ -3249,11 +3249,9 @@ bool SPIRVInstructionSelector::selectFirstBitSet64Overflow(
                    .addDef(BitSetResult)
                    .addUse(GR.getSPIRVTypeID(I64x2Type))
                    .addUse(SrcReg)
-                   // Per the spec, repeat the vector if only one vec is needed
-                   .addUse(SrcReg);
-
-    MIB.addImm(CurrentComponent);
-    MIB.addImm(CurrentComponent + 1);
+                   .addUse(SrcReg)
+                   .addImm(CurrentComponent)
+                   .addImm(CurrentComponent + 1);
 
     if (!MIB.constrainAllUses(TII, TRI, RBI))
       return false;
@@ -3270,6 +3268,7 @@ bool SPIRVInstructionSelector::selectFirstBitSet64Overflow(
 
   // On odd component counts we need to handle one more component
   if (CurrentComponent != ComponentCount) {
+    bool ZeroAsNull = STI.isOpenCLEnv();
     Register FinalElemReg = MRI->createVirtualRegister(GR.getRegClass(I64Type));
     Register ConstIntLastIdx = GR.getOrCreateConstInt(
         ComponentCount - 1, I, BaseType, TII, ZeroAsNull);