AMDGPU: Restrict src0 to VGPRs only for certain cvt scale opcodes. #127464

pravinjagtap · 2025-02-17T09:54:23Z

For cvt_scale opcodes operating on FP6/BF6/FP4, any Src0 operand wider than 32 bits needs to be restricted to take only VGPRs.

All convert opcodes operating on FP6/BF6/FP4 data must use VGPR sources for any operand slots providing more than 32-bits of data. Change-Id: Ic48ef739875c341e85f1a6c3984851ae497592af

llvmbot · 2025-02-17T09:54:56Z

@llvm/pr-subscribers-backend-amdgpu

@llvm/pr-subscribers-mc

Author: Pravin Jagtap (pravinjagtap)

Changes

All convert opcodes operating on FP6/BF6/FP4
data must use VGPR sources for any operand slots
providing more than 32-bits of data.

Change-Id: Ic48ef739875c341e85f1a6c3984851ae497592af

Full diff: https://github.com/llvm/llvm-project/pull/127464.diff

3 Files Affected:

(modified) llvm/lib/Target/AMDGPU/SIInstrInfo.td (+13)
(modified) llvm/lib/Target/AMDGPU/VOP3Instructions.td (+10-2)
(modified) llvm/test/MC/AMDGPU/gfx950_err.s (+63)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index bb78e77a9dc1a..e2d0b28a391fb 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1818,6 +1818,18 @@ class getVOP3VRegSrcForVT<ValueType VT, bit IsTrue16 = 0, bit IsFake16 = 0> {
          1                 : RegisterOperand<VGPR_32>);
 }
 
+// VGPR only VOP3 src with 9 bit encoding
+class getVOP3VSrcReg9ForVT<ValueType VT> {
+  RegisterOperand ret = !cond(!eq(VT.Size, 1024) : VRegSrc_1024,
+                              !eq(VT.Size, 512)  : VRegSrc_512,
+                              !eq(VT.Size, 256)  : VRegSrc_256,
+                              !eq(VT.Size, 192)  : VRegSrc_192,
+                              !eq(VT.Size, 128)  : VRegSrc_128,
+                              !eq(VT.Size, 96)   : VRegSrc_96,
+                              !eq(VT.Size, 64)   : VRegSrc_64,
+                              1 : VRegSrc_32);
+}
+
 // Src2 of VOP3 DPP instructions cannot be a literal
 class getVOP3DPPSrcForVT<ValueType VT, bit IsFake16 = 1> {
   RegisterOperand ret =
@@ -2852,6 +2864,7 @@ def VOP_V2I16_F32_F32_F32 : VOPProfile<[v2i16, f32, f32, f32]>;
 def VOP_V2I16_V2F16_F32 : VOPProfile<[v2i16, v2f16, f32, untyped]>;
 def VOP_V2I16_V2BF16_F32 : VOPProfile<[v2i16, v2bf16, f32, untyped]>;
 def VOP_I32_F32_F32_F32 : VOPProfile<[i32, f32, f32, f32]>;
+def VOP_I32_V2F32_I32_F32 : VOPProfile<[i32, v2f32, i32, f32]>;
 def VOP_I32_V2F16_F32_F32 : VOPProfile<[i32, v2f16, f32, f32]>;
 def VOP_I32_V2BF16_F32_F32: VOPProfile<[i32, v2bf16, f32, f32]>;
 def VOP_BF16_F32_I32 : VOPProfile<[bf16, f32, i32, untyped]>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index afafc2ecccfaf..6a6d8bd94826f 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1052,7 +1052,10 @@ class VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<ValueType Src0Ty> :
   let HasFP4DstByteSel = 1;
 }
 
-def VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile : VOP3_Profile<VOPProfile<[i32, v2f32, i32, f32]>, VOP3_OPSEL> {
+class VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile<VOPProfile P> : VOP3_Profile<P, VOP3_OPSEL> {
+
+  let Src0RC64 = !if(!gt(P.Src0VT.Size, 32), getVOP3VSrcReg9ForVT<P.Src0VT>.ret,
+                                             getVOP3SrcForVT<P.Src0VT>.ret);
   let InsVOP3OpSel = (ins PackedF32InputMods: $src0_modifiers, Src0RC64:$src0,
                           Int32InputMods:     $src1_modifiers, Src1RC64:$src1,
                           FP32InputMods:      $src2_modifiers, Src2RC64:$src2,
@@ -1100,6 +1103,11 @@ class VOP3_CVT_SCALEF32_PK_F864_Profile<VOPProfile P> : VOP3_Profile<P> {
   let HasExt32BitDPP = 0;
   let HasExtVOP3DPP = 0;
   let HasExt64BitDPP = 0;
+
+  // All convert opcodes operating on FP6/BF6/FP4 data must use VGPR sources for
+  // any operand slots > 32 bit.
+  let Src0RC64 = !if(!gt(P.Src0VT.Size, 32), getVOP3VSrcReg9ForVT<P.Src0VT>.ret,
+                                             getVOP3SrcForVT<P.Src0VT>.ret);
 }
 
 let SubtargetPredicate = HasFP8ConversionScaleInsts, mayRaiseFPException = 0 in {
@@ -1141,7 +1149,7 @@ let SubtargetPredicate = HasFP4ConversionScaleInsts, mayRaiseFPException = 0 in
     let Constraints = "@earlyclobber $vdst" in {
       defm V_CVT_SCALEF32_SR_PK_FP4_F16:  VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2f16>>;
       defm V_CVT_SCALEF32_SR_PK_FP4_BF16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_bf16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2bf16>>;
-      defm V_CVT_SCALEF32_SR_PK_FP4_F32:  VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f32", VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile>;
+      defm V_CVT_SCALEF32_SR_PK_FP4_F32:  VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f32", VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile<VOP_I32_V2F32_I32_F32>>;    
     }
   }
   defm V_CVT_SCALEF32_PK_F16_FP4 : VOP3Inst<"v_cvt_scalef32_pk_f16_fp4", VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<v2f16>>;
diff --git a/llvm/test/MC/AMDGPU/gfx950_err.s b/llvm/test/MC/AMDGPU/gfx950_err.s
index e0b832d8fe297..099916f48b5e7 100644
--- a/llvm/test/MC/AMDGPU/gfx950_err.s
+++ b/llvm/test/MC/AMDGPU/gfx950_err.s
@@ -434,3 +434,66 @@ v_cvt_scalef32_sr_pk32_bf6_f32 v[0:5], v[6:37], v38, v39 clamp
 
 // GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
 v_cvt_scalef32_sr_pk32_fp6_f32 v[0:5], v[6:37], v38, v39 clamp
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_f32_fp6 v[0:31], s[32:37], v6
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_f32_bf6 v[0:31], s[32:37], v6
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_f16_fp6 v[0:15], s[20:25], v8
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_bf16_fp6 v[0:15], s[20:25], v8
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_f16_bf6 v[0:15], s[20:25], v8
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_bf16_bf6 v[0:15], s[20:25], v8
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_fp6_f16 v[18:23], s[0:15], v16
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_bf6_f16 v[18:23], s[0:15], v16
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_fp6_bf16 v[18:23], s[0:15], v16
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_bf6_bf16 v[18:23], s[0:15], v16
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_sr_pk32_bf6_bf16 v[20:25], s[0:15], v16, v17
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_sr_pk32_bf6_f16 v[20:25], s[0:15], v16, v17
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid or unsupported register size
+v_cvt_scalef32_sr_pk32_bf6_f32 v[36:41], s[0:31], v32, v33
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_sr_pk32_fp6_bf16 v[20:25], s[0:15], v16, v17
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_sr_pk32_fp6_f16 v[20:25], s[0:15], v16, v17
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid or unsupported register size
+v_cvt_scalef32_sr_pk32_fp6_f32 v[36:41], s[0:31], v32, v33
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_2xpk16_fp6_f32 v[0:5], s[0:15], v[6:21], v16
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_2xpk16_fp6_f32 v[0:5], v[6:21], s[0:15], v16
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_2xpk16_bf6_f32 v[0:5], s[0:15], v[6:21], v16
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_2xpk16_bf6_f32 v[0:5], v[6:21], s[0:15], v16
+
+// GFX950: v_cvt_scalef32_sr_pk_fp4_f32 v0, s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v4, v5
+v_cvt_scalef32_sr_pk_fp4_f32 v0, s[2:3], v4, v5

arsenm

We should also have codegen tests that show SGPR to VGPR copies inserted in lowering

llvm/lib/Target/AMDGPU/SIInstrInfo.td

jayfoad · 2025-02-18T10:55:25Z

Allow only VGPR wide sources in fp6/4/8 conversions

All convert opcodes operating on FP6/BF6/FP4

Do you mean all of BF8/BF6/FP8/FP6/FP4? Please make the title and description consistent.

llvm/lib/Target/AMDGPU/SIInstrInfo.td

llvm/lib/Target/AMDGPU/VOP3Instructions.td

llvm/lib/Target/AMDGPU/SIInstrInfo.td

Sisyph

LGTM

llvm-ci · 2025-02-21T02:06:47Z

LLVM Buildbot has detected a new failure on builder openmp-offload-amdgpu-runtime running on omp-vega20-0 while building llvm at step 7 "Add check check-offload".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/30/builds/16183

Here is the relevant piece of the build log for the reference

Step 7 (Add check check-offload) failure: test (failure)
...
PASS: libomptarget :: x86_64-unknown-linux-gnu-LTO :: offloading/bug53727.cpp (1002 of 1011)
PASS: libomptarget :: x86_64-unknown-linux-gnu-LTO :: offloading/bug50022.cpp (1003 of 1011)
PASS: libomptarget :: x86_64-unknown-linux-gnu-LTO :: offloading/wtime.c (1004 of 1011)
PASS: libomptarget :: x86_64-unknown-linux-gnu-LTO :: offloading/test_libc.cpp (1005 of 1011)
PASS: libomptarget :: x86_64-unknown-linux-gnu :: offloading/bug49021.cpp (1006 of 1011)
PASS: libomptarget :: x86_64-unknown-linux-gnu :: offloading/std_complex_arithmetic.cpp (1007 of 1011)
PASS: libomptarget :: x86_64-unknown-linux-gnu-LTO :: offloading/complex_reduction.cpp (1008 of 1011)
PASS: libomptarget :: x86_64-unknown-linux-gnu-LTO :: offloading/bug49021.cpp (1009 of 1011)
PASS: libomptarget :: x86_64-unknown-linux-gnu-LTO :: offloading/std_complex_arithmetic.cpp (1010 of 1011)
TIMEOUT: libomptarget :: amdgcn-amd-amdhsa :: offloading/ctor_dtor.cpp (1011 of 1011)
******************** TEST 'libomptarget :: amdgcn-amd-amdhsa :: offloading/ctor_dtor.cpp' FAILED ********************
Exit Code: -9
Timeout: Reached timeout of 100 seconds

Command Output (stdout):
--
# RUN: at line 1
/home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./bin/clang++ -fopenmp    -I /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.src/offload/test -I /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -L /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload -L /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./lib -L /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src  -nogpulib -Wl,-rpath,/home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload -Wl,-rpath,/home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -Wl,-rpath,/home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./lib  -fopenmp-targets=amdgcn-amd-amdhsa /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.src/offload/test/offloading/ctor_dtor.cpp -o /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload/test/amdgcn-amd-amdhsa/offloading/Output/ctor_dtor.cpp.tmp /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./lib/libomptarget.devicertl.a && /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload/test/amdgcn-amd-amdhsa/offloading/Output/ctor_dtor.cpp.tmp | /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./bin/FileCheck /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.src/offload/test/offloading/ctor_dtor.cpp
# executed command: /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./bin/clang++ -fopenmp -I /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.src/offload/test -I /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -L /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload -L /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./lib -L /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -nogpulib -Wl,-rpath,/home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload -Wl,-rpath,/home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -Wl,-rpath,/home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./lib -fopenmp-targets=amdgcn-amd-amdhsa /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.src/offload/test/offloading/ctor_dtor.cpp -o /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload/test/amdgcn-amd-amdhsa/offloading/Output/ctor_dtor.cpp.tmp /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./lib/libomptarget.devicertl.a
# note: command had no output on stdout or stderr
# executed command: /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload/test/amdgcn-amd-amdhsa/offloading/Output/ctor_dtor.cpp.tmp
# note: command had no output on stdout or stderr
# error: command failed with exit status: -9
# error: command reached timeout: True
# executed command: /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./bin/FileCheck /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.src/offload/test/offloading/ctor_dtor.cpp
# note: command had no output on stdout or stderr
# error: command failed with exit status: -9
# error: command reached timeout: True

--

********************
Slowest Tests:
--------------------------------------------------------------------------
100.05s: libomptarget :: amdgcn-amd-amdhsa :: offloading/ctor_dtor.cpp
15.46s: libomptarget :: amdgcn-amd-amdhsa :: offloading/bug49021.cpp
12.70s: libomptarget :: amdgcn-amd-amdhsa :: offloading/parallel_target_teams_reduction_max.cpp
12.43s: libomptarget :: amdgcn-amd-amdhsa :: offloading/parallel_target_teams_reduction_min.cpp
10.98s: libomptarget :: amdgcn-amd-amdhsa :: offloading/complex_reduction.cpp
9.31s: libomptarget :: amdgcn-amd-amdhsa :: jit/empty_kernel_lvl2.c
9.23s: libomptarget :: x86_64-unknown-linux-gnu :: offloading/bug49021.cpp
7.58s: libomptarget :: x86_64-unknown-linux-gnu :: offloading/std_complex_arithmetic.cpp
7.48s: libomptarget :: amdgcn-amd-amdhsa :: offloading/barrier_fence.c
7.34s: libomptarget :: amdgcn-amd-amdhsa :: offloading/ompx_saxpy_mixed.c
7.25s: libomptarget :: x86_64-unknown-linux-gnu :: offloading/complex_reduction.cpp
6.61s: libomptarget :: x86_64-unknown-linux-gnu-LTO :: offloading/bug49021.cpp
5.67s: libomptarget :: amdgcn-amd-amdhsa :: offloading/parallel_target_teams_reduction.cpp
5.16s: libomptarget :: x86_64-unknown-linux-gnu-LTO :: offloading/std_complex_arithmetic.cpp
5.11s: libomptarget :: amdgcn-amd-amdhsa :: offloading/default_thread_limit.c

llvm-ci · 2025-02-21T02:21:28Z

LLVM Buildbot has detected a new failure on builder clang-aarch64-quick running on linaro-clang-aarch64-quick while building llvm at step 5 "ninja check 1".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/65/builds/12584

Here is the relevant piece of the build log for the reference

Step 5 (ninja check 1) failure: stage 1 checked (failure)
******************** TEST 'lit :: googletest-timeout.py' FAILED ********************
Exit Code: 1

Command Output (stdout):
--
# RUN: at line 9
not env -u FILECHECK_OPTS "/usr/bin/python3.10" /home/tcwg-buildbot/worker/clang-aarch64-quick/llvm/llvm/utils/lit/lit.py -j1 --order=lexical -v Inputs/googletest-timeout    --param gtest_filter=InfiniteLoopSubTest --timeout=1 > /home/tcwg-buildbot/worker/clang-aarch64-quick/stage1/utils/lit/tests/Output/googletest-timeout.py.tmp.cmd.out
# executed command: not env -u FILECHECK_OPTS /usr/bin/python3.10 /home/tcwg-buildbot/worker/clang-aarch64-quick/llvm/llvm/utils/lit/lit.py -j1 --order=lexical -v Inputs/googletest-timeout --param gtest_filter=InfiniteLoopSubTest --timeout=1
# .---command stderr------------
# | lit.py: /home/tcwg-buildbot/worker/clang-aarch64-quick/llvm/llvm/utils/lit/lit/main.py:72: note: The test suite configuration requested an individual test timeout of 0 seconds but a timeout of 1 seconds was requested on the command line. Forcing timeout to be 1 seconds.
# `-----------------------------
# RUN: at line 11
FileCheck --check-prefix=CHECK-INF < /home/tcwg-buildbot/worker/clang-aarch64-quick/stage1/utils/lit/tests/Output/googletest-timeout.py.tmp.cmd.out /home/tcwg-buildbot/worker/clang-aarch64-quick/stage1/utils/lit/tests/googletest-timeout.py
# executed command: FileCheck --check-prefix=CHECK-INF /home/tcwg-buildbot/worker/clang-aarch64-quick/stage1/utils/lit/tests/googletest-timeout.py
# .---command stderr------------
# | /home/tcwg-buildbot/worker/clang-aarch64-quick/stage1/utils/lit/tests/googletest-timeout.py:34:14: error: CHECK-INF: expected string not found in input
# | # CHECK-INF: Timed Out: 1
# |              ^
# | <stdin>:13:29: note: scanning from here
# | Reached timeout of 1 seconds
# |                             ^
# | <stdin>:37:2: note: possible intended match here
# |  Timed Out: 2 (100.00%)
# |  ^
# | 
# | Input file: <stdin>
# | Check file: /home/tcwg-buildbot/worker/clang-aarch64-quick/stage1/utils/lit/tests/googletest-timeout.py
# | 
# | -dump-input=help explains the following input dump.
# | 
# | Input was:
# | <<<<<<
# |             .
# |             .
# |             .
# |             8:  
# |             9:  
# |            10: -- 
# |            11: exit: -9 
# |            12: -- 
# |            13: Reached timeout of 1 seconds 
# | check:34'0                                 X error: no match found
# |            14: ******************** 
# | check:34'0     ~~~~~~~~~~~~~~~~~~~~~
# |            15: TIMEOUT: googletest-timeout :: DummySubDir/OneTest.py/1/2 (2 of 2) 
# | check:34'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |            16: ******************** TEST 'googletest-timeout :: DummySubDir/OneTest.py/1/2' FAILED ******************** 
# | check:34'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |            17: Script(shard): 
# | check:34'0     ~~~~~~~~~~~~~~~
...

llvm-ci · 2025-02-21T02:40:31Z

LLVM Buildbot has detected a new failure on builder clang-cmake-x86_64-avx512-win running on avx512-intel64-win while building llvm at step 6 "ninja check 1".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/81/builds/4962

Here is the relevant piece of the build log for the reference

Step 6 (ninja check 1) failure: stage 1 checked (failure)
******************** TEST 'Clang :: Driver/offload-Xarch.c' FAILED ********************
Exit Code: 1

Command Output (stdout):
--
# RUN: at line 3
d:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\clang.exe --target=x86_64-unknown-linux-gnu -x cuda D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c -Xarch_nvptx64 -O3 -S -nogpulib -nogpuinc -### 2>&1 | d:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\filecheck.exe -check-prefix=O3ONCE D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c
# executed command: 'd:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\clang.exe' --target=x86_64-unknown-linux-gnu -x cuda 'D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c' -Xarch_nvptx64 -O3 -S -nogpulib -nogpuinc '-###'
# executed command: 'd:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\filecheck.exe' -check-prefix=O3ONCE 'D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c'
# RUN: at line 4
d:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\clang.exe -x cuda D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c -Xarch_device -O3 -S -nogpulib -nogpuinc -### 2>&1 | d:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\filecheck.exe -check-prefix=O3ONCE D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c
# executed command: 'd:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\clang.exe' -x cuda 'D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c' -Xarch_device -O3 -S -nogpulib -nogpuinc '-###'
# executed command: 'd:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\filecheck.exe' -check-prefix=O3ONCE 'D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c'
# RUN: at line 5
d:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\clang.exe -x hip D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c -Xarch_amdgcn -O3 -S -nogpulib -nogpuinc -### 2>&1 | d:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\filecheck.exe -check-prefix=O3ONCE D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c
# executed command: 'd:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\clang.exe' -x hip 'D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c' -Xarch_amdgcn -O3 -S -nogpulib -nogpuinc '-###'
# executed command: 'd:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\filecheck.exe' -check-prefix=O3ONCE 'D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c'
# RUN: at line 6
d:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\clang.exe -fopenmp=libomp -fopenmp-targets=amdgcn-amd-amdhsa -nogpulib -nogpuinc    -Xarch_amdgcn -march=gfx90a -Xarch_amdgcn -O3 -S -### D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c 2>&1  | d:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\filecheck.exe -check-prefix=O3ONCE D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c
# executed command: 'd:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\clang.exe' -fopenmp=libomp -fopenmp-targets=amdgcn-amd-amdhsa -nogpulib -nogpuinc -Xarch_amdgcn -march=gfx90a -Xarch_amdgcn -O3 -S '-###' 'D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c'
# executed command: 'd:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\filecheck.exe' -check-prefix=O3ONCE 'D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c'
# RUN: at line 9
d:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\clang.exe -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -nogpulib -nogpuinc    -Xarch_nvptx64 -march=sm_52 -Xarch_nvptx64 -O3 -S -### D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c 2>&1  | d:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\filecheck.exe -check-prefix=O3ONCE D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c
# executed command: 'd:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\clang.exe' -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -nogpulib -nogpuinc -Xarch_nvptx64 -march=sm_52 -Xarch_nvptx64 -O3 -S '-###' 'D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c'
# executed command: 'd:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\filecheck.exe' -check-prefix=O3ONCE 'D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c'
# RUN: at line 15
d:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\clang.exe -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda,amdgcn-amd-amdhsa -nogpulib    --target=x86_64-unknown-linux-gnu -Xopenmp-target=nvptx64-nvidia-cuda --offload-arch=sm_52,sm_60 -nogpuinc    -Xopenmp-target=amdgcn-amd-amdhsa --offload-arch=gfx90a,gfx1030 -ccc-print-bindings -### D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c 2>&1  | d:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\filecheck.exe -check-prefix=OPENMP D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c
# executed command: 'd:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\clang.exe' -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda,amdgcn-amd-amdhsa -nogpulib --target=x86_64-unknown-linux-gnu -Xopenmp-target=nvptx64-nvidia-cuda --offload-arch=sm_52,sm_60 -nogpuinc -Xopenmp-target=amdgcn-amd-amdhsa --offload-arch=gfx90a,gfx1030 -ccc-print-bindings '-###' 'D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c'
# executed command: 'd:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\filecheck.exe' -check-prefix=OPENMP 'D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c'
# RUN: at line 31
d:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\clang.exe -x cuda D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c --offload-arch=sm_52,sm_60 -Xarch_sm_52 -O3 -Xarch_sm_60 -O0    --target=x86_64-unknown-linux-gnu -Xarch_host -O3 -S -nogpulib -nogpuinc -### 2>&1  | d:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\filecheck.exe -check-prefix=CUDA D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c
# executed command: 'd:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\clang.exe' -x cuda 'D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c' --offload-arch=sm_52,sm_60 -Xarch_sm_52 -O3 -Xarch_sm_60 -O0 --target=x86_64-unknown-linux-gnu -Xarch_host -O3 -S -nogpulib -nogpuinc '-###'
# executed command: 'd:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\filecheck.exe' -check-prefix=CUDA 'D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c'
# RUN: at line 39
d:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\clang.exe -fopenmp=libomp --offload-arch=gfx90a -nogpulib -nogpuinc    --target=x86_64-unknown-linux-gnu -Xarch_amdgcn -Wl,-lfoo -### D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c 2>&1  | d:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\filecheck.exe -check-prefix=LIBS D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c
# executed command: 'd:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\clang.exe' -fopenmp=libomp --offload-arch=gfx90a -nogpulib -nogpuinc --target=x86_64-unknown-linux-gnu -Xarch_amdgcn -Wl,-lfoo '-###' 'D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c'
# executed command: 'd:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\filecheck.exe' -check-prefix=LIBS 'D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c'
# .---command stderr------------
# | D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c:45:10: error: LIBS: expected string not found in input
# | // LIBS: "--device-linker=amdgcn-amd-amdhsa=-lfoo"
# |          ^
# | <stdin>:1:1: note: scanning from here
# | clang version 21.0.0git (https://github.com/llvm/llvm-project.git 7c2ebe5dbb4d5cfae7670036394a6f23dcbe4bf7)
# | ^
# | <stdin>:6:1442: note: possible intended match here
# |  "D:\\buildbot\\llvm-worker\\clang-cmake-x86_64-avx512-win\\stage1\\bin\\clang.exe" "-cc1" "-triple" "x86_64-unknown-linux-gnu" "-emit-llvm-bc" "-emit-llvm-uselists" "-dumpdir" "a-" "-disable-free" "-clear-ast-before-backend" "-main-file-name" "offload-Xarch.c" "-mrelocation-model" "pic" "-pic-level" "2" "-pic-is-pie" "-mframe-pointer=all" "-fmath-errno" "-ffp-contract=on" "-fno-rounding-math" "-mconstructor-aliases" "-funwind-tables=2" "-target-cpu" "x86-64" "-tune-cpu" "generic" "-debugger-tuning=gdb" "-fdebug-compilation-dir=D:\\buildbot\\llvm-worker\\clang-cmake-x86_64-avx512-win\\stage1\\tools\\clang\\test\\Driver" "-fcoverage-compilation-dir=D:\\buildbot\\llvm-worker\\clang-cmake-x86_64-avx512-win\\stage1\\tools\\clang\\test\\Driver" "-resource-dir" "D:\\buildbot\\llvm-worker\\clang-cmake-x86_64-avx512-win\\stage1\\lib\\clang\\21" "-internal-isystem" "D:\\buildbot\\llvm-worker\\clang-cmake-x86_64-avx512-win\\stage1\\lib\\clang\\21\\include" "-internal-isystem" "/usr/local/include" "-internal-externc-isystem" "/include" "-internal-externc-isystem" "/usr/include" "-internal-isystem" "D:\\buildbot\\llvm-worker\\clang-cmake-x86_64-avx512-win\\stage1\\lib\\clang\\21\\include" "-internal-isystem" "/usr/local/include" "-internal-externc-isystem" "/include" "-internal-externc-isystem" "/usr/include" "-ferror-limit" "19" "-fopenmp" "--no-offloadlib" "-fgnuc-version=4.2.1" "-fskip-odr-check-in-gmf" "-disable-llvm-passes" "-fopenmp-targets=amdgcn-amd-amdhsa" "-faddrsig" "-D__GCC_HAVE_DWARF2_CFI_ASM=1" "-o" "C:\\Users\\tianfei\\AppData\\Local\\Temp\\1\\lit-tmp-fx5bqi37\\offload-Xarch-7e30da.bc" "-x" "c" "D:\\buildbot\\llvm-worker\\clang-cmake-x86_64-avx512-win\\llvm\\clang\\test\\Driver\\offload-Xarch.c"
# |                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  ^
# | 
# | Input file: <stdin>
# | Check file: D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c
...

…lvm#127464) For cvt_scale opcodes operating on FP6/BF6/FP4, any Src0 operand wider than 32 bits needs to be restricted to take only VGPRs.

AMDGPU: Allow only VGPR wide sources in fp6/4/8 conversions

824b47b

All convert opcodes operating on FP6/BF6/FP4 data must use VGPR sources for any operand slots providing more than 32-bits of data. Change-Id: Ic48ef739875c341e85f1a6c3984851ae497592af

pravinjagtap requested review from arsenm, scchan, shiltian, srpande and vikramRH February 17, 2025 09:54

llvmbot added backend:AMDGPU mc Machine (object) code labels Feb 17, 2025

clang-format

13e7dde

arsenm reviewed Feb 17, 2025

View reviewed changes

llvm/lib/Target/AMDGPU/SIInstrInfo.td Outdated Show resolved Hide resolved

added codegen tests for SGPR to VGPR copies.

ec892c9

pravinjagtap requested review from cdevadas and jayfoad February 18, 2025 05:34

pravinjagtap requested a review from rampitec February 19, 2025 10:43

rampitec reviewed Feb 19, 2025

View reviewed changes

llvm/lib/Target/AMDGPU/SIInstrInfo.td Outdated Show resolved Hide resolved

llvm/lib/Target/AMDGPU/VOP3Instructions.td Outdated Show resolved Hide resolved

pravinjagtap changed the title ~~AMDGPU: Allow only VGPR wide sources in fp6/4/8 conversions~~ AMDGPU: Restrict src0 to VGPRs only for certain cvt scale opcodes. Feb 20, 2025

Unified with downstream changes

b9a5e3b

Sisyph reviewed Feb 20, 2025

View reviewed changes

llvm/lib/Target/AMDGPU/SIInstrInfo.td Outdated Show resolved Hide resolved

moved getVOP3VRegForVT in VOP2Instructions.td

b521ec2

Sisyph approved these changes Feb 20, 2025

View reviewed changes

pravinjagtap merged commit 7c2ebe5 into llvm:main Feb 21, 2025
8 checks passed

geyyer mentioned this pull request Feb 21, 2025

Add MX FP4 device conversion tests ROCm/composable_kernel#1889

Merged

7 tasks

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

AMDGPU: Restrict src0 to VGPRs only for certain cvt scale opcodes. #127464

AMDGPU: Restrict src0 to VGPRs only for certain cvt scale opcodes. #127464

Uh oh!

pravinjagtap commented Feb 17, 2025 •

edited

Loading

Uh oh!

llvmbot commented Feb 17, 2025 •

edited

Loading

Uh oh!

arsenm left a comment

Uh oh!

Uh oh!

jayfoad commented Feb 18, 2025

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Sisyph left a comment

Uh oh!

Uh oh!

llvm-ci commented Feb 21, 2025

Uh oh!

llvm-ci commented Feb 21, 2025

Uh oh!

llvm-ci commented Feb 21, 2025

Uh oh!

Uh oh!

AMDGPU: Restrict src0 to VGPRs only for certain cvt scale opcodes. #127464

AMDGPU: Restrict src0 to VGPRs only for certain cvt scale opcodes. #127464

Uh oh!

Conversation

pravinjagtap commented Feb 17, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented Feb 17, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

arsenm left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

jayfoad commented Feb 18, 2025

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Sisyph left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

llvm-ci commented Feb 21, 2025

Uh oh!

llvm-ci commented Feb 21, 2025

Uh oh!

llvm-ci commented Feb 21, 2025

Uh oh!

Uh oh!

pravinjagtap commented Feb 17, 2025 •

edited

Loading

llvmbot commented Feb 17, 2025 •

edited

Loading