Skip to content

AMDGPU: Restrict src0 to VGPRs only for certain cvt scale opcodes. #127464

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Feb 21, 2025

Conversation

pravinjagtap
Copy link
Contributor

@pravinjagtap pravinjagtap commented Feb 17, 2025

The Src0 operand width higher that 32-bits of cvt_scale opcodes operating on FP6/BF6/FP4 need to be restricted to take only VGPRs.

All convert opcodes operating on FP6/BF6/FP4
data must use VGPR sources for any operand slots
providing more than 32-bits of data.

Change-Id: Ic48ef739875c341e85f1a6c3984851ae497592af
@llvmbot
Copy link
Member

llvmbot commented Feb 17, 2025

@llvm/pr-subscribers-backend-amdgpu

@llvm/pr-subscribers-mc

Author: Pravin Jagtap (pravinjagtap)

Changes

All convert opcodes operating on FP6/BF6/FP4
data must use VGPR sources for any operand slots
providing more than 32-bits of data.

Change-Id: Ic48ef739875c341e85f1a6c3984851ae497592af


Full diff: https://github.com/llvm/llvm-project/pull/127464.diff

3 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.td (+13)
  • (modified) llvm/lib/Target/AMDGPU/VOP3Instructions.td (+10-2)
  • (modified) llvm/test/MC/AMDGPU/gfx950_err.s (+63)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index bb78e77a9dc1a..e2d0b28a391fb 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1818,6 +1818,18 @@ class getVOP3VRegSrcForVT<ValueType VT, bit IsTrue16 = 0, bit IsFake16 = 0> {
          1                 : RegisterOperand<VGPR_32>);
 }
 
+// VGPR only VOP3 src with 9 bit encoding
+class getVOP3VSrcReg9ForVT<ValueType VT> {
+  RegisterOperand ret = !cond(!eq(VT.Size, 1024) : VRegSrc_1024,
+                              !eq(VT.Size, 512)  : VRegSrc_512,
+                              !eq(VT.Size, 256)  : VRegSrc_256,
+                              !eq(VT.Size, 192)  : VRegSrc_192,
+                              !eq(VT.Size, 128)  : VRegSrc_128,
+                              !eq(VT.Size, 96)   : VRegSrc_96,
+                              !eq(VT.Size, 64)   : VRegSrc_64,
+                              1 : VRegSrc_32);
+}
+
 // Src2 of VOP3 DPP instructions cannot be a literal
 class getVOP3DPPSrcForVT<ValueType VT, bit IsFake16 = 1> {
   RegisterOperand ret =
@@ -2852,6 +2864,7 @@ def VOP_V2I16_F32_F32_F32 : VOPProfile<[v2i16, f32, f32, f32]>;
 def VOP_V2I16_V2F16_F32 : VOPProfile<[v2i16, v2f16, f32, untyped]>;
 def VOP_V2I16_V2BF16_F32 : VOPProfile<[v2i16, v2bf16, f32, untyped]>;
 def VOP_I32_F32_F32_F32 : VOPProfile<[i32, f32, f32, f32]>;
+def VOP_I32_V2F32_I32_F32 : VOPProfile<[i32, v2f32, i32, f32]>;
 def VOP_I32_V2F16_F32_F32 : VOPProfile<[i32, v2f16, f32, f32]>;
 def VOP_I32_V2BF16_F32_F32: VOPProfile<[i32, v2bf16, f32, f32]>;
 def VOP_BF16_F32_I32 : VOPProfile<[bf16, f32, i32, untyped]>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index afafc2ecccfaf..6a6d8bd94826f 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1052,7 +1052,10 @@ class VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<ValueType Src0Ty> :
   let HasFP4DstByteSel = 1;
 }
 
-def VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile : VOP3_Profile<VOPProfile<[i32, v2f32, i32, f32]>, VOP3_OPSEL> {
+class VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile<VOPProfile P> : VOP3_Profile<P, VOP3_OPSEL> {
+
+  let Src0RC64 = !if(!gt(P.Src0VT.Size, 32), getVOP3VSrcReg9ForVT<P.Src0VT>.ret,
+                                             getVOP3SrcForVT<P.Src0VT>.ret);
   let InsVOP3OpSel = (ins PackedF32InputMods: $src0_modifiers, Src0RC64:$src0,
                           Int32InputMods:     $src1_modifiers, Src1RC64:$src1,
                           FP32InputMods:      $src2_modifiers, Src2RC64:$src2,
@@ -1100,6 +1103,11 @@ class VOP3_CVT_SCALEF32_PK_F864_Profile<VOPProfile P> : VOP3_Profile<P> {
   let HasExt32BitDPP = 0;
   let HasExtVOP3DPP = 0;
   let HasExt64BitDPP = 0;
+
+  // All convert opcodes operating on FP6/BF6/FP4 data must use VGPR sources for
+  // any operand slots > 32 bit.
+  let Src0RC64 = !if(!gt(P.Src0VT.Size, 32), getVOP3VSrcReg9ForVT<P.Src0VT>.ret,
+                                             getVOP3SrcForVT<P.Src0VT>.ret);
 }
 
 let SubtargetPredicate = HasFP8ConversionScaleInsts, mayRaiseFPException = 0 in {
@@ -1141,7 +1149,7 @@ let SubtargetPredicate = HasFP4ConversionScaleInsts, mayRaiseFPException = 0 in
     let Constraints = "@earlyclobber $vdst" in {
       defm V_CVT_SCALEF32_SR_PK_FP4_F16:  VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2f16>>;
       defm V_CVT_SCALEF32_SR_PK_FP4_BF16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_bf16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2bf16>>;
-      defm V_CVT_SCALEF32_SR_PK_FP4_F32:  VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f32", VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile>;
+      defm V_CVT_SCALEF32_SR_PK_FP4_F32:  VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f32", VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile<VOP_I32_V2F32_I32_F32>>;    
     }
   }
   defm V_CVT_SCALEF32_PK_F16_FP4 : VOP3Inst<"v_cvt_scalef32_pk_f16_fp4", VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<v2f16>>;
diff --git a/llvm/test/MC/AMDGPU/gfx950_err.s b/llvm/test/MC/AMDGPU/gfx950_err.s
index e0b832d8fe297..099916f48b5e7 100644
--- a/llvm/test/MC/AMDGPU/gfx950_err.s
+++ b/llvm/test/MC/AMDGPU/gfx950_err.s
@@ -434,3 +434,66 @@ v_cvt_scalef32_sr_pk32_bf6_f32 v[0:5], v[6:37], v38, v39 clamp
 
 // GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
 v_cvt_scalef32_sr_pk32_fp6_f32 v[0:5], v[6:37], v38, v39 clamp
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_f32_fp6 v[0:31], s[32:37], v6
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_f32_bf6 v[0:31], s[32:37], v6
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_f16_fp6 v[0:15], s[20:25], v8
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_bf16_fp6 v[0:15], s[20:25], v8
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_f16_bf6 v[0:15], s[20:25], v8
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_bf16_bf6 v[0:15], s[20:25], v8
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_fp6_f16 v[18:23], s[0:15], v16
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_bf6_f16 v[18:23], s[0:15], v16
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_fp6_bf16 v[18:23], s[0:15], v16
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_pk32_bf6_bf16 v[18:23], s[0:15], v16
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_sr_pk32_bf6_bf16 v[20:25], s[0:15], v16, v17
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_sr_pk32_bf6_f16 v[20:25], s[0:15], v16, v17
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid or unsupported register size
+v_cvt_scalef32_sr_pk32_bf6_f32 v[36:41], s[0:31], v32, v33
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_sr_pk32_fp6_bf16 v[20:25], s[0:15], v16, v17
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_sr_pk32_fp6_f16 v[20:25], s[0:15], v16, v17
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid or unsupported register size
+v_cvt_scalef32_sr_pk32_fp6_f32 v[36:41], s[0:31], v32, v33
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_2xpk16_fp6_f32 v[0:5], s[0:15], v[6:21], v16
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_2xpk16_fp6_f32 v[0:5], v[6:21], s[0:15], v16
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_2xpk16_bf6_f32 v[0:5], s[0:15], v[6:21], v16
+
+// GFX950: :[[@LINE+1]]:{{[0-9]+}}: error: invalid operand for instruction
+v_cvt_scalef32_2xpk16_bf6_f32 v[0:5], v[6:21], s[0:15], v16
+
+// GFX950: v_cvt_scalef32_sr_pk_fp4_f32 v0, s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v4, v5
+v_cvt_scalef32_sr_pk_fp4_f32 v0, s[2:3], v4, v5

Copy link
Contributor

@arsenm arsenm left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should also have codegen tests that show SGPR to VGPR copies inserted in lowering

@jayfoad
Copy link
Contributor

jayfoad commented Feb 18, 2025

Allow only VGPR wide sources in fp6/4/8 conversions

All convert opcodes operating on FP6/BF6/FP4

Do you mean all of BF8/BF6/FP8/FP6/FP4? Please make the title and description consistent.

@pravinjagtap pravinjagtap changed the title AMDGPU: Allow only VGPR wide sources in fp6/4/8 conversions AMDGPU: Restrict src0 to VGPRs only for certain cvt scale opcodes. Feb 20, 2025
Copy link
Contributor

@Sisyph Sisyph left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM

@pravinjagtap pravinjagtap merged commit 7c2ebe5 into llvm:main Feb 21, 2025
8 checks passed
@llvm-ci
Copy link
Collaborator

llvm-ci commented Feb 21, 2025

LLVM Buildbot has detected a new failure on builder openmp-offload-amdgpu-runtime running on omp-vega20-0 while building llvm at step 7 "Add check check-offload".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/30/builds/16183

Here is the relevant piece of the build log for the reference
Step 7 (Add check check-offload) failure: test (failure)
...
PASS: libomptarget :: x86_64-unknown-linux-gnu-LTO :: offloading/bug53727.cpp (1002 of 1011)
PASS: libomptarget :: x86_64-unknown-linux-gnu-LTO :: offloading/bug50022.cpp (1003 of 1011)
PASS: libomptarget :: x86_64-unknown-linux-gnu-LTO :: offloading/wtime.c (1004 of 1011)
PASS: libomptarget :: x86_64-unknown-linux-gnu-LTO :: offloading/test_libc.cpp (1005 of 1011)
PASS: libomptarget :: x86_64-unknown-linux-gnu :: offloading/bug49021.cpp (1006 of 1011)
PASS: libomptarget :: x86_64-unknown-linux-gnu :: offloading/std_complex_arithmetic.cpp (1007 of 1011)
PASS: libomptarget :: x86_64-unknown-linux-gnu-LTO :: offloading/complex_reduction.cpp (1008 of 1011)
PASS: libomptarget :: x86_64-unknown-linux-gnu-LTO :: offloading/bug49021.cpp (1009 of 1011)
PASS: libomptarget :: x86_64-unknown-linux-gnu-LTO :: offloading/std_complex_arithmetic.cpp (1010 of 1011)
TIMEOUT: libomptarget :: amdgcn-amd-amdhsa :: offloading/ctor_dtor.cpp (1011 of 1011)
******************** TEST 'libomptarget :: amdgcn-amd-amdhsa :: offloading/ctor_dtor.cpp' FAILED ********************
Exit Code: -9
Timeout: Reached timeout of 100 seconds

Command Output (stdout):
--
# RUN: at line 1
/home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./bin/clang++ -fopenmp    -I /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.src/offload/test -I /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -L /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload -L /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./lib -L /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src  -nogpulib -Wl,-rpath,/home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload -Wl,-rpath,/home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -Wl,-rpath,/home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./lib  -fopenmp-targets=amdgcn-amd-amdhsa /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.src/offload/test/offloading/ctor_dtor.cpp -o /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload/test/amdgcn-amd-amdhsa/offloading/Output/ctor_dtor.cpp.tmp /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./lib/libomptarget.devicertl.a && /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload/test/amdgcn-amd-amdhsa/offloading/Output/ctor_dtor.cpp.tmp | /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./bin/FileCheck /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.src/offload/test/offloading/ctor_dtor.cpp
# executed command: /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./bin/clang++ -fopenmp -I /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.src/offload/test -I /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -L /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload -L /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./lib -L /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -nogpulib -Wl,-rpath,/home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload -Wl,-rpath,/home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/openmp/runtime/src -Wl,-rpath,/home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./lib -fopenmp-targets=amdgcn-amd-amdhsa /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.src/offload/test/offloading/ctor_dtor.cpp -o /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload/test/amdgcn-amd-amdhsa/offloading/Output/ctor_dtor.cpp.tmp /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./lib/libomptarget.devicertl.a
# note: command had no output on stdout or stderr
# executed command: /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/runtimes/runtimes-bins/offload/test/amdgcn-amd-amdhsa/offloading/Output/ctor_dtor.cpp.tmp
# note: command had no output on stdout or stderr
# error: command failed with exit status: -9
# error: command reached timeout: True
# executed command: /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.build/./bin/FileCheck /home/ompworker/bbot/openmp-offload-amdgpu-runtime/llvm.src/offload/test/offloading/ctor_dtor.cpp
# note: command had no output on stdout or stderr
# error: command failed with exit status: -9
# error: command reached timeout: True

--

********************
Slowest Tests:
--------------------------------------------------------------------------
100.05s: libomptarget :: amdgcn-amd-amdhsa :: offloading/ctor_dtor.cpp
15.46s: libomptarget :: amdgcn-amd-amdhsa :: offloading/bug49021.cpp
12.70s: libomptarget :: amdgcn-amd-amdhsa :: offloading/parallel_target_teams_reduction_max.cpp
12.43s: libomptarget :: amdgcn-amd-amdhsa :: offloading/parallel_target_teams_reduction_min.cpp
10.98s: libomptarget :: amdgcn-amd-amdhsa :: offloading/complex_reduction.cpp
9.31s: libomptarget :: amdgcn-amd-amdhsa :: jit/empty_kernel_lvl2.c
9.23s: libomptarget :: x86_64-unknown-linux-gnu :: offloading/bug49021.cpp
7.58s: libomptarget :: x86_64-unknown-linux-gnu :: offloading/std_complex_arithmetic.cpp
7.48s: libomptarget :: amdgcn-amd-amdhsa :: offloading/barrier_fence.c
7.34s: libomptarget :: amdgcn-amd-amdhsa :: offloading/ompx_saxpy_mixed.c
7.25s: libomptarget :: x86_64-unknown-linux-gnu :: offloading/complex_reduction.cpp
6.61s: libomptarget :: x86_64-unknown-linux-gnu-LTO :: offloading/bug49021.cpp
5.67s: libomptarget :: amdgcn-amd-amdhsa :: offloading/parallel_target_teams_reduction.cpp
5.16s: libomptarget :: x86_64-unknown-linux-gnu-LTO :: offloading/std_complex_arithmetic.cpp
5.11s: libomptarget :: amdgcn-amd-amdhsa :: offloading/default_thread_limit.c

@llvm-ci
Copy link
Collaborator

llvm-ci commented Feb 21, 2025

LLVM Buildbot has detected a new failure on builder clang-aarch64-quick running on linaro-clang-aarch64-quick while building llvm at step 5 "ninja check 1".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/65/builds/12584

Here is the relevant piece of the build log for the reference
Step 5 (ninja check 1) failure: stage 1 checked (failure)
******************** TEST 'lit :: googletest-timeout.py' FAILED ********************
Exit Code: 1

Command Output (stdout):
--
# RUN: at line 9
not env -u FILECHECK_OPTS "/usr/bin/python3.10" /home/tcwg-buildbot/worker/clang-aarch64-quick/llvm/llvm/utils/lit/lit.py -j1 --order=lexical -v Inputs/googletest-timeout    --param gtest_filter=InfiniteLoopSubTest --timeout=1 > /home/tcwg-buildbot/worker/clang-aarch64-quick/stage1/utils/lit/tests/Output/googletest-timeout.py.tmp.cmd.out
# executed command: not env -u FILECHECK_OPTS /usr/bin/python3.10 /home/tcwg-buildbot/worker/clang-aarch64-quick/llvm/llvm/utils/lit/lit.py -j1 --order=lexical -v Inputs/googletest-timeout --param gtest_filter=InfiniteLoopSubTest --timeout=1
# .---command stderr------------
# | lit.py: /home/tcwg-buildbot/worker/clang-aarch64-quick/llvm/llvm/utils/lit/lit/main.py:72: note: The test suite configuration requested an individual test timeout of 0 seconds but a timeout of 1 seconds was requested on the command line. Forcing timeout to be 1 seconds.
# `-----------------------------
# RUN: at line 11
FileCheck --check-prefix=CHECK-INF < /home/tcwg-buildbot/worker/clang-aarch64-quick/stage1/utils/lit/tests/Output/googletest-timeout.py.tmp.cmd.out /home/tcwg-buildbot/worker/clang-aarch64-quick/stage1/utils/lit/tests/googletest-timeout.py
# executed command: FileCheck --check-prefix=CHECK-INF /home/tcwg-buildbot/worker/clang-aarch64-quick/stage1/utils/lit/tests/googletest-timeout.py
# .---command stderr------------
# | /home/tcwg-buildbot/worker/clang-aarch64-quick/stage1/utils/lit/tests/googletest-timeout.py:34:14: error: CHECK-INF: expected string not found in input
# | # CHECK-INF: Timed Out: 1
# |              ^
# | <stdin>:13:29: note: scanning from here
# | Reached timeout of 1 seconds
# |                             ^
# | <stdin>:37:2: note: possible intended match here
# |  Timed Out: 2 (100.00%)
# |  ^
# | 
# | Input file: <stdin>
# | Check file: /home/tcwg-buildbot/worker/clang-aarch64-quick/stage1/utils/lit/tests/googletest-timeout.py
# | 
# | -dump-input=help explains the following input dump.
# | 
# | Input was:
# | <<<<<<
# |             .
# |             .
# |             .
# |             8:  
# |             9:  
# |            10: -- 
# |            11: exit: -9 
# |            12: -- 
# |            13: Reached timeout of 1 seconds 
# | check:34'0                                 X error: no match found
# |            14: ******************** 
# | check:34'0     ~~~~~~~~~~~~~~~~~~~~~
# |            15: TIMEOUT: googletest-timeout :: DummySubDir/OneTest.py/1/2 (2 of 2) 
# | check:34'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |            16: ******************** TEST 'googletest-timeout :: DummySubDir/OneTest.py/1/2' FAILED ******************** 
# | check:34'0     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# |            17: Script(shard): 
# | check:34'0     ~~~~~~~~~~~~~~~
...

@llvm-ci
Copy link
Collaborator

llvm-ci commented Feb 21, 2025

LLVM Buildbot has detected a new failure on builder clang-cmake-x86_64-avx512-win running on avx512-intel64-win while building llvm at step 6 "ninja check 1".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/81/builds/4962

Here is the relevant piece of the build log for the reference
Step 6 (ninja check 1) failure: stage 1 checked (failure)
******************** TEST 'Clang :: Driver/offload-Xarch.c' FAILED ********************
Exit Code: 1

Command Output (stdout):
--
# RUN: at line 3
d:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\clang.exe --target=x86_64-unknown-linux-gnu -x cuda D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c -Xarch_nvptx64 -O3 -S -nogpulib -nogpuinc -### 2>&1 | d:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\filecheck.exe -check-prefix=O3ONCE D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c
# executed command: 'd:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\clang.exe' --target=x86_64-unknown-linux-gnu -x cuda 'D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c' -Xarch_nvptx64 -O3 -S -nogpulib -nogpuinc '-###'
# executed command: 'd:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\filecheck.exe' -check-prefix=O3ONCE 'D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c'
# RUN: at line 4
d:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\clang.exe -x cuda D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c -Xarch_device -O3 -S -nogpulib -nogpuinc -### 2>&1 | d:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\filecheck.exe -check-prefix=O3ONCE D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c
# executed command: 'd:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\clang.exe' -x cuda 'D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c' -Xarch_device -O3 -S -nogpulib -nogpuinc '-###'
# executed command: 'd:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\filecheck.exe' -check-prefix=O3ONCE 'D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c'
# RUN: at line 5
d:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\clang.exe -x hip D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c -Xarch_amdgcn -O3 -S -nogpulib -nogpuinc -### 2>&1 | d:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\filecheck.exe -check-prefix=O3ONCE D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c
# executed command: 'd:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\clang.exe' -x hip 'D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c' -Xarch_amdgcn -O3 -S -nogpulib -nogpuinc '-###'
# executed command: 'd:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\filecheck.exe' -check-prefix=O3ONCE 'D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c'
# RUN: at line 6
d:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\clang.exe -fopenmp=libomp -fopenmp-targets=amdgcn-amd-amdhsa -nogpulib -nogpuinc    -Xarch_amdgcn -march=gfx90a -Xarch_amdgcn -O3 -S -### D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c 2>&1  | d:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\filecheck.exe -check-prefix=O3ONCE D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c
# executed command: 'd:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\clang.exe' -fopenmp=libomp -fopenmp-targets=amdgcn-amd-amdhsa -nogpulib -nogpuinc -Xarch_amdgcn -march=gfx90a -Xarch_amdgcn -O3 -S '-###' 'D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c'
# executed command: 'd:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\filecheck.exe' -check-prefix=O3ONCE 'D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c'
# RUN: at line 9
d:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\clang.exe -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -nogpulib -nogpuinc    -Xarch_nvptx64 -march=sm_52 -Xarch_nvptx64 -O3 -S -### D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c 2>&1  | d:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\filecheck.exe -check-prefix=O3ONCE D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c
# executed command: 'd:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\clang.exe' -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -nogpulib -nogpuinc -Xarch_nvptx64 -march=sm_52 -Xarch_nvptx64 -O3 -S '-###' 'D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c'
# executed command: 'd:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\filecheck.exe' -check-prefix=O3ONCE 'D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c'
# RUN: at line 15
d:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\clang.exe -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda,amdgcn-amd-amdhsa -nogpulib    --target=x86_64-unknown-linux-gnu -Xopenmp-target=nvptx64-nvidia-cuda --offload-arch=sm_52,sm_60 -nogpuinc    -Xopenmp-target=amdgcn-amd-amdhsa --offload-arch=gfx90a,gfx1030 -ccc-print-bindings -### D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c 2>&1  | d:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\filecheck.exe -check-prefix=OPENMP D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c
# executed command: 'd:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\clang.exe' -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda,amdgcn-amd-amdhsa -nogpulib --target=x86_64-unknown-linux-gnu -Xopenmp-target=nvptx64-nvidia-cuda --offload-arch=sm_52,sm_60 -nogpuinc -Xopenmp-target=amdgcn-amd-amdhsa --offload-arch=gfx90a,gfx1030 -ccc-print-bindings '-###' 'D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c'
# executed command: 'd:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\filecheck.exe' -check-prefix=OPENMP 'D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c'
# RUN: at line 31
d:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\clang.exe -x cuda D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c --offload-arch=sm_52,sm_60 -Xarch_sm_52 -O3 -Xarch_sm_60 -O0    --target=x86_64-unknown-linux-gnu -Xarch_host -O3 -S -nogpulib -nogpuinc -### 2>&1  | d:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\filecheck.exe -check-prefix=CUDA D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c
# executed command: 'd:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\clang.exe' -x cuda 'D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c' --offload-arch=sm_52,sm_60 -Xarch_sm_52 -O3 -Xarch_sm_60 -O0 --target=x86_64-unknown-linux-gnu -Xarch_host -O3 -S -nogpulib -nogpuinc '-###'
# executed command: 'd:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\filecheck.exe' -check-prefix=CUDA 'D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c'
# RUN: at line 39
d:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\clang.exe -fopenmp=libomp --offload-arch=gfx90a -nogpulib -nogpuinc    --target=x86_64-unknown-linux-gnu -Xarch_amdgcn -Wl,-lfoo -### D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c 2>&1  | d:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\filecheck.exe -check-prefix=LIBS D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c
# executed command: 'd:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\clang.exe' -fopenmp=libomp --offload-arch=gfx90a -nogpulib -nogpuinc --target=x86_64-unknown-linux-gnu -Xarch_amdgcn -Wl,-lfoo '-###' 'D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c'
# executed command: 'd:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\stage1\bin\filecheck.exe' -check-prefix=LIBS 'D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c'
# .---command stderr------------
# | D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c:45:10: error: LIBS: expected string not found in input
# | // LIBS: "--device-linker=amdgcn-amd-amdhsa=-lfoo"
# |          ^
# | <stdin>:1:1: note: scanning from here
# | clang version 21.0.0git (https://github.com/llvm/llvm-project.git 7c2ebe5dbb4d5cfae7670036394a6f23dcbe4bf7)
# | ^
# | <stdin>:6:1442: note: possible intended match here
# |  "D:\\buildbot\\llvm-worker\\clang-cmake-x86_64-avx512-win\\stage1\\bin\\clang.exe" "-cc1" "-triple" "x86_64-unknown-linux-gnu" "-emit-llvm-bc" "-emit-llvm-uselists" "-dumpdir" "a-" "-disable-free" "-clear-ast-before-backend" "-main-file-name" "offload-Xarch.c" "-mrelocation-model" "pic" "-pic-level" "2" "-pic-is-pie" "-mframe-pointer=all" "-fmath-errno" "-ffp-contract=on" "-fno-rounding-math" "-mconstructor-aliases" "-funwind-tables=2" "-target-cpu" "x86-64" "-tune-cpu" "generic" "-debugger-tuning=gdb" "-fdebug-compilation-dir=D:\\buildbot\\llvm-worker\\clang-cmake-x86_64-avx512-win\\stage1\\tools\\clang\\test\\Driver" "-fcoverage-compilation-dir=D:\\buildbot\\llvm-worker\\clang-cmake-x86_64-avx512-win\\stage1\\tools\\clang\\test\\Driver" "-resource-dir" "D:\\buildbot\\llvm-worker\\clang-cmake-x86_64-avx512-win\\stage1\\lib\\clang\\21" "-internal-isystem" "D:\\buildbot\\llvm-worker\\clang-cmake-x86_64-avx512-win\\stage1\\lib\\clang\\21\\include" "-internal-isystem" "/usr/local/include" "-internal-externc-isystem" "/include" "-internal-externc-isystem" "/usr/include" "-internal-isystem" "D:\\buildbot\\llvm-worker\\clang-cmake-x86_64-avx512-win\\stage1\\lib\\clang\\21\\include" "-internal-isystem" "/usr/local/include" "-internal-externc-isystem" "/include" "-internal-externc-isystem" "/usr/include" "-ferror-limit" "19" "-fopenmp" "--no-offloadlib" "-fgnuc-version=4.2.1" "-fskip-odr-check-in-gmf" "-disable-llvm-passes" "-fopenmp-targets=amdgcn-amd-amdhsa" "-faddrsig" "-D__GCC_HAVE_DWARF2_CFI_ASM=1" "-o" "C:\\Users\\tianfei\\AppData\\Local\\Temp\\1\\lit-tmp-fx5bqi37\\offload-Xarch-7e30da.bc" "-x" "c" "D:\\buildbot\\llvm-worker\\clang-cmake-x86_64-avx512-win\\llvm\\clang\\test\\Driver\\offload-Xarch.c"
# |                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  ^
# | 
# | Input file: <stdin>
# | Check file: D:\buildbot\llvm-worker\clang-cmake-x86_64-avx512-win\llvm\clang\test\Driver\offload-Xarch.c
...

searlmc1 pushed a commit to ROCm/llvm-project that referenced this pull request Mar 13, 2025
…lvm#127464)

The Src0 operand width higher that 32-bits of cvt_scale opcodes
operating on FP6/BF6/FP4 need to be restricted to take only VGPRs.
jrbyrnes pushed a commit to jrbyrnes/llvm-project that referenced this pull request May 27, 2025
…lvm#127464)

The Src0 operand width higher that 32-bits of cvt_scale opcodes
operating on FP6/BF6/FP4 need to be restricted to take only VGPRs.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
backend:AMDGPU mc Machine (object) code
Projects
None yet
Development

Successfully merging this pull request may close these issues.

8 participants