diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 2f3626f1c820a..be5e0f6ef058b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -729,8 +729,17 @@ static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &DL, // prevents it from being picked up by the earlier bitcast case. if (ValueVT.getVectorElementCount().isScalar() && (!ValueVT.isFloatingPoint() || !PartVT.isInteger())) { - Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, PartVT, Val, - DAG.getVectorIdxConstant(0, DL)); + // If we reach this condition and PartVT is FP, this means that + // ValueVT is also FP and both have a different size, otherwise we + // would have bitcasted them. Producing an EXTRACT_VECTOR_ELT here + // would be invalid since that would mean the smaller FP type has to + // be extended to the larger one. + if (PartVT.isFloatingPoint()) { + Val = DAG.getBitcast(ValueVT.getScalarType(), Val); + Val = DAG.getNode(ISD::FP_EXTEND, DL, PartVT, Val); + } else + Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, PartVT, Val, + DAG.getVectorIdxConstant(0, DL)); } else { uint64_t ValueSize = ValueVT.getFixedSizeInBits(); assert(PartVT.getFixedSizeInBits() > ValueSize && diff --git a/llvm/test/CodeGen/AMDGPU/select-phi-s16-fp.ll b/llvm/test/CodeGen/AMDGPU/select-phi-s16-fp.ll new file mode 100644 index 0000000000000..0d6e987165d87 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/select-phi-s16-fp.ll @@ -0,0 +1,138 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdpal -mcpu=hawaii %s -o - | FileCheck %s + +; For all these tests we disable optimizations through function attributes +; because the code we are exercising here needs phis and we want to keep the +; IR small. 
+
+; This code used to crash in SDISel because f16 was promoted to f32 through
+; a `f32 = vector_extract_elt <1 x f16>, i32 0`, which is illegal.
+; The invalid SDNode and thus, the crash was only exposed by the constant
+; folding.
+define void @phi_vec1half_to_f32_with_const_folding(ptr addrspace(1) %dst) #0 {
+; CHECK-LABEL: phi_vec1half_to_f32_with_const_folding:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: v_cvt_f32_f16_e64 v2, s4
+; CHECK-NEXT: ; %bb.1: ; %bb
+; CHECK-NEXT: v_cvt_f16_f32_e64 v2, v2
+; CHECK-NEXT: s_mov_b32 s7, 0xf000
+; CHECK-NEXT: s_mov_b32 s6, 0
+; CHECK-NEXT: s_mov_b32 s4, s6
+; CHECK-NEXT: s_mov_b32 s5, s6
+; CHECK-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64 offset:2
+; CHECK-NEXT: v_cvt_f16_f32_e64 v2, s4
+; CHECK-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ br label %bb
+
+bb:
+ %phi = phi <1 x half> [ zeroinitializer, %entry ]
+ %res = shufflevector <1 x half> poison, <1 x half> %phi, <2 x i32> <i32 0, i32 1>
+ store <2 x half> %res, ptr addrspace(1) %dst
+ ret void
+}
+
+; Same as phi_vec1half_to_f32_with_const_folding but without the folding.
+; This test exercises the same invalid SDNode, but it happened to work by
+; accident before. Here we make sure the fix also works as expected in the
+; non-constant folding case.
+define void @phi_vec1half_to_f32(ptr addrspace(1) %src, ptr addrspace(1) %dst) #0 {
+; CHECK-LABEL: phi_vec1half_to_f32:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_mov_b32 s7, 0xf000
+; CHECK-NEXT: s_mov_b32 s6, 0
+; CHECK-NEXT: s_mov_b32 s4, s6
+; CHECK-NEXT: s_mov_b32 s5, s6
+; CHECK-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: v_cvt_f32_f16_e64 v0, v0
+; CHECK-NEXT: ; %bb.1: ; %bb
+; CHECK-NEXT: v_cvt_f16_f32_e64 v0, v0
+; CHECK-NEXT: s_mov_b32 s7, 0xf000
+; CHECK-NEXT: s_mov_b32 s6, 0
+; CHECK-NEXT: s_mov_b32 s4, s6
+; CHECK-NEXT: s_mov_b32 s5, s6
+; CHECK-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 offset:2
+; CHECK-NEXT: v_cvt_f16_f32_e64 v0, s4
+; CHECK-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %input = load <1 x half>, ptr addrspace(1) %src
+ br label %bb
+
+bb:
+ %phi = phi <1 x half> [ %input, %entry ]
+ %res = shufflevector <1 x half> poison, <1 x half> %phi, <2 x i32> <i32 0, i32 1>
+ store <2 x half> %res, ptr addrspace(1) %dst
+ ret void
+}
+
+; Same as phi_vec1half_to_f32 but with bfloat instead of half.
+define void @phi_vec1bf16_to_f32(ptr addrspace(1) %src, ptr addrspace(1) %dst) #0 { +; CHECK-LABEL: phi_vec1bf16_to_f32: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s7, 0xf000 +; CHECK-NEXT: s_mov_b32 s6, 0 +; CHECK-NEXT: s_mov_b32 s4, s6 +; CHECK-NEXT: s_mov_b32 s5, s6 +; CHECK-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 +; CHECK-NEXT: s_mov_b32 s4, 16 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_lshlrev_b32_e64 v0, s4, v0 +; CHECK-NEXT: ; %bb.1: ; %bb +; CHECK-NEXT: v_mul_f32_e64 v0, 1.0, v0 +; CHECK-NEXT: s_mov_b32 s4, 16 +; CHECK-NEXT: v_lshrrev_b32_e64 v0, s4, v0 +; CHECK-NEXT: s_mov_b32 s7, 0xf000 +; CHECK-NEXT: s_mov_b32 s6, 0 +; CHECK-NEXT: s_mov_b32 s4, s6 +; CHECK-NEXT: s_mov_b32 s5, s6 +; CHECK-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 offset:2 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] +entry: + %input = load <1 x bfloat>, ptr addrspace(1) %src + br label %bb + +bb: + %phi = phi <1 x bfloat> [ %input, %entry ] + %res = shufflevector <1 x bfloat> poison, <1 x bfloat> %phi, <2 x i32> + store <2 x bfloat> %res, ptr addrspace(1) %dst + ret void +} + +; Same as phi_vec1half_to_f32_with_const_folding but with bfloat instead of half. 
+define void @phi_vec1bf16_to_f32_with_const_folding(ptr addrspace(1) %dst) #0 { +; CHECK-LABEL: phi_vec1bf16_to_f32_with_const_folding: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: ; %bb.1: ; %bb +; CHECK-NEXT: v_mul_f32_e64 v2, 1.0, s4 +; CHECK-NEXT: s_mov_b32 s4, 16 +; CHECK-NEXT: v_lshrrev_b32_e32 v2, s4, v2 +; CHECK-NEXT: s_mov_b32 s7, 0xf000 +; CHECK-NEXT: s_mov_b32 s6, 0 +; CHECK-NEXT: s_mov_b32 s4, s6 +; CHECK-NEXT: s_mov_b32 s5, s6 +; CHECK-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64 offset:2 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] +entry: + br label %bb + +bb: + %phi = phi <1 x bfloat> [ zeroinitializer, %entry ] + %res = shufflevector <1 x bfloat> poison, <1 x bfloat> %phi, <2 x i32> + store <2 x bfloat> %res, ptr addrspace(1) %dst + ret void +} + +attributes #0 = { noinline optnone } diff --git a/llvm/test/CodeGen/ARM/arm-half-promote.ll b/llvm/test/CodeGen/ARM/arm-half-promote.ll index e1ab75b2ac7f1..a5fafd4238616 100644 --- a/llvm/test/CodeGen/ARM/arm-half-promote.ll +++ b/llvm/test/CodeGen/ARM/arm-half-promote.ll @@ -116,7 +116,9 @@ define fastcc { <8 x half>, <8 x half> } @f3() { define void @extract_insert(ptr %dst) optnone noinline { ; CHECK-LABEL: extract_insert: -; CHECK: vmov.i32 d0, #0x0 +; CHECK: movs r1, #0 +; CHECK: vmov s0, r1 +; CHECK: vcvtb.f32.f16 s0, s0 ; CHECK: vcvtb.f16.f32 s0, s0 ; CHECK: vmov r1, s0 ; CHECK: strh r1, [r0] diff --git a/llvm/test/CodeGen/X86/select-phi-s16-fp.ll b/llvm/test/CodeGen/X86/select-phi-s16-fp.ll new file mode 100644 index 0000000000000..083f418b6752e --- /dev/null +++ b/llvm/test/CodeGen/X86/select-phi-s16-fp.ll @@ -0,0 +1,114 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel=0 -mcpu=generic -mtriple=x86_64-apple-darwin %s -o - | FileCheck %s + +; For all these tests we disable optimizations through 
function attributes
+; because the code we are exercising here needs phis and we want to keep the
+; IR small.
+
+; This code used to crash in SDISel because bf16 was promoted to f32 through
+; a `f32 = vector_extract_elt <1 x bf16>, i32 0`, which is illegal.
+; The invalid SDNode and thus, the crash was only exposed by the constant
+; folding.
+define void @phi_vec1bf16_to_f32_with_const_folding(ptr %dst) #0 {
+; CHECK-LABEL: phi_vec1bf16_to_f32_with_const_folding:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset %rbx, -16
+; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: jmp LBB0_1
+; CHECK-NEXT: LBB0_1: ## %bb
+; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK-NEXT: callq ___truncsfbf2
+; CHECK-NEXT: pextrw $0, %xmm0, %eax
+; CHECK-NEXT: movw %ax, 2(%rbx)
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: retq
+entry:
+ br label %bb
+
+bb:
+ %phi = phi <1 x bfloat> [ zeroinitializer, %entry ]
+ %res = shufflevector <1 x bfloat> poison, <1 x bfloat> %phi, <2 x i32> <i32 0, i32 1>
+ store <2 x bfloat> %res, ptr %dst
+ ret void
+}
+; Same as phi_vec1bf16_to_f32_with_const_folding but without the constant
+; folding.
+; This test exercises the same invalid SDNode, but it happened to work by
+; accident before. Here we make sure the fix also works as expected in the
+; non-constant folding case.
+define void @phi_vec1bf16_to_f32(ptr %src, ptr %dst) #0 {
+; CHECK-LABEL: phi_vec1bf16_to_f32:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset %rbx, -16
+; CHECK-NEXT: movq %rsi, %rbx
+; CHECK-NEXT: movzwl (%rdi), %eax
+; CHECK-NEXT: shll $16, %eax
+; CHECK-NEXT: movd %eax, %xmm0
+; CHECK-NEXT: jmp LBB1_1
+; CHECK-NEXT: LBB1_1: ## %bb
+; CHECK-NEXT: callq ___truncsfbf2
+; CHECK-NEXT: pextrw $0, %xmm0, %eax
+; CHECK-NEXT: movw %ax, 2(%rbx)
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: retq
+entry:
+ %input = load <1 x bfloat>, ptr %src
+ br label %bb
+
+bb:
+ %phi = phi <1 x bfloat> [ %input, %entry ]
+ %res = shufflevector <1 x bfloat> poison, <1 x bfloat> %phi, <2 x i32> <i32 0, i32 1>
+ store <2 x bfloat> %res, ptr %dst
+ ret void
+}
+
+
+; Half type is legal on x86 so nothing special here, but it
+; doesn't hurt to be thorough.
+define void @phi_vec1half_with_const_folding(ptr %dst) #0 {
+; CHECK-LABEL: phi_vec1half_with_const_folding:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK-NEXT: jmp LBB2_1
+; CHECK-NEXT: LBB2_1: ## %bb
+; CHECK-NEXT: pextrw $0, %xmm0, %eax
+; CHECK-NEXT: movw %ax, 2(%rdi)
+; CHECK-NEXT: retq
+entry:
+ br label %bb
+
+bb:
+ %phi = phi <1 x half> [ zeroinitializer, %entry ]
+ %res = shufflevector <1 x half> poison, <1 x half> %phi, <2 x i32> <i32 0, i32 1>
+ store <2 x half> %res, ptr %dst
+ ret void
+}
+
+; Half type is legal on x86 so nothing special here, but it
+; doesn't hurt to be thorough.
+; Same as phi_vec1half_with_const_folding but without the constant folding.
+define void @phi_vec1half(ptr %src, ptr %dst) #0 { +; CHECK-LABEL: phi_vec1half: +; CHECK: ## %bb.0: ## %entry +; CHECK-NEXT: pinsrw $0, (%rdi), %xmm0 +; CHECK-NEXT: jmp LBB3_1 +; CHECK-NEXT: LBB3_1: ## %bb +; CHECK-NEXT: pextrw $0, %xmm0, %eax +; CHECK-NEXT: movw %ax, 2(%rsi) +; CHECK-NEXT: retq +entry: + %input = load <1 x half>, ptr %src + br label %bb + +bb: + %phi = phi <1 x half> [ %input, %entry ] + %res = shufflevector <1 x half> poison, <1 x half> %phi, <2 x i32> + store <2 x half> %res, ptr %dst + ret void +} + +attributes #0 = { noinline optnone }