Skip to content

Commit aa84ee5

Browse files
topperc authored and tstellar committed
[X86] Change precision control to FP80 during u64->fp32 conversion on Windows.
This is an alternative to D141074 to fix the problem by adjusting the precision control dynamically.

Reviewed By: icedrocket

Differential Revision: https://reviews.llvm.org/D142178

(cherry picked from commit 11fb09e)
1 parent d551b1e commit aa84ee5

File tree

4 files changed

+145
-5
lines changed

4 files changed

+145
-5
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

+78-3
Original file line numberDiff line numberDiff line change
@@ -21986,15 +21986,25 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
2198621986
// Extend everything to 80 bits to force it to be done on x87.
2198721987
// TODO: Are there any fast-math-flags to propagate here?
2198821988
if (IsStrict) {
21989-
SDValue Add = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::f80, MVT::Other},
21990-
{Chain, Fild, Fudge});
21989+
unsigned Opc = ISD::STRICT_FADD;
21990+
// Windows needs the precision control changed to 80 bits around this add.
21991+
if (Subtarget.isOSWindows() && DstVT == MVT::f32)
21992+
Opc = X86ISD::STRICT_FP80_ADD;
21993+
21994+
SDValue Add =
21995+
DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
2199121996
// STRICT_FP_ROUND can't handle equal types.
2199221997
if (DstVT == MVT::f80)
2199321998
return Add;
2199421999
return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
2199522000
{Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)});
2199622001
}
21997-
SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
22002+
unsigned Opc = ISD::FADD;
22003+
// Windows needs the precision control changed to 80 bits around this add.
22004+
if (Subtarget.isOSWindows() && DstVT == MVT::f32)
22005+
Opc = X86ISD::FP80_ADD;
22006+
22007+
SDValue Add = DAG.getNode(Opc, dl, MVT::f80, Fild, Fudge);
2199822008
return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
2199922009
DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
2200022010
}
@@ -34790,6 +34800,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
3479034800
NODE_NAME_CASE(AESDECWIDE256KL)
3479134801
NODE_NAME_CASE(CMPCCXADD)
3479234802
NODE_NAME_CASE(TESTUI)
34803+
NODE_NAME_CASE(FP80_ADD)
34804+
NODE_NAME_CASE(STRICT_FP80_ADD)
3479334805
}
3479434806
return nullptr;
3479534807
#undef NODE_NAME_CASE
@@ -37300,6 +37312,69 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
3730037312
return BB;
3730137313
}
3730237314

37315+
case X86::FP80_ADDr:
37316+
case X86::FP80_ADDm32: {
37317+
// Change the floating point control register to use double extended
37318+
// precision when performing the addition.
37319+
int OrigCWFrameIdx =
37320+
MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37321+
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FNSTCW16m)),
37322+
OrigCWFrameIdx);
37323+
37324+
// Load the old value of the control word...
37325+
Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37326+
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
37327+
OrigCWFrameIdx);
37328+
37329+
// OR 0b11 into bits 8 and 9. 0b11 is the encoding for double extended
37330+
// precision.
37331+
Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37332+
BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
37333+
.addReg(OldCW, RegState::Kill)
37334+
.addImm(0x300);
37335+
37336+
// Extract to 16 bits.
37337+
Register NewCW16 =
37338+
MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
37339+
BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
37340+
.addReg(NewCW, RegState::Kill, X86::sub_16bit);
37341+
37342+
// Prepare memory for FLDCW.
37343+
int NewCWFrameIdx =
37344+
MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37345+
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
37346+
NewCWFrameIdx)
37347+
.addReg(NewCW16, RegState::Kill);
37348+
37349+
// Reload the modified control word now...
37350+
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FLDCW16m)),
37351+
NewCWFrameIdx);
37352+
37353+
// Do the addition.
37354+
if (MI.getOpcode() == X86::FP80_ADDr) {
37355+
BuildMI(*BB, MI, DL, TII->get(X86::ADD_Fp80))
37356+
.add(MI.getOperand(0))
37357+
.add(MI.getOperand(1))
37358+
.add(MI.getOperand(2));
37359+
} else {
37360+
BuildMI(*BB, MI, DL, TII->get(X86::ADD_Fp80m32))
37361+
.add(MI.getOperand(0))
37362+
.add(MI.getOperand(1))
37363+
.add(MI.getOperand(2))
37364+
.add(MI.getOperand(3))
37365+
.add(MI.getOperand(4))
37366+
.add(MI.getOperand(5))
37367+
.add(MI.getOperand(6));
37368+
}
37369+
37370+
// Reload the original control word now.
37371+
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FLDCW16m)),
37372+
OrigCWFrameIdx);
37373+
37374+
MI.eraseFromParent(); // The pseudo instruction is gone now.
37375+
return BB;
37376+
}
37377+
3730337378
case X86::FP32_TO_INT16_IN_MEM:
3730437379
case X86::FP32_TO_INT32_IN_MEM:
3730537380
case X86::FP32_TO_INT64_IN_MEM:

llvm/lib/Target/X86/X86ISelLowering.h

+6
Original file line numberDiff line numberDiff line change
@@ -740,6 +740,9 @@ namespace llvm {
740740
// User level interrupts - testui
741741
TESTUI,
742742

743+
// Perform an FP80 add after changing precision control in FPCW.
744+
FP80_ADD,
745+
743746
/// X86 strict FP compare instructions.
744747
STRICT_FCMP = ISD::FIRST_TARGET_STRICTFP_OPCODE,
745748
STRICT_FCMPS,
@@ -779,6 +782,9 @@ namespace llvm {
779782
STRICT_CVTPS2PH,
780783
STRICT_CVTPH2PS,
781784

785+
// Perform an FP80 add after changing precision control in FPCW.
786+
STRICT_FP80_ADD,
787+
782788
// WARNING: Only add nodes here if they are strict FP nodes. Non-memory and
783789
// non-strict FP nodes should be above FIRST_TARGET_STRICTFP_OPCODE.
784790

llvm/lib/Target/X86/X86InstrFPStack.td

+15
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,13 @@ def SDTX86Fist : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisPtrTy<1>]>;
2626
def SDTX86CwdStore : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
2727
def SDTX86CwdLoad : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
2828

29+
def X86fp80_add : SDNode<"X86ISD::FP80_ADD", SDTFPBinOp, [SDNPCommutative]>;
30+
def X86strict_fp80_add : SDNode<"X86ISD::STRICT_FP80_ADD", SDTFPBinOp,
31+
[SDNPHasChain,SDNPCommutative]>;
32+
def any_X86fp80_add : PatFrags<(ops node:$lhs, node:$rhs),
33+
[(X86strict_fp80_add node:$lhs, node:$rhs),
34+
(X86fp80_add node:$lhs, node:$rhs)]>;
35+
2936
def X86fld : SDNode<"X86ISD::FLD", SDTX86Fld,
3037
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
3138
def X86fst : SDNode<"X86ISD::FST", SDTX86Fst,
@@ -141,6 +148,14 @@ let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Defs = [EFLAGS] in {
141148
[(X86fp_to_i32mem RFP80:$src, addr:$dst)]>;
142149
def FP80_TO_INT64_IN_MEM : PseudoI<(outs), (ins i64mem:$dst, RFP80:$src),
143150
[(X86fp_to_i64mem RFP80:$src, addr:$dst)]>;
151+
152+
def FP80_ADDr : PseudoI<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2),
153+
[(set RFP80:$dst,
154+
(any_X86fp80_add RFP80:$src1, RFP80:$src2))]>;
155+
def FP80_ADDm32 : PseudoI<(outs RFP80:$dst), (ins RFP80:$src1, f32mem:$src2),
156+
[(set RFP80:$dst,
157+
(any_X86fp80_add RFP80:$src1,
158+
(f80 (extloadf32 addr:$src2))))]>;
144159
}
145160

146161
// All FP Stack operations are represented with four instructions here. The

llvm/test/CodeGen/X86/uint64-to-float.ll

+46-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2-
; RUN: llc < %s -mtriple=i686-apple-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86
3-
; RUN: llc < %s -mtriple=x86_64-apple-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64
2+
; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86
3+
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64
4+
; RUN: llc < %s -mtriple=i686-windows -mattr=+sse2 | FileCheck %s --check-prefix=X86-WIN
5+
; RUN: llc < %s -mtriple=x86_64-windows -mattr=+sse2 | FileCheck %s --check-prefix=X64-WIN
46

57
; Verify that we are using the efficient uitofp --> sitofp lowering illustrated
68
; by the compiler_rt implementation of __floatundisf.
@@ -42,6 +44,48 @@ define float @test(i64 %a) nounwind {
4244
; X64-NEXT: cvtsi2ss %rdi, %xmm0
4345
; X64-NEXT: addss %xmm0, %xmm0
4446
; X64-NEXT: retq
47+
;
48+
; X86-WIN-LABEL: test:
49+
; X86-WIN: # %bb.0: # %entry
50+
; X86-WIN-NEXT: pushl %ebp
51+
; X86-WIN-NEXT: movl %esp, %ebp
52+
; X86-WIN-NEXT: andl $-8, %esp
53+
; X86-WIN-NEXT: subl $24, %esp
54+
; X86-WIN-NEXT: movl 12(%ebp), %eax
55+
; X86-WIN-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
56+
; X86-WIN-NEXT: movlps %xmm0, {{[0-9]+}}(%esp)
57+
; X86-WIN-NEXT: shrl $31, %eax
58+
; X86-WIN-NEXT: fildll {{[0-9]+}}(%esp)
59+
; X86-WIN-NEXT: fnstcw {{[0-9]+}}(%esp)
60+
; X86-WIN-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
61+
; X86-WIN-NEXT: orl $768, %ecx # imm = 0x300
62+
; X86-WIN-NEXT: movw %cx, {{[0-9]+}}(%esp)
63+
; X86-WIN-NEXT: fldcw {{[0-9]+}}(%esp)
64+
; X86-WIN-NEXT: fadds __real@5f80000000000000(,%eax,4)
65+
; X86-WIN-NEXT: fldcw {{[0-9]+}}(%esp)
66+
; X86-WIN-NEXT: fstps {{[0-9]+}}(%esp)
67+
; X86-WIN-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
68+
; X86-WIN-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
69+
; X86-WIN-NEXT: flds {{[0-9]+}}(%esp)
70+
; X86-WIN-NEXT: movl %ebp, %esp
71+
; X86-WIN-NEXT: popl %ebp
72+
; X86-WIN-NEXT: retl
73+
;
74+
; X64-WIN-LABEL: test:
75+
; X64-WIN: # %bb.0: # %entry
76+
; X64-WIN-NEXT: testq %rcx, %rcx
77+
; X64-WIN-NEXT: js .LBB0_1
78+
; X64-WIN-NEXT: # %bb.2: # %entry
79+
; X64-WIN-NEXT: cvtsi2ss %rcx, %xmm0
80+
; X64-WIN-NEXT: retq
81+
; X64-WIN-NEXT: .LBB0_1:
82+
; X64-WIN-NEXT: movq %rcx, %rax
83+
; X64-WIN-NEXT: shrq %rax
84+
; X64-WIN-NEXT: andl $1, %ecx
85+
; X64-WIN-NEXT: orq %rax, %rcx
86+
; X64-WIN-NEXT: cvtsi2ss %rcx, %xmm0
87+
; X64-WIN-NEXT: addss %xmm0, %xmm0
88+
; X64-WIN-NEXT: retq
4589
entry:
4690
%b = uitofp i64 %a to float
4791
ret float %b

0 commit comments

Comments
 (0)