Skip to content

Cherry-pick: [X86] Change precision control to FP80 during u64->fp32 conversion on Windows #145

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from Feb 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 78 additions & 3 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21616,15 +21616,25 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
// Extend everything to 80 bits to force it to be done on x87.
// TODO: Are there any fast-math-flags to propagate here?
if (IsStrict) {
SDValue Add = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::f80, MVT::Other},
{Chain, Fild, Fudge});
unsigned Opc = ISD::STRICT_FADD;
// Windows needs the precision control changed to 80bits around this add.
if (Subtarget.isOSWindows() && DstVT == MVT::f32)
Opc = X86ISD::STRICT_FP80_ADD;

SDValue Add =
DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
// STRICT_FP_ROUND can't handle equal types.
if (DstVT == MVT::f80)
return Add;
return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
{Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)});
}
SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
unsigned Opc = ISD::FADD;
// Windows needs the precision control changed to 80bits around this add.
if (Subtarget.isOSWindows() && DstVT == MVT::f32)
Opc = X86ISD::FP80_ADD;

SDValue Add = DAG.getNode(Opc, dl, MVT::f80, Fild, Fudge);
return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
DAG.getIntPtrConstant(0, dl));
}
Expand Down Expand Up @@ -33830,6 +33840,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(AESENCWIDE256KL)
NODE_NAME_CASE(AESDECWIDE256KL)
NODE_NAME_CASE(TESTUI)
NODE_NAME_CASE(FP80_ADD)
NODE_NAME_CASE(STRICT_FP80_ADD)
}
return nullptr;
#undef NODE_NAME_CASE
Expand Down Expand Up @@ -36340,6 +36352,69 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
return BB;
}

case X86::FP80_ADDr:
case X86::FP80_ADDm32: {
// Change the floating point control register to use double extended
// precision when performing the addition.
int OrigCWFrameIdx =
MF->getFrameInfo().CreateStackObject(2, Align(2), false);
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FNSTCW16m)),
OrigCWFrameIdx);

// Load the old value of the control word...
Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
OrigCWFrameIdx);

// OR 0b11 into bit 8 and 9. 0b11 is the encoding for double extended
// precision.
Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
.addReg(OldCW, RegState::Kill)
.addImm(0x300);

// Extract to 16 bits.
Register NewCW16 =
MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
.addReg(NewCW, RegState::Kill, X86::sub_16bit);

// Prepare memory for FLDCW.
int NewCWFrameIdx =
MF->getFrameInfo().CreateStackObject(2, Align(2), false);
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
NewCWFrameIdx)
.addReg(NewCW16, RegState::Kill);

// Reload the modified control word now...
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FLDCW16m)),
NewCWFrameIdx);

// Do the addition.
if (MI.getOpcode() == X86::FP80_ADDr) {
BuildMI(*BB, MI, DL, TII->get(X86::ADD_Fp80))
.add(MI.getOperand(0))
.add(MI.getOperand(1))
.add(MI.getOperand(2));
} else {
BuildMI(*BB, MI, DL, TII->get(X86::ADD_Fp80m32))
.add(MI.getOperand(0))
.add(MI.getOperand(1))
.add(MI.getOperand(2))
.add(MI.getOperand(3))
.add(MI.getOperand(4))
.add(MI.getOperand(5))
.add(MI.getOperand(6));
}

// Reload the original control word now.
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FLDCW16m)),
OrigCWFrameIdx);

MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}

case X86::FP32_TO_INT16_IN_MEM:
case X86::FP32_TO_INT32_IN_MEM:
case X86::FP32_TO_INT64_IN_MEM:
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Target/X86/X86ISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -732,6 +732,9 @@ namespace llvm {
// User level interrupts - testui
TESTUI,

// Perform an FP80 add after changing precision control in FPCW.
FP80_ADD,

/// X86 strict FP compare instructions.
STRICT_FCMP = ISD::FIRST_TARGET_STRICTFP_OPCODE,
STRICT_FCMPS,
Expand Down Expand Up @@ -771,6 +774,9 @@ namespace llvm {
STRICT_CVTPS2PH,
STRICT_CVTPH2PS,

// Perform an FP80 add after changing precision control in FPCW.
STRICT_FP80_ADD,

// WARNING: Only add nodes here if they are stric FP nodes. Non-memory and
// non-strict FP nodes should be above FIRST_TARGET_STRICTFP_OPCODE.

Expand Down
15 changes: 15 additions & 0 deletions llvm/lib/Target/X86/X86InstrFPStack.td
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,13 @@ def SDTX86Fist : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisPtrTy<1>]>;
def SDTX86CwdStore : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
def SDTX86CwdLoad : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;

def X86fp80_add : SDNode<"X86ISD::FP80_ADD", SDTFPBinOp, [SDNPCommutative]>;
def X86strict_fp80_add : SDNode<"X86ISD::STRICT_FP80_ADD", SDTFPBinOp,
[SDNPHasChain,SDNPCommutative]>;
def any_X86fp80_add : PatFrags<(ops node:$lhs, node:$rhs),
[(X86strict_fp80_add node:$lhs, node:$rhs),
(X86fp80_add node:$lhs, node:$rhs)]>;

def X86fld : SDNode<"X86ISD::FLD", SDTX86Fld,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def X86fst : SDNode<"X86ISD::FST", SDTX86Fst,
Expand Down Expand Up @@ -141,6 +148,14 @@ let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Defs = [EFLAGS] in {
[(X86fp_to_i32mem RFP80:$src, addr:$dst)]>;
def FP80_TO_INT64_IN_MEM : PseudoI<(outs), (ins i64mem:$dst, RFP80:$src),
[(X86fp_to_i64mem RFP80:$src, addr:$dst)]>;

def FP80_ADDr : PseudoI<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2),
[(set RFP80:$dst,
(any_X86fp80_add RFP80:$src1, RFP80:$src2))]>;
def FP80_ADDm32 : PseudoI<(outs RFP80:$dst), (ins RFP80:$src1, f32mem:$src2),
[(set RFP80:$dst,
(any_X86fp80_add RFP80:$src1,
(f80 (extloadf32 addr:$src2))))]>;
}

// All FP Stack operations are represented with four instructions here. The
Expand Down
48 changes: 46 additions & 2 deletions llvm/test/CodeGen/X86/uint64-to-float.ll
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-apple-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-apple-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64
; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64
; RUN: llc < %s -mtriple=i686-windows -mattr=+sse2 | FileCheck %s --check-prefix=X86-WIN
; RUN: llc < %s -mtriple=x86_64-windows -mattr=+sse2 | FileCheck %s --check-prefix=X64-WIN

; Verify that we are using the efficient uitofp --> sitofp lowering illustrated
; by the compiler_rt implementation of __floatundisf.
Expand Down Expand Up @@ -42,6 +44,48 @@ define float @test(i64 %a) nounwind {
; X64-NEXT: cvtsi2ss %rdi, %xmm0
; X64-NEXT: addss %xmm0, %xmm0
; X64-NEXT: retq
;
; X86-WIN-LABEL: test:
; X86-WIN: # %bb.0: # %entry
; X86-WIN-NEXT: pushl %ebp
; X86-WIN-NEXT: movl %esp, %ebp
; X86-WIN-NEXT: andl $-8, %esp
; X86-WIN-NEXT: subl $24, %esp
; X86-WIN-NEXT: movl 12(%ebp), %eax
; X86-WIN-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
; X86-WIN-NEXT: movlps %xmm0, {{[0-9]+}}(%esp)
; X86-WIN-NEXT: shrl $31, %eax
; X86-WIN-NEXT: fildll {{[0-9]+}}(%esp)
; X86-WIN-NEXT: fnstcw {{[0-9]+}}(%esp)
; X86-WIN-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
; X86-WIN-NEXT: orl $768, %ecx # imm = 0x300
; X86-WIN-NEXT: movw %cx, {{[0-9]+}}(%esp)
; X86-WIN-NEXT: fldcw {{[0-9]+}}(%esp)
; X86-WIN-NEXT: fadds __real@5f80000000000000(,%eax,4)
; X86-WIN-NEXT: fldcw {{[0-9]+}}(%esp)
; X86-WIN-NEXT: fstps {{[0-9]+}}(%esp)
; X86-WIN-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-WIN-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
; X86-WIN-NEXT: flds {{[0-9]+}}(%esp)
; X86-WIN-NEXT: movl %ebp, %esp
; X86-WIN-NEXT: popl %ebp
; X86-WIN-NEXT: retl
;
; X64-WIN-LABEL: test:
; X64-WIN: # %bb.0: # %entry
; X64-WIN-NEXT: testq %rcx, %rcx
; X64-WIN-NEXT: js .LBB0_1
; X64-WIN-NEXT: # %bb.2: # %entry
; X64-WIN-NEXT: cvtsi2ss %rcx, %xmm0
; X64-WIN-NEXT: retq
; X64-WIN-NEXT: .LBB0_1:
; X64-WIN-NEXT: movq %rcx, %rax
; X64-WIN-NEXT: shrq %rax
; X64-WIN-NEXT: andl $1, %ecx
; X64-WIN-NEXT: orq %rax, %rcx
; X64-WIN-NEXT: cvtsi2ss %rcx, %xmm0
; X64-WIN-NEXT: addss %xmm0, %xmm0
; X64-WIN-NEXT: retq
entry:
%b = uitofp i64 %a to float
ret float %b
Expand Down