[X86] Support lowering for APX promoted BMI instructions. #77433

Merged: 5 commits, Jan 18, 2024
42 changes: 26 additions & 16 deletions llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -4086,14 +4086,17 @@ MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
SDValue Control;
unsigned ROpc, MOpc;

#define GET_EGPR_IF_ENABLED(OPC) (Subtarget->hasEGPR() ? OPC##_EVEX : OPC)
if (!PreferBEXTR) {
assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then.");
// If we can't make use of BEXTR then we can't fuse shift+mask stages.
// Let's perform the mask first, and apply shift later. Note that we need to
// widen the mask to account for the fact that we'll apply shift afterwards!
Control = CurDAG->getTargetConstant(Shift + MaskSize, dl, NVT);
ROpc = NVT == MVT::i64 ? X86::BZHI64rr : X86::BZHI32rr;
MOpc = NVT == MVT::i64 ? X86::BZHI64rm : X86::BZHI32rm;
ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rr)
: GET_EGPR_IF_ENABLED(X86::BZHI32rr);
MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BZHI64rm)
: GET_EGPR_IF_ENABLED(X86::BZHI32rm);
unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
} else {
@@ -4108,8 +4111,10 @@ MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
} else {
assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then.");
// BMI requires the immediate to be placed in a register.
ROpc = NVT == MVT::i64 ? X86::BEXTR64rr : X86::BEXTR32rr;
MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm;
ROpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rr)
: GET_EGPR_IF_ENABLED(X86::BEXTR32rr);
MOpc = NVT == MVT::i64 ? GET_EGPR_IF_ENABLED(X86::BEXTR64rm)
: GET_EGPR_IF_ENABLED(X86::BEXTR32rm);
unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
}
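Two things are worth unpacking in this hunk. First, `GET_EGPR_IF_ENABLED` simply swaps in the `_EVEX`-encoded opcode whenever APX extended GPRs are available, leaving the legacy opcode otherwise. Second, the `Shift + MaskSize` bound for BZHI exists because the mask is applied before the shift. A minimal C++ sketch of that reasoning (illustrative only, not the LLVM code; `bzhi` and `extract` are my stand-in names for the instruction semantics):

```cpp
#include <cassert>
#include <cstdint>

// BZHI semantics: zero all bits at index n and above (n < 32 assumed here).
uint32_t bzhi(uint32_t x, unsigned n) { return x & ((1u << n) - 1); }

// Extract MaskSize bits starting at bit Shift. Because we mask *before*
// shifting, the BZHI bound must be widened to Shift + MaskSize.
uint32_t extract(uint32_t x, unsigned Shift, unsigned MaskSize) {
  return bzhi(x, Shift + MaskSize) >> Shift;
}

int main() {
  // Bits [4, 12) of 0xABCD1234 are 0x23.
  assert(extract(0xABCD1234u, 4, 8) == 0x23);
  return 0;
}
```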
@@ -5482,25 +5487,30 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
switch (NVT.SimpleTy) {
default: llvm_unreachable("Unsupported VT!");
case MVT::i32:
Opc = UseMULXHi ? X86::MULX32Hrr :
UseMULX ? X86::MULX32rr :
IsSigned ? X86::IMUL32r : X86::MUL32r;
MOpc = UseMULXHi ? X86::MULX32Hrm :
UseMULX ? X86::MULX32rm :
IsSigned ? X86::IMUL32m : X86::MUL32m;
Opc = UseMULXHi ? X86::MULX32Hrr
: UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rr)
: IsSigned ? X86::IMUL32r
: X86::MUL32r;
MOpc = UseMULXHi ? X86::MULX32Hrm
: UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX32rm)
: IsSigned ? X86::IMUL32m
: X86::MUL32m;
LoReg = UseMULX ? X86::EDX : X86::EAX;
HiReg = X86::EDX;
break;
case MVT::i64:
Opc = UseMULXHi ? X86::MULX64Hrr :
UseMULX ? X86::MULX64rr :
IsSigned ? X86::IMUL64r : X86::MUL64r;
MOpc = UseMULXHi ? X86::MULX64Hrm :
UseMULX ? X86::MULX64rm :
IsSigned ? X86::IMUL64m : X86::MUL64m;
Opc = UseMULXHi ? X86::MULX64Hrr
: UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rr)
: IsSigned ? X86::IMUL64r
: X86::MUL64r;
MOpc = UseMULXHi ? X86::MULX64Hrm
: UseMULX ? GET_EGPR_IF_ENABLED(X86::MULX64rm)
: IsSigned ? X86::IMUL64m
: X86::MUL64m;
LoReg = UseMULX ? X86::RDX : X86::RAX;
HiReg = X86::RDX;
break;
#undef GET_EGPR_IF_ENABLED
}

SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
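A side note on why `LoReg` switches to EDX/RDX when MULX is selected: MULX reads one multiplicand implicitly from EDX (RDX in 64-bit mode) and, unlike MUL/IMUL, leaves EFLAGS untouched. A hedged C++ sketch of the 32-bit semantics (illustrative, not backend code; `mulx32` is my name):

```cpp
#include <cstdint>

struct MulxResult { uint32_t hi, lo; };

// MULX32: unsigned widening multiply; one operand comes implicitly from EDX,
// and no flags are modified.
MulxResult mulx32(uint32_t edx, uint32_t src) {
  uint64_t product = (uint64_t)edx * (uint64_t)src;
  return { (uint32_t)(product >> 32), (uint32_t)product };
}
```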
16 changes: 11 additions & 5 deletions llvm/lib/Target/X86/X86InstrArithmetic.td
@@ -1338,17 +1338,23 @@ defm ANDN32 : AndN<Xi32, "_EVEX">, EVEX, Requires<[HasBMI, HasEGPR, In64BitMode]
defm ANDN64 : AndN<Xi64, "_EVEX">, EVEX, REX_W, Requires<[HasBMI, HasEGPR, In64BitMode]>;
}

let Predicates = [HasBMI], AddedComplexity = -6 in {
multiclass Andn_Pats<string suffix> {
def : Pat<(and (not GR32:$src1), GR32:$src2),
(ANDN32rr GR32:$src1, GR32:$src2)>;
(!cast<Instruction>(ANDN32rr#suffix) GR32:$src1, GR32:$src2)>;
def : Pat<(and (not GR64:$src1), GR64:$src2),
(ANDN64rr GR64:$src1, GR64:$src2)>;
(!cast<Instruction>(ANDN64rr#suffix) GR64:$src1, GR64:$src2)>;
def : Pat<(and (not GR32:$src1), (loadi32 addr:$src2)),
(ANDN32rm GR32:$src1, addr:$src2)>;
(!cast<Instruction>(ANDN32rm#suffix) GR32:$src1, addr:$src2)>;
def : Pat<(and (not GR64:$src1), (loadi64 addr:$src2)),
(ANDN64rm GR64:$src1, addr:$src2)>;
(!cast<Instruction>(ANDN64rm#suffix) GR64:$src1, addr:$src2)>;
}

let Predicates = [HasBMI, NoEGPR], AddedComplexity = -6 in
defm : Andn_Pats<"">;

let Predicates = [HasBMI, HasEGPR], AddedComplexity = -6 in
defm : Andn_Pats<"_EVEX">;
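The multiclass refactor only threads an opcode suffix through the patterns; the dag being matched is unchanged, and the `NoEGPR`/`HasEGPR` predicates pick the legacy or EVEX instruction. For reference, the operation those patterns match, as a C++ sketch (illustrative; note the operand order mirrors `(and (not $src1), $src2)`):

```cpp
#include <cstdint>

// ANDN: non-destructive "and-not" of two registers.
uint32_t andn32(uint32_t src1, uint32_t src2) { return ~src1 & src2; }
uint64_t andn64(uint64_t src1, uint64_t src2) { return ~src1 & src2; }
```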

//===----------------------------------------------------------------------===//
// MULX Instruction
//
58 changes: 43 additions & 15 deletions llvm/lib/Target/X86/X86InstrMisc.td
@@ -1241,43 +1241,49 @@ let Predicates = [HasBMI, In64BitMode], Defs = [EFLAGS] in {
defm BLSI64 : Bls<"blsi", MRM3r, MRM3m, Xi64, "_EVEX">, EVEX;
}

let Predicates = [HasBMI] in {
multiclass Bls_Pats<string suffix> {
// FIXME(1): patterns for the load versions are not implemented
// FIXME(2): By only matching `add_su` and `ineg_su` we may emit
// extra `mov` instructions if `src` has future uses. It may be better
// to always match if `src` has more users.
def : Pat<(and GR32:$src, (add_su GR32:$src, -1)),
(BLSR32rr GR32:$src)>;
(!cast<Instruction>(BLSR32rr#suffix) GR32:$src)>;
def : Pat<(and GR64:$src, (add_su GR64:$src, -1)),
(BLSR64rr GR64:$src)>;
(!cast<Instruction>(BLSR64rr#suffix) GR64:$src)>;

def : Pat<(xor GR32:$src, (add_su GR32:$src, -1)),
(BLSMSK32rr GR32:$src)>;
(!cast<Instruction>(BLSMSK32rr#suffix) GR32:$src)>;
def : Pat<(xor GR64:$src, (add_su GR64:$src, -1)),
(BLSMSK64rr GR64:$src)>;
(!cast<Instruction>(BLSMSK64rr#suffix) GR64:$src)>;

def : Pat<(and GR32:$src, (ineg_su GR32:$src)),
(BLSI32rr GR32:$src)>;
(!cast<Instruction>(BLSI32rr#suffix) GR32:$src)>;
def : Pat<(and GR64:$src, (ineg_su GR64:$src)),
(BLSI64rr GR64:$src)>;
(!cast<Instruction>(BLSI64rr#suffix) GR64:$src)>;

// Versions to match flag producing ops.
def : Pat<(and_flag_nocf GR32:$src, (add_su GR32:$src, -1)),
(BLSR32rr GR32:$src)>;
(!cast<Instruction>(BLSR32rr#suffix) GR32:$src)>;
def : Pat<(and_flag_nocf GR64:$src, (add_su GR64:$src, -1)),
(BLSR64rr GR64:$src)>;
(!cast<Instruction>(BLSR64rr#suffix) GR64:$src)>;

def : Pat<(xor_flag_nocf GR32:$src, (add_su GR32:$src, -1)),
(BLSMSK32rr GR32:$src)>;
(!cast<Instruction>(BLSMSK32rr#suffix) GR32:$src)>;
def : Pat<(xor_flag_nocf GR64:$src, (add_su GR64:$src, -1)),
(BLSMSK64rr GR64:$src)>;
(!cast<Instruction>(BLSMSK64rr#suffix) GR64:$src)>;

def : Pat<(and_flag_nocf GR32:$src, (ineg_su GR32:$src)),
(BLSI32rr GR32:$src)>;
(!cast<Instruction>(BLSI32rr#suffix) GR32:$src)>;
def : Pat<(and_flag_nocf GR64:$src, (ineg_su GR64:$src)),
(BLSI64rr GR64:$src)>;
(!cast<Instruction>(BLSI64rr#suffix) GR64:$src)>;
}

let Predicates = [HasBMI, NoEGPR] in
defm : Bls_Pats<"">;

let Predicates = [HasBMI, HasEGPR] in
defm : Bls_Pats<"_EVEX">;
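The bit-manipulation identities these patterns match, as a small C++ sketch (illustrative, not LLVM code; the `add_su`/`ineg_su` fragments in the patterns restrict the match to single-use operands, which is what the FIXME above is about):

```cpp
#include <cstdint>

uint32_t blsr(uint32_t x)   { return x & (x - 1); }  // clear lowest set bit
uint32_t blsmsk(uint32_t x) { return x ^ (x - 1); }  // mask up through lowest set bit
uint32_t blsi(uint32_t x)   { return x & (0u - x); } // isolate lowest set bit
```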

multiclass Bmi4VOp3<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node,
X86FoldableSchedWrite sched, string Suffix = ""> {
let SchedRW = [sched], Form = MRMSrcReg4VOp3 in
@@ -1324,7 +1330,7 @@ def AndMask64 : ImmLeaf<i64, [{
}]>;

// Use BEXTR for 64-bit 'and' with large immediate 'mask'.
let Predicates = [HasBMI, NoBMI2, NoTBM] in {
let Predicates = [HasBMI, NoBMI2, NoTBM, NoEGPR] in {
def : Pat<(and GR64:$src, AndMask64:$mask),
(BEXTR64rr GR64:$src,
(SUBREG_TO_REG (i64 0),
@@ -1335,8 +1341,19 @@ let Predicates = [HasBMI, NoBMI2, NoTBM] in {
(MOV32ri (BEXTRMaskXForm imm:$mask)), sub_32bit))>;
}

let Predicates = [HasBMI, NoBMI2, NoTBM, HasEGPR] in {
def : Pat<(and GR64:$src, AndMask64:$mask),
(BEXTR64rr_EVEX GR64:$src,
(SUBREG_TO_REG (i64 0),
(MOV32ri (BEXTRMaskXForm imm:$mask)), sub_32bit))>;
def : Pat<(and (loadi64 addr:$src), AndMask64:$mask),
(BEXTR64rm_EVEX addr:$src,
(SUBREG_TO_REG (i64 0),
(MOV32ri (BEXTRMaskXForm imm:$mask)), sub_32bit))>;
}

// Use BZHI for 64-bit 'and' with large immediate 'mask'.
let Predicates = [HasBMI2, NoTBM] in {
let Predicates = [HasBMI2, NoTBM, NoEGPR] in {
def : Pat<(and GR64:$src, AndMask64:$mask),
(BZHI64rr GR64:$src,
(INSERT_SUBREG (i64 (IMPLICIT_DEF)),
@@ -1347,6 +1364,17 @@ let Predicates = [HasBMI2, NoTBM] in {
(MOV8ri (CountTrailingOnes imm:$mask)), sub_8bit))>;
}

let Predicates = [HasBMI2, NoTBM, HasEGPR] in {
def : Pat<(and GR64:$src, AndMask64:$mask),
(BZHI64rr_EVEX GR64:$src,
(INSERT_SUBREG (i64 (IMPLICIT_DEF)),
(MOV8ri (CountTrailingOnes imm:$mask)), sub_8bit))>;
def : Pat<(and (loadi64 addr:$src), AndMask64:$mask),
(BZHI64rm_EVEX addr:$src,
(INSERT_SUBREG (i64 (IMPLICIT_DEF)),
(MOV8ri (CountTrailingOnes imm:$mask)), sub_8bit))>;
}
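What these EVEX patterns (like their legacy counterparts above) rewrite: a 64-bit AND with a contiguous low-bit mask too wide for a 32-bit immediate becomes a BZHI whose bound is the mask's trailing-ones count. A hedged C++ sketch of the equivalence (`bzhi64` is my stand-in for the instruction; n < 64 assumed):

```cpp
#include <cassert>
#include <cstdint>

// BZHI64 semantics for n < 64 (illustrative).
uint64_t bzhi64(uint64_t x, unsigned n) { return x & ((1ull << n) - 1); }

int main() {
  uint64_t mask = 0x3FFFFFFFFFFull;      // 42 trailing ones: no imm32 encoding
  uint64_t x = 0x123456789ABCDEF0ull;
  assert((x & mask) == bzhi64(x, 42));   // CountTrailingOnes(mask) == 42
  return 0;
}
```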

multiclass bmi_pdep_pext<string mnemonic, RegisterClass RC,
X86MemOperand x86memop, SDPatternOperator OpNode,
PatFrag ld_frag, string Suffix = ""> {
47 changes: 27 additions & 20 deletions llvm/lib/Target/X86/X86InstrShiftRotate.td
@@ -284,32 +284,32 @@ defm SHRX64: ShiftX<"shrx", Xi64>, XD;
defm SHLX32: ShiftX<"shlx", Xi32>, PD;
defm SHLX64: ShiftX<"shlx", Xi64>, PD;

multiclass RORX_Pats {
multiclass RORX_Pats<string suffix> {
// Prefer RORX which is non-destructive and doesn't update EFLAGS.
let AddedComplexity = 10 in {
def : Pat<(rotr GR32:$src, (i8 imm:$shamt)),
(RORX32ri GR32:$src, imm:$shamt)>;
(!cast<Instruction>(RORX32ri#suffix) GR32:$src, imm:$shamt)>;
def : Pat<(rotr GR64:$src, (i8 imm:$shamt)),
(RORX64ri GR64:$src, imm:$shamt)>;
(!cast<Instruction>(RORX64ri#suffix) GR64:$src, imm:$shamt)>;

def : Pat<(rotl GR32:$src, (i8 imm:$shamt)),
(RORX32ri GR32:$src, (ROT32L2R_imm8 imm:$shamt))>;
(!cast<Instruction>(RORX32ri#suffix) GR32:$src, (ROT32L2R_imm8 imm:$shamt))>;
def : Pat<(rotl GR64:$src, (i8 imm:$shamt)),
(RORX64ri GR64:$src, (ROT64L2R_imm8 imm:$shamt))>;
(!cast<Instruction>(RORX64ri#suffix) GR64:$src, (ROT64L2R_imm8 imm:$shamt))>;
}

def : Pat<(rotr (loadi32 addr:$src), (i8 imm:$shamt)),
(RORX32mi addr:$src, imm:$shamt)>;
(!cast<Instruction>(RORX32mi#suffix) addr:$src, imm:$shamt)>;
def : Pat<(rotr (loadi64 addr:$src), (i8 imm:$shamt)),
(RORX64mi addr:$src, imm:$shamt)>;
(!cast<Instruction>(RORX64mi#suffix) addr:$src, imm:$shamt)>;

def : Pat<(rotl (loadi32 addr:$src), (i8 imm:$shamt)),
(RORX32mi addr:$src, (ROT32L2R_imm8 imm:$shamt))>;
(!cast<Instruction>(RORX32mi#suffix) addr:$src, (ROT32L2R_imm8 imm:$shamt))>;
def : Pat<(rotl (loadi64 addr:$src), (i8 imm:$shamt)),
(RORX64mi addr:$src, (ROT64L2R_imm8 imm:$shamt))>;
(!cast<Instruction>(RORX64mi#suffix) addr:$src, (ROT64L2R_imm8 imm:$shamt))>;
}
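`ROT32L2R_imm8`/`ROT64L2R_imm8` let a single RORX pattern serve both rotate directions, using the identity rotl(x, s) == rotr(x, width - s). A minimal C++ check of that identity (illustrative only):

```cpp
#include <cassert>
#include <cstdint>

uint32_t rotr32(uint32_t x, unsigned s) {
  s &= 31;
  return (x >> s) | (x << ((32 - s) & 31));
}

int main() {
  uint32_t x = 0xDEADBEEFu;
  unsigned s = 7;
  uint32_t rotl = (x << s) | (x >> (32 - s));
  assert(rotl == rotr32(x, 32 - s)); // rotl(x, s) == rotr(x, 32 - s)
  return 0;
}
```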

multiclass ShiftX_Pats<SDNode op> {
multiclass ShiftX_Pats<SDNode op, string suffix = ""> {
// Prefer SARX/SHRX/SHLX over SAR/SHR/SHL with variable shift BUT not
// immediate shift, i.e. the following code is considered better
//
@@ -325,16 +325,16 @@ multiclass ShiftX_Pats<SDNode op> {
//
let AddedComplexity = 1 in {
def : Pat<(op GR32:$src1, GR8:$src2),
(!cast<Instruction>(NAME#"32rr") GR32:$src1,
(!cast<Instruction>(NAME#"32rr"#suffix) GR32:$src1,
(INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
def : Pat<(op GR64:$src1, GR8:$src2),
(!cast<Instruction>(NAME#"64rr") GR64:$src1,
(!cast<Instruction>(NAME#"64rr"#suffix) GR64:$src1,
(INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
def : Pat<(op GR32:$src1, (shiftMask32 GR8:$src2)),
(!cast<Instruction>(NAME#"32rr") GR32:$src1,
(!cast<Instruction>(NAME#"32rr"#suffix) GR32:$src1,
(INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
def : Pat<(op GR64:$src1, (shiftMask64 GR8:$src2)),
(!cast<Instruction>(NAME#"64rr") GR64:$src1,
(!cast<Instruction>(NAME#"64rr"#suffix) GR64:$src1,
(INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
}
// We prefer to use
Expand All @@ -348,22 +348,29 @@ multiclass ShiftX_Pats<SDNode op> {
//
// This priority is enforced by IsProfitableToFoldLoad.
def : Pat<(op (loadi32 addr:$src1), GR8:$src2),
(!cast<Instruction>(NAME#"32rm") addr:$src1,
(!cast<Instruction>(NAME#"32rm"#suffix) addr:$src1,
(INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
def : Pat<(op (loadi64 addr:$src1), GR8:$src2),
(!cast<Instruction>(NAME#"64rm") addr:$src1,
(!cast<Instruction>(NAME#"64rm"#suffix) addr:$src1,
(INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
def : Pat<(op (loadi32 addr:$src1), (shiftMask32 GR8:$src2)),
(!cast<Instruction>(NAME#"32rm") addr:$src1,
(!cast<Instruction>(NAME#"32rm"#suffix) addr:$src1,
(INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
def : Pat<(op (loadi64 addr:$src1), (shiftMask64 GR8:$src2)),
(!cast<Instruction>(NAME#"64rm") addr:$src1,
(!cast<Instruction>(NAME#"64rm"#suffix) addr:$src1,
(INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
}
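The `shiftMask32`/`shiftMask64` patterns above rely on the BMI2 shifts masking their amount in hardware (to 5 bits for 32-bit operands, 6 bits for 64-bit), so an explicit mask in the IR is redundant and can be folded into the instruction. Sketch of the 32-bit case (illustrative; `shlx32` is my name for the semantics):

```cpp
#include <cstdint>

// SHLX32 semantics: the shift amount is masked to 5 bits by the hardware,
// so shlx32(x, s) == shlx32(x, s & 31) and an IR-level `s & 31` folds away.
uint32_t shlx32(uint32_t x, uint8_t amount) { return x << (amount & 31); }
```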

let Predicates = [HasBMI2] in {
defm : RORX_Pats;
let Predicates = [HasBMI2, NoEGPR] in {
defm : RORX_Pats<"">;
defm SARX : ShiftX_Pats<sra>;
defm SHRX : ShiftX_Pats<srl>;
defm SHLX : ShiftX_Pats<shl>;
}

let Predicates = [HasBMI2, HasEGPR] in {
defm : RORX_Pats<"_EVEX">;
defm SARX : ShiftX_Pats<sra, "_EVEX">;
defm SHRX : ShiftX_Pats<srl, "_EVEX">;
defm SHLX : ShiftX_Pats<shl, "_EVEX">;
}