[X86][NFC] Rename variables/passes for EVEX compression optimization

KanRobert · KanRobert · commit a5902a4d2425 · 2024-01-06T12:41:09.000+08:00
RFC: https://discourse.llvm.org/t/rfc-design-for-apx-feature-egpr-and-ndd-support/73031 APX introduces EGPR, NDD and NF instructions. In addition to compressing EVEX encoded AVX512 instructions into VEX encoding, we also have several more possible optimizations. a. Promoted instruction (EVEX space) -> pre-promotion instruction (legacy space) b. NDD (EVEX space) -> non-NDD (legacy space) c. NF_ND (EVEX space) -> NF (EVEX space) The first two types of compression can usually reduce code size, while the third type of compression can help hardware decode although the instruction length remains unchanged. So we do the renaming for the upcoming APX optimizations. BTW, I clang-format the code in X86CompressEVEX.cpp, X86CompressEVEXTablesEmitter.cpp. This patch also extracts the NFC in llvm#77065 into a separate commit.
diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt
@@ -8,7 +8,7 @@ tablegen(LLVM X86GenAsmWriter1.inc -gen-asm-writer -asmwriternum=1)
 tablegen(LLVM X86GenCallingConv.inc -gen-callingconv)
 tablegen(LLVM X86GenDAGISel.inc -gen-dag-isel)
 tablegen(LLVM X86GenDisassemblerTables.inc -gen-disassembler)
-tablegen(LLVM X86GenEVEX2VEXTables.inc -gen-x86-EVEX2VEX-tables)
+tablegen(LLVM X86GenCompressEVEXTables.inc -gen-x86-compress-evex-tables)
 tablegen(LLVM X86GenExegesis.inc -gen-exegesis)
 tablegen(LLVM X86GenFastISel.inc -gen-fast-isel)
 tablegen(LLVM X86GenGlobalISel.inc -gen-global-isel)
@@ -61,7 +61,7 @@ set(sources
   X86InstrFMA3Info.cpp
   X86InstrFoldTables.cpp
   X86InstrInfo.cpp
-  X86EvexToVex.cpp
+  X86CompressEVEX.cpp
   X86LoadValueInjectionLoadHardening.cpp
   X86LoadValueInjectionRetHardening.cpp
   X86MCInstLower.cpp
diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h
@@ -131,9 +131,9 @@ FunctionPass *createX86FixupBWInsts();
 /// to another, when profitable.
 FunctionPass *createX86DomainReassignmentPass();
 
-/// This pass replaces EVEX encoded of AVX-512 instructiosn by VEX
-/// encoding when possible in order to reduce code size.
-FunctionPass *createX86EvexToVexInsts();
+/// This pass compress instructions from EVEX space to legacy/VEX/EVEX space when
+/// possible in order to reduce code size or facilitate HW decoding.
+FunctionPass *createX86CompressEVEXPass();
 
 /// This pass creates the thunks for the retpoline feature.
 FunctionPass *createX86IndirectThunksPass();
@@ -167,7 +167,7 @@ FunctionPass *createX86SpeculativeLoadHardeningPass();
 FunctionPass *createX86SpeculativeExecutionSideEffectSuppression();
 FunctionPass *createX86ArgumentStackSlotPass();
 
-void initializeEvexToVexInstPassPass(PassRegistry &);
+void initializeCompressEVEXPassPass(PassRegistry &);
 void initializeFPSPass(PassRegistry &);
 void initializeFixupBWInstPassPass(PassRegistry &);
 void initializeFixupLEAPassPass(PassRegistry &);
diff --git a/llvm/lib/Target/X86/X86CompressEVEX.cpp b/llvm/lib/Target/X86/X86CompressEVEX.cpp
@@ -1,23 +1,39 @@
-//===- X86EvexToVex.cpp ---------------------------------------------------===//
-// Compress EVEX instructions to VEX encoding when possible to reduce code size
+//===- X86CompressEVEX.cpp ------------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
-/// \file
-/// This file defines the pass that goes over all AVX-512 instructions which
-/// are encoded using the EVEX prefix and if possible replaces them by their
-/// corresponding VEX encoding which is usually shorter by 2 bytes.
-/// EVEX instructions may be encoded via the VEX prefix when the AVX-512
-/// instruction has a corresponding AVX/AVX2 opcode, when vector length
-/// accessed by instruction is less than 512 bits and when it does not use
-//  the xmm or the mask registers or xmm/ymm registers with indexes higher
-//  than 15.
-/// The pass applies code reduction on the generated code for AVX-512 instrs.
+// This pass compresses instructions from EVEX space to legacy/VEX/EVEX space
+// when possible in order to reduce code size or facilitate HW decoding.
 //
+// Possible compression:
+//   a. AVX512 instruction (EVEX) -> AVX instruction (VEX)
+//   b. Promoted instruction (EVEX) -> pre-promotion instruction (legacy)
+//   c. NDD (EVEX) -> non-NDD (legacy)
+//   d. NF_ND (EVEX) -> NF (EVEX)
+//
+// Compression a, b and c always reduce code size (some exception)
+// fourth type of compression can help hardware decode although the instruction
+// length remains unchanged.
+//
+// Compression a, b and c can always reduce code size, with some exceptions
+// such as promoted 16-bit CRC32 which is as long as the legacy version.
+//
+// legacy:
+//   crc32w %si, %eax ## encoding: [0x66,0xf2,0x0f,0x38,0xf1,0xc6]
+// promoted:
+//   crc32w %si, %eax ## encoding: [0x62,0xf4,0x7d,0x08,0xf1,0xc6]
+//
+// From performance perspective, these should be same (same uops and same EXE
+// ports). From a FMV perspective, an older legacy encoding is preferred b/c it
+// can execute in more places (broader HW install base). So we will still do
+// the compression.
+//
+// Compression d can help hardware decode (HW may skip reading the NDD
+// register) although the instruction length remains unchanged.
 //===----------------------------------------------------------------------===//
 
 #include "MCTargetDesc/X86BaseInfo.h"
@@ -38,37 +54,34 @@
 
 using namespace llvm;
 
-// Including the generated EVEX2VEX tables.
-struct X86EvexToVexCompressTableEntry {
-  uint16_t EvexOpc;
-  uint16_t VexOpc;
+// Including the generated EVEX compression tables.
+struct X86CompressEVEXTableEntry {
+  uint16_t OldOpc;
+  uint16_t NewOpc;
 
-  bool operator<(const X86EvexToVexCompressTableEntry &RHS) const {
-    return EvexOpc < RHS.EvexOpc;
+  bool operator<(const X86CompressEVEXTableEntry &RHS) const {
+    return OldOpc < RHS.OldOpc;
   }
 
-  friend bool operator<(const X86EvexToVexCompressTableEntry &TE,
-                        unsigned Opc) {
-    return TE.EvexOpc < Opc;
+  friend bool operator<(const X86CompressEVEXTableEntry &TE, unsigned Opc) {
+    return TE.OldOpc < Opc;
   }
 };
-#include "X86GenEVEX2VEXTables.inc"
+#include "X86GenCompressEVEXTables.inc"
 
-#define EVEX2VEX_DESC "Compressing EVEX instrs to VEX encoding when possible"
-#define EVEX2VEX_NAME "x86-evex-to-vex-compress"
+#define COMP_EVEX_DESC "Compressing EVEX instrs when possible"
+#define COMP_EVEX_NAME "x86-compress-evex"
 
-#define DEBUG_TYPE EVEX2VEX_NAME
+#define DEBUG_TYPE COMP_EVEX_NAME
 
 namespace {
 
-class EvexToVexInstPass : public MachineFunctionPass {
+class CompressEVEXPass : public MachineFunctionPass {
 public:
   static char ID;
-  EvexToVexInstPass() : MachineFunctionPass(ID) {}
-  StringRef getPassName() const override { return EVEX2VEX_DESC; }
+  CompressEVEXPass() : MachineFunctionPass(ID) {}
+  StringRef getPassName() const override { return COMP_EVEX_DESC; }
 
-  /// Loop over all of the basic blocks, replacing EVEX instructions
-  /// by equivalent VEX instructions when possible for reducing code size.
   bool runOnMachineFunction(MachineFunction &MF) override;
 
   // This pass runs after regalloc and doesn't support VReg operands.
@@ -80,7 +93,7 @@ class EvexToVexInstPass : public MachineFunctionPass {
 
 } // end anonymous namespace
 
-char EvexToVexInstPass::ID = 0;
+char CompressEVEXPass::ID = 0;
 
 static bool usesExtendedRegister(const MachineInstr &MI) {
   auto isHiRegIdx = [](unsigned Reg) {
@@ -112,8 +125,8 @@ static bool usesExtendedRegister(const MachineInstr &MI) {
   return false;
 }
 
-static bool checkVEXInstPredicate(unsigned EvexOpc, const X86Subtarget &ST) {
-  switch (EvexOpc) {
+static bool checkVEXInstPredicate(unsigned OldOpc, const X86Subtarget &ST) {
+  switch (OldOpc) {
   default:
     return true;
   case X86::VCVTNEPS2BF16Z128rm:
@@ -151,15 +164,15 @@ static bool checkVEXInstPredicate(unsigned EvexOpc, const X86Subtarget &ST) {
 }
 
 // Do any custom cleanup needed to finalize the conversion.
-static bool performCustomAdjustments(MachineInstr &MI, unsigned VexOpc) {
-  (void)VexOpc;
+static bool performCustomAdjustments(MachineInstr &MI, unsigned NewOpc) {
+  (void)NewOpc;
   unsigned Opc = MI.getOpcode();
   switch (Opc) {
   case X86::VALIGNDZ128rri:
   case X86::VALIGNDZ128rmi:
   case X86::VALIGNQZ128rri:
   case X86::VALIGNQZ128rmi: {
-    assert((VexOpc == X86::VPALIGNRrri || VexOpc == X86::VPALIGNRrmi) &&
+    assert((NewOpc == X86::VPALIGNRrri || NewOpc == X86::VPALIGNRrmi) &&
            "Unexpected new opcode!");
     unsigned Scale =
         (Opc == X86::VALIGNQZ128rri || Opc == X86::VALIGNQZ128rmi) ? 8 : 4;
@@ -175,8 +188,8 @@ static bool performCustomAdjustments(MachineInstr &MI, unsigned VexOpc) {
   case X86::VSHUFI32X4Z256rri:
   case X86::VSHUFI64X2Z256rmi:
   case X86::VSHUFI64X2Z256rri: {
-    assert((VexOpc == X86::VPERM2F128rr || VexOpc == X86::VPERM2I128rr ||
-            VexOpc == X86::VPERM2F128rm || VexOpc == X86::VPERM2I128rm) &&
+    assert((NewOpc == X86::VPERM2F128rr || NewOpc == X86::VPERM2I128rr ||
+            NewOpc == X86::VPERM2F128rm || NewOpc == X86::VPERM2I128rm) &&
            "Unexpected new opcode!");
     MachineOperand &Imm = MI.getOperand(MI.getNumExplicitOperands() - 1);
     int64_t ImmVal = Imm.getImm();
@@ -200,7 +213,7 @@ static bool performCustomAdjustments(MachineInstr &MI, unsigned VexOpc) {
   case X86::VRNDSCALESDZm_Int:
   case X86::VRNDSCALESSZr_Int:
   case X86::VRNDSCALESSZm_Int:
-    const MachineOperand &Imm = MI.getOperand(MI.getNumExplicitOperands()-1);
+    const MachineOperand &Imm = MI.getOperand(MI.getNumExplicitOperands() - 1);
     int64_t ImmVal = Imm.getImm();
     // Ensure that only bits 3:0 of the immediate are used.
     if ((ImmVal & 0xf) != ImmVal)
@@ -239,28 +252,28 @@ static bool CompressEvexToVexImpl(MachineInstr &MI, const X86Subtarget &ST) {
     return false;
 
   // Use the VEX.L bit to select the 128 or 256-bit table.
-  ArrayRef<X86EvexToVexCompressTableEntry> Table =
+  ArrayRef<X86CompressEVEXTableEntry> Table =
       (Desc.TSFlags & X86II::VEX_L) ? ArrayRef(X86EvexToVex256CompressTable)
                                     : ArrayRef(X86EvexToVex128CompressTable);
 
-  unsigned EvexOpc = MI.getOpcode();
-  const auto *I = llvm::lower_bound(Table, EvexOpc);
-  if (I == Table.end() || I->EvexOpc != EvexOpc)
+  unsigned Opc = MI.getOpcode();
+  const auto *I = llvm::lower_bound(Table, Opc);
+  if (I == Table.end() || I->OldOpc != Opc)
     return false;
 
   if (usesExtendedRegister(MI))
     return false;
-  if (!checkVEXInstPredicate(EvexOpc, ST))
+  if (!checkVEXInstPredicate(Opc, ST))
     return false;
-  if (!performCustomAdjustments(MI, I->VexOpc))
+  if (!performCustomAdjustments(MI, I->NewOpc))
     return false;
 
-  MI.setDesc(ST.getInstrInfo()->get(I->VexOpc));
+  MI.setDesc(ST.getInstrInfo()->get(I->NewOpc));
   MI.setAsmPrinterFlag(X86::AC_EVEX_2_VEX);
   return true;
 }
 
-bool EvexToVexInstPass::runOnMachineFunction(MachineFunction &MF) {
+bool CompressEVEXPass::runOnMachineFunction(MachineFunction &MF) {
 #ifndef NDEBUG
   // Make sure the tables are sorted.
   static std::atomic<bool> TableChecked(false);
@@ -289,8 +302,8 @@ bool EvexToVexInstPass::runOnMachineFunction(MachineFunction &MF) {
   return Changed;
 }
 
-INITIALIZE_PASS(EvexToVexInstPass, EVEX2VEX_NAME, EVEX2VEX_DESC, false, false)
+INITIALIZE_PASS(CompressEVEXPass, COMP_EVEX_NAME, COMP_EVEX_DESC, false, false)
 
-FunctionPass *llvm::createX86EvexToVexInsts() {
-  return new EvexToVexInstPass();
+FunctionPass *llvm::createX86CompressEVEXPass() {
+  return new CompressEVEXPass();
 }
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -75,7 +75,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() {
   initializeGlobalISel(PR);
   initializeWinEHStatePassPass(PR);
   initializeFixupBWInstPassPass(PR);
-  initializeEvexToVexInstPassPass(PR);
+  initializeCompressEVEXPassPass(PR);
   initializeFixupLEAPassPass(PR);
   initializeFPSPass(PR);
   initializeX86FixupSetCCPassPass(PR);
@@ -575,7 +575,7 @@ void X86PassConfig::addPreEmitPass() {
     addPass(createX86FixupInstTuning());
     addPass(createX86FixupVectorConstants());
   }
-  addPass(createX86EvexToVexInsts());
+  addPass(createX86CompressEVEXPass());
   addPass(createX86DiscriminateMemOpsPass());
   addPass(createX86InsertPrefetchPass());
   addPass(createX86InsertX87waitPass());
diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll
@@ -68,7 +68,7 @@
 ; CHECK-NEXT:       Implement the 'patchable-function' attribute
 ; CHECK-NEXT:       X86 Indirect Branch Tracking
 ; CHECK-NEXT:       X86 vzeroupper inserter
-; CHECK-NEXT:       Compressing EVEX instrs to VEX encoding when possibl
+; CHECK-NEXT:       Compressing EVEX instrs when possible
 ; CHECK-NEXT:       X86 Discriminate Memory Operands
 ; CHECK-NEXT:       X86 Insert Cache Prefetches
 ; CHECK-NEXT:       X86 insert wait instruction
diff --git a/llvm/test/CodeGen/X86/evex-to-vex-compress.mir b/llvm/test/CodeGen/X86/evex-to-vex-compress.mir
@@ -1,4 +1,4 @@
-# RUN: llc -mtriple=x86_64-- -run-pass x86-evex-to-vex-compress -verify-machineinstrs -mcpu=skx -o - %s | FileCheck %s
+# RUN: llc -mtriple=x86_64-- -run-pass x86-compress-evex -verify-machineinstrs -mcpu=skx -o - %s | FileCheck %s
 # This test verifies VEX encoding for AVX-512 instructions that use registers of low indexes and
 # do not use zmm or mask registers and have a corresponding AVX/AVX2 opcode
 
diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll
@@ -205,7 +205,7 @@
 ; CHECK-NEXT:       X86 LEA Fixup
 ; CHECK-NEXT:       X86 Fixup Inst Tuning
 ; CHECK-NEXT:       X86 Fixup Vector Constants
-; CHECK-NEXT:       Compressing EVEX instrs to VEX encoding when possible
+; CHECK-NEXT:       Compressing EVEX instrs when possible
 ; CHECK-NEXT:       X86 Discriminate Memory Operands
 ; CHECK-NEXT:       X86 Insert Cache Prefetches
 ; CHECK-NEXT:       X86 insert wait instruction
diff --git a/llvm/utils/TableGen/CMakeLists.txt b/llvm/utils/TableGen/CMakeLists.txt
@@ -82,7 +82,7 @@ add_tablegen(llvm-tblgen LLVM
   Types.cpp
   VarLenCodeEmitterGen.cpp
   X86DisassemblerTables.cpp
-  X86EVEX2VEXTablesEmitter.cpp
+  X86CompressEVEXTablesEmitter.cpp
   X86FoldTablesEmitter.cpp
   X86MnemonicTables.cpp
   X86ModRMFilters.cpp
diff --git a/llvm/utils/TableGen/X86CompressEVEXTablesEmitter.cpp b/llvm/utils/TableGen/X86CompressEVEXTablesEmitter.cpp

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-# RUN: llc -mtriple=x86_64-- -run-pass x86-evex-to-vex-compress -verify-machineinstrs -mcpu=skx -o - %s \| FileCheck %s`
	`1`	`+# RUN: llc -mtriple=x86_64-- -run-pass x86-compress-evex -verify-machineinstrs -mcpu=skx -o - %s \| FileCheck %s`
`2`	`2`	`# This test verifies VEX encoding for AVX-512 instructions that use registers of low indexes and`
`3`	`3`	`# do not use zmm or mask registers and have a corresponding AVX/AVX2 opcode`
`4`	`4`