@@ -64,6 +64,8 @@ STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
6464STATISTIC (NumLoadsFromStoresPromoted, " Number of loads from stores promoted" );
6565STATISTIC (NumFailedAlignmentCheck, " Number of load/store pair transformation "
6666 " not passed the alignment check" );
67+ STATISTIC (NumConstOffsetFolded,
68+ " Number of const offset of index address folded" );
6769
6870DEBUG_COUNTER (RegRenamingCounter, DEBUG_TYPE " -reg-renaming" ,
6971 " Controls which pairs are considered for renaming" );
@@ -77,6 +79,11 @@ static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
7779static cl::opt<unsigned > UpdateLimit (" aarch64-update-scan-limit" , cl::init(100 ),
7880 cl::Hidden);
7981
82+ // The LdStConstLimit limits how far we search for const offset instructions
83+ // when we form index address load/store instructions.
84+ static cl::opt<unsigned > LdStConstLimit (" aarch64-load-store-const-scan-limit" ,
85+ cl::init (10 ), cl::Hidden);
86+
8087// Enable register renaming to find additional store pairing opportunities.
8188static cl::opt<bool > EnableRenaming (" aarch64-load-store-renaming" ,
8289 cl::init (true ), cl::Hidden);
@@ -173,6 +180,13 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
173180 findMatchingUpdateInsnForward (MachineBasicBlock::iterator I,
174181 int UnscaledOffset, unsigned Limit);
175182
183+ // Scan the instruction list backwards to find a register assigned a constant
184+ // value (a MOVZ/MOVK pair) that can be folded into the current instruction
185+ // (a register-offset load or store), turning it into a base + immediate access.
186+ MachineBasicBlock::iterator
187+ findMatchingConstOffsetBackward (MachineBasicBlock::iterator I, unsigned Limit,
188+ unsigned &Offset);
189+
176190 // Scan the instruction list to find a base register update that can
177191 // be combined with the current instruction (a load or store) using
178192 // pre or post indexed addressing with writeback. Scan backwards.
@@ -184,11 +198,19 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
184198 bool isMatchingUpdateInsn (MachineInstr &MemMI, MachineInstr &MI,
185199 unsigned BaseReg, int Offset);
186200
201+ bool isMatchingMovConstInsn (MachineInstr &MemMI, MachineInstr &MI,
202+ unsigned IndexReg, unsigned &Offset);
203+
187204 // Merge a pre- or post-index base register update into a ld/st instruction.
188205 MachineBasicBlock::iterator
189206 mergeUpdateInsn (MachineBasicBlock::iterator I,
190207 MachineBasicBlock::iterator Update, bool IsPreIdx);
191208
209+ MachineBasicBlock::iterator
210+ mergeConstOffsetInsn (MachineBasicBlock::iterator I,
211+ MachineBasicBlock::iterator Update, unsigned Offset,
212+ int Scale);
213+
192214 // Find and merge zero store instructions.
193215 bool tryToMergeZeroStInst (MachineBasicBlock::iterator &MBBI);
194216
@@ -201,6 +223,9 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
201223 // Find and merge a base register updates before or after a ld/st instruction.
202224 bool tryToMergeLdStUpdate (MachineBasicBlock::iterator &MBBI);
203225
226+ // Find an index (reg+reg) ld/st whose index register holds a constant and
226+ // rewrite it as a base address (reg+imm) ld/st instruction.
227+ bool tryToMergeIndexLdSt (MachineBasicBlock::iterator &MBBI, int Scale);
228+
204229 bool optimizeBlock (MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
205230
206231 bool runOnMachineFunction (MachineFunction &Fn) override ;
@@ -483,6 +508,16 @@ static unsigned getPreIndexedOpcode(unsigned Opc) {
483508 }
484509}
485510
511+ static unsigned getBaseAddressOpcode (unsigned Opc) {
512+ // TODO: Add more index address loads/stores.
513+ switch (Opc) {
514+ default :
515+ llvm_unreachable (" Opcode has no base address equivalent!" );
516+ case AArch64::LDRBBroX:
517+ return AArch64::LDRBBui;
518+ }
519+ }
520+
486521static unsigned getPostIndexedOpcode (unsigned Opc) {
487522 switch (Opc) {
488523 default :
@@ -724,6 +759,20 @@ static bool isMergeableLdStUpdate(MachineInstr &MI) {
724759 }
725760}
726761
762+ // Make sure this is a reg+reg Ld/St
763+ static bool isMergeableIndexLdSt (MachineInstr &MI, int &Scale) {
764+ unsigned Opc = MI.getOpcode ();
765+ switch (Opc) {
766+ default :
767+ return false ;
768+ // Scaled instructions.
769+ // TODO: Add more index address loads/stores.
770+ case AArch64::LDRBBroX:
771+ Scale = 1 ;
772+ return true ;
773+ }
774+ }
775+
727776static bool isRewritableImplicitDef (unsigned Opc) {
728777 switch (Opc) {
729778 default :
@@ -2053,6 +2102,63 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
20532102 return NextI;
20542103}
20552104
2105+ MachineBasicBlock::iterator
2106+ AArch64LoadStoreOpt::mergeConstOffsetInsn (MachineBasicBlock::iterator I,
2107+ MachineBasicBlock::iterator Update,
2108+ unsigned Offset, int Scale) {
2109+ assert ((Update->getOpcode () == AArch64::MOVKWi) &&
2110+ " Unexpected const mov instruction to merge!" );
2111+ MachineBasicBlock::iterator E = I->getParent ()->end ();
2112+ MachineBasicBlock::iterator NextI = next_nodbg (I, E);
2113+ MachineBasicBlock::iterator PrevI = prev_nodbg (Update, E);
2114+ MachineInstr &MemMI = *I;
2115+ unsigned Mask = (1 << 12 ) * Scale - 1 ;
2116+ unsigned Low = Offset & Mask;
2117+ unsigned High = Offset - Low;
2118+ Register BaseReg = AArch64InstrInfo::getLdStBaseOp (MemMI).getReg ();
2119+ Register IndexReg = AArch64InstrInfo::getLdStOffsetOp (MemMI).getReg ();
2120+ MachineInstrBuilder AddMIB, MemMIB;
2121+
2122+ // Add IndexReg, BaseReg, High (the BaseReg may be SP)
2123+ AddMIB =
2124+ BuildMI (*I->getParent (), I, I->getDebugLoc (), TII->get (AArch64::ADDXri))
2125+ .addDef (IndexReg)
2126+ .addUse (BaseReg)
2127+ .addImm (High >> 12 ) // shifted value
2128+ .addImm (12 ); // shift 12
2129+ (void )AddMIB;
2130+ // Ld/St DestReg, IndexReg, Imm12
2131+ unsigned NewOpc = getBaseAddressOpcode (I->getOpcode ());
2132+ MemMIB = BuildMI (*I->getParent (), I, I->getDebugLoc (), TII->get (NewOpc))
2133+ .add (getLdStRegOp (MemMI))
2134+ .add (AArch64InstrInfo::getLdStOffsetOp (MemMI))
2135+ .addImm (Low / Scale)
2136+ .setMemRefs (I->memoperands ())
2137+ .setMIFlags (I->mergeFlagsWith (*Update));
2138+ (void )MemMIB;
2139+
2140+ ++NumConstOffsetFolded;
2141+ LLVM_DEBUG (dbgs () << " Creating base address load/store.\n " );
2142+ LLVM_DEBUG (dbgs () << " Replacing instructions:\n " );
2143+ LLVM_DEBUG (PrevI->print (dbgs ()));
2144+ LLVM_DEBUG (dbgs () << " " );
2145+ LLVM_DEBUG (Update->print (dbgs ()));
2146+ LLVM_DEBUG (dbgs () << " " );
2147+ LLVM_DEBUG (I->print (dbgs ()));
2148+ LLVM_DEBUG (dbgs () << " with instruction:\n " );
2149+ LLVM_DEBUG (((MachineInstr *)AddMIB)->print (dbgs ()));
2150+ LLVM_DEBUG (dbgs () << " " );
2151+ LLVM_DEBUG (((MachineInstr *)MemMIB)->print (dbgs ()));
2152+ LLVM_DEBUG (dbgs () << " \n " );
2153+
2154+ // Erase the old instructions for the block.
2155+ I->eraseFromParent ();
2156+ PrevI->eraseFromParent ();
2157+ Update->eraseFromParent ();
2158+
2159+ return NextI;
2160+ }
2161+
20562162bool AArch64LoadStoreOpt::isMatchingUpdateInsn (MachineInstr &MemMI,
20572163 MachineInstr &MI,
20582164 unsigned BaseReg, int Offset) {
@@ -2100,6 +2206,34 @@ bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
21002206 return false ;
21012207}
21022208
2209+ bool AArch64LoadStoreOpt::isMatchingMovConstInsn (MachineInstr &MemMI,
2210+ MachineInstr &MI,
2211+ unsigned IndexReg,
2212+ unsigned &Offset) {
2213+ // The update instruction source and destination register must be the
2214+ // same as the load/store index register.
2215+ if (MI.getOpcode () == AArch64::MOVKWi &&
2216+ TRI->isSuperOrSubRegisterEq (IndexReg, MI.getOperand (1 ).getReg ())) {
2217+
2218+ // movz + movk hold a large offset of a Ld/St instruction.
2219+ MachineBasicBlock::iterator B = MI.getParent ()->begin ();
2220+ MachineBasicBlock::iterator MBBI = &MI;
2221+ // Skip the scene when the MI is the first instruction of a block.
2222+ if (MBBI == B)
2223+ return false ;
2224+ MBBI = prev_nodbg (MBBI, B);
2225+ MachineInstr &MovzMI = *MBBI;
2226+ if (MovzMI.getOpcode () == AArch64::MOVZWi) {
2227+ unsigned Low = MovzMI.getOperand (1 ).getImm ();
2228+ unsigned High = MI.getOperand (2 ).getImm () << MI.getOperand (3 ).getImm ();
2229+ Offset = High + Low;
2230+ // 12-bit optionally shifted immediates are legal for adds.
2231+ return Offset >> 24 == 0 ;
2232+ }
2233+ }
2234+ return false ;
2235+ }
2236+
21032237MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward (
21042238 MachineBasicBlock::iterator I, int UnscaledOffset, unsigned Limit) {
21052239 MachineBasicBlock::iterator E = I->getParent ()->end ();
@@ -2255,6 +2389,60 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
22552389 return E;
22562390}
22572391
2392+ MachineBasicBlock::iterator
2393+ AArch64LoadStoreOpt::findMatchingConstOffsetBackward (
2394+ MachineBasicBlock::iterator I, unsigned Limit, unsigned &Offset) {
2395+ MachineBasicBlock::iterator B = I->getParent ()->begin ();
2396+ MachineBasicBlock::iterator E = I->getParent ()->end ();
2397+ MachineInstr &MemMI = *I;
2398+ MachineBasicBlock::iterator MBBI = I;
2399+
2400+ // If the load is the first instruction in the block, there's obviously
2401+ // not any matching load or store.
2402+ if (MBBI == B)
2403+ return E;
2404+
2405+ // Make sure the IndexReg is killed and the shift amount is zero.
2406+ // TODO: Relex this restriction to extend, simplify processing now.
2407+ if (!AArch64InstrInfo::getLdStOffsetOp (MemMI).isKill () ||
2408+ !AArch64InstrInfo::getLdStAmountOp (MemMI).isImm () ||
2409+ (AArch64InstrInfo::getLdStAmountOp (MemMI).getImm () != 0 ))
2410+ return E;
2411+
2412+ Register IndexReg = AArch64InstrInfo::getLdStOffsetOp (MemMI).getReg ();
2413+
2414+ // Track which register units have been modified and used between the first
2415+ // insn (inclusive) and the second insn.
2416+ ModifiedRegUnits.clear ();
2417+ UsedRegUnits.clear ();
2418+ unsigned Count = 0 ;
2419+ do {
2420+ MBBI = prev_nodbg (MBBI, B);
2421+ MachineInstr &MI = *MBBI;
2422+
2423+ // Don't count transient instructions towards the search limit since there
2424+ // may be different numbers of them if e.g. debug information is present.
2425+ if (!MI.isTransient ())
2426+ ++Count;
2427+
2428+ // If we found a match, return it.
2429+ if (isMatchingMovConstInsn (*I, MI, IndexReg, Offset)) {
2430+ return MBBI;
2431+ }
2432+
2433+ // Update the status of what the instruction clobbered and used.
2434+ LiveRegUnits::accumulateUsedDefed (MI, ModifiedRegUnits, UsedRegUnits, TRI);
2435+
2436+ // Otherwise, if the index register is used or modified, we have no match,
2437+ // so return early.
2438+ if (!ModifiedRegUnits.available (IndexReg) ||
2439+ !UsedRegUnits.available (IndexReg))
2440+ return E;
2441+
2442+ } while (MBBI != B && Count < Limit);
2443+ return E;
2444+ }
2445+
22582446bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore (
22592447 MachineBasicBlock::iterator &MBBI) {
22602448 MachineInstr &MI = *MBBI;
@@ -2443,6 +2631,34 @@ bool AArch64LoadStoreOpt::tryToMergeLdStUpdate
24432631 return false ;
24442632}
24452633
2634+ bool AArch64LoadStoreOpt::tryToMergeIndexLdSt (MachineBasicBlock::iterator &MBBI,
2635+ int Scale) {
2636+ MachineInstr &MI = *MBBI;
2637+ MachineBasicBlock::iterator E = MI.getParent ()->end ();
2638+ MachineBasicBlock::iterator Update;
2639+
2640+ // Don't know how to handle unscaled pre/post-index versions below, so bail.
2641+ if (TII->hasUnscaledLdStOffset (MI.getOpcode ()))
2642+ return false ;
2643+
2644+ // Look back to try to find a const offset for index LdSt instruction. For
2645+ // example,
2646+ // mov x8, #LargeImm ; = a * (1<<12) + imm12
2647+ // ldr x1, [x0, x8]
2648+ // merged into:
2649+ // add x8, x0, a * (1<<12)
2650+ // ldr x1, [x8, imm12]
2651+ unsigned Offset;
2652+ Update = findMatchingConstOffsetBackward (MBBI, LdStConstLimit, Offset);
2653+ if (Update != E && (Offset & (Scale - 1 )) == 0 ) {
2654+ // Merge the imm12 into the ld/st.
2655+ MBBI = mergeConstOffsetInsn (MBBI, Update, Offset, Scale);
2656+ return true ;
2657+ }
2658+
2659+ return false ;
2660+ }
2661+
24462662bool AArch64LoadStoreOpt::optimizeBlock (MachineBasicBlock &MBB,
24472663 bool EnableNarrowZeroStOpt) {
24482664
@@ -2521,6 +2737,22 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
25212737 ++MBBI;
25222738 }
25232739
2740+ // 5) Find a register assigned a constant value that can be folded into
2741+ // the load or store. e.g.,
2742+ // mov x8, #LargeImm ; = a * (1<<12) + imm12
2743+ // ldr x1, [x0, x8]
2744+ // ; becomes
2745+ // add x8, x0, a * (1<<12)
2746+ // ldr x1, [x8, imm12]
2747+ for (MachineBasicBlock::iterator MBBI = MBB.begin (), E = MBB.end ();
2748+ MBBI != E;) {
2749+ int Scale;
2750+ if (isMergeableIndexLdSt (*MBBI, Scale) && tryToMergeIndexLdSt (MBBI, Scale))
2751+ Modified = true ;
2752+ else
2753+ ++MBBI;
2754+ }
2755+
25242756 return Modified;
25252757}
25262758
0 commit comments