diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 43574a54c37dd..a6e14dd6b581e 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -436,6 +436,8 @@ class CodeGenPrepare {
   bool optimizeExt(Instruction *&I);
   bool optimizeExtUses(Instruction *I);
   bool optimizeLoadExt(LoadInst *Load);
+  bool optimizeStoreMisalign(StoreInst *SI);
+  bool optimizeLoadMisalign(LoadInst *LI);
   bool optimizeShiftInst(BinaryOperator *BO);
   bool optimizeFunnelShift(IntrinsicInst *Fsh);
   bool optimizeSelectInst(SelectInst *SI);
@@ -7353,6 +7355,138 @@ bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) {
   return true;
 }
 
+/// Return true if \p I is a load or store whose alignment is too small for
+/// its type, which the target does not handle natively, and whose pointer is
+/// known to reach a better-aligned address after the first small chunk.
+static bool isOptimizeMisalignCandidate(Instruction *I, const DataLayout *DL,
+                                        const TargetLowering *TLI,
+                                        const DominatorTree *DT) {
+  if (!isa<StoreInst>(I) && !isa<LoadInst>(I))
+    return false;
+
+  Value *Ptr = I->getOperand(isa<StoreInst>(I) ? 1 : 0);
+  Align Alignment = isa<StoreInst>(I) ? cast<StoreInst>(I)->getAlign()
+                                      : cast<LoadInst>(I)->getAlign();
+  Type *ValTy = isa<StoreInst>(I) ? I->getOperand(0)->getType() : I->getType();
+
+  if (ValTy->isScalableTy() || !ValTy->isSized())
+    return false;
+
+  unsigned BitWidth = DL->getTypeSizeInBits(ValTy);
+
+  // DAG legalization can handle this situation well.
+  if (Alignment.value() * 8 >= BitWidth / 2)
+    return false;
+
+  Type *PtrTy = Ptr->getType();
+  EVT ValVT = TLI->getValueType(*DL, ValTy, true);
+  if (!ValVT.isSimple() || ValVT == MVT::Other ||
+      TLI->allowsMisalignedMemoryAccesses(
+          ValVT, PtrTy->getPointerAddressSpace(), Alignment))
+    return false;
+
+  KnownBits Known = computeKnownBits(Ptr, *DL, nullptr, I, DT);
+  if (Known.isUnknown())
+    return false;
+
+  unsigned PtrWidth = DL->getPointerTypeSizeInBits(PtrTy);
+  KnownBits AlignKnown =
+      KnownBits::makeConstant(APInt(PtrWidth, Alignment.value()));
+
+  if (KnownBits::add(Known, AlignKnown).countMinTrailingZeros() <=
+      AlignKnown.countMinTrailingZeros())
+    return false;
+  return true;
+}
+
+/// Split a misaligned store into a chain of smaller stores whose required
+/// alignment is known to be satisfied.
+bool CodeGenPrepare::optimizeStoreMisalign(StoreInst *SI) {
+  if (!isOptimizeMisalignCandidate(SI, DL, TLI, DT.get()))
+    return false;
+
+  IRBuilder<> Builder(SI);
+  Value *Val = SI->getValueOperand();
+  unsigned BitWidth = DL->getTypeSizeInBits(Val->getType());
+  if (!Val->getType()->isIntegerTy())
+    Val =
+        Builder.CreateBitCast(Val, Type::getIntNTy(SI->getContext(), BitWidth));
+
+  bool IsLE = DL->isLittleEndian();
+  bool IsVolatile = SI->isVolatile();
+  Align Alignment = SI->getAlign();
+  Value *Ptr = SI->getPointerOperand();
+  unsigned RemainingBits = BitWidth;
+  Type *Int8Ty = Type::getInt8Ty(SI->getContext());
+  Type *Int32Ty = Type::getInt32Ty(SI->getContext());
+
+  while (RemainingBits > 0) {
+    unsigned ChunkBits =
+        std::min((uint64_t)(RemainingBits), 8 * Alignment.value());
+    Type *ChunkTy = Type::getIntNTy(SI->getContext(), ChunkBits);
+    Value *ChunkVal;
+    if (IsLE) {
+      ChunkVal = Builder.CreateTrunc(Val, ChunkTy);
+    } else {
+      Value *ShiftR = Builder.CreateLShr(Val, BitWidth - ChunkBits);
+      ChunkVal = Builder.CreateTrunc(ShiftR, ChunkTy);
+    }
+    Builder.CreateAlignedStore(ChunkVal, Ptr, Alignment, IsVolatile);
+    RemainingBits -= ChunkBits;
+    if (RemainingBits == 0)
+      break;
+
+    Val = IsLE ? Builder.CreateLShr(Val, ChunkBits)
+               : Builder.CreateShl(Val, ChunkBits);
+    Ptr = Builder.CreateGEP(Int8Ty, Ptr,
+                            ConstantInt::get(Int32Ty, ChunkBits / 8));
+    Alignment = getKnownAlignment(Ptr, *DL);
+  }
+
+  SI->eraseFromParent();
+  return true;
+}
+
+/// Split a misaligned load into a chain of smaller loads and reassemble the
+/// loaded value with shifts and ors.
+bool CodeGenPrepare::optimizeLoadMisalign(LoadInst *LI) {
+  if (!isOptimizeMisalignCandidate(LI, DL, TLI, DT.get()))
+    return false;
+
+  IRBuilder<> Builder(LI);
+  Type *ValTy = LI->getType();
+
+  unsigned BitWidth = DL->getTypeSizeInBits(LI->getType());
+  bool IsLE = DL->isLittleEndian();
+  bool IsVolatile = LI->isVolatile();
+  Align Alignment = LI->getAlign();
+  Value *Ptr = LI->getPointerOperand();
+  unsigned RemainingBits = BitWidth;
+  Type *IntTy = Type::getIntNTy(LI->getContext(), BitWidth);
+  Type *Int8Ty = Type::getInt8Ty(LI->getContext());
+  Type *Int32Ty = Type::getInt32Ty(LI->getContext());
+  Value *Val = ConstantInt::get(IntTy, 0);
+
+  while (RemainingBits > 0) {
+    unsigned ChunkBits =
+        std::min((uint64_t)(RemainingBits), 8 * Alignment.value());
+    Type *ChunkTy = Type::getIntNTy(LI->getContext(), ChunkBits);
+    Value *ChunkVal = Builder.CreateZExt(
+        Builder.CreateAlignedLoad(ChunkTy, Ptr, Alignment, IsVolatile), IntTy);
+    if (IsLE) {
+      ChunkVal = Builder.CreateShl(ChunkVal, BitWidth - RemainingBits);
+    } else {
+      ChunkVal = Builder.CreateShl(ChunkVal, RemainingBits - ChunkBits);
+    }
+    Val = Builder.CreateOr(Val, ChunkVal);
+    RemainingBits -= ChunkBits;
+    if (RemainingBits == 0)
+      break;
+    Ptr = Builder.CreateGEP(Int8Ty, Ptr,
+                            ConstantInt::get(Int32Ty, ChunkBits / 8));
+    Alignment = getKnownAlignment(Ptr, *DL);
+  }
+
+  if (!ValTy->isIntegerTy())
+    Val = Builder.CreateBitCast(Val, ValTy);
+  LI->replaceAllUsesWith(Val);
+  LI->eraseFromParent();
+  return true;
+}
+
 /// Check if V (an operand of a select instruction) is an expensive instruction
 /// that is only used once.
 static bool sinkSelectOperand(const TargetTransformInfo *TTI, Value *V) {
@@ -8750,6 +8884,8 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) {
     return true;
 
   if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+    if (optimizeLoadMisalign(LI))
+      return true;
     LI->setMetadata(LLVMContext::MD_invariant_group, nullptr);
     bool Modified = optimizeLoadExt(LI);
     unsigned AS = LI->getPointerAddressSpace();
@@ -8760,6 +8896,8 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, ModifyDT &ModifiedDT) {
   if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
     if (splitMergedValStore(*SI, *DL, *TLI))
       return true;
+    if (optimizeStoreMisalign(SI))
+      return true;
     SI->setMetadata(LLVMContext::MD_invariant_group, nullptr);
     unsigned AS = SI->getPointerAddressSpace();
     return optimizeMemoryInst(I, SI->getOperand(1),
diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
index d95f528442efd..a34aacbea5668 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
@@ -1451,63 +1451,41 @@ define amdgpu_ps <2 x float> @ds_read_interp_read(i32 inreg %prims, ptr addrspac
 define amdgpu_kernel void @read2_v2i32_align1_odd_offset(ptr addrspace(1) %out) {
 ; CI-LABEL: read2_v2i32_align1_odd_offset:
 ; CI: ; %bb.0: ; %entry
-; CI-NEXT: v_mov_b32_e32 v0, 0
+; CI-NEXT: v_mov_b32_e32 v1, 0
 ; CI-NEXT: s_mov_b32 m0, -1
-; CI-NEXT: ds_read_u8 v1, v0 offset:70
-; CI-NEXT: ds_read_u8 v2, v0 offset:72
-; CI-NEXT: ds_read_u8 v3, v0 offset:71
-; CI-NEXT: ds_read_u8 v4, v0 offset:69
-; CI-NEXT: ds_read_u8 v5, v0 offset:68
-; CI-NEXT: s_waitcnt lgkmcnt(4)
-; CI-NEXT: v_lshlrev_b32_e32 v1, 8, v1
-; CI-NEXT: s_waitcnt lgkmcnt(3)
-; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
-; CI-NEXT: s_waitcnt lgkmcnt(2)
-; CI-NEXT: v_or_b32_e32 v2, v2, v3
-; CI-NEXT: s_waitcnt lgkmcnt(1)
-; CI-NEXT: v_or_b32_e32 v1, v1, v4
-; CI-NEXT: ds_read_u8 v4, v0 offset:66
-; CI-NEXT: ds_read_u8 v6, v0 offset:67
-; CI-NEXT: ds_read_u8 v0, v0 offset:65
-; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; CI-NEXT: ds_read_u8 v2, v1 offset:65
+; CI-NEXT: ds_read_u16 v3, v1 offset:66
+; CI-NEXT: ds_read_b32 v0, v1 offset:68
+; CI-NEXT: ds_read_u8 v4, v1 offset:72
 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; CI-NEXT: v_or_b32_e32 v1, v2, v1
-; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v4
-; CI-NEXT: v_or_b32_e32 v0, v2, v0
-; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v5
-; CI-NEXT: v_or_b32_e32 v2, v2, v6
-; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
 ; CI-NEXT: s_mov_b32 s3, 0xf000
+; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
+; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 24
+; CI-NEXT: v_or_b32_e32 v2, v2, v3
+; CI-NEXT: v_or_b32_e32 v0, v0, v2
+; CI-NEXT: v_lshlrev_b32_e32 v2, 24, v4
 ; CI-NEXT: s_mov_b32 s2, -1
-; CI-NEXT: v_or_b32_e32 v0, v2, v0
+; CI-NEXT: v_or_b32_e32 v1, v1, v2
 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; CI-NEXT: s_endpgm
 ;
 ; GFX9-ALIGNED-LABEL: read2_v2i32_align1_odd_offset:
 ; GFX9-ALIGNED: ; %bb.0: ; %entry
-; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-ALIGNED-NEXT: ds_read_u8 v0, v2 offset:65
-; GFX9-ALIGNED-NEXT: ds_read_u8 v3, v2 offset:66
-; GFX9-ALIGNED-NEXT: ds_read_u8 v4, v2 offset:67
-; GFX9-ALIGNED-NEXT: ds_read_u8 v5, v2 offset:68
-; GFX9-ALIGNED-NEXT: ds_read_u8 v1, v2 offset:70
-; GFX9-ALIGNED-NEXT: ds_read_u8 v6, v2 offset:69
-; GFX9-ALIGNED-NEXT: ds_read_u8 v7, v2 offset:72
-; GFX9-ALIGNED-NEXT: ds_read_u8 v8, v2 offset:71
+; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-ALIGNED-NEXT: ds_read_u16 v2, v1 offset:66
+; GFX9-ALIGNED-NEXT:
ds_read_b32 v0, v1 offset:68 +; GFX9-ALIGNED-NEXT: ds_read_u8 v4, v1 offset:65 +; GFX9-ALIGNED-NEXT: ds_read_u8 v5, v1 offset:72 ; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX9-ALIGNED-NEXT: v_or_b32_e32 v1, v1, v6 -; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v6, 8, v7 -; GFX9-ALIGNED-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v3, 8, v5 -; GFX9-ALIGNED-NEXT: v_or_b32_sdwa v6, v6, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-ALIGNED-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-ALIGNED-NEXT: v_or_b32_e32 v1, v6, v1 -; GFX9-ALIGNED-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX9-ALIGNED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v6, 8, v2 +; GFX9-ALIGNED-NEXT: v_lshlrev_b64 v[2:3], 24, v[0:1] +; GFX9-ALIGNED-NEXT: v_or_b32_sdwa v0, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-ALIGNED-NEXT: v_or_b32_e32 v2, v0, v2 +; GFX9-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 24, v5 +; GFX9-ALIGNED-NEXT: v_or_b32_e32 v3, v3, v0 +; GFX9-ALIGNED-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1] ; GFX9-ALIGNED-NEXT: s_endpgm ; ; GFX9-UNALIGNED-LABEL: read2_v2i32_align1_odd_offset: diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll index 41e3d5f10f6dd..31c6739ce5559 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll @@ -1009,15 +1009,10 @@ define amdgpu_kernel void @write2_v2i32_align1_odd_offset() { ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_write_b8 v1, v0 offset:65 -; CI-NEXT: v_mov_b32_e32 v0, 1 -; CI-NEXT: ds_write_b8 v1, v0 offset:70 -; CI-NEXT: v_mov_b32_e32 v0, 0xc8 -; CI-NEXT: ds_write_b8 v1, v0 offset:69 -; CI-NEXT: ds_write_b8 v1, v1 offset:68 -; CI-NEXT: ds_write_b8 v1, v1 offset:67 -; CI-NEXT: ds_write_b8 v1, v1 offset:66 +; CI-NEXT: ds_write_b16 v1, v1 offset:66 +; CI-NEXT: v_mov_b32_e32 v0, 0x1c800 +; CI-NEXT: ds_write_b32 v1, v0 offset:68 ; CI-NEXT: ds_write_b8 v1, v1 offset:72 -; CI-NEXT: ds_write_b8 v1, v1 offset:71 ; CI-NEXT: s_endpgm ; ; GFX9-ALIGNED-LABEL: write2_v2i32_align1_odd_offset: @@ -1025,15 +1020,10 @@ define amdgpu_kernel void @write2_v2i32_align1_odd_offset() { ; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v0 offset:65 -; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 1 -; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v0 offset:70 -; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 0xc8 -; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v0 offset:69 -; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:68 -; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:67 -; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:66 +; GFX9-ALIGNED-NEXT: ds_write_b16 v1, v1 offset:66 +; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 0x1c800 +; GFX9-ALIGNED-NEXT: ds_write_b32 v1, v0 offset:68 ; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:72 -; GFX9-ALIGNED-NEXT: ds_write_b8 v1, v1 offset:71 ; GFX9-ALIGNED-NEXT: s_endpgm ; ; GFX9-UNALIGNED-LABEL: write2_v2i32_align1_odd_offset: diff --git a/llvm/test/CodeGen/AVR/calling-conv/c/basic.ll b/llvm/test/CodeGen/AVR/calling-conv/c/basic.ll index 3d783d143192d..278df39938342 100644 --- a/llvm/test/CodeGen/AVR/calling-conv/c/basic.ll +++ b/llvm/test/CodeGen/AVR/calling-conv/c/basic.ll @@ -12,10 +12,15 @@ define 
void @ret_void_args_i8_i32(i8 %a, i32 %b) { ; CHECK: sts 4, r24 store volatile i8 %a, ptr inttoptr (i64 4 to ptr) - ; CHECK-NEXT: sts 8, r23 - ; CHECK-NEXT: sts 7, r22 - ; CHECK-NEXT: sts 6, r21 ; CHECK-NEXT: sts 5, r20 + + ; redundant instructions, should be deleted + ; CHECK-NEXT: mov r24, r21 + ; CHECK-NEXT: mov r25, r22 + + ; CHECK-NEXT: sts 7, r25 + ; CHECK-NEXT: sts 6, r24 + ; CHECK-NEXT: sts 8, r23 store volatile i32 %b, ptr inttoptr (i64 5 to ptr) ret void } diff --git a/llvm/test/CodeGen/AVR/calling-conv/c/basic_aggr.ll b/llvm/test/CodeGen/AVR/calling-conv/c/basic_aggr.ll index 07d58841dd802..cd1817cd245be 100644 --- a/llvm/test/CodeGen/AVR/calling-conv/c/basic_aggr.ll +++ b/llvm/test/CodeGen/AVR/calling-conv/c/basic_aggr.ll @@ -7,10 +7,15 @@ start: %0 = extractvalue { i8, i32 } %a, 0 store volatile i8 %0, ptr inttoptr (i64 4 to ptr) - ; CHECK-NEXT: sts 8, r24 - ; CHECK-NEXT: sts 7, r23 - ; CHECK-NEXT: sts 6, r22 ; CHECK-NEXT: sts 5, r21 + + ; redundant instructions, should be deleted + ; CHECK-NEXT: mov r18, r22 + ; CHECK-NEXT: mov r19, r23 + + ; CHECK-NEXT: sts 7, r19 + ; CHECK-NEXT: sts 6, r18 + ; CHECK-NEXT: sts 8, r24 %1 = extractvalue { i8, i32 } %a, 1 store volatile i32 %1, ptr inttoptr (i64 5 to ptr) ret void @@ -62,17 +67,22 @@ start: %0 = extractvalue { i8, i32 } %a, 0 store volatile i8 %0, ptr inttoptr (i64 4 to ptr) - ; CHECK-NEXT: sts 8, r24 - ; CHECK-NEXT: sts 7, r23 - ; CHECK-NEXT: sts 6, r22 ; CHECK-NEXT: sts 5, r21 + + ; redundant instructions, should be deleted + ; CHECK-NEXT: mov r20, r22 + ; CHECK-NEXT: mov r21, r23 + + ; CHECK-NEXT: sts 7, r21 + ; CHECK-NEXT: sts 6, r20 + ; CHECK-NEXT: sts 8, r24 %1 = extractvalue { i8, i32 } %a, 1 store volatile i32 %1, ptr inttoptr (i64 5 to ptr) - ; CHECK-NEXT: sts 9, r17 - ; CHECK-NEXT: sts 8, r16 - ; CHECK-NEXT: sts 7, r15 - ; CHECK-NEXT: sts 6, r14 + ; CHECK-NEXT: sts 9, r17 + ; CHECK-NEXT: sts 8, r16 + ; CHECK-NEXT: sts 7, r15 + ; CHECK-NEXT: sts 6, r14 %2 = extractvalue { i32, i8 } %b, 0 store volatile i32 %2, ptr inttoptr (i64 6 to ptr) diff --git a/llvm/test/CodeGen/XCore/unaligned_load.ll b/llvm/test/CodeGen/XCore/unaligned_load.ll index ee9aea4689503..ce27c1ac49801 100644 --- a/llvm/test/CodeGen/XCore/unaligned_load.ll +++ b/llvm/test/CodeGen/XCore/unaligned_load.ll @@ -24,8 +24,11 @@ entry: ; Constant offset from word aligned base. 
 ; CHECK-LABEL: align3:
-; CHECK: ldw {{r[0-9]+}}, dp
-; CHECK: ldw {{r[0-9]+}}, dp
+; CHECK: ldaw {{r[0-9]+}}, dp
+; CHECK: ld8u
+; CHECK: ld16s
+; CHECK: or
+; CHECK: ld8u
 ; CHECK: or
 define i32 @align3() nounwind {
 entry:
diff --git a/llvm/test/Transforms/CodeGenPrepare/RISCV/misalign-refine.ll b/llvm/test/Transforms/CodeGenPrepare/RISCV/misalign-refine.ll
new file mode 100644
index 0000000000000..a4d28aac256fc
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/RISCV/misalign-refine.ll
@@ -0,0 +1,286 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -mtriple=riscv32 %s \
+; RUN: | FileCheck --check-prefixes=LE %s
+; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' -mtriple=riscv32 -data-layout="E-m:e-p:32:32-i64:64-n32-S128" %s \
+; RUN: | FileCheck --check-prefixes=BE %s
+
+
+define void @foo-i32(ptr align 4 %p, i32 %v) {
+; LE-LABEL: @foo-i32(
+; LE-NEXT: entry:
+; LE-NEXT: [[LEN:%.*]] = getelementptr inbounds nuw i8, ptr [[P:%.*]], i32 3
+; LE-NEXT: [[TMP0:%.*]] = trunc i32 [[V:%.*]] to i8
+; LE-NEXT: store i8 [[TMP0]], ptr [[LEN]], align 1
+; LE-NEXT: [[TMP1:%.*]] = lshr i32 [[V]], 8
+; LE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[LEN]], i32 1
+; LE-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP1]] to i24
+; LE-NEXT: store i24 [[TMP3]], ptr [[TMP2]], align 4
+; LE-NEXT: ret void
+;
+; BE-LABEL: @foo-i32(
+; BE-NEXT: entry:
+; BE-NEXT: [[LEN:%.*]] = getelementptr inbounds nuw i8, ptr [[P:%.*]], i32 3
+; BE-NEXT: [[TMP0:%.*]] = lshr i32 [[V:%.*]], 24
+; BE-NEXT: [[TMP1:%.*]] = trunc i32 [[TMP0]] to i8
+; BE-NEXT: store i8 [[TMP1]], ptr [[LEN]], align 1
+; BE-NEXT: [[TMP2:%.*]] = shl i32 [[V]], 8
+; BE-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[LEN]], i32 1
+; BE-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP2]], 8
+; BE-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i24
+; BE-NEXT: store i24 [[TMP5]], ptr [[TMP3]], align 4
+; BE-NEXT: ret void
+;
+entry:
+  %len = getelementptr inbounds nuw i8, ptr %p, i32 3
+  store i32 %v, ptr %len, align 1
+  ret void
+}
+define void @foo-i64(ptr align 4 %p, i64 %v) {
+; LE-LABEL: @foo-i64(
+; LE-NEXT: entry:
+; LE-NEXT: [[LEN:%.*]] = getelementptr inbounds nuw i8, ptr [[P:%.*]], i32 3
+; LE-NEXT: [[TMP0:%.*]] = trunc i64 [[V:%.*]] to i8
+; LE-NEXT: store i8 [[TMP0]], ptr [[LEN]], align 1
+; LE-NEXT: [[TMP1:%.*]] = lshr i64 [[V]], 8
+; LE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[LEN]], i32 1
+; LE-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP1]] to i32
+; LE-NEXT: store i32 [[TMP3]], ptr [[TMP2]], align 4
+; LE-NEXT: [[TMP4:%.*]] = lshr i64 [[TMP1]], 32
+; LE-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP2]], i32 4
+; LE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP4]] to i24
+; LE-NEXT: store i24 [[TMP6]], ptr [[TMP5]], align 4
+; LE-NEXT: ret void
+;
+; BE-LABEL: @foo-i64(
+; BE-NEXT: entry:
+; BE-NEXT: [[LEN:%.*]] = getelementptr inbounds nuw i8, ptr [[P:%.*]], i32 3
+; BE-NEXT: [[TMP0:%.*]] = lshr i64 [[V:%.*]], 56
+; BE-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i8
+; BE-NEXT: store i8 [[TMP1]], ptr [[LEN]], align 1
+; BE-NEXT: [[TMP2:%.*]] = shl i64 [[V]], 8
+; BE-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[LEN]], i32 1
+; BE-NEXT: [[TMP4:%.*]] = lshr i64 [[TMP2]], 32
+; BE-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP4]] to i32
+; BE-NEXT: store i32 [[TMP5]], ptr [[TMP3]], align 4
+; BE-NEXT: [[TMP6:%.*]] = shl i64 [[TMP2]], 32
+; BE-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP3]], i32 4
+; BE-NEXT: [[TMP8:%.*]] = lshr i64 [[TMP6]], 40
+; BE-NEXT: [[TMP9:%.*]] = trunc i64 [[TMP8]] to
i24 +; BE-NEXT: store i24 [[TMP9]], ptr [[TMP7]], align 4 +; BE-NEXT: ret void +; +entry: + %len = getelementptr inbounds nuw i8, ptr %p, i32 3 + store i64 %v, ptr %len, align 1 + ret void +} + +define void @foo-float(ptr align 4 %p, float %v) { +; LE-LABEL: @foo-float( +; LE-NEXT: entry: +; LE-NEXT: [[LEN:%.*]] = getelementptr inbounds nuw i8, ptr [[P:%.*]], i32 3 +; LE-NEXT: [[TMP0:%.*]] = bitcast float [[V:%.*]] to i32 +; LE-NEXT: [[TMP1:%.*]] = trunc i32 [[TMP0]] to i8 +; LE-NEXT: store i8 [[TMP1]], ptr [[LEN]], align 1 +; LE-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP0]], 8 +; LE-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[LEN]], i32 1 +; LE-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP2]] to i24 +; LE-NEXT: store i24 [[TMP4]], ptr [[TMP3]], align 4 +; LE-NEXT: ret void +; +; BE-LABEL: @foo-float( +; BE-NEXT: entry: +; BE-NEXT: [[LEN:%.*]] = getelementptr inbounds nuw i8, ptr [[P:%.*]], i32 3 +; BE-NEXT: [[TMP0:%.*]] = bitcast float [[V:%.*]] to i32 +; BE-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 24 +; BE-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8 +; BE-NEXT: store i8 [[TMP2]], ptr [[LEN]], align 1 +; BE-NEXT: [[TMP3:%.*]] = shl i32 [[TMP0]], 8 +; BE-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[LEN]], i32 1 +; BE-NEXT: [[TMP5:%.*]] = lshr i32 [[TMP3]], 8 +; BE-NEXT: [[TMP6:%.*]] = trunc i32 [[TMP5]] to i24 +; BE-NEXT: store i24 [[TMP6]], ptr [[TMP4]], align 4 +; BE-NEXT: ret void +; +entry: + %len = getelementptr inbounds nuw i8, ptr %p, i32 3 + store float %v, ptr %len, align 1 + ret void +} + +define void @foo-double(ptr align 4 %p, double %v) { +; LE-LABEL: @foo-double( +; LE-NEXT: entry: +; LE-NEXT: [[LEN:%.*]] = getelementptr inbounds nuw i8, ptr [[P:%.*]], i32 3 +; LE-NEXT: [[TMP0:%.*]] = bitcast double [[V:%.*]] to i64 +; LE-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i8 +; LE-NEXT: store i8 [[TMP1]], ptr [[LEN]], align 1 +; LE-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 8 +; LE-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[LEN]], i32 1 +; LE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP2]] to i32 +; LE-NEXT: store i32 [[TMP4]], ptr [[TMP3]], align 4 +; LE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP2]], 32 +; LE-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP3]], i32 4 +; LE-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP5]] to i24 +; LE-NEXT: store i24 [[TMP7]], ptr [[TMP6]], align 4 +; LE-NEXT: ret void +; +; BE-LABEL: @foo-double( +; BE-NEXT: entry: +; BE-NEXT: [[LEN:%.*]] = getelementptr inbounds nuw i8, ptr [[P:%.*]], i32 3 +; BE-NEXT: [[TMP0:%.*]] = bitcast double [[V:%.*]] to i64 +; BE-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 56 +; BE-NEXT: [[TMP2:%.*]] = trunc i64 [[TMP1]] to i8 +; BE-NEXT: store i8 [[TMP2]], ptr [[LEN]], align 1 +; BE-NEXT: [[TMP3:%.*]] = shl i64 [[TMP0]], 8 +; BE-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[LEN]], i32 1 +; BE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32 +; BE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; BE-NEXT: store i32 [[TMP6]], ptr [[TMP4]], align 4 +; BE-NEXT: [[TMP7:%.*]] = shl i64 [[TMP3]], 32 +; BE-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP4]], i32 4 +; BE-NEXT: [[TMP9:%.*]] = lshr i64 [[TMP7]], 40 +; BE-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i24 +; BE-NEXT: store i24 [[TMP10]], ptr [[TMP8]], align 4 +; BE-NEXT: ret void +; +entry: + %len = getelementptr inbounds nuw i8, ptr %p, i32 3 + store double %v, ptr %len, align 1 + ret void +} + +define i32 @foo-load-i32(ptr align 4 %p) { +; LE-LABEL: @foo-load-i32( +; LE-NEXT: entry: +; LE-NEXT: [[LEN:%.*]] = getelementptr inbounds nuw i8, ptr [[P:%.*]], i32 3 +; LE-NEXT: [[TMP0:%.*]] = load i8, ptr [[LEN]], align 1 +; 
LE-NEXT: [[TMP1:%.*]] = zext i8 [[TMP0]] to i32 +; LE-NEXT: [[TMP2:%.*]] = shl i32 [[TMP1]], 0 +; LE-NEXT: [[TMP3:%.*]] = or i32 0, [[TMP2]] +; LE-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[LEN]], i32 1 +; LE-NEXT: [[TMP5:%.*]] = load i24, ptr [[TMP4]], align 4 +; LE-NEXT: [[TMP6:%.*]] = zext i24 [[TMP5]] to i32 +; LE-NEXT: [[TMP7:%.*]] = shl i32 [[TMP6]], 8 +; LE-NEXT: [[TMP8:%.*]] = or i32 [[TMP3]], [[TMP7]] +; LE-NEXT: ret i32 [[TMP8]] +; +; BE-LABEL: @foo-load-i32( +; BE-NEXT: entry: +; BE-NEXT: [[LEN:%.*]] = getelementptr inbounds nuw i8, ptr [[P:%.*]], i32 3 +; BE-NEXT: [[TMP0:%.*]] = load i8, ptr [[LEN]], align 1 +; BE-NEXT: [[TMP1:%.*]] = zext i8 [[TMP0]] to i32 +; BE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[LEN]], i32 1 +; BE-NEXT: [[TMP3:%.*]] = load i24, ptr [[TMP2]], align 4 +; BE-NEXT: [[TMP4:%.*]] = zext i24 [[TMP3]] to i32 +; BE-NEXT: ret i32 0 +; +entry: + %len = getelementptr inbounds nuw i8, ptr %p, i32 3 + %v = load i32, ptr %len, align 1 + ret i32 %v +} +define i64 @foo-load-i64(ptr align 4 %p) { +; LE-LABEL: @foo-load-i64( +; LE-NEXT: entry: +; LE-NEXT: [[LEN:%.*]] = getelementptr inbounds nuw i8, ptr [[P:%.*]], i32 3 +; LE-NEXT: [[TMP0:%.*]] = load i8, ptr [[LEN]], align 1 +; LE-NEXT: [[TMP1:%.*]] = zext i8 [[TMP0]] to i64 +; LE-NEXT: [[TMP2:%.*]] = shl i64 [[TMP1]], 0 +; LE-NEXT: [[TMP3:%.*]] = or i64 0, [[TMP2]] +; LE-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[LEN]], i32 1 +; LE-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 +; LE-NEXT: [[TMP6:%.*]] = zext i32 [[TMP5]] to i64 +; LE-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 8 +; LE-NEXT: [[TMP8:%.*]] = or i64 [[TMP3]], [[TMP7]] +; LE-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP4]], i32 4 +; LE-NEXT: [[TMP10:%.*]] = load i24, ptr [[TMP9]], align 4 +; LE-NEXT: [[TMP11:%.*]] = zext i24 [[TMP10]] to i64 +; LE-NEXT: [[TMP12:%.*]] = shl i64 [[TMP11]], 40 +; LE-NEXT: [[TMP13:%.*]] = or i64 [[TMP8]], [[TMP12]] +; LE-NEXT: ret i64 [[TMP13]] +; +; BE-LABEL: @foo-load-i64( +; BE-NEXT: entry: +; BE-NEXT: [[LEN:%.*]] = getelementptr inbounds nuw i8, ptr [[P:%.*]], i32 3 +; BE-NEXT: [[TMP0:%.*]] = load i8, ptr [[LEN]], align 1 +; BE-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[LEN]], i32 1 +; BE-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 +; BE-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 4 +; BE-NEXT: [[TMP4:%.*]] = load i24, ptr [[TMP3]], align 4 +; BE-NEXT: ret i64 0 +; +entry: + %len = getelementptr inbounds nuw i8, ptr %p, i32 3 + %v = load i64, ptr %len, align 1 + ret i64 %v +} + +define float @foo-load-float(ptr align 4 %p) { +; LE-LABEL: @foo-load-float( +; LE-NEXT: entry: +; LE-NEXT: [[LEN:%.*]] = getelementptr inbounds nuw i8, ptr [[P:%.*]], i32 3 +; LE-NEXT: [[TMP0:%.*]] = load i8, ptr [[LEN]], align 1 +; LE-NEXT: [[TMP1:%.*]] = zext i8 [[TMP0]] to i32 +; LE-NEXT: [[TMP2:%.*]] = shl i32 [[TMP1]], 0 +; LE-NEXT: [[TMP3:%.*]] = or i32 0, [[TMP2]] +; LE-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[LEN]], i32 1 +; LE-NEXT: [[TMP5:%.*]] = load i24, ptr [[TMP4]], align 4 +; LE-NEXT: [[TMP6:%.*]] = zext i24 [[TMP5]] to i32 +; LE-NEXT: [[TMP7:%.*]] = shl i32 [[TMP6]], 8 +; LE-NEXT: [[TMP8:%.*]] = or i32 [[TMP3]], [[TMP7]] +; LE-NEXT: [[TMP9:%.*]] = bitcast i32 [[TMP8]] to float +; LE-NEXT: ret float [[TMP9]] +; +; BE-LABEL: @foo-load-float( +; BE-NEXT: entry: +; BE-NEXT: [[LEN:%.*]] = getelementptr inbounds nuw i8, ptr [[P:%.*]], i32 3 +; BE-NEXT: [[TMP0:%.*]] = load i8, ptr [[LEN]], align 1 +; BE-NEXT: [[TMP1:%.*]] = zext i8 [[TMP0]] to i32 +; BE-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr 
[[LEN]], i32 1 +; BE-NEXT: [[TMP3:%.*]] = load i24, ptr [[TMP2]], align 4 +; BE-NEXT: [[TMP4:%.*]] = zext i24 [[TMP3]] to i32 +; BE-NEXT: ret float 0.000000e+00 +; +entry: + %len = getelementptr inbounds nuw i8, ptr %p, i32 3 + %v = load float, ptr %len, align 1 + ret float %v +} + +define double @foo-load-double(ptr align 4 %p) { +; LE-LABEL: @foo-load-double( +; LE-NEXT: entry: +; LE-NEXT: [[LEN:%.*]] = getelementptr inbounds nuw i8, ptr [[P:%.*]], i32 3 +; LE-NEXT: [[TMP0:%.*]] = load i8, ptr [[LEN]], align 1 +; LE-NEXT: [[TMP1:%.*]] = zext i8 [[TMP0]] to i64 +; LE-NEXT: [[TMP2:%.*]] = shl i64 [[TMP1]], 0 +; LE-NEXT: [[TMP3:%.*]] = or i64 0, [[TMP2]] +; LE-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[LEN]], i32 1 +; LE-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 +; LE-NEXT: [[TMP6:%.*]] = zext i32 [[TMP5]] to i64 +; LE-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 8 +; LE-NEXT: [[TMP8:%.*]] = or i64 [[TMP3]], [[TMP7]] +; LE-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP4]], i32 4 +; LE-NEXT: [[TMP10:%.*]] = load i24, ptr [[TMP9]], align 4 +; LE-NEXT: [[TMP11:%.*]] = zext i24 [[TMP10]] to i64 +; LE-NEXT: [[TMP12:%.*]] = shl i64 [[TMP11]], 40 +; LE-NEXT: [[TMP13:%.*]] = or i64 [[TMP8]], [[TMP12]] +; LE-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP13]] to double +; LE-NEXT: ret double [[TMP14]] +; +; BE-LABEL: @foo-load-double( +; BE-NEXT: entry: +; BE-NEXT: [[LEN:%.*]] = getelementptr inbounds nuw i8, ptr [[P:%.*]], i32 3 +; BE-NEXT: [[TMP0:%.*]] = load i8, ptr [[LEN]], align 1 +; BE-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[LEN]], i32 1 +; BE-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 +; BE-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 4 +; BE-NEXT: [[TMP4:%.*]] = load i24, ptr [[TMP3]], align 4 +; BE-NEXT: ret double 0.000000e+00 +; +entry: + %len = getelementptr inbounds nuw i8, ptr %p, i32 3 + %v = load double, ptr %len, align 1 + ret double %v +}