Skip to content

Commit 8ad39fb

Browse files
committed
[Attributor][FIX] Heap2Stack needs to use the alloca AS
When we move an allocation from the heap to the stack, we need to allocate it in the alloca address space (AS) and then cast the result back to the original pointer type. As a consequence, the alloca can no longer be inserted after the allocation call; it is now inserted right before the call instead. Fixes #53858
1 parent 668c5c6 commit 8ad39fb

File tree

3 files changed

+58
-11
lines changed

3 files changed

+58
-11
lines changed

llvm/lib/Transforms/IPO/AttributorAttributes.cpp

+8-7
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
#include "llvm/Analysis/ValueTracking.h"
3333
#include "llvm/IR/Assumptions.h"
3434
#include "llvm/IR/Constants.h"
35+
#include "llvm/IR/DataLayout.h"
3536
#include "llvm/IR/IRBuilder.h"
3637
#include "llvm/IR/Instruction.h"
3738
#include "llvm/IR/Instructions.h"
@@ -6031,13 +6032,13 @@ struct AAHeapToStackFunction final : public AAHeapToStack {
60316032
else
60326033
A.emitRemark<OptimizationRemark>(AI.CB, "HeapToStack", Remark);
60336034

6035+
const DataLayout &DL = A.getInfoCache().getDL();
60346036
Value *Size;
60356037
Optional<APInt> SizeAPI = getSize(A, *this, AI);
60366038
if (SizeAPI.hasValue()) {
60376039
Size = ConstantInt::get(AI.CB->getContext(), *SizeAPI);
60386040
} else {
60396041
LLVMContext &Ctx = AI.CB->getContext();
6040-
auto &DL = A.getInfoCache().getDL();
60416042
ObjectSizeOpts Opts;
60426043
ObjectSizeOffsetEvaluator Eval(DL, TLI, Ctx, Opts);
60436044
SizeOffsetEvalType SizeOffsetPair = Eval.compute(AI.CB);
@@ -6057,14 +6058,14 @@ struct AAHeapToStackFunction final : public AAHeapToStack {
60576058
max(Alignment, MaybeAlign(AlignmentAPI.getValue().getZExtValue()));
60586059
}
60596060

6060-
unsigned AS = cast<PointerType>(AI.CB->getType())->getAddressSpace();
6061-
Instruction *Alloca =
6062-
new AllocaInst(Type::getInt8Ty(F->getContext()), AS, Size, Alignment,
6063-
"", AI.CB->getNextNode());
6061+
// TODO: Hoist the alloca towards the function entry.
6062+
unsigned AS = DL.getAllocaAddrSpace();
6063+
Instruction *Alloca = new AllocaInst(Type::getInt8Ty(F->getContext()), AS,
6064+
Size, Alignment, "", AI.CB);
60646065

60656066
if (Alloca->getType() != AI.CB->getType())
6066-
Alloca = new BitCastInst(Alloca, AI.CB->getType(), "malloc_bc",
6067-
Alloca->getNextNode());
6067+
Alloca = BitCastInst::CreatePointerBitCastOrAddrSpaceCast(
6068+
Alloca, AI.CB->getType(), "malloc_cast", AI.CB);
60686069

60696070
auto *I8Ty = Type::getInt8Ty(F->getContext());
60706071
auto *InitVal = getInitialValueOfAllocation(AI.CB, TLI, I8Ty);

llvm/test/Transforms/Attributor/heap_to_stack_gpu.ll

+44
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,12 @@
44
; RUN: opt -attributor-cgscc -enable-new-pm=0 -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_NPM,IS__CGSCC____,IS________OPM,IS__CGSCC_OPM
55
; RUN: opt -aa-pipeline=basic-aa -passes=attributor-cgscc -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_OPM,IS__CGSCC____,IS________NPM,IS__CGSCC_NPM
66

7+
; FIXME: amdgpu doesn't claim malloc is a thing, so the test is somewhat
8+
; useless except the __kmpc_alloc_shared part which now also covers the important
9+
; part this test was initially designed for, make sure the "is freed" check is
10+
; not sufficient on a GPU.
711
target triple = "amdgcn-amd-amdhsa"
12+
target datalayout = "A5"
813

914
declare noalias i8* @malloc(i64)
1015

@@ -20,6 +25,7 @@ declare void @no_sync_func(i8* nocapture %p) nofree nosync willreturn
2025

2126
declare void @nofree_func(i8* nocapture %p) nofree nosync willreturn
2227

28+
declare void @usei8(i8* %p)
2329
declare void @foo(i32* %p)
2430

2531
declare void @foo_nounw(i32* %p) nounwind nofree
@@ -663,11 +669,49 @@ define void @test16d(i8 %v, i8** %P) {
663669
store i8* %1, i8** %P
664670
ret void
665671
}
672+
673+
declare i8* @__kmpc_alloc_shared(i64)
674+
declare void @__kmpc_free_shared(i8* nocapture, i64)
675+
676+
define void @test17() {
677+
; IS________OPM-LABEL: define {{[^@]+}}@test17() {
678+
; IS________OPM-NEXT: [[TMP1:%.*]] = tail call noalias i8* @__kmpc_alloc_shared(i64 noundef 4)
679+
; IS________OPM-NEXT: tail call void @usei8(i8* noalias nocapture nofree [[TMP1]]) #[[ATTR6:[0-9]+]]
680+
; IS________OPM-NEXT: tail call void @__kmpc_free_shared(i8* noalias nocapture [[TMP1]], i64 noundef 4)
681+
; IS________OPM-NEXT: ret void
682+
;
683+
; IS________NPM-LABEL: define {{[^@]+}}@test17() {
684+
; IS________NPM-NEXT: [[TMP1:%.*]] = alloca i8, i64 4, align 1, addrspace(5)
685+
; IS________NPM-NEXT: [[MALLOC_CAST:%.*]] = addrspacecast i8 addrspace(5)* [[TMP1]] to i8*
686+
; IS________NPM-NEXT: tail call void @usei8(i8* noalias nocapture nofree [[MALLOC_CAST]]) #[[ATTR6:[0-9]+]]
687+
; IS________NPM-NEXT: ret void
688+
;
689+
%1 = tail call noalias i8* @__kmpc_alloc_shared(i64 4)
690+
tail call void @usei8(i8* nocapture nofree %1) willreturn nounwind nosync
691+
tail call void @__kmpc_free_shared(i8* %1, i64 4)
692+
ret void
693+
}
694+
695+
define void @test17b() {
696+
; CHECK-LABEL: define {{[^@]+}}@test17b() {
697+
; CHECK-NEXT: [[TMP1:%.*]] = tail call noalias i8* @__kmpc_alloc_shared(i64 noundef 4)
698+
; CHECK-NEXT: tail call void @usei8(i8* nofree [[TMP1]]) #[[ATTR6:[0-9]+]]
699+
; CHECK-NEXT: tail call void @__kmpc_free_shared(i8* nocapture [[TMP1]], i64 noundef 4)
700+
; CHECK-NEXT: ret void
701+
;
702+
%1 = tail call noalias i8* @__kmpc_alloc_shared(i64 4)
703+
tail call void @usei8(i8* nofree %1) willreturn nounwind nosync
704+
tail call void @__kmpc_free_shared(i8* %1, i64 4)
705+
ret void
706+
}
707+
708+
666709
;.
667710
; CHECK: attributes #[[ATTR0:[0-9]+]] = { nounwind willreturn }
668711
; CHECK: attributes #[[ATTR1:[0-9]+]] = { nofree nosync willreturn }
669712
; CHECK: attributes #[[ATTR2:[0-9]+]] = { nofree nounwind }
670713
; CHECK: attributes #[[ATTR3]] = { noreturn }
671714
; CHECK: attributes #[[ATTR4:[0-9]+]] = { argmemonly nofree nosync nounwind willreturn }
672715
; CHECK: attributes #[[ATTR5]] = { nounwind }
716+
; CHECK: attributes #[[ATTR6]] = { nosync nounwind willreturn }
673717
;.

llvm/test/Transforms/OpenMP/spmdization.ll

+6-4
Original file line numberDiff line numberDiff line change
@@ -678,8 +678,9 @@ define internal void @__omp_outlined__2(i32* noalias %.global_tid., i32* noalias
678678
; AMDGPU-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
679679
; AMDGPU-NEXT: entry:
680680
; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
681-
; AMDGPU-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 4
682-
; AMDGPU-NEXT: [[X_ON_STACK:%.*]] = bitcast i8* [[TMP0]] to i32*
681+
; AMDGPU-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 4, addrspace(5)
682+
; AMDGPU-NEXT: [[MALLOC_CAST:%.*]] = addrspacecast i8 addrspace(5)* [[TMP0]] to i8*
683+
; AMDGPU-NEXT: [[X_ON_STACK:%.*]] = bitcast i8* [[MALLOC_CAST]] to i32*
683684
; AMDGPU-NEXT: call void @use(i32* nocapture [[X_ON_STACK]]) #[[ATTR5]]
684685
; AMDGPU-NEXT: br label [[FOR_COND:%.*]]
685686
; AMDGPU: for.cond:
@@ -722,8 +723,9 @@ define internal void @__omp_outlined__2(i32* noalias %.global_tid., i32* noalias
722723
; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
723724
; AMDGPU-DISABLED-NEXT: entry:
724725
; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
725-
; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 4
726-
; AMDGPU-DISABLED-NEXT: [[X_ON_STACK:%.*]] = bitcast i8* [[TMP0]] to i32*
726+
; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 4, addrspace(5)
727+
; AMDGPU-DISABLED-NEXT: [[MALLOC_CAST:%.*]] = addrspacecast i8 addrspace(5)* [[TMP0]] to i8*
728+
; AMDGPU-DISABLED-NEXT: [[X_ON_STACK:%.*]] = bitcast i8* [[MALLOC_CAST]] to i32*
727729
; AMDGPU-DISABLED-NEXT: call void @use(i32* nocapture [[X_ON_STACK]]) #[[ATTR5]]
728730
; AMDGPU-DISABLED-NEXT: br label [[FOR_COND:%.*]]
729731
; AMDGPU-DISABLED: for.cond:

0 commit comments

Comments
 (0)