Skip to content

Commit 5593af7

Browse files
jdoerfert authored and tstellar committed
[Attributor][FIX] Heap2Stack needs to use the alloca AS
When we move an allocation from the heap to the stack, we need to allocate it in the alloca address space (AS) and then cast the result to the pointer type of the original allocation call. As a consequence, the alloca is now inserted right before the allocation call rather than after it. Fixes #53858 (cherry picked from commit 8ad39fb)
1 parent b3d3501 commit 5593af7

File tree

3 files changed

+58
-11
lines changed

3 files changed

+58
-11
lines changed

llvm/lib/Transforms/IPO/AttributorAttributes.cpp

+8-7
Original file line number | Diff line number | Diff line change
@@ -32,6 +32,7 @@
3232
#include "llvm/Analysis/ValueTracking.h"
3333
#include "llvm/IR/Assumptions.h"
3434
#include "llvm/IR/Constants.h"
35+
#include "llvm/IR/DataLayout.h"
3536
#include "llvm/IR/IRBuilder.h"
3637
#include "llvm/IR/Instruction.h"
3738
#include "llvm/IR/Instructions.h"
@@ -6026,13 +6027,13 @@ struct AAHeapToStackFunction final : public AAHeapToStack {
60266027
else
60276028
A.emitRemark<OptimizationRemark>(AI.CB, "HeapToStack", Remark);
60286029

6030+
const DataLayout &DL = A.getInfoCache().getDL();
60296031
Value *Size;
60306032
Optional<APInt> SizeAPI = getSize(A, *this, AI);
60316033
if (SizeAPI.hasValue()) {
60326034
Size = ConstantInt::get(AI.CB->getContext(), *SizeAPI);
60336035
} else {
60346036
LLVMContext &Ctx = AI.CB->getContext();
6035-
auto &DL = A.getInfoCache().getDL();
60366037
ObjectSizeOpts Opts;
60376038
ObjectSizeOffsetEvaluator Eval(DL, TLI, Ctx, Opts);
60386039
SizeOffsetEvalType SizeOffsetPair = Eval.compute(AI.CB);
@@ -6052,14 +6053,14 @@ struct AAHeapToStackFunction final : public AAHeapToStack {
60526053
max(Alignment, MaybeAlign(AlignmentAPI.getValue().getZExtValue()));
60536054
}
60546055

6055-
unsigned AS = cast<PointerType>(AI.CB->getType())->getAddressSpace();
6056-
Instruction *Alloca =
6057-
new AllocaInst(Type::getInt8Ty(F->getContext()), AS, Size, Alignment,
6058-
"", AI.CB->getNextNode());
6056+
// TODO: Hoist the alloca towards the function entry.
6057+
unsigned AS = DL.getAllocaAddrSpace();
6058+
Instruction *Alloca = new AllocaInst(Type::getInt8Ty(F->getContext()), AS,
6059+
Size, Alignment, "", AI.CB);
60596060

60606061
if (Alloca->getType() != AI.CB->getType())
6061-
Alloca = new BitCastInst(Alloca, AI.CB->getType(), "malloc_bc",
6062-
Alloca->getNextNode());
6062+
Alloca = BitCastInst::CreatePointerBitCastOrAddrSpaceCast(
6063+
Alloca, AI.CB->getType(), "malloc_cast", AI.CB);
60636064

60646065
auto *I8Ty = Type::getInt8Ty(F->getContext());
60656066
auto *InitVal = getInitialValueOfAllocation(AI.CB, TLI, I8Ty);

llvm/test/Transforms/Attributor/heap_to_stack_gpu.ll

+44
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,12 @@
44
; RUN: opt -attributor-cgscc -enable-new-pm=0 -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_NPM,IS__CGSCC____,IS________OPM,IS__CGSCC_OPM
55
; RUN: opt -aa-pipeline=basic-aa -passes=attributor-cgscc -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,NOT_TUNIT_NPM,NOT_TUNIT_OPM,NOT_CGSCC_OPM,IS__CGSCC____,IS________NPM,IS__CGSCC_NPM
66

7+
; FIXME: amdgpu doesn't claim malloc is a thing, so the test is somewhat
8+
; useless except the __kmpc_alloc_shared part which now also covers the important
9+
; part this test was initially designed for, make sure the "is freed" check is
10+
; not sufficient on a GPU.
711
target triple = "amdgcn-amd-amdhsa"
12+
target datalayout = "A5"
813

914
declare noalias i8* @malloc(i64)
1015

@@ -20,6 +25,7 @@ declare void @no_sync_func(i8* nocapture %p) nofree nosync willreturn
2025

2126
declare void @nofree_func(i8* nocapture %p) nofree nosync willreturn
2227

28+
declare void @usei8(i8* %p)
2329
declare void @foo(i32* %p)
2430

2531
declare void @foo_nounw(i32* %p) nounwind nofree
@@ -663,11 +669,49 @@ define void @test16d(i8 %v, i8** %P) {
663669
store i8* %1, i8** %P
664670
ret void
665671
}
672+
673+
declare i8* @__kmpc_alloc_shared(i64)
674+
declare void @__kmpc_free_shared(i8* nocapture, i64)
675+
676+
define void @test17() {
677+
; IS________OPM-LABEL: define {{[^@]+}}@test17() {
678+
; IS________OPM-NEXT: [[TMP1:%.*]] = tail call noalias i8* @__kmpc_alloc_shared(i64 noundef 4)
679+
; IS________OPM-NEXT: tail call void @usei8(i8* noalias nocapture nofree [[TMP1]]) #[[ATTR6:[0-9]+]]
680+
; IS________OPM-NEXT: tail call void @__kmpc_free_shared(i8* noalias nocapture [[TMP1]], i64 noundef 4)
681+
; IS________OPM-NEXT: ret void
682+
;
683+
; IS________NPM-LABEL: define {{[^@]+}}@test17() {
684+
; IS________NPM-NEXT: [[TMP1:%.*]] = alloca i8, i64 4, align 1, addrspace(5)
685+
; IS________NPM-NEXT: [[MALLOC_CAST:%.*]] = addrspacecast i8 addrspace(5)* [[TMP1]] to i8*
686+
; IS________NPM-NEXT: tail call void @usei8(i8* noalias nocapture nofree [[MALLOC_CAST]]) #[[ATTR6:[0-9]+]]
687+
; IS________NPM-NEXT: ret void
688+
;
689+
%1 = tail call noalias i8* @__kmpc_alloc_shared(i64 4)
690+
tail call void @usei8(i8* nocapture nofree %1) willreturn nounwind nosync
691+
tail call void @__kmpc_free_shared(i8* %1, i64 4)
692+
ret void
693+
}
694+
695+
define void @test17b() {
696+
; CHECK-LABEL: define {{[^@]+}}@test17b() {
697+
; CHECK-NEXT: [[TMP1:%.*]] = tail call noalias i8* @__kmpc_alloc_shared(i64 noundef 4)
698+
; CHECK-NEXT: tail call void @usei8(i8* nofree [[TMP1]]) #[[ATTR6:[0-9]+]]
699+
; CHECK-NEXT: tail call void @__kmpc_free_shared(i8* nocapture [[TMP1]], i64 noundef 4)
700+
; CHECK-NEXT: ret void
701+
;
702+
%1 = tail call noalias i8* @__kmpc_alloc_shared(i64 4)
703+
tail call void @usei8(i8* nofree %1) willreturn nounwind nosync
704+
tail call void @__kmpc_free_shared(i8* %1, i64 4)
705+
ret void
706+
}
707+
708+
666709
;.
667710
; CHECK: attributes #[[ATTR0:[0-9]+]] = { nounwind willreturn }
668711
; CHECK: attributes #[[ATTR1:[0-9]+]] = { nofree nosync willreturn }
669712
; CHECK: attributes #[[ATTR2:[0-9]+]] = { nofree nounwind }
670713
; CHECK: attributes #[[ATTR3]] = { noreturn }
671714
; CHECK: attributes #[[ATTR4:[0-9]+]] = { argmemonly nofree nosync nounwind willreturn }
672715
; CHECK: attributes #[[ATTR5]] = { nounwind }
716+
; CHECK: attributes #[[ATTR6]] = { nosync nounwind willreturn }
673717
;.

llvm/test/Transforms/OpenMP/spmdization.ll

+6-4
Original file line number | Diff line number | Diff line change
@@ -678,8 +678,9 @@ define internal void @__omp_outlined__2(i32* noalias %.global_tid., i32* noalias
678678
; AMDGPU-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
679679
; AMDGPU-NEXT: entry:
680680
; AMDGPU-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
681-
; AMDGPU-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 4
682-
; AMDGPU-NEXT: [[X_ON_STACK:%.*]] = bitcast i8* [[TMP0]] to i32*
681+
; AMDGPU-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 4, addrspace(5)
682+
; AMDGPU-NEXT: [[MALLOC_CAST:%.*]] = addrspacecast i8 addrspace(5)* [[TMP0]] to i8*
683+
; AMDGPU-NEXT: [[X_ON_STACK:%.*]] = bitcast i8* [[MALLOC_CAST]] to i32*
683684
; AMDGPU-NEXT: call void @use(i32* nocapture [[X_ON_STACK]]) #[[ATTR5]]
684685
; AMDGPU-NEXT: br label [[FOR_COND:%.*]]
685686
; AMDGPU: for.cond:
@@ -722,8 +723,9 @@ define internal void @__omp_outlined__2(i32* noalias %.global_tid., i32* noalias
722723
; AMDGPU-DISABLED-SAME: (i32* noalias nocapture nofree noundef nonnull readonly align 4 dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], i32* noalias nocapture nofree nonnull readnone align 4 dereferenceable(4) [[DOTBOUND_TID_:%.*]]) #[[ATTR0]] {
723724
; AMDGPU-DISABLED-NEXT: entry:
724725
; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
725-
; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 4
726-
; AMDGPU-DISABLED-NEXT: [[X_ON_STACK:%.*]] = bitcast i8* [[TMP0]] to i32*
726+
; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = alloca i8, i64 4, align 4, addrspace(5)
727+
; AMDGPU-DISABLED-NEXT: [[MALLOC_CAST:%.*]] = addrspacecast i8 addrspace(5)* [[TMP0]] to i8*
728+
; AMDGPU-DISABLED-NEXT: [[X_ON_STACK:%.*]] = bitcast i8* [[MALLOC_CAST]] to i32*
727729
; AMDGPU-DISABLED-NEXT: call void @use(i32* nocapture [[X_ON_STACK]]) #[[ATTR5]]
728730
; AMDGPU-DISABLED-NEXT: br label [[FOR_COND:%.*]]
729731
; AMDGPU-DISABLED: for.cond:

0 commit comments

Comments
 (0)