Skip to content

Commit 8696d16

Browse files
committed
[OpenACC][CIR] Implement 'async' lowering for combined constructs
Implementation is 'trivial' as were the rest of the non data clauses, so this implements them, finishing the last non-data/var-list clause for combined constructs. Also ensures this is properly tested.
1 parent 36290b7 commit 8696d16

File tree

3 files changed

+68
-3
lines changed

3 files changed

+68
-3
lines changed

clang/lib/CIR/CodeGen/CIRGenOpenACCClause.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -266,10 +266,12 @@ class OpenACCClauseCIREmitter final
266266
else
267267
operation.getAsyncOperandMutable().append(
268268
createIntExpr(clause.getIntExpr()));
269+
} else if constexpr (isCombinedType<OpTy>) {
270+
applyToComputeOp(clause);
269271
} else {
270272
// TODO: When we've implemented this for everything, switch this to an
271273
// unreachable. Combined constructs remain. Data, enter data, exit data,
272-
// update, combined constructs remain.
274+
// update constructs remain.
273275
return clauseNotImplemented(clause);
274276
}
275277
}

clang/test/CIR/CodeGenOpenACC/combined.cpp

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -947,4 +947,67 @@ extern "C" void acc_combined(int N, int cond) {
947947
// CHECK-NEXT: acc.yield
948948
// CHECK-NEXT: } loc
949949

950+
#pragma acc parallel loop async
951+
for(unsigned I = 0; I < N; ++I);
952+
// CHECK-NEXT: acc.parallel combined(loop) async {
953+
// CHECK-NEXT: acc.loop combined(parallel) {
954+
// CHECK: acc.yield
955+
// CHECK-NEXT: } loc
956+
// CHECK-NEXT: acc.yield
957+
// CHECK-NEXT: } loc
958+
959+
#pragma acc serial loop async(cond)
960+
for(unsigned I = 0; I < N; ++I);
961+
// CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
962+
// CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
963+
// CHECK-NEXT: acc.serial combined(loop) async(%[[CONV_CAST]] : si32) {
964+
// CHECK-NEXT: acc.loop combined(serial) {
965+
// CHECK: acc.yield
966+
// CHECK-NEXT: } loc
967+
// CHECK-NEXT: acc.yield
968+
// CHECK-NEXT: } loc
969+
970+
#pragma acc kernels loop async device_type(nvidia, radeon) async
971+
for(unsigned I = 0; I < N; ++I);
972+
// CHECK-NEXT: acc.kernels combined(loop) async([#acc.device_type<none>, #acc.device_type<nvidia>, #acc.device_type<radeon>]) {
973+
// CHECK-NEXT: acc.loop combined(kernels) {
974+
// CHECK: acc.yield
975+
// CHECK-NEXT: } loc
976+
// CHECK-NEXT: acc.terminator
977+
// CHECK-NEXT: } loc
978+
979+
#pragma acc parallel loop async(3) device_type(nvidia, radeon) async(cond)
980+
for(unsigned I = 0; I < N; ++I);
981+
// CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
982+
// CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32
983+
// CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
984+
// CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
985+
// CHECK-NEXT: acc.parallel combined(loop) async(%[[THREE_CAST]] : si32, %[[CONV_CAST]] : si32 [#acc.device_type<nvidia>], %[[CONV_CAST]] : si32 [#acc.device_type<radeon>]) {
986+
// CHECK-NEXT: acc.loop combined(parallel) {
987+
// CHECK: acc.yield
988+
// CHECK-NEXT: } loc
989+
// CHECK-NEXT: acc.yield
990+
// CHECK-NEXT: } loc
991+
992+
#pragma acc serial loop async device_type(nvidia, radeon) async(cond)
993+
for(unsigned I = 0; I < N; ++I);
994+
// CHECK-NEXT: %[[COND_LOAD:.*]] = cir.load %[[COND]] : !cir.ptr<!s32i>, !s32i
995+
// CHECK-NEXT: %[[CONV_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_LOAD]] : !s32i to si32
996+
// CHECK-NEXT: acc.serial combined(loop) async([#acc.device_type<none>], %[[CONV_CAST]] : si32 [#acc.device_type<nvidia>], %[[CONV_CAST]] : si32 [#acc.device_type<radeon>]) {
997+
// CHECK-NEXT: acc.loop combined(serial) {
998+
// CHECK: acc.yield
999+
// CHECK-NEXT: } loc
1000+
// CHECK-NEXT: acc.yield
1001+
// CHECK-NEXT: } loc
1002+
1003+
#pragma acc kernels loop async(3) device_type(nvidia, radeon) async
1004+
for(unsigned I = 0; I < N; ++I);
1005+
// CHECK-NEXT: %[[THREE_LITERAL:.*]] = cir.const #cir.int<3> : !s32i
1006+
// CHECK-NEXT: %[[THREE_CAST:.*]] = builtin.unrealized_conversion_cast %[[THREE_LITERAL]] : !s32i to si32
1007+
// CHECK-NEXT: acc.kernels combined(loop) async([#acc.device_type<nvidia>, #acc.device_type<radeon>], %[[THREE_CAST]] : si32) {
1008+
// CHECK-NEXT: acc.loop combined(kernels) {
1009+
// CHECK: acc.yield
1010+
// CHECK-NEXT: } loc
1011+
// CHECK-NEXT: acc.terminator
1012+
// CHECK-NEXT: } loc
9501013
}

clang/test/CIR/CodeGenOpenACC/openacc-not-implemented.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ void HelloWorld(int *A, int *B, int *C, int N) {
1313
// expected-error@+1{{ClangIR code gen Not Yet Implemented: OpenACC Clause: private}}
1414
#pragma acc parallel loop private(A)
1515
for(int i = 0; i <5; ++i);
16-
// expected-error@+1{{ClangIR code gen Not Yet Implemented: OpenACC Clause: async}}
17-
#pragma acc parallel loop async
16+
// expected-error@+1{{ClangIR code gen Not Yet Implemented: OpenACC Clause: reduction}}
17+
#pragma acc parallel loop reduction(+:A)
1818
for(int i = 0; i <5; ++i);
1919
}

0 commit comments

Comments
 (0)