
[SCFToGPU] Convert scf.parallel+scf.reduce to gpu.all_reduce #122782

Merged · 4 commits · Jan 23, 2025
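
In short: processParallelLoop previously bailed out on any scf.parallel that produced results; it now accepts a single reduction, and the lowering rewrites the terminating scf.reduce into a gpu.all_reduce, inlining the reduction region and replacing scf.reduce.return with gpu.yield. A minimal sketch of the rewrite (value and buffer names illustrative; the new tests below exercise the full lowering):

%sum = scf.parallel (%i) = (%c0) to (%c64) step (%c1) init (%cst) -> f32 {
  %v = memref.load %buf[%i] : memref<64xf32>
  scf.reduce(%v : f32) {
  ^bb0(%lhs: f32, %rhs: f32):
    %s = arith.addf %lhs, %rhs : f32
    scf.reduce.return %s : f32
  }
} {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}

// becomes, inside the generated gpu.launch:

%v = memref.load %buf[%tid] : memref<64xf32>
%sum = gpu.all_reduce %v {
^bb0(%lhs: f32, %rhs: f32):
  %s = arith.addf %lhs, %rhs : f32
  gpu.yield %s : f32
} : (f32) -> f32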
36 changes: 34 additions & 2 deletions mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
@@ -408,8 +408,8 @@ static LogicalResult processParallelLoop(
ArrayAttr mapping =
parallelOp->getAttrOfType<ArrayAttr>(gpu::getMappingAttrName());

-  // TODO: Support reductions.
-  if (!mapping || parallelOp.getNumResults() != 0)
+  // TODO: Support multiple reductions.
+  if (!mapping || parallelOp.getNumResults() > 1)
return failure();

Location loc = parallelOp.getLoc();
@@ -556,6 +556,11 @@ static LogicalResult processParallelLoop(

Block *body = parallelOp.getBody();
worklist.reserve(worklist.size() + body->getOperations().size());
// Include the scf.reduce terminator if it exists and has an operand. Pushed
// first, it is popped off the worklist, and thus processed, last.
if (auto terminator = body->getTerminator();
isa<scf::ReduceOp>(terminator) && terminator->getOperands().size() == 1) {
worklist.push_back(terminator);
}
for (Operation &op : llvm::reverse(body->without_terminator()))
worklist.push_back(&op);
return success();
@@ -648,6 +653,33 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
rewriter.setInsertionPointAfter(parent);
leftNestingScope = true;
seenSideeffects = false;
} else if (auto reduceOp = dyn_cast<scf::ReduceOp>(op)) {
// Convert the scf.reduce op to gpu.all_reduce.
auto parentLoop = op->getParentOfType<ParallelOp>();
if (!parentLoop || op->getOperands().size() != 1)
return failure();
auto operand = op->getOperands().front();
auto newValue = cloningMap.lookupOrNull(operand);
if (!newValue || !operand.getType().isSignlessIntOrFloat())
return failure();
// Ensure the reduction region is isolated from above, as gpu.all_reduce
// bodies must not capture values defined outside them.
llvm::SetVector<Value> externalValues;
getUsedValuesDefinedAbove(reduceOp.getRegion(0), externalValues);
if (!externalValues.empty())
return failure();
// Replace with gpu.all_reduce.
auto gpuRedOp = rewriter.create<gpu::AllReduceOp>(loc, newValue);
cloningMap.map(parentLoop->getResult(0), gpuRedOp.getResult());
// Copy region.
rewriter.inlineRegionBefore(reduceOp.getRegion(0), gpuRedOp.getRegion(),
gpuRedOp.getRegion().begin());
// Replace scf.reduce.return with gpu.yield.
auto scfReturn = gpuRedOp.getRegion().front().getTerminator();
auto ip = rewriter.saveInsertionPoint();
rewriter.setInsertionPointToEnd(&gpuRedOp.getRegion().front());
rewriter.replaceOpWithNewOp<gpu::YieldOp>(
scfReturn, scfReturn->getOperands().front());
rewriter.restoreInsertionPoint(ip);
} else {
// Otherwise we copy it over.
Operation *clone = rewriter.clone(*op, cloningMap);
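
Note that the new path only fires when the scf.reduce has exactly one operand of signless integer or float type and its region captures nothing defined above it; otherwise the pattern returns failure() and the loop is left untouched, as the last two tests below verify. To run the updated test by hand, an invocation along these lines should work (pass and tool flags assumed from the existing SCFToGPU test setup, which this diff does not show):

mlir-opt --convert-parallel-loops-to-gpu --split-input-file mlir/test/Conversion/SCFToGPU/parallel_loop.mlir | FileCheck mlir/test/Conversion/SCFToGPU/parallel_loop.mlir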
213 changes: 213 additions & 0 deletions mlir/test/Conversion/SCFToGPU/parallel_loop.mlir
@@ -428,3 +428,216 @@ func.func @step_invariant() {
// CHECK: %[[rhs:.*]] = memref.load %[[alloc_1]][%[[dim0]], %[[dim1]]] : memref<1x1xf64>
// CHECK: %[[sum:.*]] = arith.addf %[[lhs]], %[[rhs]] : f64
// CHECK: memref.store %[[sum]], %[[alloc_0]][%[[dim0]], %[[dim1]]] : memref<1x1xf64>

// -----

// 1-d parallel reduction mapped to block.x and thread.x.

// CHECK-LABEL: @parallel_reduction_1d
func.func @parallel_reduction_1d() {
%alloc = memref.alloc() : memref<f32>
%alloc_0 = memref.alloc() : memref<64xf32>
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
scf.parallel (%arg1) = (%c0) to (%c1) step (%c1) {
%0 = scf.parallel (%arg2) = (%c0) to (%c64) step (%c1) init (%cst) -> f32 {
%1 = memref.load %alloc_0[%arg2] : memref<64xf32>
scf.reduce(%1 : f32) {
^bb0(%arg3: f32, %arg4: f32):
%2 = arith.addf %arg3, %arg4 : f32
scf.reduce.return %2 : f32
}
} {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
memref.store %0, %alloc[] : memref<f32>
scf.reduce
} {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
memref.dealloc %alloc : memref<f32>
memref.dealloc %alloc_0 : memref<64xf32>
return
}

// CHECK: %[[alloc_0:.*]] = memref.alloc() : memref<f32>
// CHECK: %[[alloc_1:.*]] = memref.alloc() : memref<64xf32>
// CHECK: %[[map_0:.*]] = affine.apply #map({{.*}})[{{.*}}, {{.*}}]
// CHECK: %[[map_1:.*]] = affine.apply #map({{.*}})[{{.*}}, {{.*}}]
// CHECK: gpu.launch
// CHECK-SAME: blocks(%[[arg_0:.*]], %{{[^)]*}}, %{{[^)]*}}) in (%{{[^)]*}} = %[[map_0]], %{{[^)]*}} = %{{[^)]*}}, %{{[^)]*}} = %{{[^)]*}})
// CHECK-SAME: threads(%[[arg_3:.*]], %{{[^)]*}}, %{{[^)]*}}) in (%{{[^)]*}} = %[[map_1]], %{{[^)]*}} = %{{[^)]*}}, %{{[^)]*}} = %{{[^)]*}})
// CHECK-NEXT: %[[dim0:.*]] = affine.apply #map1(%[[arg_0]])[{{.*}}, {{.*}}]
// CHECK-NEXT: %[[dim1:.*]] = affine.apply #map1(%[[arg_3]])[{{.*}}, {{.*}}]
// CHECK-NEXT: %[[src:.*]] = memref.load %[[alloc_1]][%[[dim1]]] : memref<64xf32>
// CHECK-NEXT: %[[res:.*]] = gpu.all_reduce %[[src]] {
// CHECK-NEXT: ^bb0(%[[arg12:.*]]: f32, %[[arg13:.*]]: f32):
// CHECK-NEXT: %[[sum:.*]] = arith.addf %[[arg12]], %[[arg13]] : f32
// CHECK-NEXT: gpu.yield %[[sum]] : f32
// CHECK-NEXT: } : (f32) -> f32
// CHECK-NEXT: memref.store %[[res]], %[[alloc_0]][] : memref<f32>

// -----

// 2-d parallel reduction mapped to block.x, thread.x and thread.y.

// CHECK-LABEL: @parallel_reduction_2d
func.func @parallel_reduction_2d() {
%alloc = memref.alloc() : memref<f32>
%alloc_0 = memref.alloc() : memref<8x8xf32>
%c1 = arith.constant 1 : index
%c8 = arith.constant 8 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
scf.parallel (%arg1) = (%c0) to (%c1) step (%c1) {
%0 = scf.parallel (%arg2, %arg3) = (%c0, %c0) to (%c8, %c8) step (%c1, %c1) init (%cst) -> f32 {
%1 = memref.load %alloc_0[%arg2, %arg3] : memref<8x8xf32>
scf.reduce(%1 : f32) {
^bb0(%arg4: f32, %arg5: f32):
%2 = arith.addf %arg4, %arg5 : f32
scf.reduce.return %2 : f32
}
} {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>, #gpu.loop_dim_map<processor = thread_y, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
memref.store %0, %alloc[] : memref<f32>
scf.reduce
} {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
memref.dealloc %alloc : memref<f32>
memref.dealloc %alloc_0 : memref<8x8xf32>
return
}

// CHECK: %[[alloc_0:.*]] = memref.alloc() : memref<f32>
// CHECK: %[[alloc_1:.*]] = memref.alloc() : memref<8x8xf32>
// CHECK: %[[map_0:.*]] = affine.apply #map({{.*}})[{{.*}}, {{.*}}]
// CHECK: %[[map_1:.*]] = affine.apply #map({{.*}})[{{.*}}, {{.*}}]
// CHECK: %[[map_2:.*]] = affine.apply #map({{.*}})[{{.*}}, {{.*}}]
// CHECK: gpu.launch
// CHECK-SAME: blocks(%[[arg_0:.*]], %{{[^)]*}}, %{{[^)]*}}) in (%{{[^)]*}} = %[[map_0]], %{{[^)]*}} = %{{[^)]*}}, %{{[^)]*}} = %{{[^)]*}})
// CHECK-SAME: threads(%[[arg_3:.*]], %[[arg_4:.*]], %{{[^)]*}}) in (%{{[^)]*}} = %[[map_1]], %{{[^)]*}} = %[[map_2]], %{{[^)]*}} = %{{[^)]*}})
// CHECK-NEXT: %[[dim0:.*]] = affine.apply #map1(%[[arg_0]])[{{.*}}, {{.*}}]
// CHECK-NEXT: %[[dim1:.*]] = affine.apply #map1(%[[arg_3]])[{{.*}}, {{.*}}]
// CHECK-NEXT: %[[dim2:.*]] = affine.apply #map1(%[[arg_4]])[{{.*}}, {{.*}}]
// CHECK-NEXT: %[[src:.*]] = memref.load %[[alloc_1]][%[[dim1]], %[[dim2]]] : memref<8x8xf32>
// CHECK-NEXT: %[[res:.*]] = gpu.all_reduce %[[src]] {
// CHECK-NEXT: ^bb0(%[[arg12:.*]]: f32, %[[arg13:.*]]: f32):
// CHECK-NEXT: %[[sum:.*]] = arith.addf %[[arg12]], %[[arg13]] : f32
// CHECK-NEXT: gpu.yield %[[sum]] : f32
// CHECK-NEXT: } : (f32) -> f32
// CHECK-NEXT: memref.store %[[res]], %[[alloc_0]][] : memref<f32>

// -----

// tiled 1-d parallel reduction mapped to block.x and thread.x.

// CHECK-LABEL: @parallel_reduction_1d_tiled
func.func @parallel_reduction_1d_tiled() {
%c128 = arith.constant 128 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%alloc_0 = memref.alloc() : memref<8192xf32>
%alloc_1 = memref.alloc() : memref<64xf32>
scf.parallel (%arg1) = (%c0) to (%c64) step (%c1) {
%subview = memref.subview %alloc_1[%arg1] [1] [1] : memref<64xf32> to memref<f32, strided<[], offset: ?>>
%0 = affine.apply affine_map<(d0) -> (d0 * 128)>(%arg1)
%subview_1 = memref.subview %alloc_0[%0] [128] [1] : memref<8192xf32> to memref<128xf32, strided<[1], offset: ?>>
%1 = scf.parallel (%arg2) = (%c0) to (%c128) step (%c1) init (%cst) -> f32 {
%2 = memref.load %subview_1[%arg2] : memref<128xf32, strided<[1], offset: ?>>
scf.reduce(%2 : f32) {
^bb0(%arg3: f32, %arg4: f32):
%3 = arith.addf %arg3, %arg4 : f32
scf.reduce.return %3 : f32
}
} {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
memref.store %1, %subview[] : memref<f32, strided<[], offset: ?>>
scf.reduce
} {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
memref.dealloc %alloc_0 : memref<8192xf32>
memref.dealloc %alloc_1 : memref<64xf32>
return
}

// CHECK: %[[alloc_0:.*]] = memref.alloc() : memref<8192xf32>
// CHECK: %[[alloc_1:.*]] = memref.alloc() : memref<64xf32>
// CHECK: %[[map_0:.*]] = affine.apply #map({{.*}})[{{.*}}, {{.*}}]
// CHECK: %[[map_1:.*]] = affine.apply #map({{.*}})[{{.*}}, {{.*}}]
// CHECK: gpu.launch
// CHECK-SAME: blocks(%[[arg_0:.*]], %{{[^)]*}}, %{{[^)]*}}) in (%{{[^)]*}} = %[[map_0]], %{{[^)]*}} = %{{[^)]*}}, %{{[^)]*}} = %{{[^)]*}})
// CHECK-SAME: threads(%[[arg_3:.*]], %{{[^)]*}}, %{{[^)]*}}) in (%{{[^)]*}} = %[[map_1]], %{{[^)]*}} = %{{[^)]*}}, %{{[^)]*}} = %{{[^)]*}})
// CHECK-NEXT: %[[dim0:.*]] = affine.apply #map1(%[[arg_0]])[{{.*}}, {{.*}}]
// CHECK-NEXT: %[[dst:.*]] = memref.subview %[[alloc_1]][%[[dim0]]] [1] [1] : memref<64xf32>
// CHECK-NEXT: %[[dim1:.*]] = affine.apply #map2(%[[dim0]])
// CHECK-NEXT: %[[tile:.*]] = memref.subview %[[alloc_0]][%[[dim1]]] [128] [1] : memref<8192xf32>
// CHECK-NEXT: %[[dim2:.*]] = affine.apply #map1(%[[arg_3]])[{{.*}}, {{.*}}]
// CHECK-NEXT: %[[src:.*]] = memref.load %[[tile]][%[[dim2]]] : memref<128xf32, strided<[1], offset: ?>>
// CHECK-NEXT: %[[res:.*]] = gpu.all_reduce %[[src]] {
// CHECK-NEXT: ^bb0(%[[arg12:.*]]: f32, %[[arg13:.*]]: f32):
// CHECK-NEXT: %[[sum:.*]] = arith.addf %[[arg12]], %[[arg13]] : f32
// CHECK-NEXT: gpu.yield %[[sum]] : f32
// CHECK-NEXT: } : (f32) -> f32
// CHECK-NEXT: memref.store %[[res]], %[[dst]][] : memref<f32, strided<[], offset: ?>>

// -----

// 1-d parallel reduction, unsigned int. Cannot be mapped.

// CHECK-LABEL: @parallel_reduction_1d_uint
func.func @parallel_reduction_1d_uint(%cst : ui32) {
%alloc = memref.alloc() : memref<ui32>
%alloc_0 = memref.alloc() : memref<64xui32>
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
scf.parallel (%arg1) = (%c0) to (%c1) step (%c1) {
%0 = scf.parallel (%arg2) = (%c0) to (%c64) step (%c1) init (%cst) -> ui32 {
%1 = memref.load %alloc_0[%arg2] : memref<64xui32>
scf.reduce(%1 : ui32) {
^bb0(%arg3: ui32, %arg4: ui32):
scf.reduce.return %arg3 : ui32
}
} {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
memref.store %0, %alloc[] : memref<ui32>
scf.reduce
} {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
memref.dealloc %alloc : memref<ui32>
memref.dealloc %alloc_0 : memref<64xui32>
return
}

// CHECK: scf.parallel
// CHECK-NEXT: scf.parallel
// CHECK: scf.reduce

// -----

// 1-d parallel reduction, not isolated from above. Cannot be mapped.

// CHECK-LABEL: @parallel_reduction_1d_outside
func.func @parallel_reduction_1d_outside() {
%alloc = memref.alloc() : memref<f32>
%alloc_0 = memref.alloc() : memref<64xf32>
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%const = arith.constant 1.000000e+00 : f32
scf.parallel (%arg1) = (%c0) to (%c1) step (%c1) {
%0 = scf.parallel (%arg2) = (%c0) to (%c64) step (%c1) init (%cst) -> f32 {
%1 = memref.load %alloc_0[%arg2] : memref<64xf32>
scf.reduce(%1 : f32) {
^bb0(%arg3: f32, %arg4: f32):
%2 = arith.addf %arg3, %arg4 : f32
%3 = arith.addf %2, %const : f32
scf.reduce.return %3 : f32
}
} {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
memref.store %0, %alloc[] : memref<f32>
scf.reduce
} {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
memref.dealloc %alloc : memref<f32>
memref.dealloc %alloc_0 : memref<64xf32>
return
}

// CHECK: scf.parallel
// CHECK-NEXT: scf.parallel
// CHECK: scf.reduce