From 117adfe5b9f669c1cd43f9fa308b8bb2b3efa605 Mon Sep 17 00:00:00 2001
From: Tuomas Karna
Date: Mon, 18 Nov 2024 23:37:46 +0200
Subject: [PATCH 1/4] SCFToGPU: ParallelToGpuLaunch converts scf.reduce to gpu.all_reduce

---
 mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp | 33 +++++++++++++++++++++--
 1 file changed, 31 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp b/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
index dece254c325fc..ea2f1db244537 100644
--- a/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
+++ b/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
@@ -408,8 +408,8 @@ static LogicalResult processParallelLoop(
   ArrayAttr mapping =
       parallelOp->getAttrOfType<ArrayAttr>(gpu::getMappingAttrName());
 
-  // TODO: Support reductions.
-  if (!mapping || parallelOp.getNumResults() != 0)
+  // TODO: Support multiple reductions.
+  if (!mapping || parallelOp.getNumResults() > 1)
     return failure();
 
   Location loc = parallelOp.getLoc();
@@ -556,6 +556,11 @@ static LogicalResult processParallelLoop(
 
   Block *body = parallelOp.getBody();
   worklist.reserve(worklist.size() + body->getOperations().size());
+  // Include the scf.reduce terminator if it exists and has an operand.
+  if (auto terminator = body->getTerminator();
+      isa<ReduceOp>(terminator) && terminator->getOperands().size() == 1) {
+    worklist.push_back(terminator);
+  }
   for (Operation &op : llvm::reverse(body->without_terminator()))
     worklist.push_back(&op);
   return success();
@@ -648,6 +653,30 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
       rewriter.setInsertionPointAfter(parent);
       leftNestingScope = true;
       seenSideeffects = false;
+    } else if (auto reduceOp = dyn_cast<ReduceOp>(op)) {
+      // Convert scf.reduce op.
+      auto parentLoop = op->getParentOfType<ParallelOp>();
+      if (!parentLoop || op->getOperands().size() != 1) {
+        return failure();
+      }
+      auto operand = op->getOperands().front();
+      auto newValue = cloningMap.lookupOrNull(operand);
+      if (!newValue) {
+        return failure();
+      }
+      // Replace by gpu.all_reduce.
+      auto gpuRedOp = rewriter.create<gpu::AllReduceOp>(loc, newValue);
+      cloningMap.map(parentLoop->getResult(0), gpuRedOp.getResult());
+      // Copy region.
+      rewriter.inlineRegionBefore(reduceOp.getRegion(0), gpuRedOp.getRegion(),
+                                  gpuRedOp.getRegion().begin());
+      // Replace scf.reduce.return with gpu.yield.
+      auto scfReturn = gpuRedOp.getRegion().front().getTerminator();
+      auto ip = rewriter.saveInsertionPoint();
+      rewriter.setInsertionPointToEnd(&gpuRedOp.getRegion().front());
+      rewriter.replaceOpWithNewOp<gpu::YieldOp>(
+          scfReturn, scfReturn->getOperands().front());
+      rewriter.restoreInsertionPoint(ip);
     } else {
       // Otherwise we copy it over.
       Operation *clone = rewriter.clone(*op, cloningMap);

From 6da4b85a86c1ddaabe7a98b0f89841a395ccff0d Mon Sep 17 00:00:00 2001
From: Tuomas Karna
Date: Mon, 13 Jan 2025 19:16:47 +0200
Subject: [PATCH 2/4] SCFToGPU: add scf.parallel reduction tests

---
 .../Conversion/SCFToGPU/parallel_loop.mlir    | 147 ++++++++++++++++++
 1 file changed, 147 insertions(+)

diff --git a/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir b/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir
index 59441e5ed6629..e5cafde39df1f 100644
--- a/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir
+++ b/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir
@@ -428,3 +428,150 @@ func.func @step_invariant() {
 // CHECK: %[[rhs:.*]] = memref.load %[[alloc_1]][%[[dim0]], %[[dim1]]] : memref<1x1xf64>
 // CHECK: %[[sum:.*]] = arith.addf %[[lhs]], %[[rhs]] : f64
 // CHECK: memref.store %[[sum]], %[[alloc_0]][%[[dim0]], %[[dim1]]] : memref<1x1xf64>
+
+// -----
+
+// 1-d parallel reduction mapped to block.x and thread.x.
+
+// CHECK-LABEL: @parallel_reduction_1d
+func.func @parallel_reduction_1d() {
+  %alloc = memref.alloc() : memref<f32>
+  %alloc_0 = memref.alloc() : memref<64xf32>
+  %c1 = arith.constant 1 : index
+  %c64 = arith.constant 64 : index
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  scf.parallel (%arg1) = (%c0) to (%c1) step (%c1) {
+    %0 = scf.parallel (%arg2) = (%c0) to (%c64) step (%c1) init (%cst) -> f32 {
+      %1 = memref.load %alloc_0[%arg2] : memref<64xf32>
+      scf.reduce(%1 : f32) {
+      ^bb0(%arg3: f32, %arg4: f32):
+        %2 = arith.addf %arg3, %arg4 : f32
+        scf.reduce.return %2 : f32
+      }
+    } {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+    memref.store %0, %alloc[] : memref<f32>
+    scf.reduce
+  } {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+  memref.dealloc %alloc : memref<f32>
+  memref.dealloc %alloc_0 : memref<64xf32>
+  return
+}
+
+// CHECK: %[[alloc_0:.*]] = memref.alloc() : memref<f32>
+// CHECK: %[[alloc_1:.*]] = memref.alloc() : memref<64xf32>
+// CHECK: %[[map_0:.*]] = affine.apply #map({{.*}})[{{.*}}, {{.*}}]
+// CHECK: %[[map_1:.*]] = affine.apply #map({{.*}})[{{.*}}, {{.*}}]
+// CHECK: gpu.launch
+// CHECK-SAME: blocks(%[[arg_0:.*]], %{{[^)]*}}, %{{[^)]*}}) in (%{{[^)]*}} = %[[map_0]], %{{[^)]*}} = %{{[^)]*}}, %{{[^)]*}} = %{{[^)]*}})
+// CHECK-SAME: threads(%[[arg_3:.*]], %{{[^)]*}}, %{{[^)]*}}) in (%{{[^)]*}} = %[[map_1]], %{{[^)]*}} = %{{[^)]*}}, %{{[^)]*}} = %{{[^)]*}})
+// CHECK-NEXT: %[[dim0:.*]] = affine.apply #map1(%[[arg_0]])[{{.*}}, {{.*}}]
+// CHECK-NEXT: %[[dim1:.*]] = affine.apply #map1(%[[arg_3]])[{{.*}}, {{.*}}]
+// CHECK-NEXT: %[[src:.*]] = memref.load %[[alloc_1]][%[[dim1]]] : memref<64xf32>
+// CHECK-NEXT: %[[res:.*]] = gpu.all_reduce %[[src]] {
+// CHECK-NEXT: ^bb0(%[[arg12:.*]]: f32, %[[arg13:.*]]: f32):
+// CHECK-NEXT: %[[sum:.*]] = arith.addf %[[arg12]], %[[arg13]] : f32
+// CHECK-NEXT: gpu.yield %[[sum]] : f32
+// CHECK-NEXT: } : (f32) -> f32
+// CHECK-NEXT: memref.store %[[res]], %[[alloc_0]][] : memref<f32>
+
+// -----
+
+// 2-d parallel reduction mapped to block.x and thread.x and thread.y.
+
+// CHECK-LABEL: @parallel_reduction_2d
+func.func @parallel_reduction_2d() {
+  %alloc = memref.alloc() : memref<f32>
+  %alloc_0 = memref.alloc() : memref<8x8xf32>
+  %c1 = arith.constant 1 : index
+  %c8 = arith.constant 8 : index
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  scf.parallel (%arg1) = (%c0) to (%c1) step (%c1) {
+    %0 = scf.parallel (%arg2, %arg3) = (%c0, %c0) to (%c8, %c8) step (%c1, %c1) init (%cst) -> f32 {
+      %1 = memref.load %alloc_0[%arg2, %arg3] : memref<8x8xf32>
+      scf.reduce(%1 : f32) {
+      ^bb0(%arg4: f32, %arg5: f32):
+        %2 = arith.addf %arg4, %arg5 : f32
+        scf.reduce.return %2 : f32
+      }
+    } {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>, #gpu.loop_dim_map<processor = thread_y, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+    memref.store %0, %alloc[] : memref<f32>
+    scf.reduce
+  } {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+  memref.dealloc %alloc : memref<f32>
+  memref.dealloc %alloc_0 : memref<8x8xf32>
+  return
+}
+
+// CHECK: %[[alloc_0:.*]] = memref.alloc() : memref<f32>
+// CHECK: %[[alloc_1:.*]] = memref.alloc() : memref<8x8xf32>
+// CHECK: %[[map_0:.*]] = affine.apply #map({{.*}})[{{.*}}, {{.*}}]
+// CHECK: %[[map_1:.*]] = affine.apply #map({{.*}})[{{.*}}, {{.*}}]
+// CHECK: %[[map_2:.*]] = affine.apply #map({{.*}})[{{.*}}, {{.*}}]
+// CHECK: gpu.launch
+// CHECK-SAME: blocks(%[[arg_0:.*]], %{{[^)]*}}, %{{[^)]*}}) in (%{{[^)]*}} = %[[map_0]], %{{[^)]*}} = %{{[^)]*}}, %{{[^)]*}} = %{{[^)]*}})
+// CHECK-SAME: threads(%[[arg_3:.*]], %[[arg_4:.*]], %{{[^)]*}}) in (%{{[^)]*}} = %[[map_1]], %{{[^)]*}} = %[[map_2]], %{{[^)]*}} = %{{[^)]*}})
+// CHECK-NEXT: %[[dim0:.*]] = affine.apply #map1(%[[arg_0]])[{{.*}}, {{.*}}]
+// CHECK-NEXT: %[[dim1:.*]] = affine.apply #map1(%[[arg_3]])[{{.*}}, {{.*}}]
+// CHECK-NEXT: %[[dim2:.*]] = affine.apply #map1(%[[arg_4]])[{{.*}}, {{.*}}]
+// CHECK-NEXT: %[[src:.*]] = memref.load %[[alloc_1]][%[[dim1]], %[[dim2]]] : memref<8x8xf32>
+// CHECK-NEXT: %[[res:.*]] = gpu.all_reduce %[[src]] {
+// CHECK-NEXT: ^bb0(%[[arg12:.*]]: f32, %[[arg13:.*]]: f32):
+// CHECK-NEXT: %[[sum:.*]] = arith.addf %[[arg12]], %[[arg13]] : f32
+// CHECK-NEXT: gpu.yield %[[sum]] : f32
+// CHECK-NEXT: } : (f32) -> f32
+// CHECK-NEXT: memref.store %[[res]], %[[alloc_0]][] : memref<f32>
+
+// -----
+
+// tiled 1-d parallel reduction mapped to block.x and thread.x.
+
+// CHECK-LABEL: @parallel_reduction_1d_tiled
+func.func @parallel_reduction_1d_tiled() {
+  %c128 = arith.constant 128 : index
+  %c1 = arith.constant 1 : index
+  %c64 = arith.constant 64 : index
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %alloc_0 = memref.alloc() : memref<8192xf32>
+  %alloc_1 = memref.alloc() : memref<64xf32>
+  scf.parallel (%arg1) = (%c0) to (%c64) step (%c1) {
+    %subview = memref.subview %alloc_1[%arg1] [1] [1] : memref<64xf32> to memref<f32, strided<[], offset: ?>>
+    %0 = affine.apply affine_map<(d0) -> (d0 * 128)>(%arg1)
+    %subview_1 = memref.subview %alloc_0[%0] [128] [1] : memref<8192xf32> to memref<128xf32, strided<[1], offset: ?>>
+    %1 = scf.parallel (%arg2) = (%c0) to (%c128) step (%c1) init (%cst) -> f32 {
+      %2 = memref.load %subview_1[%arg2] : memref<128xf32, strided<[1], offset: ?>>
+      scf.reduce(%2 : f32) {
+      ^bb0(%arg3: f32, %arg4: f32):
+        %3 = arith.addf %arg3, %arg4 : f32
+        scf.reduce.return %3 : f32
+      }
+    } {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+    memref.store %1, %subview[] : memref<f32, strided<[], offset: ?>>
+    scf.reduce
+  } {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+  memref.dealloc %alloc_0 : memref<8192xf32>
+  memref.dealloc %alloc_1 : memref<64xf32>
+  return
+}
+
+// CHECK: %[[alloc_0:.*]] = memref.alloc() : memref<8192xf32>
+// CHECK: %[[alloc_1:.*]] = memref.alloc() : memref<64xf32>
+// CHECK: %[[map_0:.*]] = affine.apply #map({{.*}})[{{.*}}, {{.*}}]
+// CHECK: %[[map_1:.*]] = affine.apply #map({{.*}})[{{.*}}, {{.*}}]
+// CHECK: gpu.launch
+// CHECK-SAME: blocks(%[[arg_0:.*]], %{{[^)]*}}, %{{[^)]*}}) in (%{{[^)]*}} = %[[map_0]], %{{[^)]*}} = %{{[^)]*}}, %{{[^)]*}} = %{{[^)]*}})
+// CHECK-SAME: threads(%[[arg_3:.*]], %{{[^)]*}}, %{{[^)]*}}) in (%{{[^)]*}} = %[[map_1]], %{{[^)]*}} = %{{[^)]*}}, %{{[^)]*}} = %{{[^)]*}})
+// CHECK-NEXT: %[[dim0:.*]] = affine.apply #map1(%[[arg_0]])[{{.*}}, {{.*}}]
+// CHECK-NEXT: %[[dst:.*]] = memref.subview %[[alloc_1]][%[[dim0]]] [1] [1] : memref<64xf32>
+// CHECK-NEXT: %[[dim1:.*]] = affine.apply #map2(%[[dim0]])
+// CHECK-NEXT: %[[tile:.*]] = memref.subview %[[alloc_0]][%[[dim1]]] [128] [1] : memref<8192xf32>
+// CHECK-NEXT: %[[dim2:.*]] = affine.apply #map1(%[[arg_3]])[{{.*}}, {{.*}}]
+// CHECK-NEXT: %[[src:.*]] = memref.load %[[tile]][%[[dim2]]] : memref<128xf32, strided<[1], offset: ?>>
+// CHECK-NEXT: %[[res:.*]] = gpu.all_reduce %[[src]] {
+// CHECK-NEXT: ^bb0(%[[arg12:.*]]: f32, %[[arg13:.*]]: f32):
+// CHECK-NEXT: %[[sum:.*]] = arith.addf %[[arg12]], %[[arg13]] : f32
+// CHECK-NEXT: gpu.yield %[[sum]] : f32
+// CHECK-NEXT: } : (f32) -> f32
+// CHECK-NEXT: memref.store %[[res]], %[[dst]][] : memref<f32, strided<[], offset: ?>>

From ac8f767d6269ec5469b81e0d9bffe9b24d321e72 Mon Sep 17 00:00:00 2001
From: Tuomas Karna
Date: Tue, 14 Jan 2025 18:51:53 +0200
Subject: [PATCH 3/4] SCFToGPU: better verification for gpu.all_reduce op

---
 mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp b/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
index ea2f1db244537..1ac95ebcdc87f 100644
--- a/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
+++ b/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
@@ -656,14 +656,17 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
     } else if (auto reduceOp = dyn_cast<ReduceOp>(op)) {
       // Convert scf.reduce op.
       auto parentLoop = op->getParentOfType<ParallelOp>();
-      if (!parentLoop || op->getOperands().size() != 1) {
+      if (!parentLoop || op->getOperands().size() != 1)
         return failure();
-      }
       auto operand = op->getOperands().front();
       auto newValue = cloningMap.lookupOrNull(operand);
-      if (!newValue) {
+      if (!newValue || !operand.getType().isSignlessIntOrFloat())
+        return failure();
+      // Ensure reduction region is isolated from above.
+      llvm::SetVector<Value> externalValues;
+      getUsedValuesDefinedAbove(reduceOp.getRegion(0), externalValues);
+      if (externalValues.size())
         return failure();
-      }
       // Replace by gpu.all_reduce.
       auto gpuRedOp = rewriter.create<gpu::AllReduceOp>(loc, newValue);
       cloningMap.map(parentLoop->getResult(0), gpuRedOp.getResult());

From f8bc8ba1814829bca13893200fc9d4145f6e9036 Mon Sep 17 00:00:00 2001
From: Tuomas Karna
Date: Tue, 14 Jan 2025 21:26:27 +0200
Subject: [PATCH 4/4] SCFToGPU: add negative tests

---
 .../Conversion/SCFToGPU/parallel_loop.mlir    | 66 +++++++++++++++++++
 1 file changed, 66 insertions(+)

diff --git a/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir b/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir
index e5cafde39df1f..1dbce05be85b4 100644
--- a/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir
+++ b/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir
@@ -575,3 +575,69 @@ func.func @parallel_reduction_1d_tiled() {
 // CHECK-NEXT: gpu.yield %[[sum]] : f32
 // CHECK-NEXT: } : (f32) -> f32
 // CHECK-NEXT: memref.store %[[res]], %[[dst]][] : memref<f32, strided<[], offset: ?>>
+
+// -----
+
+// 1-d parallel reduction, unsigned int. Cannot be mapped.
+
+// CHECK-LABEL: @parallel_reduction_1d_uint
+func.func @parallel_reduction_1d_uint(%cst : ui32) {
+  %alloc = memref.alloc() : memref<ui32>
+  %alloc_0 = memref.alloc() : memref<64xui32>
+  %c1 = arith.constant 1 : index
+  %c64 = arith.constant 64 : index
+  %c0 = arith.constant 0 : index
+  scf.parallel (%arg1) = (%c0) to (%c1) step (%c1) {
+    %0 = scf.parallel (%arg2) = (%c0) to (%c64) step (%c1) init (%cst) -> ui32 {
+      %1 = memref.load %alloc_0[%arg2] : memref<64xui32>
+      scf.reduce(%1 : ui32) {
+      ^bb0(%arg3: ui32, %arg4: ui32):
+        scf.reduce.return %arg3 : ui32
+      }
+    } {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+    memref.store %0, %alloc[] : memref<ui32>
+    scf.reduce
+  } {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+  memref.dealloc %alloc : memref<ui32>
+  memref.dealloc %alloc_0 : memref<64xui32>
+  return
+}
+
+// CHECK: scf.parallel
+// CHECK-NEXT: scf.parallel
+// CHECK: scf.reduce
+
+// -----
+
+// 1-d parallel reduction, not isolated from above. Cannot be mapped.
+
+// CHECK-LABEL: @parallel_reduction_1d_outside
+func.func @parallel_reduction_1d_outside() {
+  %alloc = memref.alloc() : memref<f32>
+  %alloc_0 = memref.alloc() : memref<64xf32>
+  %c1 = arith.constant 1 : index
+  %c64 = arith.constant 64 : index
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %const = arith.constant 1.000000e+00 : f32
+  scf.parallel (%arg1) = (%c0) to (%c1) step (%c1) {
+    %0 = scf.parallel (%arg2) = (%c0) to (%c64) step (%c1) init (%cst) -> f32 {
+      %1 = memref.load %alloc_0[%arg2] : memref<64xf32>
+      scf.reduce(%1 : f32) {
+      ^bb0(%arg3: f32, %arg4: f32):
+        %2 = arith.addf %arg3, %arg4 : f32
+        %3 = arith.addf %2, %const : f32
+        scf.reduce.return %3 : f32
+      }
+    } {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+    memref.store %0, %alloc[] : memref<f32>
+    scf.reduce
+  } {mapping = [#gpu.loop_dim_map<processor = block_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}
+  memref.dealloc %alloc : memref<f32>
+  memref.dealloc %alloc_0 : memref<64xf32>
+  return
+}
+
+// CHECK: scf.parallel
+// CHECK-NEXT: scf.parallel
+// CHECK: scf.reduce
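
For reviewers, a minimal sketch of the lowering these patches implement, distilled from the tests above. This is illustrative only and not part of the patch series itself; the SSA names (%buf, %zero, %c0, %c64, %c1) are made up, and the surrounding gpu.launch produced by the pass is omitted. A single-result scf.parallel whose body ends in an scf.reduce, such as

  %sum = scf.parallel (%i) = (%c0) to (%c64) step (%c1) init (%zero) -> f32 {
    %v = memref.load %buf[%i] : memref<64xf32>
    scf.reduce(%v : f32) {
    ^bb0(%lhs: f32, %rhs: f32):
      %s = arith.addf %lhs, %rhs : f32
      scf.reduce.return %s : f32
    }
  } {mapping = [#gpu.loop_dim_map<processor = thread_x, map = (d0) -> (d0), bound = (d0) -> (d0)>]}

is rewritten inside the generated gpu.launch body so that the scf.reduce region becomes the body of a gpu.all_reduce, with scf.reduce.return replaced by gpu.yield:

  %sum = gpu.all_reduce %v {
  ^bb0(%lhs: f32, %rhs: f32):
    %s = arith.addf %lhs, %rhs : f32
    gpu.yield %s : f32
  } : (f32) -> f32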