diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp index e6dd6f135884e..8002d08fdbd27 100644 --- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp +++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp @@ -149,10 +149,13 @@ struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern { Value widthOrZeroIfOutside = rewriter.create(loc, int32Type, add, negwidth); Value dstLane; - // TODO: Add support for gpu::ShuffleMode::UP and gpu::ShuffleMode::DOWN. // TODO: Use ds_swizzle for XOR when step/offsets are constants for better // perf. switch (op.getMode()) { + case gpu::ShuffleMode::UP: + dstLane = rewriter.create(loc, int32Type, srcLaneId, + adaptor.getOffset()); + break; case gpu::ShuffleMode::DOWN: dstLane = rewriter.create(loc, int32Type, srcLaneId, adaptor.getOffset()); diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir index 071cae9d5789f..999b383a3b8db 100644 --- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir +++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir @@ -660,7 +660,7 @@ gpu.module @test_module { gpu.module @test_module { // CHECK-LABEL: func @gpu_shuffle() - func.func @gpu_shuffle() -> (f32, f32, f32) { + func.func @gpu_shuffle() -> (f32, f32, f32, f32) { // CHECK: %[[#VALUE:]] = llvm.mlir.constant(1.000000e+00 : f32) : f32 %arg0 = arith.constant 1.0 : f32 // CHECK: %[[#OFFSET:]] = llvm.mlir.constant(4 : i32) : i32 @@ -693,7 +693,22 @@ gpu.module @test_module { // CHECK: %[[#CAST_VALUE:]] = llvm.bitcast %[[#VALUE]] : f32 to i32 // CHECK: %[[#PERMUTE:]] = rocdl.ds_bpermute %[[#ALIGNED_DST_LANE]], %[[#CAST_VALUE]] : (i32, i32) -> i32 // CHECK: %[[#CAST_SHFL_VALUE:]] = llvm.bitcast %[[#PERMUTE]] : i32 to f32 - %shfli, %predi = gpu.shuffle idx %arg0, %arg1, %arg2 : f32 + %shfli, %predi = gpu.shuffle idx %arg0, %arg1, %arg2 : f32 + // *** UP mode shuffle *** + // CHECK: %[[#LANE_ID:]] = rocdl.mbcnt.hi + // CHECK: %[[#ZERO:]] = llvm.mlir.constant(0 : i32) : i32 + // CHECK: %[[#NEG_WIDTH:]] = llvm.sub %[[#ZERO]], %[[#WIDTH]] : i32 + // CHECK: %[[#ADD:]] = llvm.add %[[#LANE_ID]], %[[#WIDTH]] : i32 + // CHECK: %[[#WARP_OR_ZERO:]] = llvm.and %[[#ADD]], %[[#NEG_WIDTH]] : i32 + // CHECK: %[[#UP:]] = llvm.sub %[[#LANE_ID]], %{{.*}} : i32 + // CHECK: %[[#CMP:]] = llvm.icmp "slt" %[[#UP]], %[[#WARP_OR_ZERO]] : i32 + // CHECK: %[[#DST_LANE:]] = llvm.select %[[#CMP]], %[[#UP]], %{{.*}} : i1, i32 + // CHECK: %[[#TWO:]] = llvm.mlir.constant(2 : i32) : i32 + // CHECK: %[[#ALIGNED_DST_LANE:]] = llvm.shl %[[#DST_LANE]], %[[#TWO]] : i32 + // CHECK: %[[#CAST_VALUE:]] = llvm.bitcast %[[#VALUE]] : f32 to i32 + // CHECK: %[[#PERMUTE:]] = rocdl.ds_bpermute %[[#ALIGNED_DST_LANE]], %[[#CAST_VALUE]] : (i32, i32) -> i32 + // CHECK: %[[#CAST_SHFL_VALUE:]] = llvm.bitcast %[[#PERMUTE]] : i32 to f32 + %shflu, %predu = gpu.shuffle up %arg0, %arg1, %arg2 : f32 // CHECK: %[[#LANE_ID:]] = rocdl.mbcnt.hi // CHECK: %[[#ZERO:]] = llvm.mlir.constant(0 : i32) : i32 // CHECK: %[[#NEG_WIDTH:]] = llvm.sub %[[#ZERO]], %[[#WIDTH]] : i32 @@ -708,7 +723,7 @@ gpu.module @test_module { // CHECK: %[[#PERMUTE:]] = rocdl.ds_bpermute %[[#ALIGNED_DST_LANE]], %[[#CAST_VALUE]] : (i32, i32) -> i32 // CHECK: %[[#CAST_SHFL_VALUE:]] = llvm.bitcast %[[#PERMUTE]] : i32 to f32 %shfld, %predd = gpu.shuffle down %arg0, %arg1, %arg2 : f32 - func.return %shfl, %shfli, %shfld : f32, f32, f32 + func.return %shfl, %shfli, %shflu, %shfld : f32, f32, f32, f32 } // CHECK-LABEL: func @gpu_shuffle_vec