Skip to content

Commit 5f15fee

Browse files
authored
[mlir][amdgpu] Add tensor load store operations (#172686)
Reland #170918. This PR differs from the original one by making the target materialization more restrictive.
1 parent e5fe825 commit 5f15fee

File tree

3 files changed

+108
-7
lines changed

3 files changed

+108
-7
lines changed

mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1587,4 +1587,35 @@ def AMDGPU_MakeDmaDescriptorOp : AMDGPU_MakeDescriptorOp<"make_dma_descriptor">
15871587

15881588
}
15891589

1590+
// Op `tensor_load_to_lds`: loads tensors from global memory to LDS.
// Takes a single operand `$desc` of type AMDGPU_TDMDescriptorType and is
// declared with both MemWrite and MemRead memory effects.
def AMDGPU_TensorLoadToLDSOp :
1591+
AMDGPU_Op<"tensor_load_to_lds", [MemoryEffects<[MemWrite, MemRead]>]>,
1592+
Arguments<(ins AMDGPU_TDMDescriptorType: $desc)> {
1593+
let summary = "Load tensors from global memory to LDS.";
1594+
let description = [{
1595+
Load tensors of up to five dimensions from global memory to LDS.
1596+
1597+
This operation was introduced in gfx1250.
1598+
}];
1599+
1600+
// Textual form: the descriptor operand, the attribute dictionary, then
// `:` followed by the (qualified) descriptor type.
let assemblyFormat = [{
1601+
$desc attr-dict `:` qualified(type($desc))
1602+
}];
1603+
}
1604+
1605+
// Op `tensor_store_from_lds`: stores tensors from LDS to global memory.
// Takes a single operand `$desc` of type AMDGPU_TDMDescriptorType and is
// declared with both MemWrite and MemRead memory effects.
def AMDGPU_TensorStoreFromLDSOp :
1606+
AMDGPU_Op<"tensor_store_from_lds", [MemoryEffects<[MemWrite, MemRead]>]>,
1607+
Arguments<(ins AMDGPU_TDMDescriptorType: $desc)> {
1608+
1609+
let summary = "Store tensors from LDS to global memory.";
1610+
let description = [{
1611+
Store tensors of up to five dimensions from LDS to global memory.
1612+
1613+
This operation was introduced in gfx1250.
1614+
}];
1615+
1616+
// Textual form: the descriptor operand, the attribute dictionary, then
// `:` followed by the (qualified) descriptor type.
let assemblyFormat = [{
1617+
$desc attr-dict `:` qualified(type($desc))
1618+
}];
1619+
}
1620+
15901621
#endif // AMDGPU

mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp

Lines changed: 59 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3218,11 +3218,6 @@ struct AMDGPULowerDescriptor : public ConvertOpToLLVMPattern<DescriptorOp> {
32183218

32193219
Location loc = op.getLoc();
32203220

3221-
IntegerType i32 = rewriter.getI32Type();
3222-
[[maybe_unused]] Type v4i32 =
3223-
this->typeConverter->convertType(VectorType::get(4, i32));
3224-
assert(v4i32 && "expected type conversion to succeed");
3225-
32263221
SmallVector<Value> consts;
32273222
for (int64_t i = 0; i < 8; ++i)
32283223
consts.push_back(createI32Constant(rewriter, loc, i));
@@ -3237,6 +3232,32 @@ struct AMDGPULowerDescriptor : public ConvertOpToLLVMPattern<DescriptorOp> {
32373232
}
32383233
};
32393234

3235+
/// Conversion pattern that replaces `SourceOp` (an AMDGPU tensor load/store
/// op) with `TargetOp`.
///
/// The descriptor operand is read through the one-to-N adaptor as a range of
/// values; its first four values are forwarded to `TargetOp` together with a
/// cache policy of 0 and null alias-scope / noalias-scope / TBAA attributes.
/// The pattern emits an op error when `chipset` is below `kGfx1250`.
template <typename SourceOp, typename TargetOp>
3236+
struct AMDGPUTensorLoadStoreOpLowering
3237+
: public ConvertOpToLLVMPattern<SourceOp> {
3238+
using ConvertOpToLLVMPattern<SourceOp>::ConvertOpToLLVMPattern;
3239+
// The one-to-N adaptor exposes each original operand as a range of
// converted values rather than a single value.
using Adaptor = typename ConvertOpToLLVMPattern<SourceOp>::OneToNOpAdaptor;
3240+
/// Builds the pattern for the given type converter and records the target
/// chipset used by the support check in matchAndRewrite.
AMDGPUTensorLoadStoreOpLowering(const LLVMTypeConverter &converter,
3241+
Chipset chipset)
3242+
: ConvertOpToLLVMPattern<SourceOp>(converter), chipset(chipset) {}
3243+
/// Chipset the lowering targets.
Chipset chipset;
3244+
3245+
/// Replaces `op` with `TargetOp`, or reports an op error on chipsets below
/// kGfx1250.
LogicalResult
3246+
matchAndRewrite(SourceOp op, Adaptor adaptor,
3247+
ConversionPatternRewriter &rewriter) const override {
3248+
// NOTE(review): the guard accepts every chipset >= kGfx1250, while the
// message names gfx1250 only -- confirm the wording is intended.
if (chipset < kGfx1250)
3249+
return op->emitOpError("is only supported on gfx1250");
3250+
3251+
// Assumes the descriptor was converted to at least four values (the
// TDMDescriptorType conversion in this file appends exactly four types);
// desc[0..3] are indexed without a size check.
ValueRange desc = adaptor.getDesc();
3252+
rewriter.replaceOpWithNewOp<TargetOp>(op, desc[0], desc[1], desc[2],
3253+
desc[3], /*cachePolicy=*/0,
3254+
/*alias_scopes=*/nullptr,
3255+
/*noalias_scopes=*/nullptr,
3256+
/*tbaa=*/nullptr);
3257+
return success();
3258+
}
3259+
};
3260+
32403261
struct ConvertAMDGPUToROCDLPass
32413262
: public impl::ConvertAMDGPUToROCDLPassBase<ConvertAMDGPUToROCDLPass> {
32423263
using Base::Base;
@@ -3306,6 +3327,33 @@ void mlir::populateAMDGPUTypeAndAttributeConversions(
33063327
Type i32 = IntegerType::get(type.getContext(), 32);
33073328
return typeConverter.convertType(VectorType::get(4, i32));
33083329
});
3330+
// One-to-many conversion for TDMDescriptorType: the descriptor is expanded
// into four result types, in order: the converted forms of vector<4xi32>,
// vector<8xi32>, vector<4xi32> and vector<4xi32>.
typeConverter.addConversion(
3331+
[&](TDMDescriptorType type,
3332+
SmallVectorImpl<Type> &result) -> std::optional<LogicalResult> {
3333+
Type i32 = IntegerType::get(type.getContext(), 32);
3334+
// Run the vector types through the same converter so the results are in
// whatever form the converter produces for them.
Type v4i32 = typeConverter.convertType(VectorType::get(4, i32));
3335+
Type v8i32 = typeConverter.convertType(VectorType::get(8, i32));
3336+
llvm::append_values(result, v4i32, v8i32, v4i32, v4i32);
3337+
return success();
3338+
});
3339+
3340+
// Target materialization callback: given the requested result `types` and
// the `inputs` to convert, returns the results of a newly created
// unrealized_conversion_cast, or an empty vector when it does not apply.
auto addUnrealizedCast = [](OpBuilder &builder, TypeRange types,
3341+
ValueRange inputs,
3342+
Location loc) -> SmallVector<Value> {
3343+
// Only create an unrealized_conversion_cast when the single input is a
3344+
// TDMDescriptorType. Any other input yields an empty result and is expected
3345+
// to be materialized by other target materialization functions.
3346+
if (inputs.size() != 1)
3347+
return {};
3348+
3349+
if (!isa<TDMDescriptorType>(inputs[0].getType()))
3350+
return {};
3351+
3352+
auto cast = UnrealizedConversionCastOp::create(builder, loc, types, inputs);
3353+
return cast.getResults();
3354+
};
3355+
3356+
// Register the callback on the type converter as a target materialization.
typeConverter.addTargetMaterialization(addUnrealizedCast);
33093357
}
33103358

33113359
void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
@@ -3336,7 +3384,11 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
33363384
AMDGPUMakeDmaBaseLowering<MakeDmaBaseOp>,
33373385
AMDGPUMakeDmaBaseLowering<MakeGatherDmaBaseOp>,
33383386
AMDGPULowerDescriptor<MakeDmaDescriptorOp>,
3339-
AMDGPULowerDescriptor<MakeGatherDmaDescriptorOp>>(converter,
3340-
chipset);
3387+
AMDGPULowerDescriptor<MakeGatherDmaDescriptorOp>,
3388+
AMDGPUTensorLoadStoreOpLowering<TensorLoadToLDSOp,
3389+
ROCDL::TensorLoadToLDSOp>,
3390+
AMDGPUTensorLoadStoreOpLowering<TensorStoreFromLDSOp,
3391+
ROCDL::TensorStoreFromLDSOp>>(
3392+
converter, chipset);
33413393
patterns.add<AMDGPUSwizzleBitModeLowering>(converter);
33423394
}

mlir/test/Conversion/AMDGPUToROCDL/gfx1250.mlir

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -773,6 +773,24 @@ func.func @make_dma_descriptor_workgroup_mask(%base: !amdgpu.tdm_base<i32>, %wg_
773773
func.return %descriptor : !amdgpu.tdm_descriptor
774774
}
775775

776+
// Verifies that amdgpu.tensor_load_to_lds is rewritten to
// rocdl.tensor.load.to.lds, with the descriptor argument split into four
// values by a builtin.unrealized_conversion_cast and a cache policy of 0.
// CHECK-LABEL: func @tensor_load_to_lds
777+
// CHECK-SAME: (%[[DESC:.+]]: !amdgpu.tdm_descriptor)
778+
func.func @tensor_load_to_lds(%desc: !amdgpu.tdm_descriptor) {
779+
// CHECK: %[[DGROUPS:.+]]:4 = builtin.unrealized_conversion_cast %[[DESC]]
780+
// CHECK: rocdl.tensor.load.to.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 0 : vector<4xi32>, vector<8xi32>
781+
amdgpu.tensor_load_to_lds %desc : !amdgpu.tdm_descriptor
782+
func.return
783+
}
784+
785+
// Verifies that amdgpu.tensor_store_from_lds is rewritten to
// rocdl.tensor.store.from.lds, with the descriptor argument split into four
// values by a builtin.unrealized_conversion_cast and a cache policy of 0.
// CHECK-LABEL: func @tensor_store_from_lds
786+
// CHECK-SAME: (%[[DESC:.+]]: !amdgpu.tdm_descriptor)
787+
func.func @tensor_store_from_lds(%desc: !amdgpu.tdm_descriptor) {
788+
// CHECK: %[[DGROUPS:.+]]:4 = builtin.unrealized_conversion_cast %[[DESC]]
789+
// CHECK: rocdl.tensor.store.from.lds %[[DGROUPS]]#0, %[[DGROUPS]]#1, %[[DGROUPS]]#2, %[[DGROUPS]]#3 cachepolicy 0 : vector<4xi32>, vector<8xi32>
790+
amdgpu.tensor_store_from_lds %desc : !amdgpu.tdm_descriptor
791+
func.return
792+
}
793+
776794
// -----
777795

778796
// CHECK-LABEL: func @make_gather_dma_descriptor

0 commit comments

Comments
 (0)