diff --git a/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td b/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td
index 5255286619e3b..1607304803942 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td
@@ -33,12 +33,21 @@ def GPUTargetAttrInterface : AttrInterface<"TargetAttrInterface"> {
     If serialization fails then the method should return `std::nullopt`.

-    The `module` argument must be a GPU Module Op. The `options` argument is
-    meant to be used for passing additional options that are not in the
+    The `module` parameter must be a GPU Module Op. The `options` parameter
+    is meant to be used for passing additional options that are not in the
     attribute.
   }], "std::optional<SmallVector<char, 0>>", "serializeToObject",
-    (ins "Operation*":$module, "const gpu::TargetOptions&":$options)>
+    (ins "Operation*":$module, "const gpu::TargetOptions&":$options)>,
+  InterfaceMethod<[{
+    Creates a GPU object attribute from a binary string.
+
+    The `object` parameter is a binary string. The `options` parameter is
+    meant to be used for passing additional options that are not in the
+    attribute.
+  }], "Attribute", "createObject",
+    (ins "const SmallVector<char, 0>&":$object,
+         "const gpu::TargetOptions&":$options)>
  ];
}
diff --git a/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td b/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td
index 9c1110d8e9a94..6659f4a2c58e8 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td
@@ -20,20 +20,62 @@ include "mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td"
 // GPU object attribute.
 //===----------------------------------------------------------------------===//

+// For documentation on these enum cases, see the `GPU_ObjectAttr` docs.
+def GPU_ObjectOffload : I32EnumAttrCase<"Offload", 1, "offload">;
+def GPU_ObjectISA : I32EnumAttrCase<"Assembly", 2, "assembly">;
+def GPU_ObjectBinary : I32EnumAttrCase<"Binary", 3, "bin">;
+def GPU_ObjectFatbin : I32EnumAttrCase<"Fatbin", 4, "fatbin">;
+def GPU_CompilationTargetEnum : GPU_I32Enum<
+  "CompilationTarget", "GPU compilation format", [
+    GPU_ObjectOffload,
+    GPU_ObjectISA,
+    GPU_ObjectBinary,
+    GPU_ObjectFatbin
+  ]>;
+
 def GPU_ObjectAttr : GPU_Attr<"Object", "object"> {
   let description = [{
-    A GPU object attribute pairs a GPU target with a binary string,
-    encapsulating the information of how the object was generated with the
-    object itself.
+    A GPU object attribute glues together a GPU target, the object format, a
+    binary string with the object, and the object properties, encapsulating
+    how the object was generated together with the object itself.
+
+    There are four object formats:
+    1. `Offload`: represents generic objects not described by the other three
+    formats, and its meaning is target-dependent. For example, on the NVPTX
+    and AMDGPU targets, this format is associated with LLVM bitcode.
+    2. `Assembly`: represents GPU assembly code. For example, in the NVPTX
+    target, assembly is PTX code, which can be JITted at runtime.
+    3. `Binary`: represents executable code for a single GPU architecture.
+    For example, PTX code compiled for a specific compute capability. Note
+    that loading this format is likely to produce an error if there is an
+    architecture mismatch between the compiled and running architectures.
+    4. `Fatbin`: represents a GPU fat binary with executable code for
+    multiple architectures.
+    This format is the default; thus, it gets elided in assembly code.

-    The target attribute must implement the `TargetAttrInterface` interface.
+    Object properties are specified through the `properties` dictionary
+    attribute and can be used to define additional information.
+    The target attribute must implement or promise the `TargetAttrInterface`
+    interface.

     ```
-      #gpu.object<#nvvm.target, "...">
+      #gpu.object<#rocdl.target, offload = "...">  // An offload object.
+      #gpu.object<#nvvm.target, properties = {O = 3 : i32}, assembly = "...">  // An assembly object with additional properties.
+      #gpu.object<#rocdl.target, bin = "...">  // A binary object.
+      #gpu.object<#nvvm.target, "...">  // A fatbin object.
     ```
   }];
-  let parameters = (ins "Attribute":$target, "StringAttr":$object);
-  let assemblyFormat = [{`<` $target `,` $object `>`}];
+  let parameters = (ins
+    "Attribute":$target,
+    DefaultValuedParameter<"CompilationTarget", "CompilationTarget::Fatbin">:$format,
+    "StringAttr":$object,
+    OptionalParameter<"DictionaryAttr">:$properties
+  );
+  let assemblyFormat = [{ `<`
+      $target `,` (`properties` `=` $properties^ `,`)?
+      custom<Object>($format, $object)
+    `>`
+  }];
   let genVerifyDecl = 1;
 }
diff --git a/mlir/include/mlir/Dialect/GPU/IR/CompilationInterfaces.h b/mlir/include/mlir/Dialect/GPU/IR/CompilationInterfaces.h
index a1f64be57fa69..6d7cb5ca7a7f8 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/CompilationInterfaces.h
+++ b/mlir/include/mlir/Dialect/GPU/IR/CompilationInterfaces.h
@@ -25,6 +25,8 @@ namespace LLVM {
 class ModuleTranslation;
 }
 namespace gpu {
+enum class CompilationTarget : uint32_t;
+
 /// This class indicates that the attribute associated with this trait is a GPU
 /// offloading translation attribute. These kinds of attributes must implement
 /// an interface for handling the translation of GPU offloading operations like
@@ -42,27 +44,15 @@ class OffloadingTranslationAttrTrait
 /// ensure type safeness. Targets are free to ignore these options.
 class TargetOptions {
 public:
-  /// The target representation of the compilation process.
-  typedef enum {
-    offload = 1,  /// The process should produce an offloading representation.
-                  /// For the NVVM & ROCDL targets this option produces LLVM IR.
-    assembly = 2, /// The process should produce assembly code.
-    binary = 4,   /// The process should produce a binary.
-    fatbinary = 8, /// The process should produce a fat binary.
-    binOrFatbin =
-        binary |
-        fatbinary, /// The process should produce a binary or fatbinary. It's up
-                   /// to the target to decide which.
-  } CompilationTarget;
-
   /// Constructor initializing the toolkit path, the list of files to link to,
   /// extra command line options, the compilation target and a callback for
   /// obtaining the parent symbol table. The default compilation target is
-  /// `binOrFatbin`.
-  TargetOptions(StringRef toolkitPath = {},
-                ArrayRef<std::string> linkFiles = {}, StringRef cmdOptions = {},
-                CompilationTarget compilationTarget = binOrFatbin,
-                function_ref<SymbolTable *()> getSymbolTableCallback = {});
+  /// `Fatbin`.
+  TargetOptions(
+      StringRef toolkitPath = {}, ArrayRef<std::string> linkFiles = {},
+      StringRef cmdOptions = {},
+      CompilationTarget compilationTarget = getDefaultCompilationTarget(),
+      function_ref<SymbolTable *()> getSymbolTableCallback = {});

   /// Returns the typeID.
   TypeID getTypeID() const;
@@ -90,13 +80,17 @@ class TargetOptions {
   /// table.
   SymbolTable *getSymbolTable() const;

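An aside on the hunk above: the removed `typedef enum` encoded each format as a combinable bit flag (`binOrFatbin = binary | fatbinary`), whereas the new scoped `CompilationTarget` is a plain ordered enumeration. A minimal standalone sketch of how client checks change, assuming only the case values from the tablegen definition above (illustrative, not part of the patch):

```
#include <cstdint>

// Mirrors GPU_CompilationTargetEnum: one ordered case per format.
enum class CompilationTarget : uint32_t { Offload = 1, Assembly, Binary, Fatbin };

// Old bitmask style: membership had to be tested with bitwise AND.
bool wantedFatbinOld(int target) {
  constexpr int binary = 4, fatbinary = 8;
  return (target & fatbinary) == fatbinary;
}

// New style: a format is exactly one case, so equality (or ordering, as the
// ROCDL createObject implementation below uses) suffices.
bool wantsFatbinNew(CompilationTarget format) {
  return format == CompilationTarget::Fatbin;
}
```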
+  /// Returns the default compilation target: `CompilationTarget::Fatbin`.
+  static CompilationTarget getDefaultCompilationTarget();
+
 protected:
   /// Derived classes must use this constructor to initialize `typeID` to the
   /// appropiate value: ie. `TargetOptions(TypeID::get<DerivedClass>())`.
-  TargetOptions(TypeID typeID, StringRef toolkitPath = {},
-                ArrayRef<std::string> linkFiles = {}, StringRef cmdOptions = {},
-                CompilationTarget compilationTarget = binOrFatbin,
-                function_ref<SymbolTable *()> getSymbolTableCallback = {});
+  TargetOptions(
+      TypeID typeID, StringRef toolkitPath = {},
+      ArrayRef<std::string> linkFiles = {}, StringRef cmdOptions = {},
+      CompilationTarget compilationTarget = getDefaultCompilationTarget(),
+      function_ref<SymbolTable *()> getSymbolTableCallback = {});

   /// Path to the target toolkit.
   std::string toolkitPath;
@@ -108,7 +102,7 @@ class TargetOptions {
   /// process.
   std::string cmdOptions;

-  /// Compilation process target representation.
+  /// Compilation process target format.
   CompilationTarget compilationTarget;

   /// Callback for obtaining the parent symbol table of all the GPU modules
diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
index 0bfb275099205..3de8e18851369 100644
--- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
@@ -68,7 +68,6 @@ def GpuModuleToBinaryPass
     2. `assembly`, `isa`: produces assembly code.
     3. `binary`, `bin`: produces binaries.
     4. `fatbinary`, `fatbin`: produces fatbinaries.
-    5. `binOrFatbin`: produces bins or fatbins, the target decides which.
   }];
   let options = [
     Option<"offloadingHandler", "handler", "Attribute", "nullptr",
            "Offloading handler to be attached to the resulting binary op.">,
     Option<"toolkitPath", "toolkit", "std::string", [{""}],
            "Toolkit path.">,
     ListOption<"linkFiles", "l", "std::string",
                "Extra files to link to.">,
     Option<"cmdOptions", "opts", "std::string", [{""}],
            "Command line options to pass to the tools.">,
-    Option<"compilationTarget", "format", "std::string", [{"binOrFatbin"}],
+    Option<"compilationTarget", "format", "std::string", [{"fatbin"}],
            "The target representation of the compilation process.">
   ];
 }
diff --git a/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h b/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h
index 5deab8321cbcb..b07ab8b2a6034 100644
--- a/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h
@@ -144,6 +144,23 @@ struct SparseCompilerOptions
                                          desc("GPU target architecture")};
   PassOptions::Option<std::string> gpuFeatures{*this, "gpu-features",
                                                desc("GPU target features")};
+  /// For NVIDIA GPUs there are 3 compilation format options:
+  /// 1. `isa`: the compiler generates PTX and the driver JITs the PTX.
+  /// 2. `bin`: generates a CUBIN object for `chip=gpuChip`.
+  /// 3. `fatbin`: generates a fat binary with a CUBIN object for `gpuChip`
+  /// and also embeds the PTX in the fat binary.
+  /// Notes:
+  /// Option 1 adds a significant runtime performance hit; however, tests are
+  /// more likely to pass with this option.
+  /// Option 2 is better for execution time as there is no JIT; however, the
+  /// program will fail if there's an architecture mismatch between `gpuChip`
+  /// and the GPU running the program.
+  /// Option 3 is the best compromise between options 1 and 2, as it can JIT
+  /// in case of an architecture mismatch between `gpuChip` and the running
+  /// architecture. However, it's only possible to JIT to a higher compute
+  /// capability (CC) than `gpuChip`.
+  PassOptions::Option<std::string> gpuFormat{
+      *this, "gpu-format", desc("GPU compilation format"), init("fatbin")};

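For pipeline authors, the new `gpu-format` option is just a string that ends up in the generated pass options; a hedged sketch of how a custom pipeline could forward it, modeled on the `SparseTensorPipelines` hunk further down (`addGpuSerialization` is an illustrative helper, not part of the patch):

```
#include <string>

#include "mlir/Dialect/GPU/Transforms/Passes.h"
#include "mlir/Pass/PassManager.h"

// Forward a pipeline-level format string ("llvm", "isa", "bin", or "fatbin")
// into the gpu-module-to-binary pass.
static void addGpuSerialization(mlir::OpPassManager &pm,
                                const std::string &format) {
  mlir::GpuModuleToBinaryPassOptions opts;
  opts.compilationTarget = format;
  pm.addPass(mlir::createGpuModuleToBinaryPass(opts));
}
```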
  /// This option is used to enable GPU library generation.
  PassOptions::Option<bool> enableGPULibgen{
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index fde379cd0afe1..5eb2cadc884e1 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -1959,7 +1959,8 @@ void AllocOp::getCanonicalizationPatterns(RewritePatternSet &results,
 //===----------------------------------------------------------------------===//

 LogicalResult ObjectAttr::verify(function_ref<InFlightDiagnostic()> emitError,
-                                 Attribute target, StringAttr object) {
+                                 Attribute target, CompilationTarget format,
+                                 StringAttr object, DictionaryAttr properties) {
   if (!target)
     return emitError() << "the target attribute cannot be null";
   if (target.hasPromiseOrImplementsInterface<TargetAttrInterface>())
     return success();
@@ -1968,6 +1969,40 @@ LogicalResult ObjectAttr::verify(function_ref<InFlightDiagnostic()> emitError,
   return emitError() << "the target attribute must implement or promise the "
                         "`gpu::TargetAttrInterface`";
 }

+namespace {
+LogicalResult parseObject(AsmParser &odsParser, CompilationTarget &format,
+                          StringAttr &object) {
+  std::optional<CompilationTarget> formatResult;
+  StringRef enumKeyword;
+  auto loc = odsParser.getCurrentLocation();
+  if (failed(odsParser.parseOptionalKeyword(&enumKeyword)))
+    formatResult = CompilationTarget::Fatbin;
+  if (!formatResult &&
+      (formatResult =
+           gpu::symbolizeEnum<gpu::CompilationTarget>(enumKeyword)) &&
+      odsParser.parseEqual())
+    return odsParser.emitError(loc, "expected an equal sign");
+  if (!formatResult)
+    return odsParser.emitError(loc, "expected keyword for GPU object format");
+  FailureOr<StringAttr> objectResult =
+      FieldParser<StringAttr>::parse(odsParser);
+  if (failed(objectResult))
+    return odsParser.emitError(odsParser.getCurrentLocation(),
+                               "failed to parse GPU_ObjectAttr parameter "
+                               "'object' which is to be a `StringAttr`");
+  format = *formatResult;
+  object = *objectResult;
+  return success();
+}
+
+void printObject(AsmPrinter &odsParser, CompilationTarget format,
+                 StringAttr object) {
+  if (format != CompilationTarget::Fatbin)
+    odsParser << stringifyEnum(format) << " = ";
+  odsParser << object;
+}
+} // namespace
+
 //===----------------------------------------------------------------------===//
 // GPU select object attribute
 //===----------------------------------------------------------------------===//
@@ -2020,6 +2055,14 @@ SymbolTable *TargetOptions::getSymbolTable() const {
   return getSymbolTableCallback ?
 getSymbolTableCallback() : nullptr;
 }

+CompilationTarget TargetOptions::getCompilationTarget() const {
+  return compilationTarget;
+}
+
+CompilationTarget TargetOptions::getDefaultCompilationTarget() {
+  return CompilationTarget::Fatbin;
+}
+
 std::pair<llvm::BumpPtrAllocator, SmallVector<const char *>>
 TargetOptions::tokenizeCmdOptions() const {
   std::pair<llvm::BumpPtrAllocator, SmallVector<const char *>> options;
@@ -2043,10 +2086,6 @@ TargetOptions::tokenizeCmdOptions() const {
   return options;
 }

-TargetOptions::CompilationTarget TargetOptions::getCompilationTarget() const {
-  return compilationTarget;
-}
-
 MLIR_DEFINE_EXPLICIT_TYPE_ID(::mlir::gpu::TargetOptions)

 #include "mlir/Dialect/GPU/IR/GPUOpInterfaces.cpp.inc"
diff --git a/mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp b/mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp
index e29a1f0c3248d..2bf89f8c57903 100644
--- a/mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp
@@ -57,14 +57,14 @@ void GpuModuleToBinaryPass::getDependentDialects(
 void GpuModuleToBinaryPass::runOnOperation() {
   RewritePatternSet patterns(&getContext());
-  int targetFormat = llvm::StringSwitch<int>(compilationTarget)
-                         .Cases("offloading", "llvm", TargetOptions::offload)
-                         .Cases("assembly", "isa", TargetOptions::assembly)
-                         .Cases("binary", "bin", TargetOptions::binary)
-                         .Cases("fatbinary", "fatbin", TargetOptions::fatbinary)
-                         .Case("binOrFatbin", TargetOptions::binOrFatbin)
-                         .Default(-1);
-  if (targetFormat == -1)
+  auto targetFormat =
+      llvm::StringSwitch<std::optional<CompilationTarget>>(compilationTarget)
+          .Cases("offloading", "llvm", CompilationTarget::Offload)
+          .Cases("assembly", "isa", CompilationTarget::Assembly)
+          .Cases("binary", "bin", CompilationTarget::Binary)
+          .Cases("fatbinary", "fatbin", CompilationTarget::Fatbin)
+          .Default(std::nullopt);
+  if (!targetFormat)
     getOperation()->emitError() << "Invalid format specified.";

   // Lazy symbol table builder callback.
@@ -82,10 +82,8 @@ void GpuModuleToBinaryPass::runOnOperation() {
     return &parentTable.value();
   };

-  TargetOptions targetOptions(
-      toolkitPath, linkFiles, cmdOptions,
-      static_cast<TargetOptions::CompilationTarget>(targetFormat),
-      lazyTableBuilder);
+  TargetOptions targetOptions(toolkitPath, linkFiles, cmdOptions, *targetFormat,
+                              lazyTableBuilder);
   if (failed(transformGpuModulesToBinaries(
           getOperation(),
           offloadingHandler ?
 dyn_cast<OffloadingLLVMTranslationAttrInterface>(
@@ -107,17 +105,19 @@ LogicalResult moduleSerializer(GPUModuleOp op,
     auto target = dyn_cast<gpu::TargetAttrInterface>(targetAttr);
     assert(target &&
            "Target attribute doesn't implements `TargetAttrInterface`.");
-    std::optional<SmallVector<char, 0>> object =
+    std::optional<SmallVector<char, 0>> serializedModule =
         target.serializeToObject(op, targetOptions);
-
-    if (!object) {
+    if (!serializedModule) {
       op.emitError("An error happened while serializing the module.");
       return failure();
     }

-    objects.push_back(builder.getAttr<ObjectAttr>(
-        target,
-        builder.getStringAttr(StringRef(object->data(), object->size()))));
+    Attribute object = target.createObject(*serializedModule, targetOptions);
+    if (!object) {
+      op.emitError("An error happened while creating the object.");
+      return failure();
+    }
+    objects.push_back(object);
   }
   builder.setInsertionPointAfter(op);
   builder.create<gpu::BinaryOp>(op.getLoc(), op.getName(), handler,
diff --git a/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp b/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
index 37f9e09d34c04..54069064839ea 100644
--- a/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp
@@ -84,7 +84,9 @@ void mlir::sparse_tensor::buildSparseCompiler(
     nvvmTargetOptions.features = options.gpuFeatures;
     pm.addPass(createGpuNVVMAttachTarget(nvvmTargetOptions));
     pm.addPass(createGpuToLLVMConversionPass());
-    pm.addPass(createGpuModuleToBinaryPass());
+    GpuModuleToBinaryPassOptions gpuModuleToBinaryPassOptions;
+    gpuModuleToBinaryPassOptions.compilationTarget = options.gpuFormat;
+    pm.addPass(createGpuModuleToBinaryPass(gpuModuleToBinaryPassOptions));
   }

   pm.addPass(createReconcileUnrealizedCastsPass());
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
index 7bf6804902479..d19d473a53276 100644
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -126,6 +126,27 @@ extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule mgpuModuleLoad(void *data) {
   return module;
 }

+extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule mgpuModuleLoadJIT(void *data,
+                                                                int optLevel) {
+  ScopedContext scopedContext;
+  CUmodule module = nullptr;
+  char jitErrorBuffer[4096] = {0};
+  CUjit_option jitOptions[] = {CU_JIT_ERROR_LOG_BUFFER,
+                               CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
+                               CU_JIT_OPTIMIZATION_LEVEL};
+  void *jitOptionsVals[] = {jitErrorBuffer,
+                            reinterpret_cast<void *>(sizeof(jitErrorBuffer)),
+                            reinterpret_cast<void *>(optLevel)};
+
+  CUresult result =
+      cuModuleLoadDataEx(&module, data, 3, jitOptions, jitOptionsVals);
+  if (result) {
+    fprintf(stderr, "JIT compilation failed with: '%s'\n", jitErrorBuffer);
+    CUDA_REPORT_IF_ERROR(result);
+  }
+  return module;
+}
+
 extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuModuleUnload(CUmodule module) {
   CUDA_REPORT_IF_ERROR(cuModuleUnload(module));
 }
diff --git a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
index bd3868a8e196f..da2ae87fef671 100644
--- a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
@@ -38,6 +38,11 @@ extern "C" hipModule_t mgpuModuleLoad(void *data) {
   return module;
 }

+extern "C" hipModule_t mgpuModuleLoadJIT(void *data, int optLevel) {
+  assert(false && "This function is not available in HIP.");
+  return nullptr;
+}
+
 extern "C" void mgpuModuleUnload(hipModule_t module) {
   HIP_REPORT_IF_ERROR(hipModuleUnload(module));
 }
diff --git a/mlir/lib/Target/LLVM/NVVM/Target.cpp
b/mlir/lib/Target/LLVM/NVVM/Target.cpp
index 13188b1107d92..7f263627db54f 100644
--- a/mlir/lib/Target/LLVM/NVVM/Target.cpp
+++ b/mlir/lib/Target/LLVM/NVVM/Target.cpp
@@ -47,6 +47,10 @@ class NVVMTargetAttrImpl
   std::optional<SmallVector<char, 0>>
   serializeToObject(Attribute attribute, Operation *module,
                     const gpu::TargetOptions &options) const;
+
+  Attribute createObject(Attribute attribute,
+                         const SmallVector<char, 0> &object,
+                         const gpu::TargetOptions &options) const;
 };
 } // namespace

@@ -227,9 +231,9 @@ std::optional<std::string> NVPTXSerializer::findTool(StringRef tool) {
   }

   // 2. Check PATH.
-  if (std::optional<std::string> ptxasCompiler =
+  if (std::optional<std::string> toolPath =
           llvm::sys::Process::FindInEnvPath("PATH", tool))
-    return *ptxasCompiler;
+    return *toolPath;

   // 3. Check `getCUDAToolkitPath()`.
   pathRef = getCUDAToolkitPath();
@@ -255,8 +259,7 @@ NVPTXSerializer::compileToBinary(const std::string &ptxCode) {
   // Determine if the serializer should create a fatbinary with the PTX embeded
   // or a simple CUBIN binary.
   const bool createFatbin =
-      (targetOptions.getCompilationTarget() & gpu::TargetOptions::fatbinary) ==
-      gpu::TargetOptions::fatbinary;
+      targetOptions.getCompilationTarget() == gpu::CompilationTarget::Fatbin;

   // Find the `ptxas` & `fatbinary` tools.
   std::optional<std::string> ptxasCompiler = findTool("ptxas");
@@ -522,7 +525,7 @@ NVPTXSerializer::moduleToObject(llvm::Module &llvmModule,
     llvm::dbgs().flush();
   });
 #undef DEBUG_TYPE
-  if (targetOptions.getCompilationTarget() == gpu::TargetOptions::offload)
+  if (targetOptions.getCompilationTarget() == gpu::CompilationTarget::Offload)
     return SerializeGPUModuleBase::moduleToObject(llvmModule, targetMachine);

   // Emit PTX code.
@@ -541,8 +544,12 @@ NVPTXSerializer::moduleToObject(llvm::Module &llvmModule,
 #undef DEBUG_TYPE

   // Return PTX if the compilation target is assembly.
-  if (targetOptions.getCompilationTarget() == gpu::TargetOptions::assembly)
-    return SmallVector<char, 0>(serializedISA->begin(), serializedISA->end());
+  if (targetOptions.getCompilationTarget() ==
+      gpu::CompilationTarget::Assembly) {
+    // Make sure to include the null terminator.
+    StringRef bin(serializedISA->c_str(), serializedISA->size() + 1);
+    return SmallVector<char, 0>(bin.begin(), bin.end());
+  }

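The `size() + 1` in the assembly branch above is deliberate: CUBIN and fatbin images carry their size in their headers, but the CUDA driver parses a PTX image as NUL-terminated text, so the terminator has to survive the round trip through the object attribute to `mgpuModuleLoadJIT`. A small standalone illustration (`packPtx` is an assumed helper, not part of the patch):

```
#include <string>
#include <vector>

// Copy PTX including the trailing '\0' that std::string::c_str() guarantees,
// so the buffer can later be handed directly to cuModuleLoadDataEx.
std::vector<char> packPtx(const std::string &ptx) {
  return std::vector<char>(ptx.c_str(), ptx.c_str() + ptx.size() + 1);
}
```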
   // Compile to binary.
 #if MLIR_NVPTXCOMPILER_ENABLED == 1
@@ -573,3 +580,20 @@ NVVMTargetAttrImpl::serializeToObject(Attribute attribute, Operation *module,
   return std::nullopt;
 #endif // MLIR_CUDA_CONVERSIONS_ENABLED == 1
 }
+
+Attribute
+NVVMTargetAttrImpl::createObject(Attribute attribute,
+                                 const SmallVector<char, 0> &object,
+                                 const gpu::TargetOptions &options) const {
+  auto target = cast<NVVMTargetAttr>(attribute);
+  gpu::CompilationTarget format = options.getCompilationTarget();
+  DictionaryAttr objectProps;
+  Builder builder(attribute.getContext());
+  if (format == gpu::CompilationTarget::Assembly)
+    objectProps = builder.getDictionaryAttr(
+        {builder.getNamedAttr("O", builder.getI32IntegerAttr(target.getO()))});
+  return builder.getAttr<gpu::ObjectAttr>(
+      attribute, format,
+      builder.getStringAttr(StringRef(object.data(), object.size())),
+      objectProps);
+}
diff --git a/mlir/lib/Target/LLVM/ROCDL/Target.cpp b/mlir/lib/Target/LLVM/ROCDL/Target.cpp
index effb88554e8ee..611d08fe3e79e 100644
--- a/mlir/lib/Target/LLVM/ROCDL/Target.cpp
+++ b/mlir/lib/Target/LLVM/ROCDL/Target.cpp
@@ -60,6 +60,10 @@ class ROCDLTargetAttrImpl
   std::optional<SmallVector<char, 0>>
   serializeToObject(Attribute attribute, Operation *module,
                     const gpu::TargetOptions &options) const;
+
+  Attribute createObject(Attribute attribute,
+                         const SmallVector<char, 0> &object,
+                         const gpu::TargetOptions &options) const;
 };
 } // namespace

@@ -417,7 +421,7 @@ AMDGPUSerializer::moduleToObject(llvm::Module &llvmModule,
                  << llvmModule << "\n";
   });
 #undef DEBUG_TYPE
-  if (targetOptions.getCompilationTarget() == gpu::TargetOptions::offload)
+  if (targetOptions.getCompilationTarget() == gpu::CompilationTarget::Offload)
     return SerializeGPUModuleBase::moduleToObject(llvmModule, targetMachine);

   // Translate the Module to ISA.
@@ -434,7 +438,7 @@ AMDGPUSerializer::moduleToObject(llvm::Module &llvmModule,
   });
 #undef DEBUG_TYPE
   // Return ISA assembly code if the compilation target is assembly.
-  if (targetOptions.getCompilationTarget() == gpu::TargetOptions::assembly)
+  if (targetOptions.getCompilationTarget() == gpu::CompilationTarget::Assembly)
     return SmallVector<char, 0>(serializedISA->begin(), serializedISA->end());

   // Compile to binary.
@@ -463,3 +467,16 @@ std::optional<SmallVector<char, 0>> ROCDLTargetAttrImpl::serializeToObject(
   return std::nullopt;
 #endif // MLIR_ROCM_CONVERSIONS_ENABLED == 1
 }
+
+Attribute
+ROCDLTargetAttrImpl::createObject(Attribute attribute,
+                                  const SmallVector<char, 0> &object,
+                                  const gpu::TargetOptions &options) const {
+  gpu::CompilationTarget format = options.getCompilationTarget();
+  Builder builder(attribute.getContext());
+  return builder.getAttr<gpu::ObjectAttr>(
+      attribute,
+      format > gpu::CompilationTarget::Binary ? gpu::CompilationTarget::Binary
+                                              : format,
+      builder.getStringAttr(StringRef(object.data(), object.size())), nullptr);
+}
diff --git a/mlir/lib/Target/LLVMIR/Dialect/GPU/SelectObjectAttr.cpp b/mlir/lib/Target/LLVMIR/Dialect/GPU/SelectObjectAttr.cpp
index 3b060ac1779db..47fe6973778cd 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/GPU/SelectObjectAttr.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/GPU/SelectObjectAttr.cpp
@@ -44,6 +44,9 @@ class SelectObjectAttrImpl
                             Operation *binaryOperation,
                             llvm::IRBuilderBase &builder,
                             LLVM::ModuleTranslation &moduleTranslation) const;
+
+  // Returns the selected object for embedding.
+  gpu::ObjectAttr getSelectedObject(gpu::BinaryOp op) const;
 };

 // Returns an identifier for the global string holding the binary.
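Taken together, the two `createObject` implementations above show the contract: wrap the serialized bytes in a `StringAttr`, pick the format, and optionally attach properties. A hedged sketch of what a third-party target could do (`MyTargetAttrImpl` is hypothetical; the usual MLIR GPU dialect headers and `using namespace mlir;` are assumed):

```
// Hypothetical external model, following the NVVM/ROCDL pattern above.
Attribute
MyTargetAttrImpl::createObject(Attribute attribute,
                               const SmallVector<char, 0> &object,
                               const gpu::TargetOptions &options) const {
  Builder builder(attribute.getContext());
  // Forward the requested format verbatim and attach no extra properties.
  return builder.getAttr<gpu::ObjectAttr>(
      attribute, options.getCompilationTarget(),
      builder.getStringAttr(StringRef(object.data(), object.size())),
      /*properties=*/nullptr);
}
```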
std::string getBinaryIdentifier(StringRef binaryName) { @@ -58,24 +61,15 @@ void mlir::gpu::registerOffloadingLLVMTranslationInterfaceExternalModels( }); } -LogicalResult SelectObjectAttrImpl::embedBinary( - Attribute attribute, Operation *operation, llvm::IRBuilderBase &builder, - LLVM::ModuleTranslation &moduleTranslation) const { - assert(operation && "The binary operation must be non null."); - if (!operation) - return failure(); - - auto op = mlir::dyn_cast(operation); - if (!op) { - operation->emitError("Operation must be a GPU binary."); - return failure(); - } - +gpu::ObjectAttr +SelectObjectAttrImpl::getSelectedObject(gpu::BinaryOp op) const { ArrayRef objects = op.getObjectsAttr().getValue(); // Obtain the index of the object to select. int64_t index = -1; - if (Attribute target = cast(attribute).getTarget()) { + if (Attribute target = + cast(op.getOffloadingHandlerAttr()) + .getTarget()) { // If the target attribute is a number it is the index. Otherwise compare // the attribute to every target inside the object array to find the index. if (auto indexAttr = mlir::dyn_cast(target)) { @@ -95,10 +89,28 @@ LogicalResult SelectObjectAttrImpl::embedBinary( } if (index < 0 || index >= static_cast(objects.size())) { - op->emitError("The requested target object couldn't be found."); + op->emitError("the requested target object couldn't be found"); + return nullptr; + } + return mlir::dyn_cast(objects[index]); +} + +LogicalResult SelectObjectAttrImpl::embedBinary( + Attribute attribute, Operation *operation, llvm::IRBuilderBase &builder, + LLVM::ModuleTranslation &moduleTranslation) const { + assert(operation && "The binary operation must be non null."); + if (!operation) + return failure(); + + auto op = mlir::dyn_cast(operation); + if (!op) { + operation->emitError("operation must be a GPU binary"); return failure(); } - auto object = mlir::dyn_cast(objects[index]); + + gpu::ObjectAttr object = getSelectedObject(op); + if (!object) + return failure(); llvm::Module *module = moduleTranslation.getLLVMModule(); @@ -130,6 +142,9 @@ class LaunchKernel { // Get the module load callee. FunctionCallee getModuleLoadFn(); + // Get the module load JIT callee. + FunctionCallee getModuleLoadJITFn(); + // Get the module unload callee. FunctionCallee getModuleUnloadFn(); @@ -149,7 +164,8 @@ class LaunchKernel { Value *createKernelArgArray(mlir::gpu::LaunchFuncOp op); // Create the full kernel launch. 
- mlir::LogicalResult createKernelLaunch(mlir::gpu::LaunchFuncOp op); + mlir::LogicalResult createKernelLaunch(mlir::gpu::LaunchFuncOp op, + mlir::gpu::ObjectAttr object); private: Module &module; @@ -174,13 +190,22 @@ LogicalResult SelectObjectAttrImpl::launchKernel( auto launchFuncOp = mlir::dyn_cast(launchFuncOperation); if (!launchFuncOp) { - launchFuncOperation->emitError("Operation must be a GPU launch func Op."); + launchFuncOperation->emitError("operation must be a GPU launch func Op."); return failure(); } + auto binOp = mlir::dyn_cast(binaryOperation); + if (!binOp) { + binaryOperation->emitError("operation must be a GPU binary."); + return failure(); + } + gpu::ObjectAttr object = getSelectedObject(binOp); + if (!object) + return failure(); + return llvm::LaunchKernel(*moduleTranslation.getLLVMModule(), builder, moduleTranslation) - .createKernelLaunch(launchFuncOp); + .createKernelLaunch(launchFuncOp, object); } llvm::LaunchKernel::LaunchKernel( @@ -215,6 +240,12 @@ llvm::FunctionCallee llvm::LaunchKernel::getModuleLoadFn() { FunctionType::get(ptrTy, ArrayRef({ptrTy}), false)); } +llvm::FunctionCallee llvm::LaunchKernel::getModuleLoadJITFn() { + return module.getOrInsertFunction( + "mgpuModuleLoadJIT", + FunctionType::get(ptrTy, ArrayRef({ptrTy, i32Ty}), false)); +} + llvm::FunctionCallee llvm::LaunchKernel::getModuleUnloadFn() { return module.getOrInsertFunction( "mgpuModuleUnload", @@ -299,7 +330,8 @@ llvm::LaunchKernel::createKernelArgArray(mlir::gpu::LaunchFuncOp op) { // call %streamDestroy(%4) // call %moduleUnload(%1) mlir::LogicalResult -llvm::LaunchKernel::createKernelLaunch(mlir::gpu::LaunchFuncOp op) { +llvm::LaunchKernel::createKernelLaunch(mlir::gpu::LaunchFuncOp op, + mlir::gpu::ObjectAttr object) { auto llvmValue = [&](mlir::Value value) -> Value * { Value *v = moduleTranslation.lookupValue(value); assert(v && "Value has not been translated."); @@ -326,13 +358,29 @@ llvm::LaunchKernel::createKernelLaunch(mlir::gpu::LaunchFuncOp op) { // Create the argument array. Value *argArray = createKernelArgArray(op); + // Default JIT optimization level. + llvm::Constant *optV = llvm::ConstantInt::get(i32Ty, 0); + // Check if there's an optimization level embedded in the object. + DictionaryAttr objectProps = object.getProperties(); + mlir::Attribute optAttr; + if (objectProps && (optAttr = objectProps.get("O"))) { + auto optLevel = dyn_cast(optAttr); + if (!optLevel) + return op.emitError("the optimization level must be an integer"); + optV = llvm::ConstantInt::get(i32Ty, optLevel.getValue()); + } + // Load the kernel module. StringRef moduleName = op.getKernelModuleName().getValue(); std::string binaryIdentifier = getBinaryIdentifier(moduleName); Value *binary = module.getGlobalVariable(binaryIdentifier, true); if (!binary) return op.emitError() << "Couldn't find the binary: " << binaryIdentifier; - Value *moduleObject = builder.CreateCall(getModuleLoadFn(), {binary}); + + Value *moduleObject = + object.getFormat() == gpu::CompilationTarget::Assembly + ? builder.CreateCall(getModuleLoadJITFn(), {binary, optV}) + : builder.CreateCall(getModuleLoadFn(), {binary}); // Load the kernel function. 
Value *moduleFunction = builder.CreateCall( diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt index 66a9cb01106ba..bf143d036c2f6 100644 --- a/mlir/test/CMakeLists.txt +++ b/mlir/test/CMakeLists.txt @@ -26,6 +26,8 @@ if (MLIR_INCLUDE_INTEGRATION_TESTS) "If arch-specific Arm integration tests run emulated, use this Arm native lli.") set(ARM_EMULATOR_UTILS_LIB_DIR "" CACHE STRING "If arch-specific Arm integration tests run emulated, find Arm native utility libraries in this directory.") + set(MLIR_GPU_COMPILATION_TEST_FORMAT "fatbin" CACHE STRING + "The GPU compilation format used by the tests.") option(MLIR_RUN_AMX_TESTS "Run AMX tests.") option(MLIR_RUN_X86VECTOR_TESTS "Run X86Vector tests.") option(MLIR_RUN_CUDA_TENSOR_CORE_TESTS "Run CUDA Tensor core WMMA tests.") diff --git a/mlir/test/Dialect/GPU/module-to-binary-nvvm.mlir b/mlir/test/Dialect/GPU/module-to-binary-nvvm.mlir index 555b28a8293ee..22d7caa38feec 100644 --- a/mlir/test/Dialect/GPU/module-to-binary-nvvm.mlir +++ b/mlir/test/Dialect/GPU/module-to-binary-nvvm.mlir @@ -1,10 +1,10 @@ // REQUIRES: host-supports-nvptx // RUN: mlir-opt %s --gpu-module-to-binary="format=llvm" | FileCheck %s -// RUN: mlir-opt %s --gpu-module-to-binary="format=isa" | FileCheck %s +// RUN: mlir-opt %s --gpu-module-to-binary="format=isa" | FileCheck %s -check-prefix=CHECK-ISA module attributes {gpu.container_module} { // CHECK-LABEL:gpu.binary @kernel_module1 - // CHECK:[#gpu.object<#nvvm.target, "{{.*}}">] + // CHECK:[#gpu.object<#nvvm.target, offload = "{{.*}}">] gpu.module @kernel_module1 [#nvvm.target] { llvm.func @kernel(%arg0: i32, %arg1: !llvm.ptr, %arg2: !llvm.ptr, %arg3: i64, %arg4: i64, @@ -14,7 +14,7 @@ module attributes {gpu.container_module} { } // CHECK-LABEL:gpu.binary @kernel_module2 - // CHECK:[#gpu.object<#nvvm.target, "{{.*}}">, #gpu.object<#nvvm.target, "{{.*}}">] + // CHECK-ISA:[#gpu.object<#nvvm.target, properties = {O = 2 : i32}, assembly = "{{.*}}">, #gpu.object<#nvvm.target, properties = {O = 2 : i32}, assembly = "{{.*}}">] gpu.module @kernel_module2 [#nvvm.target, #nvvm.target] { llvm.func @kernel(%arg0: i32, %arg1: !llvm.ptr, %arg2: !llvm.ptr, %arg3: i64, %arg4: i64, diff --git a/mlir/test/Dialect/GPU/module-to-binary-rocdl.mlir b/mlir/test/Dialect/GPU/module-to-binary-rocdl.mlir index fb7cfb70c17ed..9f987c71387f4 100644 --- a/mlir/test/Dialect/GPU/module-to-binary-rocdl.mlir +++ b/mlir/test/Dialect/GPU/module-to-binary-rocdl.mlir @@ -1,10 +1,10 @@ // REQUIRES: host-supports-amdgpu // RUN: mlir-opt %s --gpu-module-to-binary="format=llvm" | FileCheck %s -// RUN: mlir-opt %s --gpu-module-to-binary="format=isa" | FileCheck %s +// RUN: mlir-opt %s --gpu-module-to-binary="format=isa" | FileCheck %s -check-prefix=CHECK-ISA module attributes {gpu.container_module} { // CHECK-LABEL:gpu.binary @kernel_module1 - // CHECK:[#gpu.object<#rocdl.target, "{{.*}}">] + // CHECK:[#gpu.object<#rocdl.target, offload = "{{.*}}">] gpu.module @kernel_module1 [#rocdl.target] { llvm.func @kernel(%arg0: i32, %arg1: !llvm.ptr, %arg2: !llvm.ptr, %arg3: i64, %arg4: i64, @@ -14,7 +14,7 @@ module attributes {gpu.container_module} { } // CHECK-LABEL:gpu.binary @kernel_module2 - // CHECK:[#gpu.object<#rocdl.target, "{{.*}}">, #gpu.object<#rocdl.target, "{{.*}}">] + // CHECK-ISA:[#gpu.object<#rocdl.target, assembly = "{{.*}}">, #gpu.object<#rocdl.target, assembly = "{{.*}}">] gpu.module @kernel_module2 [#rocdl.target, #rocdl.target] { llvm.func @kernel(%arg0: i32, %arg1: !llvm.ptr, %arg2: !llvm.ptr, %arg3: i64, %arg4: i64, diff --git 
a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
index b314a768a0896..0d2f52e8adbfc 100644
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -127,6 +127,16 @@ module attributes {gpu.container_module} {
   gpu.binary @binary_3 <#gpu.select_object<1>> [#gpu.object<#nvvm.target, "">, #gpu.object<#nvvm.target, "">]

+  gpu.binary @binary_4 [#gpu.object<#nvvm.target, bin = "">,
+                        #gpu.object<#nvvm.target, assembly = "">,
+                        #gpu.object<#nvvm.target, offload = "">,
+                        #gpu.object<#nvvm.target, properties = { O = 3 : i32 }, offload = "">
+                        ]
+
+  // Check that fatbin gets elided as it's the default format.
+  // CHECK: gpu.binary @binary_5 [#gpu.object<#nvvm.target, properties = {O = 3 : i32}, "">]
+  gpu.binary @binary_5 [#gpu.object<#nvvm.target, properties = {O = 3 : i32}, fatbin = "">]
+
   func.func private @two_value_generator() -> (f32, memref<?xf32, 1>)

   func.func @foo() {
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/lit.local.cfg b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/lit.local.cfg
index 6788ccea3a222..19f12d39c8428 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/lit.local.cfg
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/lit.local.cfg
@@ -1,2 +1,4 @@
 if not config.enable_cuda_runner or not config.mlir_run_cuda_sm80_tests:
   config.unsupported = True
+
+config.substitutions.append(("%gpu_compilation_format", config.gpu_compilation_format))
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib-from-linalg.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib-from-linalg.mlir
index aa71abbcf0e71..67c8ce8dfa300 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib-from-linalg.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib-from-linalg.mlir
@@ -2,7 +2,7 @@
 // NOTE: this test requires gpu-sm80 and cusparselt
 //
 // DEFINE: %{compile} = mlir-opt %s \
-// DEFINE:   --sparse-compiler="enable-runtime-library=true enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71
+// DEFINE:   --sparse-compiler="enable-runtime-library=true enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%gpu_compilation_format
 // DEFINE: %{run} = mlir-cpu-runner \
 // DEFINE:   --shared-libs=%mlir_cuda_runtime \
 // DEFINE:   --shared-libs=%mlir_c_runner_utils \
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-prune.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-prune.mlir
index 062798a39b810..8917ab1e5a70d 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-prune.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-prune.mlir
@@ -1,7 +1,7 @@
 //
 // NOTE: this test requires gpu-sm80 and cusparselt
 //
-// RUN: mlir-opt --sparse-compiler="enable-runtime-library=false enable-gpu-libgen=true gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \
+// RUN: mlir-opt --sparse-compiler="enable-runtime-library=false enable-gpu-libgen=true gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%gpu_compilation_format" \
 // RUN: %s \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-gemm-lib.mlir
b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-gemm-lib.mlir index a310e59d53038..9e29dbcca7ff4 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-gemm-lib.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-gemm-lib.mlir @@ -4,7 +4,7 @@ // with RT lib: // // RUN: mlir-opt %s \ -// RUN: --sparse-compiler="enable-runtime-library=true enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \ +// RUN: --sparse-compiler="enable-runtime-library=true enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%gpu_compilation_format" \ // RUN: | mlir-cpu-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_c_runner_utils \ @@ -14,7 +14,7 @@ // without RT lib: // // RUN: mlir-opt %s \ -// RUN: --sparse-compiler="enable-runtime-library=false enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \ +// RUN: --sparse-compiler="enable-runtime-library=false enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%gpu_compilation_format" \ // RUN: | mlir-cpu-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_c_runner_utils \ diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir index 1e51aae5f3892..b21576635eddd 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matmul-lib.mlir @@ -2,7 +2,7 @@ // NOTE: this test requires gpu-sm80 // // DEFINE: %{compile} = mlir-opt %s \ -// DEFINE: --sparse-compiler="enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 +// DEFINE: --sparse-compiler="enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%gpu_compilation_format // DEFINE: %{run} = mlir-cpu-runner \ // DEFINE: --shared-libs=%mlir_cuda_runtime \ // DEFINE: --shared-libs=%mlir_c_runner_utils \ diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-const.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-const.mlir index ca47de6cca27f..9f995e2d13492 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-const.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-const.mlir @@ -3,7 +3,7 @@ // NOTE: this test requires gpu-sm80 // // RUN: mlir-opt %s \ -// RUN: --sparse-compiler="enable-runtime-library=false parallelization-strategy=dense-outer-loop gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \ +// RUN: --sparse-compiler="enable-runtime-library=false parallelization-strategy=dense-outer-loop gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%gpu_compilation_format" \ // RUN: | mlir-cpu-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_c_runner_utils \ diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir index 16a240838d7c4..b6dfce577f2a4 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-lib.mlir @@ -2,7 +2,7 @@ // NOTE: this test requires gpu-sm80 // // DEFINE: %{compile} = mlir-opt %s \ -// DEFINE: 
--sparse-compiler="enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 +// DEFINE: --sparse-compiler="enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%gpu_compilation_format // DEFINE: %{run} = mlir-cpu-runner \ // DEFINE: --shared-libs=%mlir_cuda_runtime \ // DEFINE: --shared-libs=%mlir_c_runner_utils \ diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec.mlir index c5c3546cdf016..c6faf2660541a 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec.mlir @@ -3,7 +3,7 @@ // NOTE: this test requires gpu-sm80 // // RUN: mlir-opt %s \ -// RUN: --sparse-compiler="enable-runtime-library=false parallelization-strategy=dense-outer-loop gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \ +// RUN: --sparse-compiler="enable-runtime-library=false parallelization-strategy=dense-outer-loop gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%gpu_compilation_format" \ // RUN: | mlir-cpu-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_c_runner_utils \ diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir index aee8a6a6558e4..8ee7a266083b0 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir @@ -6,7 +6,7 @@ // RUN: mlir-opt \ // RUN: --pass-pipeline="builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-nvgpu-to-nvvm,affine-expand-index-ops,lower-affine,convert-arith-to-llvm),convert-vector-to-llvm,canonicalize,cse)" \ // RUN: %s \ -// RUN: | mlir-opt --test-lower-to-nvvm="cubin-chip=sm_80 cubin-features=+ptx71" \ +// RUN: | mlir-opt --test-lower-to-nvvm="cubin-chip=sm_80 cubin-features=+ptx71 cubin-format=%gpu_compilation_format" \ // RUN: | mlir-cpu-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_c_runner_utils \ diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir index 934bd837420c1..850c1ca069a1a 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir @@ -2,7 +2,7 @@ // NOTE: this test requires gpu-sm80 // // DEFINE: %{compile} = mlir-opt %s \ -// DEFINE: --sparse-compiler="enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 +// DEFINE: --sparse-compiler="enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%gpu_compilation_format // DEFINE: %{run} = TENSOR0="%mlir_src_dir/test/Integration/data/test.mtx" \ // DEFINE: mlir-cpu-runner \ // DEFINE: --shared-libs=%mlir_cuda_runtime \ diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f16-f16-accum.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f16-f16-accum.mlir index d959fdb6a9db1..21f0e24d5e5da 100644 --- a/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f16-f16-accum.mlir +++ 
b/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f16-f16-accum.mlir @@ -3,7 +3,7 @@ // RUN: mlir-opt %s \ // RUN: -test-transform-dialect-interpreter \ // RUN: -test-transform-dialect-erase-schedule \ -// RUN: -test-lower-to-nvvm="kernel-index-bitwidth=32 cubin-chip=sm_80 cubin-features=+ptx76" \ +// RUN: -test-lower-to-nvvm="kernel-index-bitwidth=32 cubin-chip=sm_80 cubin-features=+ptx76 cubin-format=%gpu_compilation_format" \ // RUN: | mlir-cpu-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f32.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f32.mlir index 0ec15f2a9c79d..22b422acf7cb5 100644 --- a/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f32.mlir +++ b/mlir/test/Integration/GPU/CUDA/TensorCore/sm80/transform-mma-sync-matmul-f32.mlir @@ -13,7 +13,7 @@ // RUN: mlir-opt %s \ // RUN: -test-transform-dialect-interpreter \ // RUN: -test-transform-dialect-erase-schedule \ -// RUN: -test-lower-to-nvvm="kernel-index-bitwidth=32 cubin-chip=sm_80 cubin-features=+ptx76" \ +// RUN: -test-lower-to-nvvm="kernel-index-bitwidth=32 cubin-chip=sm_80 cubin-features=+ptx76 cubin-format=%gpu_compilation_format" \ // RUN: | mlir-cpu-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir index 4d8a281113593..a20da0673d653 100644 --- a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir +++ b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir @@ -1,7 +1,7 @@ // REQUIRES: host-supports-nvptx // RUN: mlir-opt %s \ -// RUN: | mlir-opt -test-lower-to-nvvm="cubin-chip=sm_70" \ +// RUN: | mlir-opt -test-lower-to-nvvm="cubin-chip=sm_70 cubin-format=%gpu_compilation_format" \ // RUN: | mlir-cpu-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir index 664d344b2769b..643f563bcb7b7 100644 --- a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir +++ b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir @@ -5,7 +5,7 @@ // Similar to the wmma-matmul-f32 but but with the memref bare pointer lowering convention. // This test also uses gpu.memcpy operations (instead of gpu.host_register). 
// RUN: mlir-opt %s \ -// RUN: | mlir-opt -test-lower-to-nvvm="host-bare-ptr-calling-convention=1 kernel-bare-ptr-calling-convention=1 cubin-chip=sm_70" \ +// RUN: | mlir-opt -test-lower-to-nvvm="host-bare-ptr-calling-convention=1 kernel-bare-ptr-calling-convention=1 cubin-chip=sm_70 cubin-format=%gpu_compilation_format" \ // RUN: | mlir-cpu-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --entry-point-result=void \ diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir index 4d76eb898dc29..4159b5b8e57e9 100644 --- a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir +++ b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir @@ -1,7 +1,7 @@ // REQUIRES: host-supports-nvptx // RUN: mlir-opt %s \ -// RUN: | mlir-opt -test-lower-to-nvvm="cubin-chip=sm_70" \ +// RUN: | mlir-opt -test-lower-to-nvvm="cubin-chip=sm_70 cubin-format=%gpu_compilation_format" \ // RUN: | mlir-cpu-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir index c48a515ed0221..558e19d15e910 100644 --- a/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir +++ b/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir @@ -10,7 +10,7 @@ // Same as above but with the memref bare pointer lowering convention. // RUN: mlir-opt %s \ -// RUN: | mlir-opt -test-lower-to-nvvm="kernel-bare-ptr-calling-convention=1" \ +// RUN: | mlir-opt -test-lower-to-nvvm="kernel-bare-ptr-calling-convention=1 cubin-format=%gpu_compilation_format" \ // RUN: | mlir-cpu-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir index e8ffc3f830c7c..f1ae0be2d9871 100644 --- a/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir +++ b/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir @@ -1,7 +1,7 @@ // REQUIRES: host-supports-nvptx // RUN: mlir-opt %s \ -// RUN: | mlir-opt -test-lower-to-nvvm \ +// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%gpu_compilation_format" \ // RUN: | mlir-cpu-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir index fde50e9b6b92f..191f64d297a3b 100644 --- a/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir +++ b/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir @@ -1,7 +1,7 @@ // REQUIRES: host-supports-nvptx // RUN: mlir-opt %s \ -// RUN: | mlir-opt -test-lower-to-nvvm \ +// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%gpu_compilation_format" \ // RUN: | mlir-cpu-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir index 08c3571ef1c35..089b18e311d53 100644 --- a/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir +++ b/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir @@ -1,7 +1,7 @@ // REQUIRES: host-supports-nvptx // RUN: mlir-opt %s \ -// RUN: | mlir-opt -test-lower-to-nvvm \ +// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%gpu_compilation_format" \ // RUN: | mlir-cpu-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git 
a/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir
index 134296f39c2b4..686c3931ee3d2 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir
@@ -1,7 +1,7 @@
 // REQUIRES: host-supports-nvptx
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%gpu_compilation_format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir
index c2be1b65950ea..e8903b0812ef3 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir
@@ -1,7 +1,7 @@
 // REQUIRES: host-supports-nvptx
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%gpu_compilation_format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir
index 6b75321b7bfc2..cfb22b3d470df 100644
--- a/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir
+++ b/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir
@@ -1,7 +1,7 @@
 // REQUIRES: host-supports-nvptx
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%gpu_compilation_format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/async.mlir b/mlir/test/Integration/GPU/CUDA/async.mlir
index 1314d32a779a8..be26ff6d7ac18 100644
--- a/mlir/test/Integration/GPU/CUDA/async.mlir
+++ b/mlir/test/Integration/GPU/CUDA/async.mlir
@@ -3,7 +3,7 @@
 // RUN: mlir-opt %s \
 // RUN: | mlir-opt -gpu-kernel-outlining \
 // RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm),nvvm-attach-target)' \
-// RUN: | mlir-opt -gpu-async-region -gpu-to-llvm -gpu-module-to-binary \
+// RUN: | mlir-opt -gpu-async-region -gpu-to-llvm -gpu-module-to-binary="format=%gpu_compilation_format" \
 // RUN: | mlir-opt -async-to-async-runtime -async-runtime-ref-counting \
 // RUN: | mlir-opt -convert-async-to-llvm -convert-func-to-llvm \
 // RUN: | mlir-cpu-runner \
diff --git a/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir b/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir
index abc93f7b1703a..f3587ddfeecc7 100644
--- a/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir
+++ b/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir
@@ -1,7 +1,7 @@
 // REQUIRES: host-supports-nvptx
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%gpu_compilation_format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/lit.local.cfg b/mlir/test/Integration/GPU/CUDA/lit.local.cfg
index acb8dd43f50b4..5f1e33e87df9c 100644
--- a/mlir/test/Integration/GPU/CUDA/lit.local.cfg
+++ b/mlir/test/Integration/GPU/CUDA/lit.local.cfg
@@ -1,2 +1,4 @@
 if not config.enable_cuda_runner:
     config.unsupported = True
+
+config.substitutions.append(("%gpu_compilation_format", config.gpu_compilation_format))
diff --git a/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir b/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir
index 3389f805ac63d..3c6135d1d72af 100644
--- a/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir
+++ b/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir
@@ -1,7 +1,7 @@
 // REQUIRES: host-supports-nvptx
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%gpu_compilation_format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/printf.mlir b/mlir/test/Integration/GPU/CUDA/printf.mlir
index eef5ac66ca52a..52560e9ebb06e 100644
--- a/mlir/test/Integration/GPU/CUDA/printf.mlir
+++ b/mlir/test/Integration/GPU/CUDA/printf.mlir
@@ -1,7 +1,7 @@
 // REQUIRES: host-supports-nvptx
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%gpu_compilation_format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/shuffle.mlir b/mlir/test/Integration/GPU/CUDA/shuffle.mlir
index 05cb854d18dd4..16ccf90595c57 100644
--- a/mlir/test/Integration/GPU/CUDA/shuffle.mlir
+++ b/mlir/test/Integration/GPU/CUDA/shuffle.mlir
@@ -1,7 +1,7 @@
 // REQUIRES: host-supports-nvptx
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%gpu_compilation_format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir
index de68d3b90f11f..6e32eb147d499 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_128x64_swizzle128b.mlir
@@ -16,7 +16,7 @@
 // RUN:   -canonicalize -cse \
 // RUN:   -expand-strided-metadata --nvvm-attach-target="module=main_kernel features=+ptx80 chip=sm_90 O=3" \
 // RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-index-to-llvm{index-bitwidth=32},canonicalize,cse))' \
-// RUN: | mlir-opt --gpu-to-llvm --gpu-module-to-binary -canonicalize -cse -reconcile-unrealized-casts \
+// RUN: | mlir-opt --gpu-to-llvm --gpu-module-to-binary=format=%gpu_compilation_format -canonicalize -cse -reconcile-unrealized-casts \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
index 6d99852205815..760ded16556ff 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x8_8x128_noswizzle.mlir
@@ -10,7 +10,7 @@
 // RUN:   -convert-func-to-llvm \
 // RUN:   -expand-strided-metadata --nvvm-attach-target="module=main_kernel features=+ptx80 chip=sm_90 O=3" \
 // RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-index-to-llvm{index-bitwidth=32},canonicalize,cse))' \
-// RUN: | mlir-opt --gpu-to-llvm --gpu-module-to-binary -canonicalize -cse -reconcile-unrealized-casts -debug-only=serialize-to-isa \
+// RUN: | mlir-opt --gpu-to-llvm --gpu-module-to-binary=format=%gpu_compilation_format -canonicalize -cse -reconcile-unrealized-casts -debug-only=serialize-to-isa \
 // RUN: 2>&1 | FileCheck %s --check-prefixes=CHECK-PTX

 // Basic PTX check to make sure we are generating the right instructions.
@@ -34,7 +34,7 @@
 // RUN:   -convert-func-to-llvm \
 // RUN:   -expand-strided-metadata --nvvm-attach-target="module=main_kernel features=+ptx80 chip=sm_90 O=3" \
 // RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-index-to-llvm{index-bitwidth=32},canonicalize,cse))' \
-// RUN: | mlir-opt --gpu-to-llvm --gpu-module-to-binary -canonicalize -cse -reconcile-unrealized-casts \
+// RUN: | mlir-opt --gpu-to-llvm --gpu-module-to-binary=format=%gpu_compilation_format -canonicalize -cse -reconcile-unrealized-casts \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/sm90/transform-dialect/tma_load_64x8_8x128_noswizzle-transform.mlir b/mlir/test/Integration/GPU/CUDA/sm90/transform-dialect/tma_load_64x8_8x128_noswizzle-transform.mlir
index e66978bc594b1..da38c160cf287 100644
--- a/mlir/test/Integration/GPU/CUDA/sm90/transform-dialect/tma_load_64x8_8x128_noswizzle-transform.mlir
+++ b/mlir/test/Integration/GPU/CUDA/sm90/transform-dialect/tma_load_64x8_8x128_noswizzle-transform.mlir
@@ -16,7 +16,7 @@
 // RUN:   -canonicalize \
 // RUN:   -expand-strided-metadata --nvvm-attach-target="module=main_kernel features=+ptx80 chip=sm_90 O=3" \
 // RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-index-to-llvm{index-bitwidth=32},canonicalize,cse))' \
-// RUN: | mlir-opt --gpu-to-llvm --gpu-module-to-binary -canonicalize -cse -reconcile-unrealized-casts -debug-only=serialize-to-isa \
+// RUN: | mlir-opt --gpu-to-llvm --gpu-module-to-binary=format=%gpu_compilation_format -canonicalize -cse -reconcile-unrealized-casts -debug-only=serialize-to-isa \
 // RUN: 2>&1 | FileCheck %s --check-prefixes=CHECK-PTX

 // Basic PTX check to make sure we are generating the right instructions.
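Note on the plumbing above: every RUN line now consumes the `%gpu_compilation_format` lit substitution, which `lit.local.cfg` binds to `config.gpu_compilation_format` (populated from `lit.site.cfg.py.in` below), so the whole CUDA integration suite can be switched between compilation formats at configure time. As a rough sketch of what a consumer of that flag has to do with the string, the helper below maps textual format names onto the new `CompilationTarget` enum; the accepted spellings are illustrative assumptions (the enum mnemonics from `GPU_CompilationTargetEnum` plus the `isa` default used by `-test-lower-to-nvvm`), and `parseFormatFlag` is a hypothetical name, not existing MLIR API.

```cpp
#include <optional>

#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h" // assumed to provide gpu::CompilationTarget

// Hypothetical helper: map a textual format flag (the kind of value lit
// substitutes for %gpu_compilation_format) onto gpu::CompilationTarget.
// The spellings below are assumptions for illustration, not the exact set
// recognized by gpu-module-to-binary.
static std::optional<mlir::gpu::CompilationTarget>
parseFormatFlag(llvm::StringRef flag) {
  return llvm::StringSwitch<std::optional<mlir::gpu::CompilationTarget>>(flag)
      .Case("offload", mlir::gpu::CompilationTarget::Offload)
      .Cases("isa", "assembly", mlir::gpu::CompilationTarget::Assembly)
      .Case("bin", mlir::gpu::CompilationTarget::Binary)
      .Case("fatbin", mlir::gpu::CompilationTarget::Fatbin)
      .Default(std::nullopt);
}
```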
@@ -43,7 +43,7 @@
 // RUN:   -canonicalize \
 // RUN:   -expand-strided-metadata --nvvm-attach-target="module=main_kernel features=+ptx80 chip=sm_90 O=3" \
 // RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-index-to-llvm{index-bitwidth=32},canonicalize,cse))' \
-// RUN: | mlir-opt --gpu-to-llvm --gpu-module-to-binary -canonicalize -cse -reconcile-unrealized-casts \
+// RUN: | mlir-opt --gpu-to-llvm --gpu-module-to-binary=format=%gpu_compilation_format -canonicalize -cse -reconcile-unrealized-casts \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/Integration/GPU/CUDA/two-modules.mlir b/mlir/test/Integration/GPU/CUDA/two-modules.mlir
index fde66de2fce6e..8cbbe000bfa74 100644
--- a/mlir/test/Integration/GPU/CUDA/two-modules.mlir
+++ b/mlir/test/Integration/GPU/CUDA/two-modules.mlir
@@ -1,7 +1,7 @@
 // REQUIRES: host-supports-nvptx
 // RUN: mlir-opt %s \
-// RUN: | mlir-opt -test-lower-to-nvvm \
+// RUN: | mlir-opt -test-lower-to-nvvm="cubin-format=%gpu_compilation_format" \
 // RUN: | mlir-cpu-runner \
 // RUN:   --shared-libs=%mlir_cuda_runtime \
 // RUN:   --shared-libs=%mlir_runner_utils \
diff --git a/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp b/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
index 5d0c420f65d53..a554fe6dcadb1 100644
--- a/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
+++ b/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp
@@ -70,6 +70,10 @@ struct TestLowerToNVVMOptions
       *this, "cubin-features",
       llvm::cl::desc("Features to use to serialize to cubin."),
       llvm::cl::init("+ptx60")};
+  PassOptions::Option<std::string> cubinFormat{
+      *this, "cubin-format",
+      llvm::cl::desc("Compilation format to use to serialize to cubin."),
+      llvm::cl::init("isa")};
 };

 //===----------------------------------------------------------------------===//
@@ -257,7 +261,9 @@ void buildLowerToNVVMPassPipeline(OpPassManager &pm,
   pm.addPass(createGpuToLLVMConversionPass(gpuToLLVMConversionOptions));

   // Serialize all GPU modules to binaries.
-  pm.addPass(createGpuModuleToBinaryPass());
+  GpuModuleToBinaryPassOptions gpuModuleToBinaryPassOptions;
+  gpuModuleToBinaryPassOptions.compilationTarget = options.cubinFormat;
+  pm.addPass(createGpuModuleToBinaryPass(gpuModuleToBinaryPassOptions));

   // Convert vector to LLVM (always needed).
   // TODO: C++20 designated initializers.
diff --git a/mlir/test/lit.site.cfg.py.in b/mlir/test/lit.site.cfg.py.in
index ef1fdbc0cba07..2de40ba5e8e57 100644
--- a/mlir/test/lit.site.cfg.py.in
+++ b/mlir/test/lit.site.cfg.py.in
@@ -29,6 +29,7 @@ config.run_cuda_tests = @MLIR_ENABLE_CUDA_CONVERSIONS@
 config.enable_cuda_runner = @MLIR_ENABLE_CUDA_RUNNER@
 config.run_rocm_tests = @MLIR_ENABLE_ROCM_CONVERSIONS@
 config.enable_rocm_runner = @MLIR_ENABLE_ROCM_RUNNER@
+config.gpu_compilation_format = "@MLIR_GPU_COMPILATION_TEST_FORMAT@"
 config.rocm_test_chipset = "@ROCM_TEST_CHIPSET@"
 config.enable_spirv_cpu_runner = @MLIR_ENABLE_SPIRV_CPU_RUNNER@
 config.enable_vulkan_runner = @MLIR_ENABLE_VULKAN_RUNNER@
diff --git a/mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp b/mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp
index 62c9b527e1e38..a00ebba7b97e6 100644
--- a/mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp
+++ b/mlir/unittests/Target/LLVM/SerializeNVVMTarget.cpp
@@ -79,7 +79,7 @@ TEST_F(MLIRTargetLLVMNVVM, SKIP_WITHOUT_NVPTX(SerializeNVVMMToLLVM)) {
   // Serialize the module.
   auto serializer = dyn_cast<gpu::TargetAttrInterface>(target);
   ASSERT_TRUE(!!serializer);
-  gpu::TargetOptions options("", {}, "", gpu::TargetOptions::offload);
+  gpu::TargetOptions options("", {}, "", gpu::CompilationTarget::Offload);
   for (auto gpuModule : (*module).getBody()->getOps<gpu::GPUModuleOp>()) {
     std::optional<SmallVector<char, 0>> object =
         serializer.serializeToObject(gpuModule, options);
@@ -115,7 +115,7 @@ TEST_F(MLIRTargetLLVMNVVM, SKIP_WITHOUT_NVPTX(SerializeNVVMToPTX)) {
   // Serialize the module.
   auto serializer = dyn_cast<gpu::TargetAttrInterface>(target);
   ASSERT_TRUE(!!serializer);
-  gpu::TargetOptions options("", {}, "", gpu::TargetOptions::assembly);
+  gpu::TargetOptions options("", {}, "", gpu::CompilationTarget::Assembly);
   for (auto gpuModule : (*module).getBody()->getOps<gpu::GPUModuleOp>()) {
     std::optional<SmallVector<char, 0>> object =
         serializer.serializeToObject(gpuModule, options);
@@ -145,7 +145,7 @@ TEST_F(MLIRTargetLLVMNVVM, SKIP_WITHOUT_NVPTX(SerializeNVVMToBinary)) {
   // Serialize the module.
   auto serializer = dyn_cast<gpu::TargetAttrInterface>(target);
   ASSERT_TRUE(!!serializer);
-  gpu::TargetOptions options("", {}, "", gpu::TargetOptions::binary);
+  gpu::TargetOptions options("", {}, "", gpu::CompilationTarget::Binary);
   for (auto gpuModule : (*module).getBody()->getOps<gpu::GPUModuleOp>()) {
     std::optional<SmallVector<char, 0>> object =
         serializer.serializeToObject(gpuModule, options);
diff --git a/mlir/unittests/Target/LLVM/SerializeROCDLTarget.cpp b/mlir/unittests/Target/LLVM/SerializeROCDLTarget.cpp
index 89cf5c5d2ada5..9ada2dab40ff7 100644
--- a/mlir/unittests/Target/LLVM/SerializeROCDLTarget.cpp
+++ b/mlir/unittests/Target/LLVM/SerializeROCDLTarget.cpp
@@ -83,7 +83,7 @@ TEST_F(MLIRTargetLLVMROCDL, SKIP_WITHOUT_AMDGPU(SerializeROCDLMToLLVM)) {
   // Serialize the module.
   auto serializer = dyn_cast<gpu::TargetAttrInterface>(target);
   ASSERT_TRUE(!!serializer);
-  gpu::TargetOptions options("", {}, "", gpu::TargetOptions::offload);
+  gpu::TargetOptions options("", {}, "", gpu::CompilationTarget::Offload);
   for (auto gpuModule : (*module).getBody()->getOps<gpu::GPUModuleOp>()) {
     std::optional<SmallVector<char, 0>> object =
         serializer.serializeToObject(gpuModule, options);
@@ -119,7 +119,7 @@ TEST_F(MLIRTargetLLVMROCDL, SKIP_WITHOUT_AMDGPU(SerializeROCDLToPTX)) {
   // Serialize the module.
   auto serializer = dyn_cast<gpu::TargetAttrInterface>(target);
   ASSERT_TRUE(!!serializer);
-  gpu::TargetOptions options("", {}, "", gpu::TargetOptions::assembly);
+  gpu::TargetOptions options("", {}, "", gpu::CompilationTarget::Assembly);
   for (auto gpuModule : (*module).getBody()->getOps<gpu::GPUModuleOp>()) {
     std::optional<SmallVector<char, 0>> object =
         serializer.serializeToObject(gpuModule, options);
@@ -149,7 +149,7 @@ TEST_F(MLIRTargetLLVMROCDL, SKIP_WITHOUT_AMDGPU(SerializeROCDLToBinary)) {
   // Serialize the module.
   auto serializer = dyn_cast<gpu::TargetAttrInterface>(target);
   ASSERT_TRUE(!!serializer);
-  gpu::TargetOptions options("", {}, "", gpu::TargetOptions::binary);
+  gpu::TargetOptions options("", {}, "", gpu::CompilationTarget::Binary);
   for (auto gpuModule : (*module).getBody()->getOps<gpu::GPUModuleOp>()) {
     std::optional<SmallVector<char, 0>> object =
         serializer.serializeToObject(gpuModule, options);
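With the nested `TargetOptions::CompilationTarget` enum replaced by the scoped `gpu::CompilationTarget`, downstream pipelines that previously called `createGpuModuleToBinaryPass()` with no arguments can forward a format string the same way `TestLowerToNVVM.cpp` now does. A minimal sketch, assuming `pm` is an `OpPassManager` already being built and `formatFlag` is a hypothetical caller-supplied string:

```cpp
// Forward a user-chosen compilation format into the serialization pass,
// mirroring the TestLowerToNVVM.cpp hunk above. `formatFlag` is a
// hypothetical caller-supplied string; leaving the option unset keeps the
// default format (a fatbin).
GpuModuleToBinaryPassOptions binaryOptions;
binaryOptions.compilationTarget = formatFlag; // e.g. "isa" or "fatbin"
pm.addPass(createGpuModuleToBinaryPass(binaryOptions));
```

The same scoped enum is what the unit tests above now hand directly to the `gpu::TargetOptions` constructor, e.g. `gpu::TargetOptions("", {}, "", gpu::CompilationTarget::Assembly)`.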