
[mlir][gpu][NVPTX] Enable NVIDIA GPU JIT compilation path #66220


Merged: 4 commits, Sep 14, 2023

15 changes: 12 additions & 3 deletions mlir/include/mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td
@@ -33,12 +33,21 @@ def GPUTargetAttrInterface : AttrInterface<"TargetAttrInterface"> {

If serialization fails then the method should return `std::nullopt`.

The `module` argument must be a GPU Module Op. The `options` argument is
meant to be used for passing additional options that are not in the
The `module` parameter must be a GPU Module Op. The `options` parameter
is meant to be used for passing additional options that are not in the
attribute.
}],
"std::optional<SmallVector<char, 0>>", "serializeToObject",
(ins "Operation*":$module, "const gpu::TargetOptions&":$options)>
(ins "Operation*":$module, "const gpu::TargetOptions&":$options)>,
InterfaceMethod<[{
Creates a GPU object attribute from a binary string.

The `object` parameter is a binary string. The `options` parameter is
meant to be used for passing additional options that are not in the
attribute.
}], "Attribute", "createObject",
(ins "const SmallVector<char, 0>&":$object,
"const gpu::TargetOptions&":$options)>
];
}

56 changes: 49 additions & 7 deletions mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td
@@ -20,20 +20,62 @@ include "mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td"
// GPU object attribute.
//===----------------------------------------------------------------------===//

// For documentation on these enum cases, see the `GPU_ObjectAttr` docs.
def GPU_ObjectOffload : I32EnumAttrCase<"Offload", 1, "offload">;
def GPU_ObjectISA : I32EnumAttrCase<"Assembly", 2, "assembly">;
def GPU_ObjectBinary : I32EnumAttrCase<"Binary", 3, "bin">;
def GPU_ObjectFatbin : I32EnumAttrCase<"Fatbin", 4, "fatbin">;
def GPU_CompilationTargetEnum : GPU_I32Enum<
"CompilationTarget", "GPU compilation format", [
GPU_ObjectOffload,
GPU_ObjectISA,
GPU_ObjectBinary,
GPU_ObjectFatbin
]>;

Collaborator:

This deserves some doc.

(I'm not totally sure right now what "offload" does in this list actually)

Contributor Author (@fabianmcg, Sep 14, 2023):

I added the docs in the ObjectAttr docs. The offload format is meant to be a generic format; for NVPTX & AMDGPU it generates LLVM bitcode. Execution from this format is not enabled in trunk; however, downstream users could use it.


def GPU_ObjectAttr : GPU_Attr<"Object", "object"> {
let description = [{
A GPU object attribute pairs a GPU target with a binary string,
encapsulating the information of how the object was generated with the
object itself.
A GPU object attribute glues together a GPU target, the object kind, a
binary string with the object, and the object properties, encapsulating how
the object was generated and its properties with the object itself.

There are four object formats:
1. `Offload`: represents generic objects not described by the other three
formats, and its meaning is target-dependent. For example, on the NVPTX and
AMDGPU targets, this format is associated with LLVM bitcode.
2. `Assembly`: represents GPU assembly code. For example, in the NVPTX
target, assembly is PTX code, which can be JITted at runtime.
3. `Binary`: represents executable code for a single GPU architecture. For
example, PTX code that was compiled for a specific compute capability. Note
that this format is likely to throw an error if there is an architecture
mismatch between the compiled and running architectures.
4. `Fatbin`: represents a GPU fat binary with executable code for multiple
architectures. This format is the default; thus, it gets elided in assembly
code.

The target attribute must implement the `TargetAttrInterface` interface.
Object properties are specified through the `properties` dictionary
attribute and can be used to define additional information.
The target attribute must implement or promise the `TargetAttrInterface`
interface.

```
#gpu.object<#nvvm.target, "...">
#gpu.object<#rocdl.target, offload = "..."> // An offload object.
#gpu.object<#nvvm.target, properties = {O = 3 : i32}, assembly = "..."> // An assembly object with additional properties.
#gpu.object<#rocdl.target, bin = "..."> // A binary object.
#gpu.object<#nvvm.target, "..."> // A fatbin object.
```
}];

Collaborator:
Update the doc please

let parameters = (ins "Attribute":$target, "StringAttr":$object);
let assemblyFormat = [{`<` $target `,` $object `>`}];
let parameters = (ins
"Attribute":$target,
DefaultValuedParameter<"CompilationTarget", "CompilationTarget::Fatbin">:$format,
"StringAttr":$object,
OptionalParameter<"DictionaryAttr">:$properties
);
let assemblyFormat = [{ `<`
$target `,` (`properties` `=` $properties ^ `,`)?
custom<Object>($format, $object)
`>`
}];
let genVerifyDecl = 1;
}
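
A hedged sketch (not part of the diff) of constructing the extended object attribute from C++; the builder call and header name are assumed from the generated parameter list above:

```
// Assumed builder signature: ctx, target, format, object, properties.
#include "mlir/Dialect/GPU/IR/GPUDialect.h"

mlir::gpu::ObjectAttr makePtxObject(mlir::MLIRContext *ctx,
                                    mlir::Attribute target,
                                    llvm::StringRef ptx) {
  // An `assembly` (PTX) object with no extra properties.
  return mlir::gpu::ObjectAttr::get(
      ctx, target, mlir::gpu::CompilationTarget::Assembly,
      mlir::StringAttr::get(ctx, ptx), /*properties=*/nullptr);
}
```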

40 changes: 17 additions & 23 deletions mlir/include/mlir/Dialect/GPU/IR/CompilationInterfaces.h
@@ -25,6 +25,8 @@ namespace LLVM {
class ModuleTranslation;
}
namespace gpu {
enum class CompilationTarget : uint32_t;

/// This class indicates that the attribute associated with this trait is a GPU
/// offloading translation attribute. These kinds of attributes must implement
/// an interface for handling the translation of GPU offloading operations like
@@ -42,27 +44,15 @@ class OffloadingTranslationAttrTrait
/// ensure type safety. Targets are free to ignore these options.
class TargetOptions {
public:
/// The target representation of the compilation process.
typedef enum {
offload = 1, /// The process should produce an offloading representation.
/// For the NVVM & ROCDL targets this option produces LLVM IR.
assembly = 2, /// The process should produce assembly code.
binary = 4, /// The process should produce a binary.
fatbinary = 8, /// The process should produce a fat binary.
binOrFatbin =
binary |
fatbinary, /// The process should produce a binary or fatbinary. It's up
/// to the target to decide which.

Collaborator:
(this is the doc that may have been lost moving to ODS, cf the other comment above)

} CompilationTarget;

/// Constructor initializing the toolkit path, the list of files to link to,
/// extra command line options, the compilation target and a callback for
/// obtaining the parent symbol table. The default compilation target is
/// `binOrFatbin`.
TargetOptions(StringRef toolkitPath = {},
ArrayRef<std::string> linkFiles = {}, StringRef cmdOptions = {},
CompilationTarget compilationTarget = binOrFatbin,
function_ref<SymbolTable *()> getSymbolTableCallback = {});
/// `Fatbin`.
TargetOptions(
StringRef toolkitPath = {}, ArrayRef<std::string> linkFiles = {},
StringRef cmdOptions = {},
CompilationTarget compilationTarget = getDefaultCompilationTarget(),
function_ref<SymbolTable *()> getSymbolTableCallback = {});

/// Returns the typeID.
TypeID getTypeID() const;
@@ -90,13 +80,17 @@ class TargetOptions {
/// table.
SymbolTable *getSymbolTable() const;

/// Returns the default compilation target: `CompilationTarget::Fatbin`.
static CompilationTarget getDefaultCompilationTarget();

protected:
/// Derived classes must use this constructor to initialize `typeID` to the
/// appropriate value, i.e. `TargetOptions(TypeID::get<DerivedClass>())`.
TargetOptions(TypeID typeID, StringRef toolkitPath = {},
ArrayRef<std::string> linkFiles = {}, StringRef cmdOptions = {},
CompilationTarget compilationTarget = binOrFatbin,
function_ref<SymbolTable *()> getSymbolTableCallback = {});
TargetOptions(
TypeID typeID, StringRef toolkitPath = {},
ArrayRef<std::string> linkFiles = {}, StringRef cmdOptions = {},
CompilationTarget compilationTarget = getDefaultCompilationTarget(),
function_ref<SymbolTable *()> getSymbolTableCallback = {});

/// Path to the target toolkit.
std::string toolkitPath;
@@ -108,7 +102,7 @@
/// process.
std::string cmdOptions;

/// Compilation process target representation.
/// Compilation process target format.
CompilationTarget compilationTarget;

/// Callback for obtaining the parent symbol table of all the GPU modules
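
For reference, a minimal sketch (not from the patch) of requesting PTX output through these options; it assumes only the constructor and `CompilationTarget` enum shown above, and the header names are taken from the files this PR touches:

```
#include "mlir/Dialect/GPU/IR/CompilationInterfaces.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h" // assumed to bring in the generated enum

mlir::gpu::TargetOptions makePtxTargetOptions() {
  // Toolkit path, link files and command-line options stay empty; only the
  // compilation format differs from the Fatbin default.
  return mlir::gpu::TargetOptions(/*toolkitPath=*/"", /*linkFiles=*/{},
                                  /*cmdOptions=*/"",
                                  mlir::gpu::CompilationTarget::Assembly);
}
```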
3 changes: 1 addition & 2 deletions mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
@@ -68,7 +68,6 @@ def GpuModuleToBinaryPass
2. `assembly`, `isa`: produces assembly code.
3. `binary`, `bin`: produces binaries.
4. `fatbinary`, `fatbin`: produces fatbinaries.
5. `binOrFatbin`: produces bins or fatbins, the target decides which.
}];
let options = [
Option<"offloadingHandler", "handler", "Attribute", "nullptr",
@@ -79,7 +78,7 @@
"Extra files to link to.">,
Option<"cmdOptions", "opts", "std::string", [{""}],
"Command line options to pass to the tools.">,
Option<"compilationTarget", "format", "std::string", [{"binOrFatbin"}],
Option<"compilationTarget", "format", "std::string", [{"fatbin"}],
"The target representation of the compilation process.">
];
}
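
A hedged sketch of selecting one of these formats when assembling a pipeline in C++; it mirrors the wiring added to the sparse pipeline later in this patch and assumes the generated `GpuModuleToBinaryPassOptions` struct lives in the `mlir` namespace:

```
#include "mlir/Dialect/GPU/Transforms/Passes.h"
#include "mlir/Pass/PassManager.h"

// Serialize GPU modules to PTX ("isa") so the driver can JIT at runtime.
void addGpuSerialization(mlir::PassManager &pm) {
  mlir::GpuModuleToBinaryPassOptions opts;
  opts.compilationTarget = "isa"; // also accepts "llvm", "bin", "fatbin"
  pm.addPass(mlir::createGpuModuleToBinaryPass(opts));
}
```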
17 changes: 17 additions & 0 deletions mlir/include/mlir/Dialect/SparseTensor/Pipelines/Passes.h
@@ -144,6 +144,23 @@ struct SparseCompilerOptions
desc("GPU target architecture")};
PassOptions::Option<std::string> gpuFeatures{*this, "gpu-features",
desc("GPU target features")};
/// For NVIDIA GPUs there are 3 compilation format options:
/// 1. `isa`: the compiler generates PTX and the driver JITs the PTX.
/// 2. `bin`: generates a CUBIN object for `chip=gpuChip`.
/// 3. `fatbin`: generates a fat binary with a CUBIN object for `gpuChip` and
/// also embeds the PTX in the fat binary.
/// Notes:
/// Option 1 adds a significant runtime performance hit; however, tests are
/// more likely to pass with this option.

Contributor:

Thank you for adding this detailed explanation.

/// Option 2 is better for execution time as there is no JIT; however, the
/// program will fail if there's an architecture mismatch between `gpuChip`
/// and the GPU running the program.
/// Option 3 is the best compromise between options 1 and 2 as it can JIT in
/// case of an architecture mismatch between `gpuChip` and the running
/// architecture. However, it's only possible to JIT to a higher CC than
/// `gpuChip`.
PassOptions::Option<std::string> gpuFormat{
*this, "gpu-format", desc("GPU compilation format"), init("fatbin")};

/// This option is used to enable GPU library generation.
PassOptions::Option<bool> enableGPULibgen{
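
A hedged example of picking the JIT-friendly `isa` format (option 1 above) when driving the sparse compiler pipeline from C++; the option string mirrors the textual names declared in this header, the chip/triple values are illustrative, and `parseFromString` is the generic `PassOptions` helper:

```
#include "mlir/Dialect/SparseTensor/Pipelines/Passes.h"
#include "mlir/Pass/PassManager.h"
#include "mlir/Support/LogicalResult.h"

void buildJitSparsePipeline(mlir::OpPassManager &pm) {
  mlir::sparse_tensor::SparseCompilerOptions options;
  // Populate the pipeline options as mlir-opt would from the command line.
  if (mlir::failed(options.parseFromString(
          "gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-format=isa")))
    return; // Unknown option name or malformed value.
  mlir::sparse_tensor::buildSparseCompiler(pm, options);
}
```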
49 changes: 44 additions & 5 deletions mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -1959,7 +1959,8 @@ void AllocOp::getCanonicalizationPatterns(RewritePatternSet &results,
//===----------------------------------------------------------------------===//

LogicalResult ObjectAttr::verify(function_ref<InFlightDiagnostic()> emitError,
Attribute target, StringAttr object) {
Attribute target, CompilationTarget format,
StringAttr object, DictionaryAttr properties) {
if (!target)
return emitError() << "the target attribute cannot be null";
if (target.hasPromiseOrImplementsInterface<TargetAttrInterface>())
@@ -1968,6 +1969,40 @@ LogicalResult ObjectAttr::verify(function_ref<InFlightDiagnostic()> emitError,
"`gpu::TargetAttrInterface`";
}

namespace {
LogicalResult parseObject(AsmParser &odsParser, CompilationTarget &format,
StringAttr &object) {
std::optional<CompilationTarget> formatResult;
StringRef enumKeyword;
auto loc = odsParser.getCurrentLocation();
if (failed(odsParser.parseOptionalKeyword(&enumKeyword)))
formatResult = CompilationTarget::Fatbin;
if (!formatResult &&
(formatResult =
gpu::symbolizeEnum<gpu::CompilationTarget>(enumKeyword)) &&
odsParser.parseEqual())
return odsParser.emitError(loc, "expected an equal sign");
if (!formatResult)
return odsParser.emitError(loc, "expected keyword for GPU object format");
FailureOr<StringAttr> objectResult =
FieldParser<StringAttr>::parse(odsParser);
if (failed(objectResult))
return odsParser.emitError(odsParser.getCurrentLocation(),
"failed to parse GPU_ObjectAttr parameter "
"'object' which is to be a `StringAttr`");
format = *formatResult;
object = *objectResult;
return success();
}

void printObject(AsmPrinter &odsParser, CompilationTarget format,
StringAttr object) {
if (format != CompilationTarget::Fatbin)
odsParser << stringifyEnum(format) << " = ";
odsParser << object;
}
} // namespace

//===----------------------------------------------------------------------===//
// GPU select object attribute
//===----------------------------------------------------------------------===//
@@ -2020,6 +2055,14 @@ SymbolTable *TargetOptions::getSymbolTable() const {
return getSymbolTableCallback ? getSymbolTableCallback() : nullptr;
}

CompilationTarget TargetOptions::getCompilationTarget() const {
return compilationTarget;
}

CompilationTarget TargetOptions::getDefaultCompilationTarget() {
return CompilationTarget::Fatbin;
}

std::pair<llvm::BumpPtrAllocator, SmallVector<const char *>>
TargetOptions::tokenizeCmdOptions() const {
std::pair<llvm::BumpPtrAllocator, SmallVector<const char *>> options;
@@ -2043,10 +2086,6 @@
return options;
}

TargetOptions::CompilationTarget TargetOptions::getCompilationTarget() const {
return compilationTarget;
}

MLIR_DEFINE_EXPLICIT_TYPE_ID(::mlir::gpu::TargetOptions)

#include "mlir/Dialect/GPU/IR/GPUOpInterfaces.cpp.inc"
36 changes: 18 additions & 18 deletions mlir/lib/Dialect/GPU/Transforms/ModuleToBinary.cpp
@@ -57,14 +57,14 @@ void GpuModuleToBinaryPass::getDependentDialects(

void GpuModuleToBinaryPass::runOnOperation() {
RewritePatternSet patterns(&getContext());
int targetFormat = llvm::StringSwitch<int>(compilationTarget)
.Cases("offloading", "llvm", TargetOptions::offload)
.Cases("assembly", "isa", TargetOptions::assembly)
.Cases("binary", "bin", TargetOptions::binary)
.Cases("fatbinary", "fatbin", TargetOptions::fatbinary)
.Case("binOrFatbin", TargetOptions::binOrFatbin)
.Default(-1);
if (targetFormat == -1)
auto targetFormat =
llvm::StringSwitch<std::optional<CompilationTarget>>(compilationTarget)
.Cases("offloading", "llvm", CompilationTarget::Offload)
.Cases("assembly", "isa", CompilationTarget::Assembly)
.Cases("binary", "bin", CompilationTarget::Binary)
.Cases("fatbinary", "fatbin", CompilationTarget::Fatbin)
.Default(std::nullopt);
if (!targetFormat)
getOperation()->emitError() << "Invalid format specified.";

// Lazy symbol table builder callback.
@@ -82,10 +82,8 @@ void GpuModuleToBinaryPass::runOnOperation() {
return &parentTable.value();
};

TargetOptions targetOptions(
toolkitPath, linkFiles, cmdOptions,
static_cast<TargetOptions::CompilationTarget>(targetFormat),
lazyTableBuilder);
TargetOptions targetOptions(toolkitPath, linkFiles, cmdOptions, *targetFormat,
lazyTableBuilder);
if (failed(transformGpuModulesToBinaries(
getOperation(),
offloadingHandler ? dyn_cast<OffloadingLLVMTranslationAttrInterface>(
Expand All @@ -107,17 +105,19 @@ LogicalResult moduleSerializer(GPUModuleOp op,
auto target = dyn_cast<gpu::TargetAttrInterface>(targetAttr);
assert(target &&
"Target attribute doesn't implements `TargetAttrInterface`.");
std::optional<SmallVector<char, 0>> object =
std::optional<SmallVector<char, 0>> serializedModule =
target.serializeToObject(op, targetOptions);

if (!object) {
if (!serializedModule) {
op.emitError("An error happened while serializing the module.");
return failure();
}

objects.push_back(builder.getAttr<gpu::ObjectAttr>(
target,
builder.getStringAttr(StringRef(object->data(), object->size()))));
Attribute object = target.createObject(*serializedModule, targetOptions);
if (!object) {
op.emitError("An error happened while creating the object.");
return failure();
}
objects.push_back(object);
}
builder.setInsertionPointAfter(op);
builder.create<gpu::BinaryOp>(op.getLoc(), op.getName(), handler,
@@ -84,7 +84,9 @@ void mlir::sparse_tensor::buildSparseCompiler(
nvvmTargetOptions.features = options.gpuFeatures;
pm.addPass(createGpuNVVMAttachTarget(nvvmTargetOptions));
pm.addPass(createGpuToLLVMConversionPass());
pm.addPass(createGpuModuleToBinaryPass());
GpuModuleToBinaryPassOptions gpuModuleToBinaryPassOptions;
gpuModuleToBinaryPassOptions.compilationTarget = options.gpuFormat;
pm.addPass(createGpuModuleToBinaryPass(gpuModuleToBinaryPassOptions));
}

pm.addPass(createReconcileUnrealizedCastsPass());
21 changes: 21 additions & 0 deletions mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -126,6 +126,27 @@ extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule mgpuModuleLoad(void *data) {
return module;
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule mgpuModuleLoadJIT(void *data,
int optLevel) {
ScopedContext scopedContext;
CUmodule module = nullptr;
char jitErrorBuffer[4096] = {0};
CUjit_option jitOptions[] = {CU_JIT_ERROR_LOG_BUFFER,
CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
CU_JIT_OPTIMIZATION_LEVEL};
void *jitOptionsVals[] = {jitErrorBuffer,
reinterpret_cast<void *>(sizeof(jitErrorBuffer)),
reinterpret_cast<void *>(optLevel)};

CUresult result =
cuModuleLoadDataEx(&module, data, 3, jitOptions, jitOptionsVals);
if (result) {
fprintf(stderr, "JIT compilation failed with: '%s'\n", jitErrorBuffer);
CUDA_REPORT_IF_ERROR(result);
}
return module;
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuModuleUnload(CUmodule module) {
CUDA_REPORT_IF_ERROR(cuModuleUnload(module));
}
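
A hedged usage sketch (not part of the patch) of the new JIT entry point: `mgpuModuleGetFunction`, `mgpuLaunchKernel`, and `mgpuModuleUnload` are assumed to be the existing wrappers in this file, and the kernel name is hypothetical:

```
// Load embedded PTX, let the driver JIT it for the current GPU, then resolve
// a kernel by name. Declarations come from this wrapper file.
void runFromPtx(void *ptxBlob) {
  CUmodule module = mgpuModuleLoadJIT(ptxBlob, /*optLevel=*/3);
  if (!module)
    return; // The JIT error was already printed to stderr above.
  CUfunction kernel = mgpuModuleGetFunction(module, "my_kernel");
  (void)kernel; // Launch via mgpuLaunchKernel(...) as with non-JIT modules.
  mgpuModuleUnload(module);
}
```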
5 changes: 5 additions & 0 deletions mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
@@ -38,6 +38,11 @@ extern "C" hipModule_t mgpuModuleLoad(void *data) {
return module;
}

extern "C" hipModule_t mgpuModuleLoadJIT(void *data, int optLevel) {
assert(false && "This function is not available in HIP.");
return nullptr;
}

extern "C" void mgpuModuleUnload(hipModule_t module) {
HIP_REPORT_IF_ERROR(hipModuleUnload(module));
}