[CIR][HIP] Lower Device CIR to LLVM IR (#1967)

koparasy · web-flow · commit e4003cd3a263 · 2025-11-10T11:02:41.000-08:00
diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/CMakeLists.txt b/clang/lib/CIR/Dialect/Transforms/TargetLowering/CMakeLists.txt
@@ -13,6 +13,7 @@ add_clang_library(TargetLowering
   TargetInfo.cpp
   TargetLoweringInfo.cpp
   Targets/AArch64.cpp
+  Targets/AMDGPU.cpp
   Targets/NVPTX.cpp
   Targets/SPIR.cpp
   Targets/X86.cpp
diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/LowerModule.cpp b/clang/lib/CIR/Dialect/Transforms/TargetLowering/LowerModule.cpp
@@ -71,6 +71,8 @@ createTargetLoweringInfo(LowerModule &LM) {
 
     return createAArch64TargetLoweringInfo(LM, Kind);
   }
+  case llvm::Triple::amdgcn:
+    return createAMDGPUTargetLoweringInfo(LM);
   case llvm::Triple::x86_64: {
     switch (Triple.getOS()) {
     case llvm::Triple::Win32:
diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/TargetInfo.h b/clang/lib/CIR/Dialect/Transforms/TargetLowering/TargetInfo.h
@@ -33,6 +33,9 @@ createSPIRVTargetLoweringInfo(LowerModule &CGM);
 std::unique_ptr<TargetLoweringInfo>
 createNVPTXTargetLoweringInfo(LowerModule &CGM);
 
+std::unique_ptr<TargetLoweringInfo>
+createAMDGPUTargetLoweringInfo(LowerModule &CGM);
+
 } // namespace cir
 
 #endif // LLVM_CLANG_LIB_CIR_DIALECT_TRANSFORMS_TARGETLOWERING_TARGETINFO_H
diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/AMDGPU.cpp b/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/AMDGPU.cpp
@@ -0,0 +1,71 @@
+//===- AMDGPU.cpp - TargetInfo for AMDGPU ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "ABIInfoImpl.h"
+#include "LowerFunctionInfo.h"
+#include "LowerTypes.h"
+#include "TargetInfo.h"
+#include "TargetLoweringInfo.h"
+#include "clang/CIR/ABIArgInfo.h"
+#include "clang/CIR/Dialect/IR/CIRTypes.h"
+#include "clang/CIR/MissingFeatures.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using ABIArgInfo = cir::ABIArgInfo;
+using MissingFeature = cir::MissingFeatures;
+
+namespace cir {
+
+//===----------------------------------------------------------------------===//
+// AMDGPU ABI Implementation
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+class AMDGPUABIInfo : public ABIInfo {
+public:
+  AMDGPUABIInfo(LowerTypes &lt) : ABIInfo(lt) {}
+
+private:
+  void computeInfo(LowerFunctionInfo &fi) const override {
+    llvm_unreachable("NYI");
+  }
+};
+
+class AMDGPUTargetLoweringInfo : public TargetLoweringInfo {
+public:
+  AMDGPUTargetLoweringInfo(LowerTypes &lt)
+      : TargetLoweringInfo(std::make_unique<AMDGPUABIInfo>(lt)) {}
+  // Taken from here: https://llvm.org/docs/AMDGPUUsage.html#address-spaces
+  unsigned getTargetAddrSpaceFromCIRAddrSpace(
+      cir::AddressSpace addrSpace) const override {
+    switch (addrSpace) {
+    case cir::AddressSpace::OffloadPrivate:
+      return 5;
+    case cir::AddressSpace::OffloadLocal:
+      return 3;
+    case cir::AddressSpace::OffloadGlobal:
+      return 1;
+    case cir::AddressSpace::OffloadConstant:
+      return 4;
+    case cir::AddressSpace::OffloadGeneric:
+      return 0;
+    default:
+      cir_cconv_unreachable("Unknown CIR address space for this target");
+    }
+  }
+};
+
+} // namespace
+std::unique_ptr<TargetLoweringInfo>
+createAMDGPUTargetLoweringInfo(LowerModule &lowerModule) {
+  return std::make_unique<AMDGPUTargetLoweringInfo>(lowerModule.getTypes());
+}
+
+} // namespace cir
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
@@ -1749,19 +1749,31 @@ mlir::LogicalResult CIRToLLVMAllocaOpLowering::matchAndRewrite(
       convertTypeForMemory(*getTypeConverter(), dataLayout, op.getAllocaType());
   auto resultTy = getTypeConverter()->convertType(op.getType());
   // Verification between the CIR alloca AS and the one from data layout.
-  {
+  auto allocaAS = [&]() {
     auto resPtrTy = mlir::cast<mlir::LLVM::LLVMPointerType>(resultTy);
     auto dlAllocaASAttr = mlir::cast_if_present<mlir::IntegerAttr>(
         dataLayout.getAllocaMemorySpace());
-    // Absence means 0
     // TODO: The query for the alloca AS should be done through CIRDataLayout
     // instead to reuse the logic of interpret null attr as 0.
-    auto dlAllocaAS = dlAllocaASAttr ? dlAllocaASAttr.getInt() : 0;
-    if (dlAllocaAS != resPtrTy.getAddressSpace()) {
-      return op.emitError() << "alloca address space doesn't match the one "
-                               "from the target data layout: "
-                            << dlAllocaAS;
-    }
+    if (!dlAllocaASAttr)
+      return 0u;
+    return static_cast<unsigned>(dlAllocaASAttr.getValue().getZExtValue());
+  }();
+
+  auto resPtrTy =
+      mlir::LLVM::LLVMPointerType::get(elementTy.getContext(), allocaAS);
+
+  auto llvmAlloca = rewriter.create<mlir::LLVM::AllocaOp>(
+      op.getLoc(), resPtrTy, elementTy, size, op.getAlignmentAttr().getInt());
+
+  auto expectedPtrTy = mlir::cast<mlir::LLVM::LLVMPointerType>(
+      getTypeConverter()->convertType(op.getResult().getType()));
+
+  mlir::Value finalPtr = llvmAlloca.getResult();
+
+  if (expectedPtrTy.getAddressSpace() != allocaAS) {
+    finalPtr = rewriter.create<mlir::LLVM::AddrSpaceCastOp>(
+        op.getLoc(), expectedPtrTy, finalPtr);
   }
 
   // If there are annotations available, copy them out before we destroy the
@@ -1770,11 +1782,10 @@ mlir::LogicalResult CIRToLLVMAllocaOpLowering::matchAndRewrite(
   if (op.getAnnotations())
     annotations = op.getAnnotationsAttr();
 
-  auto llvmAlloca = rewriter.replaceOpWithNewOp<mlir::LLVM::AllocaOp>(
-      op, resultTy, elementTy, size, op.getAlignmentAttr().getInt());
-
   if (annotations && !annotations.empty())
     buildAllocaAnnotations(llvmAlloca, adaptor, rewriter, annotations);
+  rewriter.replaceOp(op, finalPtr);
+
   return mlir::success();
 }
 
diff --git a/clang/test/CIR/CodeGen/HIP/address-spaces.cpp b/clang/test/CIR/CodeGen/HIP/address-spaces.cpp
@@ -0,0 +1,19 @@
+#include "cuda.h"
+
+// RUN: %clang_cc1 -triple=amdgcn-amd-amdhsa -x hip -fclangir \
+// RUN:            -fcuda-is-device -fhip-new-launch-api \
+// RUN:            -I%S/../Inputs/ -emit-cir %s -o %t.ll
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.ll %s
+
+__global__ void fn() {
+  int i = 0;
+  __shared__ int j;
+  j = i;
+}
+
+// CIR: cir.global "private" internal dso_local addrspace(offload_local) @_ZZ2fnvE1j : !s32i
+// CIR: cir.func dso_local @_Z2fnv
+// CIR: [[Local:%[0-9]+]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["i", init]
+// CIR: [[Shared:%[0-9]+]] = cir.get_global @_ZZ2fnvE1j : !cir.ptr<!s32i, addrspace(offload_local)>
+// CIR: [[Tmp:%[0-9]+]] = cir.load {{.*}} [[Local]] : !cir.ptr<!s32i>, !s32i
+// CIR: cir.store{{.*}} [[Tmp]], [[Shared]] : !s32i, !cir.ptr<!s32i, addrspace(offload_local)>
diff --git a/clang/test/CIR/CodeGen/HIP/addrspace-lowering.cpp b/clang/test/CIR/CodeGen/HIP/addrspace-lowering.cpp
@@ -0,0 +1,25 @@
+#include "cuda.h"
+
+// RUN: %clang_cc1 -triple=amdgcn-amd-amdhsa -x hip -fclangir \
+// RUN:            -fcuda-is-device -fhip-new-launch-api \
+// RUN:            -I%S/../Inputs/ -emit-llvm %s -o %t.ll
+// RUN: FileCheck --check-prefix=LLVM-DEVICE --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -triple=amdgcn-amd-amdhsa -x hip  \
+// RUN:            -fcuda-is-device -fhip-new-launch-api \
+// RUN:            -I%S/../Inputs/ -emit-llvm %s -o %t.ll
+// RUN: FileCheck --check-prefix=OGCG-DEVICE --input-file=%t.ll %s
+
+
+__shared__ int a;
+// LLVM-DEVICE: @a = addrspace(3) global i32 undef, align 4
+// OGCG-DEVICE: @a = addrspace(3) global i32 undef, align 4
+
+__device__ int b;
+// LLVM-DEVICE: @b = addrspace(1) externally_initialized global i32 0, align 4
+// OGCG-DEVICE: @b = addrspace(1) externally_initialized global i32 0, align 4
+
+__constant__ int c;
+// LLVM-DEVICE: @c = addrspace(4) externally_initialized constant i32 0, align 4
+// OGCG-DEVICE: @c = addrspace(4) externally_initialized constant i32 0, align 4
+
diff --git a/clang/test/CIR/CodeGen/HIP/simple.cpp b/clang/test/CIR/CodeGen/HIP/simple.cpp
@@ -1,20 +1,36 @@
-#include "../Inputs/cuda.h"
+#include "cuda.h"
 
 // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir \
 // RUN:            -x hip -fhip-new-launch-api \
-// RUN:            -emit-cir %s -o %t.cir
+// RUN:            -I%S/../Inputs/ -emit-cir %s -o %t.cir
 // RUN: FileCheck --check-prefix=CIR-HOST --input-file=%t.cir %s
 
 // RUN: %clang_cc1 -triple=amdgcn-amd-amdhsa -x hip -fclangir \
 // RUN:            -fcuda-is-device -fhip-new-launch-api \
-// RUN:              -emit-cir %s -o %t.cir
+// RUN:            -I%S/../Inputs/ -emit-cir %s -o %t.cir
 // RUN: FileCheck --check-prefix=CIR-DEVICE --input-file=%t.cir %s
 //
 // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir \
 // RUN:            -x hip -emit-llvm -fhip-new-launch-api \
-// RUN:            %s -o %t.ll
+// RUN:            -I%S/../Inputs/ %s -o %t.ll
 // RUN: FileCheck --check-prefix=LLVM-HOST --input-file=%t.ll %s
 
+// RUN: %clang_cc1 -triple=amdgcn-amd-amdhsa -x hip -fclangir \
+// RUN:            -fcuda-is-device -fhip-new-launch-api \
+// RUN:            -I%S/../Inputs/ -emit-llvm %s -o %t.ll
+// RUN: FileCheck --check-prefix=LLVM-DEVICE --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu  \
+// RUN:            -x  hip -emit-llvm -fhip-new-launch-api \
+// RUN:            -I%S/../Inputs/ %s -o %t.ll
+// RUN: FileCheck --check-prefix=OGCG-HOST --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -triple=amdgcn-amd-amdhsa -x hip \
+// RUN:            -fcuda-is-device -fhip-new-launch-api \
+// RUN:            -I%S/../Inputs/ -emit-llvm %s -o %t.ll
+// RUN: FileCheck --check-prefix=OGCG-DEVICE --input-file=%t.ll %s
+
+
 // Attribute for global_fn
 // CIR-HOST: [[Kernel:#[a-zA-Z_0-9]+]] = {{.*}}#cir.cu.kernel_name<_Z9global_fni>{{.*}}
 
@@ -29,6 +45,8 @@ __device__ void device_fn(int* a, double b, float c) {}
 
 __global__ void global_fn(int a) {}
 // CIR-DEVICE: @_Z9global_fni
+// LLVM-DEVICE: define dso_local void @_Z9global_fni
+// OGCG-DEVICE: define dso_local amdgpu_kernel void @_Z9global_fni 
 
 // CIR-HOST: @_Z24__device_stub__global_fni{{.*}}extra([[Kernel]])
 // CIR-HOST: %[[#CIRKernelArgs:]] = cir.alloca {{.*}}"kernel_args"
@@ -43,6 +61,12 @@ __global__ void global_fn(int a) {}
 // LLVM-HOST: %[[#GEP2:]] = getelementptr [1 x ptr], ptr %[[#KernelArgs]], i32 0, i64 0
 // LLVM-HOST: call i32 @__hipPopCallConfiguration
 // LLVM-HOST: call i32 @hipLaunchKernel(ptr @_Z9global_fni 
+//
+// OGCG-HOST: define dso_local void @_Z24__device_stub__global_fni
+// OGCG-HOST: %kernel_args = alloca ptr, i64 1, align 16
+// OGCG-HOST: getelementptr ptr, ptr %kernel_args, i32 0
+// OGCG-HOST: call i32 @__hipPopCallConfiguration
+// OGCG-HOST: %call = call noundef i32 @hipLaunchKernel(ptr noundef @_Z9global_fni
 
 
 int main() {
@@ -78,3 +102,18 @@ int main() {
 // LLVM-HOST:   %[[#]] = load i32
 // LLVM-HOST:   ret i32
 
+// OGCG-HOST:  define dso_local noundef i32 @main
+// OGCG-HOST:  %agg.tmp = alloca %struct.dim3, align 4
+// OGCG-HOST:  %agg.tmp1 = alloca %struct.dim3, align 4
+// OGCG-HOST:  call void @_ZN4dim3C1Ejjj
+// OGCG-HOST:  call void @_ZN4dim3C1Ejjj
+// OGCG-HOST:  %call = call i32 @__hipPushCallConfiguration
+// OGCG-HOST:  %tobool = icmp ne i32 %call, 0
+// OGCG-HOST:  br i1 %tobool, label %kcall.end, label %kcall.configok
+// OGCG-HOST: kcall.configok:
+// OGCG-HOST:  call void @_Z24__device_stub__global_fni(i32 noundef 1)
+// OGCG-HOST:  br label %kcall.end
+// OGCG-HOST: kcall.end:
+// OGCG-HOST:  %{{[0-9]+}} = load i32, ptr %retval, align 4
+// OGCG-HOST:  ret i32 %8
+

Original file line number	Diff line number	Diff line change
`@@ -71,6 +71,8 @@ createTargetLoweringInfo(LowerModule &LM) {`
`71`	`71`
`72`	`72`	`return createAArch64TargetLoweringInfo(LM, Kind);`
`73`	`73`	`}`
	`74`	`+ case llvm::Triple::amdgcn:`
	`75`	`+ return createAMDGPUTargetLoweringInfo(LM);`
`74`	`76`	`case llvm::Triple::x86_64: {`
`75`	`77`	`switch (Triple.getOS()) {`
`76`	`78`	`case llvm::Triple::Win32:`