From 0a16c3256025927b80a56c418761962af5d85f7a Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Wed, 19 Mar 2025 15:42:47 -0500 Subject: [PATCH] [LLVM] Make the GPU loader utilities an LLVM tool Summary: These tools `amdhsa-loader` and `nvptx-loader` are used to execute unit tests directly on the GPU. We use this for `libc` and `libcxx` unit tests as well as general GPU experimentation. It looks like this. ```console > clang++ main.cpp --target=amdgcn-amd-amdhsa -mcpu=native -flto -lc ./lib/amdgcn-amd-amdhsa/crt1.o > llvm-gpu-loader a.out Hello World! ``` Currently these are a part of the `libc` project, but this creates issues as `libc` itself depends on them to run tests. Right now we get around this by force-including the `libc` project prior to running the runtimes build so that this dependency can be built first. We should instead just make this a simple LLVM tool so it's always available. This has the effect of installing these by default now instead of just when `libc` was enabled, but they should be relatively small. Right now this only supports a 'static' configuration. That is, we locate the CUDA and HSA dependencies at LLVM compile time. In the future we should be able to provide this by default using `dlopen` and some API. I don't know if it's required to reformat all of these names since they used the `libc` naming convention so I just left it for now. 
--- libc/CMakeLists.txt | 7 --- libc/utils/gpu/CMakeLists.txt | 1 - libc/utils/gpu/loader/CMakeLists.txt | 54 ------------------- libc/utils/gpu/loader/amdgpu/CMakeLists.txt | 10 ---- libc/utils/gpu/loader/nvptx/CMakeLists.txt | 9 ---- llvm/CMakeLists.txt | 4 -- llvm/runtimes/CMakeLists.txt | 14 ----- llvm/tools/llvm-gpu-loader/CMakeLists.txt | 45 ++++++++++++++++ .../tools/llvm-gpu-loader/amdhsa.cpp | 8 +-- .../tools/llvm-gpu-loader/llvm-gpu-loader.cpp | 46 +++++++++++++--- .../tools/llvm-gpu-loader/llvm-gpu-loader.h | 13 +++-- .../tools/llvm-gpu-loader/nvptx.cpp | 8 +-- 12 files changed, 102 insertions(+), 117 deletions(-) delete mode 100644 libc/utils/gpu/CMakeLists.txt delete mode 100644 libc/utils/gpu/loader/CMakeLists.txt delete mode 100644 libc/utils/gpu/loader/amdgpu/CMakeLists.txt delete mode 100644 libc/utils/gpu/loader/nvptx/CMakeLists.txt create mode 100644 llvm/tools/llvm-gpu-loader/CMakeLists.txt rename libc/utils/gpu/loader/amdgpu/amdhsa-loader.cpp => llvm/tools/llvm-gpu-loader/amdhsa.cpp (99%) rename libc/utils/gpu/loader/Main.cpp => llvm/tools/llvm-gpu-loader/llvm-gpu-loader.cpp (76%) rename libc/utils/gpu/loader/Loader.h => llvm/tools/llvm-gpu-loader/llvm-gpu-loader.h (93%) rename libc/utils/gpu/loader/nvptx/nvptx-loader.cpp => llvm/tools/llvm-gpu-loader/nvptx.cpp (98%) diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt index ad39ff6fbcb1e..b264dcb4974c7 100644 --- a/libc/CMakeLists.txt +++ b/libc/CMakeLists.txt @@ -59,13 +59,6 @@ set(LIBC_NAMESPACE ${default_namespace} CACHE STRING "The namespace to use to enclose internal implementations. Must start with '__llvm_libc'." ) -# We will build the GPU utilities if we are not doing a runtimes build. 
-option(LIBC_BUILD_GPU_LOADER "Always build the GPU loader utilities" OFF) -if(LIBC_BUILD_GPU_LOADER OR ((NOT LLVM_RUNTIMES_BUILD) AND LLVM_LIBC_GPU_BUILD)) - add_subdirectory(utils/gpu) - return() -endif() - option(LIBC_CMAKE_VERBOSE_LOGGING "Log details warnings and notifications during CMake configuration." OFF) diff --git a/libc/utils/gpu/CMakeLists.txt b/libc/utils/gpu/CMakeLists.txt deleted file mode 100644 index e529646a1206e..0000000000000 --- a/libc/utils/gpu/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -add_subdirectory(loader) diff --git a/libc/utils/gpu/loader/CMakeLists.txt b/libc/utils/gpu/loader/CMakeLists.txt deleted file mode 100644 index 9b3bd009dc0f1..0000000000000 --- a/libc/utils/gpu/loader/CMakeLists.txt +++ /dev/null @@ -1,54 +0,0 @@ -add_library(gpu_loader OBJECT Main.cpp) - -include(FindLibcCommonUtils) -target_link_libraries(gpu_loader PUBLIC llvm-libc-common-utilities) - -target_include_directories(gpu_loader PUBLIC - ${CMAKE_CURRENT_SOURCE_DIR} - ${LIBC_SOURCE_DIR}/include - ${LIBC_SOURCE_DIR} - ${LLVM_MAIN_INCLUDE_DIR} - ${LLVM_BINARY_DIR}/include -) -if(NOT LLVM_ENABLE_RTTI) - target_compile_options(gpu_loader PUBLIC -fno-rtti) -endif() - -find_package(hsa-runtime64 QUIET 1.2.0 HINTS ${CMAKE_INSTALL_PREFIX} PATHS /opt/rocm) -if(hsa-runtime64_FOUND) - add_subdirectory(amdgpu) -endif() - -# The CUDA loader requires LLVM to traverse the ELF image for symbols. 
-find_package(CUDAToolkit 11.2 QUIET) -if(CUDAToolkit_FOUND) - add_subdirectory(nvptx) -endif() - -if(TARGET amdhsa-loader AND LIBC_TARGET_ARCHITECTURE_IS_AMDGPU) - add_custom_target(libc.utils.gpu.loader) - add_dependencies(libc.utils.gpu.loader amdhsa-loader) - set_target_properties( - libc.utils.gpu.loader - PROPERTIES - TARGET amdhsa-loader - EXECUTABLE "$<TARGET_FILE:amdhsa-loader>" - ) -elseif(TARGET nvptx-loader AND LIBC_TARGET_ARCHITECTURE_IS_NVPTX) - add_custom_target(libc.utils.gpu.loader) - add_dependencies(libc.utils.gpu.loader nvptx-loader) - set_target_properties( - libc.utils.gpu.loader - PROPERTIES - TARGET nvptx-loader - EXECUTABLE "$<TARGET_FILE:nvptx-loader>" - ) -endif() - -foreach(gpu_loader_tgt amdhsa-loader nvptx-loader) - if(TARGET ${gpu_loader_tgt}) - install(TARGETS ${gpu_loader_tgt} - DESTINATION ${CMAKE_INSTALL_BINDIR} - COMPONENT libc) - endif() -endforeach() diff --git a/libc/utils/gpu/loader/amdgpu/CMakeLists.txt b/libc/utils/gpu/loader/amdgpu/CMakeLists.txt deleted file mode 100644 index 17878daf0b6fe..0000000000000 --- a/libc/utils/gpu/loader/amdgpu/CMakeLists.txt +++ /dev/null @@ -1,10 +0,0 @@ -set(LLVM_LINK_COMPONENTS - BinaryFormat - Object - Option - Support - FrontendOffloading - ) - -add_llvm_executable(amdhsa-loader amdhsa-loader.cpp) -target_link_libraries(amdhsa-loader PRIVATE hsa-runtime64::hsa-runtime64 gpu_loader) diff --git a/libc/utils/gpu/loader/nvptx/CMakeLists.txt b/libc/utils/gpu/loader/nvptx/CMakeLists.txt deleted file mode 100644 index 42510ac31dad4..0000000000000 --- a/libc/utils/gpu/loader/nvptx/CMakeLists.txt +++ /dev/null @@ -1,9 +0,0 @@ -set(LLVM_LINK_COMPONENTS - BinaryFormat - Object - Option - Support - ) - -add_llvm_executable(nvptx-loader nvptx-loader.cpp) -target_link_libraries(nvptx-loader PRIVATE gpu_loader CUDA::cuda_driver) diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index 18b6ee85fae8d..f9ace9f078e2b 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -210,10 +210,6 @@ if("${LIBC_TARGET_TRIPLE}" STREQUAL "amdgcn-amd-amdhsa" OR 
"${LIBC_TARGET_TRIPLE}" STREQUAL "nvptx64-nvidia-cuda") set(LLVM_LIBC_GPU_BUILD ON) endif() -if (NOT "libc" IN_LIST LLVM_ENABLE_PROJECTS AND LLVM_LIBC_GPU_BUILD) - message(STATUS "Enabling libc project to build libc testing tools") - list(APPEND LLVM_ENABLE_PROJECTS "libc") -endif() # LLVM_ENABLE_PROJECTS_USED is `ON` if the user has ever used the # `LLVM_ENABLE_PROJECTS` CMake cache variable. This exists for diff --git a/llvm/runtimes/CMakeLists.txt b/llvm/runtimes/CMakeLists.txt index 136099dc48ab8..51433d1ec9831 100644 --- a/llvm/runtimes/CMakeLists.txt +++ b/llvm/runtimes/CMakeLists.txt @@ -534,20 +534,6 @@ if(build_runtimes) endif() if(LLVM_LIBC_GPU_BUILD) list(APPEND extra_cmake_args "-DLLVM_LIBC_GPU_BUILD=ON") - if("libc" IN_LIST RUNTIMES_amdgcn-amd-amdhsa_LLVM_ENABLE_RUNTIMES) - if(TARGET amdhsa-loader) - list(APPEND extra_cmake_args - "-DRUNTIMES_amdgcn-amd-amdhsa_LIBC_GPU_LOADER_EXECUTABLE=$<TARGET_FILE:amdhsa-loader>") - list(APPEND extra_deps amdhsa-loader) - endif() - endif() - if("libc" IN_LIST RUNTIMES_nvptx64-nvidia-cuda_LLVM_ENABLE_RUNTIMES) - if(TARGET nvptx-loader) - list(APPEND extra_cmake_args - "-DRUNTIMES_nvptx64-nvidia-cuda_LIBC_GPU_LOADER_EXECUTABLE=$<TARGET_FILE:nvptx-loader>") - list(APPEND extra_deps nvptx-loader) - endif() - endif() if(TARGET clang-offload-packager) list(APPEND extra_deps clang-offload-packager) endif() diff --git a/llvm/tools/llvm-gpu-loader/CMakeLists.txt b/llvm/tools/llvm-gpu-loader/CMakeLists.txt new file mode 100644 index 0000000000000..4b4a6e72e47ae --- /dev/null +++ b/llvm/tools/llvm-gpu-loader/CMakeLists.txt @@ -0,0 +1,45 @@ +set(LLVM_LINK_COMPONENTS + BinaryFormat + Object + Option + Support + FrontendOffloading +) + +add_llvm_tool(llvm-gpu-loader + llvm-gpu-loader.cpp + + # TODO: We intentionally split this currently due to statically linking the + # GPU runtimes. Dynamically load the dependencies, possibly using the + # LLVM offloading API when it is complete. 
+ PARTIAL_SOURCES_INTENDED + + DEPENDS + intrinsics_gen +) + +# Locate the RPC server handling interface. +include(FindLibcCommonUtils) +target_link_libraries(llvm-gpu-loader PUBLIC llvm-libc-common-utilities) + +# Check for HSA support for targeting AMD GPUs. +find_package(hsa-runtime64 QUIET 1.2.0 HINTS ${CMAKE_INSTALL_PREFIX} PATHS /opt/rocm) +if(hsa-runtime64_FOUND) + target_sources(llvm-gpu-loader PRIVATE amdhsa.cpp) + target_compile_definitions(llvm-gpu-loader PRIVATE AMDHSA_SUPPORT) + target_link_libraries(llvm-gpu-loader PRIVATE hsa-runtime64::hsa-runtime64) + + # Compatibility with the old amdhsa-loader name. + add_llvm_tool_symlink(amdhsa-loader llvm-gpu-loader) +endif() + +# Check for CUDA support for targeting NVIDIA GPUs. +find_package(CUDAToolkit 11.2 QUIET) +if(CUDAToolkit_FOUND) + target_sources(llvm-gpu-loader PRIVATE nvptx.cpp) + target_compile_definitions(llvm-gpu-loader PRIVATE NVPTX_SUPPORT) + target_link_libraries(llvm-gpu-loader PRIVATE CUDA::cuda_driver) + + # Compatibility with the old nvptx-loader name. 
+ add_llvm_tool_symlink(nvptx-loader llvm-gpu-loader) +endif() diff --git a/libc/utils/gpu/loader/amdgpu/amdhsa-loader.cpp b/llvm/tools/llvm-gpu-loader/amdhsa.cpp similarity index 99% rename from libc/utils/gpu/loader/amdgpu/amdhsa-loader.cpp rename to llvm/tools/llvm-gpu-loader/amdhsa.cpp index 00fde147b0abd..f3c8f646b6421 100644 --- a/libc/utils/gpu/loader/amdgpu/amdhsa-loader.cpp +++ b/llvm/tools/llvm-gpu-loader/amdhsa.cpp @@ -13,7 +13,7 @@ // //===----------------------------------------------------------------------===// -#include "Loader.h" +#include "llvm-gpu-loader.h" #include "hsa/hsa.h" #include "hsa/hsa_ext_amd.h" @@ -330,9 +330,9 @@ static hsa_status_t hsa_memcpy(void *dst, hsa_agent_t dst_agent, return HSA_STATUS_SUCCESS; } -int load(int argc, const char **argv, const char **envp, void *image, - size_t size, const LaunchParameters &params, - bool print_resource_usage) { +int load_amdhsa(int argc, const char **argv, const char **envp, void *image, + size_t size, const LaunchParameters &params, + bool print_resource_usage) { // Initialize the HSA runtime used to communicate with the device. if (hsa_status_t err = hsa_init()) handle_error(err); diff --git a/libc/utils/gpu/loader/Main.cpp b/llvm/tools/llvm-gpu-loader/llvm-gpu-loader.cpp similarity index 76% rename from libc/utils/gpu/loader/Main.cpp rename to llvm/tools/llvm-gpu-loader/llvm-gpu-loader.cpp index c3aeeffd56368..9b157e3f9dcb1 100644 --- a/libc/utils/gpu/loader/Main.cpp +++ b/llvm/tools/llvm-gpu-loader/llvm-gpu-loader.cpp @@ -6,14 +6,17 @@ // //===----------------------------------------------------------------------===// // -// This file opens a device image passed on the command line and passes it to -// one of the loader implementations for launch. +// This utility is used to launch standard programs onto the GPU in conjunction +// with the LLVM 'libc' project. It is designed to mimic a standard emulator +// workflow, allowing for unit tests to be run on the GPU directly. 
// //===----------------------------------------------------------------------===// -#include "Loader.h" +#include "llvm-gpu-loader.h" #include "llvm/BinaryFormat/Magic.h" +#include "llvm/Object/ELF.h" +#include "llvm/Object/ELFObjectFile.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Error.h" #include "llvm/Support/FileSystem.h" @@ -21,6 +24,7 @@ #include "llvm/Support/Path.h" #include "llvm/Support/Signals.h" #include "llvm/Support/WithColor.h" +#include "llvm/TargetParser/Triple.h" #include #include @@ -125,12 +129,40 @@ int main(int argc, const char **argv, const char **envp) { strerror(errno))); } - // Drop the loader from the program arguments. LaunchParameters params{threads_x, threads_y, threads_z, blocks_x, blocks_y, blocks_z}; - int ret = load(new_argv.size(), new_argv.data(), envp, - const_cast<char *>(image.getBufferStart()), - image.getBufferSize(), params, print_resource_usage); + + Expected<llvm::object::ELF64LEObjectFile> elf_or_err = + llvm::object::ELF64LEObjectFile::create(image); + if (!elf_or_err) + report_error(std::move(elf_or_err.takeError())); + + int ret = 1; + if (elf_or_err->getArch() == Triple::amdgcn) { +#ifdef AMDHSA_SUPPORT + ret = load_amdhsa(new_argv.size(), new_argv.data(), envp, + const_cast<char *>(image.getBufferStart()), + image.getBufferSize(), params, print_resource_usage); +#else + report_error(createStringError( + "Unsupported architecture; %s", + Triple::getArchTypeName(elf_or_err->getArch()).bytes_begin())); +#endif + } else if (elf_or_err->getArch() == Triple::nvptx64) { +#ifdef NVPTX_SUPPORT + ret = load_nvptx(new_argv.size(), new_argv.data(), envp, + const_cast<char *>(image.getBufferStart()), + image.getBufferSize(), params, print_resource_usage); +#else + report_error(createStringError( + "Unsupported architecture; %s", + Triple::getArchTypeName(elf_or_err->getArch()).bytes_begin())); +#endif + } else { + report_error(createStringError( + "Unsupported architecture; %s", + Triple::getArchTypeName(elf_or_err->getArch()).bytes_begin())); + } if 
(no_parallelism) { if (flock(fd, LOCK_UN) == -1) diff --git a/libc/utils/gpu/loader/Loader.h b/llvm/tools/llvm-gpu-loader/llvm-gpu-loader.h similarity index 93% rename from libc/utils/gpu/loader/Loader.h rename to llvm/tools/llvm-gpu-loader/llvm-gpu-loader.h index ec05117a041ab..29da395e3bc20 100644 --- a/libc/utils/gpu/loader/Loader.h +++ b/llvm/tools/llvm-gpu-loader/llvm-gpu-loader.h @@ -54,9 +54,16 @@ struct end_args_t { /// Generic interface to load the \p image and launch execution of the _start /// kernel on the target device. Copies \p argc and \p argv to the device. /// Returns the final value of the `main` function on the device. -int load(int argc, const char **argv, const char **evnp, void *image, - size_t size, const LaunchParameters &params, - bool print_resource_usage); +#ifdef AMDHSA_SUPPORT +int load_amdhsa(int argc, const char **argv, const char **evnp, void *image, + size_t size, const LaunchParameters &params, + bool print_resource_usage); +#endif +#ifdef NVPTX_SUPPORT +int load_nvptx(int argc, const char **argv, const char **evnp, void *image, + size_t size, const LaunchParameters &params, + bool print_resource_usage); +#endif /// Return \p V aligned "upwards" according to \p Align. 
template <typename V, typename A> inline V align_up(V val, A align) { diff --git a/libc/utils/gpu/loader/nvptx/nvptx-loader.cpp b/llvm/tools/llvm-gpu-loader/nvptx.cpp similarity index 98% rename from libc/utils/gpu/loader/nvptx/nvptx-loader.cpp rename to llvm/tools/llvm-gpu-loader/nvptx.cpp index 7d6c176c6f360..f7495605ecc68 100644 --- a/libc/utils/gpu/loader/nvptx/nvptx-loader.cpp +++ b/llvm/tools/llvm-gpu-loader/nvptx.cpp @@ -13,7 +13,7 @@ // //===----------------------------------------------------------------------===// -#include "Loader.h" +#include "llvm-gpu-loader.h" #include "cuda.h" @@ -236,9 +236,9 @@ CUresult launch_kernel(CUmodule binary, CUstream stream, rpc::Server &server, return CUDA_SUCCESS; } -int load(int argc, const char **argv, const char **envp, void *image, - size_t size, const LaunchParameters &params, - bool print_resource_usage) { +int load_nvptx(int argc, const char **argv, const char **envp, void *image, + size_t size, const LaunchParameters &params, + bool print_resource_usage) { if (CUresult err = cuInit(0)) handle_error(err); // Obtain the first device found on the system.