Commit 84778e9
CUDA/HIP: Share the same unified memory allocation logic. (#12934)
Replace compile-time `GGML_HIP_UMA` with environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY`. This unifies the usage on NVIDIA and AMD GPUs, and allows a single binary to be shared between integrated and dedicated GPUs.
1 parent 5106764

6 files changed, +22 −26 lines changed

Makefile

Lines changed: 0 additions & 4 deletions
@@ -780,10 +780,6 @@ ifdef GGML_HIP

	MK_CPPFLAGS += -DGGML_USE_HIP -DGGML_USE_CUDA

-	ifdef GGML_HIP_UMA
-		MK_CPPFLAGS += -DGGML_HIP_UMA
-	endif # GGML_HIP_UMA
-
	MK_LDFLAGS  += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
	MK_LDFLAGS  += -L$(ROCM_PATH)/lib64 -Wl,-rpath=$(ROCM_PATH)/lib64
	MK_LDFLAGS  += -lhipblas -lamdhip64 -lrocblas

docs/build.md

Lines changed: 4 additions & 2 deletions
@@ -259,8 +259,6 @@ You can download it from your Linux distro's package manager or from here: [ROCm
 cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
     && cmake --build build --config Release -- -j 16
 ```
-On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`.
-However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
 
 To enhance flash attention performance on RDNA3+ or CDNA architectures, you can utilize the rocWMMA library by enabling the `-DGGML_HIP_ROCWMMA_FATTN=ON` option. This requires rocWMMA headers to be installed on the build system.

@@ -296,6 +294,10 @@ You can download it from your Linux distro's package manager or from here: [ROCm
 The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
 If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
 
+### Unified Memory
+
+On Linux it is possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1`. However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
+
 ## Vulkan
 
 **Windows**
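For context on what the documented flag does at runtime: per the hunks in this commit, the backend decides between managed and plain device memory at each allocation. Below is a minimal, hypothetical sketch of that decision using only the standard CUDA runtime API; the helper name `alloc_device_buffer` is illustrative, and the real logic lives in `ggml_cuda_device_malloc` (shown in ggml-cuda.cu below).

```cpp
#include <cstdlib>          // std::getenv
#include <cuda_runtime.h>   // cudaMalloc, cudaMallocManaged

// Hypothetical helper mirroring the documented behaviour: when
// GGML_CUDA_ENABLE_UNIFIED_MEMORY is set in the environment, allocate
// managed (unified) memory that the driver can migrate between CPU and
// GPU; otherwise allocate ordinary device memory.
static cudaError_t alloc_device_buffer(void ** ptr, size_t size) {
    if (std::getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr) {
        return cudaMallocManaged(ptr, size);
    }
    return cudaMalloc(ptr, size);
}
```

Because the check happens at run time rather than at compile time, one binary can serve both integrated GPUs (set the variable) and dedicated GPUs (leave it unset), which is the point of the commit.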

ggml/CMakeLists.txt

Lines changed: 0 additions & 1 deletion
@@ -170,7 +170,6 @@ option(GGML_HIP "ggml: use HIP"
 option(GGML_HIP_GRAPHS           "ggml: use HIP graph, experimental, slow"   OFF)
 option(GGML_HIP_NO_VMM           "ggml: do not try to use HIP VMM"            ON)
 option(GGML_HIP_ROCWMMA_FATTN    "ggml: enable rocWMMA for FlashAttention"   OFF)
-option(GGML_HIP_UMA              "ggml: use HIP unified memory architecture" OFF)
 option(GGML_VULKAN               "ggml: use Vulkan"                          OFF)
 option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks"                OFF)
 option(GGML_VULKAN_DEBUG         "ggml: enable Vulkan debug output"          OFF)
ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 16 additions & 15 deletions
@@ -96,31 +96,32 @@ int ggml_cuda_get_device() {
 
 static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
     ggml_cuda_set_device(device);
-#if defined(GGML_USE_HIP) && defined(GGML_HIP_UMA)
-    auto res = hipMallocManaged(ptr, size);
-    if (res == hipSuccess) {
-        // if error we "need" to know why...
-        CUDA_CHECK(hipMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device));
-    }
-    return res;
-#else
-
-#if !defined(GGML_USE_HIP)
     cudaError_t err;
     if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr)
     {
         err = cudaMallocManaged(ptr, size);
+#if defined(GGML_USE_HIP)
+        if (err == hipSuccess) {
+            CUDA_CHECK(cudaMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device));
+        }
+
+        // fall back to cudaMalloc if not supported (e.g. on Windows)
+        if (err == hipErrorNotSupported) {
+            static bool warned_unsupported = false;
+            if (!warned_unsupported) {
+                GGML_LOG_WARN("hipMallocManaged unsupported, falling back to hipMalloc.\n");
+                warned_unsupported = true;
+            }
+
+            err = cudaMalloc(ptr, size);
+        }
+#endif // defined(GGML_USE_HIP)
     }
     else
     {
         err = cudaMalloc(ptr, size);
     }
     return err;
-#else
-    return cudaMalloc(ptr, size);
-#endif // !defined(GGML_USE_HIP)
-
-#endif
 }
 
 #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
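Applying the hunk, the resulting function reads roughly as follows (a sketch reconstructed from the diff above; `CUDA_CHECK` and `GGML_LOG_WARN` are ggml's own macros, and on AMD builds the `cuda*` spellings resolve to `hip*` through `vendors/hip.h`, extended in the next file):

```cpp
static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
    ggml_cuda_set_device(device);
    cudaError_t err;
    if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr) {
        err = cudaMallocManaged(ptr, size);
#if defined(GGML_USE_HIP)
        if (err == hipSuccess) {
            // AMD-only: advise the driver to treat the managed
            // allocation as coarse-grained
            CUDA_CHECK(cudaMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device));
        }
        // fall back to a plain device allocation where managed memory
        // is not supported (e.g. on Windows)
        if (err == hipErrorNotSupported) {
            static bool warned_unsupported = false;
            if (!warned_unsupported) {
                GGML_LOG_WARN("hipMallocManaged unsupported, falling back to hipMalloc.\n");
                warned_unsupported = true;
            }
            err = cudaMalloc(ptr, size);
        }
#endif // defined(GGML_USE_HIP)
    } else {
        err = cudaMalloc(ptr, size);
    }
    return err;
}
```

Compared with the removed `GGML_HIP_UMA` path, the environment-variable check moves the decision from build time to run time, collapses the `#ifdef` forest into one shared code path, and adds a graceful fallback where managed memory is unavailable.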

ggml/src/ggml-cuda/vendors/hip.h

Lines changed: 2 additions & 0 deletions
@@ -71,6 +71,8 @@
 #define cudaLaunchHostFunc hipLaunchHostFunc
 #define cudaMalloc hipMalloc
 #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
+#define cudaMallocManaged hipMallocManaged
+#define cudaMemAdvise hipMemAdvise
 #define cudaMemcpy hipMemcpy
 #define cudaMemcpyAsync hipMemcpyAsync
 #define cudaMemcpyPeerAsync hipMemcpyPeerAsync
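The two new defines follow the existing pattern of this header: backend code is written once against the CUDA spellings, and on AMD builds the preprocessor rewrites them to their HIP equivalents. A small hypothetical example of the effect (the `#define`s are excerpted from the header; the `main` and the 1 MiB size are illustrative):

```cpp
#include <hip/hip_runtime.h>

// Excerpt of the vendors/hip.h mapping relevant to this commit:
#define cudaMallocManaged hipMallocManaged
#define cudaMemAdvise     hipMemAdvise

int main() {
    void * ptr = nullptr;
    // Written in CUDA spelling, but after preprocessing this is
    // hipMallocManaged(&ptr, 1 << 20) on an AMD build.
    hipError_t err = cudaMallocManaged(&ptr, 1 << 20);
    if (err == hipSuccess) {
        hipFree(ptr);
    }
    return err == hipSuccess ? 0 : 1;
}
```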

ggml/src/ggml-hip/CMakeLists.txt

Lines changed: 0 additions & 4 deletions
@@ -89,10 +89,6 @@ endif()
 
 add_compile_definitions(GGML_USE_HIP)
 
-if (GGML_HIP_UMA)
-    add_compile_definitions(GGML_HIP_UMA)
-endif()
-
 if (GGML_CUDA_FORCE_MMQ)
     add_compile_definitions(GGML_CUDA_FORCE_MMQ)
 endif()
