
Commit 43c16cb

check for vmm support, disable for hip
1 parent bd78dc9 commit 43c16cb

1 file changed: +48 −26 lines

ggml-cuda.cu

Lines changed: 48 additions & 26 deletions
@@ -6564,18 +6562,16 @@ struct scoped_spin_lock {
 
 static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
 
-#if 0
-#define DEBUG_CUDA_MALLOC
+// #define DEBUG_CUDA_MALLOC
 struct cuda_buffer {
     void * ptr = nullptr;
     size_t size = 0;
 };
 
 static cuda_buffer g_cuda_buffer_pool[GGML_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS];
-
 static size_t g_cuda_pool_size[GGML_CUDA_MAX_DEVICES] = {0};
 
-static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
+static void * ggml_cuda_pool_malloc_leg(size_t size, size_t * actual_size) {
     scoped_spin_lock lock(g_cuda_pool_lock);
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
@@ -6629,7 +6627,7 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
     return ptr;
 }
 
-static void ggml_cuda_pool_free(void * ptr, size_t size) {
+static void ggml_cuda_pool_free_leg(void * ptr, size_t size) {
     scoped_spin_lock lock(g_cuda_pool_lock);
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
@@ -6646,19 +6644,15 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
     CUDA_CHECK(cudaFree(ptr));
     g_cuda_pool_size[id] -= size;
 }
-#else
 
+#if !defined(GGML_USE_HIPBLAS)
+// pool with virtual memory
 static std::vector<CUmemGenericAllocationHandle> g_cuda_pool_handles[GGML_CUDA_MAX_DEVICES];
 static CUdeviceptr g_cuda_pool_addr[GGML_CUDA_MAX_DEVICES] = {0};
-static size_t g_cuda_pool_size[GGML_CUDA_MAX_DEVICES] = {0};
 static size_t g_cuda_pool_used[GGML_CUDA_MAX_DEVICES] = {0};
+static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 36; // 64 GB
 
-static const size_t CUDA_POOL_MAX_SIZE = 1ull << 36; // 64 GB
-
-//#define DEBUG_CUDA_MALLOC
-
-#define ggml_cuda_pool_malloc(size, actual_size) ggml_cuda_pool_malloc_(size, actual_size, #size " " #actual_size)
-static void * ggml_cuda_pool_malloc_(size_t size, size_t * actual_size, const char * call) {
+static void * ggml_cuda_pool_malloc_vmm(size_t size, size_t * actual_size) {
     scoped_spin_lock lock(g_cuda_pool_lock);
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
@@ -6681,14 +6675,14 @@ static void * ggml_cuda_pool_malloc_(size_t size, size_t * actual_size, const char * call) {
         // round up to the nearest granularity
         reserve_size = granularity * ((reserve_size + granularity - 1) / granularity);
 
-        GGML_ASSERT(g_cuda_pool_size[id] + reserve_size <= CUDA_POOL_MAX_SIZE);
+        GGML_ASSERT(g_cuda_pool_size[id] + reserve_size <= CUDA_POOL_VMM_MAX_SIZE);
 
         CUmemGenericAllocationHandle handle;
         CU_CHECK(cuMemCreate(&handle, reserve_size, &prop, 0));
 
         // reserve virtual address space (if not already reserved)
         if (g_cuda_pool_addr[id] == 0) {
-            CU_CHECK(cuMemAddressReserve(&g_cuda_pool_addr[id], CUDA_POOL_MAX_SIZE, 0, 0, 0));
+            CU_CHECK(cuMemAddressReserve(&g_cuda_pool_addr[id], CUDA_POOL_VMM_MAX_SIZE, 0, 0, 0));
         }
 
         // map at the end of the pool
@@ -6705,9 +6699,9 @@ static void * ggml_cuda_pool_malloc_(size_t size, size_t * actual_size, const char * call) {
         g_cuda_pool_handles[id].push_back(handle);
         g_cuda_pool_size[id] += reserve_size;
 
-        printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB) [%s]\n",
-               id, (unsigned long long) (g_cuda_pool_size[id]/1024/1024),
-               (unsigned long long) (reserve_size/1024/1024), call);
+        //printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
+        //       id, (unsigned long long) (g_cuda_pool_size[id]/1024/1024),
+        //       (unsigned long long) (reserve_size/1024/1024));
     }
 
     GGML_ASSERT(g_cuda_pool_addr[id] != 0);
@@ -6717,32 +6711,51 @@ static void * ggml_cuda_pool_malloc_(size_t size, size_t * actual_size, const char * call) {
     g_cuda_pool_used[id] += size;
 
 #ifdef DEBUG_CUDA_MALLOC
-    printf("cuda pool[%d]: allocated %llu bytes at %llx [%s]\n", id, (unsigned long long) size, ptr, call);
+    printf("cuda pool[%d]: allocated %llu bytes at %llx\n", id, (unsigned long long) size, ptr);
 #endif
 
     return ptr;
-
-    GGML_UNUSED(call);
 }
 
-#define ggml_cuda_pool_free(ptr, size) ggml_cuda_pool_free_(ptr, size, #ptr " " #size)
-static void ggml_cuda_pool_free_(void * ptr, size_t size, const char * call) {
+static void ggml_cuda_pool_free_vmm(void * ptr, size_t size) {
     scoped_spin_lock lock(g_cuda_pool_lock);
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
 
 #ifdef DEBUG_CUDA_MALLOC
-    printf("cuda pool[%d]: free %llu bytes at %llx [%s]\n", id, (unsigned long long) size, ptr, call);
+    printf("cuda pool[%d]: freed %llu bytes at %llx\n", id, (unsigned long long) size, ptr);
 #endif
 
     g_cuda_pool_used[id] -= size;
 
     // all deallocations must be in reverse order of the allocations
     GGML_ASSERT(ptr == (void *) (g_cuda_pool_addr[id] + g_cuda_pool_used[id]));
+}
 
-    GGML_UNUSED(call);
+static bool g_device_vmm[GGML_CUDA_MAX_DEVICES] = {false};
+
+static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    if (g_device_vmm[id]) {
+        return ggml_cuda_pool_malloc_vmm(size, actual_size);
+    } else {
+        return ggml_cuda_pool_malloc_leg(size, actual_size);
+    }
 }
 
+static void ggml_cuda_pool_free(void * ptr, size_t size) {
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    if (g_device_vmm[id]) {
+        ggml_cuda_pool_free_vmm(ptr, size);
+    } else {
+        ggml_cuda_pool_free_leg(ptr, size);
+    }
+}
+#else
+#define ggml_cuda_pool_malloc ggml_cuda_pool_malloc_leg
+#define ggml_cuda_pool_free ggml_cuda_pool_free_leg
 #endif
 
 static bool g_cublas_loaded = false;
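The reverse-order assertion in ggml_cuda_pool_free_vmm works because the VMM pool is effectively a stack allocator: each allocation bumps a single per-device offset and each free rewinds it. A tiny host-only model of that bookkeeping, added here purely as an illustration (it is not code from the commit, and the fixed base address is a stand-in for the reserved CUdeviceptr range):

#include <cassert>
#include <cstddef>
#include <cstdint>

struct vmm_pool_model {
    uintptr_t base = 0x100000000ull; // stands in for the reserved virtual range
    size_t    used = 0;              // mirrors g_cuda_pool_used[id]

    void * alloc(size_t size) {
        void * ptr = (void *)(base + used); // hand out the current end of the pool
        used += size;
        return ptr;
    }
    void release(void * ptr, size_t size) {
        used -= size;
        // same invariant as ggml_cuda_pool_free_vmm: frees in reverse allocation order
        assert(ptr == (void *)(base + used));
    }
};

int main() {
    vmm_pool_model pool;
    void * a = pool.alloc(256);
    void * b = pool.alloc(1024);
    pool.release(b, 1024); // reverse order: b before a
    pool.release(a, 256);
    assert(pool.used == 0);
    return 0;
}

This is also why no per-allocation metadata is needed: a pointer plus its size is enough to rewind the offset, as long as callers free in the reverse of the order they allocated.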
@@ -6783,9 +6796,18 @@ void ggml_init_cublas() {
 #endif
     fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
     for (int id = 0; id < g_device_count; ++id) {
+        int deviceSupportsVmm = 0;
+
+#if !defined(GGML_USE_HIPBLAS)
+        CUdevice device;
+        CU_CHECK(cuDeviceGet(&device, id));
+        CU_CHECK(cuDeviceGetAttribute(&deviceSupportsVmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
+        g_device_vmm[id] = !!deviceSupportsVmm;
+#endif
+
         cudaDeviceProp prop;
         CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-        fprintf(stderr, "  Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
+        fprintf(stderr, "  Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, deviceSupportsVmm ? "yes" : "no");
 
         g_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
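For completeness, a self-contained probe that mirrors the per-device capability check added to ggml_init_cublas above. Again this is only a sketch under assumptions (driver API available, program linked with -lcuda), not part of the commit; under HIPBLAS the commit skips this query entirely and falls back to the legacy pool via the #else branch.

#include <cuda.h>
#include <cstdio>

int main() {
    if (cuInit(0) != CUDA_SUCCESS) {
        return 1;
    }

    int device_count = 0;
    if (cuDeviceGetCount(&device_count) != CUDA_SUCCESS) {
        return 1;
    }

    for (int id = 0; id < device_count; ++id) {
        CUdevice device;
        if (cuDeviceGet(&device, id) != CUDA_SUCCESS) {
            continue;
        }

        // the same attribute the commit reads to decide between the VMM and legacy pools
        int supports_vmm = 0;
        cuDeviceGetAttribute(&supports_vmm,
            CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device);

        printf("device %d: VMM %s\n", id, supports_vmm ? "yes" : "no");
    }
    return 0;
}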
