@@ -6564,18 +6564,16 @@ struct scoped_spin_lock {
 
 static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
 
-#if 0
-#define DEBUG_CUDA_MALLOC
+// #define DEBUG_CUDA_MALLOC
 
 struct cuda_buffer {
     void * ptr = nullptr;
     size_t size = 0;
 };
 
 static cuda_buffer g_cuda_buffer_pool[GGML_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS];
-
 static size_t g_cuda_pool_size[GGML_CUDA_MAX_DEVICES] = {0};
 
-static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
+static void * ggml_cuda_pool_malloc_leg(size_t size, size_t * actual_size) {
     scoped_spin_lock lock(g_cuda_pool_lock);
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
@@ -6629,7 +6627,7 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
     return ptr;
 }
 
-static void ggml_cuda_pool_free(void * ptr, size_t size) {
+static void ggml_cuda_pool_free_leg(void * ptr, size_t size) {
     scoped_spin_lock lock(g_cuda_pool_lock);
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
@@ -6646,19 +6644,15 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
     CUDA_CHECK(cudaFree(ptr));
     g_cuda_pool_size[id] -= size;
 }
-#else
 
+#if !defined(GGML_USE_HIPBLAS)
+// pool with virtual memory
 static std::vector<CUmemGenericAllocationHandle> g_cuda_pool_handles[GGML_CUDA_MAX_DEVICES];
 static CUdeviceptr g_cuda_pool_addr[GGML_CUDA_MAX_DEVICES] = {0};
-static size_t g_cuda_pool_size[GGML_CUDA_MAX_DEVICES] = {0};
 static size_t g_cuda_pool_used[GGML_CUDA_MAX_DEVICES] = {0};
+static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 36; // 64 GB
 
-static const size_t CUDA_POOL_MAX_SIZE = 1ull << 36; // 64 GB
-
-// #define DEBUG_CUDA_MALLOC
-
-#define ggml_cuda_pool_malloc(size, actual_size) ggml_cuda_pool_malloc_(size, actual_size, #size " " #actual_size)
-static void * ggml_cuda_pool_malloc_(size_t size, size_t * actual_size, const char * call) {
+static void * ggml_cuda_pool_malloc_vmm(size_t size, size_t * actual_size) {
     scoped_spin_lock lock(g_cuda_pool_lock);
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
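The hunks that follow fill in the body of ggml_cuda_pool_malloc_vmm. For orientation, the CUDA driver-API sequence the VMM pool builds on is: reserve a large virtual address range once, then back it with physical chunks on demand, mapping each new chunk at the end of the pool. Below is a minimal, self-contained sketch of that sequence; names such as grow_pool and POOL_MAX_SIZE are illustrative, not the commit's code, and error handling is reduced to an abort-on-failure macro.

// vmm_sketch.cu -- build: nvcc vmm_sketch.cu -o vmm_sketch -lcuda
#include <cuda.h>
#include <cstdio>
#include <cstdlib>

#define CU_OK(call) do { CUresult err_ = (call); \
    if (err_ != CUDA_SUCCESS) { fprintf(stderr, "cu error %d at line %d\n", err_, __LINE__); abort(); } } while (0)

static CUdeviceptr pool_addr = 0;               // start of the reserved virtual range
static size_t      pool_size = 0;               // bytes currently backed by physical memory
static const size_t POOL_MAX_SIZE = 1ull << 36; // 64 GB of address space

// back at least `min_bytes` more of the pool with physical memory on `device`
static void grow_pool(int device, size_t min_bytes) {
    // describe the physical allocation: pinned device memory on this device
    CUmemAllocationProp prop = {};
    prop.type          = CU_MEM_ALLOCATION_TYPE_PINNED;
    prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    prop.location.id   = device;

    // physical chunks must be a multiple of the allocation granularity
    size_t granularity = 0;
    CU_OK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
    size_t reserve_size = granularity * ((min_bytes + granularity - 1) / granularity);

    // create the physical backing memory
    CUmemGenericAllocationHandle handle;
    CU_OK(cuMemCreate(&handle, reserve_size, &prop, 0));

    // reserve the whole virtual address range once, up front
    if (pool_addr == 0) {
        CU_OK(cuMemAddressReserve(&pool_addr, POOL_MAX_SIZE, 0, 0, 0));
    }

    // map the new chunk at the end of the pool and make it read/write accessible
    CU_OK(cuMemMap(pool_addr + pool_size, reserve_size, 0, handle, 0));
    CUmemAccessDesc access = {};
    access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    access.location.id   = device;
    access.flags         = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    CU_OK(cuMemSetAccess(pool_addr + pool_size, reserve_size, &access, 1));

    pool_size += reserve_size;
}

int main() {
    CU_OK(cuInit(0));
    // VMM calls need a current context; create one explicitly in this standalone sketch
    CUdevice dev;
    CU_OK(cuDeviceGet(&dev, 0));
    CUcontext ctx;
    CU_OK(cuCtxCreate(&ctx, 0, dev));
    grow_pool(0, 1u << 20); // back at least 1 MiB (rounded up to granularity)
    printf("pool at 0x%llx, %zu bytes backed\n", (unsigned long long) pool_addr, pool_size);
    return 0;
}

Because the virtual range is contiguous and reserved up front, growing the pool never moves or invalidates existing allocations; only new physical memory is mapped in at the tail.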
@@ -6681,14 +6675,14 @@ static void * ggml_cuda_pool_malloc_(size_t size, size_t * actual_size, const char * call) {
         // round up to the nearest granularity
         reserve_size = granularity * ((reserve_size + granularity - 1) / granularity);
 
-        GGML_ASSERT(g_cuda_pool_size[id] + reserve_size <= CUDA_POOL_MAX_SIZE);
+        GGML_ASSERT(g_cuda_pool_size[id] + reserve_size <= CUDA_POOL_VMM_MAX_SIZE);
 
         CUmemGenericAllocationHandle handle;
         CU_CHECK(cuMemCreate(&handle, reserve_size, &prop, 0));
 
         // reserve virtual address space (if not already reserved)
         if (g_cuda_pool_addr[id] == 0) {
-            CU_CHECK(cuMemAddressReserve(&g_cuda_pool_addr[id], CUDA_POOL_MAX_SIZE, 0, 0, 0));
+            CU_CHECK(cuMemAddressReserve(&g_cuda_pool_addr[id], CUDA_POOL_VMM_MAX_SIZE, 0, 0, 0));
         }
 
         // map at the end of the pool
@@ -6705,9 +6699,9 @@ static void * ggml_cuda_pool_malloc_(size_t size, size_t * actual_size, const char * call) {
         g_cuda_pool_handles[id].push_back(handle);
         g_cuda_pool_size[id] += reserve_size;
 
-        printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB) [%s]\n",
-            id, (unsigned long long) (g_cuda_pool_size[id]/1024/1024),
-            (unsigned long long) (reserve_size/1024/1024), call);
+        //printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
+        //       id, (unsigned long long) (g_cuda_pool_size[id]/1024/1024),
+        //       (unsigned long long) (reserve_size/1024/1024));
     }
 
     GGML_ASSERT(g_cuda_pool_addr[id] != 0);
@@ -6717,32 +6711,51 @@ static void * ggml_cuda_pool_malloc_(size_t size, size_t * actual_size, const char * call) {
     g_cuda_pool_used[id] += size;
 
 #ifdef DEBUG_CUDA_MALLOC
-    printf("cuda pool[%d]: allocated %llu bytes at %llx [%s]\n", id, (unsigned long long) size, ptr, call);
+    printf("cuda pool[%d]: allocated %llu bytes at %llx\n", id, (unsigned long long) size, ptr);
 #endif
 
     return ptr;
-
-    GGML_UNUSED(call);
 }
 
-#define ggml_cuda_pool_free(ptr, size) ggml_cuda_pool_free_(ptr, size, #ptr " " #size)
-static void ggml_cuda_pool_free_(void * ptr, size_t size, const char * call) {
+static void ggml_cuda_pool_free_vmm(void * ptr, size_t size) {
     scoped_spin_lock lock(g_cuda_pool_lock);
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
 
 #ifdef DEBUG_CUDA_MALLOC
-    printf("cuda pool[%d]: free %llu bytes at %llx [%s]\n", id, (unsigned long long) size, ptr, call);
+    printf("cuda pool[%d]: freed %llu bytes at %llx\n", id, (unsigned long long) size, ptr);
 #endif
 
     g_cuda_pool_used[id] -= size;
 
     // all deallocations must be in reverse order of the allocations
     GGML_ASSERT(ptr == (void *) (g_cuda_pool_addr[id] + g_cuda_pool_used[id]));
+}
 
-    GGML_UNUSED(call);
+static bool g_device_vmm[GGML_CUDA_MAX_DEVICES] = {false};
+
+static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    if (g_device_vmm[id]) {
+        return ggml_cuda_pool_malloc_vmm(size, actual_size);
+    } else {
+        return ggml_cuda_pool_malloc_leg(size, actual_size);
+    }
 }
 
+static void ggml_cuda_pool_free(void * ptr, size_t size) {
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+    if (g_device_vmm[id]) {
+        ggml_cuda_pool_free_vmm(ptr, size);
+    } else {
+        ggml_cuda_pool_free_leg(ptr, size);
+    }
+}
+#else
+#define ggml_cuda_pool_malloc ggml_cuda_pool_malloc_leg
+#define ggml_cuda_pool_free ggml_cuda_pool_free_leg
 #endif
 
 static bool g_cublas_loaded = false;
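With the old #if 0/#else toggle gone, both allocators stay compiled in, and the public ggml_cuda_pool_malloc/ggml_cuda_pool_free pick one per device at runtime; HIP builds, which lack the CUDA driver VMM API, fall back to the legacy pool through the #else macros. The bodies of the _leg functions are elided by the hunks above; they keep the pre-existing fixed-slot reuse scheme, which works roughly like the following sketch (slot count, names, and details are illustrative, not the commit's exact code).

// buffer_pool_sketch.cu -- build: nvcc buffer_pool_sketch.cu -o buffer_pool_sketch
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

#define CUDA_OK(call) do { cudaError_t err_ = (call); \
    if (err_ != cudaSuccess) { fprintf(stderr, "%s\n", cudaGetErrorString(err_)); abort(); } } while (0)

#define MAX_BUFFERS 256

struct buffer { void * ptr = nullptr; size_t size = 0; };
static buffer g_pool[MAX_BUFFERS];

// reuse any cached buffer that is large enough, else cudaMalloc a fresh one
static void * pool_malloc(size_t size, size_t * actual_size) {
    for (int i = 0; i < MAX_BUFFERS; ++i) {
        buffer & b = g_pool[i];
        if (b.ptr != nullptr && b.size >= size) {
            void * ptr = b.ptr;
            *actual_size = b.size; // caller may get more than it asked for
            b.ptr  = nullptr;
            b.size = 0;
            return ptr;
        }
    }
    void * ptr = nullptr;
    CUDA_OK(cudaMalloc(&ptr, size));
    *actual_size = size;
    return ptr;
}

// park the buffer in a free slot for reuse, or release it if the pool is full
static void pool_free(void * ptr, size_t size) {
    for (int i = 0; i < MAX_BUFFERS; ++i) {
        buffer & b = g_pool[i];
        if (b.ptr == nullptr) {
            b.ptr  = ptr;
            b.size = size;
            return;
        }
    }
    CUDA_OK(cudaFree(ptr));
}

int main() {
    size_t actual = 0;
    void * p = pool_malloc(1 << 20, &actual); // 1 MiB
    pool_free(p, actual);
    void * q = pool_malloc(1024, &actual);    // reuses the cached 1 MiB buffer
    printf("got %p with %zu usable bytes\n", q, actual);
    return 0;
}

A first-fit pool like this can hand a much larger cached buffer to a small request, so memory is wasted when request sizes vary; the VMM pool avoids that by sub-allocating from one contiguous, growable region, which is also why ggml_cuda_pool_free_vmm asserts that frees happen in reverse order of the allocations.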
@@ -6783,9 +6796,18 @@ void ggml_init_cublas() {
 #endif
     fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
     for (int id = 0; id < g_device_count; ++id) {
+        int deviceSupportsVmm = 0;
+
+#if !defined(GGML_USE_HIPBLAS)
+        CUdevice device;
+        CU_CHECK(cuDeviceGet(&device, id));
+        CU_CHECK(cuDeviceGetAttribute(&deviceSupportsVmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
+        g_device_vmm[id] = !!deviceSupportsVmm;
+#endif
+
         cudaDeviceProp prop;
         CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-        fprintf(stderr, "  Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
+        fprintf(stderr, "  Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, deviceSupportsVmm ? "yes" : "no");
 
         g_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
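The probe above can use cuDeviceGetAttribute directly because earlier runtime-API calls in ggml_init_cublas have already initialized the driver. To check the same attribute outside of ggml, a standalone version looks like this (assuming only that you link the driver library with -lcuda):

// vmm_probe.cpp -- build: nvcc vmm_probe.cpp -o vmm_probe -lcuda
#include <cuda.h>
#include <cstdio>

int main() {
    // the driver API needs explicit initialization, unlike the runtime API
    if (cuInit(0) != CUDA_SUCCESS) {
        fprintf(stderr, "cuInit failed\n");
        return 1;
    }
    int count = 0;
    cuDeviceGetCount(&count);
    for (int id = 0; id < count; ++id) {
        CUdevice device;
        cuDeviceGet(&device, id);
        int vmm = 0;
        cuDeviceGetAttribute(&vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device);
        char name[256];
        cuDeviceGetName(name, sizeof(name), device);
        printf("Device %d: %s, VMM: %s\n", id, name, vmm ? "yes" : "no");
    }
    return 0;
}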