@@ -535,9 +535,17 @@ inline cudaError_t ggml_cuda_set_device(const int device) {
 
 static int g_device_count = -1;
 static int g_main_device = 0;
-static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
 
+struct device_capabilities {
+    int     cc;                 // compute capability
+    bool    vmm;                // virtual memory support
+    size_t  vmm_granularity;    // granularity of virtual memory
+};
+
+static device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, false, 0} };
+
+
 static void * g_scratch_buffer = nullptr;
 static size_t g_scratch_size = 0; // disabled by default
 static size_t g_scratch_offset = 0;
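The hunk above folds the separate per-device globals into a single `device_capabilities` record per device. The following stand-alone sketch is illustrative only (the `GGML_CUDA_MAX_DEVICES` value and the device numbers are assumptions, not taken from the patch); it shows how an entry of `g_device_caps` is laid out and read, which is the pattern the remaining hunks switch to.

```cpp
// Illustrative sketch of the consolidated per-device record (not part of the patch).
#include <cstddef>
#include <cstdio>

#define GGML_CUDA_MAX_DEVICES 16   // assumed value for this sketch

struct device_capabilities {
    int    cc;              // compute capability, 100*major + 10*minor
    bool   vmm;             // driver virtual memory management support
    size_t vmm_granularity; // minimum physical allocation granularity (bytes)
};

static device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, false, 0} };

int main() {
    // Hypothetical device 0: a VMM-capable GPU with a 2 MiB allocation granularity.
    g_device_caps[0] = { 860, true, size_t(2) << 20 };
    printf("cc=%d vmm=%d granularity=%zu bytes\n",
           g_device_caps[0].cc, (int) g_device_caps[0].vmm, g_device_caps[0].vmm_granularity);
    return 0;
}
```

Keeping the three values in one struct means code that previously indexed `g_compute_capabilities` (and, later in this diff, `g_device_vmm`) now reads a single `g_device_caps[id]` entry.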
@@ -5894,7 +5902,7 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
 
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
-    const int compute_capability = g_compute_capabilities[id];
+    const int compute_capability = g_device_caps[id].cc;
 
     int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_RDNA2) {
@@ -5939,7 +5947,7 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
 
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
-    const int compute_capability = g_compute_capabilities[id];
+    const int compute_capability = g_device_caps[id].cc;
 
     int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_RDNA2) {
@@ -5984,7 +5992,7 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
 
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
-    const int compute_capability = g_compute_capabilities[id];
+    const int compute_capability = g_device_caps[id].cc;
 
     int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_RDNA2) {
@@ -6029,7 +6037,7 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
 
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
-    const int compute_capability = g_compute_capabilities[id];
+    const int compute_capability = g_device_caps[id].cc;
 
     int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_RDNA2) {
@@ -6074,7 +6082,7 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
 
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
-    const int compute_capability = g_compute_capabilities[id];
+    const int compute_capability = g_device_caps[id].cc;
 
     int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_RDNA2) {
@@ -6119,7 +6127,7 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
 
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
-    const int compute_capability = g_compute_capabilities[id];
+    const int compute_capability = g_device_caps[id].cc;
 
     int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_RDNA2) {
@@ -6166,7 +6174,7 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
 
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
-    const int compute_capability = g_compute_capabilities[id];
+    const int compute_capability = g_device_caps[id].cc;
 
     int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_RDNA2) {
@@ -6212,7 +6220,7 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
 
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
-    const int compute_capability = g_compute_capabilities[id];
+    const int compute_capability = g_device_caps[id].cc;
 
     int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_RDNA2) {
@@ -6257,7 +6265,7 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
 
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
-    const int compute_capability = g_compute_capabilities[id];
+    const int compute_capability = g_device_caps[id].cc;
 
     int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_RDNA2) {
@@ -6302,7 +6310,7 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
 
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
-    const int compute_capability = g_compute_capabilities[id];
+    const int compute_capability = g_device_caps[id].cc;
 
     int mmq_x, mmq_y, nwarps;
     if (compute_capability >= CC_RDNA2) {
@@ -6660,23 +6668,18 @@ static void * ggml_cuda_pool_malloc_vmm(size_t size, size_t * actual_size) {
     size_t avail = g_cuda_pool_size[id] - g_cuda_pool_used[id];
 
     if (size > avail) {
+        // round up to the next multiple of the granularity
         size_t reserve_size = size - avail;
+        const size_t granularity = g_device_caps[id].vmm_granularity;
+        reserve_size = granularity * ((reserve_size + granularity - 1) / granularity);
+
+        GGML_ASSERT(g_cuda_pool_size[id] + reserve_size <= CUDA_POOL_VMM_MAX_SIZE);
 
         // allocate more physical memory
         CUmemAllocationProp prop = {};
         prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
         prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
         prop.location.id = id;
-
-        // get the minimum allocation granularity for this device
-        size_t granularity;
-        CU_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
-
-        // round up to the next multiple of the granularity
-        reserve_size = granularity * ((reserve_size + granularity - 1) / granularity);
-
-        GGML_ASSERT(g_cuda_pool_size[id] + reserve_size <= CUDA_POOL_VMM_MAX_SIZE);
-
         CUmemGenericAllocationHandle handle;
         CU_CHECK(cuMemCreate(&handle, reserve_size, &prop, 0));
 
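The reserve size above is padded to the device's allocation granularity before `cuMemCreate` is called, using the usual integer round-up `granularity * ((n + granularity - 1) / granularity)`. A small stand-alone check of that arithmetic follows; the 2 MiB granularity and 5 MiB shortfall are made-up numbers for illustration, not values from the patch.

```cpp
// Sketch: round a byte count up to the next multiple of the allocation granularity.
#include <cstddef>
#include <cstdio>

static size_t round_up_to_granularity(size_t size, size_t granularity) {
    return granularity * ((size + granularity - 1) / granularity);
}

int main() {
    const size_t granularity = size_t(2) << 20; // assume a 2 MiB granularity
    const size_t missing     = size_t(5) << 20; // pool is short by 5 MiB
    // 5 MiB rounds up to 6 MiB (three 2 MiB chunks).
    printf("reserve %zu bytes\n", round_up_to_granularity(missing, granularity));
    return 0;
}
```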
@@ -6732,12 +6735,10 @@ static void ggml_cuda_pool_free_vmm(void * ptr, size_t size) {
     GGML_ASSERT(ptr == (void *) (g_cuda_pool_addr[id] + g_cuda_pool_used[id]));
 }
 
-static bool g_device_vmm[GGML_CUDA_MAX_DEVICES] = {false};
-
 static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
-    if (g_device_vmm[id]) {
+    if (g_device_caps[id].vmm) {
         return ggml_cuda_pool_malloc_vmm(size, actual_size);
     } else {
         return ggml_cuda_pool_malloc_leg(size, actual_size);
@@ -6747,7 +6748,7 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
 static void ggml_cuda_pool_free(void * ptr, size_t size) {
     int id;
     CUDA_CHECK(cudaGetDevice(&id));
-    if (g_device_vmm[id]) {
+    if (g_device_caps[id].vmm) {
         ggml_cuda_pool_free_vmm(ptr, size);
     } else {
         ggml_cuda_pool_free_leg(ptr, size);
@@ -6802,8 +6803,16 @@ void ggml_init_cublas() {
         CUdevice device;
         CU_CHECK(cuDeviceGet(&device, id));
         CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
-        g_device_vmm[id] = !!device_vmm;
+
+        if (device_vmm) {
+            CUmemAllocationProp alloc_prop = {};
+            alloc_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+            alloc_prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+            alloc_prop.location.id = id;
+            CU_CHECK(cuMemGetAllocationGranularity(&g_device_caps[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
+        }
 #endif
+        g_device_caps[id].vmm = !!device_vmm;
 
         cudaDeviceProp prop;
         CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
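With this hunk the granularity is queried once per device at init time, so the allocation path no longer calls `cuMemGetAllocationGranularity` on every pool growth. For reference, a stand-alone sketch of the same driver-API query, with error checks omitted and device 0 assumed; treat it as illustrative rather than the file's code.

```cpp
// Sketch: query the minimum allocation granularity for pinned device memory on device 0.
// Link against the CUDA driver library (-lcuda); requires CUDA 10.2 or newer.
#include <cuda.h>
#include <cstdio>

int main() {
    cuInit(0);
    CUdevice dev;
    cuDeviceGet(&dev, 0);

    CUmemAllocationProp prop = {};
    prop.type          = CU_MEM_ALLOCATION_TYPE_PINNED;
    prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    prop.location.id   = dev;

    size_t granularity = 0;
    cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
    printf("minimum granularity: %zu bytes\n", granularity);
    return 0;
}
```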
@@ -6812,9 +6821,9 @@ void ggml_init_cublas() {
         g_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-        g_compute_capabilities[id] = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
+        g_device_caps[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
 #else
-        g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
+        g_device_caps[id].cc = 100*prop.major + 10*prop.minor;
 #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
     }
     for (int id = 0; id < g_device_count; ++id) {
@@ -7324,11 +7333,11 @@ static int64_t get_row_rounding(ggml_type type) {
     int64_t max_compute_capability = INT_MIN;
     for (int64_t id = 0; id < g_device_count; ++id) {
         if (g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
-            if (min_compute_capability > g_compute_capabilities[id]) {
-                min_compute_capability = g_compute_capabilities[id];
+            if (min_compute_capability > g_device_caps[id].cc) {
+                min_compute_capability = g_device_caps[id].cc;
             }
-            if (max_compute_capability < g_compute_capabilities[id]) {
-                max_compute_capability = g_compute_capabilities[id];
+            if (max_compute_capability < g_device_caps[id].cc) {
+                max_compute_capability = g_device_caps[id].cc;
             }
         }
     }
@@ -7536,7 +7545,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
     // ldc == nrows of the matrix that cuBLAS writes into
     int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
 
-    const int compute_capability = g_compute_capabilities[id];
+    const int compute_capability = g_device_caps[id].cc;
 
     if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
         // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
@@ -8671,8 +8680,8 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
 
     int64_t min_compute_capability = INT_MAX;
     for (int64_t id = 0; id < g_device_count; ++id) {
-        if (min_compute_capability > g_compute_capabilities[id] && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
-            min_compute_capability = g_compute_capabilities[id];
+        if (min_compute_capability > g_device_caps[id].cc && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
+            min_compute_capability = g_device_caps[id].cc;
         }
     }
 