Skip to content

Commit 4c0f300

Browse files
committed
move all caps to g_device_caps
1 parent 20860da commit 4c0f300

File tree

1 file changed

+44
-35
lines changed

1 file changed

+44
-35
lines changed

ggml-cuda.cu

Lines changed: 44 additions & 35 deletions
Original file line number | Diff line number | Diff line change
@@ -535,9 +535,17 @@ inline cudaError_t ggml_cuda_set_device(const int device) {
535535

536536
static int g_device_count = -1;
537537
static int g_main_device = 0;
538-
static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
539538
static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
540539

540+
struct device_capabilities {
541+
int cc; // compute capability
542+
bool vmm; // virtual memory support
543+
size_t vmm_granularity; // granularity of virtual memory
544+
};
545+
546+
static device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, false, 0} };
547+
548+
541549
static void * g_scratch_buffer = nullptr;
542550
static size_t g_scratch_size = 0; // disabled by default
543551
static size_t g_scratch_offset = 0;
@@ -5894,7 +5902,7 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
58945902

58955903
int id;
58965904
CUDA_CHECK(cudaGetDevice(&id));
5897-
const int compute_capability = g_compute_capabilities[id];
5905+
const int compute_capability = g_device_caps[id].cc;
58985906

58995907
int mmq_x, mmq_y, nwarps;
59005908
if (compute_capability >= CC_RDNA2) {
@@ -5939,7 +5947,7 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
59395947

59405948
int id;
59415949
CUDA_CHECK(cudaGetDevice(&id));
5942-
const int compute_capability = g_compute_capabilities[id];
5950+
const int compute_capability = g_device_caps[id].cc;
59435951

59445952
int mmq_x, mmq_y, nwarps;
59455953
if (compute_capability >= CC_RDNA2) {
@@ -5984,7 +5992,7 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
59845992

59855993
int id;
59865994
CUDA_CHECK(cudaGetDevice(&id));
5987-
const int compute_capability = g_compute_capabilities[id];
5995+
const int compute_capability = g_device_caps[id].cc;
59885996

59895997
int mmq_x, mmq_y, nwarps;
59905998
if (compute_capability >= CC_RDNA2) {
@@ -6029,7 +6037,7 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
60296037

60306038
int id;
60316039
CUDA_CHECK(cudaGetDevice(&id));
6032-
const int compute_capability = g_compute_capabilities[id];
6040+
const int compute_capability = g_device_caps[id].cc;
60336041

60346042
int mmq_x, mmq_y, nwarps;
60356043
if (compute_capability >= CC_RDNA2) {
@@ -6074,7 +6082,7 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
60746082

60756083
int id;
60766084
CUDA_CHECK(cudaGetDevice(&id));
6077-
const int compute_capability = g_compute_capabilities[id];
6085+
const int compute_capability = g_device_caps[id].cc;
60786086

60796087
int mmq_x, mmq_y, nwarps;
60806088
if (compute_capability >= CC_RDNA2) {
@@ -6119,7 +6127,7 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
61196127

61206128
int id;
61216129
CUDA_CHECK(cudaGetDevice(&id));
6122-
const int compute_capability = g_compute_capabilities[id];
6130+
const int compute_capability = g_device_caps[id].cc;
61236131

61246132
int mmq_x, mmq_y, nwarps;
61256133
if (compute_capability >= CC_RDNA2) {
@@ -6166,7 +6174,7 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
61666174

61676175
int id;
61686176
CUDA_CHECK(cudaGetDevice(&id));
6169-
const int compute_capability = g_compute_capabilities[id];
6177+
const int compute_capability = g_device_caps[id].cc;
61706178

61716179
int mmq_x, mmq_y, nwarps;
61726180
if (compute_capability >= CC_RDNA2) {
@@ -6212,7 +6220,7 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
62126220

62136221
int id;
62146222
CUDA_CHECK(cudaGetDevice(&id));
6215-
const int compute_capability = g_compute_capabilities[id];
6223+
const int compute_capability = g_device_caps[id].cc;
62166224

62176225
int mmq_x, mmq_y, nwarps;
62186226
if (compute_capability >= CC_RDNA2) {
@@ -6257,7 +6265,7 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
62576265

62586266
int id;
62596267
CUDA_CHECK(cudaGetDevice(&id));
6260-
const int compute_capability = g_compute_capabilities[id];
6268+
const int compute_capability = g_device_caps[id].cc;
62616269

62626270
int mmq_x, mmq_y, nwarps;
62636271
if (compute_capability >= CC_RDNA2) {
@@ -6302,7 +6310,7 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
63026310

63036311
int id;
63046312
CUDA_CHECK(cudaGetDevice(&id));
6305-
const int compute_capability = g_compute_capabilities[id];
6313+
const int compute_capability = g_device_caps[id].cc;
63066314

63076315
int mmq_x, mmq_y, nwarps;
63086316
if (compute_capability >= CC_RDNA2) {
@@ -6660,23 +6668,18 @@ static void * ggml_cuda_pool_malloc_vmm(size_t size, size_t * actual_size) {
66606668
size_t avail = g_cuda_pool_size[id] - g_cuda_pool_used[id];
66616669

66626670
if (size > avail) {
6671+
// round up to the next multiple of the granularity
66636672
size_t reserve_size = size - avail;
6673+
const size_t granularity = g_device_caps[id].vmm_granularity;
6674+
reserve_size = granularity * ((reserve_size + granularity - 1) / granularity);
6675+
6676+
GGML_ASSERT(g_cuda_pool_size[id] + reserve_size <= CUDA_POOL_VMM_MAX_SIZE);
66646677

66656678
// allocate more physical memory
66666679
CUmemAllocationProp prop = {};
66676680
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
66686681
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
66696682
prop.location.id = id;
6670-
6671-
// get the minimum allocation granularity for this device
6672-
size_t granularity;
6673-
CU_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
6674-
6675-
// round up to the next multiple of the granularity
6676-
reserve_size = granularity * ((reserve_size + granularity - 1) / granularity);
6677-
6678-
GGML_ASSERT(g_cuda_pool_size[id] + reserve_size <= CUDA_POOL_VMM_MAX_SIZE);
6679-
66806683
CUmemGenericAllocationHandle handle;
66816684
CU_CHECK(cuMemCreate(&handle, reserve_size, &prop, 0));
66826685

@@ -6732,12 +6735,10 @@ static void ggml_cuda_pool_free_vmm(void * ptr, size_t size) {
67326735
GGML_ASSERT(ptr == (void *) (g_cuda_pool_addr[id] + g_cuda_pool_used[id]));
67336736
}
67346737

6735-
static bool g_device_vmm[GGML_CUDA_MAX_DEVICES] = {false};
6736-
67376738
static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
67386739
int id;
67396740
CUDA_CHECK(cudaGetDevice(&id));
6740-
if (g_device_vmm[id]) {
6741+
if (g_device_caps[id].vmm) {
67416742
return ggml_cuda_pool_malloc_vmm(size, actual_size);
67426743
} else {
67436744
return ggml_cuda_pool_malloc_leg(size, actual_size);
@@ -6747,7 +6748,7 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
67476748
static void ggml_cuda_pool_free(void * ptr, size_t size) {
67486749
int id;
67496750
CUDA_CHECK(cudaGetDevice(&id));
6750-
if (g_device_vmm[id]) {
6751+
if (g_device_caps[id].vmm) {
67516752
ggml_cuda_pool_free_vmm(ptr, size);
67526753
} else {
67536754
ggml_cuda_pool_free_leg(ptr, size);
@@ -6802,8 +6803,16 @@ void ggml_init_cublas() {
68026803
CUdevice device;
68036804
CU_CHECK(cuDeviceGet(&device, id));
68046805
CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
6805-
g_device_vmm[id] = !!device_vmm;
6806+
6807+
if (device_vmm) {
6808+
CUmemAllocationProp alloc_prop = {};
6809+
alloc_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
6810+
alloc_prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
6811+
alloc_prop.location.id = id;
6812+
CU_CHECK(cuMemGetAllocationGranularity(&g_device_caps[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
6813+
}
68066814
#endif
6815+
g_device_caps[id].vmm = !!device_vmm;
68076816

68086817
cudaDeviceProp prop;
68096818
CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
@@ -6812,9 +6821,9 @@ void ggml_init_cublas() {
68126821
g_tensor_split[id] = total_vram;
68136822
total_vram += prop.totalGlobalMem;
68146823
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
6815-
g_compute_capabilities[id] = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
6824+
g_device_caps[id] = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
68166825
#else
6817-
g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
6826+
g_device_caps[id].cc = 100*prop.major + 10*prop.minor;
68186827
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
68196828
}
68206829
for (int id = 0; id < g_device_count; ++id) {
@@ -7324,11 +7333,11 @@ static int64_t get_row_rounding(ggml_type type) {
73247333
int64_t max_compute_capability = INT_MIN;
73257334
for (int64_t id = 0; id < g_device_count; ++id) {
73267335
if (g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
7327-
if (min_compute_capability > g_compute_capabilities[id]) {
7328-
min_compute_capability = g_compute_capabilities[id];
7336+
if (min_compute_capability > g_device_caps[id].cc) {
7337+
min_compute_capability = g_device_caps[id].cc;
73297338
}
7330-
if (max_compute_capability < g_compute_capabilities[id]) {
7331-
max_compute_capability = g_compute_capabilities[id];
7339+
if (max_compute_capability < g_device_caps[id].cc) {
7340+
max_compute_capability = g_device_caps[id].cc;
73327341
}
73337342
}
73347343
}
@@ -7536,7 +7545,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
75367545
// ldc == nrows of the matrix that cuBLAS writes into
75377546
int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
75387547

7539-
const int compute_capability = g_compute_capabilities[id];
7548+
const int compute_capability = g_device_caps[id].cc;
75407549

75417550
if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
75427551
// convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
@@ -8671,8 +8680,8 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
86718680

86728681
int64_t min_compute_capability = INT_MAX;
86738682
for (int64_t id = 0; id < g_device_count; ++id) {
8674-
if (min_compute_capability > g_compute_capabilities[id] && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
8675-
min_compute_capability = g_compute_capabilities[id];
8683+
if (min_compute_capability > g_device_caps[id].cc && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
8684+
min_compute_capability = g_device_caps[id].cc;
86768685
}
86778686
}
86788687

0 commit comments

Comments (0)