Skip to content

Commit 89773b8

Browse files
Lang Yu authored and alexdeucher committed
drm/amdkfd: Let VRAM allocations go to GTT domain on small APUs
Small APUs (i.e., consumer and embedded products) usually have a small carveout of device memory that can't satisfy the memory allocation requirements of most compute workloads. We can't even run the basic MNIST example (https://github.com/pytorch/examples/tree/main/mnist) with a default 512MB carveout. Error Log: "torch.cuda.OutOfMemoryError: HIP out of memory. Tried to allocate 84.00 MiB. GPU 0 has a total capacity of 512.00 MiB of which 0 bytes is free. Of the allocated memory 103.83 MiB is allocated by PyTorch, and 22.17 MiB is reserved by PyTorch but unallocated" Although we can change BIOS settings to enlarge the carveout size, this is inflexible and may draw complaints. In addition, with a fixed carveout the memory can't be used effectively between host and device. The solution is the MI300A approach, i.e., let VRAM allocations go to GTT. The device and host can then share memory flexibly and effectively. v2: Report local_mem_size_private as 0. (Felix) Signed-off-by: Lang Yu <[email protected]> Reviewed-by: Felix Kuehling <[email protected]> Signed-off-by: Alex Deucher <[email protected]>
1 parent 3b3c9e8 commit 89773b8

File tree

5 files changed

+23
-13
lines changed

5 files changed

+23
-13
lines changed

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -455,6 +455,9 @@ void amdgpu_amdkfd_get_local_mem_info(struct amdgpu_device *adev,
455455
else
456456
mem_info->local_mem_size_private =
457457
KFD_XCP_MEMORY_SIZE(adev, xcp->id);
458+
} else if (adev->flags & AMD_IS_APU) {
459+
mem_info->local_mem_size_public = (ttm_tt_pages_limit() << PAGE_SHIFT);
460+
mem_info->local_mem_size_private = 0;
458461
} else {
459462
mem_info->local_mem_size_public = adev->gmc.visible_vram_size;
460463
mem_info->local_mem_size_private = adev->gmc.real_vram_size -
@@ -824,6 +827,8 @@ u64 amdgpu_amdkfd_xcp_memory_size(struct amdgpu_device *adev, int xcp_id)
824827
}
825828
do_div(tmp, adev->xcp_mgr->num_xcp_per_mem_partition);
826829
return ALIGN_DOWN(tmp, PAGE_SIZE);
830+
} else if (adev->flags & AMD_IS_APU) {
831+
return (ttm_tt_pages_limit() << PAGE_SHIFT);
827832
} else {
828833
return adev->gmc.real_vram_size;
829834
}

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,7 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
196196
return -EINVAL;
197197

198198
vram_size = KFD_XCP_MEMORY_SIZE(adev, xcp_id);
199-
if (adev->gmc.is_app_apu) {
199+
if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) {
200200
system_mem_needed = size;
201201
ttm_mem_needed = size;
202202
}
@@ -232,7 +232,8 @@ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
232232
"adev reference can't be null when vram is used");
233233
if (adev && xcp_id >= 0) {
234234
adev->kfd.vram_used[xcp_id] += vram_needed;
235-
adev->kfd.vram_used_aligned[xcp_id] += adev->gmc.is_app_apu ?
235+
adev->kfd.vram_used_aligned[xcp_id] +=
236+
(adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) ?
236237
vram_needed :
237238
ALIGN(vram_needed, VRAM_AVAILABLITY_ALIGN);
238239
}
@@ -260,7 +261,7 @@ void amdgpu_amdkfd_unreserve_mem_limit(struct amdgpu_device *adev,
260261

261262
if (adev) {
262263
adev->kfd.vram_used[xcp_id] -= size;
263-
if (adev->gmc.is_app_apu) {
264+
if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) {
264265
adev->kfd.vram_used_aligned[xcp_id] -= size;
265266
kfd_mem_limit.system_mem_used -= size;
266267
kfd_mem_limit.ttm_mem_used -= size;
@@ -889,7 +890,7 @@ static int kfd_mem_attach(struct amdgpu_device *adev, struct kgd_mem *mem,
889890
* if peer device has large BAR. In contrast, access over xGMI is
890891
* allowed for both small and large BAR configurations of peer device
891892
*/
892-
if ((adev != bo_adev && !adev->gmc.is_app_apu) &&
893+
if ((adev != bo_adev && !(adev->gmc.is_app_apu || adev->flags & AMD_IS_APU)) &&
893894
((mem->domain == AMDGPU_GEM_DOMAIN_VRAM) ||
894895
(mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL) ||
895896
(mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP))) {
@@ -1674,7 +1675,7 @@ size_t amdgpu_amdkfd_get_available_memory(struct amdgpu_device *adev,
16741675
- atomic64_read(&adev->vram_pin_size)
16751676
- reserved_for_pt;
16761677

1677-
if (adev->gmc.is_app_apu) {
1678+
if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) {
16781679
system_mem_available = no_system_mem_limit ?
16791680
kfd_mem_limit.max_system_mem_limit :
16801681
kfd_mem_limit.max_system_mem_limit -
@@ -1722,7 +1723,7 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
17221723
if (flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
17231724
domain = alloc_domain = AMDGPU_GEM_DOMAIN_VRAM;
17241725

1725-
if (adev->gmc.is_app_apu) {
1726+
if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) {
17261727
domain = AMDGPU_GEM_DOMAIN_GTT;
17271728
alloc_domain = AMDGPU_GEM_DOMAIN_GTT;
17281729
alloc_flags = 0;
@@ -1973,7 +1974,7 @@ int amdgpu_amdkfd_gpuvm_free_memory_of_gpu(
19731974
if (size) {
19741975
if (!is_imported &&
19751976
(mem->bo->preferred_domains == AMDGPU_GEM_DOMAIN_VRAM ||
1976-
(adev->gmc.is_app_apu &&
1977+
((adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) &&
19771978
mem->bo->preferred_domains == AMDGPU_GEM_DOMAIN_GTT)))
19781979
*size = bo_size;
19791980
else
@@ -2395,8 +2396,9 @@ static int import_obj_create(struct amdgpu_device *adev,
23952396
(*mem)->dmabuf = dma_buf;
23962397
(*mem)->bo = bo;
23972398
(*mem)->va = va;
2398-
(*mem)->domain = (bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM) && !adev->gmc.is_app_apu ?
2399-
AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT;
2399+
(*mem)->domain = (bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM) &&
2400+
!(adev->gmc.is_app_apu || adev->flags & AMD_IS_APU) ?
2401+
AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT;
24002402

24012403
(*mem)->mapped_to_gpu_memory = 0;
24022404
(*mem)->process_info = avm->process_info;

drivers/gpu/drm/amd/amdkfd/kfd_migrate.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1023,7 +1023,7 @@ int kgd2kfd_init_zone_device(struct amdgpu_device *adev)
10231023
if (amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(9, 0, 1))
10241024
return -EINVAL;
10251025

1026-
if (adev->gmc.is_app_apu)
1026+
if (adev->gmc.is_app_apu || adev->flags & AMD_IS_APU)
10271027
return 0;
10281028

10291029
pgmap = &kfddev->pgmap;

drivers/gpu/drm/amd/amdkfd/kfd_svm.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2631,7 +2631,8 @@ svm_range_best_restore_location(struct svm_range *prange,
26312631
return -1;
26322632
}
26332633

2634-
if (node->adev->gmc.is_app_apu)
2634+
if (node->adev->gmc.is_app_apu ||
2635+
node->adev->flags & AMD_IS_APU)
26352636
return 0;
26362637

26372638
if (prange->preferred_loc == gpuid ||
@@ -3349,7 +3350,8 @@ svm_range_best_prefetch_location(struct svm_range *prange)
33493350
goto out;
33503351
}
33513352

3352-
if (bo_node->adev->gmc.is_app_apu) {
3353+
if (bo_node->adev->gmc.is_app_apu ||
3354+
bo_node->adev->flags & AMD_IS_APU) {
33533355
best_loc = 0;
33543356
goto out;
33553357
}

drivers/gpu/drm/amd/amdkfd/kfd_svm.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -201,7 +201,8 @@ void svm_range_list_lock_and_flush_work(struct svm_range_list *svms, struct mm_s
201201
* is initialized to not 0 when page migration register device memory.
202202
*/
203203
#define KFD_IS_SVM_API_SUPPORTED(adev) ((adev)->kfd.pgmap.type != 0 ||\
204-
(adev)->gmc.is_app_apu)
204+
(adev)->gmc.is_app_apu ||\
205+
((adev)->flags & AMD_IS_APU))
205206

206207
void svm_range_bo_unref_async(struct svm_range_bo *svm_bo);
207208

0 commit comments

Comments
 (0)