Skip to content

Commit e54478f

Browse files
committed
Merge tag 'amd-drm-next-6.8-2024-01-05' of https://gitlab.freedesktop.org/agd5f/linux into drm-next
amd-drm-next-6.8-2024-01-05:

amdgpu:
- VRR fixes
- PSR-SU fixes
- SubVP fixes
- DCN 3.5 fixes
- Documentation updates
- DMCUB fixes
- DML2 fixes
- UMC 12.0 updates
- GPUVM fix
- Misc code cleanups and whitespace cleanups
- DP MST fix
- Let KFD sync with GPUVM fences
- GFX11 reset fix
- SMU 13.0.6 fixes
- VSC fix for DP/eDP
- Navi12 display fix
- RN/CZN system aperture fix
- DCN 2.1 bandwidth validation fix
- DCN INIT cleanup

amdkfd:
- SVM fixes
- Revert TBA/TMA location change

Signed-off-by: Dave Airlie <[email protected]>
From: Alex Deucher <[email protected]>
Link: https://patchwork.freedesktop.org/patch/msgid/[email protected]
2 parents 3c064ae + 754d349 commit e54478f

File tree

198 files changed

+4093
-2111
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

198 files changed

+4093
-2111
lines changed

Documentation/gpu/amdgpu/apu-asic-info-table.csv

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ SteamDeck, VANGOGH, DCN 3.0.1, 10.3.1, VCN 3.1.0, 5.2.1, 11.5.0
77
Ryzen 5000 series / Ryzen 7x30 series, GREEN SARDINE / Cezanne / Barcelo / Barcelo-R, DCN 2.1, 9.3, VCN 2.2, 4.1.1, 12.0.1
88
Ryzen 6000 series / Ryzen 7x35 series / Ryzen 7x36 series, YELLOW CARP / Rembrandt / Rembrandt-R, 3.1.2, 10.3.3, VCN 3.1.1, 5.2.3, 13.0.3
99
Ryzen 7000 series (AM5), Raphael, 3.1.5, 10.3.6, 3.1.2, 5.2.6, 13.0.5
10-
Ryzen 7x45 series (FL1), / Dragon Range, 3.1.5, 10.3.6, 3.1.2, 5.2.6, 13.0.5
10+
Ryzen 7x45 series (FL1), Dragon Range, 3.1.5, 10.3.6, 3.1.2, 5.2.6, 13.0.5
1111
Ryzen 7x20 series, Mendocino, 3.1.6, 10.3.7, 3.1.1, 5.2.7, 13.0.8
12-
Ryzen 7x40 series, Phoenix, 3.1.4, 11.0.1 / 11.0.4, 4.0.2, 6.0.1, 13.0.4 / 13.0.11
12+
Ryzen 7x40 series, Phoenix, 3.1.4, 11.0.1 / 11.0.4, 4.0.2, 6.0.1, 13.0.4 / 13.0.11
13+
Ryzen 8x40 series, Hawk Point, 3.1.4, 11.0.1 / 11.0.4, 4.0.2, 6.0.1, 13.0.4 / 13.0.11

drivers/gpu/drm/amd/amdgpu/aldebaran.c

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -330,6 +330,7 @@ aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
330330
{
331331
struct list_head *reset_device_list = reset_context->reset_device_list;
332332
struct amdgpu_device *tmp_adev = NULL;
333+
struct amdgpu_ras *con;
333334
int r;
334335

335336
if (reset_device_list == NULL)
@@ -355,7 +356,30 @@ aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
355356
*/
356357
amdgpu_register_gpu_instance(tmp_adev);
357358

358-
/* Resume RAS */
359+
/* Resume RAS, ecc_irq */
360+
con = amdgpu_ras_get_context(tmp_adev);
361+
if (!amdgpu_sriov_vf(tmp_adev) && con) {
362+
if (tmp_adev->sdma.ras &&
363+
tmp_adev->sdma.ras->ras_block.ras_late_init) {
364+
r = tmp_adev->sdma.ras->ras_block.ras_late_init(tmp_adev,
365+
&tmp_adev->sdma.ras->ras_block.ras_comm);
366+
if (r) {
367+
dev_err(tmp_adev->dev, "SDMA failed to execute ras_late_init! ret:%d\n", r);
368+
goto end;
369+
}
370+
}
371+
372+
if (tmp_adev->gfx.ras &&
373+
tmp_adev->gfx.ras->ras_block.ras_late_init) {
374+
r = tmp_adev->gfx.ras->ras_block.ras_late_init(tmp_adev,
375+
&tmp_adev->gfx.ras->ras_block.ras_comm);
376+
if (r) {
377+
dev_err(tmp_adev->dev, "GFX failed to execute ras_late_init! ret:%d\n", r);
378+
goto end;
379+
}
380+
}
381+
}
382+
359383
amdgpu_ras_resume(tmp_adev);
360384

361385
/* Update PSP FW topology after reset */

drivers/gpu/drm/amd/amdgpu/amdgpu.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -254,6 +254,8 @@ extern int amdgpu_agp;
254254

255255
extern int amdgpu_wbrf;
256256

257+
extern int fw_bo_location;
258+
257259
#define AMDGPU_VM_MAX_NUM_CTX 4096
258260
#define AMDGPU_SG_THRESHOLD (256*1024*1024)
259261
#define AMDGPU_WAIT_IDLE_TIMEOUT_IN_MS 3000

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_fence.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct dma_fence *f)
9090
return NULL;
9191

9292
fence = container_of(f, struct amdgpu_amdkfd_fence, base);
93-
if (fence && f->ops == &amdkfd_fence_ops)
93+
if (f->ops == &amdkfd_fence_ops)
9494
return fence;
9595

9696
return NULL;

drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1103,7 +1103,7 @@ amdgpu_connector_dvi_detect(struct drm_connector *connector, bool force)
11031103
* DDC line. The latter is more complex because with DVI<->HDMI adapters
11041104
* you don't really know what's connected to which port as both are digital.
11051105
*/
1106-
amdgpu_connector_shared_ddc(&ret, connector, amdgpu_connector);
1106+
amdgpu_connector_shared_ddc(&ret, connector, amdgpu_connector);
11071107
}
11081108
}
11091109

drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -870,9 +870,9 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
870870
struct amdgpu_bo *bo = e->bo;
871871
int i;
872872

873-
e->user_pages = kvmalloc_array(bo->tbo.ttm->num_pages,
874-
sizeof(struct page *),
875-
GFP_KERNEL | __GFP_ZERO);
873+
e->user_pages = kvcalloc(bo->tbo.ttm->num_pages,
874+
sizeof(struct page *),
875+
GFP_KERNEL);
876876
if (!e->user_pages) {
877877
DRM_ERROR("kvmalloc_array failure\n");
878878
r = -ENOMEM;

drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -540,7 +540,11 @@ static ssize_t amdgpu_debugfs_regs_pcie_read(struct file *f, char __user *buf,
540540
while (size) {
541541
uint32_t value;
542542

543-
value = RREG32_PCIE(*pos);
543+
if (upper_32_bits(*pos))
544+
value = RREG32_PCIE_EXT(*pos);
545+
else
546+
value = RREG32_PCIE(*pos);
547+
544548
r = put_user(value, (uint32_t *)buf);
545549
if (r)
546550
goto out;
@@ -600,7 +604,10 @@ static ssize_t amdgpu_debugfs_regs_pcie_write(struct file *f, const char __user
600604
if (r)
601605
goto out;
602606

603-
WREG32_PCIE(*pos, value);
607+
if (upper_32_bits(*pos))
608+
WREG32_PCIE_EXT(*pos, value);
609+
else
610+
WREG32_PCIE(*pos, value);
604611

605612
result += 4;
606613
buf += 4;

drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2251,15 +2251,8 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
22512251

22522252
adev->firmware.gpu_info_fw = NULL;
22532253

2254-
if (adev->mman.discovery_bin) {
2255-
/*
2256-
* FIXME: The bounding box is still needed by Navi12, so
2257-
* temporarily read it from gpu_info firmware. Should be dropped
2258-
* when DAL no longer needs it.
2259-
*/
2260-
if (adev->asic_type != CHIP_NAVI12)
2261-
return 0;
2262-
}
2254+
if (adev->mman.discovery_bin)
2255+
return 0;
22632256

22642257
switch (adev->asic_type) {
22652258
default:

drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,7 @@ int amdgpu_seamless = -1; /* auto */
210210
uint amdgpu_debug_mask;
211211
int amdgpu_agp = -1; /* auto */
212212
int amdgpu_wbrf = -1;
213+
int fw_bo_location = -1;
213214

214215
static void amdgpu_drv_delayed_reset_work_handler(struct work_struct *work);
215216

@@ -989,6 +990,10 @@ MODULE_PARM_DESC(wbrf,
989990
"Enable Wifi RFI interference mitigation (0 = disabled, 1 = enabled, -1 = auto(default)");
990991
module_param_named(wbrf, amdgpu_wbrf, int, 0444);
991992

993+
MODULE_PARM_DESC(fw_bo_location,
994+
"location to put firmware bo for frontdoor loading (-1 = auto (default), 0 = on ram, 1 = on vram");
995+
module_param(fw_bo_location, int, 0644);
996+
992997
/* These devices are not supported by amdgpu.
993998
* They are supported by the mach64, r128, radeon drivers
994999
*/

drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,7 @@ static void amdgpu_mca_smu_mca_bank_dump(struct amdgpu_device *adev, int idx, st
218218
int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type, struct ras_err_data *err_data)
219219
{
220220
struct amdgpu_smuio_mcm_config_info mcm_info;
221+
struct ras_err_addr err_addr = {0};
221222
struct mca_bank_set mca_set;
222223
struct mca_bank_node *node;
223224
struct mca_bank_entry *entry;
@@ -246,10 +247,18 @@ int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_blo
246247
mcm_info.socket_id = entry->info.socket_id;
247248
mcm_info.die_id = entry->info.aid;
248249

250+
if (blk == AMDGPU_RAS_BLOCK__UMC) {
251+
err_addr.err_status = entry->regs[MCA_REG_IDX_STATUS];
252+
err_addr.err_ipid = entry->regs[MCA_REG_IDX_IPID];
253+
err_addr.err_addr = entry->regs[MCA_REG_IDX_ADDR];
254+
}
255+
249256
if (type == AMDGPU_MCA_ERROR_TYPE_UE)
250-
amdgpu_ras_error_statistic_ue_count(err_data, &mcm_info, (uint64_t)count);
257+
amdgpu_ras_error_statistic_ue_count(err_data,
258+
&mcm_info, &err_addr, (uint64_t)count);
251259
else
252-
amdgpu_ras_error_statistic_ce_count(err_data, &mcm_info, (uint64_t)count);
260+
amdgpu_ras_error_statistic_ce_count(err_data,
261+
&mcm_info, &err_addr, (uint64_t)count);
253262
}
254263

255264
out_mca_release:
@@ -351,6 +360,9 @@ int amdgpu_mca_smu_get_mca_entry(struct amdgpu_device *adev, enum amdgpu_mca_err
351360
const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
352361
int count;
353362

363+
if (!mca_funcs || !mca_funcs->mca_get_mca_entry)
364+
return -EOPNOTSUPP;
365+
354366
switch (type) {
355367
case AMDGPU_MCA_ERROR_TYPE_UE:
356368
count = mca_funcs->max_ue_count;
@@ -365,10 +377,7 @@ int amdgpu_mca_smu_get_mca_entry(struct amdgpu_device *adev, enum amdgpu_mca_err
365377
if (idx >= count)
366378
return -EINVAL;
367379

368-
if (mca_funcs && mca_funcs->mca_get_mca_entry)
369-
return mca_funcs->mca_get_mca_entry(adev, type, idx, entry);
370-
371-
return -EOPNOTSUPP;
380+
return mca_funcs->mca_get_mca_entry(adev, type, idx, entry);
372381
}
373382

374383
#if defined(CONFIG_DEBUG_FS)

drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -466,7 +466,7 @@ static int psp_sw_init(void *handle)
466466
}
467467

468468
ret = amdgpu_bo_create_kernel(adev, PSP_1_MEG, PSP_1_MEG,
469-
amdgpu_sriov_vf(adev) ?
469+
(amdgpu_sriov_vf(adev) || fw_bo_location == 1) ?
470470
AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT,
471471
&psp->fw_pri_bo,
472472
&psp->fw_pri_mc_addr,

drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

Lines changed: 22 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1156,8 +1156,10 @@ static void amdgpu_rasmgr_error_data_statistic_update(struct ras_manager *obj, s
11561156
for_each_ras_error(err_node, err_data) {
11571157
err_info = &err_node->err_info;
11581158

1159-
amdgpu_ras_error_statistic_ce_count(&obj->err_data, &err_info->mcm_info, err_info->ce_count);
1160-
amdgpu_ras_error_statistic_ue_count(&obj->err_data, &err_info->mcm_info, err_info->ue_count);
1159+
amdgpu_ras_error_statistic_ce_count(&obj->err_data,
1160+
&err_info->mcm_info, NULL, err_info->ce_count);
1161+
amdgpu_ras_error_statistic_ue_count(&obj->err_data,
1162+
&err_info->mcm_info, NULL, err_info->ue_count);
11611163
}
11621164
} else {
11631165
/* for legacy asic path which doesn't have error source info */
@@ -1174,6 +1176,9 @@ static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
11741176
enum amdgpu_ras_block blk = info ? info->head.block : AMDGPU_RAS_BLOCK_COUNT;
11751177
struct amdgpu_ras_block_object *block_obj = NULL;
11761178

1179+
if (blk == AMDGPU_RAS_BLOCK_COUNT)
1180+
return -EINVAL;
1181+
11771182
if (error_query_mode == AMDGPU_RAS_INVALID_ERROR_QUERY)
11781183
return -EINVAL;
11791184

@@ -2538,7 +2543,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
25382543
return 0;
25392544

25402545
data = &con->eh_data;
2541-
*data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO);
2546+
*data = kzalloc(sizeof(**data), GFP_KERNEL);
25422547
if (!*data) {
25432548
ret = -ENOMEM;
25442549
goto out;
@@ -2825,10 +2830,10 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
28252830
if (con)
28262831
return 0;
28272832

2828-
con = kmalloc(sizeof(struct amdgpu_ras) +
2833+
con = kzalloc(sizeof(*con) +
28292834
sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT +
28302835
sizeof(struct ras_manager) * AMDGPU_RAS_MCA_BLOCK_COUNT,
2831-
GFP_KERNEL|__GFP_ZERO);
2836+
GFP_KERNEL);
28322837
if (!con)
28332838
return -ENOMEM;
28342839

@@ -3133,8 +3138,7 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
31333138
if (amdgpu_sriov_vf(adev))
31343139
return 0;
31353140

3136-
/* enable MCA debug on APU device */
3137-
amdgpu_ras_set_mca_debug_mode(adev, !!(adev->flags & AMD_IS_APU));
3141+
amdgpu_ras_set_mca_debug_mode(adev, false);
31383142

31393143
list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
31403144
if (!node->ras_obj) {
@@ -3691,7 +3695,8 @@ static int ras_err_info_cmp(void *priv, const struct list_head *a, const struct
36913695
}
36923696

36933697
static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_data,
3694-
struct amdgpu_smuio_mcm_config_info *mcm_info)
3698+
struct amdgpu_smuio_mcm_config_info *mcm_info,
3699+
struct ras_err_addr *err_addr)
36953700
{
36963701
struct ras_err_node *err_node;
36973702

@@ -3705,6 +3710,9 @@ static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_d
37053710

37063711
memcpy(&err_node->err_info.mcm_info, mcm_info, sizeof(*mcm_info));
37073712

3713+
if (err_addr)
3714+
memcpy(&err_node->err_info.err_addr, err_addr, sizeof(*err_addr));
3715+
37083716
err_data->err_list_count++;
37093717
list_add_tail(&err_node->node, &err_data->err_node_list);
37103718
list_sort(NULL, &err_data->err_node_list, ras_err_info_cmp);
@@ -3713,7 +3721,8 @@ static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_d
37133721
}
37143722

37153723
int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
3716-
struct amdgpu_smuio_mcm_config_info *mcm_info, u64 count)
3724+
struct amdgpu_smuio_mcm_config_info *mcm_info,
3725+
struct ras_err_addr *err_addr, u64 count)
37173726
{
37183727
struct ras_err_info *err_info;
37193728

@@ -3723,7 +3732,7 @@ int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
37233732
if (!count)
37243733
return 0;
37253734

3726-
err_info = amdgpu_ras_error_get_info(err_data, mcm_info);
3735+
err_info = amdgpu_ras_error_get_info(err_data, mcm_info, err_addr);
37273736
if (!err_info)
37283737
return -EINVAL;
37293738

@@ -3734,7 +3743,8 @@ int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
37343743
}
37353744

37363745
int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data,
3737-
struct amdgpu_smuio_mcm_config_info *mcm_info, u64 count)
3746+
struct amdgpu_smuio_mcm_config_info *mcm_info,
3747+
struct ras_err_addr *err_addr, u64 count)
37383748
{
37393749
struct ras_err_info *err_info;
37403750

@@ -3744,7 +3754,7 @@ int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data,
37443754
if (!count)
37453755
return 0;
37463756

3747-
err_info = amdgpu_ras_error_get_info(err_data, mcm_info);
3757+
err_info = amdgpu_ras_error_get_info(err_data, mcm_info, err_addr);
37483758
if (!err_info)
37493759
return -EINVAL;
37503760

drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -452,10 +452,17 @@ struct ras_fs_data {
452452
char debugfs_name[32];
453453
};
454454

455+
struct ras_err_addr {
456+
uint64_t err_status;
457+
uint64_t err_ipid;
458+
uint64_t err_addr;
459+
};
460+
455461
struct ras_err_info {
456462
struct amdgpu_smuio_mcm_config_info mcm_info;
457463
u64 ce_count;
458464
u64 ue_count;
465+
struct ras_err_addr err_addr;
459466
};
460467

461468
struct ras_err_node {
@@ -806,8 +813,10 @@ void amdgpu_ras_inst_reset_ras_error_count(struct amdgpu_device *adev,
806813
int amdgpu_ras_error_data_init(struct ras_err_data *err_data);
807814
void amdgpu_ras_error_data_fini(struct ras_err_data *err_data);
808815
int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data,
809-
struct amdgpu_smuio_mcm_config_info *mcm_info, u64 count);
816+
struct amdgpu_smuio_mcm_config_info *mcm_info,
817+
struct ras_err_addr *err_addr, u64 count);
810818
int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
811-
struct amdgpu_smuio_mcm_config_info *mcm_info, u64 count);
819+
struct amdgpu_smuio_mcm_config_info *mcm_info,
820+
struct ras_err_addr *err_addr, u64 count);
812821

813822
#endif

drivers/gpu/drm/amd/amdgpu/amdgpu_rlc.c

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -531,13 +531,12 @@ int amdgpu_gfx_rlc_init_microcode(struct amdgpu_device *adev,
531531
if (version_major == 2 && version_minor == 1)
532532
adev->gfx.rlc.is_rlc_v2_1 = true;
533533

534-
if (version_minor >= 0) {
535-
err = amdgpu_gfx_rlc_init_microcode_v2_0(adev);
536-
if (err) {
537-
dev_err(adev->dev, "fail to init rlc v2_0 microcode\n");
538-
return err;
539-
}
534+
err = amdgpu_gfx_rlc_init_microcode_v2_0(adev);
535+
if (err) {
536+
dev_err(adev->dev, "fail to init rlc v2_0 microcode\n");
537+
return err;
540538
}
539+
541540
if (version_minor >= 1)
542541
amdgpu_gfx_rlc_init_microcode_v2_1(adev);
543542
if (version_minor >= 2)

drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -191,7 +191,8 @@ static bool amdgpu_sync_test_fence(struct amdgpu_device *adev,
191191

192192
/* Never sync to VM updates either. */
193193
if (fence_owner == AMDGPU_FENCE_OWNER_VM &&
194-
owner != AMDGPU_FENCE_OWNER_UNDEFINED)
194+
owner != AMDGPU_FENCE_OWNER_UNDEFINED &&
195+
owner != AMDGPU_FENCE_OWNER_KFD)
195196
return false;
196197

197198
/* Ignore fences depending on the sync mode */

0 commit comments

Comments
 (0)