Skip to content

Commit 1dd2fa0

Browse files
Lijo Lazar authored and alexdeucher committed
drm/amdgpu: Save and restore switch state
During a DPC error, the kernel waits for the link to be active before notifying downstream devices. On certain platforms with a Broadcom switch in synthetic mode, the switch responds with values even though the link is not fully ready. The config space restoration done by the PCIe port driver for the SWUS/DS of the dGPU is thus not effective, as the switch is still doing internal enumeration. As a workaround, save the state of the SWUS/DS devices in the driver. Add an additional check to see if the link is active and restore the values during DPC error callbacks. Signed-off-by: Lijo Lazar <[email protected]> Reviewed-by: Yang Wang <[email protected]> Signed-off-by: Alex Deucher <[email protected]>
1 parent 111821e commit 1dd2fa0

File tree

2 files changed

+83
-5
lines changed

2 files changed

+83
-5
lines changed

drivers/gpu/drm/amd/amdgpu/amdgpu.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -910,6 +910,9 @@ struct amdgpu_pcie_reset_ctx {
910910
bool in_link_reset;
911911
bool occurs_dpc;
912912
bool audio_suspended;
913+
struct pci_dev *swus;
914+
struct pci_saved_state *swus_pcistate;
915+
struct pci_saved_state *swds_pcistate;
913916
};
914917

915918
/*

drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

Lines changed: 80 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,8 @@ struct amdgpu_init_level amdgpu_init_minimal_xgmi = {
178178
BIT(AMD_IP_BLOCK_TYPE_PSP)
179179
};
180180

181+
static void amdgpu_device_load_switch_state(struct amdgpu_device *adev);
182+
181183
static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev,
182184
enum amd_ip_block_type block)
183185
{
@@ -5013,7 +5015,8 @@ void amdgpu_device_fini_sw(struct amdgpu_device *adev)
50135015
adev->reset_domain = NULL;
50145016

50155017
kfree(adev->pci_state);
5016-
5018+
kfree(adev->pcie_reset_ctx.swds_pcistate);
5019+
kfree(adev->pcie_reset_ctx.swus_pcistate);
50175020
}
50185021

50195022
/**
@@ -6986,16 +6989,34 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
69866989
struct amdgpu_device *tmp_adev;
69876990
struct amdgpu_hive_info *hive;
69886991
struct list_head device_list;
6989-
int r = 0, i;
6992+
struct pci_dev *link_dev;
6993+
int r = 0, i, timeout;
69906994
u32 memsize;
6995+
u16 status;
69916996

69926997
dev_info(adev->dev, "PCI error: slot reset callback!!\n");
69936998

69946999
memset(&reset_context, 0, sizeof(reset_context));
69957000

6996-
/* wait for asic to come out of reset */
6997-
msleep(700);
7001+
if (adev->pcie_reset_ctx.swus)
7002+
link_dev = adev->pcie_reset_ctx.swus;
7003+
else
7004+
link_dev = adev->pdev;
7005+
/* wait for asic to come out of reset, timeout = 10s */
7006+
timeout = 10000;
7007+
do {
7008+
usleep_range(10000, 10500);
7009+
r = pci_read_config_word(link_dev, PCI_VENDOR_ID, &status);
7010+
timeout -= 10;
7011+
} while (timeout > 0 && (status != PCI_VENDOR_ID_ATI) &&
7012+
(status != PCI_VENDOR_ID_AMD));
69987013

7014+
if ((status != PCI_VENDOR_ID_ATI) && (status != PCI_VENDOR_ID_AMD)) {
7015+
r = -ETIME;
7016+
goto out;
7017+
}
7018+
7019+
amdgpu_device_load_switch_state(adev);
69997020
/* Restore PCI confspace */
70007021
amdgpu_device_load_pci_state(pdev);
70017022

@@ -7097,6 +7118,58 @@ void amdgpu_pci_resume(struct pci_dev *pdev)
70977118
}
70987119
}
70997120

7121+
static void amdgpu_device_cache_switch_state(struct amdgpu_device *adev)
7122+
{
7123+
struct pci_dev *parent = pci_upstream_bridge(adev->pdev);
7124+
int r;
7125+
7126+
if (parent->vendor != PCI_VENDOR_ID_ATI)
7127+
return;
7128+
7129+
/* If already saved, return */
7130+
if (adev->pcie_reset_ctx.swus)
7131+
return;
7132+
/* Upstream bridge is ATI, assume it's SWUS/DS architecture */
7133+
r = pci_save_state(parent);
7134+
if (r)
7135+
return;
7136+
adev->pcie_reset_ctx.swds_pcistate = pci_store_saved_state(parent);
7137+
7138+
parent = pci_upstream_bridge(parent);
7139+
r = pci_save_state(parent);
7140+
if (r)
7141+
return;
7142+
adev->pcie_reset_ctx.swus_pcistate = pci_store_saved_state(parent);
7143+
7144+
adev->pcie_reset_ctx.swus = parent;
7145+
}
7146+
7147+
static void amdgpu_device_load_switch_state(struct amdgpu_device *adev)
7148+
{
7149+
struct pci_dev *pdev;
7150+
int r;
7151+
7152+
if (!adev->pcie_reset_ctx.swds_pcistate ||
7153+
!adev->pcie_reset_ctx.swus_pcistate)
7154+
return;
7155+
7156+
pdev = adev->pcie_reset_ctx.swus;
7157+
r = pci_load_saved_state(pdev, adev->pcie_reset_ctx.swus_pcistate);
7158+
if (!r) {
7159+
pci_restore_state(pdev);
7160+
} else {
7161+
dev_warn(adev->dev, "Failed to load SWUS state, err:%d\n", r);
7162+
return;
7163+
}
7164+
7165+
pdev = pci_upstream_bridge(adev->pdev);
7166+
r = pci_load_saved_state(pdev, adev->pcie_reset_ctx.swds_pcistate);
7167+
if (!r)
7168+
pci_restore_state(pdev);
7169+
else
7170+
dev_warn(adev->dev, "Failed to load SWDS state, err:%d\n", r);
7171+
}
7172+
71007173
bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
71017174
{
71027175
struct drm_device *dev = pci_get_drvdata(pdev);
@@ -7121,6 +7194,8 @@ bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
71217194
return false;
71227195
}
71237196

7197+
amdgpu_device_cache_switch_state(adev);
7198+
71247199
return true;
71257200
}
71267201

@@ -7556,4 +7631,4 @@ u64 amdgpu_device_get_uid(struct amdgpu_uid *uid_info,
75567631
}
75577632

75587633
return uid_info->uid[type][inst];
7559-
}
7634+
}

0 commit comments

Comments
 (0)