summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_device.c299
1 files changed, 237 insertions, 62 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 01d234cf8156..a77000c2e0bb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -95,6 +95,7 @@ MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
+MODULE_FIRMWARE("amdgpu/cyan_skillfish_gpu_info.bin");
#define AMDGPU_RESUME_MS 2000
#define AMDGPU_MAX_RETRY_LIMIT 2
@@ -178,6 +179,8 @@ struct amdgpu_init_level amdgpu_init_minimal_xgmi = {
BIT(AMD_IP_BLOCK_TYPE_PSP)
};
+static void amdgpu_device_load_switch_state(struct amdgpu_device *adev);
+
static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev,
enum amd_ip_block_type block)
{
@@ -2445,6 +2448,33 @@ int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
return 1;
}
+static const char *ip_block_names[] = {
+ [AMD_IP_BLOCK_TYPE_COMMON] = "common",
+ [AMD_IP_BLOCK_TYPE_GMC] = "gmc",
+ [AMD_IP_BLOCK_TYPE_IH] = "ih",
+ [AMD_IP_BLOCK_TYPE_SMC] = "smu",
+ [AMD_IP_BLOCK_TYPE_PSP] = "psp",
+ [AMD_IP_BLOCK_TYPE_DCE] = "dce",
+ [AMD_IP_BLOCK_TYPE_GFX] = "gfx",
+ [AMD_IP_BLOCK_TYPE_SDMA] = "sdma",
+ [AMD_IP_BLOCK_TYPE_UVD] = "uvd",
+ [AMD_IP_BLOCK_TYPE_VCE] = "vce",
+ [AMD_IP_BLOCK_TYPE_ACP] = "acp",
+ [AMD_IP_BLOCK_TYPE_VCN] = "vcn",
+ [AMD_IP_BLOCK_TYPE_MES] = "mes",
+ [AMD_IP_BLOCK_TYPE_JPEG] = "jpeg",
+ [AMD_IP_BLOCK_TYPE_VPE] = "vpe",
+ [AMD_IP_BLOCK_TYPE_UMSCH_MM] = "umsch_mm",
+ [AMD_IP_BLOCK_TYPE_ISP] = "isp",
+};
+
+static const char *ip_block_name(struct amdgpu_device *adev, enum amd_ip_block_type type)
+{
+ int idx = (int)type;
+
+ return idx < ARRAY_SIZE(ip_block_names) ? ip_block_names[idx] : "unknown";
+}
+
/**
* amdgpu_device_ip_block_add
*
@@ -2473,8 +2503,13 @@ int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
break;
}
- dev_info(adev->dev, "detected ip block number %d <%s>\n",
- adev->num_ip_blocks, ip_block_version->funcs->name);
+ dev_info(adev->dev, "detected ip block number %d <%s_v%d_%d_%d> (%s)\n",
+ adev->num_ip_blocks,
+ ip_block_name(adev, ip_block_version->type),
+ ip_block_version->major,
+ ip_block_version->minor,
+ ip_block_version->rev,
+ ip_block_version->funcs->name);
adev->ip_blocks[adev->num_ip_blocks].adev = adev;
@@ -2595,6 +2630,9 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
return 0;
chip_name = "navi12";
break;
+ case CHIP_CYAN_SKILLFISH:
+ chip_name = "cyan_skillfish";
+ break;
}
err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw,
@@ -2674,6 +2712,24 @@ out:
return err;
}
+static void amdgpu_uid_init(struct amdgpu_device *adev)
+{
+ /* Initialize the UID for the device */
+ adev->uid_info = kzalloc(sizeof(struct amdgpu_uid), GFP_KERNEL);
+ if (!adev->uid_info) {
+ dev_warn(adev->dev, "Failed to allocate memory for UID\n");
+ return;
+ }
+ adev->uid_info->adev = adev;
+}
+
+static void amdgpu_uid_fini(struct amdgpu_device *adev)
+{
+ /* Free the UID memory */
+ kfree(adev->uid_info);
+ adev->uid_info = NULL;
+}
+
/**
* amdgpu_device_ip_early_init - run early init for hardware IPs
*
@@ -2857,6 +2913,8 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
if (adev->gmc.xgmi.supported)
amdgpu_xgmi_early_init(adev);
+ if (amdgpu_is_multi_aid(adev))
+ amdgpu_uid_init(adev);
ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
if (ip_block->status.valid != false)
amdgpu_amdkfd_device_probe(adev);
@@ -3389,7 +3447,7 @@ static int amdgpu_device_enable_mgpu_fan_boost(void)
for (i = 0; i < mgpu_info.num_dgpu; i++) {
gpu_ins = &(mgpu_info.gpu_ins[i]);
adev = gpu_ins->adev;
- if (!(adev->flags & AMD_IS_APU) &&
+ if (!(adev->flags & AMD_IS_APU || amdgpu_sriov_multi_vf_mode(adev)) &&
!gpu_ins->mgpu_fan_enabled) {
ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
if (ret)
@@ -3648,6 +3706,7 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
}
amdgpu_ras_fini(adev);
+ amdgpu_uid_fini(adev);
return 0;
}
@@ -4992,7 +5051,8 @@ void amdgpu_device_fini_sw(struct amdgpu_device *adev)
adev->reset_domain = NULL;
kfree(adev->pci_state);
-
+ kfree(adev->pcie_reset_ctx.swds_pcistate);
+ kfree(adev->pcie_reset_ctx.swus_pcistate);
}
/**
@@ -5012,6 +5072,10 @@ static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
if (!adev->in_s4 && (adev->flags & AMD_IS_APU))
return 0;
+ /* No need to evict when going to S5 through S4 callbacks */
+ if (system_state == SYSTEM_POWER_OFF)
+ return 0;
+
ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM);
if (ret) {
dev_warn(adev->dev, "evicting device resources failed\n");
@@ -5136,7 +5200,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
adev->in_suspend = true;
if (amdgpu_sriov_vf(adev)) {
- if (!adev->in_s0ix && !adev->in_runpm)
+ if (!adev->in_runpm)
amdgpu_amdkfd_suspend_process(adev);
amdgpu_virt_fini_data_exchange(adev);
r = amdgpu_virt_request_full_gpu(adev, false);
@@ -5156,10 +5220,8 @@ int amdgpu_device_suspend(struct drm_device *dev, bool notify_clients)
amdgpu_device_ip_suspend_phase1(adev);
- if (!adev->in_s0ix) {
- amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm);
- amdgpu_userq_suspend(adev);
- }
+ amdgpu_amdkfd_suspend(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm);
+ amdgpu_userq_suspend(adev);
r = amdgpu_device_evict_resources(adev);
if (r)
@@ -5254,15 +5316,13 @@ int amdgpu_device_resume(struct drm_device *dev, bool notify_clients)
goto exit;
}
- if (!adev->in_s0ix) {
- r = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm);
- if (r)
- goto exit;
+ r = amdgpu_amdkfd_resume(adev, !amdgpu_sriov_vf(adev) && !adev->in_runpm);
+ if (r)
+ goto exit;
- r = amdgpu_userq_resume(adev);
- if (r)
- goto exit;
- }
+ r = amdgpu_userq_resume(adev);
+ if (r)
+ goto exit;
r = amdgpu_device_ip_late_init(adev);
if (r)
@@ -5275,7 +5335,7 @@ exit:
amdgpu_virt_init_data_exchange(adev);
amdgpu_virt_release_full_gpu(adev, true);
- if (!adev->in_s0ix && !r && !adev->in_runpm)
+ if (!r && !adev->in_runpm)
r = amdgpu_amdkfd_resume_process(adev);
}
@@ -5701,7 +5761,7 @@ int amdgpu_device_link_reset(struct amdgpu_device *adev)
dev_info(adev->dev, "GPU link reset\n");
- if (!adev->pcie_reset_ctx.occurs_dpc)
+ if (!amdgpu_reset_in_dpc(adev))
ret = amdgpu_dpm_link_reset(adev);
if (ret)
@@ -5830,6 +5890,7 @@ int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
amdgpu_set_init_level(tmp_adev, init_level);
if (full_reset) {
/* post card */
+ amdgpu_reset_set_dpc_status(tmp_adev, false);
amdgpu_ras_clear_err_state(tmp_adev);
r = amdgpu_device_asic_init(tmp_adev);
if (r) {
@@ -6136,12 +6197,11 @@ static int amdgpu_device_health_check(struct list_head *device_list_handle)
return ret;
}
-static int amdgpu_device_recovery_prepare(struct amdgpu_device *adev,
+static void amdgpu_device_recovery_prepare(struct amdgpu_device *adev,
struct list_head *device_list,
struct amdgpu_hive_info *hive)
{
struct amdgpu_device *tmp_adev = NULL;
- int r;
/*
* Build list of devices to reset.
@@ -6153,7 +6213,7 @@ static int amdgpu_device_recovery_prepare(struct amdgpu_device *adev,
list_add_tail(&tmp_adev->reset_list, device_list);
if (adev->shutdown)
tmp_adev->shutdown = true;
- if (adev->pcie_reset_ctx.occurs_dpc)
+ if (amdgpu_reset_in_dpc(adev))
tmp_adev->pcie_reset_ctx.in_link_reset = true;
}
if (!list_is_first(&adev->reset_list, device_list))
@@ -6161,14 +6221,6 @@ static int amdgpu_device_recovery_prepare(struct amdgpu_device *adev,
} else {
list_add_tail(&adev->reset_list, device_list);
}
-
- if (!amdgpu_sriov_vf(adev) && (!adev->pcie_reset_ctx.occurs_dpc)) {
- r = amdgpu_device_health_check(device_list);
- if (r)
- return r;
- }
-
- return 0;
}
static void amdgpu_device_recovery_get_reset_lock(struct amdgpu_device *adev,
@@ -6237,9 +6289,8 @@ static void amdgpu_device_halt_activities(struct amdgpu_device *adev,
drm_client_dev_suspend(adev_to_drm(tmp_adev), false);
/* disable ras on ALL IPs */
- if (!need_emergency_restart &&
- (!adev->pcie_reset_ctx.occurs_dpc) &&
- amdgpu_device_ip_need_full_reset(tmp_adev))
+ if (!need_emergency_restart && !amdgpu_reset_in_dpc(adev) &&
+ amdgpu_device_ip_need_full_reset(tmp_adev))
amdgpu_ras_suspend(tmp_adev);
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
@@ -6267,11 +6318,7 @@ static int amdgpu_device_asic_reset(struct amdgpu_device *adev,
retry: /* Rest of adevs pre asic reset from XGMI hive. */
list_for_each_entry(tmp_adev, device_list, reset_list) {
- if (adev->pcie_reset_ctx.occurs_dpc)
- tmp_adev->no_hw_access = true;
r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
- if (adev->pcie_reset_ctx.occurs_dpc)
- tmp_adev->no_hw_access = false;
/*TODO Should we stop ?*/
if (r) {
dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
@@ -6449,8 +6496,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
emergency_restart();
}
- dev_info(adev->dev, "GPU %s begin!\n",
- need_emergency_restart ? "jobs stop":"reset");
+ dev_info(adev->dev, "GPU %s begin!. Source: %d\n",
+ need_emergency_restart ? "jobs stop" : "reset",
+ reset_context->src);
if (!amdgpu_sriov_vf(adev))
hive = amdgpu_get_xgmi_hive(adev);
@@ -6461,8 +6509,13 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
reset_context->hive = hive;
INIT_LIST_HEAD(&device_list);
- if (amdgpu_device_recovery_prepare(adev, &device_list, hive))
- goto end_reset;
+ amdgpu_device_recovery_prepare(adev, &device_list, hive);
+
+ if (!amdgpu_sriov_vf(adev)) {
+ r = amdgpu_device_health_check(&device_list);
+ if (r)
+ goto end_reset;
+ }
/* We need to lock reset domain only once both for XGMI and single device */
amdgpu_device_recovery_get_reset_lock(adev, &device_list);
@@ -6884,17 +6937,13 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
{
struct drm_device *dev = pci_get_drvdata(pdev);
struct amdgpu_device *adev = drm_to_adev(dev);
- struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
+ struct amdgpu_hive_info *hive __free(xgmi_put_hive) =
+ amdgpu_get_xgmi_hive(adev);
struct amdgpu_reset_context reset_context;
struct list_head device_list;
dev_info(adev->dev, "PCI error: detected callback!!\n");
- if (!amdgpu_dpm_is_link_reset_supported(adev)) {
- dev_warn(adev->dev, "No support for XGMI hive yet...\n");
- return PCI_ERS_RESULT_DISCONNECT;
- }
-
adev->pci_channel_state = state;
switch (state) {
@@ -6904,10 +6953,23 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
case pci_channel_io_frozen:
/* Fatal error, prepare for slot reset */
dev_info(adev->dev, "pci_channel_io_frozen: state(%d)!!\n", state);
+ if (hive) {
+ /* Hive devices should be able to support FW based
+ * link reset on other devices, if not return.
+ */
+ if (!amdgpu_dpm_is_link_reset_supported(adev)) {
+ dev_warn(adev->dev,
+ "No support for XGMI hive yet...\n");
+ return PCI_ERS_RESULT_DISCONNECT;
+ }
+ /* Set dpc status only if device is part of hive
+ * Non-hive devices should be able to recover after
+ * link reset.
+ */
+ amdgpu_reset_set_dpc_status(adev, true);
- if (hive)
mutex_lock(&hive->hive_lock);
- adev->pcie_reset_ctx.occurs_dpc = true;
+ }
memset(&reset_context, 0, sizeof(reset_context));
INIT_LIST_HEAD(&device_list);
@@ -6915,10 +6977,8 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
amdgpu_device_recovery_get_reset_lock(adev, &device_list);
amdgpu_device_halt_activities(adev, NULL, &reset_context, &device_list,
hive, false);
- if (hive) {
+ if (hive)
mutex_unlock(&hive->hive_lock);
- amdgpu_put_xgmi_hive(hive);
- }
return PCI_ERS_RESULT_NEED_RESET;
case pci_channel_io_perm_failure:
/* Permanent error, prepare for device removal */
@@ -6966,22 +7026,34 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
struct amdgpu_device *tmp_adev;
struct amdgpu_hive_info *hive;
struct list_head device_list;
- int r = 0, i;
+ struct pci_dev *link_dev;
+ int r = 0, i, timeout;
u32 memsize;
-
- /* PCI error slot reset should be skipped During RAS recovery */
- if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
- amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
- amdgpu_ras_in_recovery(adev))
- return PCI_ERS_RESULT_RECOVERED;
+ u16 status;
dev_info(adev->dev, "PCI error: slot reset callback!!\n");
memset(&reset_context, 0, sizeof(reset_context));
- /* wait for asic to come out of reset */
- msleep(700);
+ if (adev->pcie_reset_ctx.swus)
+ link_dev = adev->pcie_reset_ctx.swus;
+ else
+ link_dev = adev->pdev;
+ /* wait for asic to come out of reset, timeout = 10s */
+ timeout = 10000;
+ do {
+ usleep_range(10000, 10500);
+ r = pci_read_config_word(link_dev, PCI_VENDOR_ID, &status);
+ timeout -= 10;
+ } while (timeout > 0 && (status != PCI_VENDOR_ID_ATI) &&
+ (status != PCI_VENDOR_ID_AMD));
+ if ((status != PCI_VENDOR_ID_ATI) && (status != PCI_VENDOR_ID_AMD)) {
+ r = -ETIME;
+ goto out;
+ }
+
+ amdgpu_device_load_switch_state(adev);
/* Restore PCI confspace */
amdgpu_device_load_pci_state(pdev);
@@ -7076,7 +7148,6 @@ void amdgpu_pci_resume(struct pci_dev *pdev)
amdgpu_device_sched_resume(&device_list, NULL, NULL);
amdgpu_device_gpu_resume(adev, &device_list, false);
amdgpu_device_recovery_put_reset_lock(adev, &device_list);
- adev->pcie_reset_ctx.occurs_dpc = false;
if (hive) {
mutex_unlock(&hive->hive_lock);
@@ -7084,6 +7155,58 @@ void amdgpu_pci_resume(struct pci_dev *pdev)
}
}
+static void amdgpu_device_cache_switch_state(struct amdgpu_device *adev)
+{
+ struct pci_dev *parent = pci_upstream_bridge(adev->pdev);
+ int r;
+
+ if (!parent || parent->vendor != PCI_VENDOR_ID_ATI)
+ return;
+
+ /* If already saved, return */
+ if (adev->pcie_reset_ctx.swus)
+ return;
+ /* Upstream bridge is ATI, assume it's SWUS/DS architecture */
+ r = pci_save_state(parent);
+ if (r)
+ return;
+ adev->pcie_reset_ctx.swds_pcistate = pci_store_saved_state(parent);
+
+ parent = pci_upstream_bridge(parent);
+ r = pci_save_state(parent);
+ if (r)
+ return;
+ adev->pcie_reset_ctx.swus_pcistate = pci_store_saved_state(parent);
+
+ adev->pcie_reset_ctx.swus = parent;
+}
+
+static void amdgpu_device_load_switch_state(struct amdgpu_device *adev)
+{
+ struct pci_dev *pdev;
+ int r;
+
+ if (!adev->pcie_reset_ctx.swds_pcistate ||
+ !adev->pcie_reset_ctx.swus_pcistate)
+ return;
+
+ pdev = adev->pcie_reset_ctx.swus;
+ r = pci_load_saved_state(pdev, adev->pcie_reset_ctx.swus_pcistate);
+ if (!r) {
+ pci_restore_state(pdev);
+ } else {
+ dev_warn(adev->dev, "Failed to load SWUS state, err:%d\n", r);
+ return;
+ }
+
+ pdev = pci_upstream_bridge(adev->pdev);
+ r = pci_load_saved_state(pdev, adev->pcie_reset_ctx.swds_pcistate);
+ if (!r)
+ pci_restore_state(pdev);
+ else
+ dev_warn(adev->dev, "Failed to load SWDS state, err:%d\n", r);
+}
+
bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
{
struct drm_device *dev = pci_get_drvdata(pdev);
@@ -7108,6 +7231,8 @@ bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
return false;
}
+ amdgpu_device_cache_switch_state(adev);
+
return true;
}
@@ -7494,3 +7619,53 @@ ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset)
size += sysfs_emit_at(buf, size, "\n");
return size;
}
+
+void amdgpu_device_set_uid(struct amdgpu_uid *uid_info,
+ enum amdgpu_uid_type type, uint8_t inst,
+ uint64_t uid)
+{
+ if (!uid_info)
+ return;
+
+ if (type >= AMDGPU_UID_TYPE_MAX) {
+ dev_err_once(uid_info->adev->dev, "Invalid UID type %d\n",
+ type);
+ return;
+ }
+
+ if (inst >= AMDGPU_UID_INST_MAX) {
+ dev_err_once(uid_info->adev->dev, "Invalid UID instance %d\n",
+ inst);
+ return;
+ }
+
+ if (uid_info->uid[type][inst] != 0) {
+ dev_warn_once(
+ uid_info->adev->dev,
+ "Overwriting existing UID %llu for type %d instance %d\n",
+ uid_info->uid[type][inst], type, inst);
+ }
+
+ uid_info->uid[type][inst] = uid;
+}
+
+u64 amdgpu_device_get_uid(struct amdgpu_uid *uid_info,
+ enum amdgpu_uid_type type, uint8_t inst)
+{
+ if (!uid_info)
+ return 0;
+
+ if (type >= AMDGPU_UID_TYPE_MAX) {
+ dev_err_once(uid_info->adev->dev, "Invalid UID type %d\n",
+ type);
+ return 0;
+ }
+
+ if (inst >= AMDGPU_UID_INST_MAX) {
+ dev_err_once(uid_info->adev->dev, "Invalid UID instance %d\n",
+ inst);
+ return 0;
+ }
+
+ return uid_info->uid[type][inst];
+}