From bac640ddb51e8066d1f35ed810e7a4c6dd341d39 Mon Sep 17 00:00:00 2001 From: Eric Huang Date: Tue, 4 Jun 2024 11:46:56 -0400 Subject: drm/amdgpu: add reset source in various cases MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To fulfill the reset event description. Suggested-by: Lijo Lazar Signed-off-by: Eric Huang Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index b3d11703df04..fd6a06bd2683 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2487,6 +2487,7 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) reset_context.method = AMD_RESET_METHOD_NONE; reset_context.reset_req_dev = adev; + reset_context.src = AMDGPU_RESET_SRC_RAS; /* Perform full reset in fatal error mode */ if (!amdgpu_ras_is_poison_mode_supported(ras->adev)) -- cgit v1.2.3 From 9817f06173cfa74b78e9e701a064864d62e88cd2 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Tue, 4 Jun 2024 16:30:41 +0800 Subject: drm/amdgpu: move aca/mca init functions into ras_init() stage adjust the function position to better match aca/mca fini code in ras_fini().
Signed-off-by: Yang Wang Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c | 13 ++++-- drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c | 19 ++++----- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 73 ++++++++++++++++++++++----------- 4 files changed, 69 insertions(+), 37 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c index 01d50ad603d3..04515c1c7241 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c @@ -712,6 +712,15 @@ void amdgpu_aca_fini(struct amdgpu_device *adev) atomic_set(&aca->ue_update_flag, 0); } +int amdgpu_aca_reset(struct amdgpu_device *adev) +{ + struct amdgpu_aca *aca = &adev->aca; + + atomic_set(&aca->ue_update_flag, 0); + + return 0; +} + void amdgpu_aca_set_smu_funcs(struct amdgpu_device *adev, const struct aca_smu_funcs *smu_funcs) { struct amdgpu_aca *aca = &adev->aca; @@ -885,9 +894,7 @@ DEFINE_DEBUGFS_ATTRIBUTE(aca_debug_mode_fops, NULL, amdgpu_aca_smu_debug_mode_se void amdgpu_aca_smu_debugfs_init(struct amdgpu_device *adev, struct dentry *root) { #if defined(CONFIG_DEBUG_FS) - if (!root || - (adev->ip_versions[MP1_HWIP][0] != IP_VERSION(13, 0, 6) && - adev->ip_versions[MP1_HWIP][0] != IP_VERSION(13, 0, 14))) + if (!root) return; debugfs_create_file("aca_debug_mode", 0200, root, adev, &aca_debug_mode_fops); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h index 4327ce1ceacf..ba724c2a997d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h @@ -192,6 +192,7 @@ struct aca_info { int amdgpu_aca_init(struct amdgpu_device *adev); void amdgpu_aca_fini(struct amdgpu_device *adev); +int amdgpu_aca_reset(struct amdgpu_device *adev); void amdgpu_aca_set_smu_funcs(struct amdgpu_device *adev, const struct aca_smu_funcs 
*smu_funcs); bool amdgpu_aca_is_enabled(struct amdgpu_device *adev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c index da40c2d97df8..c7e602d69f2c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c @@ -193,27 +193,26 @@ static int amdgpu_mca_bank_set_merge(struct mca_bank_set *mca_set, struct mca_ba return 0; } -static int amdgpu_mca_bank_set_remove_node(struct mca_bank_set *mca_set, struct mca_bank_node *node) +static void amdgpu_mca_bank_set_remove_node(struct mca_bank_set *mca_set, struct mca_bank_node *node) { if (!node) - return -EINVAL; + return; list_del(&node->node); kvfree(node); mca_set->nr_entries--; - - return 0; } static void amdgpu_mca_bank_set_release(struct mca_bank_set *mca_set) { struct mca_bank_node *node, *tmp; - list_for_each_entry_safe(node, tmp, &mca_set->list, node) { - list_del(&node->node); - kvfree(node); - } + if (list_empty(&mca_set->list)) + return; + + list_for_each_entry_safe(node, tmp, &mca_set->list, node) + amdgpu_mca_bank_set_remove_node(mca_set, node); } void amdgpu_mca_smu_init_funcs(struct amdgpu_device *adev, const struct amdgpu_mca_smu_funcs *mca_funcs) @@ -608,9 +607,7 @@ DEFINE_DEBUGFS_ATTRIBUTE(mca_debug_mode_fops, NULL, amdgpu_mca_smu_debug_mode_se void amdgpu_mca_smu_debugfs_init(struct amdgpu_device *adev, struct dentry *root) { #if defined(CONFIG_DEBUG_FS) - if (!root || - (amdgpu_ip_version(adev, MP1_HWIP, 0) != IP_VERSION(13, 0, 6) && - amdgpu_ip_version(adev, MP1_HWIP, 0) != IP_VERSION(13, 0, 14))) + if (!root) return; debugfs_create_file("mca_debug_mode", 0200, root, adev, &mca_debug_mode_fops); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index fd6a06bd2683..2dffce25e273 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1911,6 +1911,23 @@ static void amdgpu_ras_debugfs_create(struct amdgpu_device *adev, obj, 
&amdgpu_ras_debugfs_ops); } +static bool amdgpu_ras_aca_is_supported(struct amdgpu_device *adev) +{ + bool ret; + + switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) { + case IP_VERSION(13, 0, 6): + case IP_VERSION(13, 0, 14): + ret = true; + break; + default: + ret = false; + break; + } + + return ret; +} + void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); @@ -1937,10 +1954,12 @@ void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev) } } - if (amdgpu_aca_is_enabled(adev)) - amdgpu_aca_smu_debugfs_init(adev, dir); - else - amdgpu_mca_smu_debugfs_init(adev, dir); + if (amdgpu_ras_aca_is_supported(adev)) { + if (amdgpu_aca_is_enabled(adev)) + amdgpu_aca_smu_debugfs_init(adev, dir); + else + amdgpu_mca_smu_debugfs_init(adev, dir); + } } /* debugfs end */ @@ -3428,6 +3447,15 @@ int amdgpu_ras_init(struct amdgpu_device *adev) goto release_con; } + if (amdgpu_ras_aca_is_supported(adev)) { + if (amdgpu_aca_is_enabled(adev)) + r = amdgpu_aca_init(adev); + else + r = amdgpu_mca_init(adev); + if (r) + goto release_con; + } + dev_info(adev->dev, "RAS INFO: ras initialized successfully, " "hardware ability[%x] ras_mask[%x]\n", adev->ras_hw_enabled, adev->ras_enabled); @@ -3636,25 +3664,22 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev) amdgpu_ras_event_mgr_init(adev); - if (amdgpu_aca_is_enabled(adev)) { - if (!amdgpu_in_reset(adev)) { - r = amdgpu_aca_init(adev); + if (amdgpu_ras_aca_is_supported(adev)) { + if (amdgpu_in_reset(adev)) { + if (amdgpu_aca_is_enabled(adev)) + r = amdgpu_aca_reset(adev); + else + r = amdgpu_mca_reset(adev); if (r) return r; } - if (!amdgpu_sriov_vf(adev)) - amdgpu_ras_set_aca_debug_mode(adev, false); - } else { - if (amdgpu_in_reset(adev)) - r = amdgpu_mca_reset(adev); - else - r = amdgpu_mca_init(adev); - if (r) - return r; - - if (!amdgpu_sriov_vf(adev)) - amdgpu_ras_set_mca_debug_mode(adev, false); + if (!amdgpu_sriov_vf(adev)) { + if (amdgpu_aca_is_enabled(adev)) 
+ amdgpu_ras_set_aca_debug_mode(adev, false); + else + amdgpu_ras_set_mca_debug_mode(adev, false); + } } /* Guest side doesn't need init ras feature */ @@ -3728,10 +3753,12 @@ int amdgpu_ras_fini(struct amdgpu_device *adev) amdgpu_ras_fs_fini(adev); amdgpu_ras_interrupt_remove_all(adev); - if (amdgpu_aca_is_enabled(adev)) - amdgpu_aca_fini(adev); - else - amdgpu_mca_fini(adev); + if (amdgpu_ras_aca_is_supported(adev)) { + if (amdgpu_aca_is_enabled(adev)) + amdgpu_aca_fini(adev); + else + amdgpu_mca_fini(adev); + } WARN(AMDGPU_RAS_GET_FEATURES(con->features), "Feature mask is not cleared"); -- cgit v1.2.3 From 5f7697bbc1a41d4799797204137be85121063f65 Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Thu, 23 May 2024 17:58:47 +0800 Subject: drm/amdgpu: trigger mode1 reset for RAS RMA status Check RMA status in bad page retirement flow. v2: fix coding bugs in v1. Signed-off-by: Tao Zhou Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 28 ++++++++++++++++++++++------ drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 8 ++++---- drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c | 4 +++- 3 files changed, 29 insertions(+), 11 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 2dffce25e273..671bbb19995a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2068,8 +2068,9 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager * struct amdgpu_device *adev = obj->adev; struct amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, obj->head.block, 0); + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - if (!block_obj) + if (!block_obj || !con) return; /* both query_poison_status and handle_poison_consumption are optional, @@ -2092,14 +2093,17 @@ static void amdgpu_ras_interrupt_poison_consumption_handler(struct ras_manager * if 
(block_obj->hw_ops && block_obj->hw_ops->handle_poison_consumption) poison_stat = block_obj->hw_ops->handle_poison_consumption(adev); - /* gpu reset is fallback for failed and default cases */ - if (poison_stat) { + /* gpu reset is fallback for failed and default cases. + * For RMA case, amdgpu_umc_poison_handler will handle gpu reset. + */ + if (poison_stat && !con->is_rma) { dev_info(adev->dev, "GPU reset for %s RAS poison consumption is issued!\n", block_obj->ras_comm.name); amdgpu_ras_reset_gpu(adev); - } else { - amdgpu_gfx_poison_consumption_handler(adev, entry); } + + if (!poison_stat) + amdgpu_gfx_poison_consumption_handler(adev, entry); } static void amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj, @@ -2815,6 +2819,7 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work) page_retirement_dwork.work); struct amdgpu_device *adev = con->adev; struct ras_err_data err_data; + unsigned long err_cnt; if (amdgpu_in_reset(adev) || atomic_read(&con->in_recovery)) return; @@ -2822,9 +2827,13 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work) amdgpu_ras_error_data_init(&err_data); amdgpu_umc_handle_bad_pages(adev, &err_data); + err_cnt = err_data.err_addr_cnt; amdgpu_ras_error_data_fini(&err_data); + if (err_cnt && con->is_rma) + amdgpu_ras_reset_gpu(adev); + mutex_lock(&con->umc_ecc_log.lock); if (radix_tree_tagged(&con->umc_ecc_log.de_page_tree, UMC_ECC_NEW_DETECTED_TAG)) @@ -2881,7 +2890,8 @@ static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev, if (poison_msg->pasid_fn) poison_msg->pasid_fn(adev, pasid, poison_msg->data); - if (reset) { + /* for RMA, amdgpu_ras_poison_creation_handler will trigger gpu reset */ + if (reset && !con->is_rma) { flush_delayed_work(&con->page_retirement_dwork); con->gpu_reset_flags |= reset; @@ -4010,6 +4020,12 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev) { struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + /* mode1 is the only selection for 
RMA status */ + if (ras->is_rma) { + ras->gpu_reset_flags = 0; + ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET; + } + if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0) amdgpu_reset_domain_schedule(ras->adev->reset_domain, &ras->recovery_work); return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index 540e0f066b26..20e0e522fb51 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -195,7 +195,8 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); amdgpu_umc_handle_bad_pages(adev, ras_error_status); - if (err_data->ue_count && reset) { + if ((err_data->ue_count || err_data->de_count) && + (reset || (con && con->is_rma))) { con->gpu_reset_flags |= reset; amdgpu_ras_reset_gpu(adev); } @@ -211,6 +212,7 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev, .block = AMDGPU_RAS_BLOCK__UMC, }; struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head); + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); uint32_t timeout = timeout_ms; memset(&err_data, 0, sizeof(err_data)); @@ -243,9 +245,7 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev, kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); - if (reset) { - struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - + if (reset || (err_data.err_addr_cnt && con && con->is_rma)) { con->gpu_reset_flags |= reset; amdgpu_ras_reset_gpu(adev); } diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c index 9e7ce1e6bc06..9cd221ed240c 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c @@ -85,6 +85,7 @@ static int gfx_v11_0_3_poison_consumption_handler(struct amdgpu_device *adev, if (entry && (entry->client_id == SOC21_IH_CLIENTID_GFX) && (entry->src_id == GFX_11_0_0__SRCID__RLC_GC_FED_INTERRUPT) && !entry->vmid && !entry->pasid) { + struct 
amdgpu_ras *con = amdgpu_ras_get_context(adev); uint32_t rlc_status0 = 0; rlc_status0 = RREG32_SOC15(GC, 0, regRLC_RLCS_FED_STATUS_0); @@ -96,7 +97,8 @@ static int gfx_v11_0_3_poison_consumption_handler(struct amdgpu_device *adev, ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET; } - amdgpu_ras_reset_gpu(adev); + if (con && !con->is_rma) + amdgpu_ras_reset_gpu(adev); } return 0; -- cgit v1.2.3 From 7e4371676e5e58739ffc884b1b5d6bbf1cce3d17 Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Wed, 29 May 2024 15:39:41 +0800 Subject: drm/amdgpu: create amdgpu_ras_in_recovery to simplify code Reduce redundant code and user doesn't need to pay attention to RAS details. Signed-off-by: Tao Zhou Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 13 ++------- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 14 ++-------- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 31 +++++++++++++--------- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 ++ drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c | 5 ++-- .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c | 3 +-- .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 12 +-------- 7 files changed, 29 insertions(+), 51 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 7df5544ac983..3fb02f5b91c9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -6276,20 +6276,11 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) struct amdgpu_reset_context reset_context; u32 memsize; struct list_head device_list; - struct amdgpu_hive_info *hive; - int hive_ras_recovery = 0; - struct amdgpu_ras *ras; /* PCI error slot reset should be skipped During RAS recovery */ - hive = amdgpu_get_xgmi_hive(adev); - if (hive) { - hive_ras_recovery = atomic_read(&hive->ras_recovery); - amdgpu_put_xgmi_hive(hive); - } - ras = 
amdgpu_ras_get_context(adev); if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || - amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) && - ras && (atomic_read(&ras->in_recovery) || hive_ras_recovery)) + amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) && + amdgpu_ras_in_recovery(adev)) return PCI_ERS_RESULT_RECOVERED; DRM_INFO("PCI error: slot reset callback!!\n"); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index 19b1817b55d7..82452606ae6c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -506,9 +506,6 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev, int xcc_id) { struct amdgpu_kiq *kiq = &adev->gfx.kiq[xcc_id]; struct amdgpu_ring *kiq_ring = &kiq->ring; - struct amdgpu_hive_info *hive; - struct amdgpu_ras *ras; - int hive_ras_recovery = 0; int i, r = 0; int j; @@ -533,16 +530,9 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev, int xcc_id) * This is workaround: only skip kiq_ring test * during ras recovery in suspend stage for gfx9.4.3 */ - hive = amdgpu_get_xgmi_hive(adev); - if (hive) { - hive_ras_recovery = atomic_read(&hive->ras_recovery); - amdgpu_put_xgmi_hive(hive); - } - - ras = amdgpu_ras_get_context(adev); if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || - amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) && - ras && (atomic_read(&ras->in_recovery) || hive_ras_recovery)) { + amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) && + amdgpu_ras_in_recovery(adev)) { spin_unlock(&kiq->ring_lock); return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 671bbb19995a..02d9ef988e81 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1409,11 +1409,8 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev, enum amdgpu_ras_block block) { struct 
amdgpu_ras_block_object *block_obj = amdgpu_ras_get_ras_block(adev, block, 0); - struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; const struct aca_smu_funcs *smu_funcs = adev->aca.smu_funcs; - struct amdgpu_hive_info *hive; - int hive_ras_recovery = 0; if (!block_obj || !block_obj->hw_ops) { dev_dbg_once(adev->dev, "%s doesn't config RAS function\n", @@ -1425,15 +1422,8 @@ int amdgpu_ras_reset_error_count(struct amdgpu_device *adev, !amdgpu_ras_get_aca_debug_mode(adev)) return -EOPNOTSUPP; - hive = amdgpu_get_xgmi_hive(adev); - if (hive) { - hive_ras_recovery = atomic_read(&hive->ras_recovery); - amdgpu_put_xgmi_hive(hive); - } - /* skip ras error reset in gpu reset */ - if ((amdgpu_in_reset(adev) || atomic_read(&ras->in_recovery) || - hive_ras_recovery) && + if ((amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev)) && ((smu_funcs && smu_funcs->set_debug_mode) || (mca_funcs && mca_funcs->mca_set_debug_mode))) return -EOPNOTSUPP; @@ -2461,6 +2451,23 @@ static void amdgpu_ras_set_fed_all(struct amdgpu_device *adev, } } +bool amdgpu_ras_in_recovery(struct amdgpu_device *adev) +{ + struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + int hive_ras_recovery = 0; + + if (hive) { + hive_ras_recovery = atomic_read(&hive->ras_recovery); + amdgpu_put_xgmi_hive(hive); + } + + if (ras && (atomic_read(&ras->in_recovery) || hive_ras_recovery)) + return true; + + return false; +} + static void amdgpu_ras_do_recovery(struct work_struct *work) { struct amdgpu_ras *ras = @@ -2821,7 +2828,7 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work) struct ras_err_data err_data; unsigned long err_cnt; - if (amdgpu_in_reset(adev) || atomic_read(&con->in_recovery)) + if (amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev)) return; amdgpu_ras_error_data_init(&err_data); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index e70c45712ddb..83437fef9df5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -954,6 +954,8 @@ int amdgpu_ras_put_poison_req(struct amdgpu_device *adev, enum amdgpu_ras_block block, uint16_t pasid, pasid_notify pasid_fn, void *data, uint32_t reset); +bool amdgpu_ras_in_recovery(struct amdgpu_device *adev); + __printf(3, 4) void amdgpu_ras_event_log_print(struct amdgpu_device *adev, u64 event_id, const char *fmt, ...); diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c index 825786fc849e..04533f99f1e3 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c @@ -1863,7 +1863,6 @@ static int aldebaran_mode1_reset(struct smu_context *smu) u32 fatal_err, param; int ret = 0; struct amdgpu_device *adev = smu->adev; - struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); fatal_err = 0; param = SMU_RESET_MODE_1; @@ -1876,8 +1875,8 @@ static int aldebaran_mode1_reset(struct smu_context *smu) } else { /* fatal error triggered by ras, PMFW supports the flag from 68.44.0 */ - if ((smu->smc_fw_version >= 0x00442c00) && ras && - atomic_read(&ras->in_recovery)) + if ((smu->smc_fw_version >= 0x00442c00) && + amdgpu_ras_in_recovery(adev)) fatal_err = 1; param |= (fatal_err << 16); diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c index 6c24e2306383..ce9b77074cb1 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c @@ -2786,10 +2786,9 @@ static void smu_v13_0_0_set_mode1_reset_param(struct smu_context *smu, uint32_t *param) { struct amdgpu_device *adev = smu->adev; - struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); if ((smu->smc_fw_version >= supported_version) && - ras && atomic_read(&ras->in_recovery)) + 
amdgpu_ras_in_recovery(adev)) /* Set RAS fatal error reset flag */ *param = 1 << 16; else diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c index ffe3528e92cb..6790095a2fa7 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c @@ -2574,24 +2574,14 @@ failed: static int smu_v13_0_6_mode1_reset(struct smu_context *smu) { struct amdgpu_device *adev = smu->adev; - struct amdgpu_hive_info *hive = NULL; - u32 hive_ras_recovery = 0; - struct amdgpu_ras *ras; u32 fatal_err, param; int ret = 0; - hive = amdgpu_get_xgmi_hive(adev); - ras = amdgpu_ras_get_context(adev); fatal_err = 0; param = SMU_RESET_MODE_1; - if (hive) { - hive_ras_recovery = atomic_read(&hive->ras_recovery); - amdgpu_put_xgmi_hive(hive); - } - /* fatal error triggered by ras, PMFW supports the flag */ - if (ras && (atomic_read(&ras->in_recovery) || hive_ras_recovery)) + if (amdgpu_ras_in_recovery(adev)) fatal_err = 1; param |= (fatal_err << 16); -- cgit v1.2.3 From 09a3d8202df1e9fa1eb5f5f63524c8948275ff4c Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Thu, 6 Jun 2024 11:20:57 +0800 Subject: drm/amdgpu: set RAS fed status for more cases Indicate fatal error for each RAS block and NBIO. 
Signed-off-by: Tao Zhou Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 1 + drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c | 1 + 2 files changed, 2 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 02d9ef988e81..68e9935028db 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2131,6 +2131,7 @@ static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj, /* Let IP handle its data, maybe we need get the output * from the callback to update the error type/count, etc */ + amdgpu_ras_set_fed(obj->adev, true); ret = data->cb(obj->adev, &err_data, entry); /* ue will trigger an interrupt, and in that case * we need do a reset to recovery the whole system. diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c index 32cc60ce5521..8d80df94bd8b 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c @@ -414,6 +414,7 @@ static void nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device /* ras_controller_int is dedicated for nbif ras error, * not the global interrupt for sync flood */ + amdgpu_ras_set_fed(adev, true); amdgpu_ras_reset_gpu(adev); } -- cgit v1.2.3