| author | Luben Tuikov <luben.tuikov@amd.com> | 2021-05-21 11:53:09 -0400 | 
|---|---|---|
| committer | Alex Deucher <alexander.deucher@amd.com> | 2021-05-27 12:23:06 -0400 | 
| commit | 05adfd80cc52e0b4581e65bb5418de5dfd24d105 (patch) | |
| tree | d6634299bb56f99f115e49d8a8059bbe56875894 /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | |
| parent | a46751fbcde505e6aff8622e17995092c8d86ae4 (diff) | |
drm/amdgpu: Use delayed work to collect RAS error counters
On the Context Query2 IOCTL, return the correctable and
uncorrectable error counts in O(1) fashion from cached
values, and schedule a delayed work function to
recalculate and cache them for the next such IOCTL.
v2: Cancel pending delayed work at ras_fini().
v3: Remove conditionals when dealing with delayed
    work manipulation as they're inherently racy.
Cc: Alexander Deucher <Alexander.Deucher@amd.com>
Cc: Christian König <christian.koenig@amd.com>
Cc: John Clements <john.clements@amd.com>
Cc: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
Reviewed-by: Alexander Deucher <Alexander.Deucher@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
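
The diff below is limited to amdgpu_ras.c, so the Query2 IOCTL path that consumes the cached counters is not shown here. As a rough illustration of the O(1) read described in the commit message, the sketch below assumes a hypothetical helper name and a 3-second refresh delay; only the struct fields and helpers visible in this patch (ras_ce_count, ras_ue_count, ras_counte_delay_work, amdgpu_ras_get_context(), amdgpu_ras_counte_dw()) are taken from the patch itself.

```c
/*
 * Hedged sketch, not part of this patch: how an IOCTL handler could
 * consume the cached RAS counters in O(1).  The helper name and the
 * 3 s refresh delay are assumptions.
 */
static void amdgpu_ras_query2_counters_sketch(struct amdgpu_device *adev,
					      u64 *ce_out, u64 *ue_out)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (!con) {
		*ce_out = 0;
		*ue_out = 0;
		return;
	}

	/* O(1): read the values cached by amdgpu_ras_counte_dw() instead
	 * of walking the RAS blocks and reading hardware registers here.
	 */
	*ce_out = atomic_read(&con->ras_ce_count);
	*ue_out = atomic_read(&con->ras_ue_count);

	/* Kick the delayed work so the cache is refreshed in the
	 * background for the next query.
	 */
	schedule_delayed_work(&con->ras_counte_delay_work,
			      msecs_to_jiffies(3000));
}
```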
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
| -rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 40 |
|---|---|---|

1 file changed, 40 insertions, 0 deletions
```diff
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index ed3c43e8b0b5..ec936cde2726 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -27,6 +27,7 @@
 #include <linux/uaccess.h>
 #include <linux/reboot.h>
 #include <linux/syscalls.h>
+#include <linux/pm_runtime.h>
 
 #include "amdgpu.h"
 #include "amdgpu_ras.h"
@@ -2116,6 +2117,30 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
 		adev->ras_hw_enabled & amdgpu_ras_mask;
 }
 
+static void amdgpu_ras_counte_dw(struct work_struct *work)
+{
+	struct amdgpu_ras *con = container_of(work, struct amdgpu_ras,
+					      ras_counte_delay_work.work);
+	struct amdgpu_device *adev = con->adev;
+	struct drm_device *dev = &adev->ddev;
+	unsigned long ce_count, ue_count;
+	int res;
+
+	res = pm_runtime_get_sync(dev->dev);
+	if (res < 0)
+		goto Out;
+
+	/* Cache new values.
+	 */
+	amdgpu_ras_query_error_count(adev, &ce_count, &ue_count);
+	atomic_set(&con->ras_ce_count, ce_count);
+	atomic_set(&con->ras_ue_count, ue_count);
+
+	pm_runtime_mark_last_busy(dev->dev);
+Out:
+	pm_runtime_put_autosuspend(dev->dev);
+}
+
 int amdgpu_ras_init(struct amdgpu_device *adev)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
@@ -2130,6 +2155,11 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 	if (!con)
 		return -ENOMEM;
 
+	con->adev = adev;
+	INIT_DELAYED_WORK(&con->ras_counte_delay_work, amdgpu_ras_counte_dw);
+	atomic_set(&con->ras_ce_count, 0);
+	atomic_set(&con->ras_ue_count, 0);
+
 	con->objs = (struct ras_manager *)(con + 1);
 
 	amdgpu_ras_set_context(adev, con);
@@ -2233,6 +2263,8 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
 			 struct ras_fs_if *fs_info,
 			 struct ras_ih_if *ih_info)
 {
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	unsigned long ue_count, ce_count;
 	int r;
 
 	/* disable RAS feature per IP block if it is not supported */
@@ -2273,6 +2305,12 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
 	if (r)
 		goto sysfs;
 
+	/* Those are the cached values at init.
+	 */
+	amdgpu_ras_query_error_count(adev, &ce_count, &ue_count);
+	atomic_set(&con->ras_ce_count, ce_count);
+	atomic_set(&con->ras_ue_count, ue_count);
+
 	return 0;
 cleanup:
 	amdgpu_ras_sysfs_remove(adev, ras_block);
@@ -2390,6 +2428,8 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
 	if (con->features)
 		amdgpu_ras_disable_all_features(adev, 1);
 
+	cancel_delayed_work_sync(&con->ras_counte_delay_work);
+
 	amdgpu_ras_set_context(adev, NULL);
 	kfree(con);
 
```
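A note on the v3 change above: checking delayed_work_pending() before scheduling or cancelling is racy because the work can be queued or start running between the check and the call, while schedule_delayed_work() and cancel_delayed_work_sync() already handle those cases internally. A minimal sketch of the two patterns, not code from the patch (the 3 s delay is again an assumption):

```c
/* Racy: the work's pending/running state can change between the check
 * and the call, so the conditional adds a race without adding safety.
 */
static void ras_counters_refresh_racy(struct amdgpu_ras *con)
{
	if (!delayed_work_pending(&con->ras_counte_delay_work))
		schedule_delayed_work(&con->ras_counte_delay_work,
				      msecs_to_jiffies(3000));
}

/* Safe: schedule_delayed_work() is a no-op while the work is already
 * pending, and cancel_delayed_work_sync() (used in amdgpu_ras_fini())
 * waits for a running instance, so both can be called unconditionally.
 */
static void ras_counters_refresh(struct amdgpu_ras *con)
{
	schedule_delayed_work(&con->ras_counte_delay_work,
			      msecs_to_jiffies(3000));
}
```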
