diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
| -rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 201 | 
1 files changed, 115 insertions, 86 deletions
| diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 138b09f6227d..c2c791ca00f4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -33,6 +33,7 @@  #include "amdgpu_atomfirmware.h"  #include "amdgpu_xgmi.h"  #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h" +#include "atom.h"  static const char *RAS_FS_NAME = "ras"; @@ -320,11 +321,14 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,   * "disable" requires only the block.   * "enable" requires the block and error type.   * "inject" requires the block, error type, address, and value. + *   * The block is one of: umc, sdma, gfx, etc.   *	see ras_block_string[] for details + *   * The error type is one of: ue, ce, where,   *	ue is multi-uncorrectable   *	ce is single-correctable + *   * The sub-block is a the sub-block index, pass 0 if there is no sub-block.   * The address and value are hexadecimal numbers, leading 0x is optional.   * @@ -531,7 +535,7 @@ static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,  	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);  	struct ras_manager *obj; -	if (!adev->ras_features || !con) +	if (!adev->ras_enabled || !con)  		return NULL;  	if (head->block >= AMDGPU_RAS_BLOCK_COUNT) @@ -558,7 +562,7 @@ struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,  	struct ras_manager *obj;  	int i; -	if (!adev->ras_features || !con) +	if (!adev->ras_enabled || !con)  		return NULL;  	if (head) { @@ -585,36 +589,11 @@ struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,  }  /* obj end */ -static void amdgpu_ras_parse_status_code(struct amdgpu_device *adev, -					 const char* invoke_type, -					 const char* block_name, -					 enum ta_ras_status ret) -{ -	switch (ret) { -	case TA_RAS_STATUS__SUCCESS: -		return; -	case TA_RAS_STATUS__ERROR_RAS_NOT_AVAILABLE: -		dev_warn(adev->dev, -			"RAS WARN: %s %s currently unavailable\n", -			invoke_type, -			block_name); -		break; -	default: -		dev_err(adev->dev, -			"RAS ERROR: %s %s error failed ret 0x%X\n", -			invoke_type, -			block_name, -			ret); -	} -} -  /* feature ctl begin */  static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev, -		struct ras_common_if *head) +					 struct ras_common_if *head)  { -	struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - -	return con->hw_supported & BIT(head->block); +	return adev->ras_hw_enabled & BIT(head->block);  }  static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev, @@ -658,11 +637,7 @@ static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev,  		con->features |= BIT(head->block);  	} else {  		if (obj && amdgpu_ras_is_feature_enabled(adev, head)) { -			/* skip clean gfx ras context feature for VEGA20 Gaming. -			 * will clean later -			 */ -			if (!(!adev->ras_features && con->features & BIT(AMDGPU_RAS_BLOCK__GFX))) -				con->features &= ~BIT(head->block); +			con->features &= ~BIT(head->block);  			put_obj(obj);  		}  	} @@ -708,15 +683,10 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,  	if (!amdgpu_ras_intr_triggered()) {  		ret = psp_ras_enable_features(&adev->psp, info, enable);  		if (ret) { -			amdgpu_ras_parse_status_code(adev, -						     enable ? "enable":"disable", -						     ras_block_str(head->block), -						    (enum ta_ras_status)ret); -			if (ret == TA_RAS_STATUS__RESET_NEEDED) -				ret = -EAGAIN; -			else -				ret = -EINVAL; - +			dev_err(adev->dev, "ras %s %s failed %d\n", +				enable ? "enable":"disable", +				ras_block_str(head->block), +				ret);  			goto out;  		}  	} @@ -770,6 +740,10 @@ int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev,  				con->features |= BIT(head->block);  			ret = amdgpu_ras_feature_enable(adev, head, 0); + +			/* clean gfx block ras features flag */ +			if (adev->ras_enabled && head->block == AMDGPU_RAS_BLOCK__GFX) +				con->features &= ~BIT(head->block);  		}  	} else  		ret = amdgpu_ras_feature_enable(adev, head, enable); @@ -890,6 +864,11 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,  		    adev->gmc.xgmi.ras_funcs->query_ras_error_count)  			adev->gmc.xgmi.ras_funcs->query_ras_error_count(adev, &err_data);  		break; +	case AMDGPU_RAS_BLOCK__HDP: +		if (adev->hdp.ras_funcs && +		    adev->hdp.ras_funcs->query_ras_error_count) +			adev->hdp.ras_funcs->query_ras_error_count(adev, &err_data); +		break;  	default:  		break;  	} @@ -901,17 +880,42 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,  	info->ce_count = obj->err_data.ce_count;  	if (err_data.ce_count) { -		dev_info(adev->dev, "%ld correctable hardware errors " +		if (adev->smuio.funcs && +		    adev->smuio.funcs->get_socket_id && +		    adev->smuio.funcs->get_die_id) { +			dev_info(adev->dev, "socket: %d, die: %d " +					"%ld correctable hardware errors " +					"detected in %s block, no user " +					"action is needed.\n", +					adev->smuio.funcs->get_socket_id(adev), +					adev->smuio.funcs->get_die_id(adev), +					obj->err_data.ce_count, +					ras_block_str(info->head.block)); +		} else { +			dev_info(adev->dev, "%ld correctable hardware errors "  					"detected in %s block, no user "  					"action is needed.\n",  					obj->err_data.ce_count,  					ras_block_str(info->head.block)); +		}  	}  	if (err_data.ue_count) { -		dev_info(adev->dev, "%ld uncorrectable hardware errors " +		if (adev->smuio.funcs && +		    adev->smuio.funcs->get_socket_id && +		    adev->smuio.funcs->get_die_id) { +			dev_info(adev->dev, "socket: %d, die: %d " +					"%ld uncorrectable hardware errors " +					"detected in %s block\n", +					adev->smuio.funcs->get_socket_id(adev), +					adev->smuio.funcs->get_die_id(adev), +					obj->err_data.ue_count, +					ras_block_str(info->head.block)); +		} else { +			dev_info(adev->dev, "%ld uncorrectable hardware errors "  					"detected in %s block\n",  					obj->err_data.ue_count,  					ras_block_str(info->head.block)); +		}  	}  	return 0; @@ -937,11 +941,20 @@ int amdgpu_ras_reset_error_status(struct amdgpu_device *adev,  		if (adev->mmhub.ras_funcs &&  		    adev->mmhub.ras_funcs->reset_ras_error_count)  			adev->mmhub.ras_funcs->reset_ras_error_count(adev); + +		if (adev->mmhub.ras_funcs && +		    adev->mmhub.ras_funcs->reset_ras_error_status) +			adev->mmhub.ras_funcs->reset_ras_error_status(adev);  		break;  	case AMDGPU_RAS_BLOCK__SDMA:  		if (adev->sdma.funcs->reset_ras_error_count)  			adev->sdma.funcs->reset_ras_error_count(adev);  		break; +	case AMDGPU_RAS_BLOCK__HDP: +		if (adev->hdp.ras_funcs && +		    adev->hdp.ras_funcs->reset_ras_error_count) +			adev->hdp.ras_funcs->reset_ras_error_count(adev); +		break;  	default:  		break;  	} @@ -1022,10 +1035,9 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,  		ret = -EINVAL;  	} -	amdgpu_ras_parse_status_code(adev, -				     "inject", -				     ras_block_str(info->head.block), -				     (enum ta_ras_status)ret); +	if (ret) +		dev_err(adev->dev, "ras inject %s failed %d\n", +			ras_block_str(info->head.block), ret);  	return ret;  } @@ -1038,7 +1050,7 @@ unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,  	struct ras_manager *obj;  	struct ras_err_data data = {0, 0}; -	if (!adev->ras_features || !con) +	if (!adev->ras_enabled || !con)  		return 0;  	list_for_each_entry(obj, &con->head, node) { @@ -1265,8 +1277,8 @@ static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev)  static struct dentry *amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)  {  	struct amdgpu_ras *con = amdgpu_ras_get_context(adev); -	struct dentry *dir; -	struct drm_minor *minor = adev_to_drm(adev)->primary; +	struct drm_minor  *minor = adev_to_drm(adev)->primary; +	struct dentry     *dir;  	dir = debugfs_create_dir(RAS_FS_NAME, minor->debugfs_root);  	debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, dir, adev, @@ -1275,6 +1287,8 @@ static struct dentry *amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *  			    &amdgpu_ras_debugfs_eeprom_ops);  	debugfs_create_u32("bad_page_cnt_threshold", 0444, dir,  			   &con->bad_page_cnt_threshold); +	debugfs_create_x32("ras_hw_enabled", 0444, dir, &adev->ras_hw_enabled); +	debugfs_create_x32("ras_enabled", 0444, dir, &adev->ras_enabled);  	/*  	 * After one uncorrectable error happens, usually GPU recovery will @@ -1561,7 +1575,7 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)  	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);  	struct ras_manager *obj; -	if (!adev->ras_features || !con) +	if (!adev->ras_enabled || !con)  		return;  	list_for_each_entry(obj, &con->head, node) { @@ -1611,7 +1625,7 @@ static void amdgpu_ras_query_err_status(struct amdgpu_device *adev)  	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);  	struct ras_manager *obj; -	if (!adev->ras_features || !con) +	if (!adev->ras_enabled || !con)  		return;  	list_for_each_entry(obj, &con->head, node) { @@ -1925,7 +1939,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)  	bool exc_err_limit = false;  	int ret; -	if (adev->ras_features && con) +	if (adev->ras_enabled && con)  		data = &con->eh_data;  	else  		return 0; @@ -2029,6 +2043,23 @@ static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev)  }  /* + * this is workaround for vega20 workstation sku, + * force enable gfx ras, ignore vbios gfx ras flag + * due to GC EDC can not write + */ +static void amdgpu_ras_get_quirks(struct amdgpu_device *adev) +{ +	struct atom_context *ctx = adev->mode_info.atom_context; + +	if (!ctx) +		return; + +	if (strnstr(ctx->vbios_version, "D16406", +		    sizeof(ctx->vbios_version))) +		adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX); +} + +/*   * check hardware's ras ability which will be saved in hw_supported.   * if hardware does not support ras, we can skip some ras initializtion and   * forbid some ras operations from IP. @@ -2037,11 +2068,9 @@ static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev)   * we have to initialize ras as normal. but need check if operation is   * allowed or not in each function.   */ -static void amdgpu_ras_check_supported(struct amdgpu_device *adev, -		uint32_t *hw_supported, uint32_t *supported) +static void amdgpu_ras_check_supported(struct amdgpu_device *adev)  { -	*hw_supported = 0; -	*supported = 0; +	adev->ras_hw_enabled = adev->ras_enabled = 0;  	if (amdgpu_sriov_vf(adev) || !adev->is_atom_fw ||  	    !amdgpu_ras_asic_supported(adev)) @@ -2050,33 +2079,34 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev,  	if (!adev->gmc.xgmi.connected_to_cpu) {  		if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {  			dev_info(adev->dev, "MEM ECC is active.\n"); -			*hw_supported |= (1 << AMDGPU_RAS_BLOCK__UMC | -					1 << AMDGPU_RAS_BLOCK__DF); +			adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__UMC | +						   1 << AMDGPU_RAS_BLOCK__DF);  		} else {  			dev_info(adev->dev, "MEM ECC is not presented.\n");  		}  		if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {  			dev_info(adev->dev, "SRAM ECC is active.\n"); -			*hw_supported |= ~(1 << AMDGPU_RAS_BLOCK__UMC | -					1 << AMDGPU_RAS_BLOCK__DF); +			adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC | +						    1 << AMDGPU_RAS_BLOCK__DF);  		} else {  			dev_info(adev->dev, "SRAM ECC is not presented.\n");  		}  	} else {  		/* driver only manages a few IP blocks RAS feature  		 * when GPU is connected cpu through XGMI */ -		*hw_supported |= (1 << AMDGPU_RAS_BLOCK__GFX | -				1 << AMDGPU_RAS_BLOCK__SDMA | -				1 << AMDGPU_RAS_BLOCK__MMHUB); +		adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX | +					   1 << AMDGPU_RAS_BLOCK__SDMA | +					   1 << AMDGPU_RAS_BLOCK__MMHUB);  	} +	amdgpu_ras_get_quirks(adev); +  	/* hw_supported needs to be aligned with RAS block mask. */ -	*hw_supported &= AMDGPU_RAS_BLOCK_MASK; +	adev->ras_hw_enabled &= AMDGPU_RAS_BLOCK_MASK; -	*supported = amdgpu_ras_enable == 0 ? -			0 : *hw_supported & amdgpu_ras_mask; -	adev->ras_features = *supported; +	adev->ras_enabled = amdgpu_ras_enable == 0 ? 0 : +		adev->ras_hw_enabled & amdgpu_ras_mask;  }  int amdgpu_ras_init(struct amdgpu_device *adev) @@ -2097,13 +2127,13 @@ int amdgpu_ras_init(struct amdgpu_device *adev)  	amdgpu_ras_set_context(adev, con); -	amdgpu_ras_check_supported(adev, &con->hw_supported, -			&con->supported); -	if (!con->hw_supported || (adev->asic_type == CHIP_VEGA10)) { +	amdgpu_ras_check_supported(adev); + +	if (!adev->ras_enabled || adev->asic_type == CHIP_VEGA10) {  		/* set gfx block ras context feature for VEGA20 Gaming  		 * send ras disable cmd to ras ta during ras late init.  		 */ -		if (!adev->ras_features && adev->asic_type == CHIP_VEGA20) { +		if (!adev->ras_enabled && adev->asic_type == CHIP_VEGA20) {  			con->features |= BIT(AMDGPU_RAS_BLOCK__GFX);  			return 0; @@ -2153,8 +2183,9 @@ int amdgpu_ras_init(struct amdgpu_device *adev)  	}  	dev_info(adev->dev, "RAS INFO: ras initialized successfully, " -			"hardware ability[%x] ras_mask[%x]\n", -			con->hw_supported, con->supported); +		 "hardware ability[%x] ras_mask[%x]\n", +		 adev->ras_hw_enabled, adev->ras_enabled); +  	return 0;  release_con:  	amdgpu_ras_set_context(adev, NULL); @@ -2163,7 +2194,7 @@ release_con:  	return r;  } -static int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev) +int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev)  {  	if (adev->gmc.xgmi.connected_to_cpu)  		return 1; @@ -2268,7 +2299,7 @@ void amdgpu_ras_resume(struct amdgpu_device *adev)  	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);  	struct ras_manager *obj, *tmp; -	if (!adev->ras_features || !con) { +	if (!adev->ras_enabled || !con) {  		/* clean ras context for VEGA20 Gaming after send ras disable cmd */  		amdgpu_release_ras_context(adev); @@ -2314,7 +2345,7 @@ void amdgpu_ras_suspend(struct amdgpu_device *adev)  {  	struct amdgpu_ras *con = amdgpu_ras_get_context(adev); -	if (!adev->ras_features || !con) +	if (!adev->ras_enabled || !con)  		return;  	amdgpu_ras_disable_all_features(adev, 0); @@ -2328,7 +2359,7 @@ int amdgpu_ras_pre_fini(struct amdgpu_device *adev)  {  	struct amdgpu_ras *con = amdgpu_ras_get_context(adev); -	if (!adev->ras_features || !con) +	if (!adev->ras_enabled || !con)  		return 0; @@ -2342,7 +2373,7 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)  {  	struct amdgpu_ras *con = amdgpu_ras_get_context(adev); -	if (!adev->ras_features || !con) +	if (!adev->ras_enabled || !con)  		return 0;  	amdgpu_ras_fs_fini(adev); @@ -2361,10 +2392,8 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)  void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)  { -	uint32_t hw_supported, supported; - -	amdgpu_ras_check_supported(adev, &hw_supported, &supported); -	if (!hw_supported) +	amdgpu_ras_check_supported(adev); +	if (!adev->ras_hw_enabled)  		return;  	if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) { @@ -2393,7 +2422,7 @@ void amdgpu_release_ras_context(struct amdgpu_device *adev)  	if (!con)  		return; -	if (!adev->ras_features && con->features & BIT(AMDGPU_RAS_BLOCK__GFX)) { +	if (!adev->ras_enabled && con->features & BIT(AMDGPU_RAS_BLOCK__GFX)) {  		con->features &= ~BIT(AMDGPU_RAS_BLOCK__GFX);  		amdgpu_ras_set_context(adev, NULL);  		kfree(con); | 
