Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c')
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 98
 1 file changed, 66 insertions(+), 32 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 95b3327168ac..54d8a3e7e75c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -373,7 +373,13 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lo
 	if (lock)
 		mutex_lock(&tmp->hive_lock);
-	tmp->pstate = -1;
+	tmp->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN;
+	tmp->hi_req_gpu = NULL;
+	/*
+	 * hive pstate on boot is high in vega20 so we have to go to low
+	 * pstate on after boot.
+	 */
+	tmp->hi_req_count = AMDGPU_MAX_XGMI_DEVICE_PER_HIVE;
 
 	mutex_unlock(&xgmi_mutex);
 
 	return tmp;
@@ -383,50 +389,51 @@ int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)
 {
 	int ret = 0;
 	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
-	struct amdgpu_device *tmp_adev;
-	bool update_hive_pstate = true;
-	bool is_high_pstate = pstate && adev->asic_type == CHIP_VEGA20;
+	struct amdgpu_device *request_adev = hive->hi_req_gpu ?
+						hive->hi_req_gpu : adev;
+	bool is_hi_req = pstate == AMDGPU_XGMI_PSTATE_MAX_VEGA20;
+	bool init_low = hive->pstate == AMDGPU_XGMI_PSTATE_UNKNOWN;
 
-	if (!hive)
+	/* fw bug so temporarily disable pstate switching */
+	if (!hive || adev->asic_type == CHIP_VEGA20)
 		return 0;
 
 	mutex_lock(&hive->hive_lock);
 
-	if (hive->pstate == pstate) {
-		adev->pstate = is_high_pstate ? pstate : adev->pstate;
+	if (is_hi_req)
+		hive->hi_req_count++;
+	else
+		hive->hi_req_count--;
+
+	/*
+	 * Vega20 only needs single peer to request pstate high for the hive to
+	 * go high but all peers must request pstate low for the hive to go low
+	 */
+	if (hive->pstate == pstate ||
+			(!is_hi_req && hive->hi_req_count && !init_low))
 		goto out;
-	}
 
-	dev_dbg(adev->dev, "Set xgmi pstate %d.\n", pstate);
+	dev_dbg(request_adev->dev, "Set xgmi pstate %d.\n", pstate);
 
-	ret = amdgpu_dpm_set_xgmi_pstate(adev, pstate);
+	ret = amdgpu_dpm_set_xgmi_pstate(request_adev, pstate);
 	if (ret) {
-		dev_err(adev->dev,
+		dev_err(request_adev->dev,
 			"XGMI: Set pstate failure on device %llx, hive %llx, ret %d",
-			adev->gmc.xgmi.node_id,
-			adev->gmc.xgmi.hive_id, ret);
+			request_adev->gmc.xgmi.node_id,
+			request_adev->gmc.xgmi.hive_id, ret);
 		goto out;
 	}
 
-	/* Update device pstate */
-	adev->pstate = pstate;
-
-	/*
-	 * Update the hive pstate only all devices of the hive
-	 * are in the same pstate
-	 */
-	list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
-		if (tmp_adev->pstate != adev->pstate) {
-			update_hive_pstate = false;
-			break;
-		}
-	}
-	if (update_hive_pstate || is_high_pstate)
+	if (init_low)
+		hive->pstate = hive->hi_req_count ?
+					hive->pstate : AMDGPU_XGMI_PSTATE_MIN;
+	else {
 		hive->pstate = pstate;
-
+		hive->hi_req_gpu = pstate != AMDGPU_XGMI_PSTATE_MIN ?
+							adev : NULL;
+	}
 out:
 	mutex_unlock(&hive->hive_lock);
-
 	return ret;
 }
@@ -507,9 +514,6 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
 		goto exit;
 	}
 
-	/* Set default device pstate */
-	adev->pstate = -1;
-
 	top_info = &adev->psp.xgmi_context.top_info;
 
 	list_add_tail(&adev->gmc.xgmi.head, &hive->device_list);
@@ -604,6 +608,8 @@ int amdgpu_xgmi_ras_late_init(struct amdgpu_device *adev)
 	    adev->gmc.xgmi.num_physical_nodes == 0)
 		return 0;
 
+	amdgpu_xgmi_reset_ras_error_count(adev);
+
 	if (!adev->gmc.xgmi.ras_if) {
 		adev->gmc.xgmi.ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
 		if (!adev->gmc.xgmi.ras_if)
@@ -668,6 +674,32 @@ uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
 	return addr + dram_base_addr;
 }
 
+static void pcs_clear_status(struct amdgpu_device *adev, uint32_t pcs_status_reg)
+{
+	WREG32_PCIE(pcs_status_reg, 0xFFFFFFFF);
+	WREG32_PCIE(pcs_status_reg, 0);
+}
+
+void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev)
+{
+	uint32_t i;
+
+	switch (adev->asic_type) {
+	case CHIP_ARCTURUS:
+		for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++)
+			pcs_clear_status(adev,
+					 xgmi_pcs_err_status_reg_arct[i]);
+		break;
+	case CHIP_VEGA20:
+		for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++)
+			pcs_clear_status(adev,
+					 xgmi_pcs_err_status_reg_vg20[i]);
+		break;
+	default:
+		break;
+	}
+}
+
 static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev,
 					      uint32_t value,
 					      uint32_t *ue_count,
@@ -758,6 +790,8 @@ int amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
 		break;
 	}
 
+	amdgpu_xgmi_reset_ras_error_count(adev);
+
 	err_data->ue_count += ue_cnt;
 	err_data->ce_count += ce_cnt;
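As context for the pstate rework above: the new hi_req_count/hi_req_gpu fields implement a voting scheme in which a single peer requesting the high pstate is enough to raise the whole hive, while the hive only drops back to low once every peer has requested low. The following is a minimal standalone C sketch of that semantics, not amdgpu code: hive_vote() and the struct here are invented for illustration, and the counter starts at 0 rather than AMDGPU_MAX_XGMI_DEVICE_PER_HIVE (which the driver uses to model the hive booting in the high pstate). It also assumes every low vote is paired with an earlier high vote.

/* Standalone model of the hive pstate voting; compile with: cc vote.c */
#include <stdio.h>
#include <stdbool.h>

enum pstate { PSTATE_MIN, PSTATE_MAX };

struct hive {
	int hi_req_count;	/* peers still asking for the high pstate */
	enum pstate pstate;	/* current hive pstate */
};

/* One device casts a vote: the hive goes high on the first high request
 * and only returns to low when the last high requester votes low. */
static void hive_vote(struct hive *h, bool want_high)
{
	if (want_high)
		h->hi_req_count++;
	else
		h->hi_req_count--;

	if (want_high)
		h->pstate = PSTATE_MAX;		/* a single peer is enough */
	else if (h->hi_req_count == 0)
		h->pstate = PSTATE_MIN;		/* all peers have voted low */
}

int main(void)
{
	struct hive h = { .hi_req_count = 0, .pstate = PSTATE_MIN };

	hive_vote(&h, true);	/* GPU0 wants high -> hive goes high */
	hive_vote(&h, true);	/* GPU1 wants high -> stays high */
	hive_vote(&h, false);	/* GPU0 done, GPU1 still busy -> stays high */
	printf("after partial release: %s\n",
	       h.pstate == PSTATE_MAX ? "high" : "low");
	hive_vote(&h, false);	/* last requester done -> hive drops to low */
	printf("after full release:    %s\n",
	       h.pstate == PSTATE_MAX ? "high" : "low");
	return 0;
}

In the patch itself this bookkeeping sits under hive->hive_lock, and the actual pstate switch is issued through amdgpu_dpm_set_xgmi_pstate() on the GPU recorded in hi_req_gpu.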

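The other half of the change adds amdgpu_xgmi_reset_ras_error_count(), which clears the per-ASIC PCS error status registers (write all ones, then zero, via pcs_clear_status()) both at RAS late init and after every query, so each query reports only the errors latched since the previous read. Below is a rough userspace model of that query-then-clear ordering against a fake register; none of the names are amdgpu symbols.

/* Model of clearing the error status after each query so counts are deltas. */
#include <stdint.h>
#include <inttypes.h>
#include <stdio.h>

static uint32_t pcs_status;	/* stand-in for one PCS error status register */

static void hw_latch_errors(uint32_t n)	/* "hardware" accumulates errors */
{
	pcs_status += n;
}

static void reset_error_count(void)	/* mirrors pcs_clear_status() */
{
	pcs_status = 0;
}

static uint32_t query_error_count(void)
{
	uint32_t count = pcs_status;

	reset_error_count();	/* same ordering as the patched query path */
	return count;
}

int main(void)
{
	hw_latch_errors(2);
	printf("first query:  %" PRIu32 " errors\n", query_error_count());	/* 2 */
	hw_latch_errors(1);
	printf("second query: %" PRIu32 " errors\n", query_error_count());	/* 1, not 3 */
	return 0;
}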