From a82400b57abb6aff068bb3b21d1cccd63acbb863 Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Thu, 29 Nov 2018 12:21:53 -0500 Subject: drm/amdgpu: Handle xgmi device removal. XGMI hive has some resources allocted on device init which needs to be deallocated when the device is unregistered. v2: Remove creation of dedicated wq for XGMI hive reset. v3: Use the gmc.xgmi.supported flag Signed-off-by: Andrey Grodzovsky Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index c75badfa5c4c..bfd286c40631 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1864,6 +1864,9 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev) { int i, r; + if (adev->gmc.xgmi.num_physical_nodes > 1) + amdgpu_xgmi_remove_device(adev); + amdgpu_amdkfd_device_fini(adev); amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); -- cgit v1.2.3 From d4535e2c018bba71b49edeb5e396183920f5d341 Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Thu, 29 Nov 2018 15:14:27 -0500 Subject: drm/amdgpu: Implement concurrent asic reset for XGMI. Use per hive wq to concurrently send reset commands to all nodes in the hive. v2: Switch to system_highpri_wq after dropping dedicated queue. Fix non XGMI code path KASAN error. Stop the hive reset for each node loop if there is a reset failure on any of the nodes. Signed-off-by: Andrey Grodzovsky Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 44 ++++++++++++++++++++++++++---- 2 files changed, 41 insertions(+), 5 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index c8ad6bf6618a..6fc023bae7fe 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -910,7 +910,9 @@ struct amdgpu_device { bool in_gpu_reset; struct mutex lock_reset; struct amdgpu_doorbell_index doorbell_index; + int asic_reset_res; + struct work_struct xgmi_reset_work; }; static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index bfd286c40631..9fd9f63adc08 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2356,6 +2356,19 @@ bool amdgpu_device_has_dc_support(struct amdgpu_device *adev) return amdgpu_device_asic_has_dc_support(adev->asic_type); } + +static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) +{ + struct amdgpu_device *adev = + container_of(__work, struct amdgpu_device, xgmi_reset_work); + + adev->asic_reset_res = amdgpu_asic_reset(adev); + if (adev->asic_reset_res) + DRM_WARN("ASIC reset failed with err r, %d for drm dev, %s", + adev->asic_reset_res, adev->ddev->unique); +} + + /** * amdgpu_device_init - initialize the driver * @@ -2454,6 +2467,8 @@ int amdgpu_device_init(struct amdgpu_device *adev, INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, amdgpu_device_delay_enable_gfx_off); + INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); + adev->gfx.gfx_off_req_count = 1; adev->pm.ac_power = power_supply_is_system_supplied() > 0 ? true : false; @@ -3331,10 +3346,31 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, */ if (need_full_reset) { list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { - r = amdgpu_asic_reset(tmp_adev); - if (r) - DRM_WARN("ASIC reset failed with err r, %d for drm dev, %s", + /* For XGMI run all resets in parallel to speed up the process */ + if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { + if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work)) + r = -EALREADY; + } else + r = amdgpu_asic_reset(tmp_adev); + + if (r) { + DRM_ERROR("ASIC reset failed with err r, %d for drm dev, %s", r, tmp_adev->ddev->unique); + break; + } + } + + /* For XGMI wait for all PSP resets to complete before proceed */ + if (!r) { + list_for_each_entry(tmp_adev, device_list_handle, + gmc.xgmi.head) { + if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) { + flush_work(&tmp_adev->xgmi_reset_work); + r = tmp_adev->asic_reset_res; + if (r) + break; + } + } } } @@ -3521,8 +3557,6 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */ if (tmp_adev == adev) continue; - dev_info(tmp_adev->dev, "GPU reset begin for drm dev %s!\n", adev->ddev->unique); - amdgpu_device_lock_adev(tmp_adev); r = amdgpu_device_pre_asic_reset(tmp_adev, NULL, -- cgit v1.2.3 From 223577753b54acf0033de9585340909a0ef05e68 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Sun, 2 Dec 2018 21:47:42 -0500 Subject: drm/amdgpu/si: fix SI after doorbell rework SI does not use doorbells, move asic doorbell init later asic check. Fixes: https://bugs.freedesktop.org/show_bug.cgi?id=108920 Reviewed-by: Oak Zeng Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 9fd9f63adc08..ef36cc595985 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -515,7 +515,6 @@ void amdgpu_device_pci_config_reset(struct amdgpu_device *adev) */ static int amdgpu_device_doorbell_init(struct amdgpu_device *adev) { - amdgpu_asic_init_doorbell_index(adev); /* No doorbell on SI hardware generation */ if (adev->asic_type < CHIP_BONAIRE) { @@ -529,6 +528,8 @@ static int amdgpu_device_doorbell_init(struct amdgpu_device *adev) if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET) return -EINVAL; + amdgpu_asic_init_doorbell_index(adev); + /* doorbell bar mapping */ adev->doorbell.base = pci_resource_start(adev->pdev, 2); adev->doorbell.size = pci_resource_len(adev->pdev, 2); -- cgit v1.2.3 From fc42d47ce0118e2f59a67ac0b0da56f9dc454bd9 Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Tue, 11 Dec 2018 15:31:35 -0500 Subject: drm/amdgpu: Enable GPU recovery by default for CI I retested Bonaire (gfx7 dGPU) and it works fine. Signed-off-by: Andrey Grodzovsky Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index ef36cc595985..b60afeade50a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3258,6 +3258,8 @@ bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev) if (amdgpu_gpu_recovery == -1) { switch (adev->asic_type) { + case CHIP_BONAIRE: + case CHIP_HAWAII: case CHIP_TOPAZ: case CHIP_TONGA: case CHIP_FIJI: -- cgit v1.2.3