summary | refs | log | tree | commit | diff
path: root/drivers/gpu/drm/amd/amdkfd
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd')
-rw-r--r-- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 12
-rw-r--r-- drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 12
-rw-r--r-- drivers/gpu/drm/amd/amdkfd/kfd_events.c 11
-rw-r--r-- drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c 7
-rw-r--r-- drivers/gpu/drm/amd/amdkfd/kfd_migrate.c 1
-rw-r--r-- drivers/gpu/drm/amd/amdkfd/kfd_migrate.h 1
-rw-r--r-- drivers/gpu/drm/amd/amdkfd/kfd_process.c 12
-rw-r--r-- drivers/gpu/drm/amd/amdkfd/kfd_svm.c 20
-rw-r--r-- drivers/gpu/drm/amd/amdkfd/kfd_svm.h 1
-rw-r--r-- drivers/gpu/drm/amd/amdkfd/kfd_topology.c 4
10 files changed, 46 insertions, 35 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 0f0719528bcc..22925df6a791 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2826,7 +2826,7 @@ retry:
static int runtime_disable(struct kfd_process *p)
{
- int i = 0, ret;
+ int i = 0, ret = 0;
bool was_enabled = p->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED;
p->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_DISABLED;
@@ -2863,6 +2863,7 @@ static int runtime_disable(struct kfd_process *p)
/* disable ttmp setup */
for (i = 0; i < p->n_pdds; i++) {
struct kfd_process_device *pdd = p->pdds[i];
+ int last_err = 0;
if (kfd_dbg_is_per_vmid_supported(pdd->dev)) {
pdd->spi_dbg_override =
@@ -2872,14 +2873,17 @@ static int runtime_disable(struct kfd_process *p)
pdd->dev->vm_info.last_vmid_kfd);
if (!pdd->dev->kfd->shared_resources.enable_mes)
- debug_refresh_runlist(pdd->dev->dqm);
+ last_err = debug_refresh_runlist(pdd->dev->dqm);
else
- kfd_dbg_set_mes_debug_mode(pdd,
+ last_err = kfd_dbg_set_mes_debug_mode(pdd,
!kfd_dbg_has_cwsr_workaround(pdd->dev));
+
+ if (last_err)
+ ret = last_err;
}
}
- return 0;
+ return ret;
}
static int kfd_ioctl_runtime_enable(struct file *filep, struct kfd_process *p, void *data)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 6e7bc983fc0b..d7a2e7178ea9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1897,6 +1897,8 @@ fail_packet_manager_init:
static int stop_cpsch(struct device_queue_manager *dqm)
{
+ int ret = 0;
+
dqm_lock(dqm);
if (!dqm->sched_running) {
dqm_unlock(dqm);
@@ -1904,9 +1906,10 @@ static int stop_cpsch(struct device_queue_manager *dqm)
}
if (!dqm->dev->kfd->shared_resources.enable_mes)
- unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD, false);
+ ret = unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES,
+ 0, USE_DEFAULT_GRACE_PERIOD, false);
else
- remove_all_kfd_queues_mes(dqm);
+ ret = remove_all_kfd_queues_mes(dqm);
dqm->sched_running = false;
@@ -1920,7 +1923,7 @@ static int stop_cpsch(struct device_queue_manager *dqm)
dqm->detect_hang_info = NULL;
dqm_unlock(dqm);
- return 0;
+ return ret;
}
static int create_kernel_queue_cpsch(struct device_queue_manager *dqm,
@@ -2091,7 +2094,8 @@ int amdkfd_fence_wait_timeout(struct device_queue_manager *dqm,
while (*fence_addr != fence_value) {
/* Fatal err detected, this response won't come */
- if (amdgpu_amdkfd_is_fed(dqm->dev->adev))
+ if (amdgpu_amdkfd_is_fed(dqm->dev->adev) ||
+ amdgpu_in_reset(dqm->dev->adev))
return -EIO;
if (time_after(jiffies, end_jiffies)) {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index 82905f3e54dd..5a190dd6be4e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -748,16 +748,6 @@ void kfd_signal_event_interrupt(u32 pasid, uint32_t partial_id,
uint64_t *slots = page_slots(p->signal_page);
uint32_t id;
- /*
- * If id is valid but slot is not signaled, GPU may signal the same event twice
- * before driver have chance to process the first interrupt, then signal slot is
- * auto-reset after set_event wakeup the user space, just drop the second event as
- * the application only need wakeup once.
- */
- if ((valid_id_bits > 31 || (1U << valid_id_bits) >= KFD_SIGNAL_EVENT_LIMIT) &&
- partial_id < KFD_SIGNAL_EVENT_LIMIT && slots[partial_id] == UNSIGNALED_EVENT_SLOT)
- goto out_unlock;
-
if (valid_id_bits)
pr_debug_ratelimited("Partial ID invalid: %u (%u valid bits)\n",
partial_id, valid_id_bits);
@@ -786,7 +776,6 @@ void kfd_signal_event_interrupt(u32 pasid, uint32_t partial_id,
}
}
-out_unlock:
rcu_read_unlock();
kfd_unref_process(p);
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index 4ceb251312a6..d76fb61869c7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -28,6 +28,7 @@
#include "kfd_device_queue_manager.h"
#include "kfd_smi_events.h"
#include "amdgpu_ras.h"
+#include "amdgpu_ras_mgr.h"
/*
* GFX9 SQ Interrupts
@@ -228,7 +229,11 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
kfd_signal_poison_consumed_event(dev, pasid);
- event_id = amdgpu_ras_acquire_event_id(dev->adev, type);
+ if (amdgpu_uniras_enabled(dev->adev))
+ event_id = amdgpu_ras_mgr_gen_ras_event_seqno(dev->adev,
+ RAS_SEQNO_TYPE_POISON_CONSUMPTION);
+ else
+ event_id = amdgpu_ras_acquire_event_id(dev->adev, type);
RAS_EVENT_LOG(dev->adev, event_id,
"poison is consumed by client %d, kick off gpu reset flow\n", client_id);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index 59a5a3fea65d..46c84fc60af1 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -21,7 +21,6 @@
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include <linux/types.h>
-#include <linux/hmm.h>
#include <linux/dma-direction.h>
#include <linux/dma-mapping.h>
#include <linux/migrate.h>
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h
index 2eebf67f9c2c..2b7fd442d29c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h
@@ -31,7 +31,6 @@
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/sched/mm.h>
-#include <linux/hmm.h>
#include "kfd_priv.h"
#include "kfd_svm.h"
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index ddfe30c13e9d..a085faac9fe1 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1083,7 +1083,6 @@ static void kfd_process_destroy_pdds(struct kfd_process *p)
* for auto suspend
*/
if (pdd->runtime_inuse) {
- pm_runtime_mark_last_busy(adev_to_drm(pdd->dev->adev)->dev);
pm_runtime_put_autosuspend(adev_to_drm(pdd->dev->adev)->dev);
pdd->runtime_inuse = false;
}
@@ -1162,9 +1161,6 @@ static void kfd_process_wq_release(struct work_struct *work)
release_work);
struct dma_fence *ef;
- kfd_process_dequeue_from_all_devices(p);
- pqm_uninit(&p->pqm);
-
/*
* If GPU in reset, user queues may still running, wait for reset complete.
*/
@@ -1226,6 +1222,14 @@ static void kfd_process_notifier_release_internal(struct kfd_process *p)
cancel_delayed_work_sync(&p->eviction_work);
cancel_delayed_work_sync(&p->restore_work);
+ /*
+ * Dequeue and destroy user queues here. It is not safe for the GPU to
+ * access system memory after the mmu release notifier callback returns,
+ * because exit_mmap frees the process memory afterwards.
+ */
+ kfd_process_dequeue_from_all_devices(p);
+ pqm_uninit(&p->pqm);
+
for (i = 0; i < p->n_pdds; i++) {
struct kfd_process_device *pdd = p->pdds[i];
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 74a1d3e1d52b..97c2270f278f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -1698,7 +1698,7 @@ static int svm_range_validate_and_map(struct mm_struct *mm,
start = map_start << PAGE_SHIFT;
end = (map_last + 1) << PAGE_SHIFT;
for (addr = start; !r && addr < end; ) {
- struct hmm_range *hmm_range = NULL;
+ struct amdgpu_hmm_range *range = NULL;
unsigned long map_start_vma;
unsigned long map_last_vma;
struct vm_area_struct *vma;
@@ -1737,9 +1737,12 @@ static int svm_range_validate_and_map(struct mm_struct *mm,
}
WRITE_ONCE(p->svms.faulting_task, current);
- r = amdgpu_hmm_range_get_pages(&prange->notifier, addr, npages,
- readonly, owner,
- &hmm_range);
+ range = amdgpu_hmm_range_alloc(NULL);
+ if (likely(range))
+ r = amdgpu_hmm_range_get_pages(&prange->notifier, addr, npages,
+ readonly, owner, range);
+ else
+ r = -ENOMEM;
WRITE_ONCE(p->svms.faulting_task, NULL);
if (r)
pr_debug("failed %d to get svm range pages\n", r);
@@ -1750,7 +1753,7 @@ static int svm_range_validate_and_map(struct mm_struct *mm,
if (!r) {
offset = (addr >> PAGE_SHIFT) - prange->start;
r = svm_range_dma_map(prange, ctx->bitmap, offset, npages,
- hmm_range->hmm_pfns);
+ range->hmm_range.hmm_pfns);
if (r)
pr_debug("failed %d to dma map range\n", r);
}
@@ -1758,14 +1761,17 @@ static int svm_range_validate_and_map(struct mm_struct *mm,
svm_range_lock(prange);
/* Free backing memory of hmm_range if it was initialized
- * Overrride return value to TRY AGAIN only if prior returns
+ * Override return value to TRY AGAIN only if prior returns
* were successful
*/
- if (hmm_range && amdgpu_hmm_range_get_pages_done(hmm_range) && !r) {
+ if (range && !amdgpu_hmm_range_valid(range) && !r) {
pr_debug("hmm update the range, need validate again\n");
r = -EAGAIN;
}
+ /* Free the hmm range */
+ amdgpu_hmm_range_free(range);
+
if (!r && !list_empty(&prange->child_list)) {
pr_debug("range split by unmap in parallel, validate again\n");
r = -EAGAIN;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
index 01c7a4877904..a63dfc95b602 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
@@ -31,7 +31,6 @@
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/sched/mm.h>
-#include <linux/hmm.h>
#include "amdgpu.h"
#include "kfd_priv.h"
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
index 5c98746eb72d..811636af14ea 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c
@@ -530,7 +530,9 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr,
sysfs_show_32bit_prop(buffer, offs, "sdma_fw_version",
dev->gpu->kfd->sdma_fw_version);
sysfs_show_64bit_prop(buffer, offs, "unique_id",
- dev->gpu->xcp ?
+ dev->gpu->xcp &&
+ (dev->gpu->xcp->xcp_mgr->mode !=
+ AMDGPU_SPX_PARTITION_MODE) ?
dev->gpu->xcp->unique_id :
dev->gpu->adev->unique_id);
sysfs_show_32bit_prop(buffer, offs, "num_xcc",