From dad6c45cbd40b57db95c9d46e01ff6d302e86042 Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Fri, 16 Feb 2024 11:41:16 -0500 Subject: drm/amdkfd: Output migrate end event if migrate failed If page migration failed, also output migrate end event to match with migrate start event, with failure error_code added to the end of the migrate message macro. This will not break uAPI because application uses old message macro sscanf drop and ignore the error_code. Output GPU page fault restore end event if migration failed. Signed-off-by: Philip Yang Reviewed-by: James Zhu Signed-off-by: Alex Deucher --- include/uapi/linux/kfd_ioctl.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 717307d6b5b7..fa9f9846b88e 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -609,6 +609,7 @@ struct kfd_ioctl_smi_events_args { * migrate_update: GPU page fault is recovered by 'M' for migrate, 'U' for update * rw: 'W' for write page fault, 'R' for read page fault * rescheduled: 'R' if the queue restore failed and rescheduled to try again + * error_code: migrate failure error code, 0 if no error */ #define KFD_EVENT_FMT_UPDATE_GPU_RESET(reset_seq_num, reset_cause)\ "%x %s\n", (reset_seq_num), (reset_cause) @@ -630,9 +631,9 @@ struct kfd_ioctl_smi_events_args { "%lld -%d @%lx(%lx) %x->%x %x:%x %d\n", (ns), (pid), (start), (size),\ (from), (to), (prefetch_loc), (preferred_loc), (migrate_trigger) -#define KFD_EVENT_FMT_MIGRATE_END(ns, pid, start, size, from, to, migrate_trigger)\ - "%lld -%d @%lx(%lx) %x->%x %d\n", (ns), (pid), (start), (size),\ - (from), (to), (migrate_trigger) +#define KFD_EVENT_FMT_MIGRATE_END(ns, pid, start, size, from, to, migrate_trigger, error_code) \ + "%lld -%d @%lx(%lx) %x->%x %d %d\n", (ns), (pid), (start), (size),\ + (from), (to), (migrate_trigger), (error_code) #define KFD_EVENT_FMT_QUEUE_EVICTION(ns, pid, node, evict_trigger)\ "%lld -%d %x %d\n", (ns), (pid), (node), (evict_trigger) -- cgit v1.2.3 From a3ab2d45b9887ee609cd3bea39f668236935774c Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Tue, 30 Jul 2024 15:33:23 -0400 Subject: drm/amdkfd: SMI report dropped event count Add new SMI event to report the dropped event count. When the event kfifo is full, drop count is not zero, or no enough space left to store the event message, increase drop count. After reading event out from kfifo, if event was dropped, drop_count is not zero, generate a dropped event record and reset drop count to zero. Signed-off-by: Philip Yang Reviewed-by: James Zhu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 27 +++++++++++++++++++++++---- include/uapi/linux/kfd_ioctl.h | 6 ++++++ 2 files changed, 29 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c index 9b8169761ec5..c8d67d62ca3f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c @@ -42,6 +42,7 @@ struct kfd_smi_client { struct rcu_head rcu; pid_t pid; bool suser; + u32 drop_count; }; #define KFD_MAX_KFIFO_SIZE 8192 @@ -103,12 +104,28 @@ static ssize_t kfd_smi_ev_read(struct file *filep, char __user *user, } to_copy = min(size, to_copy); ret = kfifo_out(&client->fifo, buf, to_copy); - spin_unlock(&client->lock); if (ret <= 0) { + spin_unlock(&client->lock); ret = -EAGAIN; goto ret_err; } + if (client->drop_count) { + char msg[KFD_SMI_EVENT_MSG_SIZE]; + int len; + + len = snprintf(msg, sizeof(msg), "%x ", KFD_SMI_EVENT_DROPPED_EVENT); + len += snprintf(msg + len, sizeof(msg) - len, + KFD_EVENT_FMT_DROPPED_EVENT(ktime_get_boottime_ns(), + client->pid, client->drop_count)); + if (kfifo_avail(&client->fifo) >= len) { + kfifo_in(&client->fifo, msg, len); + client->drop_count = 0; + } + } + + spin_unlock(&client->lock); + ret = copy_to_user(user, buf, to_copy); if (ret) { ret = -EFAULT; @@ -182,13 +199,15 @@ static void add_event_to_kfifo(pid_t pid, struct kfd_node *dev, list_for_each_entry_rcu(client, &dev->smi_clients, list) { if (!kfd_smi_ev_enabled(pid, client, smi_event)) continue; + spin_lock(&client->lock); - if (kfifo_avail(&client->fifo) >= len) { + if (!client->drop_count && kfifo_avail(&client->fifo) >= len) { kfifo_in(&client->fifo, event_msg, len); wake_up_all(&client->wait_queue); } else { - pr_debug("smi_event(EventID: %u): no space left\n", - smi_event); + client->drop_count++; + pr_debug("smi_event(EventID: %u): no space left drop_count %d\n", + smi_event, client->drop_count); } spin_unlock(&client->lock); } diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index fa9f9846b88e..7afd66d45313 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -530,6 +530,7 @@ enum kfd_smi_event { KFD_SMI_EVENT_QUEUE_EVICTION = 9, KFD_SMI_EVENT_QUEUE_RESTORE = 10, KFD_SMI_EVENT_UNMAP_FROM_GPU = 11, + KFD_SMI_EVENT_DROPPED_EVENT = 12, /* * max event number, as a flag bit to get events from all processes, @@ -610,6 +611,7 @@ struct kfd_ioctl_smi_events_args { * rw: 'W' for write page fault, 'R' for read page fault * rescheduled: 'R' if the queue restore failed and rescheduled to try again * error_code: migrate failure error code, 0 if no error + * drop_count: how many events dropped when fifo is full */ #define KFD_EVENT_FMT_UPDATE_GPU_RESET(reset_seq_num, reset_cause)\ "%x %s\n", (reset_seq_num), (reset_cause) @@ -645,6 +647,10 @@ struct kfd_ioctl_smi_events_args { "%lld -%d @%lx(%lx) %x %d\n", (ns), (pid), (addr), (size),\ (node), (unmap_trigger) +#define KFD_EVENT_FMT_DROPPED_EVENT(ns, pid, drop_count)\ + "%lld -%d %d\n", (ns), (pid), (drop_count) + + /************************************************************************************************** * CRIU IOCTLs (Checkpoint Restore In Userspace) * -- cgit v1.2.3 From 97ddae76ddd20ea35d2059086aacd85b707a09c5 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 21 Oct 2024 13:41:13 -0400 Subject: Revert "drm/amdkfd: SMI report dropped event count" This reverts commit a3ab2d45b9887ee609cd3bea39f668236935774c. The userspace side for this code is not ready yet so revert for now. Reviewed-by: Philip Yang Signed-off-by: Alex Deucher Cc: Philip Yang --- drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 27 ++++----------------------- include/uapi/linux/kfd_ioctl.h | 6 ------ 2 files changed, 4 insertions(+), 29 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c index c8d67d62ca3f..9b8169761ec5 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c @@ -42,7 +42,6 @@ struct kfd_smi_client { struct rcu_head rcu; pid_t pid; bool suser; - u32 drop_count; }; #define KFD_MAX_KFIFO_SIZE 8192 @@ -104,28 +103,12 @@ static ssize_t kfd_smi_ev_read(struct file *filep, char __user *user, } to_copy = min(size, to_copy); ret = kfifo_out(&client->fifo, buf, to_copy); + spin_unlock(&client->lock); if (ret <= 0) { - spin_unlock(&client->lock); ret = -EAGAIN; goto ret_err; } - if (client->drop_count) { - char msg[KFD_SMI_EVENT_MSG_SIZE]; - int len; - - len = snprintf(msg, sizeof(msg), "%x ", KFD_SMI_EVENT_DROPPED_EVENT); - len += snprintf(msg + len, sizeof(msg) - len, - KFD_EVENT_FMT_DROPPED_EVENT(ktime_get_boottime_ns(), - client->pid, client->drop_count)); - if (kfifo_avail(&client->fifo) >= len) { - kfifo_in(&client->fifo, msg, len); - client->drop_count = 0; - } - } - - spin_unlock(&client->lock); - ret = copy_to_user(user, buf, to_copy); if (ret) { ret = -EFAULT; @@ -199,15 +182,13 @@ static void add_event_to_kfifo(pid_t pid, struct kfd_node *dev, list_for_each_entry_rcu(client, &dev->smi_clients, list) { if (!kfd_smi_ev_enabled(pid, client, smi_event)) continue; - spin_lock(&client->lock); - if (!client->drop_count && kfifo_avail(&client->fifo) >= len) { + if (kfifo_avail(&client->fifo) >= len) { kfifo_in(&client->fifo, event_msg, len); wake_up_all(&client->wait_queue); } else { - client->drop_count++; - pr_debug("smi_event(EventID: %u): no space left drop_count %d\n", - smi_event, client->drop_count); + pr_debug("smi_event(EventID: %u): no space left\n", + smi_event); } spin_unlock(&client->lock); } diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 7afd66d45313..fa9f9846b88e 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -530,7 +530,6 @@ enum kfd_smi_event { KFD_SMI_EVENT_QUEUE_EVICTION = 9, KFD_SMI_EVENT_QUEUE_RESTORE = 10, KFD_SMI_EVENT_UNMAP_FROM_GPU = 11, - KFD_SMI_EVENT_DROPPED_EVENT = 12, /* * max event number, as a flag bit to get events from all processes, @@ -611,7 +610,6 @@ struct kfd_ioctl_smi_events_args { * rw: 'W' for write page fault, 'R' for read page fault * rescheduled: 'R' if the queue restore failed and rescheduled to try again * error_code: migrate failure error code, 0 if no error - * drop_count: how many events dropped when fifo is full */ #define KFD_EVENT_FMT_UPDATE_GPU_RESET(reset_seq_num, reset_cause)\ "%x %s\n", (reset_seq_num), (reset_cause) @@ -647,10 +645,6 @@ struct kfd_ioctl_smi_events_args { "%lld -%d @%lx(%lx) %x %d\n", (ns), (pid), (addr), (size),\ (node), (unmap_trigger) -#define KFD_EVENT_FMT_DROPPED_EVENT(ns, pid, drop_count)\ - "%lld -%d %d\n", (ns), (pid), (drop_count) - - /************************************************************************************************** * CRIU IOCTLs (Checkpoint Restore In Userspace) * -- cgit v1.2.3