From 2c2b0d880f1b4c01f30e14242977b82fa527342d Mon Sep 17 00:00:00 2001 From: Mukul Joshi Date: Thu, 23 Jul 2020 23:09:57 -0400 Subject: drm/amdkfd: Add thermal throttling SMI event Add support for reporting thermal throttling events through SMI. Also, add a counter to count the number of throttling interrupts observed and report the count in the SMI event message. Signed-off-by: Mukul Joshi Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- include/uapi/linux/kfd_ioctl.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index f738c3b53f4e..df6c7a43aadc 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -450,7 +450,8 @@ struct kfd_ioctl_import_dmabuf_args { * KFD SMI(System Management Interface) events */ /* Event type (defined by bitmask) */ -#define KFD_SMI_EVENT_VMFAULT 0x0000000000000001 +#define KFD_SMI_EVENT_VMFAULT 0x0000000000000001 +#define KFD_SMI_EVENT_THERMAL_THROTTLE 0x0000000000000002 struct kfd_ioctl_smi_events_args { __u32 gpuid; /* to KFD */ -- cgit v1.2.3 From 522ec6e0eed0ab0678e7d5b5bf00487dfe83f7ce Mon Sep 17 00:00:00 2001 From: Mukul Joshi Date: Thu, 30 Jul 2020 18:04:33 -0400 Subject: drm/amdkfd: Replace bitmask with event idx in SMI event msg Event bitmask is a 64-bit mask with only 1 bit set. Sending this event bitmask in KFD SMI event message is both wasteful of memory and potentially limiting to only 64 events. Instead send event index in SMI event message. Please note this change does not break the ABI for the two event types defined so far. The new index is identical to the mask used before. Signed-off-by: Mukul Joshi Suggested-by: Felix Kuehling Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 24 +++++++++++++----------- include/uapi/linux/kfd_ioctl.h | 10 +++++++--- 2 files changed, 20 insertions(+), 14 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c index 86c2c3e97944..4d4b6e3ab697 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c @@ -149,7 +149,7 @@ static int kfd_smi_ev_release(struct inode *inode, struct file *filep) return 0; } -static void add_event_to_kfifo(struct kfd_dev *dev, unsigned long long smi_event, +static void add_event_to_kfifo(struct kfd_dev *dev, unsigned int smi_event, char *event_msg, int len) { struct kfd_smi_client *client; @@ -157,14 +157,15 @@ static void add_event_to_kfifo(struct kfd_dev *dev, unsigned long long smi_event rcu_read_lock(); list_for_each_entry_rcu(client, &dev->smi_clients, list) { - if (!(READ_ONCE(client->events) & smi_event)) + if (!(READ_ONCE(client->events) & + KFD_SMI_EVENT_MASK_FROM_INDEX(smi_event))) continue; spin_lock(&client->lock); if (kfifo_avail(&client->fifo) >= len) { kfifo_in(&client->fifo, event_msg, len); wake_up_all(&client->wait_queue); } else { - pr_debug("smi_event(EventID: %llu): no space left\n", + pr_debug("smi_event(EventID: %u): no space left\n", smi_event); } spin_unlock(&client->lock); @@ -180,21 +181,21 @@ void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev, /* * ThermalThrottle msg = throttle_bitmask(8): * thermal_interrupt_count(16): - * 16 bytes event + 1 byte space + 8 byte throttle_bitmask + + * 1 byte event + 1 byte space + 8 byte throttle_bitmask + * 1 byte : + 16 byte thermal_interupt_counter + 1 byte \n + - * 1 byte \0 = 44 + * 1 byte \0 = 29 */ - char fifo_in[44]; + char fifo_in[29]; int len; if (list_empty(&dev->smi_clients)) return; - len = snprintf(fifo_in, 44, "%x %x:%llx\n", + len = snprintf(fifo_in, 29, "%x %x:%llx\n", KFD_SMI_EVENT_THERMAL_THROTTLE, throttle_bitmask, atomic64_read(&adev->smu.throttle_int_counter)); - add_event_to_kfifo(dev, KFD_SMI_EVENT_THERMAL_THROTTLE, fifo_in, len); + add_event_to_kfifo(dev, KFD_SMI_EVENT_THERMAL_THROTTLE, fifo_in, len); } void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid) @@ -202,9 +203,10 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid) struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd; struct amdgpu_task_info task_info; /* VmFault msg = (hex)uint32_pid(8) + :(1) + task name(16) = 25 */ - /* 16 bytes event + 1 byte space + 25 bytes msg + 1 byte \n = 43 + /* 1 byte event + 1 byte space + 25 bytes msg + 1 byte \n + + * 1 byte \0 = 29 */ - char fifo_in[43]; + char fifo_in[29]; int len; if (list_empty(&dev->smi_clients)) @@ -216,7 +218,7 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid) if (!task_info.pid) return; - len = snprintf(fifo_in, 43, "%x %x:%s\n", KFD_SMI_EVENT_VMFAULT, + len = snprintf(fifo_in, 29, "%x %x:%s\n", KFD_SMI_EVENT_VMFAULT, task_info.pid, task_info.task_name); add_event_to_kfifo(dev, KFD_SMI_EVENT_VMFAULT, fifo_in, len); diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index df6c7a43aadc..cb1f963a84e0 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -449,9 +449,13 @@ struct kfd_ioctl_import_dmabuf_args { /* * KFD SMI(System Management Interface) events */ -/* Event type (defined by bitmask) */ -#define KFD_SMI_EVENT_VMFAULT 0x0000000000000001 -#define KFD_SMI_EVENT_THERMAL_THROTTLE 0x0000000000000002 +enum kfd_smi_event { + KFD_SMI_EVENT_NONE = 0, /* not used */ + KFD_SMI_EVENT_VMFAULT = 1, /* event start counting at 1 */ + KFD_SMI_EVENT_THERMAL_THROTTLE = 2, +}; + +#define KFD_SMI_EVENT_MASK_FROM_INDEX(i) (1ULL << ((i) - 1)) struct kfd_ioctl_smi_events_args { __u32 gpuid; /* to KFD */ -- cgit v1.2.3 From 55977744f9d862512a524fea93fc5226b09e76a9 Mon Sep 17 00:00:00 2001 From: Mukul Joshi Date: Fri, 28 Aug 2020 18:50:42 -0400 Subject: drm/amdkfd: Add GPU reset SMI event Add support for reporting GPU reset events through SMI. KFD would report both pre and post GPU reset events. Signed-off-by: Mukul Joshi Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 4 ++++ drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 ++ drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 34 +++++++++++++++++++++++++++-- drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h | 1 + include/uapi/linux/kfd_ioctl.h | 2 ++ 5 files changed, 41 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index e1cd6599529f..0e71a0543f98 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -812,6 +812,8 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd) if (!kfd->init_complete) return 0; + kfd_smi_event_update_gpu_reset(kfd, false); + kfd->dqm->ops.pre_reset(kfd->dqm); kgd2kfd_suspend(kfd, false); @@ -840,6 +842,8 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd) atomic_set(&kfd->sram_ecc_flag, 0); + kfd_smi_event_update_gpu_reset(kfd, true); + return 0; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index f14beb93acb4..023629f28495 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -312,6 +312,8 @@ struct kfd_dev { /* Clients watching SMI events */ struct list_head smi_clients; spinlock_t smi_lock; + + uint32_t reset_seq_num; }; enum kfd_mempool { diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c index 4d4b6e3ab697..17d1736367ea 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c @@ -174,6 +174,36 @@ static void add_event_to_kfifo(struct kfd_dev *dev, unsigned int smi_event, rcu_read_unlock(); } +void kfd_smi_event_update_gpu_reset(struct kfd_dev *dev, bool post_reset) +{ + /* + * GpuReset msg = Reset seq number (incremented for + * every reset message sent before GPU reset). + * 1 byte event + 1 byte space + 8 bytes seq num + + * 1 byte \n + 1 byte \0 = 12 + */ + char fifo_in[12]; + int len; + unsigned int event; + + if (list_empty(&dev->smi_clients)) + return; + + memset(fifo_in, 0x0, sizeof(fifo_in)); + + if (post_reset) { + event = KFD_SMI_EVENT_GPU_POST_RESET; + } else { + event = KFD_SMI_EVENT_GPU_PRE_RESET; + ++(dev->reset_seq_num); + } + + len = snprintf(fifo_in, sizeof(fifo_in), "%x %x\n", event, + dev->reset_seq_num); + + add_event_to_kfifo(dev, event, fifo_in, len); +} + void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev, uint32_t throttle_bitmask) { @@ -191,7 +221,7 @@ void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev, if (list_empty(&dev->smi_clients)) return; - len = snprintf(fifo_in, 29, "%x %x:%llx\n", + len = snprintf(fifo_in, sizeof(fifo_in), "%x %x:%llx\n", KFD_SMI_EVENT_THERMAL_THROTTLE, throttle_bitmask, atomic64_read(&adev->smu.throttle_int_counter)); @@ -218,7 +248,7 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid) if (!task_info.pid) return; - len = snprintf(fifo_in, 29, "%x %x:%s\n", KFD_SMI_EVENT_VMFAULT, + len = snprintf(fifo_in, sizeof(fifo_in), "%x %x:%s\n", KFD_SMI_EVENT_VMFAULT, task_info.pid, task_info.task_name); add_event_to_kfifo(dev, KFD_SMI_EVENT_VMFAULT, fifo_in, len); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h index 15537b2cccb5..b9b0438202e2 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h @@ -27,5 +27,6 @@ int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd); void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid); void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev, uint32_t throttle_bitmask); +void kfd_smi_event_update_gpu_reset(struct kfd_dev *dev, bool post_reset); #endif diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index cb1f963a84e0..8b7368bfbd84 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -453,6 +453,8 @@ enum kfd_smi_event { KFD_SMI_EVENT_NONE = 0, /* not used */ KFD_SMI_EVENT_VMFAULT = 1, /* event start counting at 1 */ KFD_SMI_EVENT_THERMAL_THROTTLE = 2, + KFD_SMI_EVENT_GPU_PRE_RESET = 3, + KFD_SMI_EVENT_GPU_POST_RESET = 4, }; #define KFD_SMI_EVENT_MASK_FROM_INDEX(i) (1ULL << ((i) - 1)) -- cgit v1.2.3 From 5dc1a0bcb758c343b873e8330ee986417f5a1727 Mon Sep 17 00:00:00 2001 From: Mukul Joshi Date: Fri, 28 Aug 2020 19:53:08 -0400 Subject: include/uapi/linux: Fix indentation in kfd_smi_event enum Replace spaces with Tabs to fix indentation in kfd_smi_event enum. Signed-off-by: Mukul Joshi Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- include/uapi/linux/kfd_ioctl.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 8b7368bfbd84..695b606da4b1 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -450,9 +450,9 @@ struct kfd_ioctl_import_dmabuf_args { * KFD SMI(System Management Interface) events */ enum kfd_smi_event { - KFD_SMI_EVENT_NONE = 0, /* not used */ - KFD_SMI_EVENT_VMFAULT = 1, /* event start counting at 1 */ - KFD_SMI_EVENT_THERMAL_THROTTLE = 2, + KFD_SMI_EVENT_NONE = 0, /* not used */ + KFD_SMI_EVENT_VMFAULT = 1, /* event start counting at 1 */ + KFD_SMI_EVENT_THERMAL_THROTTLE = 2, KFD_SMI_EVENT_GPU_PRE_RESET = 3, KFD_SMI_EVENT_GPU_POST_RESET = 4, }; -- cgit v1.2.3