From 1359fcbe0f4aa2cd6ea684727a5a111eebeeed3a Mon Sep 17 00:00:00 2001 From: Ohad Sharabi Date: Wed, 23 Feb 2022 11:47:05 +0200 Subject: habanalabs: add DRAM default page size to HW info When using the device memory allocation API the user ought to know what is the default allocation page size. Signed-off-by: Ohad Sharabi Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay Signed-off-by: Greg Kroah-Hartman --- include/uapi/misc/habanalabs.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi/misc') diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index 1d6b4f0c4159..ae2441521467 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -409,6 +409,7 @@ enum hl_server_type { * @dram_page_size: The DRAM physical page size. * @number_of_user_interrupts: The number of interrupts that are available to the userspace * application to use. Relevant for Gaudi2 and later. + * @device_mem_alloc_default_page_size: default page size used in device memory allocation. */ struct hl_info_hw_ip_info { __u64 sram_base_address; @@ -436,6 +437,8 @@ struct hl_info_hw_ip_info { __u32 reserved3; __u16 number_of_user_interrupts; __u16 pad2; + __u64 reserved4; + __u64 device_mem_alloc_default_page_size; }; struct hl_info_dram_usage { -- cgit v1.2.3 From 050a6f349a09d3cefb14f4114bfa047b2c5b2a65 Mon Sep 17 00:00:00 2001 From: Ohad Sharabi Date: Wed, 23 Feb 2022 13:37:08 +0200 Subject: habanalabs: add user API to get valid DRAM page sizes Future devices will support multiple device memory page sizes. In addition, an API for the user was added for it to be able to control the device memory allocation page size. This patch is a complementary patch to inform the user of the available page size supported by the device. Signed-off-by: Ohad Sharabi Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay Signed-off-by: Greg Kroah-Hartman --- drivers/misc/habanalabs/common/habanalabs.h | 2 + drivers/misc/habanalabs/common/habanalabs_ioctl.c | 24 +++++++++ drivers/misc/habanalabs/gaudi/gaudi.c | 7 +++ drivers/misc/habanalabs/goya/goya.c | 7 +++ include/uapi/misc/habanalabs.h | 63 +++++++++++++---------- 5 files changed, 77 insertions(+), 26 deletions(-) (limited to 'include/uapi/misc') diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index 6eb35e4124c2..564797766f42 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -1304,6 +1304,7 @@ struct fw_load_mgr { * @get_stream_master_qid_arr: get pointer to stream masters QID array * @is_valid_dram_page_size: return true if page size is supported in device * memory allocation, otherwise false. + * @get_valid_dram_page_orders: get valid device memory allocation page orders */ struct hl_asic_funcs { int (*early_init)(struct hl_device *hdev); @@ -1432,6 +1433,7 @@ struct hl_asic_funcs { bool (*is_valid_dram_page_size)(u32 page_size); int (*mmu_get_real_page_size)(struct hl_device *hdev, struct hl_mmu_properties *mmu_prop, u32 page_size, u32 *real_page_size, bool is_dram_addr); + void (*get_valid_dram_page_orders)(struct hl_info_dev_memalloc_page_sizes *info); }; diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c index 14c58579b9cd..c6fe35ae1238 100644 --- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c +++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c @@ -591,6 +591,27 @@ static int razwi_info(struct hl_fpriv *hpriv, struct hl_info_args *args) return copy_to_user(out, &info, min_t(size_t, max_size, sizeof(info))) ? -EFAULT : 0; } +static int dev_mem_alloc_page_sizes_info(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + struct hl_info_dev_memalloc_page_sizes info = {0}; + struct hl_device *hdev = hpriv->hdev; + u32 max_size = args->return_size; + + if ((!max_size) || (!out)) + return -EINVAL; + + /* + * Future ASICs that will support multiple DRAM page sizes will support only "powers of 2" + * pages (unlike some of the ASICs before supporting multiple page sizes). + * For this reason for all ASICs that not support multiple page size the function will + * return an empty bitmask indicating that multiple page sizes is not supported. + */ + hdev->asic_funcs->get_valid_dram_page_orders(&info); + + return copy_to_user(out, &info, min_t(size_t, max_size, sizeof(info))) ? -EFAULT : 0; +} + static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, struct device *dev) { @@ -641,6 +662,9 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, case HL_INFO_RAZWI_EVENT: return razwi_info(hpriv, args); + case HL_INFO_DEV_MEM_ALLOC_PAGE_SIZES: + return dev_mem_alloc_page_sizes_info(hpriv, args); + default: break; } diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 5979434d1905..64cb195bc26e 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -9378,6 +9378,12 @@ static u32 *gaudi_get_stream_master_qid_arr(void) return gaudi_stream_master; } +static void gaudi_get_valid_dram_page_orders(struct hl_info_dev_memalloc_page_sizes *info) +{ + /* set 0 since multiple pages are not supported */ + info->page_order_bitmask = 0; +} + static ssize_t infineon_ver_show(struct device *dev, struct device_attribute *attr, char *buf) { struct hl_device *hdev = dev_get_drvdata(dev); @@ -9489,6 +9495,7 @@ static const struct hl_asic_funcs gaudi_funcs = { .get_stream_master_qid_arr = gaudi_get_stream_master_qid_arr, .is_valid_dram_page_size = NULL, .mmu_get_real_page_size = hl_mmu_get_real_page_size, + .get_valid_dram_page_orders = gaudi_get_valid_dram_page_orders, }; /** diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c index ec347bd3bb69..b18288070754 100644 --- a/drivers/misc/habanalabs/goya/goya.c +++ b/drivers/misc/habanalabs/goya/goya.c @@ -5679,6 +5679,12 @@ static u32 *goya_get_stream_master_qid_arr(void) return NULL; } +static void goya_get_valid_dram_page_orders(struct hl_info_dev_memalloc_page_sizes *info) +{ + /* set 0 since multiple pages are not supported */ + info->page_order_bitmask = 0; +} + static const struct hl_asic_funcs goya_funcs = { .early_init = goya_early_init, .early_fini = goya_early_fini, @@ -5767,6 +5773,7 @@ static const struct hl_asic_funcs goya_funcs = { .get_stream_master_qid_arr = goya_get_stream_master_qid_arr, .is_valid_dram_page_size = NULL, .mmu_get_real_page_size = hl_mmu_get_real_page_size, + .get_valid_dram_page_orders = goya_get_valid_dram_page_orders, }; /* diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index ae2441521467..f474e7fb018d 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -348,33 +348,35 @@ enum hl_server_type { * The address which accessing it caused the razwi. * Razwi initiator. * Razwi cause, was it a page fault or MMU access error. + * HL_INFO_DEV_MEM_ALLOC_PAGE_SIZES - Retrieve valid page sizes for device memory allocation */ -#define HL_INFO_HW_IP_INFO 0 -#define HL_INFO_HW_EVENTS 1 -#define HL_INFO_DRAM_USAGE 2 -#define HL_INFO_HW_IDLE 3 -#define HL_INFO_DEVICE_STATUS 4 -#define HL_INFO_DEVICE_UTILIZATION 6 -#define HL_INFO_HW_EVENTS_AGGREGATE 7 -#define HL_INFO_CLK_RATE 8 -#define HL_INFO_RESET_COUNT 9 -#define HL_INFO_TIME_SYNC 10 -#define HL_INFO_CS_COUNTERS 11 -#define HL_INFO_PCI_COUNTERS 12 -#define HL_INFO_CLK_THROTTLE_REASON 13 -#define HL_INFO_SYNC_MANAGER 14 -#define HL_INFO_TOTAL_ENERGY 15 -#define HL_INFO_PLL_FREQUENCY 16 -#define HL_INFO_POWER 17 -#define HL_INFO_OPEN_STATS 18 -#define HL_INFO_DRAM_REPLACED_ROWS 21 -#define HL_INFO_DRAM_PENDING_ROWS 22 -#define HL_INFO_LAST_ERR_OPEN_DEV_TIME 23 -#define HL_INFO_CS_TIMEOUT_EVENT 24 -#define HL_INFO_RAZWI_EVENT 25 - -#define HL_INFO_VERSION_MAX_LEN 128 -#define HL_INFO_CARD_NAME_MAX_LEN 16 +#define HL_INFO_HW_IP_INFO 0 +#define HL_INFO_HW_EVENTS 1 +#define HL_INFO_DRAM_USAGE 2 +#define HL_INFO_HW_IDLE 3 +#define HL_INFO_DEVICE_STATUS 4 +#define HL_INFO_DEVICE_UTILIZATION 6 +#define HL_INFO_HW_EVENTS_AGGREGATE 7 +#define HL_INFO_CLK_RATE 8 +#define HL_INFO_RESET_COUNT 9 +#define HL_INFO_TIME_SYNC 10 +#define HL_INFO_CS_COUNTERS 11 +#define HL_INFO_PCI_COUNTERS 12 +#define HL_INFO_CLK_THROTTLE_REASON 13 +#define HL_INFO_SYNC_MANAGER 14 +#define HL_INFO_TOTAL_ENERGY 15 +#define HL_INFO_PLL_FREQUENCY 16 +#define HL_INFO_POWER 17 +#define HL_INFO_OPEN_STATS 18 +#define HL_INFO_DRAM_REPLACED_ROWS 21 +#define HL_INFO_DRAM_PENDING_ROWS 22 +#define HL_INFO_LAST_ERR_OPEN_DEV_TIME 23 +#define HL_INFO_CS_TIMEOUT_EVENT 24 +#define HL_INFO_RAZWI_EVENT 25 +#define HL_INFO_DEV_MEM_ALLOC_PAGE_SIZES 26 + +#define HL_INFO_VERSION_MAX_LEN 128 +#define HL_INFO_CARD_NAME_MAX_LEN 16 /** * struct hl_info_hw_ip_info - hardware information on various IPs in the ASIC @@ -643,6 +645,15 @@ struct hl_info_razwi_event { __u8 pad[2]; }; +/** + * struct hl_info_dev_memalloc_page_sizes - valid page sizes in device mem alloc information. + * @page_order_bitmask: bitmap in which a set bit represents the order of the supported page size + * (e.g. 0x2100000 means that 1MB and 32MB pages are supported). + */ +struct hl_info_dev_memalloc_page_sizes { + __u64 page_order_bitmask; +}; + enum gaudi_dcores { HL_GAUDI_WS_DCORE, HL_GAUDI_WN_DCORE, -- cgit v1.2.3 From fdec56c1a416c6947b1db22617da15cb89f46c6c Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Thu, 3 Mar 2022 09:43:10 +0200 Subject: habanalabs: expose compute ctx status through info ioctl In order for the user to know if he can try and open device, we expose the compute ctx state. The user can now know if the context is used by another process or whether the device is still ongoing through cleanup or reset and will be available soon. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay Signed-off-by: Greg Kroah-Hartman --- drivers/misc/habanalabs/common/habanalabs_ioctl.c | 2 ++ include/uapi/misc/habanalabs.h | 5 +++++ 2 files changed, 7 insertions(+) (limited to 'include/uapi/misc') diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c index c6fe35ae1238..bfb5cfe68110 100644 --- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c +++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c @@ -498,6 +498,8 @@ static int open_stats_info(struct hl_fpriv *hpriv, struct hl_info_args *args) open_stats_info.last_open_period_ms = jiffies64_to_msecs( hdev->last_open_session_duration_jif); open_stats_info.open_counter = hdev->open_counter; + open_stats_info.is_compute_ctx_active = hdev->is_compute_ctx_active; + open_stats_info.compute_ctx_in_release = hdev->compute_ctx_in_release; return copy_to_user(out, &open_stats_info, min((size_t) max_size, sizeof(open_stats_info))) ? -EFAULT : 0; diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index f474e7fb018d..ca2af5f98056 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -543,10 +543,15 @@ struct hl_pll_frequency_info { * struct hl_open_stats_info - device open statistics information * @open_counter: ever growing counter, increased on each successful dev open * @last_open_period_ms: duration (ms) device was open last time + * @is_compute_ctx_active: Whether there is an active compute context executing + * @compute_ctx_in_release: true if the current compute context is being released */ struct hl_open_stats_info { __u64 open_counter; __u64 last_open_period_ms; + __u8 is_compute_ctx_active; + __u8 compute_ctx_in_release; + __u8 pad[6]; }; /** -- cgit v1.2.3 From 5d1a0de2c778f369970dd50f6713e95068926a8b Mon Sep 17 00:00:00 2001 From: Ohad Sharabi Date: Sun, 10 Apr 2022 11:19:42 +0300 Subject: habanalabs: add prefetch flag to the MAP operation This patch let the user decide whether the translations done in the page tables will be fetched directly to the STLB right after the map. We want to let the user control whether to perform prefetch upon map operation. To do so a memory flag was added, to be used in the MAP ioctl, called HL_MEM_PREFETCH and if set- the mappings will be fetched directly to the STLB after map operation. Signed-off-by: Ohad Sharabi Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay Signed-off-by: Greg Kroah-Hartman --- drivers/misc/habanalabs/common/memory.c | 11 ++++++----- include/uapi/misc/habanalabs.h | 1 + 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include/uapi/misc') diff --git a/drivers/misc/habanalabs/common/memory.c b/drivers/misc/habanalabs/common/memory.c index 326c2179628f..6face45c57e3 100644 --- a/drivers/misc/habanalabs/common/memory.c +++ b/drivers/misc/habanalabs/common/memory.c @@ -1250,11 +1250,12 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args, if (rc) goto map_err; - /* already prefetch the relevant translations to the cache */ - rc = hl_mmu_prefetch_cache_range(hdev, *vm_type, ctx->asid, ret_vaddr, - phys_pg_pack->total_size); - if (rc) - goto map_err; + if (args->flags & HL_MEM_PREFETCH) { + rc = hl_mmu_prefetch_cache_range(hdev, *vm_type, ctx->asid, ret_vaddr, + phys_pg_pack->total_size); + if (rc) + goto map_err; + } mutex_unlock(&ctx->mmu_lock); diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index ca2af5f98056..3576bf2b4841 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -1134,6 +1134,7 @@ union hl_wait_cs_args { #define HL_MEM_SHARED 0x2 #define HL_MEM_USERPTR 0x4 #define HL_MEM_FORCE_HINT 0x8 +#define HL_MEM_PREFETCH 0x40 /** * structure hl_mem_in - structure that handle input args for memory IOCTL -- cgit v1.2.3 From 422ef171038d4855ffe938137039a8f3b3e84293 Mon Sep 17 00:00:00 2001 From: Tal Cohen Date: Thu, 28 Apr 2022 13:45:18 +0300 Subject: habanalabs: add support for notification via eventfd The driver will be able to send notification events towards a user process, using user's registered event file descriptor. The driver uses the notification mechanism to inform the user about an occurred event. A user thread can wait until a notification is received from the driver. The driver stores the occurred event until the user reads it, using HL_INFO_GET_EVENTS - new ioctl opcode in the INFO ioctl. Gaudi specific implementation includes sending a notification on a TPC assertion event that is received from f/w. Signed-off-by: Tal Cohen Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay Signed-off-by: Greg Kroah-Hartman --- drivers/misc/habanalabs/common/device.c | 52 ++++++++++++++++++ drivers/misc/habanalabs/common/habanalabs.h | 40 +++++++++----- drivers/misc/habanalabs/common/habanalabs_drv.c | 9 ++++ drivers/misc/habanalabs/common/habanalabs_ioctl.c | 65 +++++++++++++++++++++++ drivers/misc/habanalabs/gaudi/gaudi.c | 14 ++++- include/uapi/misc/habanalabs.h | 15 ++++++ 6 files changed, 182 insertions(+), 13 deletions(-) (limited to 'include/uapi/misc') diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index 15df89b31e1b..315510aaca35 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -285,6 +285,14 @@ static void hpriv_release(struct kref *ref) hdev->compute_ctx_in_release = 0; + /* release the eventfd */ + if (hpriv->notifier_event.eventfd) { + eventfd_ctx_put(hpriv->notifier_event.eventfd); + hpriv->notifier_event.eventfd = 0; + } + + mutex_destroy(&hpriv->notifier_event.lock); + kfree(hpriv); } @@ -355,6 +363,13 @@ static int hl_device_release_ctrl(struct inode *inode, struct file *filp) list_del(&hpriv->dev_node); mutex_unlock(&hdev->fpriv_ctrl_list_lock); out: + /* release the eventfd */ + if (hpriv->notifier_event.eventfd) { + eventfd_ctx_put(hpriv->notifier_event.eventfd); + hpriv->notifier_event.eventfd = 0; + } + + mutex_destroy(&hpriv->notifier_event.lock); put_pid(hpriv->taskpid); kfree(hpriv); @@ -1506,6 +1521,43 @@ out_err: return rc; } +static void hl_notifier_event_send(struct hl_notifier_event *notifier_event, u64 event) +{ + mutex_lock(¬ifier_event->lock); + notifier_event->events_mask |= event; + if (notifier_event->eventfd) + eventfd_signal(notifier_event->eventfd, 1); + + mutex_unlock(¬ifier_event->lock); +} + +/* + * hl_notifier_event_send_all - notify all user processes via eventfd + * + * @hdev: pointer to habanalabs device structure + * @event: the occurred event + * Returns 0 for success or an error on failure. + */ +void hl_notifier_event_send_all(struct hl_device *hdev, u64 event) +{ + struct hl_fpriv *hpriv; + + mutex_lock(&hdev->fpriv_list_lock); + + list_for_each_entry(hpriv, &hdev->fpriv_list, dev_node) + hl_notifier_event_send(&hpriv->notifier_event, event); + + mutex_unlock(&hdev->fpriv_list_lock); + + /* control device */ + mutex_lock(&hdev->fpriv_ctrl_list_lock); + + list_for_each_entry(hpriv, &hdev->fpriv_ctrl_list, dev_node) + hl_notifier_event_send(&hpriv->notifier_event, event); + + mutex_unlock(&hdev->fpriv_ctrl_list_lock); +} + /* * hl_device_init - main initialization function for habanalabs device * diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index 918e8a04acab..8977ec67dba7 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -1932,6 +1933,18 @@ struct hl_debug_params { bool enable; }; +/** + * struct hl_notifier_event - holds the notifier data structure + * @eventfd: the event file descriptor to raise the notifications + * @lock: mutex lock to protect the notifier data flows + * @events_mask: indicates the bitmap events + */ +struct hl_notifier_event { + struct eventfd_ctx *eventfd; + struct mutex lock; + u64 events_mask; +}; + /* * FILE PRIVATE STRUCTURE */ @@ -1943,24 +1956,25 @@ struct hl_debug_params { * @taskpid: current process ID. * @ctx: current executing context. TODO: remove for multiple ctx per process * @ctx_mgr: context manager to handle multiple context for this FD. - * @cb_mgr: command buffer manager to handle multiple buffers for this FD. * @mem_mgr: manager descriptor for memory exportable via mmap + * @notifier_event: notifier eventfd towards user process * @debugfs_list: list of relevant ASIC debugfs. * @dev_node: node in the device list of file private data * @refcount: number of related contexts. * @restore_phase_mutex: lock for context switch and restore phase. */ struct hl_fpriv { - struct hl_device *hdev; - struct file *filp; - struct pid *taskpid; - struct hl_ctx *ctx; - struct hl_ctx_mgr ctx_mgr; - struct hl_mem_mgr mem_mgr; - struct list_head debugfs_list; - struct list_head dev_node; - struct kref refcount; - struct mutex restore_phase_mutex; + struct hl_device *hdev; + struct file *filp; + struct pid *taskpid; + struct hl_ctx *ctx; + struct hl_ctx_mgr ctx_mgr; + struct hl_mem_mgr mem_mgr; + struct hl_notifier_event notifier_event; + struct list_head debugfs_list; + struct list_head dev_node; + struct kref refcount; + struct mutex restore_phase_mutex; }; @@ -2676,8 +2690,8 @@ struct hl_reset_info { * @state_dump_specs: constants and dictionaries needed to dump system state. * @multi_cs_completion: array of multi-CS completion. * @clk_throttling: holds information about current/previous clock throttling events - * @reset_info: holds current device reset information. * @last_error: holds information about last session in which CS timeout or razwi error occurred. + * @reset_info: holds current device reset information. * @stream_master_qid_arr: pointer to array with QIDs of master streams. * @fw_major_version: major version of current loaded preboot * @dram_used_mem: current DRAM memory consumption. @@ -3071,6 +3085,8 @@ int hl_device_utilization(struct hl_device *hdev, u32 *utilization); int hl_build_hwmon_channel_info(struct hl_device *hdev, struct cpucp_sensor *sensors_arr); +void hl_notifier_event_send_all(struct hl_device *hdev, u64 event); + int hl_sysfs_init(struct hl_device *hdev); void hl_sysfs_fini(struct hl_device *hdev); diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c index 1210de39d661..c97173e9507d 100644 --- a/drivers/misc/habanalabs/common/habanalabs_drv.c +++ b/drivers/misc/habanalabs/common/habanalabs_drv.c @@ -134,6 +134,10 @@ int hl_device_open(struct inode *inode, struct file *filp) hpriv->hdev = hdev; filp->private_data = hpriv; hpriv->filp = filp; + hpriv->notifier_event.events_mask = 0; + hpriv->notifier_event.eventfd = 0; + + mutex_init(&hpriv->notifier_event.lock); mutex_init(&hpriv->restore_phase_mutex); kref_init(&hpriv->refcount); nonseekable_open(inode, filp); @@ -208,6 +212,7 @@ out_err: hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr); filp->private_data = NULL; mutex_destroy(&hpriv->restore_phase_mutex); + mutex_destroy(&hpriv->notifier_event.lock); put_pid(hpriv->taskpid); kfree(hpriv); @@ -241,6 +246,10 @@ int hl_device_open_ctrl(struct inode *inode, struct file *filp) hpriv->hdev = hdev; filp->private_data = hpriv; hpriv->filp = filp; + hpriv->notifier_event.events_mask = 0; + hpriv->notifier_event.eventfd = 0; + + mutex_init(&hpriv->notifier_event.lock); nonseekable_open(inode, filp); hpriv->taskpid = get_task_pid(current, PIDTYPE_PID); diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c index bfb5cfe68110..d1ef56a8d3ac 100644 --- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c +++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c @@ -116,6 +116,25 @@ static int hw_events_info(struct hl_device *hdev, bool aggregate, return copy_to_user(out, arr, min(max_size, size)) ? -EFAULT : 0; } +static int events_info(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + int rc; + u32 max_size = args->return_size; + u64 events_mask; + void __user *out = (void __user *) (uintptr_t) args->return_pointer; + + if ((max_size < sizeof(u64)) || (!out)) + return -EINVAL; + + mutex_lock(&hpriv->notifier_event.lock); + events_mask = hpriv->notifier_event.events_mask; + hpriv->notifier_event.events_mask = 0; + mutex_unlock(&hpriv->notifier_event.lock); + + rc = copy_to_user(out, &events_mask, sizeof(u64)); + return rc; +} + static int dram_usage_info(struct hl_fpriv *hpriv, struct hl_info_args *args) { struct hl_device *hdev = hpriv->hdev; @@ -614,6 +633,43 @@ static int dev_mem_alloc_page_sizes_info(struct hl_fpriv *hpriv, struct hl_info_ return copy_to_user(out, &info, min_t(size_t, max_size, sizeof(info))) ? -EFAULT : 0; } +static int eventfd_register(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + int rc; + + /* check if there is already a registered on that process */ + mutex_lock(&hpriv->notifier_event.lock); + if (hpriv->notifier_event.eventfd) { + mutex_unlock(&hpriv->notifier_event.lock); + return -EINVAL; + } + + hpriv->notifier_event.eventfd = eventfd_ctx_fdget(args->eventfd); + if (IS_ERR(hpriv->notifier_event.eventfd)) { + rc = PTR_ERR(hpriv->notifier_event.eventfd); + hpriv->notifier_event.eventfd = 0; + mutex_unlock(&hpriv->notifier_event.lock); + return rc; + } + + mutex_unlock(&hpriv->notifier_event.lock); + return 0; +} + +static int eventfd_unregister(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + mutex_lock(&hpriv->notifier_event.lock); + if (!hpriv->notifier_event.eventfd) { + mutex_unlock(&hpriv->notifier_event.lock); + return -EINVAL; + } + + eventfd_ctx_put(hpriv->notifier_event.eventfd); + hpriv->notifier_event.eventfd = 0; + mutex_unlock(&hpriv->notifier_event.lock); + return 0; +} + static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, struct device *dev) { @@ -667,6 +723,9 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, case HL_INFO_DEV_MEM_ALLOC_PAGE_SIZES: return dev_mem_alloc_page_sizes_info(hpriv, args); + case HL_INFO_GET_EVENTS: + return events_info(hpriv, args); + default: break; } @@ -717,6 +776,12 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, case HL_INFO_DRAM_PENDING_ROWS: return dram_pending_rows_info(hpriv, args); + case HL_INFO_REGISTER_EVENTFD: + return eventfd_register(hpriv, args); + + case HL_INFO_UNREGISTER_EVENTFD: + return eventfd_unregister(hpriv, args); + default: dev_err(dev, "Invalid request %d\n", args->op); rc = -EINVAL; diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 08cd60300b4f..1c388537de33 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -7879,7 +7879,6 @@ static void gaudi_handle_eqe(struct hl_device *hdev, case GAUDI_EVENT_MMU_PAGE_FAULT: case GAUDI_EVENT_MMU_WR_PERM: case GAUDI_EVENT_RAZWI_OR_ADC: - case GAUDI_EVENT_TPC0_QM ... GAUDI_EVENT_TPC7_QM: case GAUDI_EVENT_MME0_QM ... GAUDI_EVENT_MME2_QM: case GAUDI_EVENT_DMA0_QM ... GAUDI_EVENT_DMA7_QM: fallthrough; @@ -7899,6 +7898,19 @@ static void gaudi_handle_eqe(struct hl_device *hdev, hl_fw_unmask_irq(hdev, event_type); break; + case GAUDI_EVENT_TPC0_QM ... GAUDI_EVENT_TPC7_QM: + gaudi_print_irq_info(hdev, event_type, true); + gaudi_handle_qman_err(hdev, event_type); + hl_fw_unmask_irq(hdev, event_type); + + /* In TPC QM event, notify on TPC assertion. While there isn't + * a specific event for assertion yet, the FW generates QM event. + * The SW upper layer will inspect an internal mapped area to indicate + * if the event is a tpc assertion or tpc QM. + */ + hl_notifier_event_send_all(hdev, HL_NOTIFIER_EVENT_TPC_ASSERT); + break; + case GAUDI_EVENT_RAZWI_OR_ADC_SW: gaudi_print_irq_info(hdev, event_type, true); goto reset_device; diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index 3576bf2b4841..52540d5b4fc9 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -349,6 +349,9 @@ enum hl_server_type { * Razwi initiator. * Razwi cause, was it a page fault or MMU access error. * HL_INFO_DEV_MEM_ALLOC_PAGE_SIZES - Retrieve valid page sizes for device memory allocation + * HL_INFO_REGISTER_EVENTFD - Register eventfd for event notifications. + * HL_INFO_UNREGISTER_EVENTFD - Unregister eventfd + * HL_INFO_GET_EVENTS - Retrieve the last occurred events */ #define HL_INFO_HW_IP_INFO 0 #define HL_INFO_HW_EVENTS 1 @@ -374,6 +377,9 @@ enum hl_server_type { #define HL_INFO_CS_TIMEOUT_EVENT 24 #define HL_INFO_RAZWI_EVENT 25 #define HL_INFO_DEV_MEM_ALLOC_PAGE_SIZES 26 +#define HL_INFO_REGISTER_EVENTFD 28 +#define HL_INFO_UNREGISTER_EVENTFD 29 +#define HL_INFO_GET_EVENTS 30 #define HL_INFO_VERSION_MAX_LEN 128 #define HL_INFO_CARD_NAME_MAX_LEN 16 @@ -679,6 +685,7 @@ enum gaudi_dcores { * @period_ms: Period value, in milliseconds, for utilization rate in range 100ms - 1000ms in 100 ms * resolution. Currently not in use. * @pll_index: Index as defined in hl__pll_index enumeration. + * @eventfd: event file descriptor for event notifications. * @pad: Padding to 64 bit. */ struct hl_info_args { @@ -691,6 +698,7 @@ struct hl_info_args { __u32 ctx_id; __u32 period_ms; __u32 pll_index; + __u32 eventfd; }; __u32 pad; @@ -1390,6 +1398,13 @@ struct hl_debug_args { __u32 ctx_id; }; +/* + * Notifier event values - for the notification mechanism and the HL_INFO_GET_EVENTS command + * + * HL_NOTIFIER_EVENT_TPC_ASSERT - Indicates TPC assert event + */ +#define HL_NOTIFIER_EVENT_TPC_ASSERT (1 << 0) + /* * Various information operations such as: * - H/W IP information -- cgit v1.2.3