From 158350aae16a4e666e261da0bb4d91c0601a3edd Mon Sep 17 00:00:00 2001 From: Simon Ser Date: Thu, 16 Feb 2023 13:09:45 +0000 Subject: drm: document DRM_IOCTL_GEM_CLOSE This is a bit tricky, because of the ref'counting considerations. See also [1] for more discussion about this topic. Since this is kernel docs, I've decided to elaborate a bit less on the user-space details. [1]: https://gitlab.freedesktop.org/mesa/drm/-/merge_requests/110 Signed-off-by: Simon Ser Cc: Daniel Vetter Cc: Pekka Paalanen Cc: Daniel Stone Reviewed-by: Daniel Vetter Acked-by: Pekka Paalanen Link: https://patchwork.freedesktop.org/patch/msgid/20230216130934.156541-2-contact@emersion.fr --- include/uapi/drm/drm.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/drm/drm.h b/include/uapi/drm/drm.h index 642808520d92..c39fefb54613 100644 --- a/include/uapi/drm/drm.h +++ b/include/uapi/drm/drm.h @@ -972,6 +972,19 @@ extern "C" { #define DRM_IOCTL_GET_STATS DRM_IOR( 0x06, struct drm_stats) #define DRM_IOCTL_SET_VERSION DRM_IOWR(0x07, struct drm_set_version) #define DRM_IOCTL_MODESET_CTL DRM_IOW(0x08, struct drm_modeset_ctl) +/** + * DRM_IOCTL_GEM_CLOSE - Close a GEM handle. + * + * GEM handles are not reference-counted by the kernel. User-space is + * responsible for managing their lifetime. For example, if user-space imports + * the same memory object twice on the same DRM file description, the same GEM + * handle is returned by both imports, and user-space needs to ensure + * &DRM_IOCTL_GEM_CLOSE is performed once only. The same situation can happen + * when a memory object is allocated, then exported and imported again on the + * same DRM file description. The &DRM_IOCTL_MODE_GETFB2 IOCTL is an exception + * and always returns fresh new GEM handles even if an existing GEM handle + * already refers to the same memory object before the IOCTL is performed. + */ #define DRM_IOCTL_GEM_CLOSE DRM_IOW (0x09, struct drm_gem_close) #define DRM_IOCTL_GEM_FLINK DRM_IOWR(0x0a, struct drm_gem_flink) #define DRM_IOCTL_GEM_OPEN DRM_IOWR(0x0b, struct drm_gem_open) -- cgit v1.2.3 From 61a55f8b1ebcde68bc7bfd57435b1b3bb1121b8f Mon Sep 17 00:00:00 2001 From: Simon Ser Date: Fri, 17 Feb 2023 16:22:01 +0000 Subject: drm: document expectations for GETFB2 handles There are two important details missing from the docs: - If the memory object backing the FB already has a GEM handle, it's not re-used, a new one is generated. - Aliased planes will return the same GEM handle. v2: document how user-space can obtain DMA-BUF FDs without leaking handles (Pekka) Signed-off-by: Simon Ser Cc: Daniel Vetter Cc: Pekka Paalanen Cc: Daniel Stone Acked-by: Pekka Paalanen Reviewed-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20230217162151.59996-1-contact@emersion.fr --- include/uapi/drm/drm.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/drm/drm.h b/include/uapi/drm/drm.h index c39fefb54613..292e4778a2f4 100644 --- a/include/uapi/drm/drm.h +++ b/include/uapi/drm/drm.h @@ -1117,8 +1117,13 @@ extern "C" { * struct as the output. * * If the client is DRM master or has &CAP_SYS_ADMIN, &drm_mode_fb_cmd2.handles - * will be filled with GEM buffer handles. Planes are valid until one has a - * zero handle -- this can be used to compute the number of planes. + * will be filled with GEM buffer handles. 
Fresh new GEM handles are always + * returned, even if another GEM handle referring to the same memory object + * already exists on the DRM file description. The caller is responsible for + * removing the new handles, e.g. via the &DRM_IOCTL_GEM_CLOSE IOCTL. The same + * new handle will be returned for multiple planes in case they use the same + * memory object. Planes are valid until one has a zero handle -- this can be + * used to compute the number of planes. * * Otherwise, &drm_mode_fb_cmd2.handles will be zeroed and planes are valid * until one has a zero &drm_mode_fb_cmd2.pitches. @@ -1126,6 +1131,11 @@ extern "C" { * If the framebuffer has a format modifier, &DRM_MODE_FB_MODIFIERS will be set * in &drm_mode_fb_cmd2.flags and &drm_mode_fb_cmd2.modifier will contain the * modifier. Otherwise, user-space must ignore &drm_mode_fb_cmd2.modifier. + * + * To obtain DMA-BUF FDs for each plane without leaking GEM handles, user-space + * can export each handle via &DRM_IOCTL_PRIME_HANDLE_TO_FD, then immediately + * close each unique handle via &DRM_IOCTL_GEM_CLOSE, making sure to not + * double-close handles which are specified multiple times in the array. */ #define DRM_IOCTL_MODE_GETFB2 DRM_IOWR(0xCE, struct drm_mode_fb_cmd2) -- cgit v1.2.3 From 6068771673a38efc31ed5b99645176b4d3a33129 Mon Sep 17 00:00:00 2001 From: Simon Ser Date: Fri, 17 Feb 2023 16:22:04 +0000 Subject: drm: document DRM_IOCTL_PRIME_HANDLE_TO_FD and PRIME_FD_TO_HANDLE v2: mention caps, note that the IOCTLs might fail, document that user-space needs a data structure to keep track of the handles (Daniel V.) Signed-off-by: Simon Ser Cc: Daniel Vetter Cc: Pekka Paalanen Cc: Daniel Stone Reviewed-by: Daniel Vetter Acked-by: Pekka Paalanen Link: https://patchwork.freedesktop.org/patch/msgid/20230217162151.59996-2-contact@emersion.fr --- include/uapi/drm/drm.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/drm/drm.h b/include/uapi/drm/drm.h index 292e4778a2f4..a87bbbbca2d4 100644 --- a/include/uapi/drm/drm.h +++ b/include/uapi/drm/drm.h @@ -1025,7 +1025,37 @@ extern "C" { #define DRM_IOCTL_UNLOCK DRM_IOW( 0x2b, struct drm_lock) #define DRM_IOCTL_FINISH DRM_IOW( 0x2c, struct drm_lock) +/** + * DRM_IOCTL_PRIME_HANDLE_TO_FD - Convert a GEM handle to a DMA-BUF FD. + * + * User-space sets &drm_prime_handle.handle with the GEM handle to export and + * &drm_prime_handle.flags, and gets back a DMA-BUF file descriptor in + * &drm_prime_handle.fd. + * + * The export can fail for any driver-specific reason, e.g. because export is + * not supported for this specific GEM handle (but might be for others). + * + * Support for exporting DMA-BUFs is advertised via &DRM_PRIME_CAP_EXPORT. + */ #define DRM_IOCTL_PRIME_HANDLE_TO_FD DRM_IOWR(0x2d, struct drm_prime_handle) +/** + * DRM_IOCTL_PRIME_FD_TO_HANDLE - Convert a DMA-BUF FD to a GEM handle. + * + * User-space sets &drm_prime_handle.fd with a DMA-BUF file descriptor to + * import, and gets back a GEM handle in &drm_prime_handle.handle. + * &drm_prime_handle.flags is unused. + * + * If an existing GEM handle refers to the memory object backing the DMA-BUF, + * that GEM handle is returned. Therefore user-space which needs to handle + * arbitrary DMA-BUFs must have a user-space lookup data structure to manually + * reference-count duplicated GEM handles. For more information see + * &DRM_IOCTL_GEM_CLOSE. + * + * The import can fail for any driver-specific reason, e.g. 
because import is + * only supported for DMA-BUFs allocated on this DRM device. + * + * Support for importing DMA-BUFs is advertised via &DRM_PRIME_CAP_IMPORT. + */ #define DRM_IOCTL_PRIME_FD_TO_HANDLE DRM_IOWR(0x2e, struct drm_prime_handle) #define DRM_IOCTL_AGP_ACQUIRE DRM_IO( 0x30) -- cgit v1.2.3 From fd234e7581162573742dfb8cc4dc0af3d3148138 Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Tue, 16 Nov 2021 23:15:55 -0500 Subject: drm/amdkfd: Implement DMA buf fd export from KFD MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Exports a DMA buf fd of a given KFD buffer handle. This is intended for being able to import KFD BOs into GEM contexts to leverage the amdgpu_bo_va API for more flexible virtual address mappings. It will also be used for the new upstreamable RDMA solution coming to UCX and RCCL. The corresponding user mode change (Thunk API and kfdtest) is here: https://github.com/fxkamd/ROCT-Thunk-Interface/commits/fxkamd/dmabuf Signed-off-by: Felix Kuehling Acked-by: Christian König Reviewed-by: Xiaogang Chen Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 2 + drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 45 ++++++++++++++----- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 55 ++++++++++++++++++++++++ include/uapi/linux/kfd_ioctl.h | 14 +++++- 4 files changed, 104 insertions(+), 12 deletions(-) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index 333780491867..01ba3589b60a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -308,6 +308,8 @@ int amdgpu_amdkfd_gpuvm_import_dmabuf(struct amdgpu_device *adev, uint64_t va, void *drm_priv, struct kgd_mem **mem, uint64_t *size, uint64_t *mmap_offset); +int amdgpu_amdkfd_gpuvm_export_dmabuf(struct kgd_mem *mem, + struct dma_buf **dmabuf); int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev, struct tile_config *config); void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 13d88fb92544..a4ee9f0378c1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -711,6 +711,21 @@ kfd_mem_dmaunmap_attachment(struct kgd_mem *mem, } } +static int kfd_mem_export_dmabuf(struct kgd_mem *mem) +{ + if (!mem->dmabuf) { + struct dma_buf *ret = amdgpu_gem_prime_export( + &mem->bo->tbo.base, + mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE ? + DRM_RDWR : 0); + if (IS_ERR(ret)) + return PTR_ERR(ret); + mem->dmabuf = ret; + } + + return 0; +} + static int kfd_mem_attach_dmabuf(struct amdgpu_device *adev, struct kgd_mem *mem, struct amdgpu_bo **bo) @@ -718,16 +733,9 @@ kfd_mem_attach_dmabuf(struct amdgpu_device *adev, struct kgd_mem *mem, struct drm_gem_object *gobj; int ret; - if (!mem->dmabuf) { - mem->dmabuf = amdgpu_gem_prime_export(&mem->bo->tbo.base, - mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE ? 
- DRM_RDWR : 0); - if (IS_ERR(mem->dmabuf)) { - ret = PTR_ERR(mem->dmabuf); - mem->dmabuf = NULL; - return ret; - } - } + ret = kfd_mem_export_dmabuf(mem); + if (ret) + return ret; gobj = amdgpu_gem_prime_import(adev_to_drm(adev), mem->dmabuf); if (IS_ERR(gobj)) @@ -2268,6 +2276,23 @@ err_put_obj: return ret; } +int amdgpu_amdkfd_gpuvm_export_dmabuf(struct kgd_mem *mem, + struct dma_buf **dma_buf) +{ + int ret; + + mutex_lock(&mem->lock); + ret = kfd_mem_export_dmabuf(mem); + if (ret) + goto out; + + get_dma_buf(mem->dmabuf); + *dma_buf = mem->dmabuf; +out: + mutex_unlock(&mem->lock); + return ret; +} + /* Evict a userptr BO by stopping the queues if necessary * * Runs in MMU notifier, may be in RECLAIM_FS context. This means it diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 072fa4fbd27f..9e08e8bf7b81 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -1586,6 +1586,58 @@ err_unlock: return r; } +static int kfd_ioctl_export_dmabuf(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_export_dmabuf_args *args = data; + struct kfd_process_device *pdd; + struct dma_buf *dmabuf; + struct kfd_dev *dev; + void *mem; + int ret = 0; + + dev = kfd_device_by_id(GET_GPU_ID(args->handle)); + if (!dev) + return -EINVAL; + + mutex_lock(&p->mutex); + + pdd = kfd_get_process_device_data(dev, p); + if (!pdd) { + ret = -EINVAL; + goto err_unlock; + } + + mem = kfd_process_device_translate_handle(pdd, + GET_IDR_HANDLE(args->handle)); + if (!mem) { + ret = -EINVAL; + goto err_unlock; + } + + ret = amdgpu_amdkfd_gpuvm_export_dmabuf(mem, &dmabuf); + mutex_unlock(&p->mutex); + if (ret) + goto err_out; + + ret = dma_buf_fd(dmabuf, args->flags); + if (ret < 0) { + dma_buf_put(dmabuf); + goto err_out; + } + /* dma_buf_fd assigns the reference count to the fd, no need to + * put the reference here. 
+ */ + args->dmabuf_fd = ret; + + return 0; + +err_unlock: + mutex_unlock(&p->mutex); +err_out: + return ret; +} + /* Handle requests for watching SMI events */ static int kfd_ioctl_smi_events(struct file *filep, struct kfd_process *p, void *data) @@ -2768,6 +2820,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = { AMDKFD_IOCTL_DEF(AMDKFD_IOC_AVAILABLE_MEMORY, kfd_ioctl_get_available_memory, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_EXPORT_DMABUF, + kfd_ioctl_export_dmabuf, 0), }; #define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls) diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 42b60198b6c5..2da5c3ad71bd 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -37,9 +37,10 @@ * - 1.9 - Add available memory ioctl * - 1.10 - Add SMI profiler event log * - 1.11 - Add unified memory for ctx save/restore area + * - 1.12 - Add DMA buf export ioctl */ #define KFD_IOCTL_MAJOR_VERSION 1 -#define KFD_IOCTL_MINOR_VERSION 11 +#define KFD_IOCTL_MINOR_VERSION 12 struct kfd_ioctl_get_version_args { __u32 major_version; /* from KFD */ @@ -463,6 +464,12 @@ struct kfd_ioctl_import_dmabuf_args { __u32 dmabuf_fd; /* to KFD */ }; +struct kfd_ioctl_export_dmabuf_args { + __u64 handle; /* to KFD */ + __u32 flags; /* to KFD */ + __u32 dmabuf_fd; /* from KFD */ +}; + /* * KFD SMI(System Management Interface) events */ @@ -877,7 +884,10 @@ struct kfd_ioctl_set_xnack_mode_args { #define AMDKFD_IOC_AVAILABLE_MEMORY \ AMDKFD_IOWR(0x23, struct kfd_ioctl_get_available_memory_args) +#define AMDKFD_IOC_EXPORT_DMABUF \ + AMDKFD_IOWR(0x24, struct kfd_ioctl_export_dmabuf_args) + #define AMDKFD_COMMAND_START 0x01 -#define AMDKFD_COMMAND_END 0x24 +#define AMDKFD_COMMAND_END 0x25 #endif -- cgit v1.2.3 From 313e9f63b74419ca14c2c09f581a79c7037ee0e2 Mon Sep 17 00:00:00 2001 From: Moti Haimovski Date: Tue, 10 Jan 2023 17:35:31 +0200 Subject: accel/habanalabs: add critical-event bit in notifier Enhance the existing user notifications by adding a HW and FW critical event bits to be used when a HW or FW event occur that requires both SW abort and hard-resetting the chip. 
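For illustration, a minimal user-space sketch of consuming the new info opcode added below (the header path, helper name and print-out are assumptions, not part of the driver changes):

/*
 * Hedged user-space sketch: fetch the captured FW error details after
 * receiving one of the new critical-error notifier bits. Assumes the
 * installed uapi header is available as <drm/habanalabs_accel.h>.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <drm/habanalabs_accel.h>

static int read_fw_err(int dev_fd)
{
	struct hl_info_fw_err_event ev;
	struct hl_info_args args;

	memset(&ev, 0, sizeof(ev));
	memset(&args, 0, sizeof(args));
	args.op = HL_INFO_FW_ERR_EVENT;
	args.return_pointer = (uint64_t)(uintptr_t)&ev;
	args.return_size = sizeof(ev);

	/* The kernel handler below returns -ENOENT until an error is captured. */
	if (ioctl(dev_fd, HL_IOCTL_INFO, &args))
		return -1;

	printf("FW error: type %u, event id %u, timestamp %lld ns\n",
	       ev.err_type, ev.event_id, (long long)ev.timestamp);
	return 0;
}
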
Signed-off-by: Moti Haimovski Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay Reviewed-by: Stanislaw Gruszka --- drivers/accel/habanalabs/common/device.c | 53 ++++++++++++++++++++- drivers/accel/habanalabs/common/habanalabs.h | 54 ++++++++++++++++++++++ drivers/accel/habanalabs/common/habanalabs_drv.c | 5 +- drivers/accel/habanalabs/common/habanalabs_ioctl.c | 50 ++++++++++++++++++++ drivers/accel/habanalabs/gaudi/gaudi.c | 10 +++- drivers/accel/habanalabs/gaudi2/gaudi2.c | 4 ++ include/uapi/drm/habanalabs_accel.h | 43 +++++++++++++++++ 7 files changed, 213 insertions(+), 6 deletions(-) (limited to 'include/uapi') diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c index b8c74185eabd..f91f3509336f 100644 --- a/drivers/accel/habanalabs/common/device.c +++ b/drivers/accel/habanalabs/common/device.c @@ -998,6 +998,8 @@ static void hl_device_heartbeat(struct work_struct *work) { struct hl_device *hdev = container_of(work, struct hl_device, work_heartbeat.work); + struct hl_info_fw_err_info info = {0}; + u64 event_mask = HL_NOTIFIER_EVENT_DEVICE_RESET | HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE; if (!hl_device_operational(hdev, NULL)) goto reschedule; @@ -1008,7 +1010,10 @@ static void hl_device_heartbeat(struct work_struct *work) if (hl_device_operational(hdev, NULL)) dev_err(hdev->dev, "Device heartbeat failed!\n"); - hl_device_reset(hdev, HL_DRV_RESET_HARD | HL_DRV_RESET_HEARTBEAT); + info.err_type = HL_INFO_FW_HEARTBEAT_ERR; + info.event_mask = &event_mask; + hl_handle_fw_err(hdev, &info); + hl_device_cond_reset(hdev, HL_DRV_RESET_HARD | HL_DRV_RESET_HEARTBEAT, event_mask); return; @@ -2626,3 +2631,49 @@ void hl_handle_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_ if (event_mask) *event_mask |= HL_NOTIFIER_EVENT_PAGE_FAULT; } + +void hl_capture_hw_err(struct hl_device *hdev, u16 event_id) +{ + struct hw_err_info *info = &hdev->captured_err_info.hw_err; + + /* Capture only the first HW err */ + if (atomic_cmpxchg(&info->event_detected, 0, 1)) + return; + + info->event.timestamp = ktime_to_ns(ktime_get()); + info->event.event_id = event_id; + + info->event_info_available = true; +} + +void hl_handle_critical_hw_err(struct hl_device *hdev, u16 event_id, u64 *event_mask) +{ + hl_capture_hw_err(hdev, event_id); + + if (event_mask) + *event_mask |= HL_NOTIFIER_EVENT_CRITICL_HW_ERR; +} + +void hl_capture_fw_err(struct hl_device *hdev, struct hl_info_fw_err_info *fw_info) +{ + struct fw_err_info *info = &hdev->captured_err_info.fw_err; + + /* Capture only the first FW error */ + if (atomic_cmpxchg(&info->event_detected, 0, 1)) + return; + + info->event.timestamp = ktime_to_ns(ktime_get()); + info->event.err_type = fw_info->err_type; + if (fw_info->err_type == HL_INFO_FW_REPORTED_ERR) + info->event.event_id = fw_info->event_id; + + info->event_info_available = true; +} + +void hl_handle_fw_err(struct hl_device *hdev, struct hl_info_fw_err_info *info) +{ + hl_capture_fw_err(hdev, info); + + if (info->event_mask) + *info->event_mask |= HL_NOTIFIER_EVENT_CRITICL_FW_ERR; +} diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h index afdae5775eaa..176a2e2c050d 100644 --- a/drivers/accel/habanalabs/common/habanalabs.h +++ b/drivers/accel/habanalabs/common/habanalabs.h @@ -3031,18 +3031,56 @@ struct razwi_info { bool razwi_info_available; }; +/** + * struct hw_err_info - HW error information. + * @event: holds information on the event. 
+ * @event_detected: if set as 1, then a HW event was discovered for the + * first time after the driver has finished booting-up. + * currently we assume that only fatal events (that require hard-reset) are + * reported so we don't care of the others that might follow it. + * so once changed to 1, it will remain that way. + * TODO: support multiple events. + * @event_info_available: indicates that a HW event info is now available. + */ +struct hw_err_info { + struct hl_info_hw_err_event event; + atomic_t event_detected; + bool event_info_available; +}; + +/** + * struct fw_err_info - FW error information. + * @event: holds information on the event. + * @event_detected: if set as 1, then a FW event was discovered for the + * first time after the driver has finished booting-up. + * currently we assume that only fatal events (that require hard-reset) are + * reported so we don't care of the others that might follow it. + * so once changed to 1, it will remain that way. + * TODO: support multiple events. + * @event_info_available: indicates that a HW event info is now available. + */ +struct fw_err_info { + struct hl_info_fw_err_event event; + atomic_t event_detected; + bool event_info_available; +}; + /** * struct hl_error_info - holds information collected during an error. * @cs_timeout: CS timeout error information. * @razwi_info: RAZWI information. * @undef_opcode: undefined opcode information. * @page_fault_info: page fault information. + * @hw_err: (fatal) hardware error information. + * @fw_err: firmware error information. */ struct hl_error_info { struct cs_timeout_info cs_timeout; struct razwi_info razwi_info; struct undefined_opcode_info undef_opcode; struct page_fault_info page_fault_info; + struct hw_err_info hw_err; + struct fw_err_info fw_err; }; /** @@ -3453,6 +3491,20 @@ struct hl_cs_encaps_sig_handle { u32 count; }; +/** + * struct hl_info_fw_err_info - firmware error information structure + * @err_type: The type of error detected (or reported). + * @event_mask: Pointer to the event mask to be modified with the detected error flag + * (can be NULL) + * @event_id: The id of the event that reported the error + * (applicable when err_type is HL_INFO_FW_REPORTED_ERR). 
+ */ +struct hl_info_fw_err_info { + enum hl_info_fw_err_type err_type; + u64 *event_mask; + u16 event_id; +}; + /* * IOCTLs */ @@ -3883,6 +3935,8 @@ void hl_handle_razwi(struct hl_device *hdev, u64 addr, u16 *engine_id, u16 num_o void hl_capture_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_pmmu); void hl_handle_page_fault(struct hl_device *hdev, u64 addr, u16 eng_id, bool is_pmmu, u64 *event_mask); +void hl_handle_critical_hw_err(struct hl_device *hdev, u16 event_id, u64 *event_mask); +void hl_handle_fw_err(struct hl_device *hdev, struct hl_info_fw_err_info *info); #ifdef CONFIG_DEBUG_FS diff --git a/drivers/accel/habanalabs/common/habanalabs_drv.c b/drivers/accel/habanalabs/common/habanalabs_drv.c index 8ccc3d6519b5..0cb6e52a1192 100644 --- a/drivers/accel/habanalabs/common/habanalabs_drv.c +++ b/drivers/accel/habanalabs/common/habanalabs_drv.c @@ -221,12 +221,9 @@ int hl_device_open(struct inode *inode, struct file *filp) hl_debugfs_add_file(hpriv); + memset(&hdev->captured_err_info, 0, sizeof(hdev->captured_err_info)); atomic_set(&hdev->captured_err_info.cs_timeout.write_enable, 1); - atomic_set(&hdev->captured_err_info.razwi_info.razwi_detected, 0); - atomic_set(&hdev->captured_err_info.page_fault_info.page_fault_detected, 0); hdev->captured_err_info.undef_opcode.write_enable = true; - hdev->captured_err_info.razwi_info.razwi_info_available = false; - hdev->captured_err_info.page_fault_info.page_fault_info_available = false; hdev->open_counter++; hdev->last_successful_open_jif = jiffies; diff --git a/drivers/accel/habanalabs/common/habanalabs_ioctl.c b/drivers/accel/habanalabs/common/habanalabs_ioctl.c index 5005e6fca691..13cd5013c72a 100644 --- a/drivers/accel/habanalabs/common/habanalabs_ioctl.c +++ b/drivers/accel/habanalabs/common/habanalabs_ioctl.c @@ -830,6 +830,50 @@ static int user_mappings_info(struct hl_fpriv *hpriv, struct hl_info_args *args) return copy_to_user(out, pgf_info->user_mappings, actual_size) ? -EFAULT : 0; } +static int hw_err_info(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + void __user *user_buf = (void __user *) (uintptr_t) args->return_pointer; + struct hl_device *hdev = hpriv->hdev; + u32 user_buf_size = args->return_size; + struct hw_err_info *info; + int rc; + + if ((!user_buf_size) || (!user_buf)) + return -EINVAL; + + if (user_buf_size < sizeof(struct hl_info_hw_err_event)) + return -ENOMEM; + + info = &hdev->captured_err_info.hw_err; + if (!info->event_info_available) + return -ENOENT; + + rc = copy_to_user(user_buf, &info->event, sizeof(struct hl_info_hw_err_event)); + return rc ? -EFAULT : 0; +} + +static int fw_err_info(struct hl_fpriv *hpriv, struct hl_info_args *args) +{ + void __user *user_buf = (void __user *) (uintptr_t) args->return_pointer; + struct hl_device *hdev = hpriv->hdev; + u32 user_buf_size = args->return_size; + struct fw_err_info *info; + int rc; + + if ((!user_buf_size) || (!user_buf)) + return -EINVAL; + + if (user_buf_size < sizeof(struct hl_info_fw_err_event)) + return -ENOMEM; + + info = &hdev->captured_err_info.fw_err; + if (!info->event_info_available) + return -ENOENT; + + rc = copy_to_user(user_buf, &info->event, sizeof(struct hl_info_fw_err_event)); + return rc ? 
-EFAULT : 0; +} + static int send_fw_generic_request(struct hl_device *hdev, struct hl_info_args *info_args) { void __user *buff = (void __user *) (uintptr_t) info_args->return_pointer; @@ -950,6 +994,12 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data, case HL_INFO_UNREGISTER_EVENTFD: return eventfd_unregister(hpriv, args); + case HL_INFO_HW_ERR_EVENT: + return hw_err_info(hpriv, args); + + case HL_INFO_FW_ERR_EVENT: + return fw_err_info(hpriv, args); + default: break; } diff --git a/drivers/accel/habanalabs/gaudi/gaudi.c b/drivers/accel/habanalabs/gaudi/gaudi.c index 4ba5352cb7cb..0e02aebb6ea6 100644 --- a/drivers/accel/habanalabs/gaudi/gaudi.c +++ b/drivers/accel/habanalabs/gaudi/gaudi.c @@ -7634,6 +7634,7 @@ static void gaudi_print_clk_change_info(struct hl_device *hdev, u16 event_type, static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entry) { struct gaudi_device *gaudi = hdev->asic_specific; + struct hl_info_fw_err_info fw_err_info; u64 data = le64_to_cpu(eq_entry->data[0]), event_mask = 0; u32 ctl = le32_to_cpu(eq_entry->hdr.ctl); u32 fw_fatal_err_flag = 0, flags = 0; @@ -7912,7 +7913,10 @@ static void gaudi_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entr case GAUDI_EVENT_FW_ALIVE_S: gaudi_print_irq_info(hdev, event_type, false, &event_mask); gaudi_print_fw_alive_info(hdev, &eq_entry->fw_alive); - event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR; + fw_err_info.err_type = HL_INFO_FW_REPORTED_ERR; + fw_err_info.event_id = event_type; + fw_err_info.event_mask = &event_mask; + hl_handle_fw_err(hdev, &fw_err_info); goto reset_device; default: @@ -7943,6 +7947,10 @@ reset_device: } if (reset_required) { + /* escalate general hw errors to critical/fatal error */ + if (event_mask & HL_NOTIFIER_EVENT_GENERAL_HW_ERR) + hl_handle_critical_hw_err(hdev, event_type, &event_mask); + hl_device_cond_reset(hdev, flags, event_mask); } else { hl_fw_unmask_irq(hdev, event_type); diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index d250c6f01acb..6926af5b5ed1 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -9444,6 +9444,10 @@ reset_device: } else { reset_flags |= HL_DRV_RESET_DELAY; } + /* escalate general hw errors to critical/fatal error */ + if (event_mask & HL_NOTIFIER_EVENT_GENERAL_HW_ERR) + hl_handle_critical_hw_err(hdev, event_type, &event_mask); + event_mask |= HL_NOTIFIER_EVENT_DEVICE_RESET; hl_device_cond_reset(hdev, reset_flags, event_mask); } diff --git a/include/uapi/drm/habanalabs_accel.h b/include/uapi/drm/habanalabs_accel.h index 331567ec9e79..3a62652a6452 100644 --- a/include/uapi/drm/habanalabs_accel.h +++ b/include/uapi/drm/habanalabs_accel.h @@ -723,6 +723,10 @@ enum hl_server_type { * HL_NOTIFIER_EVENT_GENERAL_HW_ERR - Indicates device HW error * HL_NOTIFIER_EVENT_RAZWI - Indicates razwi happened * HL_NOTIFIER_EVENT_PAGE_FAULT - Indicates page fault happened + * HL_NOTIFIER_EVENT_CRITICAL_HW_ERR - Indicates a HW error that requires SW abort and + * HW reset + * HL_NOTIFIER_EVENT_CRITICAL_FW_ERR - Indicates a FW error that requires SW abort and + * HW reset */ #define HL_NOTIFIER_EVENT_TPC_ASSERT (1ULL << 0) #define HL_NOTIFIER_EVENT_UNDEFINED_OPCODE (1ULL << 1) @@ -733,6 +737,8 @@ enum hl_server_type { #define HL_NOTIFIER_EVENT_GENERAL_HW_ERR (1ULL << 6) #define HL_NOTIFIER_EVENT_RAZWI (1ULL << 7) #define HL_NOTIFIER_EVENT_PAGE_FAULT (1ULL << 8) +#define HL_NOTIFIER_EVENT_CRITICL_HW_ERR (1ULL << 9) +#define 
HL_NOTIFIER_EVENT_CRITICL_FW_ERR (1ULL << 10) /* Opcode for management ioctl * @@ -790,6 +796,8 @@ enum hl_server_type { * HL_INFO_PAGE_FAULT_EVENT - Retrieve parameters of captured page fault. * HL_INFO_USER_MAPPINGS - Retrieve user mappings, captured after page fault event. * HL_INFO_FW_GENERIC_REQ - Send generic request to FW. + * HL_INFO_HW_ERR_EVENT - Retrieve information on the reported HW error. + * HL_INFO_FW_ERR_EVENT - Retrieve information on the reported FW error. */ #define HL_INFO_HW_IP_INFO 0 #define HL_INFO_HW_EVENTS 1 @@ -824,6 +832,8 @@ enum hl_server_type { #define HL_INFO_PAGE_FAULT_EVENT 33 #define HL_INFO_USER_MAPPINGS 34 #define HL_INFO_FW_GENERIC_REQ 35 +#define HL_INFO_HW_ERR_EVENT 36 +#define HL_INFO_FW_ERR_EVENT 37 #define HL_INFO_VERSION_MAX_LEN 128 #define HL_INFO_CARD_NAME_MAX_LEN 16 @@ -1161,6 +1171,39 @@ struct hl_info_undefined_opcode_event { __u32 stream_id; }; +/** + * struct hl_info_hw_err_event - info about HW error + * @timestamp: timestamp of error occurrence + * @event_id: The async event ID (specific to each device type). + * @pad: size padding for u64 granularity. + */ +struct hl_info_hw_err_event { + __s64 timestamp; + __u16 event_id; + __u16 pad[3]; +}; + +/* FW error definition for event_type in struct hl_info_fw_err_event */ +enum hl_info_fw_err_type { + HL_INFO_FW_HEARTBEAT_ERR, + HL_INFO_FW_REPORTED_ERR, +}; + +/** + * struct hl_info_fw_err_event - info about FW error + * @timestamp: time-stamp of error occurrence + * @err_type: The type of event as defined in hl_info_fw_err_type. + * @event_id: The async event ID (specific to each device type, applicable only when event type is + * HL_INFO_FW_REPORTED_ERR). + * @pad: size padding for u64 granularity. + */ +struct hl_info_fw_err_event { + __s64 timestamp; + __u16 err_type; + __u16 event_id; + __u32 pad; +}; + /** * struct hl_info_dev_memalloc_page_sizes - valid page sizes in device mem alloc information. * @page_order_bitmask: bitmap in which a set bit represents the order of the supported page size -- cgit v1.2.3 From 7fc0d011c378c6b2abc65cb536e0df0ee055ed39 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Sun, 22 Jan 2023 14:06:15 +0200 Subject: accel/habanalabs: expose engine core int reg address In order for engine cores to raise interrupts towards FW, They need to know which register the event data should be written to. Hence, we forward the relevant scratchpad register received during dynamic regs handshake with FW. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay Reviewed-by: Stanislaw Gruszka --- drivers/accel/habanalabs/common/habanalabs.h | 3 +++ drivers/accel/habanalabs/common/habanalabs_ioctl.c | 1 + drivers/accel/habanalabs/gaudi2/gaudi2.c | 5 +++++ include/uapi/drm/habanalabs_accel.h | 5 +++++ 4 files changed, 14 insertions(+) (limited to 'include/uapi') diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h index 176a2e2c050d..bf81eda88e2e 100644 --- a/drivers/accel/habanalabs/common/habanalabs.h +++ b/drivers/accel/habanalabs/common/habanalabs.h @@ -592,6 +592,8 @@ struct hl_hints_range { * @host_base_address: host physical start address for host DMA from device * @host_end_address: host physical end address for host DMA from device * @max_freq_value: current max clk frequency. + * @engine_core_interrupt_reg_addr: interrupt register address for engine core to use + * in order to raise events toward FW. 
* @clk_pll_index: clock PLL index that specify which PLL determines the clock * we display to the user * @mmu_pgt_size: MMU page tables total size. @@ -739,6 +741,7 @@ struct asic_fixed_properties { u64 host_base_address; u64 host_end_address; u64 max_freq_value; + u64 engine_core_interrupt_reg_addr; u32 clk_pll_index; u32 mmu_pgt_size; u32 mmu_pte_size; diff --git a/drivers/accel/habanalabs/common/habanalabs_ioctl.c b/drivers/accel/habanalabs/common/habanalabs_ioctl.c index 13cd5013c72a..448cdd2501d8 100644 --- a/drivers/accel/habanalabs/common/habanalabs_ioctl.c +++ b/drivers/accel/habanalabs/common/habanalabs_ioctl.c @@ -107,6 +107,7 @@ static int hw_ip_info(struct hl_device *hdev, struct hl_info_args *args) hw_ip.server_type = prop->server_type; hw_ip.security_enabled = prop->fw_security_enabled; hw_ip.revision_id = hdev->pdev->revision; + hw_ip.engine_core_interrupt_reg_addr = prop->engine_core_interrupt_reg_addr; return copy_to_user(out, &hw_ip, min((size_t) size, sizeof(hw_ip))) ? -EFAULT : 0; diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index 6926af5b5ed1..220d46d95abf 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -2933,6 +2933,7 @@ static bool gaudi2_is_arc_tpc_owned(u64 arc_id) static void gaudi2_init_arcs(struct hl_device *hdev) { + struct cpu_dyn_regs *dyn_regs = &hdev->fw_loader.dynamic_loader.comm_desc.cpu_dyn_regs; struct gaudi2_device *gaudi2 = hdev->asic_specific; u64 arc_id; u32 i; @@ -2962,6 +2963,10 @@ static void gaudi2_init_arcs(struct hl_device *hdev) gaudi2_set_arc_id_cap(hdev, arc_id); } + + /* Fetch ARC scratchpad address */ + hdev->asic_prop.engine_core_interrupt_reg_addr = + CFG_BASE + le32_to_cpu(dyn_regs->eng_arc_irq_ctrl); } static int gaudi2_scrub_arc_dccm(struct hl_device *hdev, u32 cpu_id) diff --git a/include/uapi/drm/habanalabs_accel.h b/include/uapi/drm/habanalabs_accel.h index 3a62652a6452..c1fdbb85d1d5 100644 --- a/include/uapi/drm/habanalabs_accel.h +++ b/include/uapi/drm/habanalabs_accel.h @@ -885,6 +885,8 @@ enum hl_server_type { * application to use. Relevant for Gaudi2 and later. * @device_mem_alloc_default_page_size: default page size used in device memory allocation. * @revision_id: PCI revision ID of the ASIC. + * @engine_core_interrupt_reg_addr: interrupt register address for engine core to use + * in order to raise events toward FW. */ struct hl_info_hw_ip_info { __u64 sram_base_address; @@ -921,6 +923,9 @@ struct hl_info_hw_ip_info { __u8 reserved8; __u8 revision_id; __u8 pad[2]; + __u32 reserved9; + __u8 pad3[4]; + __u64 engine_core_interrupt_reg_addr; }; struct hl_info_dram_usage { -- cgit v1.2.3 From 4713ace3246644519bf93cc8ea6e44efe57fc3ec Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Mon, 16 Jan 2023 19:56:23 +0200 Subject: accel/habanalabs: add support for TPC assert In order to allow TPC engines to raise an assert, we must expose the relevant MSIX interrupt to the user so he will configure the engine correctly. In addition, we implement the corresponding interrupt handler that will notify the user upon such an event. 
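For illustration, a minimal user-space sketch of discovering which interrupt to program, using the new tpc_interrupt_id field added to HL_INFO_HW_IP_INFO below (the header path and the configure_tpc_assert() helper are assumptions):

/*
 * Hedged sketch: query the MSI-X interrupt id the TPC should use for
 * asserts. configure_tpc_assert() stands in for the engine-side setup
 * and is purely hypothetical.
 */
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <drm/habanalabs_accel.h>

int configure_tpc_assert(int dev_fd, uint16_t irq_id); /* hypothetical helper */

static int setup_tpc_assert(int dev_fd)
{
	struct hl_info_hw_ip_info hw_ip;
	struct hl_info_args args;

	memset(&hw_ip, 0, sizeof(hw_ip));
	memset(&args, 0, sizeof(args));
	args.op = HL_INFO_HW_IP_INFO;
	args.return_pointer = (uint64_t)(uintptr_t)&hw_ip;
	args.return_size = sizeof(hw_ip);

	if (ioctl(dev_fd, HL_IOCTL_INFO, &args))
		return -1;

	/* Program the TPC so its assert raises this MSI-X vector. */
	return configure_tpc_assert(dev_fd, hw_ip.tpc_interrupt_id);
}
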
Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay Reviewed-by: Stanislaw Gruszka --- drivers/accel/habanalabs/common/habanalabs.h | 5 +++++ drivers/accel/habanalabs/common/habanalabs_ioctl.c | 1 + drivers/accel/habanalabs/common/irq.c | 18 ++++++++++++++++ drivers/accel/habanalabs/gaudi/gaudi.c | 1 + drivers/accel/habanalabs/gaudi2/gaudi2.c | 25 +++++++++++++++++++--- drivers/accel/habanalabs/gaudi2/gaudi2P.h | 2 ++ drivers/accel/habanalabs/goya/goya.c | 1 + include/uapi/drm/habanalabs_accel.h | 3 ++- 8 files changed, 52 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h index 24ad15272040..ed6987a0050f 100644 --- a/drivers/accel/habanalabs/common/habanalabs.h +++ b/drivers/accel/habanalabs/common/habanalabs.h @@ -665,6 +665,7 @@ struct hl_hints_range { * @first_available_cq: first available CQ for the user. * @user_interrupt_count: number of user interrupts. * @user_dec_intr_count: number of decoder interrupts exposed to user. + * @tpc_interrupt_id: interrupt id for TPC to use in order to raise events towards the host. * @cache_line_size: device cache line size. * @server_type: Server type that the ASIC is currently installed in. * The value is according to enum hl_server_type in uapi file. @@ -791,6 +792,7 @@ struct asic_fixed_properties { u16 first_available_cq[HL_MAX_DCORES]; u16 user_interrupt_count; u16 user_dec_intr_count; + u16 tpc_interrupt_id; u16 cache_line_size; u16 server_type; u8 completion_queues_count; @@ -1099,6 +1101,7 @@ struct hl_cq { enum hl_user_interrupt_type { HL_USR_INTERRUPT_CQ = 0, HL_USR_INTERRUPT_DECODER, + HL_USR_INTERRUPT_TPC }; /** @@ -3148,6 +3151,7 @@ struct hl_reset_info { * @user_interrupt: array of hl_user_interrupt. upon the corresponding user * interrupt, driver will monitor the list of fences * registered to this interrupt. + * @tpc_interrupt: single TPC interrupt for all TPCs. * @common_user_cq_interrupt: common user CQ interrupt for all user CQ interrupts. * upon any user CQ interrupt, driver will monitor the * list of fences registered to this common structure. 
@@ -3332,6 +3336,7 @@ struct hl_device { enum hl_asic_type asic_type; struct hl_cq *completion_queue; struct hl_user_interrupt *user_interrupt; + struct hl_user_interrupt tpc_interrupt; struct hl_user_interrupt common_user_cq_interrupt; struct hl_user_interrupt common_decoder_interrupt; struct hl_cs **shadow_cs_queue; diff --git a/drivers/accel/habanalabs/common/habanalabs_ioctl.c b/drivers/accel/habanalabs/common/habanalabs_ioctl.c index 448cdd2501d8..100282fc82fc 100644 --- a/drivers/accel/habanalabs/common/habanalabs_ioctl.c +++ b/drivers/accel/habanalabs/common/habanalabs_ioctl.c @@ -102,6 +102,7 @@ static int hw_ip_info(struct hl_device *hdev, struct hl_info_args *args) hw_ip.mme_master_slave_mode = prop->mme_master_slave_mode; hw_ip.first_available_interrupt_id = prop->first_available_user_interrupt; hw_ip.number_of_user_interrupts = prop->user_interrupt_count; + hw_ip.tpc_interrupt_id = prop->tpc_interrupt_id; hw_ip.edma_enabled_mask = prop->edma_enabled_mask; hw_ip.server_type = prop->server_type; diff --git a/drivers/accel/habanalabs/common/irq.c b/drivers/accel/habanalabs/common/irq.c index 716228291b46..bd0e3413721b 100644 --- a/drivers/accel/habanalabs/common/irq.c +++ b/drivers/accel/habanalabs/common/irq.c @@ -325,6 +325,21 @@ static void handle_user_interrupt(struct hl_device *hdev, struct hl_user_interru } } +static void handle_tpc_interrupt(struct hl_device *hdev) +{ + u64 event_mask; + u32 flags; + + event_mask = HL_NOTIFIER_EVENT_TPC_ASSERT | + HL_NOTIFIER_EVENT_USER_ENGINE_ERR | + HL_NOTIFIER_EVENT_DEVICE_RESET; + + flags = HL_DRV_RESET_DELAY; + + dev_err_ratelimited(hdev->dev, "Received TPC assert\n"); + hl_device_cond_reset(hdev, flags, event_mask); +} + /** * hl_irq_handler_user_interrupt - irq handler for user interrupts * @@ -367,6 +382,9 @@ irqreturn_t hl_irq_user_interrupt_thread_handler(int irq, void *arg) /* Handle decoder interrupt registered on this specific irq */ handle_user_interrupt(hdev, user_int); break; + case HL_USR_INTERRUPT_TPC: + handle_tpc_interrupt(hdev); + break; default: break; } diff --git a/drivers/accel/habanalabs/gaudi/gaudi.c b/drivers/accel/habanalabs/gaudi/gaudi.c index 0e02aebb6ea6..688efd7b55a1 100644 --- a/drivers/accel/habanalabs/gaudi/gaudi.c +++ b/drivers/accel/habanalabs/gaudi/gaudi.c @@ -679,6 +679,7 @@ static int gaudi_set_fixed_properties(struct hl_device *hdev) (num_sync_stream_queues * HL_RSVD_MONS); prop->first_available_user_interrupt = USHRT_MAX; + prop->tpc_interrupt_id = USHRT_MAX; for (i = 0 ; i < HL_MAX_DCORES ; i++) prop->first_available_cq[i] = USHRT_MAX; diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index 7f23363ced9c..846fd65d9f19 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -2348,6 +2348,7 @@ static int gaudi2_set_fixed_properties(struct hl_device *hdev) (num_sync_stream_queues * HL_RSVD_MONS); prop->first_available_user_interrupt = GAUDI2_IRQ_NUM_USER_FIRST; + prop->tpc_interrupt_id = GAUDI2_IRQ_NUM_TPC_ASSERT; prop->first_available_cq[0] = GAUDI2_RESERVED_CQ_NUMBER; @@ -3235,6 +3236,9 @@ static void gaudi2_user_interrupt_setup(struct hl_device *hdev) struct asic_fixed_properties *prop = &hdev->asic_prop; int i, j, k; + /* Initialize TPC interrupt */ + HL_USR_INTR_STRUCT_INIT(hdev->tpc_interrupt, hdev, 0, HL_USR_INTERRUPT_TPC); + /* Initialize common user CQ interrupt */ HL_USR_INTR_STRUCT_INIT(hdev->common_user_cq_interrupt, hdev, HL_COMMON_USER_CQ_INTERRUPT_ID, HL_USR_INTERRUPT_CQ); @@ -3892,6 +3896,8 
@@ static const char *gaudi2_irq_name(u16 irq_number) return "gaudi2 completion"; case GAUDI2_IRQ_NUM_DCORE0_DEC0_NRM ... GAUDI2_IRQ_NUM_SHARED_DEC1_ABNRM: return gaudi2_vdec_irq_name[irq_number - GAUDI2_IRQ_NUM_DCORE0_DEC0_NRM]; + case GAUDI2_IRQ_NUM_TPC_ASSERT: + return "gaudi2 tpc assert"; case GAUDI2_IRQ_NUM_USER_FIRST ... GAUDI2_IRQ_NUM_USER_LAST: return "gaudi2 user completion"; default: @@ -4004,6 +4010,15 @@ static int gaudi2_enable_msix(struct hl_device *hdev) goto free_event_irq; } + irq = pci_irq_vector(hdev->pdev, GAUDI2_IRQ_NUM_TPC_ASSERT); + rc = request_threaded_irq(irq, hl_irq_handler_user_interrupt, + hl_irq_user_interrupt_thread_handler, IRQF_ONESHOT, + gaudi2_irq_name(GAUDI2_IRQ_NUM_TPC_ASSERT), &hdev->tpc_interrupt); + if (rc) { + dev_err(hdev->dev, "Failed to request IRQ %d", irq); + goto free_dec_irq; + } + for (i = GAUDI2_IRQ_NUM_USER_FIRST, j = prop->user_dec_intr_count, user_irq_init_cnt = 0; user_irq_init_cnt < prop->user_interrupt_count; i++, j++, user_irq_init_cnt++) { @@ -4031,9 +4046,8 @@ free_user_irq: irq = pci_irq_vector(hdev->pdev, i); free_irq(irq, &hdev->user_interrupt[j]); } - - gaudi2_dec_disable_msix(hdev, GAUDI2_IRQ_NUM_SHARED_DEC1_ABNRM + 1); - +free_dec_irq: + gaudi2_dec_disable_msix(hdev, GAUDI2_IRQ_NUM_DEC_LAST + 1); free_event_irq: irq = pci_irq_vector(hdev->pdev, GAUDI2_IRQ_NUM_EVENT_QUEUE); free_irq(irq, cq); @@ -4065,6 +4079,8 @@ static void gaudi2_sync_irqs(struct hl_device *hdev) synchronize_irq(irq); } + synchronize_irq(pci_irq_vector(hdev->pdev, GAUDI2_IRQ_NUM_TPC_ASSERT)); + for (i = GAUDI2_IRQ_NUM_USER_FIRST, j = 0 ; j < hdev->asic_prop.user_interrupt_count; i++, j++) { irq = pci_irq_vector(hdev->pdev, i); @@ -4091,6 +4107,9 @@ static void gaudi2_disable_msix(struct hl_device *hdev) gaudi2_dec_disable_msix(hdev, GAUDI2_IRQ_NUM_SHARED_DEC1_ABNRM + 1); + irq = pci_irq_vector(hdev->pdev, GAUDI2_IRQ_NUM_TPC_ASSERT); + free_irq(irq, &hdev->tpc_interrupt); + for (i = GAUDI2_IRQ_NUM_USER_FIRST, j = prop->user_dec_intr_count, k = 0; k < hdev->asic_prop.user_interrupt_count ; i++, j++, k++) { diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2P.h b/drivers/accel/habanalabs/gaudi2/gaudi2P.h index 2687404d9d21..f79958b24811 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2P.h +++ b/drivers/accel/habanalabs/gaudi2/gaudi2P.h @@ -410,9 +410,11 @@ enum gaudi2_irq_num { GAUDI2_IRQ_NUM_SHARED_DEC0_ABNRM, GAUDI2_IRQ_NUM_SHARED_DEC1_NRM, GAUDI2_IRQ_NUM_SHARED_DEC1_ABNRM, + GAUDI2_IRQ_NUM_DEC_LAST = GAUDI2_IRQ_NUM_SHARED_DEC1_ABNRM, GAUDI2_IRQ_NUM_COMPLETION, GAUDI2_IRQ_NUM_NIC_PORT_FIRST, GAUDI2_IRQ_NUM_NIC_PORT_LAST = (GAUDI2_IRQ_NUM_NIC_PORT_FIRST + NIC_NUMBER_OF_PORTS - 1), + GAUDI2_IRQ_NUM_TPC_ASSERT, GAUDI2_IRQ_NUM_RESERVED_FIRST, GAUDI2_IRQ_NUM_RESERVED_LAST = (GAUDI2_MSIX_ENTRIES - GAUDI2_NUM_USER_INTERRUPTS - 1), GAUDI2_IRQ_NUM_USER_FIRST, diff --git a/drivers/accel/habanalabs/goya/goya.c b/drivers/accel/habanalabs/goya/goya.c index df65e9bdc18a..e8ae1e27bc90 100644 --- a/drivers/accel/habanalabs/goya/goya.c +++ b/drivers/accel/habanalabs/goya/goya.c @@ -472,6 +472,7 @@ int goya_set_fixed_properties(struct hl_device *hdev) prop->max_pending_cs = GOYA_MAX_PENDING_CS; prop->first_available_user_interrupt = USHRT_MAX; + prop->tpc_interrupt_id = USHRT_MAX; for (i = 0 ; i < HL_MAX_DCORES ; i++) prop->first_available_cq[i] = USHRT_MAX; diff --git a/include/uapi/drm/habanalabs_accel.h b/include/uapi/drm/habanalabs_accel.h index c1fdbb85d1d5..359b19ef3c3f 100644 --- a/include/uapi/drm/habanalabs_accel.h +++ b/include/uapi/drm/habanalabs_accel.h @@ 
-885,6 +885,7 @@ enum hl_server_type { * application to use. Relevant for Gaudi2 and later. * @device_mem_alloc_default_page_size: default page size used in device memory allocation. * @revision_id: PCI revision ID of the ASIC. + * @tpc_interrupt_id: interrupt id for TPC to use in order to raise events towards the host. * @engine_core_interrupt_reg_addr: interrupt register address for engine core to use * in order to raise events toward FW. */ @@ -922,7 +923,7 @@ struct hl_info_hw_ip_info { __u32 reserved7; __u8 reserved8; __u8 revision_id; - __u8 pad[2]; + __u16 tpc_interrupt_id; __u32 reserved9; __u8 pad3[4]; __u64 engine_core_interrupt_reg_addr; -- cgit v1.2.3 From f7f0085eec8d3c0c353d2e7bfa7fb54b3b925d7a Mon Sep 17 00:00:00 2001 From: Koby Elbaz Date: Wed, 15 Feb 2023 17:51:14 +0200 Subject: accel/habanalabs: add uapi to stall/resume engine The user might want to stall/resume engines to perform power testing for various scenarios. Because our current HL_CS_FLAGS_ENGINE_CORE_COMMAND command only handles the engines' cores, we need to add another opcode for handling entire engine and not just its core. The user supplies an array, where each entry holds the engine's ID and the command to send to the engine. The size of the array is limited by the number of engines in the ASIC (only Gaudi2 is currently supported). Signed-off-by: Koby Elbaz Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- .../accel/habanalabs/common/command_submission.c | 55 ++++- drivers/accel/habanalabs/common/habanalabs.h | 12 +- drivers/accel/habanalabs/gaudi/gaudi.c | 1 + drivers/accel/habanalabs/gaudi2/gaudi2.c | 255 ++++++++++++++++++++- drivers/accel/habanalabs/gaudi2/gaudi2_masks.h | 2 + .../include/gaudi2/asic_reg/gaudi2_regs.h | 5 + include/uapi/drm/habanalabs_accel.h | 36 ++- 7 files changed, 356 insertions(+), 10 deletions(-) (limited to 'include/uapi') diff --git a/drivers/accel/habanalabs/common/command_submission.c b/drivers/accel/habanalabs/common/command_submission.c index 2cebc1c47248..af9d2e22c6e7 100644 --- a/drivers/accel/habanalabs/common/command_submission.c +++ b/drivers/accel/habanalabs/common/command_submission.c @@ -14,7 +14,7 @@ #define HL_CS_FLAGS_TYPE_MASK (HL_CS_FLAGS_SIGNAL | HL_CS_FLAGS_WAIT | \ HL_CS_FLAGS_COLLECTIVE_WAIT | HL_CS_FLAGS_RESERVE_SIGNALS_ONLY | \ HL_CS_FLAGS_UNRESERVE_SIGNALS_ONLY | HL_CS_FLAGS_ENGINE_CORE_COMMAND | \ - HL_CS_FLAGS_FLUSH_PCI_HBW_WRITES) + HL_CS_FLAGS_ENGINES_COMMAND | HL_CS_FLAGS_FLUSH_PCI_HBW_WRITES) #define MAX_TS_ITER_NUM 100 @@ -1319,6 +1319,8 @@ static enum hl_cs_type hl_cs_get_cs_type(u32 cs_type_flags) return CS_UNRESERVE_SIGNALS; else if (cs_type_flags & HL_CS_FLAGS_ENGINE_CORE_COMMAND) return CS_TYPE_ENGINE_CORE; + else if (cs_type_flags & HL_CS_FLAGS_ENGINES_COMMAND) + return CS_TYPE_ENGINES; else if (cs_type_flags & HL_CS_FLAGS_FLUSH_PCI_HBW_WRITES) return CS_TYPE_FLUSH_PCI_HBW_WRITES; else @@ -2444,10 +2446,13 @@ out: static int cs_ioctl_engine_cores(struct hl_fpriv *hpriv, u64 engine_cores, u32 num_engine_cores, u32 core_command) { - int rc; struct hl_device *hdev = hpriv->hdev; void __user *engine_cores_arr; u32 *cores; + int rc; + + if (!hdev->asic_prop.supports_engine_modes) + return -EPERM; if (!num_engine_cores || num_engine_cores > hdev->asic_prop.num_engine_cores) { dev_err(hdev->dev, "Number of engine cores %d is invalid\n", num_engine_cores); @@ -2476,6 +2481,48 @@ static int cs_ioctl_engine_cores(struct hl_fpriv *hpriv, u64 engine_cores, return rc; } +static int cs_ioctl_engines(struct hl_fpriv *hpriv, u64 engines_arr_user_addr, 
+ u32 num_engines, enum hl_engine_command command) +{ + struct hl_device *hdev = hpriv->hdev; + u32 *engines, max_num_of_engines; + void __user *engines_arr; + int rc; + + if (!hdev->asic_prop.supports_engine_modes) + return -EPERM; + + if (command >= HL_ENGINE_COMMAND_MAX) { + dev_err(hdev->dev, "Engine command is invalid\n"); + return -EINVAL; + } + + max_num_of_engines = hdev->asic_prop.max_num_of_engines; + if (command == HL_ENGINE_CORE_RUN || command == HL_ENGINE_CORE_HALT) + max_num_of_engines = hdev->asic_prop.num_engine_cores; + + if (!num_engines || num_engines > max_num_of_engines) { + dev_err(hdev->dev, "Number of engines %d is invalid\n", num_engines); + return -EINVAL; + } + + engines_arr = (void __user *) (uintptr_t) engines_arr_user_addr; + engines = kmalloc_array(num_engines, sizeof(u32), GFP_KERNEL); + if (!engines) + return -ENOMEM; + + if (copy_from_user(engines, engines_arr, num_engines * sizeof(u32))) { + dev_err(hdev->dev, "Failed to copy engine-ids array from user\n"); + kfree(engines); + return -EFAULT; + } + + rc = hdev->asic_funcs->set_engines(hdev, engines, num_engines, command); + kfree(engines); + + return rc; +} + static int cs_ioctl_flush_pci_hbw_writes(struct hl_fpriv *hpriv) { struct hl_device *hdev = hpriv->hdev; @@ -2547,6 +2594,10 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data) rc = cs_ioctl_engine_cores(hpriv, args->in.engine_cores, args->in.num_engine_cores, args->in.core_command); break; + case CS_TYPE_ENGINES: + rc = cs_ioctl_engines(hpriv, args->in.engines, + args->in.num_engines, args->in.engine_command); + break; case CS_TYPE_FLUSH_PCI_HBW_WRITES: rc = cs_ioctl_flush_pci_hbw_writes(hpriv); break; diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h index e03f9c125e30..af5a51f8c173 100644 --- a/drivers/accel/habanalabs/common/habanalabs.h +++ b/drivers/accel/habanalabs/common/habanalabs.h @@ -372,6 +372,7 @@ enum hl_cs_type { CS_RESERVE_SIGNALS, CS_UNRESERVE_SIGNALS, CS_TYPE_ENGINE_CORE, + CS_TYPE_ENGINES, CS_TYPE_FLUSH_PCI_HBW_WRITES, }; @@ -644,7 +645,8 @@ struct hl_hints_range { * which the property supports_user_set_page_size is true * (i.e. the DRAM supports multiple page sizes), otherwise * it will shall be equal to dram_page_size. - * @num_engine_cores: number of engine cpu cores + * @num_engine_cores: number of engine cpu cores. + * @max_num_of_engines: maximum number of all engines in the ASIC. * @num_of_special_blocks: special_blocks array size. * @glbl_err_cause_num: global err cause number. * @hbw_flush_reg: register to read to generate HBW flush. value of 0 means HBW flush is @@ -695,6 +697,7 @@ struct hl_hints_range { * @supports_user_set_page_size: true if user can set the allocation page size. * @dma_mask: the dma mask to be set for this device * @supports_advanced_cpucp_rc: true if new cpucp opcodes are supported. + * @supports_engine_modes: true if changing engines/engine_cores modes is supported. 
*/ struct asic_fixed_properties { struct hw_queue_properties *hw_queues_props; @@ -773,6 +776,7 @@ struct asic_fixed_properties { u32 xbar_edge_enabled_mask; u32 device_mem_alloc_default_page_size; u32 num_engine_cores; + u32 max_num_of_engines; u32 num_of_special_blocks; u32 glbl_err_cause_num; u32 hbw_flush_reg; @@ -810,6 +814,7 @@ struct asic_fixed_properties { u8 supports_user_set_page_size; u8 dma_mask; u8 supports_advanced_cpucp_rc; + u8 supports_engine_modes; }; /** @@ -1564,6 +1569,7 @@ struct engines_data { * @access_dev_mem: access device memory * @set_dram_bar_base: set the base of the DRAM BAR * @set_engine_cores: set a config command to engine cores + * @set_engines: set a config command to user engines * @send_device_activity: indication to FW about device availability * @set_dram_properties: set DRAM related properties. * @set_binning_masks: set binning/enable masks for all relevant components. @@ -1703,6 +1709,8 @@ struct hl_asic_funcs { u64 (*set_dram_bar_base)(struct hl_device *hdev, u64 addr); int (*set_engine_cores)(struct hl_device *hdev, u32 *core_ids, u32 num_cores, u32 core_command); + int (*set_engines)(struct hl_device *hdev, u32 *engine_ids, + u32 num_engines, u32 engine_command); int (*send_device_activity)(struct hl_device *hdev, bool open); int (*set_dram_properties)(struct hl_device *hdev); int (*set_binning_masks)(struct hl_device *hdev); @@ -1826,7 +1834,7 @@ struct hl_cs_outcome_store { * @hpriv: pointer to the private (Kernel Driver) data of the process (fd). * @hdev: pointer to the device structure. * @refcount: reference counter for the context. Context is released only when - * this hits 0l. It is incremented on CS and CS_WAIT. + * this hits 0. It is incremented on CS and CS_WAIT. * @cs_pending: array of hl fence objects representing pending CS. * @outcome_store: storage data structure used to remember outcomes of completed * command submissions for a long time after CS id wraparound. 
diff --git a/drivers/accel/habanalabs/gaudi/gaudi.c b/drivers/accel/habanalabs/gaudi/gaudi.c index d37c5d4893b9..004846bc086e 100644 --- a/drivers/accel/habanalabs/gaudi/gaudi.c +++ b/drivers/accel/habanalabs/gaudi/gaudi.c @@ -656,6 +656,7 @@ static int gaudi_set_fixed_properties(struct hl_device *hdev) prop->cfg_size = CFG_SIZE; prop->max_asid = MAX_ASID; prop->num_of_events = GAUDI_EVENT_SIZE; + prop->max_num_of_engines = GAUDI_ENGINE_ID_SIZE; prop->tpc_enabled_mask = TPC_ENABLED_MASK; set_default_power_values(hdev); diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index 1131aae68690..166469179628 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -1714,6 +1714,34 @@ static const u32 gaudi2_tpc_cfg_blocks_bases[TPC_ID_SIZE] = { [TPC_ID_DCORE0_TPC6] = mmDCORE0_TPC6_CFG_BASE, }; +static const u32 gaudi2_tpc_eml_cfg_blocks_bases[TPC_ID_SIZE] = { + [TPC_ID_DCORE0_TPC0] = mmDCORE0_TPC0_EML_CFG_BASE, + [TPC_ID_DCORE0_TPC1] = mmDCORE0_TPC1_EML_CFG_BASE, + [TPC_ID_DCORE0_TPC2] = mmDCORE0_TPC2_EML_CFG_BASE, + [TPC_ID_DCORE0_TPC3] = mmDCORE0_TPC3_EML_CFG_BASE, + [TPC_ID_DCORE0_TPC4] = mmDCORE0_TPC4_EML_CFG_BASE, + [TPC_ID_DCORE0_TPC5] = mmDCORE0_TPC5_EML_CFG_BASE, + [TPC_ID_DCORE1_TPC0] = mmDCORE1_TPC0_EML_CFG_BASE, + [TPC_ID_DCORE1_TPC1] = mmDCORE1_TPC1_EML_CFG_BASE, + [TPC_ID_DCORE1_TPC2] = mmDCORE1_TPC2_EML_CFG_BASE, + [TPC_ID_DCORE1_TPC3] = mmDCORE1_TPC3_EML_CFG_BASE, + [TPC_ID_DCORE1_TPC4] = mmDCORE1_TPC4_EML_CFG_BASE, + [TPC_ID_DCORE1_TPC5] = mmDCORE1_TPC5_EML_CFG_BASE, + [TPC_ID_DCORE2_TPC0] = mmDCORE2_TPC0_EML_CFG_BASE, + [TPC_ID_DCORE2_TPC1] = mmDCORE2_TPC1_EML_CFG_BASE, + [TPC_ID_DCORE2_TPC2] = mmDCORE2_TPC2_EML_CFG_BASE, + [TPC_ID_DCORE2_TPC3] = mmDCORE2_TPC3_EML_CFG_BASE, + [TPC_ID_DCORE2_TPC4] = mmDCORE2_TPC4_EML_CFG_BASE, + [TPC_ID_DCORE2_TPC5] = mmDCORE2_TPC5_EML_CFG_BASE, + [TPC_ID_DCORE3_TPC0] = mmDCORE3_TPC0_EML_CFG_BASE, + [TPC_ID_DCORE3_TPC1] = mmDCORE3_TPC1_EML_CFG_BASE, + [TPC_ID_DCORE3_TPC2] = mmDCORE3_TPC2_EML_CFG_BASE, + [TPC_ID_DCORE3_TPC3] = mmDCORE3_TPC3_EML_CFG_BASE, + [TPC_ID_DCORE3_TPC4] = mmDCORE3_TPC4_EML_CFG_BASE, + [TPC_ID_DCORE3_TPC5] = mmDCORE3_TPC5_EML_CFG_BASE, + [TPC_ID_DCORE0_TPC6] = mmDCORE0_TPC6_EML_CFG_BASE, +}; + const u32 gaudi2_rot_blocks_bases[ROTATOR_ID_SIZE] = { [ROTATOR_ID_0] = mmROT0_BASE, [ROTATOR_ID_1] = mmROT1_BASE @@ -1752,6 +1780,56 @@ static const u32 gaudi2_rot_id_to_queue_id[ROTATOR_ID_SIZE] = { [ROTATOR_ID_1] = GAUDI2_QUEUE_ID_ROT_1_0, }; +static const u32 gaudi2_tpc_engine_id_to_tpc_id[] = { + [GAUDI2_DCORE0_ENGINE_ID_TPC_0] = TPC_ID_DCORE0_TPC0, + [GAUDI2_DCORE0_ENGINE_ID_TPC_1] = TPC_ID_DCORE0_TPC1, + [GAUDI2_DCORE0_ENGINE_ID_TPC_2] = TPC_ID_DCORE0_TPC2, + [GAUDI2_DCORE0_ENGINE_ID_TPC_3] = TPC_ID_DCORE0_TPC3, + [GAUDI2_DCORE0_ENGINE_ID_TPC_4] = TPC_ID_DCORE0_TPC4, + [GAUDI2_DCORE0_ENGINE_ID_TPC_5] = TPC_ID_DCORE0_TPC5, + [GAUDI2_DCORE1_ENGINE_ID_TPC_0] = TPC_ID_DCORE1_TPC0, + [GAUDI2_DCORE1_ENGINE_ID_TPC_1] = TPC_ID_DCORE1_TPC1, + [GAUDI2_DCORE1_ENGINE_ID_TPC_2] = TPC_ID_DCORE1_TPC2, + [GAUDI2_DCORE1_ENGINE_ID_TPC_3] = TPC_ID_DCORE1_TPC3, + [GAUDI2_DCORE1_ENGINE_ID_TPC_4] = TPC_ID_DCORE1_TPC4, + [GAUDI2_DCORE1_ENGINE_ID_TPC_5] = TPC_ID_DCORE1_TPC5, + [GAUDI2_DCORE2_ENGINE_ID_TPC_0] = TPC_ID_DCORE2_TPC0, + [GAUDI2_DCORE2_ENGINE_ID_TPC_1] = TPC_ID_DCORE2_TPC1, + [GAUDI2_DCORE2_ENGINE_ID_TPC_2] = TPC_ID_DCORE2_TPC2, + [GAUDI2_DCORE2_ENGINE_ID_TPC_3] = TPC_ID_DCORE2_TPC3, + [GAUDI2_DCORE2_ENGINE_ID_TPC_4] = TPC_ID_DCORE2_TPC4, + 
[GAUDI2_DCORE2_ENGINE_ID_TPC_5] = TPC_ID_DCORE2_TPC5, + [GAUDI2_DCORE3_ENGINE_ID_TPC_0] = TPC_ID_DCORE3_TPC0, + [GAUDI2_DCORE3_ENGINE_ID_TPC_1] = TPC_ID_DCORE3_TPC1, + [GAUDI2_DCORE3_ENGINE_ID_TPC_2] = TPC_ID_DCORE3_TPC2, + [GAUDI2_DCORE3_ENGINE_ID_TPC_3] = TPC_ID_DCORE3_TPC3, + [GAUDI2_DCORE3_ENGINE_ID_TPC_4] = TPC_ID_DCORE3_TPC4, + [GAUDI2_DCORE3_ENGINE_ID_TPC_5] = TPC_ID_DCORE3_TPC5, + /* the PCI TPC is placed last (mapped liked HW) */ + [GAUDI2_DCORE0_ENGINE_ID_TPC_6] = TPC_ID_DCORE0_TPC6, +}; + +static const u32 gaudi2_mme_engine_id_to_mme_id[] = { + [GAUDI2_DCORE0_ENGINE_ID_MME] = MME_ID_DCORE0, + [GAUDI2_DCORE1_ENGINE_ID_MME] = MME_ID_DCORE1, + [GAUDI2_DCORE2_ENGINE_ID_MME] = MME_ID_DCORE2, + [GAUDI2_DCORE3_ENGINE_ID_MME] = MME_ID_DCORE3, +}; + +static const u32 gaudi2_edma_engine_id_to_edma_id[] = { + [GAUDI2_ENGINE_ID_PDMA_0] = DMA_CORE_ID_PDMA0, + [GAUDI2_ENGINE_ID_PDMA_1] = DMA_CORE_ID_PDMA1, + [GAUDI2_DCORE0_ENGINE_ID_EDMA_0] = DMA_CORE_ID_EDMA0, + [GAUDI2_DCORE0_ENGINE_ID_EDMA_1] = DMA_CORE_ID_EDMA1, + [GAUDI2_DCORE1_ENGINE_ID_EDMA_0] = DMA_CORE_ID_EDMA2, + [GAUDI2_DCORE1_ENGINE_ID_EDMA_1] = DMA_CORE_ID_EDMA3, + [GAUDI2_DCORE2_ENGINE_ID_EDMA_0] = DMA_CORE_ID_EDMA4, + [GAUDI2_DCORE2_ENGINE_ID_EDMA_1] = DMA_CORE_ID_EDMA5, + [GAUDI2_DCORE3_ENGINE_ID_EDMA_0] = DMA_CORE_ID_EDMA6, + [GAUDI2_DCORE3_ENGINE_ID_EDMA_1] = DMA_CORE_ID_EDMA7, + [GAUDI2_ENGINE_ID_KDMA] = DMA_CORE_ID_KDMA, +}; + const u32 edma_stream_base[NUM_OF_EDMA_PER_DCORE * NUM_OF_DCORES] = { GAUDI2_QUEUE_ID_DCORE0_EDMA_0_0, GAUDI2_QUEUE_ID_DCORE0_EDMA_1_0, @@ -2026,6 +2104,12 @@ static void gaudi2_set_arc_id_cap(struct hl_device *hdev, u64 arc_id); static void gaudi2_memset_device_lbw(struct hl_device *hdev, u32 addr, u32 size, u32 val); static int gaudi2_send_job_to_kdma(struct hl_device *hdev, u64 src_addr, u64 dst_addr, u32 size, bool is_memset); +static bool gaudi2_get_tpc_idle_status(struct hl_device *hdev, u64 *mask_arr, u8 mask_len, + struct engines_data *e); +static bool gaudi2_get_mme_idle_status(struct hl_device *hdev, u64 *mask_arr, u8 mask_len, + struct engines_data *e); +static bool gaudi2_get_edma_idle_status(struct hl_device *hdev, u64 *mask_arr, u8 mask_len, + struct engines_data *e); static u64 gaudi2_mmu_scramble_addr(struct hl_device *hdev, u64 raw_addr); static void gaudi2_init_scrambler_hbm(struct hl_device *hdev) @@ -2326,11 +2410,14 @@ static int gaudi2_set_fixed_properties(struct hl_device *hdev) prop->pmmu_huge.end_addr = VA_HOST_SPACE_HPAGE_END; } + prop->max_num_of_engines = GAUDI2_ENGINE_ID_SIZE; prop->num_engine_cores = CPU_ID_MAX; prop->cfg_size = CFG_SIZE; prop->max_asid = MAX_ASID; prop->num_of_events = GAUDI2_EVENT_SIZE; + prop->supports_engine_modes = true; + prop->dc_power_default = DC_POWER_DEFAULT; prop->cb_pool_cb_cnt = GAUDI2_CB_POOL_CB_CNT; @@ -4322,7 +4409,6 @@ static int gaudi2_set_engine_cores(struct hl_device *hdev, u32 *core_ids, { int i, rc; - for (i = 0 ; i < num_cores ; i++) { if (gaudi2_is_arc_enabled(hdev, core_ids[i])) gaudi2_set_arc_running_mode(hdev, core_ids[i], core_command); @@ -4344,6 +4430,172 @@ static int gaudi2_set_engine_cores(struct hl_device *hdev, u32 *core_ids, return 0; } +static int gaudi2_set_tpc_engine_mode(struct hl_device *hdev, u32 engine_id, u32 engine_command) +{ + struct gaudi2_device *gaudi2 = hdev->asic_specific; + u32 reg_base, reg_addr, reg_val, tpc_id; + + if (!(gaudi2->tpc_hw_cap_initialized & HW_CAP_TPC_MASK)) + return 0; + + tpc_id = gaudi2_tpc_engine_id_to_tpc_id[engine_id]; + if (!(gaudi2->tpc_hw_cap_initialized & 
BIT_ULL(HW_CAP_TPC_SHIFT + tpc_id))) + return 0; + + reg_base = gaudi2_tpc_cfg_blocks_bases[tpc_id]; + reg_addr = reg_base + TPC_CFG_STALL_OFFSET; + reg_val = FIELD_PREP(DCORE0_TPC0_CFG_TPC_STALL_V_MASK, + !!(engine_command == HL_ENGINE_STALL)); + WREG32(reg_addr, reg_val); + + if (engine_command == HL_ENGINE_RESUME) { + reg_base = gaudi2_tpc_eml_cfg_blocks_bases[tpc_id]; + reg_addr = reg_base + TPC_EML_CFG_DBG_CNT_OFFSET; + RMWREG32(reg_addr, 0x1, DCORE0_TPC0_EML_CFG_DBG_CNT_DBG_EXIT_MASK); + } + + return 0; +} + +static int gaudi2_set_mme_engine_mode(struct hl_device *hdev, u32 engine_id, u32 engine_command) +{ + struct gaudi2_device *gaudi2 = hdev->asic_specific; + u32 reg_base, reg_addr, reg_val, mme_id; + + mme_id = gaudi2_mme_engine_id_to_mme_id[engine_id]; + if (!(gaudi2->hw_cap_initialized & BIT_ULL(HW_CAP_MME_SHIFT + mme_id))) + return 0; + + reg_base = gaudi2_mme_ctrl_lo_blocks_bases[mme_id]; + reg_addr = reg_base + MME_CTRL_LO_QM_STALL_OFFSET; + reg_val = FIELD_PREP(DCORE0_MME_CTRL_LO_QM_STALL_V_MASK, + !!(engine_command == HL_ENGINE_STALL)); + WREG32(reg_addr, reg_val); + + return 0; +} + +static int gaudi2_set_edma_engine_mode(struct hl_device *hdev, u32 engine_id, u32 engine_command) +{ + struct gaudi2_device *gaudi2 = hdev->asic_specific; + u32 reg_base, reg_addr, reg_val, edma_id; + + if (!(gaudi2->hw_cap_initialized & HW_CAP_EDMA_MASK)) + return 0; + + edma_id = gaudi2_edma_engine_id_to_edma_id[engine_id]; + if (!(gaudi2->hw_cap_initialized & BIT_ULL(HW_CAP_EDMA_SHIFT + edma_id))) + return 0; + + reg_base = gaudi2_dma_core_blocks_bases[edma_id]; + reg_addr = reg_base + EDMA_CORE_CFG_STALL_OFFSET; + reg_val = FIELD_PREP(DCORE0_EDMA0_CORE_CFG_1_HALT_MASK, + !!(engine_command == HL_ENGINE_STALL)); + WREG32(reg_addr, reg_val); + + if (engine_command == HL_ENGINE_STALL) { + reg_val = FIELD_PREP(DCORE0_EDMA0_CORE_CFG_1_HALT_MASK, 0x1) | + FIELD_PREP(DCORE0_EDMA0_CORE_CFG_1_FLUSH_MASK, 0x1); + WREG32(reg_addr, reg_val); + } + + return 0; +} + +static int gaudi2_set_engine_modes(struct hl_device *hdev, + u32 *engine_ids, u32 num_engines, u32 engine_command) +{ + int i, rc; + + for (i = 0 ; i < num_engines ; ++i) { + switch (engine_ids[i]) { + case GAUDI2_DCORE0_ENGINE_ID_TPC_0 ... GAUDI2_DCORE0_ENGINE_ID_TPC_5: + case GAUDI2_DCORE1_ENGINE_ID_TPC_0 ... GAUDI2_DCORE1_ENGINE_ID_TPC_5: + case GAUDI2_DCORE2_ENGINE_ID_TPC_0 ... GAUDI2_DCORE2_ENGINE_ID_TPC_5: + case GAUDI2_DCORE3_ENGINE_ID_TPC_0 ... GAUDI2_DCORE3_ENGINE_ID_TPC_5: + rc = gaudi2_set_tpc_engine_mode(hdev, engine_ids[i], engine_command); + if (rc) + return rc; + + break; + case GAUDI2_DCORE0_ENGINE_ID_MME: + case GAUDI2_DCORE1_ENGINE_ID_MME: + case GAUDI2_DCORE2_ENGINE_ID_MME: + case GAUDI2_DCORE3_ENGINE_ID_MME: + rc = gaudi2_set_mme_engine_mode(hdev, engine_ids[i], engine_command); + if (rc) + return rc; + + break; + case GAUDI2_DCORE0_ENGINE_ID_EDMA_0 ... GAUDI2_DCORE0_ENGINE_ID_EDMA_1: + case GAUDI2_DCORE1_ENGINE_ID_EDMA_0 ... GAUDI2_DCORE1_ENGINE_ID_EDMA_1: + case GAUDI2_DCORE2_ENGINE_ID_EDMA_0 ... GAUDI2_DCORE2_ENGINE_ID_EDMA_1: + case GAUDI2_DCORE3_ENGINE_ID_EDMA_0 ... 
GAUDI2_DCORE3_ENGINE_ID_EDMA_1: + rc = gaudi2_set_edma_engine_mode(hdev, engine_ids[i], engine_command); + if (rc) + return rc; + + break; + default: + dev_err(hdev->dev, "Invalid engine ID %u\n", engine_ids[i]); + return -EINVAL; + } + } + + return 0; +} + +static int gaudi2_verify_engine_modes(struct hl_device *hdev, u32 *engine_ids, + u32 num_engines, u32 engine_command) +{ + bool is_engine_idle = true; + u64 mask_arr = 0; + int i; + + gaudi2_get_tpc_idle_status(hdev, &mask_arr, 8 * sizeof(mask_arr), NULL); + gaudi2_get_mme_idle_status(hdev, &mask_arr, 8 * sizeof(mask_arr), NULL); + gaudi2_get_edma_idle_status(hdev, &mask_arr, 8 * sizeof(mask_arr), NULL); + + for (i = 0 ; i < num_engines ; ++i) { + is_engine_idle = !(mask_arr & BIT_ULL(engine_ids[i])); + if ((engine_command == HL_ENGINE_RESUME) && !is_engine_idle) { + dev_err(hdev->dev, "Engine ID %u remained NOT idle!\n", engine_ids[i]); + return -EBUSY; + } else if ((engine_command == HL_ENGINE_STALL) && is_engine_idle) { + dev_err(hdev->dev, "Engine ID %u remained idle!\n", engine_ids[i]); + return -EBUSY; + } + } + + return 0; +} + +static int gaudi2_set_engines(struct hl_device *hdev, u32 *engine_ids, + u32 num_engines, u32 engine_command) +{ + int rc; + + switch (engine_command) { + case HL_ENGINE_CORE_HALT: + case HL_ENGINE_CORE_RUN: + return gaudi2_set_engine_cores(hdev, engine_ids, num_engines, engine_command); + + case HL_ENGINE_STALL: + case HL_ENGINE_RESUME: + rc = gaudi2_set_engine_modes(hdev, engine_ids, num_engines, engine_command); + if (rc) + return rc; + + return gaudi2_verify_engine_modes(hdev, engine_ids, num_engines, engine_command); + + default: + dev_err(hdev->dev, "failed to execute command id %u\n", engine_command); + return -EINVAL; + } + + return 0; +} + static void gaudi2_halt_engines(struct hl_device *hdev, bool hard_reset, bool fw_reset) { u32 wait_timeout_ms; @@ -10883,6 +11135,7 @@ static const struct hl_asic_funcs gaudi2_funcs = { .access_dev_mem = hl_access_dev_mem, .set_dram_bar_base = gaudi2_set_hbm_bar_base, .set_engine_cores = gaudi2_set_engine_cores, + .set_engines = gaudi2_set_engines, .send_device_activity = gaudi2_send_device_activity, .set_dram_properties = gaudi2_set_dram_properties, .set_binning_masks = gaudi2_set_binning_masks, diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2_masks.h b/drivers/accel/habanalabs/gaudi2/gaudi2_masks.h index 74bc1daaeeda..e6664c4a2cf5 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2_masks.h +++ b/drivers/accel/habanalabs/gaudi2/gaudi2_masks.h @@ -86,6 +86,8 @@ #define DCORE0_TPC0_QM_CGM_STS_AGENT_IDLE_MASK 0x100 +#define DCORE0_TPC0_EML_CFG_DBG_CNT_DBG_EXIT_MASK 0x40 + /* CGM_IDLE_MASK is valid for all engines CGM idle check */ #define CGM_IDLE_MASK DCORE0_TPC0_QM_CGM_STS_AGENT_IDLE_MASK diff --git a/drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h b/drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h index 0bf3092bfeea..452b379f39f6 100644 --- a/drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h +++ b/drivers/accel/habanalabs/include/gaudi2/asic_reg/gaudi2_regs.h @@ -164,6 +164,8 @@ #define mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR 0x4800040 +#define mmDCORE0_TPC0_EML_CFG_DBG_CNT 0x40000 + #define SM_OBJS_PROT_BITS_OFFS 0x14000 #define DCORE_OFFSET (mmDCORE1_TPC0_QM_BASE - mmDCORE0_TPC0_QM_BASE) @@ -185,7 +187,10 @@ #define TPC_CFG_STALL_ON_ERR_OFFSET (mmDCORE0_TPC0_CFG_STALL_ON_ERR - mmDCORE0_TPC0_CFG_BASE) #define TPC_CFG_TPC_INTR_MASK_OFFSET (mmDCORE0_TPC0_CFG_TPC_INTR_MASK - mmDCORE0_TPC0_CFG_BASE) #define 
TPC_CFG_MSS_CONFIG_OFFSET (mmDCORE0_TPC0_CFG_MSS_CONFIG - mmDCORE0_TPC0_CFG_BASE) +#define TPC_EML_CFG_DBG_CNT_OFFSET (mmDCORE0_TPC0_EML_CFG_DBG_CNT - mmDCORE0_TPC0_EML_CFG_BASE) +#define EDMA_CORE_CFG_STALL_OFFSET (mmDCORE0_EDMA0_CORE_CFG_1 - mmDCORE0_EDMA0_CORE_BASE) +#define MME_CTRL_LO_QM_STALL_OFFSET (mmDCORE0_MME_CTRL_LO_QM_STALL - mmDCORE0_MME_CTRL_LO_BASE) #define MME_ACC_INTR_MASK_OFFSET (mmDCORE0_MME_ACC_INTR_MASK - mmDCORE0_MME_ACC_BASE) #define MME_ACC_WR_AXI_AGG_COUT0_OFFSET (mmDCORE0_MME_ACC_WR_AXI_AGG_COUT0 - mmDCORE0_MME_ACC_BASE) #define MME_ACC_WR_AXI_AGG_COUT1_OFFSET (mmDCORE0_MME_ACC_WR_AXI_AGG_COUT1 - mmDCORE0_MME_ACC_BASE) diff --git a/include/uapi/drm/habanalabs_accel.h b/include/uapi/drm/habanalabs_accel.h index 359b19ef3c3f..7ca0ef802fd1 100644 --- a/include/uapi/drm/habanalabs_accel.h +++ b/include/uapi/drm/habanalabs_accel.h @@ -1535,17 +1535,31 @@ struct hl_cs_chunk { */ #define HL_CS_FLAGS_FLUSH_PCI_HBW_WRITES 0x8000 +/* + * The engines CS is merged into the existing CS ioctls. + * Use it to control engines modes. + */ +#define HL_CS_FLAGS_ENGINES_COMMAND 0x10000 + #define HL_CS_STATUS_SUCCESS 0 #define HL_MAX_JOBS_PER_CS 512 -/* HL_ENGINE_CORE_ values +/* + * enum hl_engine_command - engine command * - * HL_ENGINE_CORE_HALT: engine core halt - * HL_ENGINE_CORE_RUN: engine core run + * @HL_ENGINE_CORE_HALT: engine core halt + * @HL_ENGINE_CORE_RUN: engine core run + * @HL_ENGINE_STALL: user engine/s stall + * @HL_ENGINE_RESUME: user engine/s resume */ -#define HL_ENGINE_CORE_HALT (1 << 0) -#define HL_ENGINE_CORE_RUN (1 << 1) +enum hl_engine_command { + HL_ENGINE_CORE_HALT = 1, + HL_ENGINE_CORE_RUN = 2, + HL_ENGINE_STALL = 3, + HL_ENGINE_RESUME = 4, + HL_ENGINE_COMMAND_MAX +}; struct hl_cs_in { @@ -1569,6 +1583,18 @@ struct hl_cs_in { /* the core command to be sent towards engine cores */ __u32 core_command; }; + + /* Valid only when HL_CS_FLAGS_ENGINES_COMMAND is set */ + struct { + /* this holds address of array of uint32 for engines */ + __u64 engines; + + /* number of engines in engines array */ + __u32 num_engines; + + /* the engine command to be sent towards engines */ + __u32 engine_command; + }; }; union { -- cgit v1.2.3 From 60d7bbb5b4b875d613a43e3be797ddd4ff92cb7b Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Thu, 16 Mar 2023 12:13:34 +0200 Subject: accel/habanalabs: fix field names in hl_info_hw_ip_info Don't use padX for actual reservedX fields. 
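As a usage illustration for the engines-command interface added in the previous patch (HL_CS_FLAGS_ENGINES_COMMAND plus the new engines/num_engines/engine_command fields of struct hl_cs_in), a minimal user-space sketch might look as follows. This is not part of the patch: the ioctl request macro, the header install path and the engine-ID enum are assumed from the uapi header, and all other fields are left zero for brevity.

    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <drm/habanalabs_accel.h>   /* assumed install path for the uapi header */

    /* Stall one TPC engine, then resume it (engine ID value is illustrative). */
    static int stall_and_resume_tpc(int fd)
    {
        uint32_t engine = GAUDI2_DCORE0_ENGINE_ID_TPC_0;  /* assumed uapi enum */
        union hl_cs_args args = {0};

        args.in.cs_flags = HL_CS_FLAGS_ENGINES_COMMAND;
        args.in.engines = (uintptr_t)&engine;             /* address of a u32 array */
        args.in.num_engines = 1;
        args.in.engine_command = HL_ENGINE_STALL;
        if (ioctl(fd, HL_IOCTL_CS, &args))                /* HL_IOCTL_CS assumed */
            return -1;

        args.in.engine_command = HL_ENGINE_RESUME;
        return ioctl(fd, HL_IOCTL_CS, &args);
    }
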
Signed-off-by: Oded Gabbay Reviewed-by: Ofir Bitton --- include/uapi/drm/habanalabs_accel.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/drm/habanalabs_accel.h b/include/uapi/drm/habanalabs_accel.h index 7ca0ef802fd1..7d457eb4da74 100644 --- a/include/uapi/drm/habanalabs_accel.h +++ b/include/uapi/drm/habanalabs_accel.h @@ -915,17 +915,18 @@ struct hl_info_hw_ip_info { __u64 dram_page_size; __u32 edma_enabled_mask; __u16 number_of_user_interrupts; - __u16 pad2; - __u64 reserved4; + __u8 reserved1; + __u8 reserved2; + __u64 reserved3; __u64 device_mem_alloc_default_page_size; + __u64 reserved4; __u64 reserved5; - __u64 reserved6; - __u32 reserved7; - __u8 reserved8; + __u32 reserved6; + __u8 reserved7; __u8 revision_id; __u16 tpc_interrupt_id; + __u32 reserved8; __u32 reserved9; - __u8 pad3[4]; __u64 engine_core_interrupt_reg_addr; }; -- cgit v1.2.3 From 76e1ff37b6872c9f2d11660258fc8c88b2f97b06 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Mon, 13 Mar 2023 17:10:28 +0200 Subject: accel/habanalabs: expose dram reserved size by kmd We expose this in order for user applications to know how much dram is reserved for internal use. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/habanalabs_ioctl.c | 1 + include/uapi/drm/habanalabs_accel.h | 2 ++ 2 files changed, 3 insertions(+) (limited to 'include/uapi') diff --git a/drivers/accel/habanalabs/common/habanalabs_ioctl.c b/drivers/accel/habanalabs/common/habanalabs_ioctl.c index 0997ede359d7..81e026066f96 100644 --- a/drivers/accel/habanalabs/common/habanalabs_ioctl.c +++ b/drivers/accel/habanalabs/common/habanalabs_ioctl.c @@ -109,6 +109,7 @@ static int hw_ip_info(struct hl_device *hdev, struct hl_info_args *args) hw_ip.security_enabled = prop->fw_security_enabled; hw_ip.revision_id = hdev->pdev->revision; hw_ip.engine_core_interrupt_reg_addr = prop->engine_core_interrupt_reg_addr; + hw_ip.reserved_dram_size = dram_kmd_size; return copy_to_user(out, &hw_ip, min((size_t) size, sizeof(hw_ip))) ? -EFAULT : 0; diff --git a/include/uapi/drm/habanalabs_accel.h b/include/uapi/drm/habanalabs_accel.h index 7d457eb4da74..e43688d30e96 100644 --- a/include/uapi/drm/habanalabs_accel.h +++ b/include/uapi/drm/habanalabs_accel.h @@ -888,6 +888,7 @@ enum hl_server_type { * @tpc_interrupt_id: interrupt id for TPC to use in order to raise events towards the host. * @engine_core_interrupt_reg_addr: interrupt register address for engine core to use * in order to raise events toward FW. + * @reserved_dram_size: DRAM size reserved for driver and firmware. */ struct hl_info_hw_ip_info { __u64 sram_base_address; @@ -928,6 +929,7 @@ struct hl_info_hw_ip_info { __u32 reserved8; __u32 reserved9; __u64 engine_core_interrupt_reg_addr; + __u64 reserved_dram_size; }; struct hl_info_dram_usage { -- cgit v1.2.3 From 958e47977bd12e06752a559541867028b120de76 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Mon, 13 Mar 2023 22:30:23 +0200 Subject: accel/habanalabs: expose rotator mask to userspace All engine masks are exposed to user, make sure user gets the correct rotator enabled mask in gaudi2. 
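For context, both of the newly exposed values (the reserved DRAM size above and the rotator mask in this patch) are read back through the existing hardware-IP query. A hedged sketch of that query follows; the ioctl request macro and header path are assumptions taken from the uapi header, not something introduced by these patches.

    #include <stdio.h>
    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <drm/habanalabs_accel.h>   /* assumed install path for the uapi header */

    static void print_hw_ip(int fd)
    {
        struct hl_info_hw_ip_info hw_ip = {0};
        struct hl_info_args args = {0};

        args.op = HL_INFO_HW_IP_INFO;
        args.return_pointer = (uintptr_t)&hw_ip;
        args.return_size = sizeof(hw_ip);

        if (ioctl(fd, HL_IOCTL_INFO, &args))    /* HL_IOCTL_INFO assumed */
            return;

        /* DRAM kept back by the driver and firmware, not usable by applications */
        printf("reserved DRAM: %llu bytes\n",
               (unsigned long long)hw_ip.reserved_dram_size);
        /* bit i set means rotator i is enabled (relevant for Gaudi3 and later) */
        printf("rotator mask: 0x%x\n", hw_ip.rotator_enabled_mask);
    }
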
Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/accel/habanalabs/common/habanalabs.h | 5 +++-- drivers/accel/habanalabs/common/habanalabs_ioctl.c | 1 + drivers/accel/habanalabs/gaudi2/gaudi2.c | 2 ++ include/uapi/drm/habanalabs_accel.h | 4 +++- 4 files changed, 9 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/drivers/accel/habanalabs/common/habanalabs.h b/drivers/accel/habanalabs/common/habanalabs.h index 98e6d98cf868..c01677ed3c07 100644 --- a/drivers/accel/habanalabs/common/habanalabs.h +++ b/drivers/accel/habanalabs/common/habanalabs.h @@ -609,8 +609,8 @@ struct hl_hints_range { * @cb_pool_cb_cnt: number of CBs in the CB pool. * @cb_pool_cb_size: size of each CB in the CB pool. * @decoder_enabled_mask: which decoders are enabled. - * @decoder_binning_mask: which decoders are binned, 0 means usable and 1 - * means binned (at most one binned decoder per dcore). + * @decoder_binning_mask: which decoders are binned, 0 means usable and 1 means binned. + * @rotator_enabled_mask: which rotators are enabled. * @edma_enabled_mask: which EDMAs are enabled. * @edma_binning_mask: which EDMAs are binned, 0 means usable and 1 means * binned (at most one binned DMA). @@ -760,6 +760,7 @@ struct asic_fixed_properties { u32 cb_pool_cb_size; u32 decoder_enabled_mask; u32 decoder_binning_mask; + u32 rotator_enabled_mask; u32 edma_enabled_mask; u32 edma_binning_mask; u32 max_pending_cs; diff --git a/drivers/accel/habanalabs/common/habanalabs_ioctl.c b/drivers/accel/habanalabs/common/habanalabs_ioctl.c index 81e026066f96..203ee857810c 100644 --- a/drivers/accel/habanalabs/common/habanalabs_ioctl.c +++ b/drivers/accel/habanalabs/common/habanalabs_ioctl.c @@ -108,6 +108,7 @@ static int hw_ip_info(struct hl_device *hdev, struct hl_info_args *args) hw_ip.server_type = prop->server_type; hw_ip.security_enabled = prop->fw_security_enabled; hw_ip.revision_id = hdev->pdev->revision; + hw_ip.rotator_enabled_mask = prop->rotator_enabled_mask; hw_ip.engine_core_interrupt_reg_addr = prop->engine_core_interrupt_reg_addr; hw_ip.reserved_dram_size = dram_kmd_size; diff --git a/drivers/accel/habanalabs/gaudi2/gaudi2.c b/drivers/accel/habanalabs/gaudi2/gaudi2.c index 57c94f9a6042..9f6dbc020d27 100644 --- a/drivers/accel/habanalabs/gaudi2/gaudi2.c +++ b/drivers/accel/habanalabs/gaudi2/gaudi2.c @@ -2315,6 +2315,8 @@ static int gaudi2_set_fixed_properties(struct hl_device *hdev) prop->hints_range_reservation = true; + prop->rotator_enabled_mask = BIT(NUM_OF_ROT) - 1; + if (hdev->pldm) prop->mmu_pgt_size = 0x800000; /* 8MB */ else diff --git a/include/uapi/drm/habanalabs_accel.h b/include/uapi/drm/habanalabs_accel.h index e43688d30e96..c139aab17c8a 100644 --- a/include/uapi/drm/habanalabs_accel.h +++ b/include/uapi/drm/habanalabs_accel.h @@ -886,6 +886,8 @@ enum hl_server_type { * @device_mem_alloc_default_page_size: default page size used in device memory allocation. * @revision_id: PCI revision ID of the ASIC. * @tpc_interrupt_id: interrupt id for TPC to use in order to raise events towards the host. + * @rotator_enabled_mask: Bit-mask that represents which rotators are enabled. + * Relevant for Gaudi3 and later. * @engine_core_interrupt_reg_addr: interrupt register address for engine core to use * in order to raise events toward FW. * @reserved_dram_size: DRAM size reserved for driver and firmware. 
@@ -926,7 +928,7 @@ struct hl_info_hw_ip_info { __u8 reserved7; __u8 revision_id; __u16 tpc_interrupt_id; - __u32 reserved8; + __u32 rotator_enabled_mask; __u32 reserved9; __u64 engine_core_interrupt_reg_addr; __u64 reserved_dram_size; -- cgit v1.2.3 From 02abecdeebfcd3848b26b70778dd7f6eb0db65e1 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 17 Mar 2023 12:18:01 -0600 Subject: drm/i915/uapi: Replace fake flex-array with flexible-array member MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Zero-length arrays as fake flexible arrays are deprecated and we are moving towards adopting C99 flexible-array members instead. Address the following warning found with GCC-13 and -fstrict-flex-arrays=3 enabled: drivers/gpu/drm/i915/gem/i915_gem_context.c: In function ‘set_proto_ctx_engines.isra’: drivers/gpu/drm/i915/gem/i915_gem_context.c:769:41: warning: array subscript n is outside array bounds of ‘struct i915_engine_class_instance[0]’ [-Warray-bounds=] 769 | if (copy_from_user(&ci, &user->engines[n], sizeof(ci))) { | ^~~~~~~~~~~~~~~~~ ./include/uapi/drm/i915_drm.h:2494:43: note: while referencing ‘engines’ 2494 | struct i915_engine_class_instance engines[0]; This helps with the ongoing efforts to tighten the FORTIFY_SOURCE routines on memcpy() and help us make progress towards globally enabling -fstrict-flex-arrays=3 [1]. Link: https://github.com/KSPP/linux/issues/21 Link: https://github.com/KSPP/linux/issues/271 Link: https://gcc.gnu.org/pipermail/gcc-patches/2022-October/602902.html [1] Signed-off-by: Gustavo A. R. Silva Reviewed-by: Kees Cook Signed-off-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/ZBSu2QsUJy31kjSE@work --- include/uapi/drm/i915_drm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 8df261c5ab9b..5e458d6f2895 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -2491,7 +2491,7 @@ struct i915_context_param_engines { #define I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE 0 /* see i915_context_engines_load_balance */ #define I915_CONTEXT_ENGINES_EXT_BOND 1 /* see i915_context_engines_bond */ #define I915_CONTEXT_ENGINES_EXT_PARALLEL_SUBMIT 2 /* see i915_context_engines_parallel_submit */ - struct i915_engine_class_instance engines[0]; + struct i915_engine_class_instance engines[]; } __attribute__((packed)); #define I915_DEFINE_CONTEXT_PARAM_ENGINES(name__, N__) struct { \ -- cgit v1.2.3 From c61d04c9eb4354980839cf938488ca703eba0f83 Mon Sep 17 00:00:00 2001 From: Umesh Nerlige Ramappa Date: Thu, 23 Mar 2023 15:58:58 -0700 Subject: drm/i915/perf: Add engine class instance parameters to perf One or more engines map to a specific OA unit. All reports from these engines are captured in the OA buffer managed by this OA unit. Current i915 OA implementation supports only the OAG unit. OAG primarily caters to render engine, so i915 OA uses render as the default engine in the OA implementation. Since there are more OA units on newer hardware that map to other engines, allow user to pass engine class and instance to select and program specific OA units. 
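As a usage sketch (not part of this patch), a stream that previously always landed on the render OA unit can now name the engine explicitly through two extra property pairs. The property and flag names below are the existing i915 perf uapi plus the two identifiers added here; the metrics-set ID and exponent are illustrative values.

    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <drm/i915_drm.h>   /* header path may vary with the libdrm/kernel install */

    static int open_oa_stream(int drm_fd, uint64_t metrics_set)
    {
        uint64_t props[] = {
            DRM_I915_PERF_PROP_SAMPLE_OA, 1,
            DRM_I915_PERF_PROP_OA_METRICS_SET, metrics_set,
            DRM_I915_PERF_PROP_OA_FORMAT, I915_OA_FORMAT_A24u40_A14u32_B8_C8,
            DRM_I915_PERF_PROP_OA_EXPONENT, 14,          /* sampling period, illustrative */
            /* new in perf revision 6: class and instance must be passed together */
            DRM_I915_PERF_PROP_OA_ENGINE_CLASS, I915_ENGINE_CLASS_RENDER,
            DRM_I915_PERF_PROP_OA_ENGINE_INSTANCE, 0,
        };
        struct drm_i915_perf_open_param param = {
            .flags = I915_PERF_FLAG_FD_CLOEXEC | I915_PERF_FLAG_FD_NONBLOCK,
            .num_properties = sizeof(props) / (2 * sizeof(uint64_t)),
            .properties_ptr = (uintptr_t)props,
        };

        return ioctl(drm_fd, DRM_IOCTL_I915_PERF_OPEN, &param);
    }
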
UMD specific changes for GPUvis support: https://patchwork.freedesktop.org/patch/522827/?series=114023 https://patchwork.freedesktop.org/patch/522822/?series=114023 https://patchwork.freedesktop.org/patch/522826/?series=114023 https://patchwork.freedesktop.org/patch/522828/?series=114023 https://patchwork.freedesktop.org/patch/522816/?series=114023 https://patchwork.freedesktop.org/patch/522825/?series=114023 v2: (Ashutosh) - Clarify commit message - Add drm_dbg - Clarify uapi description v3: (Ashutosh) - Remove irrelevant info from the uapi comment v4: Ensure engine class:instance is passed together (Ashutosh) v5: Remove unnecessary quote (Ashutosh) Signed-off-by: Umesh Nerlige Ramappa Reviewed-by: Ashutosh Dixit Link: https://patchwork.freedesktop.org/patch/msgid/20230323225901.3743681-9-umesh.nerlige.ramappa@intel.com --- drivers/gpu/drm/i915/i915_perf.c | 71 +++++++++++++++++++++++++--------------- include/uapi/drm/i915_drm.h | 19 +++++++++++ 2 files changed, 63 insertions(+), 27 deletions(-) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 8c4bcdce8019..081d1dd743e0 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -4012,48 +4012,32 @@ static int read_properties_unlocked(struct i915_perf *perf, { struct drm_i915_gem_context_param_sseu user_sseu; u64 __user *uprop = uprops; + bool config_instance = false; + bool config_class = false; bool config_sseu = false; + u8 class, instance; u32 i; int ret; memset(props, 0, sizeof(struct perf_open_properties)); props->poll_oa_period = DEFAULT_POLL_PERIOD_NS; - if (!n_props) { - drm_dbg(&perf->i915->drm, - "No i915 perf properties given\n"); - return -EINVAL; - } - - /* At the moment we only support using i915-perf on the RCS. */ - props->engine = intel_engine_lookup_user(perf->i915, - I915_ENGINE_CLASS_RENDER, - 0); - if (!props->engine) { - drm_dbg(&perf->i915->drm, - "No RENDER-capable engines\n"); - return -EINVAL; - } - - if (!engine_supports_oa(props->engine)) { - drm_dbg(&perf->i915->drm, - "Engine not supported by OA %d:%d\n", - I915_ENGINE_CLASS_RENDER, 0); - return -EINVAL; - } - /* Considering that ID = 0 is reserved and assuming that we don't * (currently) expect any configurations to ever specify duplicate * values for a particular property ID then the last _PROP_MAX value is * one greater than the maximum number of properties we expect to get * from userspace. 
*/ - if (n_props >= DRM_I915_PERF_PROP_MAX) { + if (!n_props || n_props >= DRM_I915_PERF_PROP_MAX) { drm_dbg(&perf->i915->drm, - "More i915 perf properties specified than exist\n"); + "Invalid number of i915 perf properties given\n"); return -EINVAL; } + /* Defaults when class:instance is not passed */ + class = I915_ENGINE_CLASS_RENDER; + instance = 0; + for (i = 0; i < n_props; i++) { u64 oa_period, oa_freq_hz; u64 id, value; @@ -4174,7 +4158,15 @@ static int read_properties_unlocked(struct i915_perf *perf, } props->poll_oa_period = value; break; - case DRM_I915_PERF_PROP_MAX: + case DRM_I915_PERF_PROP_OA_ENGINE_CLASS: + class = (u8)value; + config_class = true; + break; + case DRM_I915_PERF_PROP_OA_ENGINE_INSTANCE: + instance = (u8)value; + config_instance = true; + break; + default: MISSING_CASE(id); return -EINVAL; } @@ -4182,6 +4174,28 @@ static int read_properties_unlocked(struct i915_perf *perf, uprop += 2; } + if ((config_class && !config_instance) || + (config_instance && !config_class)) { + drm_dbg(&perf->i915->drm, + "OA engine-class and engine-instance parameters must be passed together\n"); + return -EINVAL; + } + + props->engine = intel_engine_lookup_user(perf->i915, class, instance); + if (!props->engine) { + drm_dbg(&perf->i915->drm, + "OA engine class and instance invalid %d:%d\n", + class, instance); + return -EINVAL; + } + + if (!engine_supports_oa(props->engine)) { + drm_dbg(&perf->i915->drm, + "Engine not supported by OA %d:%d\n", + class, instance); + return -EINVAL; + } + if (config_sseu) { ret = get_sseu_config(&props->sseu, props->engine, &user_sseu); if (ret) { @@ -5158,8 +5172,11 @@ int i915_perf_ioctl_version(void) * * 5: Add DRM_I915_PERF_PROP_POLL_OA_PERIOD parameter that controls the * interval for the hrtimer used to check for OA data. + * + * 6: Add DRM_I915_PERF_PROP_OA_ENGINE_CLASS and + * DRM_I915_PERF_PROP_OA_ENGINE_INSTANCE */ - return 5; + return 6; } #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 5e458d6f2895..384e74cac539 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -2758,6 +2758,25 @@ enum drm_i915_perf_property_id { */ DRM_I915_PERF_PROP_POLL_OA_PERIOD, + /** + * Multiple engines may be mapped to the same OA unit. The OA unit is + * identified by class:instance of any engine mapped to it. + * + * This parameter specifies the engine class and must be passed along + * with DRM_I915_PERF_PROP_OA_ENGINE_INSTANCE. + * + * This property is available in perf revision 6. + */ + DRM_I915_PERF_PROP_OA_ENGINE_CLASS, + + /** + * This parameter specifies the engine instance and must be passed along + * with DRM_I915_PERF_PROP_OA_ENGINE_CLASS. + * + * This property is available in perf revision 6. + */ + DRM_I915_PERF_PROP_OA_ENGINE_INSTANCE, + DRM_I915_PERF_PROP_MAX /* non-ABI */ }; -- cgit v1.2.3 From 1cc064dce4ed0ff111b6d6cb06b3cccf1cba29f5 Mon Sep 17 00:00:00 2001 From: Umesh Nerlige Ramappa Date: Thu, 23 Mar 2023 15:58:59 -0700 Subject: drm/i915/perf: Add support for OA media units MTL introduces additional OA units dedicated to media use cases. Add support for programming these OA units by passing the media engine class and instance parameters. 
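Building on the property array sketched for the previous patch, selecting a media OAM unit is, as far as the uapi goes, just a different engine class plus one of the OAM report formats added below; the instance value is illustrative and the class/instance pair still has to be passed together.

    /* replacement entries in the props[] array from the previous sketch */
    DRM_I915_PERF_PROP_OA_ENGINE_CLASS, I915_ENGINE_CLASS_VIDEO,
    DRM_I915_PERF_PROP_OA_ENGINE_INSTANCE, 0,
    DRM_I915_PERF_PROP_OA_FORMAT, I915_OAM_FORMAT_MPEC8u32_B8_C8,
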
UMD specific changes for GPUvis support: https://patchwork.freedesktop.org/patch/522827/?series=114023 https://patchwork.freedesktop.org/patch/522822/?series=114023 https://patchwork.freedesktop.org/patch/522826/?series=114023 https://patchwork.freedesktop.org/patch/522828/?series=114023 https://patchwork.freedesktop.org/patch/522816/?series=114023 https://patchwork.freedesktop.org/patch/522825/?series=114023 v2: (Ashutosh) - check for IP_VER(12, 70) instead of MTL - remove PERF_GROUP_OAG comment in mtl_oa_base - remove oa_buffer.group - use engine->oa_group->type in engine_supports_oa_format - remove fw_domains and use FORCEWAKE_ALL - remove MPES/MPEC comment - s/xehp/mtl/ in b counter validation function name - remove engine_supports_oa in __oa_engine_group - remove warn_ON from __oam_engine_group - refactor oa_init_groups and oa_init_regs - assign g->type correctly - use enum oa_type definition v3: (Ashutosh) - Drop oa_unit_functional as engine_supports_oa is enough v4: - s/DRM_DEBUG/drm_dbg/ Signed-off-by: Umesh Nerlige Ramappa Reviewed-by: Ashutosh Dixit Link: https://patchwork.freedesktop.org/patch/msgid/20230323225901.3743681-10-umesh.nerlige.ramappa@intel.com --- drivers/gpu/drm/i915/i915_drv.h | 2 + drivers/gpu/drm/i915/i915_pci.c | 1 + drivers/gpu/drm/i915/i915_perf.c | 184 +++++++++++++++++++++++++++---- drivers/gpu/drm/i915/i915_perf_oa_regs.h | 78 +++++++++++++ drivers/gpu/drm/i915/i915_perf_types.h | 30 +++++ drivers/gpu/drm/i915/intel_device_info.h | 1 + include/uapi/drm/i915_drm.h | 4 + 7 files changed, 278 insertions(+), 22 deletions(-) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index eaf738698149..d796bdf41af9 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -905,6 +905,8 @@ IS_SUBPLATFORM(const struct drm_i915_private *i915, (INTEL_INFO(dev_priv)->has_oa_bpc_reporting) #define HAS_OA_SLICE_CONTRIB_LIMITS(dev_priv) \ (INTEL_INFO(dev_priv)->has_oa_slice_contrib_limits) +#define HAS_OAM(dev_priv) \ + (INTEL_INFO(dev_priv)->has_oam) /* * Set this flag, when platform requires 64K GTT page sizes or larger for diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c index 6b4a66734e09..c3231e323b98 100644 --- a/drivers/gpu/drm/i915/i915_pci.c +++ b/drivers/gpu/drm/i915/i915_pci.c @@ -1027,6 +1027,7 @@ static const struct intel_device_info adl_p_info = { .has_mslice_steering = 1, \ .has_oa_bpc_reporting = 1, \ .has_oa_slice_contrib_limits = 1, \ + .has_oam = 1, \ .has_rc6 = 1, \ .has_reset_engine = 1, \ .has_rps = 1, \ diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 081d1dd743e0..247dad57ab89 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -192,6 +192,7 @@ */ #include +#include #include #include @@ -326,6 +327,12 @@ static const struct i915_oa_format oa_formats[I915_OA_FORMAT_MAX] = { [I915_OA_FORMAT_A32u40_A4u32_B8_C8] = { 5, 256 }, [I915_OAR_FORMAT_A32u40_A4u32_B8_C8] = { 5, 256 }, [I915_OA_FORMAT_A24u40_A14u32_B8_C8] = { 5, 256 }, + [I915_OAM_FORMAT_MPEC8u64_B8_C8] = { 1, 192, TYPE_OAM, HDR_64_BIT }, + [I915_OAM_FORMAT_MPEC8u32_B8_C8] = { 2, 128, TYPE_OAM, HDR_64_BIT }, +}; + +static const u32 mtl_oa_base[] = { + [PERF_GROUP_OAM_SAMEDIA_0] = 0x393000, }; #define SAMPLE_OA_REPORT (1<<0) @@ -418,11 +425,17 @@ static void free_oa_config_bo(struct i915_oa_config_bo *oa_bo) kfree(oa_bo); } +static inline const +struct i915_perf_regs *__oa_regs(struct i915_perf_stream *stream) +{ + 
return &stream->engine->oa_group->regs; +} + static u32 gen12_oa_hw_tail_read(struct i915_perf_stream *stream) { struct intel_uncore *uncore = stream->uncore; - return intel_uncore_read(uncore, GEN12_OAG_OATAILPTR) & + return intel_uncore_read(uncore, __oa_regs(stream)->oa_tail_ptr) & GEN12_OAG_OATAILPTR_MASK; } @@ -875,7 +888,8 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream, i915_reg_t oaheadptr; oaheadptr = GRAPHICS_VER(stream->perf->i915) == 12 ? - GEN12_OAG_OAHEADPTR : GEN8_OAHEADPTR; + __oa_regs(stream)->oa_head_ptr : + GEN8_OAHEADPTR; spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags); @@ -928,7 +942,8 @@ static int gen8_oa_read(struct i915_perf_stream *stream, return -EIO; oastatus_reg = GRAPHICS_VER(stream->perf->i915) == 12 ? - GEN12_OAG_OASTATUS : GEN8_OASTATUS; + __oa_regs(stream)->oa_status : + GEN8_OASTATUS; oastatus = intel_uncore_read(uncore, oastatus_reg); @@ -1637,6 +1652,11 @@ static bool engine_supports_oa(const struct intel_engine_cs *engine) return engine->oa_group; } +static bool engine_supports_oa_format(struct intel_engine_cs *engine, int type) +{ + return engine->oa_group && engine->oa_group->type == type; +} + static void i915_oa_stream_destroy(struct i915_perf_stream *stream) { struct i915_perf *perf = stream->perf; @@ -1788,8 +1808,8 @@ static void gen12_init_oa_buffer(struct i915_perf_stream *stream) spin_lock_irqsave(&stream->oa_buffer.ptr_lock, flags); - intel_uncore_write(uncore, GEN12_OAG_OASTATUS, 0); - intel_uncore_write(uncore, GEN12_OAG_OAHEADPTR, + intel_uncore_write(uncore, __oa_regs(stream)->oa_status, 0); + intel_uncore_write(uncore, __oa_regs(stream)->oa_head_ptr, gtt_offset & GEN12_OAG_OAHEADPTR_MASK); stream->oa_buffer.head = gtt_offset; @@ -1801,9 +1821,9 @@ static void gen12_init_oa_buffer(struct i915_perf_stream *stream) * to enable proper functionality of the overflow * bit." */ - intel_uncore_write(uncore, GEN12_OAG_OABUFFER, gtt_offset | + intel_uncore_write(uncore, __oa_regs(stream)->oa_buffer, gtt_offset | OABUFFER_SIZE_16M | GEN8_OABUFFER_MEM_SELECT_GGTT); - intel_uncore_write(uncore, GEN12_OAG_OATAILPTR, + intel_uncore_write(uncore, __oa_regs(stream)->oa_tail_ptr, gtt_offset & GEN12_OAG_OATAILPTR_MASK); /* Mark that we need updated tail pointers to read from... */ @@ -2563,7 +2583,8 @@ err_add_request: return err; } -static int gen8_configure_context(struct i915_gem_context *ctx, +static int gen8_configure_context(struct i915_perf_stream *stream, + struct i915_gem_context *ctx, struct flex *flex, unsigned int count) { struct i915_gem_engines_iter it; @@ -2704,7 +2725,7 @@ oa_configure_all_contexts(struct i915_perf_stream *stream, spin_unlock(&i915->gem.contexts.lock); - err = gen8_configure_context(ctx, regs, num_regs); + err = gen8_configure_context(stream, ctx, regs, num_regs); if (err) { i915_gem_context_put(ctx); return err; @@ -2749,6 +2770,9 @@ gen12_configure_all_contexts(struct i915_perf_stream *stream, }, }; + if (stream->engine->class != RENDER_CLASS) + return 0; + return oa_configure_all_contexts(stream, regs, ARRAY_SIZE(regs), active); @@ -2878,7 +2902,7 @@ gen12_enable_metric_set(struct i915_perf_stream *stream, _MASKED_BIT_ENABLE(GEN12_DISABLE_DOP_GATING)); } - intel_uncore_write(uncore, GEN12_OAG_OA_DEBUG, + intel_uncore_write(uncore, __oa_regs(stream)->oa_debug, /* Disable clk ratio reports, like previous Gens. 
*/ _MASKED_BIT_ENABLE(GEN12_OAG_OA_DEBUG_DISABLE_CLK_RATIO_REPORTS | GEN12_OAG_OA_DEBUG_INCLUDE_CLK_RATIO) | @@ -2888,7 +2912,7 @@ gen12_enable_metric_set(struct i915_perf_stream *stream, */ oag_report_ctx_switches(stream)); - intel_uncore_write(uncore, GEN12_OAG_OAGLBCTXCTRL, periodic ? + intel_uncore_write(uncore, __oa_regs(stream)->oa_ctx_ctrl, periodic ? (GEN12_OAG_OAGLBCTXCTRL_COUNTER_RESUME | GEN12_OAG_OAGLBCTXCTRL_TIMER_ENABLE | (period_exponent << GEN12_OAG_OAGLBCTXCTRL_TIMER_PERIOD_SHIFT)) @@ -3042,8 +3066,8 @@ static void gen8_oa_enable(struct i915_perf_stream *stream) static void gen12_oa_enable(struct i915_perf_stream *stream) { - struct intel_uncore *uncore = stream->uncore; - u32 report_format = stream->oa_buffer.format->format; + const struct i915_perf_regs *regs; + u32 val; /* * If we don't want OA reports from the OA buffer, then we don't even @@ -3054,9 +3078,11 @@ static void gen12_oa_enable(struct i915_perf_stream *stream) gen12_init_oa_buffer(stream); - intel_uncore_write(uncore, GEN12_OAG_OACONTROL, - (report_format << GEN12_OAG_OACONTROL_OA_COUNTER_FORMAT_SHIFT) | - GEN12_OAG_OACONTROL_OA_COUNTER_ENABLE); + regs = __oa_regs(stream); + val = (stream->oa_buffer.format->format << regs->oa_ctrl_counter_format_shift) | + GEN12_OAG_OACONTROL_OA_COUNTER_ENABLE; + + intel_uncore_write(stream->uncore, regs->oa_ctrl, val); } /** @@ -3108,9 +3134,9 @@ static void gen12_oa_disable(struct i915_perf_stream *stream) { struct intel_uncore *uncore = stream->uncore; - intel_uncore_write(uncore, GEN12_OAG_OACONTROL, 0); + intel_uncore_write(uncore, __oa_regs(stream)->oa_ctrl, 0); if (intel_wait_for_register(uncore, - GEN12_OAG_OACONTROL, + __oa_regs(stream)->oa_ctrl, GEN12_OAG_OACONTROL_OA_COUNTER_ENABLE, 0, 50)) drm_err(&stream->perf->i915->drm, @@ -4011,6 +4037,7 @@ static int read_properties_unlocked(struct i915_perf *perf, struct perf_open_properties *props) { struct drm_i915_gem_context_param_sseu user_sseu; + const struct i915_oa_format *f; u64 __user *uprop = uprops; bool config_instance = false; bool config_class = false; @@ -4196,6 +4223,15 @@ static int read_properties_unlocked(struct i915_perf *perf, return -EINVAL; } + i = array_index_nospec(props->oa_format, I915_OA_FORMAT_MAX); + f = &perf->oa_formats[i]; + if (!engine_supports_oa_format(props->engine, f->type)) { + drm_dbg(&perf->i915->drm, + "Invalid OA format %d for class %d\n", + f->type, props->engine->class); + return -EINVAL; + } + if (config_sseu) { ret = get_sseu_config(&props->sseu, props->engine, &user_sseu); if (ret) { @@ -4376,6 +4412,14 @@ static const struct i915_range gen12_oa_b_counters[] = { {} }; +static const struct i915_range mtl_oam_b_counters[] = { + { .start = 0x393000, .end = 0x39301c }, /* GEN12_OAM_STARTTRIG1[1-8] */ + { .start = 0x393020, .end = 0x39303c }, /* GEN12_OAM_REPORTTRIG1[1-8] */ + { .start = 0x393040, .end = 0x39307c }, /* GEN12_OAM_CEC[0-7][0-1] */ + { .start = 0x393200, .end = 0x39323C }, /* MPES[0-7] */ + {} +}; + static const struct i915_range xehp_oa_b_counters[] = { { .start = 0xdc48, .end = 0xdc48 }, /* OAA_ENABLE_REG */ { .start = 0xdd00, .end = 0xdd48 }, /* OAG_LCE0_0 - OAA_LENABLE_REG */ @@ -4429,6 +4473,8 @@ static const struct i915_range mtl_oa_mux_regs[] = { { .start = 0x0d0c, .end = 0x0d2c }, /* NOA_CONFIG[0-8] */ { .start = 0x9840, .end = 0x9840 }, /* GDT_CHICKEN_BITS */ { .start = 0x9884, .end = 0x9888 }, /* NOA_WRITE */ + { .start = 0x38d100, .end = 0x38d114}, /* VISACTL */ + {} }; static bool gen7_is_valid_b_counter_addr(struct i915_perf *perf, u32 addr) @@ -4466,10 
+4512,20 @@ static bool gen12_is_valid_b_counter_addr(struct i915_perf *perf, u32 addr) return reg_in_range_table(addr, gen12_oa_b_counters); } +static bool mtl_is_valid_oam_b_counter_addr(struct i915_perf *perf, u32 addr) +{ + if (HAS_OAM(perf->i915) && + GRAPHICS_VER_FULL(perf->i915) >= IP_VER(12, 70)) + return reg_in_range_table(addr, mtl_oam_b_counters); + + return false; +} + static bool xehp_is_valid_b_counter_addr(struct i915_perf *perf, u32 addr) { return reg_in_range_table(addr, xehp_oa_b_counters) || - reg_in_range_table(addr, gen12_oa_b_counters); + reg_in_range_table(addr, gen12_oa_b_counters) || + mtl_is_valid_oam_b_counter_addr(perf, addr); } static bool gen12_is_valid_mux_addr(struct i915_perf *perf, u32 addr) @@ -4839,12 +4895,86 @@ static u32 num_perf_groups_per_gt(struct intel_gt *gt) return 1; } +static u32 __oam_engine_group(struct intel_engine_cs *engine) +{ + if (GRAPHICS_VER_FULL(engine->i915) >= IP_VER(12, 70)) { + /* + * There's 1 SAMEDIA gt and 1 OAM per SAMEDIA gt. All media slices + * within the gt use the same OAM. All MTL SKUs list 1 SA MEDIA. + */ + drm_WARN_ON(&engine->i915->drm, + engine->gt->type != GT_MEDIA); + + return PERF_GROUP_OAM_SAMEDIA_0; + } + + return PERF_GROUP_INVALID; +} + static u32 __oa_engine_group(struct intel_engine_cs *engine) { - if (engine->class == RENDER_CLASS) + switch (engine->class) { + case RENDER_CLASS: return PERF_GROUP_OAG; - else + + case VIDEO_DECODE_CLASS: + case VIDEO_ENHANCEMENT_CLASS: + return __oam_engine_group(engine); + + default: return PERF_GROUP_INVALID; + } +} + +static struct i915_perf_regs __oam_regs(u32 base) +{ + return (struct i915_perf_regs) { + base, + GEN12_OAM_HEAD_POINTER(base), + GEN12_OAM_TAIL_POINTER(base), + GEN12_OAM_BUFFER(base), + GEN12_OAM_CONTEXT_CONTROL(base), + GEN12_OAM_CONTROL(base), + GEN12_OAM_DEBUG(base), + GEN12_OAM_STATUS(base), + GEN12_OAM_CONTROL_COUNTER_FORMAT_SHIFT, + }; +} + +static struct i915_perf_regs __oag_regs(void) +{ + return (struct i915_perf_regs) { + 0, + GEN12_OAG_OAHEADPTR, + GEN12_OAG_OATAILPTR, + GEN12_OAG_OABUFFER, + GEN12_OAG_OAGLBCTXCTRL, + GEN12_OAG_OACONTROL, + GEN12_OAG_OA_DEBUG, + GEN12_OAG_OASTATUS, + GEN12_OAG_OACONTROL_OA_COUNTER_FORMAT_SHIFT, + }; +} + +static void oa_init_groups(struct intel_gt *gt) +{ + int i, num_groups = gt->perf.num_perf_groups; + + for (i = 0; i < num_groups; i++) { + struct i915_perf_group *g = >->perf.group[i]; + + /* Fused off engines can result in a group with num_engines == 0 */ + if (g->num_engines == 0) + continue; + + if (i == PERF_GROUP_OAG && gt->type != GT_MEDIA) { + g->regs = __oag_regs(); + g->type = TYPE_OAG; + } else if (GRAPHICS_VER_FULL(gt->i915) >= IP_VER(12, 70)) { + g->regs = __oam_regs(mtl_oa_base[i]); + g->type = TYPE_OAM; + } + } } static int oa_init_gt(struct intel_gt *gt) @@ -4871,6 +5001,8 @@ static int oa_init_gt(struct intel_gt *gt) gt->perf.num_perf_groups = num_groups; gt->perf.group = g; + oa_init_groups(gt); + return 0; } @@ -4928,9 +5060,15 @@ static void oa_init_supported_formats(struct i915_perf *perf) break; case INTEL_DG2: + oa_format_add(perf, I915_OAR_FORMAT_A32u40_A4u32_B8_C8); + oa_format_add(perf, I915_OA_FORMAT_A24u40_A14u32_B8_C8); + break; + case INTEL_METEORLAKE: oa_format_add(perf, I915_OAR_FORMAT_A32u40_A4u32_B8_C8); oa_format_add(perf, I915_OA_FORMAT_A24u40_A14u32_B8_C8); + oa_format_add(perf, I915_OAM_FORMAT_MPEC8u64_B8_C8); + oa_format_add(perf, I915_OAM_FORMAT_MPEC8u32_B8_C8); break; default: @@ -5175,8 +5313,10 @@ int i915_perf_ioctl_version(void) * * 6: Add 
DRM_I915_PERF_PROP_OA_ENGINE_CLASS and * DRM_I915_PERF_PROP_OA_ENGINE_INSTANCE + * + * 7: Add support for video decode and enhancement classes. */ - return 6; + return 7; } #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) diff --git a/drivers/gpu/drm/i915/i915_perf_oa_regs.h b/drivers/gpu/drm/i915/i915_perf_oa_regs.h index 381d94101610..ba103875e19f 100644 --- a/drivers/gpu/drm/i915/i915_perf_oa_regs.h +++ b/drivers/gpu/drm/i915/i915_perf_oa_regs.h @@ -138,4 +138,82 @@ #define GEN12_SQCNT1_PMON_ENABLE REG_BIT(30) #define GEN12_SQCNT1_OABPC REG_BIT(29) +/* Gen12 OAM unit */ +#define GEN12_OAM_HEAD_POINTER_OFFSET (0x1a0) +#define GEN12_OAM_HEAD_POINTER_MASK 0xffffffc0 + +#define GEN12_OAM_TAIL_POINTER_OFFSET (0x1a4) +#define GEN12_OAM_TAIL_POINTER_MASK 0xffffffc0 + +#define GEN12_OAM_BUFFER_OFFSET (0x1a8) +#define GEN12_OAM_BUFFER_SIZE_MASK (0x7) +#define GEN12_OAM_BUFFER_SIZE_SHIFT (3) +#define GEN12_OAM_BUFFER_MEMORY_SELECT REG_BIT(0) /* 0: PPGTT, 1: GGTT */ + +#define GEN12_OAM_CONTEXT_CONTROL_OFFSET (0x1bc) +#define GEN12_OAM_CONTEXT_CONTROL_TIMER_PERIOD_SHIFT 2 +#define GEN12_OAM_CONTEXT_CONTROL_TIMER_ENABLE REG_BIT(1) +#define GEN12_OAM_CONTEXT_CONTROL_COUNTER_RESUME REG_BIT(0) + +#define GEN12_OAM_CONTROL_OFFSET (0x194) +#define GEN12_OAM_CONTROL_COUNTER_FORMAT_SHIFT 1 +#define GEN12_OAM_CONTROL_COUNTER_ENABLE REG_BIT(0) + +#define GEN12_OAM_DEBUG_OFFSET (0x198) +#define GEN12_OAM_DEBUG_BUFFER_SIZE_SELECT REG_BIT(12) +#define GEN12_OAM_DEBUG_INCLUDE_CLK_RATIO REG_BIT(6) +#define GEN12_OAM_DEBUG_DISABLE_CLK_RATIO_REPORTS REG_BIT(5) +#define GEN12_OAM_DEBUG_DISABLE_GO_1_0_REPORTS REG_BIT(2) +#define GEN12_OAM_DEBUG_DISABLE_CTX_SWITCH_REPORTS REG_BIT(1) + +#define GEN12_OAM_STATUS_OFFSET (0x19c) +#define GEN12_OAM_STATUS_COUNTER_OVERFLOW REG_BIT(2) +#define GEN12_OAM_STATUS_BUFFER_OVERFLOW REG_BIT(1) +#define GEN12_OAM_STATUS_REPORT_LOST REG_BIT(0) + +#define GEN12_OAM_MMIO_TRG_OFFSET (0x1d0) + +#define GEN12_OAM_MMIO_TRG(base) \ + _MMIO((base) + GEN12_OAM_MMIO_TRG_OFFSET) + +#define GEN12_OAM_HEAD_POINTER(base) \ + _MMIO((base) + GEN12_OAM_HEAD_POINTER_OFFSET) +#define GEN12_OAM_TAIL_POINTER(base) \ + _MMIO((base) + GEN12_OAM_TAIL_POINTER_OFFSET) +#define GEN12_OAM_BUFFER(base) \ + _MMIO((base) + GEN12_OAM_BUFFER_OFFSET) +#define GEN12_OAM_CONTEXT_CONTROL(base) \ + _MMIO((base) + GEN12_OAM_CONTEXT_CONTROL_OFFSET) +#define GEN12_OAM_CONTROL(base) \ + _MMIO((base) + GEN12_OAM_CONTROL_OFFSET) +#define GEN12_OAM_DEBUG(base) \ + _MMIO((base) + GEN12_OAM_DEBUG_OFFSET) +#define GEN12_OAM_STATUS(base) \ + _MMIO((base) + GEN12_OAM_STATUS_OFFSET) + +#define GEN12_OAM_CEC0_0_OFFSET (0x40) +#define GEN12_OAM_CEC7_1_OFFSET (0x7c) +#define GEN12_OAM_CEC0_0(base) \ + _MMIO((base) + GEN12_OAM_CEC0_0_OFFSET) +#define GEN12_OAM_CEC7_1(base) \ + _MMIO((base) + GEN12_OAM_CEC7_1_OFFSET) + +#define GEN12_OAM_STARTTRIG1_OFFSET (0x00) +#define GEN12_OAM_STARTTRIG8_OFFSET (0x1c) +#define GEN12_OAM_STARTTRIG1(base) \ + _MMIO((base) + GEN12_OAM_STARTTRIG1_OFFSET) +#define GEN12_OAM_STARTTRIG8(base) \ + _MMIO((base) + GEN12_OAM_STARTTRIG8_OFFSET) + +#define GEN12_OAM_REPORTTRIG1_OFFSET (0x20) +#define GEN12_OAM_REPORTTRIG8_OFFSET (0x3c) +#define GEN12_OAM_REPORTTRIG1(base) \ + _MMIO((base) + GEN12_OAM_REPORTTRIG1_OFFSET) +#define GEN12_OAM_REPORTTRIG8(base) \ + _MMIO((base) + GEN12_OAM_REPORTTRIG8_OFFSET) + +#define GEN12_OAM_PERF_COUNTER_B0_OFFSET (0x84) +#define GEN12_OAM_PERF_COUNTER_B(base, idx) \ + _MMIO((base) + GEN12_OAM_PERF_COUNTER_B0_OFFSET + 4 * (idx)) + #endif /* __INTEL_PERF_OA_REGS__ */ diff --git 
a/drivers/gpu/drm/i915/i915_perf_types.h b/drivers/gpu/drm/i915/i915_perf_types.h index d8b92508a632..66dd5f74de05 100644 --- a/drivers/gpu/drm/i915/i915_perf_types.h +++ b/drivers/gpu/drm/i915/i915_perf_types.h @@ -20,6 +20,7 @@ #include "gt/intel_engine_types.h" #include "gt/intel_sseu.h" #include "i915_reg_defs.h" +#include "intel_uncore.h" #include "intel_wakeref.h" struct drm_i915_private; @@ -33,6 +34,7 @@ struct intel_engine_cs; enum { PERF_GROUP_OAG = 0, + PERF_GROUP_OAM_SAMEDIA_0 = 0, PERF_GROUP_MAX, PERF_GROUP_INVALID = U32_MAX, @@ -43,9 +45,27 @@ enum report_header { HDR_64_BIT, }; +struct i915_perf_regs { + u32 base; + i915_reg_t oa_head_ptr; + i915_reg_t oa_tail_ptr; + i915_reg_t oa_buffer; + i915_reg_t oa_ctx_ctrl; + i915_reg_t oa_ctrl; + i915_reg_t oa_debug; + i915_reg_t oa_status; + u32 oa_ctrl_counter_format_shift; +}; + +enum oa_type { + TYPE_OAG, + TYPE_OAM, +}; + struct i915_oa_format { u32 format; int size; + int type; enum report_header header; }; @@ -416,6 +436,16 @@ struct i915_perf_group { * @num_engines: The number of engines using this OA unit. */ u32 num_engines; + + /* + * @regs: OA buffer register group for programming the OA unit. + */ + struct i915_perf_regs regs; + + /* + * @type: Type of OA unit - OAM, OAG etc. + */ + enum oa_type type; }; struct i915_perf_gt { diff --git a/drivers/gpu/drm/i915/intel_device_info.h b/drivers/gpu/drm/i915/intel_device_info.h index d588e5fd2eea..8281513fe072 100644 --- a/drivers/gpu/drm/i915/intel_device_info.h +++ b/drivers/gpu/drm/i915/intel_device_info.h @@ -166,6 +166,7 @@ enum intel_ppgtt_type { func(has_mslice_steering); \ func(has_oa_bpc_reporting); \ func(has_oa_slice_contrib_limits); \ + func(has_oam); \ func(has_one_eu_per_fuse_bit); \ func(has_pxp); \ func(has_rc6); \ diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 384e74cac539..dba7c5a5b25e 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -2676,6 +2676,10 @@ enum drm_i915_oa_format { I915_OAR_FORMAT_A32u40_A4u32_B8_C8, I915_OA_FORMAT_A24u40_A14u32_B8_C8, + /* MTL OAM */ + I915_OAM_FORMAT_MPEC8u64_B8_C8, + I915_OAM_FORMAT_MPEC8u32_B8_C8, + I915_OA_FORMAT_MAX /* non-ABI */ }; -- cgit v1.2.3 From d71c11cc79d259c059f4ad377c0f930263f77c53 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Tue, 28 Feb 2023 10:10:11 -0800 Subject: dma-buf/sync_file: Surface sync-file uABI We had all of the internal driver APIs, but not the all important userspace uABI, in the dma-buf doc. Fix that. And re-arrange the comments slightly as otherwise the comments for the ioctl nr defines would not show up. v2: Fix docs build warning coming from newly including the uabi header in the docs build Signed-off-by: Rob Clark Acked-by: Pekka Paalanen --- Documentation/driver-api/dma-buf.rst | 10 ++++++++-- include/uapi/linux/sync_file.h | 37 +++++++++++++++--------------------- 2 files changed, 23 insertions(+), 24 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/driver-api/dma-buf.rst b/Documentation/driver-api/dma-buf.rst index 0625e7e21f6f..f92a32d095d9 100644 --- a/Documentation/driver-api/dma-buf.rst +++ b/Documentation/driver-api/dma-buf.rst @@ -203,8 +203,8 @@ DMA Fence unwrap .. kernel-doc:: include/linux/dma-fence-unwrap.h :internal: -DMA Fence uABI/Sync File -~~~~~~~~~~~~~~~~~~~~~~~~ +DMA Fence Sync File +~~~~~~~~~~~~~~~~~~~ .. kernel-doc:: drivers/dma-buf/sync_file.c :export: @@ -212,6 +212,12 @@ DMA Fence uABI/Sync File .. 
kernel-doc:: include/linux/sync_file.h :internal: +DMA Fence Sync File uABI +~~~~~~~~~~~~~~~~~~~~~~~~ + +.. kernel-doc:: include/uapi/linux/sync_file.h + :internal: + Indefinite DMA Fences ~~~~~~~~~~~~~~~~~~~~~ diff --git a/include/uapi/linux/sync_file.h b/include/uapi/linux/sync_file.h index ee2dcfb3d660..7e42a5b7558b 100644 --- a/include/uapi/linux/sync_file.h +++ b/include/uapi/linux/sync_file.h @@ -16,12 +16,16 @@ #include /** - * struct sync_merge_data - data passed to merge ioctl + * struct sync_merge_data - SYNC_IOC_MERGE: merge two fences * @name: name of new fence * @fd2: file descriptor of second fence * @fence: returns the fd of the new fence to userspace * @flags: merge_data flags * @pad: padding for 64-bit alignment, should always be zero + * + * Creates a new fence containing copies of the sync_pts in both + * the calling fd and sync_merge_data.fd2. Returns the new fence's + * fd in sync_merge_data.fence */ struct sync_merge_data { char name[32]; @@ -34,8 +38,8 @@ struct sync_merge_data { /** * struct sync_fence_info - detailed fence information * @obj_name: name of parent sync_timeline -* @driver_name: name of driver implementing the parent -* @status: status of the fence 0:active 1:signaled <0:error + * @driver_name: name of driver implementing the parent + * @status: status of the fence 0:active 1:signaled <0:error * @flags: fence_info flags * @timestamp_ns: timestamp of status change in nanoseconds */ @@ -48,14 +52,19 @@ struct sync_fence_info { }; /** - * struct sync_file_info - data returned from fence info ioctl + * struct sync_file_info - SYNC_IOC_FILE_INFO: get detailed information on a sync_file * @name: name of fence * @status: status of fence. 1: signaled 0:active <0:error * @flags: sync_file_info flags * @num_fences number of fences in the sync_file * @pad: padding for 64-bit alignment, should always be zero - * @sync_fence_info: pointer to array of structs sync_fence_info with all + * @sync_fence_info: pointer to array of struct &sync_fence_info with all * fences in the sync_file + * + * Takes a struct sync_file_info. If num_fences is 0, the field is updated + * with the actual number of fences. If num_fences is > 0, the system will + * use the pointer provided on sync_fence_info to return up to num_fences of + * struct sync_fence_info, with detailed fence information. */ struct sync_file_info { char name[32]; @@ -69,30 +78,14 @@ struct sync_file_info { #define SYNC_IOC_MAGIC '>' -/** +/* * Opcodes 0, 1 and 2 were burned during a API change to avoid users of the * old API to get weird errors when trying to handling sync_files. The API * change happened during the de-stage of the Sync Framework when there was * no upstream users available. */ -/** - * DOC: SYNC_IOC_MERGE - merge two fences - * - * Takes a struct sync_merge_data. Creates a new fence containing copies of - * the sync_pts in both the calling fd and sync_merge_data.fd2. Returns the - * new fence's fd in sync_merge_data.fence - */ #define SYNC_IOC_MERGE _IOWR(SYNC_IOC_MAGIC, 3, struct sync_merge_data) - -/** - * DOC: SYNC_IOC_FILE_INFO - get detailed information on a sync_file - * - * Takes a struct sync_file_info. If num_fences is 0, the field is updated - * with the actual number of fences. If num_fences is > 0, the system will - * use the pointer provided on sync_fence_info to return up to num_fences of - * struct sync_fence_info, with detailed fence information. 
- */ #define SYNC_IOC_FILE_INFO _IOWR(SYNC_IOC_MAGIC, 4, struct sync_file_info) #endif /* _UAPI_LINUX_SYNC_H */ -- cgit v1.2.3 From b5a24e13c8c8b2c98d114b16da40712b80d5cfc1 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Wed, 8 Mar 2023 07:53:04 -0800 Subject: drm/msm: Add wait-boost support Add a way for various userspace waits to signal urgency. Signed-off-by: Rob Clark Patchwork: https://patchwork.freedesktop.org/patch/525817/ Link: https://lore.kernel.org/r/20230308155322.344664-14-robdclark@gmail.com --- drivers/gpu/drm/msm/msm_drv.c | 12 ++++++++---- drivers/gpu/drm/msm/msm_gem.c | 5 +++++ include/uapi/drm/msm_drm.h | 14 ++++++++++++-- 3 files changed, 25 insertions(+), 6 deletions(-) (limited to 'include/uapi') diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c index ce1a77b607d1..9b6f17b1261f 100644 --- a/drivers/gpu/drm/msm/msm_drv.c +++ b/drivers/gpu/drm/msm/msm_drv.c @@ -46,6 +46,7 @@ * - 1.8.0 - Add MSM_BO_CACHED_COHERENT for supported GPUs (a6xx) * - 1.9.0 - Add MSM_SUBMIT_FENCE_SN_IN * - 1.10.0 - Add MSM_SUBMIT_BO_NO_IMPLICIT + * - 1.11.0 - Add wait boost (MSM_WAIT_FENCE_BOOST, MSM_PREP_BOOST) */ #define MSM_VERSION_MAJOR 1 #define MSM_VERSION_MINOR 10 @@ -899,7 +900,7 @@ static int msm_ioctl_gem_info(struct drm_device *dev, void *data, } static int wait_fence(struct msm_gpu_submitqueue *queue, uint32_t fence_id, - ktime_t timeout) + ktime_t timeout, uint32_t flags) { struct dma_fence *fence; int ret; @@ -927,6 +928,9 @@ static int wait_fence(struct msm_gpu_submitqueue *queue, uint32_t fence_id, if (!fence) return 0; + if (flags & MSM_WAIT_FENCE_BOOST) + dma_fence_set_deadline(fence, ktime_get()); + ret = dma_fence_wait_timeout(fence, true, timeout_to_jiffies(&timeout)); if (ret == 0) { ret = -ETIMEDOUT; @@ -947,8 +951,8 @@ static int msm_ioctl_wait_fence(struct drm_device *dev, void *data, struct msm_gpu_submitqueue *queue; int ret; - if (args->pad) { - DRM_ERROR("invalid pad: %08x\n", args->pad); + if (args->flags & ~MSM_WAIT_FENCE_FLAGS) { + DRM_ERROR("invalid flags: %08x\n", args->flags); return -EINVAL; } @@ -959,7 +963,7 @@ static int msm_ioctl_wait_fence(struct drm_device *dev, void *data, if (!queue) return -ENOENT; - ret = wait_fence(queue, args->fence, to_ktime(args->timeout)); + ret = wait_fence(queue, args->fence, to_ktime(args->timeout), args->flags); msm_submitqueue_put(queue); diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index 9008f967637a..db6c4e281d75 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -895,6 +895,11 @@ int msm_gem_cpu_prep(struct drm_gem_object *obj, uint32_t op, ktime_t *timeout) op & MSM_PREP_NOSYNC ? 
0 : timeout_to_jiffies(timeout); long ret; + if (op & MSM_PREP_BOOST) { + dma_resv_set_deadline(obj->resv, dma_resv_usage_rw(write), + ktime_get()); + } + ret = dma_resv_wait_timeout(obj->resv, dma_resv_usage_rw(write), true, remain); if (ret == 0) diff --git a/include/uapi/drm/msm_drm.h b/include/uapi/drm/msm_drm.h index 329100016e7c..dbf0d6f43fa9 100644 --- a/include/uapi/drm/msm_drm.h +++ b/include/uapi/drm/msm_drm.h @@ -151,8 +151,13 @@ struct drm_msm_gem_info { #define MSM_PREP_READ 0x01 #define MSM_PREP_WRITE 0x02 #define MSM_PREP_NOSYNC 0x04 +#define MSM_PREP_BOOST 0x08 -#define MSM_PREP_FLAGS (MSM_PREP_READ | MSM_PREP_WRITE | MSM_PREP_NOSYNC) +#define MSM_PREP_FLAGS (MSM_PREP_READ | \ + MSM_PREP_WRITE | \ + MSM_PREP_NOSYNC | \ + MSM_PREP_BOOST | \ + 0) struct drm_msm_gem_cpu_prep { __u32 handle; /* in */ @@ -286,6 +291,11 @@ struct drm_msm_gem_submit { }; +#define MSM_WAIT_FENCE_BOOST 0x00000001 +#define MSM_WAIT_FENCE_FLAGS ( \ + MSM_WAIT_FENCE_BOOST | \ + 0) + /* The normal way to synchronize with the GPU is just to CPU_PREP on * a buffer if you need to access it from the CPU (other cmdstream * submission from same or other contexts, PAGE_FLIP ioctl, etc, all @@ -295,7 +305,7 @@ struct drm_msm_gem_submit { */ struct drm_msm_wait_fence { __u32 fence; /* in */ - __u32 pad; + __u32 flags; /* in, bitmask of MSM_WAIT_FENCE_x */ struct drm_msm_timespec timeout; /* in */ __u32 queueid; /* in, submitqueue id */ }; -- cgit v1.2.3 From c501ca23a6a306a7c11631e02a26c8e0a768d64b Mon Sep 17 00:00:00 2001 From: Jeffrey Hugo Date: Mon, 27 Mar 2023 09:54:51 -0600 Subject: accel/qaic: Add uapi and core driver file Add the QAIC driver uapi file and core driver file that binds to the PCIe device. The core driver file also creates the accel device and manages all the interconnections between the different parts of the driver. The driver can be built as a module. If so, it will be called "qaic.ko". Signed-off-by: Jeffrey Hugo Reviewed-by: Carl Vanderlip Reviewed-by: Pranjal Ramajor Asha Kanojiya Reviewed-by: Stanislaw Gruszka Reviewed-by: Jacek Lawrynowicz Acked-by: Oded Gabbay Signed-off-by: Jacek Lawrynowicz Link: https://patchwork.freedesktop.org/patch/msgid/1679932497-30277-3-git-send-email-quic_jhugo@quicinc.com --- drivers/accel/qaic/qaic.h | 282 ++++++++++++++++++ drivers/accel/qaic/qaic_drv.c | 647 ++++++++++++++++++++++++++++++++++++++++++ include/uapi/drm/qaic_accel.h | 397 ++++++++++++++++++++++++++ 3 files changed, 1326 insertions(+) create mode 100644 drivers/accel/qaic/qaic.h create mode 100644 drivers/accel/qaic/qaic_drv.c create mode 100644 include/uapi/drm/qaic_accel.h (limited to 'include/uapi') diff --git a/drivers/accel/qaic/qaic.h b/drivers/accel/qaic/qaic.h new file mode 100644 index 000000000000..f2bd637a0d4e --- /dev/null +++ b/drivers/accel/qaic/qaic.h @@ -0,0 +1,282 @@ +/* SPDX-License-Identifier: GPL-2.0-only + * + * Copyright (c) 2019-2021, The Linux Foundation. All rights reserved. + * Copyright (c) 2021-2023 Qualcomm Innovation Center, Inc. All rights reserved. 
+ */ + +#ifndef _QAIC_H_ +#define _QAIC_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define QAIC_DBC_BASE SZ_128K +#define QAIC_DBC_SIZE SZ_4K + +#define QAIC_NO_PARTITION -1 + +#define QAIC_DBC_OFF(i) ((i) * QAIC_DBC_SIZE + QAIC_DBC_BASE) + +#define to_qaic_bo(obj) container_of(obj, struct qaic_bo, base) + +extern bool datapath_polling; + +struct qaic_user { + /* Uniquely identifies this user for the device */ + int handle; + struct kref ref_count; + /* Char device opened by this user */ + struct qaic_drm_device *qddev; + /* Node in list of users that opened this drm device */ + struct list_head node; + /* SRCU used to synchronize this user during cleanup */ + struct srcu_struct qddev_lock; + atomic_t chunk_id; +}; + +struct dma_bridge_chan { + /* Pointer to device strcut maintained by driver */ + struct qaic_device *qdev; + /* ID of this DMA bridge channel(DBC) */ + unsigned int id; + /* Synchronizes access to xfer_list */ + spinlock_t xfer_lock; + /* Base address of request queue */ + void *req_q_base; + /* Base address of response queue */ + void *rsp_q_base; + /* + * Base bus address of request queue. Response queue bus address can be + * calculated by adding request queue size to this variable + */ + dma_addr_t dma_addr; + /* Total size of request and response queue in byte */ + u32 total_size; + /* Capacity of request/response queue */ + u32 nelem; + /* The user that opened this DBC */ + struct qaic_user *usr; + /* + * Request ID of next memory handle that goes in request queue. One + * memory handle can enqueue more than one request elements, all + * this requests that belong to same memory handle have same request ID + */ + u16 next_req_id; + /* true: DBC is in use; false: DBC not in use */ + bool in_use; + /* + * Base address of device registers. Used to read/write request and + * response queue's head and tail pointer of this DBC. + */ + void __iomem *dbc_base; + /* Head of list where each node is a memory handle queued in request queue */ + struct list_head xfer_list; + /* Synchronizes DBC readers during cleanup */ + struct srcu_struct ch_lock; + /* + * When this DBC is released, any thread waiting on this wait queue is + * woken up + */ + wait_queue_head_t dbc_release; + /* Head of list where each node is a bo associated with this DBC */ + struct list_head bo_lists; + /* The irq line for this DBC. Used for polling */ + unsigned int irq; + /* Polling work item to simulate interrupts */ + struct work_struct poll_work; +}; + +struct qaic_device { + /* Pointer to base PCI device struct of our physical device */ + struct pci_dev *pdev; + /* Req. 
ID of request that will be queued next in MHI control device */ + u32 next_seq_num; + /* Base address of bar 0 */ + void __iomem *bar_0; + /* Base address of bar 2 */ + void __iomem *bar_2; + /* Controller structure for MHI devices */ + struct mhi_controller *mhi_cntrl; + /* MHI control channel device */ + struct mhi_device *cntl_ch; + /* List of requests queued in MHI control device */ + struct list_head cntl_xfer_list; + /* Synchronizes MHI control device transactions and its xfer list */ + struct mutex cntl_mutex; + /* Array of DBC structs of this device */ + struct dma_bridge_chan *dbc; + /* Work queue for tasks related to MHI control device */ + struct workqueue_struct *cntl_wq; + /* Synchronizes all the users of device during cleanup */ + struct srcu_struct dev_lock; + /* true: Device under reset; false: Device not under reset */ + bool in_reset; + /* + * true: A tx MHI transaction has failed and a rx buffer is still queued + * in control device. Such a buffer is considered a lost rx buffer + * false: No rx buffer is lost in control device + */ + bool cntl_lost_buf; + /* Maximum number of DBCs supported by this device */ + u32 num_dbc; + /* Reference to the drm_device for this device when it is created */ + struct qaic_drm_device *qddev; + /* Generate the CRC of a control message */ + u32 (*gen_crc)(void *msg); + /* Validate the CRC of a control message */ + bool (*valid_crc)(void *msg); +}; + +struct qaic_drm_device { + /* Pointer to the root device struct driven by this driver */ + struct qaic_device *qdev; + /* + * The physical device can be partitioned into a number of logical devices. + * And each logical device is given a partition id. This member stores + * that id. QAIC_NO_PARTITION is a sentinel used to mark that this drm + * device is the actual physical device + */ + s32 partition_id; + /* Pointer to the drm device struct of this drm device */ + struct drm_device *ddev; + /* Head in list of users who have opened this drm device */ + struct list_head users; + /* Synchronizes access to users list */ + struct mutex users_mutex; +}; + +struct qaic_bo { + struct drm_gem_object base; + /* Scatter/gather table for allocated/imported BO */ + struct sg_table *sgt; + /* BO size requested by user. GEM object might be bigger in size. */ + u64 size; + /* Head in list of slices of this BO */ + struct list_head slices; + /* Total nents, for all slices of this BO */ + int total_slice_nents; + /* + * Direction of transfer. It can assume only two values, DMA_TO_DEVICE and + * DMA_FROM_DEVICE. + */ + int dir; + /* The pointer of the DBC which operates on this BO */ + struct dma_bridge_chan *dbc; + /* Number of slices that belong to this buffer */ + u32 nr_slice; + /* Number of slices that have been transferred by DMA engine */ + u32 nr_slice_xfer_done; + /* true = BO is queued for execution, false = BO is not queued */ + bool queued; + /* + * If true then user has attached slicing information to this BO by + * calling DRM_IOCTL_QAIC_ATTACH_SLICE_BO ioctl. + */ + bool sliced; + /* Request ID of this BO if it is queued for execution */ + u16 req_id; + /* Handle assigned to this BO */ + u32 handle; + /* Wait on this for completion of DMA transfer of this BO */ + struct completion xfer_done; + /* + * Node in linked list where head is dbc->xfer_list. + * This linked list contains BOs that are queued for DMA transfer. + */ + struct list_head xfer_list; + /* + * Node in linked list where head is dbc->bo_lists. + * This linked list contains BOs that are associated with the DBC it is + * linked to.
+ */ + struct list_head bo_list; + struct { + /* + * Latest timestamp(ns) at which kernel received a request to + * execute this BO + */ + u64 req_received_ts; + /* + * Latest timestamp(ns) at which kernel enqueued requests of + * this BO for execution in DMA queue + */ + u64 req_submit_ts; + /* + * Latest timestamp(ns) at which kernel received a completion + * interrupt for requests of this BO + */ + u64 req_processed_ts; + /* + * Number of elements already enqueued in DMA queue before + * enqueuing requests of this BO + */ + u32 queue_level_before; + } perf_stats; + +}; + +struct bo_slice { + /* Mapped pages */ + struct sg_table *sgt; + /* Number of requests required to queue in DMA queue */ + int nents; + /* See enum dma_data_direction */ + int dir; + /* Actual requests that will be copied in DMA queue */ + struct dbc_req *reqs; + struct kref ref_count; + /* true: No DMA transfer required */ + bool no_xfer; + /* Pointer to the parent BO handle */ + struct qaic_bo *bo; + /* Node in list of slices maintained by parent BO */ + struct list_head slice; + /* Size of this slice in bytes */ + u64 size; + /* Offset of this slice in buffer */ + u64 offset; +}; + +int get_dbc_req_elem_size(void); +int get_dbc_rsp_elem_size(void); +int get_cntl_version(struct qaic_device *qdev, struct qaic_user *usr, u16 *major, u16 *minor); +int qaic_manage_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); +void qaic_mhi_ul_xfer_cb(struct mhi_device *mhi_dev, struct mhi_result *mhi_result); + +void qaic_mhi_dl_xfer_cb(struct mhi_device *mhi_dev, struct mhi_result *mhi_result); + +int qaic_control_open(struct qaic_device *qdev); +void qaic_control_close(struct qaic_device *qdev); +void qaic_release_usr(struct qaic_device *qdev, struct qaic_user *usr); + +irqreturn_t dbc_irq_threaded_fn(int irq, void *data); +irqreturn_t dbc_irq_handler(int irq, void *data); +int disable_dbc(struct qaic_device *qdev, u32 dbc_id, struct qaic_user *usr); +void enable_dbc(struct qaic_device *qdev, u32 dbc_id, struct qaic_user *usr); +void wakeup_dbc(struct qaic_device *qdev, u32 dbc_id); +void release_dbc(struct qaic_device *qdev, u32 dbc_id); + +void wake_all_cntl(struct qaic_device *qdev); +void qaic_dev_reset_clean_local_state(struct qaic_device *qdev, bool exit_reset); + +struct drm_gem_object *qaic_gem_prime_import(struct drm_device *dev, struct dma_buf *dma_buf); + +int qaic_create_bo_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); +int qaic_mmap_bo_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); +int qaic_attach_slice_bo_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); +int qaic_execute_bo_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); +int qaic_partial_execute_bo_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); +int qaic_wait_bo_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); +int qaic_perf_stats_bo_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv); +void irq_polling_work(struct work_struct *work); + +#endif /* _QAIC_H_ */ diff --git a/drivers/accel/qaic/qaic_drv.c b/drivers/accel/qaic/qaic_drv.c new file mode 100644 index 000000000000..1106ad88a5b6 --- /dev/null +++ b/drivers/accel/qaic/qaic_drv.c @@ -0,0 +1,647 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* Copyright (c) 2019-2021, The Linux Foundation. All rights reserved. */ +/* Copyright (c) 2021-2023 Qualcomm Innovation Center, Inc. All rights reserved. 
*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mhi_controller.h" +#include "mhi_qaic_ctrl.h" +#include "qaic.h" + +MODULE_IMPORT_NS(DMA_BUF); + +#define PCI_DEV_AIC100 0xa100 +#define QAIC_NAME "qaic" +#define QAIC_DESC "Qualcomm Cloud AI Accelerators" +#define CNTL_MAJOR 5 +#define CNTL_MINOR 0 + +bool datapath_polling; +module_param(datapath_polling, bool, 0400); +MODULE_PARM_DESC(datapath_polling, "Operate the datapath in polling mode"); +static bool link_up; +static DEFINE_IDA(qaic_usrs); + +static int qaic_create_drm_device(struct qaic_device *qdev, s32 partition_id); +static void qaic_destroy_drm_device(struct qaic_device *qdev, s32 partition_id); + +static void free_usr(struct kref *kref) +{ + struct qaic_user *usr = container_of(kref, struct qaic_user, ref_count); + + cleanup_srcu_struct(&usr->qddev_lock); + ida_free(&qaic_usrs, usr->handle); + kfree(usr); +} + +static int qaic_open(struct drm_device *dev, struct drm_file *file) +{ + struct qaic_drm_device *qddev = dev->dev_private; + struct qaic_device *qdev = qddev->qdev; + struct qaic_user *usr; + int rcu_id; + int ret; + + rcu_id = srcu_read_lock(&qdev->dev_lock); + if (qdev->in_reset) { + ret = -ENODEV; + goto dev_unlock; + } + + usr = kmalloc(sizeof(*usr), GFP_KERNEL); + if (!usr) { + ret = -ENOMEM; + goto dev_unlock; + } + + usr->handle = ida_alloc(&qaic_usrs, GFP_KERNEL); + if (usr->handle < 0) { + ret = usr->handle; + goto free_usr; + } + usr->qddev = qddev; + atomic_set(&usr->chunk_id, 0); + init_srcu_struct(&usr->qddev_lock); + kref_init(&usr->ref_count); + + ret = mutex_lock_interruptible(&qddev->users_mutex); + if (ret) + goto cleanup_usr; + + list_add(&usr->node, &qddev->users); + mutex_unlock(&qddev->users_mutex); + + file->driver_priv = usr; + + srcu_read_unlock(&qdev->dev_lock, rcu_id); + return 0; + +cleanup_usr: + cleanup_srcu_struct(&usr->qddev_lock); +free_usr: + kfree(usr); +dev_unlock: + srcu_read_unlock(&qdev->dev_lock, rcu_id); + return ret; +} + +static void qaic_postclose(struct drm_device *dev, struct drm_file *file) +{ + struct qaic_user *usr = file->driver_priv; + struct qaic_drm_device *qddev; + struct qaic_device *qdev; + int qdev_rcu_id; + int usr_rcu_id; + int i; + + qddev = usr->qddev; + usr_rcu_id = srcu_read_lock(&usr->qddev_lock); + if (qddev) { + qdev = qddev->qdev; + qdev_rcu_id = srcu_read_lock(&qdev->dev_lock); + if (!qdev->in_reset) { + qaic_release_usr(qdev, usr); + for (i = 0; i < qdev->num_dbc; ++i) + if (qdev->dbc[i].usr && qdev->dbc[i].usr->handle == usr->handle) + release_dbc(qdev, i); + } + srcu_read_unlock(&qdev->dev_lock, qdev_rcu_id); + + mutex_lock(&qddev->users_mutex); + if (!list_empty(&usr->node)) + list_del_init(&usr->node); + mutex_unlock(&qddev->users_mutex); + } + + srcu_read_unlock(&usr->qddev_lock, usr_rcu_id); + kref_put(&usr->ref_count, free_usr); + + file->driver_priv = NULL; +} + +DEFINE_DRM_ACCEL_FOPS(qaic_accel_fops); + +static const struct drm_ioctl_desc qaic_drm_ioctls[] = { + DRM_IOCTL_DEF_DRV(QAIC_MANAGE, qaic_manage_ioctl, 0), + DRM_IOCTL_DEF_DRV(QAIC_CREATE_BO, qaic_create_bo_ioctl, 0), + DRM_IOCTL_DEF_DRV(QAIC_MMAP_BO, qaic_mmap_bo_ioctl, 0), + DRM_IOCTL_DEF_DRV(QAIC_ATTACH_SLICE_BO, qaic_attach_slice_bo_ioctl, 0), + DRM_IOCTL_DEF_DRV(QAIC_EXECUTE_BO, qaic_execute_bo_ioctl, 0), + DRM_IOCTL_DEF_DRV(QAIC_PARTIAL_EXECUTE_BO, qaic_partial_execute_bo_ioctl, 0), + DRM_IOCTL_DEF_DRV(QAIC_WAIT_BO, 
qaic_wait_bo_ioctl, 0), + DRM_IOCTL_DEF_DRV(QAIC_PERF_STATS_BO, qaic_perf_stats_bo_ioctl, 0), +}; + +static const struct drm_driver qaic_accel_driver = { + .driver_features = DRIVER_GEM | DRIVER_COMPUTE_ACCEL, + + .name = QAIC_NAME, + .desc = QAIC_DESC, + .date = "20190618", + + .fops = &qaic_accel_fops, + .open = qaic_open, + .postclose = qaic_postclose, + + .ioctls = qaic_drm_ioctls, + .num_ioctls = ARRAY_SIZE(qaic_drm_ioctls), + .prime_fd_to_handle = drm_gem_prime_fd_to_handle, + .gem_prime_import = qaic_gem_prime_import, +}; + +static int qaic_create_drm_device(struct qaic_device *qdev, s32 partition_id) +{ + struct qaic_drm_device *qddev; + struct drm_device *ddev; + struct device *pdev; + int ret; + + /* Hold off implementing partitions until the uapi is determined */ + if (partition_id != QAIC_NO_PARTITION) + return -EINVAL; + + pdev = &qdev->pdev->dev; + + qddev = kzalloc(sizeof(*qddev), GFP_KERNEL); + if (!qddev) + return -ENOMEM; + + ddev = drm_dev_alloc(&qaic_accel_driver, pdev); + if (IS_ERR(ddev)) { + ret = PTR_ERR(ddev); + goto ddev_fail; + } + + ddev->dev_private = qddev; + qddev->ddev = ddev; + + qddev->qdev = qdev; + qddev->partition_id = partition_id; + INIT_LIST_HEAD(&qddev->users); + mutex_init(&qddev->users_mutex); + + qdev->qddev = qddev; + + ret = drm_dev_register(ddev, 0); + if (ret) { + pci_dbg(qdev->pdev, "%s: drm_dev_register failed %d\n", __func__, ret); + goto drm_reg_fail; + } + + return 0; + +drm_reg_fail: + mutex_destroy(&qddev->users_mutex); + qdev->qddev = NULL; + drm_dev_put(ddev); +ddev_fail: + kfree(qddev); + return ret; +} + +static void qaic_destroy_drm_device(struct qaic_device *qdev, s32 partition_id) +{ + struct qaic_drm_device *qddev; + struct qaic_user *usr; + + qddev = qdev->qddev; + + /* + * Existing users get unresolvable errors till they close FDs. + * Need to sync carefully with users calling close(). The + * list of users can be modified elsewhere when the lock isn't + * held here, but the sync'ing the srcu with the mutex held + * could deadlock. Grab the mutex so that the list will be + * unmodified. The user we get will exist as long as the + * lock is held. Signal that the qcdev is going away, and + * grab a reference to the user so they don't go away for + * synchronize_srcu(). Then release the mutex to avoid + * deadlock and make sure the user has observed the signal. + * With the lock released, we cannot maintain any state of the + * user list. + */ + mutex_lock(&qddev->users_mutex); + while (!list_empty(&qddev->users)) { + usr = list_first_entry(&qddev->users, struct qaic_user, node); + list_del_init(&usr->node); + kref_get(&usr->ref_count); + usr->qddev = NULL; + mutex_unlock(&qddev->users_mutex); + synchronize_srcu(&usr->qddev_lock); + kref_put(&usr->ref_count, free_usr); + mutex_lock(&qddev->users_mutex); + } + mutex_unlock(&qddev->users_mutex); + + if (qddev->ddev) { + drm_dev_unregister(qddev->ddev); + drm_dev_put(qddev->ddev); + } + + kfree(qddev); +} + +static int qaic_mhi_probe(struct mhi_device *mhi_dev, const struct mhi_device_id *id) +{ + struct qaic_device *qdev; + u16 major, minor; + int ret; + + /* + * Invoking this function indicates that the control channel to the + * device is available. We use that as a signal to indicate that + * the device side firmware has booted. The device side firmware + * manages the device resources, so we need to communicate with it + * via the control channel in order to utilize the device. 
Therefore + * we wait until this signal to create the drm dev that userspace will + * use to control the device, because without the device side firmware, + * userspace can't do anything useful. + */ + + qdev = pci_get_drvdata(to_pci_dev(mhi_dev->mhi_cntrl->cntrl_dev)); + + qdev->in_reset = false; + + dev_set_drvdata(&mhi_dev->dev, qdev); + qdev->cntl_ch = mhi_dev; + + ret = qaic_control_open(qdev); + if (ret) { + pci_dbg(qdev->pdev, "%s: control_open failed %d\n", __func__, ret); + return ret; + } + + ret = get_cntl_version(qdev, NULL, &major, &minor); + if (ret || major != CNTL_MAJOR || minor > CNTL_MINOR) { + pci_err(qdev->pdev, "%s: Control protocol version (%d.%d) not supported. Supported version is (%d.%d). Ret: %d\n", + __func__, major, minor, CNTL_MAJOR, CNTL_MINOR, ret); + ret = -EINVAL; + goto close_control; + } + + ret = qaic_create_drm_device(qdev, QAIC_NO_PARTITION); + + return ret; + +close_control: + qaic_control_close(qdev); + return ret; +} + +static void qaic_mhi_remove(struct mhi_device *mhi_dev) +{ +/* This is redundant since we have already observed the device crash */ +} + +static void qaic_notify_reset(struct qaic_device *qdev) +{ + int i; + + qdev->in_reset = true; + /* wake up any waiters to avoid waiting for timeouts at sync */ + wake_all_cntl(qdev); + for (i = 0; i < qdev->num_dbc; ++i) + wakeup_dbc(qdev, i); + synchronize_srcu(&qdev->dev_lock); +} + +void qaic_dev_reset_clean_local_state(struct qaic_device *qdev, bool exit_reset) +{ + int i; + + qaic_notify_reset(qdev); + + /* remove drmdevs to prevent new users from coming in */ + qaic_destroy_drm_device(qdev, QAIC_NO_PARTITION); + + /* start tearing things down */ + for (i = 0; i < qdev->num_dbc; ++i) + release_dbc(qdev, i); + + if (exit_reset) + qdev->in_reset = false; +} + +static struct qaic_device *create_qdev(struct pci_dev *pdev, const struct pci_device_id *id) +{ + struct qaic_device *qdev; + int i; + + qdev = devm_kzalloc(&pdev->dev, sizeof(*qdev), GFP_KERNEL); + if (!qdev) + return NULL; + + if (id->device == PCI_DEV_AIC100) { + qdev->num_dbc = 16; + qdev->dbc = devm_kcalloc(&pdev->dev, qdev->num_dbc, sizeof(*qdev->dbc), GFP_KERNEL); + if (!qdev->dbc) + return NULL; + } + + qdev->cntl_wq = alloc_workqueue("qaic_cntl", WQ_UNBOUND, 0); + if (!qdev->cntl_wq) + return NULL; + + pci_set_drvdata(pdev, qdev); + qdev->pdev = pdev; + + mutex_init(&qdev->cntl_mutex); + INIT_LIST_HEAD(&qdev->cntl_xfer_list); + init_srcu_struct(&qdev->dev_lock); + + for (i = 0; i < qdev->num_dbc; ++i) { + spin_lock_init(&qdev->dbc[i].xfer_lock); + qdev->dbc[i].qdev = qdev; + qdev->dbc[i].id = i; + INIT_LIST_HEAD(&qdev->dbc[i].xfer_list); + init_srcu_struct(&qdev->dbc[i].ch_lock); + init_waitqueue_head(&qdev->dbc[i].dbc_release); + INIT_LIST_HEAD(&qdev->dbc[i].bo_lists); + } + + return qdev; +} + +static void cleanup_qdev(struct qaic_device *qdev) +{ + int i; + + for (i = 0; i < qdev->num_dbc; ++i) + cleanup_srcu_struct(&qdev->dbc[i].ch_lock); + cleanup_srcu_struct(&qdev->dev_lock); + pci_set_drvdata(qdev->pdev, NULL); + destroy_workqueue(qdev->cntl_wq); +} + +static int init_pci(struct qaic_device *qdev, struct pci_dev *pdev) +{ + int bars; + int ret; + + bars = pci_select_bars(pdev, IORESOURCE_MEM); + + /* make sure the device has the expected BARs */ + if (bars != (BIT(0) | BIT(2) | BIT(4))) { + pci_dbg(pdev, "%s: expected BARs 0, 2, and 4 not found in device. 
Found 0x%x\n", + __func__, bars); + return -EINVAL; + } + + ret = pcim_enable_device(pdev); + if (ret) + return ret; + + ret = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); + if (ret) + return ret; + ret = dma_set_max_seg_size(&pdev->dev, UINT_MAX); + if (ret) + return ret; + + qdev->bar_0 = devm_ioremap_resource(&pdev->dev, &pdev->resource[0]); + if (IS_ERR(qdev->bar_0)) + return PTR_ERR(qdev->bar_0); + + qdev->bar_2 = devm_ioremap_resource(&pdev->dev, &pdev->resource[2]); + if (IS_ERR(qdev->bar_2)) + return PTR_ERR(qdev->bar_2); + + /* Managed release since we use pcim_enable_device above */ + pci_set_master(pdev); + + return 0; +} + +static int init_msi(struct qaic_device *qdev, struct pci_dev *pdev) +{ + int mhi_irq; + int ret; + int i; + + /* Managed release since we use pcim_enable_device */ + ret = pci_alloc_irq_vectors(pdev, 1, 32, PCI_IRQ_MSI); + if (ret < 0) + return ret; + + if (ret < 32) { + pci_err(pdev, "%s: Requested 32 MSIs. Obtained %d MSIs which is less than the 32 required.\n", + __func__, ret); + return -ENODEV; + } + + mhi_irq = pci_irq_vector(pdev, 0); + if (mhi_irq < 0) + return mhi_irq; + + for (i = 0; i < qdev->num_dbc; ++i) { + ret = devm_request_threaded_irq(&pdev->dev, pci_irq_vector(pdev, i + 1), + dbc_irq_handler, dbc_irq_threaded_fn, IRQF_SHARED, + "qaic_dbc", &qdev->dbc[i]); + if (ret) + return ret; + + if (datapath_polling) { + qdev->dbc[i].irq = pci_irq_vector(pdev, i + 1); + disable_irq_nosync(qdev->dbc[i].irq); + INIT_WORK(&qdev->dbc[i].poll_work, irq_polling_work); + } + } + + return mhi_irq; +} + +static int qaic_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + struct qaic_device *qdev; + int mhi_irq; + int ret; + int i; + + qdev = create_qdev(pdev, id); + if (!qdev) + return -ENOMEM; + + ret = init_pci(qdev, pdev); + if (ret) + goto cleanup_qdev; + + for (i = 0; i < qdev->num_dbc; ++i) + qdev->dbc[i].dbc_base = qdev->bar_2 + QAIC_DBC_OFF(i); + + mhi_irq = init_msi(qdev, pdev); + if (mhi_irq < 0) { + ret = mhi_irq; + goto cleanup_qdev; + } + + qdev->mhi_cntrl = qaic_mhi_register_controller(pdev, qdev->bar_0, mhi_irq); + if (IS_ERR(qdev->mhi_cntrl)) { + ret = PTR_ERR(qdev->mhi_cntrl); + goto cleanup_qdev; + } + + return 0; + +cleanup_qdev: + cleanup_qdev(qdev); + return ret; +} + +static void qaic_pci_remove(struct pci_dev *pdev) +{ + struct qaic_device *qdev = pci_get_drvdata(pdev); + + if (!qdev) + return; + + qaic_dev_reset_clean_local_state(qdev, false); + qaic_mhi_free_controller(qdev->mhi_cntrl, link_up); + cleanup_qdev(qdev); +} + +static void qaic_pci_shutdown(struct pci_dev *pdev) +{ + /* see qaic_exit for what link_up is doing */ + link_up = true; + qaic_pci_remove(pdev); +} + +static pci_ers_result_t qaic_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t error) +{ + return PCI_ERS_RESULT_NEED_RESET; +} + +static void qaic_pci_reset_prepare(struct pci_dev *pdev) +{ + struct qaic_device *qdev = pci_get_drvdata(pdev); + + qaic_notify_reset(qdev); + qaic_mhi_start_reset(qdev->mhi_cntrl); + qaic_dev_reset_clean_local_state(qdev, false); +} + +static void qaic_pci_reset_done(struct pci_dev *pdev) +{ + struct qaic_device *qdev = pci_get_drvdata(pdev); + + qdev->in_reset = false; + qaic_mhi_reset_done(qdev->mhi_cntrl); +} + +static const struct mhi_device_id qaic_mhi_match_table[] = { + { .chan = "QAIC_CONTROL", }, + {}, +}; + +static struct mhi_driver qaic_mhi_driver = { + .id_table = qaic_mhi_match_table, + .remove = qaic_mhi_remove, + .probe = qaic_mhi_probe, + .ul_xfer_cb = qaic_mhi_ul_xfer_cb, + 
.dl_xfer_cb = qaic_mhi_dl_xfer_cb, + .driver = { + .name = "qaic_mhi", + }, +}; + +static const struct pci_device_id qaic_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_QCOM, PCI_DEV_AIC100), }, + { } +}; +MODULE_DEVICE_TABLE(pci, qaic_ids); + +static const struct pci_error_handlers qaic_pci_err_handler = { + .error_detected = qaic_pci_error_detected, + .reset_prepare = qaic_pci_reset_prepare, + .reset_done = qaic_pci_reset_done, +}; + +static struct pci_driver qaic_pci_driver = { + .name = QAIC_NAME, + .id_table = qaic_ids, + .probe = qaic_pci_probe, + .remove = qaic_pci_remove, + .shutdown = qaic_pci_shutdown, + .err_handler = &qaic_pci_err_handler, +}; + +static int __init qaic_init(void) +{ + int ret; + + ret = mhi_driver_register(&qaic_mhi_driver); + if (ret) { + pr_debug("qaic: mhi_driver_register failed %d\n", ret); + return ret; + } + + ret = pci_register_driver(&qaic_pci_driver); + if (ret) { + pr_debug("qaic: pci_register_driver failed %d\n", ret); + goto free_mhi; + } + + ret = mhi_qaic_ctrl_init(); + if (ret) { + pr_debug("qaic: mhi_qaic_ctrl_init failed %d\n", ret); + goto free_pci; + } + + return 0; + +free_pci: + pci_unregister_driver(&qaic_pci_driver); +free_mhi: + mhi_driver_unregister(&qaic_mhi_driver); + return ret; +} + +static void __exit qaic_exit(void) +{ + /* + * We assume that qaic_pci_remove() is called due to a hotplug event + * which would mean that the link is down, and thus + * qaic_mhi_free_controller() should not try to access the device during + * cleanup. + * We call pci_unregister_driver() below, which also triggers + * qaic_pci_remove(), but since this is module exit, we expect the link + * to the device to be up, in which case qaic_mhi_free_controller() + * should try to access the device during cleanup to put the device in + * a sane state. + * For that reason, we set link_up here to let qaic_mhi_free_controller + * know the expected link state. Since the module is going to be + * removed at the end of this, we don't need to worry about + * reinitializing the link_up state after the cleanup is done. + */ + link_up = true; + mhi_qaic_ctrl_deinit(); + pci_unregister_driver(&qaic_pci_driver); + mhi_driver_unregister(&qaic_mhi_driver); +} + +module_init(qaic_init); +module_exit(qaic_exit); + +MODULE_AUTHOR(QAIC_DESC " Kernel Driver Team"); +MODULE_DESCRIPTION(QAIC_DESC " Accel Driver"); +MODULE_LICENSE("GPL"); diff --git a/include/uapi/drm/qaic_accel.h b/include/uapi/drm/qaic_accel.h new file mode 100644 index 000000000000..2d348744a853 --- /dev/null +++ b/include/uapi/drm/qaic_accel.h @@ -0,0 +1,397 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note + * + * Copyright (c) 2019-2020, The Linux Foundation. All rights reserved. + * Copyright (c) 2021-2023 Qualcomm Innovation Center, Inc. All rights reserved. 
+ */ + +#ifndef QAIC_ACCEL_H_ +#define QAIC_ACCEL_H_ + +#include "drm.h" + +#if defined(__cplusplus) +extern "C" { +#endif + +/* The length (4K) includes len and count fields of qaic_manage_msg */ +#define QAIC_MANAGE_MAX_MSG_LENGTH SZ_4K + +/* semaphore flags */ +#define QAIC_SEM_INSYNCFENCE 2 +#define QAIC_SEM_OUTSYNCFENCE 1 + +/* Semaphore commands */ +#define QAIC_SEM_NOP 0 +#define QAIC_SEM_INIT 1 +#define QAIC_SEM_INC 2 +#define QAIC_SEM_DEC 3 +#define QAIC_SEM_WAIT_EQUAL 4 +#define QAIC_SEM_WAIT_GT_EQ 5 /* Greater than or equal */ +#define QAIC_SEM_WAIT_GT_0 6 /* Greater than 0 */ + +#define QAIC_TRANS_UNDEFINED 0 +#define QAIC_TRANS_PASSTHROUGH_FROM_USR 1 +#define QAIC_TRANS_PASSTHROUGH_TO_USR 2 +#define QAIC_TRANS_PASSTHROUGH_FROM_DEV 3 +#define QAIC_TRANS_PASSTHROUGH_TO_DEV 4 +#define QAIC_TRANS_DMA_XFER_FROM_USR 5 +#define QAIC_TRANS_DMA_XFER_TO_DEV 6 +#define QAIC_TRANS_ACTIVATE_FROM_USR 7 +#define QAIC_TRANS_ACTIVATE_FROM_DEV 8 +#define QAIC_TRANS_ACTIVATE_TO_DEV 9 +#define QAIC_TRANS_DEACTIVATE_FROM_USR 10 +#define QAIC_TRANS_DEACTIVATE_FROM_DEV 11 +#define QAIC_TRANS_STATUS_FROM_USR 12 +#define QAIC_TRANS_STATUS_TO_USR 13 +#define QAIC_TRANS_STATUS_FROM_DEV 14 +#define QAIC_TRANS_STATUS_TO_DEV 15 +#define QAIC_TRANS_TERMINATE_FROM_DEV 16 +#define QAIC_TRANS_TERMINATE_TO_DEV 17 +#define QAIC_TRANS_DMA_XFER_CONT 18 +#define QAIC_TRANS_VALIDATE_PARTITION_FROM_DEV 19 +#define QAIC_TRANS_VALIDATE_PARTITION_TO_DEV 20 + +/** + * struct qaic_manage_trans_hdr - Header for a transaction in a manage message. + * @type: In. Identifies this transaction. See QAIC_TRANS_* defines. + * @len: In. Length of this transaction, including this header. + */ +struct qaic_manage_trans_hdr { + __u32 type; + __u32 len; +}; + +/** + * struct qaic_manage_trans_passthrough - Defines a passthrough transaction. + * @hdr: In. Header to identify this transaction. + * @data: In. Payload of this transaction. Opaque to the driver. Userspace must + * encode in little endian and align/pad to 64-bit. + */ +struct qaic_manage_trans_passthrough { + struct qaic_manage_trans_hdr hdr; + __u8 data[]; +}; + +/** + * struct qaic_manage_trans_dma_xfer - Defines a DMA transfer transaction. + * @hdr: In. Header to identify this transaction. + * @tag: In. Identifies this transfer in other transactions. Opaque to the + * driver. + * @pad: Structure padding. + * @addr: In. Address of the data to DMA to the device. + * @size: In. Length of the data to DMA to the device. + */ +struct qaic_manage_trans_dma_xfer { + struct qaic_manage_trans_hdr hdr; + __u32 tag; + __u32 pad; + __u64 addr; + __u64 size; +}; + +/** + * struct qaic_manage_trans_activate_to_dev - Defines an activate request. + * @hdr: In. Header to identify this transaction. + * @queue_size: In. Number of elements for DBC request and response queues. + * @eventfd: Unused. + * @options: In. Device specific options for this activate. + * @pad: Structure padding. Must be 0. + */ +struct qaic_manage_trans_activate_to_dev { + struct qaic_manage_trans_hdr hdr; + __u32 queue_size; + __u32 eventfd; + __u32 options; + __u32 pad; +}; + +/** + * struct qaic_manage_trans_activate_from_dev - Defines an activate response. + * @hdr: Out. Header to identify this transaction. + * @status: Out. Return code of the request from the device. + * @dbc_id: Out. Id of the assigned DBC for a successful request. + * @options: Out. Device specific options for this activate.
+ */ +struct qaic_manage_trans_activate_from_dev { + struct qaic_manage_trans_hdr hdr; + __u32 status; + __u32 dbc_id; + __u64 options; +}; + +/** + * struct qaic_manage_trans_deactivate - Defines a deactivate request. + * @hdr: In. Header to identify this transaction. + * @dbc_id: In. Id of assigned DBC. + * @pad: Structure padding. Must be 0. + */ +struct qaic_manage_trans_deactivate { + struct qaic_manage_trans_hdr hdr; + __u32 dbc_id; + __u32 pad; +}; + +/** + * struct qaic_manage_trans_status_to_dev - Defines a status request. + * @hdr: In. Header to identify this transaction. + */ +struct qaic_manage_trans_status_to_dev { + struct qaic_manage_trans_hdr hdr; +}; + +/** + * struct qaic_manage_trans_status_from_dev - Defines a status response. + * @hdr: Out. Header to identify this transaction. + * @major: Out. NNC protocol version major number. + * @minor: Out. NNC protocol version minor number. + * @status: Out. Return code from device. + * @status_flags: Out. Flags from device. Bit 0 indicates if CRCs are required. + */ +struct qaic_manage_trans_status_from_dev { + struct qaic_manage_trans_hdr hdr; + __u16 major; + __u16 minor; + __u32 status; + __u64 status_flags; +}; + +/** + * struct qaic_manage_msg - Defines a message to the device. + * @len: In. Length of all the transactions contained within this message. + * @count: In. Number of transactions in this message. + * @data: In. Address to an array where the transactions can be found. + */ +struct qaic_manage_msg { + __u32 len; + __u32 count; + __u64 data; +}; + +/** + * struct qaic_create_bo - Defines a request to create a buffer object. + * @size: In. Size of the buffer in bytes. + * @handle: Out. GEM handle for the BO. + * @pad: Structure padding. Must be 0. + */ +struct qaic_create_bo { + __u64 size; + __u32 handle; + __u32 pad; +}; + +/** + * struct qaic_mmap_bo - Defines a request to prepare a BO for mmap(). + * @handle: In. Handle of the GEM BO to prepare for mmap(). + * @pad: Structure padding. Must be 0. + * @offset: Out. Offset value to provide to mmap(). + */ +struct qaic_mmap_bo { + __u32 handle; + __u32 pad; + __u64 offset; +}; + +/** + * struct qaic_sem - Defines a semaphore command for a BO slice. + * @val: In. Only lower 12 bits are valid. + * @index: In. Only lower 5 bits are valid. + * @presync: In. 1 if presync operation, 0 if postsync. + * @cmd: In. One of QAIC_SEM_*. + * @flags: In. Bitfield. See QAIC_SEM_INSYNCFENCE and QAIC_SEM_OUTSYNCFENCE. + * @pad: Structure padding. Must be 0. + */ +struct qaic_sem { + __u16 val; + __u8 index; + __u8 presync; + __u8 cmd; + __u8 flags; + __u16 pad; +}; + +/** + * struct qaic_attach_slice_entry - Defines a single BO slice. + * @size: In. Size of this slice in bytes. + * @sem0: In. Semaphore command 0. Must be 0 if not valid. + * @sem1: In. Semaphore command 1. Must be 0 if not valid. + * @sem2: In. Semaphore command 2. Must be 0 if not valid. + * @sem3: In. Semaphore command 3. Must be 0 if not valid. + * @dev_addr: In. Device address this slice pushes to or pulls from. + * @db_addr: In. Address of the doorbell to ring. + * @db_data: In. Data to write to the doorbell. + * @db_len: In. Size of the doorbell data in bits - 32, 16, or 8. 0 is for + * inactive doorbells. + * @offset: In. Start of this slice as an offset from the start of the BO.
+ */ +struct qaic_attach_slice_entry { + __u64 size; + struct qaic_sem sem0; + struct qaic_sem sem1; + struct qaic_sem sem2; + struct qaic_sem sem3; + __u64 dev_addr; + __u64 db_addr; + __u32 db_data; + __u32 db_len; + __u64 offset; +}; + +/** + * struct qaic_attach_slice_hdr - Defines metadata for a set of BO slices. + * @count: In. Number of slices for this BO. + * @dbc_id: In. Associate the sliced BO with this DBC. + * @handle: In. GEM handle of the BO to slice. + * @dir: In. Direction of data flow. 1 = DMA_TO_DEVICE, 2 = DMA_FROM_DEVICE. + * @size: In. Total length of the BO. + * If BO is imported (DMABUF/PRIME) then this size + * should not exceed the size of DMABUF provided. + * If BO is allocated using DRM_IOCTL_QAIC_CREATE_BO + * then this size should be exactly the same as the size + * provided during DRM_IOCTL_QAIC_CREATE_BO. + */ +struct qaic_attach_slice_hdr { + __u32 count; + __u32 dbc_id; + __u32 handle; + __u32 dir; + __u64 size; +}; + +/** + * struct qaic_attach_slice - Defines a set of BO slices. + * @hdr: In. Metadata of the set of slices. + * @data: In. Pointer to an array containing the slice definitions. + */ +struct qaic_attach_slice { + struct qaic_attach_slice_hdr hdr; + __u64 data; +}; + +/** + * struct qaic_execute_entry - Defines a BO to submit to the device. + * @handle: In. GEM handle of the BO to commit to the device. + * @dir: In. Direction of data. 1 = to device, 2 = from device. + */ +struct qaic_execute_entry { + __u32 handle; + __u32 dir; +}; + +/** + * struct qaic_partial_execute_entry - Defines a BO to resize and submit. + * @handle: In. GEM handle of the BO to commit to the device. + * @dir: In. Direction of data. 1 = to device, 2 = from device. + * @resize: In. New size of the BO. Must be <= the original BO size. 0 is + * short for no resize. + */ +struct qaic_partial_execute_entry { + __u32 handle; + __u32 dir; + __u64 resize; +}; + +/** + * struct qaic_execute_hdr - Defines metadata for BO submission. + * @count: In. Number of BOs to submit. + * @dbc_id: In. DBC to submit the BOs on. + */ +struct qaic_execute_hdr { + __u32 count; + __u32 dbc_id; +}; + +/** + * struct qaic_execute - Defines a list of BOs to submit to the device. + * @hdr: In. BO list metadata. + * @data: In. Pointer to an array of BOs to submit. + */ +struct qaic_execute { + struct qaic_execute_hdr hdr; + __u64 data; +}; + +/** + * struct qaic_wait - Defines a blocking wait for BO execution. + * @handle: In. GEM handle of the BO to wait on. + * @timeout: In. Maximum time in ms to wait for the BO. + * @dbc_id: In. DBC the BO is submitted to. + * @pad: Structure padding. Must be 0. + */ +struct qaic_wait { + __u32 handle; + __u32 timeout; + __u32 dbc_id; + __u32 pad; +}; + +/** + * struct qaic_perf_stats_hdr - Defines metadata for getting BO perf info. + * @count: In. Number of BOs requested. + * @pad: Structure padding. Must be 0. + * @dbc_id: In. DBC the BOs are associated with. + */ +struct qaic_perf_stats_hdr { + __u16 count; + __u16 pad; + __u32 dbc_id; +}; + +/** + * struct qaic_perf_stats - Defines a request for getting BO perf info. + * @hdr: In. Request metadata. + * @data: In. Pointer to array of stats structures that will receive the data.
+ */ +struct qaic_perf_stats { + struct qaic_perf_stats_hdr hdr; + __u64 data; +}; + +/** + * struct qaic_perf_stats_entry - Defines a BO perf info. + * @handle: In. GEM handle of the BO to get perf stats for. + * @queue_level_before: Out. Number of elements in the queue before this BO + * was submitted. + * @num_queue_element: Out. Number of elements added to the queue to submit + * this BO. + * @submit_latency_us: Out. Time taken by the driver to submit this BO. + * @device_latency_us: Out. Time taken by the device to execute this BO. + * @pad: Structure padding. Must be 0. + */ +struct qaic_perf_stats_entry { + __u32 handle; + __u32 queue_level_before; + __u32 num_queue_element; + __u32 submit_latency_us; + __u32 device_latency_us; + __u32 pad; +}; + +#define DRM_QAIC_MANAGE 0x00 +#define DRM_QAIC_CREATE_BO 0x01 +#define DRM_QAIC_MMAP_BO 0x02 +#define DRM_QAIC_ATTACH_SLICE_BO 0x03 +#define DRM_QAIC_EXECUTE_BO 0x04 +#define DRM_QAIC_PARTIAL_EXECUTE_BO 0x05 +#define DRM_QAIC_WAIT_BO 0x06 +#define DRM_QAIC_PERF_STATS_BO 0x07 + +#define DRM_IOCTL_QAIC_MANAGE DRM_IOWR(DRM_COMMAND_BASE + DRM_QAIC_MANAGE, struct qaic_manage_msg) +#define DRM_IOCTL_QAIC_CREATE_BO DRM_IOWR(DRM_COMMAND_BASE + DRM_QAIC_CREATE_BO, struct qaic_create_bo) +#define DRM_IOCTL_QAIC_MMAP_BO DRM_IOWR(DRM_COMMAND_BASE + DRM_QAIC_MMAP_BO, struct qaic_mmap_bo) +#define DRM_IOCTL_QAIC_ATTACH_SLICE_BO DRM_IOW(DRM_COMMAND_BASE + DRM_QAIC_ATTACH_SLICE_BO, struct qaic_attach_slice) +#define DRM_IOCTL_QAIC_EXECUTE_BO DRM_IOW(DRM_COMMAND_BASE + DRM_QAIC_EXECUTE_BO, struct qaic_execute) +#define DRM_IOCTL_QAIC_PARTIAL_EXECUTE_BO DRM_IOW(DRM_COMMAND_BASE + DRM_QAIC_PARTIAL_EXECUTE_BO, struct qaic_execute) +#define DRM_IOCTL_QAIC_WAIT_BO DRM_IOW(DRM_COMMAND_BASE + DRM_QAIC_WAIT_BO, struct qaic_wait) +#define DRM_IOCTL_QAIC_PERF_STATS_BO DRM_IOWR(DRM_COMMAND_BASE + DRM_QAIC_PERF_STATS_BO, struct qaic_perf_stats) + +#if defined(__cplusplus) +} +#endif + +#endif /* QAIC_ACCEL_H_ */ -- cgit v1.2.3 From f1af066bcfd38daa9eee7195ef772dadaaa18520 Mon Sep 17 00:00:00 2001 From: Danylo Piliaiev Date: Sun, 26 Mar 2023 09:38:13 -0700 Subject: drm/msm: Rename drm_msm_gem_submit_reloc::or in C++ code Clashes with C++ `or` keyword Signed-off-by: Danylo Piliaiev Signed-off-by: Rob Clark Reviewed-by: Dmitry Baryshkov Patchwork: https://patchwork.freedesktop.org/patch/528751/ Link: https://lore.kernel.org/r/20230326163813.535762-1-robdclark@gmail.com Signed-off-by: Dmitry Baryshkov --- include/uapi/drm/msm_drm.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/drm/msm_drm.h b/include/uapi/drm/msm_drm.h index dbf0d6f43fa9..6c34272a13fd 100644 --- a/include/uapi/drm/msm_drm.h +++ b/include/uapi/drm/msm_drm.h @@ -186,7 +186,11 @@ struct drm_msm_gem_cpu_fini { */ struct drm_msm_gem_submit_reloc { __u32 submit_offset; /* in, offset from submit_bo */ +#ifdef __cplusplus + __u32 _or; /* in, value OR'd with result */ +#else __u32 or; /* in, value OR'd with result */ +#endif __s32 shift; /* in, amount of left shift (can be negative) */ __u32 reloc_idx; /* in, index of reloc_bo buffer */ __u64 reloc_offset; /* in, offset from start of reloc_bo */ -- cgit v1.2.3 From a25c2f7a467265fa24d63fb6dd46fa7ba4e3b108 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Thu, 30 Mar 2023 12:30:56 +0300 Subject: accel/habanalabs/uapi: new Gaudi2 server type Add definition of a new Gaudi2 server type. This represents the connectivity between the cards in that server type. 
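As an illustration, user-space that consumes this enum could map the new
value to a printable name roughly as follows. This is only a sketch: how
the server type is queried from the driver is outside the scope of this
patch, and the name strings are made up.

#include <drm/habanalabs_accel.h>

/* Map an hl_server_type value (however it was obtained) to a label. */
static const char *hl_server_type_str(unsigned int type)
{
	switch (type) {
	case HL_SERVER_GAUDI_HLS1H:  return "Gaudi HLS1-H";
	case HL_SERVER_GAUDI_TYPE1:  return "Gaudi type 1";
	case HL_SERVER_GAUDI_TYPE2:  return "Gaudi type 2";
	case HL_SERVER_GAUDI2_HLS2:  return "Gaudi2 HLS2";
	case HL_SERVER_GAUDI2_TYPE1: return "Gaudi2 type 1"; /* new value */
	default:                     return "unknown server type";
	}
}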
Signed-off-by: Oded Gabbay Reviewed-by: Stanislaw Gruszka --- include/uapi/drm/habanalabs_accel.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/drm/habanalabs_accel.h b/include/uapi/drm/habanalabs_accel.h index c139aab17c8a..d9ef1b151d04 100644 --- a/include/uapi/drm/habanalabs_accel.h +++ b/include/uapi/drm/habanalabs_accel.h @@ -708,7 +708,8 @@ enum hl_server_type { HL_SERVER_GAUDI_HLS1H = 2, HL_SERVER_GAUDI_TYPE1 = 3, HL_SERVER_GAUDI_TYPE2 = 4, - HL_SERVER_GAUDI2_HLS2 = 5 + HL_SERVER_GAUDI2_HLS2 = 5, + HL_SERVER_GAUDI2_TYPE1 = 7 }; /* -- cgit v1.2.3
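For reference, the BO path exposed by the qaic_accel.h uapi introduced
earlier in this series can be exercised from user-space roughly as in the
minimal sketch below: allocate a buffer object, ask the driver for its
mmap offset, and map it on the device fd. The sketch assumes an
already-open accel device fd and the installed uapi header; slicing,
execution and wait are left out, and error handling is reduced to NULL
returns.

#include <stddef.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <drm/qaic_accel.h>

/* Allocate a BO of the given size and map it; returns NULL on failure. */
static void *qaic_alloc_and_map(int fd, uint64_t size, uint32_t *handle)
{
	struct qaic_create_bo create = { .size = size };
	struct qaic_mmap_bo map = { 0 };
	void *ptr;

	if (ioctl(fd, DRM_IOCTL_QAIC_CREATE_BO, &create))
		return NULL;

	map.handle = create.handle;
	if (ioctl(fd, DRM_IOCTL_QAIC_MMAP_BO, &map))
		return NULL;

	/* map.offset is the fake offset to pass to mmap() on the device fd. */
	ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, map.offset);
	if (ptr == MAP_FAILED)
		return NULL;

	*handle = create.handle;
	return ptr;
}

The handle returned here can then be passed on to
DRM_IOCTL_QAIC_ATTACH_SLICE_BO and DRM_IOCTL_QAIC_EXECUTE_BO as described
in the header comments above.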