summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/ras
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/amd/ras')
-rw-r--r--drivers/gpu/drm/amd/ras/Makefile34
-rw-r--r--drivers/gpu/drm/amd/ras/ras_mgr/Makefile33
-rw-r--r--drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.c285
-rw-r--r--drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.h54
-rw-r--r--drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_eeprom_i2c.c182
-rw-r--r--drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_eeprom_i2c.h27
-rw-r--r--drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c648
-rw-r--r--drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.h83
-rw-r--r--drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mp1_v13_0.c94
-rw-r--r--drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mp1_v13_0.h30
-rw-r--r--drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_nbio_v7_9.c125
-rw-r--r--drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_nbio_v7_9.h30
-rw-r--r--drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.c190
-rw-r--r--drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.h41
-rw-r--r--drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c279
-rw-r--r--drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h110
-rw-r--r--drivers/gpu/drm/amd/ras/rascore/Makefile44
-rw-r--r--drivers/gpu/drm/amd/ras/rascore/ras.h370
-rw-r--r--drivers/gpu/drm/amd/ras/rascore/ras_aca.c672
-rw-r--r--drivers/gpu/drm/amd/ras/rascore/ras_aca.h164
-rw-r--r--drivers/gpu/drm/amd/ras/rascore/ras_aca_v1_0.c379
-rw-r--r--drivers/gpu/drm/amd/ras/rascore/ras_aca_v1_0.h71
-rw-r--r--drivers/gpu/drm/amd/ras/rascore/ras_cmd.c522
-rw-r--r--drivers/gpu/drm/amd/ras/rascore/ras_cmd.h426
-rw-r--r--drivers/gpu/drm/amd/ras/rascore/ras_core.c603
-rw-r--r--drivers/gpu/drm/amd/ras/rascore/ras_cper.c315
-rw-r--r--drivers/gpu/drm/amd/ras/rascore/ras_cper.h304
-rw-r--r--drivers/gpu/drm/amd/ras/rascore/ras_eeprom.c1339
-rw-r--r--drivers/gpu/drm/amd/ras/rascore/ras_eeprom.h197
-rw-r--r--drivers/gpu/drm/amd/ras/rascore/ras_gfx.c70
-rw-r--r--drivers/gpu/drm/amd/ras/rascore/ras_gfx.h43
-rw-r--r--drivers/gpu/drm/amd/ras/rascore/ras_gfx_v9_0.c426
-rw-r--r--drivers/gpu/drm/amd/ras/rascore/ras_gfx_v9_0.h259
-rw-r--r--drivers/gpu/drm/amd/ras/rascore/ras_log_ring.c317
-rw-r--r--drivers/gpu/drm/amd/ras/rascore/ras_log_ring.h93
-rw-r--r--drivers/gpu/drm/amd/ras/rascore/ras_mp1.c81
-rw-r--r--drivers/gpu/drm/amd/ras/rascore/ras_mp1.h50
-rw-r--r--drivers/gpu/drm/amd/ras/rascore/ras_mp1_v13_0.c105
-rw-r--r--drivers/gpu/drm/amd/ras/rascore/ras_mp1_v13_0.h30
-rw-r--r--drivers/gpu/drm/amd/ras/rascore/ras_nbio.c96
-rw-r--r--drivers/gpu/drm/amd/ras/rascore/ras_nbio.h46
-rw-r--r--drivers/gpu/drm/amd/ras/rascore/ras_nbio_v7_9.c123
-rw-r--r--drivers/gpu/drm/amd/ras/rascore/ras_nbio_v7_9.h31
-rw-r--r--drivers/gpu/drm/amd/ras/rascore/ras_process.c322
-rw-r--r--drivers/gpu/drm/amd/ras/rascore/ras_process.h53
-rw-r--r--drivers/gpu/drm/amd/ras/rascore/ras_psp.c750
-rw-r--r--drivers/gpu/drm/amd/ras/rascore/ras_psp.h145
-rw-r--r--drivers/gpu/drm/amd/ras/rascore/ras_psp_v13_0.c46
-rw-r--r--drivers/gpu/drm/amd/ras/rascore/ras_psp_v13_0.h31
-rw-r--r--drivers/gpu/drm/amd/ras/rascore/ras_ta_if.h231
-rw-r--r--drivers/gpu/drm/amd/ras/rascore/ras_umc.c707
-rw-r--r--drivers/gpu/drm/amd/ras/rascore/ras_umc.h166
-rw-r--r--drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.c511
-rw-r--r--drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.h314
54 files changed, 12697 insertions, 0 deletions
diff --git a/drivers/gpu/drm/amd/ras/Makefile b/drivers/gpu/drm/amd/ras/Makefile
new file mode 100644
index 000000000000..bbdaba811d34
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/Makefile
@@ -0,0 +1,34 @@
+#
+# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+ifeq ($(AMD_GPU_RAS_MGR),)
+ AMD_GPU_RAS_MGR := ras_mgr
+endif
+
+subdir-ccflags-y += -I$(AMD_GPU_RAS_FULL_PATH)/rascore
+subdir-ccflags-y += -I$(AMD_GPU_RAS_FULL_PATH)/$(AMD_GPU_RAS_MGR)
+
+RAS_LIBS = $(AMD_GPU_RAS_MGR) rascore
+
+AMD_RAS = $(addsuffix /Makefile, $(addprefix $(AMD_GPU_RAS_FULL_PATH)/,$(RAS_LIBS)))
+
+include $(AMD_RAS)
+
diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/Makefile b/drivers/gpu/drm/amd/ras/ras_mgr/Makefile
new file mode 100644
index 000000000000..5e5a2cfa4068
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/ras_mgr/Makefile
@@ -0,0 +1,33 @@
+# Copyright 2025 Advanced Micro Devices, Inc.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+
+RAS_MGR_FILES = amdgpu_ras_sys.o \
+ amdgpu_ras_mgr.o \
+ amdgpu_ras_eeprom_i2c.o \
+ amdgpu_ras_mp1_v13_0.o \
+ amdgpu_ras_cmd.o \
+ amdgpu_ras_process.o \
+ amdgpu_ras_nbio_v7_9.o
+
+
+RAS_MGR = $(addprefix $(AMD_GPU_RAS_PATH)/ras_mgr/, $(RAS_MGR_FILES))
+
+AMD_GPU_RAS_FILES += $(RAS_MGR)
+
diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.c b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.c
new file mode 100644
index 000000000000..78419b7f7729
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.c
@@ -0,0 +1,285 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include <linux/pci.h>
+#include "amdgpu.h"
+#include "amdgpu_ras.h"
+#include "ras_sys.h"
+#include "amdgpu_ras_cmd.h"
+#include "amdgpu_ras_mgr.h"
+
+/* inject address is 52 bits */
+#define RAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52)
+
+#define AMDGPU_RAS_TYPE_RASCORE 0x1
+#define AMDGPU_RAS_TYPE_AMDGPU 0x2
+#define AMDGPU_RAS_TYPE_VF 0x3
+
+static int amdgpu_ras_trigger_error_prepare(struct ras_core_context *ras_core,
+ struct ras_cmd_inject_error_req *block_info)
+{
+ struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
+ int ret;
+
+ if (block_info->block_id == TA_RAS_BLOCK__XGMI_WAFL) {
+ if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
+ RAS_DEV_WARN(adev, "Failed to disallow df cstate");
+
+ ret = amdgpu_dpm_set_pm_policy(adev, PP_PM_POLICY_XGMI_PLPD, XGMI_PLPD_DISALLOW);
+ if (ret && (ret != -EOPNOTSUPP))
+ RAS_DEV_WARN(adev, "Failed to disallow XGMI power down");
+ }
+
+ return 0;
+}
+
+static int amdgpu_ras_trigger_error_end(struct ras_core_context *ras_core,
+ struct ras_cmd_inject_error_req *block_info)
+{
+ struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
+ int ret;
+
+ if (block_info->block_id == TA_RAS_BLOCK__XGMI_WAFL) {
+ if (amdgpu_ras_intr_triggered())
+ return 0;
+
+ ret = amdgpu_dpm_set_pm_policy(adev, PP_PM_POLICY_XGMI_PLPD, XGMI_PLPD_DEFAULT);
+ if (ret && (ret != -EOPNOTSUPP))
+ RAS_DEV_WARN(adev, "Failed to allow XGMI power down");
+
+ if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW))
+ RAS_DEV_WARN(adev, "Failed to allow df cstate");
+ }
+
+ return 0;
+}
+
+static uint64_t local_addr_to_xgmi_global_addr(struct ras_core_context *ras_core,
+ uint64_t addr)
+{
+ struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
+ struct amdgpu_xgmi *xgmi = &adev->gmc.xgmi;
+
+ return (addr + xgmi->physical_node_id * xgmi->node_segment_size);
+}
+
+static int amdgpu_ras_inject_error(struct ras_core_context *ras_core,
+ struct ras_cmd_ctx *cmd, void *data)
+{
+ struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
+ struct ras_cmd_inject_error_req *req =
+ (struct ras_cmd_inject_error_req *)cmd->input_buff_raw;
+ int ret = RAS_CMD__ERROR_GENERIC;
+
+ if (req->block_id == RAS_BLOCK_ID__UMC) {
+ if (amdgpu_ras_mgr_check_retired_addr(adev, req->address)) {
+ RAS_DEV_WARN(ras_core->dev,
+ "RAS WARN: inject: 0x%llx has already been marked as bad!\n",
+ req->address);
+ return RAS_CMD__ERROR_ACCESS_DENIED;
+ }
+
+ if ((req->address >= adev->gmc.mc_vram_size &&
+ adev->gmc.mc_vram_size) ||
+ (req->address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
+ RAS_DEV_WARN(adev, "RAS WARN: input address 0x%llx is invalid.",
+ req->address);
+ return RAS_CMD__ERROR_INVALID_INPUT_DATA;
+ }
+
+ /* Calculate XGMI relative offset */
+ if (adev->gmc.xgmi.num_physical_nodes > 1 &&
+ req->block_id != RAS_BLOCK_ID__GFX) {
+ req->address = local_addr_to_xgmi_global_addr(ras_core, req->address);
+ }
+ }
+
+ amdgpu_ras_trigger_error_prepare(ras_core, req);
+ ret = rascore_handle_cmd(ras_core, cmd, data);
+ amdgpu_ras_trigger_error_end(ras_core, req);
+ if (ret) {
+ RAS_DEV_ERR(adev, "ras inject block %u failed %d\n", req->block_id, ret);
+ ret = RAS_CMD__ERROR_ACCESS_DENIED;
+ }
+
+
+ return ret;
+}
+
+static int amdgpu_ras_get_ras_safe_fb_addr_ranges(struct ras_core_context *ras_core,
+ struct ras_cmd_ctx *cmd, void *data)
+{
+ struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
+ struct ras_cmd_dev_handle *input_data =
+ (struct ras_cmd_dev_handle *)cmd->input_buff_raw;
+ struct ras_cmd_ras_safe_fb_address_ranges_rsp *ranges =
+ (struct ras_cmd_ras_safe_fb_address_ranges_rsp *)cmd->output_buff_raw;
+ struct amdgpu_mem_partition_info *mem_ranges;
+ uint32_t i = 0;
+
+ if (cmd->input_size != sizeof(*input_data))
+ return RAS_CMD__ERROR_INVALID_INPUT_DATA;
+
+ mem_ranges = adev->gmc.mem_partitions;
+ for (i = 0; i < adev->gmc.num_mem_partitions; i++) {
+ ranges->range[i].start = mem_ranges[i].range.fpfn << AMDGPU_GPU_PAGE_SHIFT;
+ ranges->range[i].size = mem_ranges[i].size;
+ ranges->range[i].idx = i;
+ }
+
+ ranges->num_ranges = adev->gmc.num_mem_partitions;
+
+ ranges->version = 0;
+ cmd->output_size = sizeof(struct ras_cmd_ras_safe_fb_address_ranges_rsp);
+
+ return RAS_CMD__SUCCESS;
+}
+
+static int ras_translate_fb_address(struct ras_core_context *ras_core,
+ enum ras_fb_addr_type src_type,
+ enum ras_fb_addr_type dest_type,
+ union ras_translate_fb_address *src_addr,
+ union ras_translate_fb_address *dest_addr)
+{
+ uint64_t soc_phy_addr;
+ int ret = RAS_CMD__SUCCESS;
+
+ /* Does not need to be queued as event as this is a SW translation */
+ switch (src_type) {
+ case RAS_FB_ADDR_SOC_PHY:
+ soc_phy_addr = src_addr->soc_phy_addr;
+ break;
+ case RAS_FB_ADDR_BANK:
+ ret = ras_cmd_translate_bank_to_soc_pa(ras_core,
+ src_addr->bank_addr, &soc_phy_addr);
+ if (ret)
+ return RAS_CMD__ERROR_GENERIC;
+ break;
+ default:
+ return RAS_CMD__ERROR_INVALID_CMD;
+ }
+
+ switch (dest_type) {
+ case RAS_FB_ADDR_SOC_PHY:
+ dest_addr->soc_phy_addr = soc_phy_addr;
+ break;
+ case RAS_FB_ADDR_BANK:
+ ret = ras_cmd_translate_soc_pa_to_bank(ras_core,
+ soc_phy_addr, &dest_addr->bank_addr);
+ if (ret)
+ return RAS_CMD__ERROR_GENERIC;
+ break;
+ default:
+ return RAS_CMD__ERROR_INVALID_CMD;
+ }
+
+ return ret;
+}
+
+static int amdgpu_ras_translate_fb_address(struct ras_core_context *ras_core,
+ struct ras_cmd_ctx *cmd, void *data)
+{
+ struct ras_cmd_translate_fb_address_req *req_buff =
+ (struct ras_cmd_translate_fb_address_req *)cmd->input_buff_raw;
+ struct ras_cmd_translate_fb_address_rsp *rsp_buff =
+ (struct ras_cmd_translate_fb_address_rsp *)cmd->output_buff_raw;
+ int ret = RAS_CMD__ERROR_GENERIC;
+
+ if (cmd->input_size != sizeof(struct ras_cmd_translate_fb_address_req))
+ return RAS_CMD__ERROR_INVALID_INPUT_SIZE;
+
+ if ((req_buff->src_addr_type >= RAS_FB_ADDR_UNKNOWN) ||
+ (req_buff->dest_addr_type >= RAS_FB_ADDR_UNKNOWN))
+ return RAS_CMD__ERROR_INVALID_INPUT_DATA;
+
+ ret = ras_translate_fb_address(ras_core, req_buff->src_addr_type,
+ req_buff->dest_addr_type, &req_buff->trans_addr, &rsp_buff->trans_addr);
+ if (ret)
+ return RAS_CMD__ERROR_GENERIC;
+
+ rsp_buff->version = 0;
+ cmd->output_size = sizeof(struct ras_cmd_translate_fb_address_rsp);
+
+ return RAS_CMD__SUCCESS;
+}
+
+static struct ras_cmd_func_map amdgpu_ras_cmd_maps[] = {
+ {RAS_CMD__INJECT_ERROR, amdgpu_ras_inject_error},
+ {RAS_CMD__GET_SAFE_FB_ADDRESS_RANGES, amdgpu_ras_get_ras_safe_fb_addr_ranges},
+ {RAS_CMD__TRANSLATE_FB_ADDRESS, amdgpu_ras_translate_fb_address},
+};
+
+int amdgpu_ras_handle_cmd(struct ras_core_context *ras_core, struct ras_cmd_ctx *cmd, void *data)
+{
+ struct ras_cmd_func_map *ras_cmd = NULL;
+ int i, res;
+
+ for (i = 0; i < ARRAY_SIZE(amdgpu_ras_cmd_maps); i++) {
+ if (cmd->cmd_id == amdgpu_ras_cmd_maps[i].cmd_id) {
+ ras_cmd = &amdgpu_ras_cmd_maps[i];
+ break;
+ }
+ }
+
+ if (ras_cmd)
+ res = ras_cmd->func(ras_core, cmd, NULL);
+ else
+ res = RAS_CMD__ERROR_UKNOWN_CMD;
+
+ return res;
+}
+
+int amdgpu_ras_submit_cmd(struct ras_core_context *ras_core, struct ras_cmd_ctx *cmd)
+{
+ struct ras_core_context *cmd_core = ras_core;
+ int timeout = 60;
+ int res;
+
+ cmd->cmd_res = RAS_CMD__ERROR_INVALID_CMD;
+ cmd->output_size = 0;
+
+ if (!ras_core_is_enabled(cmd_core))
+ return RAS_CMD__ERROR_ACCESS_DENIED;
+
+ while (ras_core_gpu_in_reset(cmd_core)) {
+ msleep(1000);
+ if (!timeout--)
+ return RAS_CMD__ERROR_TIMEOUT;
+ }
+
+ res = amdgpu_ras_handle_cmd(cmd_core, cmd, NULL);
+ if (res == RAS_CMD__ERROR_UKNOWN_CMD)
+ res = rascore_handle_cmd(cmd_core, cmd, NULL);
+
+ cmd->cmd_res = res;
+
+ if (cmd->output_size > cmd->output_buf_size) {
+ RAS_DEV_ERR(cmd_core->dev,
+ "Output size 0x%x exceeds output buffer size 0x%x!\n",
+ cmd->output_size, cmd->output_buf_size);
+ return RAS_CMD__SUCCESS_EXEED_BUFFER;
+ }
+
+ return RAS_CMD__SUCCESS;
+}
diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.h b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.h
new file mode 100644
index 000000000000..5973b156cc85
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __AMDGPU_RAS_CMD_H__
+#define __AMDGPU_RAS_CMD_H__
+#include "ras.h"
+
+enum amdgpu_ras_cmd_id {
+ RAS_CMD__AMDGPU_BEGIN = RAS_CMD_ID_AMDGPU_START,
+ RAS_CMD__TRANSLATE_MEMORY_FD,
+ RAS_CMD__AMDGPU_SUPPORTED_MAX = RAS_CMD_ID_AMDGPU_END,
+};
+
+struct ras_cmd_translate_memory_fd_req {
+ struct ras_cmd_dev_handle dev;
+ uint32_t type;
+ uint32_t fd;
+ uint64_t address;
+ uint32_t reserved[4];
+};
+
+struct ras_cmd_translate_memory_fd_rsp {
+ uint32_t version;
+ uint32_t padding;
+ uint64_t start;
+ uint64_t size;
+ uint32_t reserved[2];
+};
+
+int amdgpu_ras_handle_cmd(struct ras_core_context *ras_core,
+ struct ras_cmd_ctx *cmd, void *data);
+int amdgpu_ras_submit_cmd(struct ras_core_context *ras_core, struct ras_cmd_ctx *cmd);
+
+#endif
diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_eeprom_i2c.c b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_eeprom_i2c.c
new file mode 100644
index 000000000000..3ed3ff42b7e1
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_eeprom_i2c.c
@@ -0,0 +1,182 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "amdgpu.h"
+#include "amdgpu_atomfirmware.h"
+#include "amdgpu_ras_eeprom.h"
+#include "amdgpu_ras_mgr.h"
+#include "amdgpu_ras_eeprom_i2c.h"
+#include "ras_eeprom.h"
+
+/* These are memory addresses as would be seen by one or more EEPROM
+ * chips strung on the I2C bus, usually by manipulating pins 1-3 of a
+ * set of EEPROM devices. They form a continuous memory space.
+ *
+ * The I2C device address includes the device type identifier, 1010b,
+ * which is a reserved value and indicates that this is an I2C EEPROM
+ * device. It also includes the top 3 bits of the 19 bit EEPROM memory
+ * address, namely bits 18, 17, and 16. This makes up the 7 bit
+ * address sent on the I2C bus with bit 0 being the direction bit,
+ * which is not represented here, and sent by the hardware directly.
+ *
+ * For instance,
+ * 50h = 1010000b => device type identifier 1010b, bits 18:16 = 000b, address 0.
+ * 54h = 1010100b => --"--, bits 18:16 = 100b, address 40000h.
+ * 56h = 1010110b => --"--, bits 18:16 = 110b, address 60000h.
+ * Depending on the size of the I2C EEPROM device(s), bits 18:16 may
+ * address memory in a device or a device on the I2C bus, depending on
+ * the status of pins 1-3. See top of amdgpu_eeprom.c.
+ *
+ * The RAS table lives either at address 0 or address 40000h of EEPROM.
+ */
+#define EEPROM_I2C_MADDR_0 0x0
+#define EEPROM_I2C_MADDR_4 0x40000
+
+#define MAKE_I2C_ADDR(_aa) ((0xA << 3) | (((_aa) >> 16) & 0xF))
+#define to_amdgpu_ras(x) (container_of(x, struct amdgpu_ras, eeprom_control))
+
+#define EEPROM_PAGE_BITS 8
+#define EEPROM_PAGE_SIZE (1U << EEPROM_PAGE_BITS)
+#define EEPROM_PAGE_MASK (EEPROM_PAGE_SIZE - 1)
+
+#define EEPROM_OFFSET_SIZE 2
+
+static int ras_eeprom_i2c_config(struct ras_core_context *ras_core)
+{
+ struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
+ struct ras_eeprom_control *control = &ras_core->ras_eeprom;
+ u8 i2c_addr;
+
+ if (amdgpu_atomfirmware_ras_rom_addr(adev, &i2c_addr)) {
+ /* The address given by VBIOS is an 8-bit, wire-format
+ * address, i.e. the most significant byte.
+ *
+ * Normalize it to a 19-bit EEPROM address. Remove the
+ * device type identifier and make it a 7-bit address;
+ * then make it a 19-bit EEPROM address. See top of
+ * amdgpu_eeprom.c.
+ */
+ i2c_addr = (i2c_addr & 0x0F) >> 1;
+ control->i2c_address = ((u32) i2c_addr) << 16;
+ return 0;
+ }
+
+ switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) {
+ case IP_VERSION(13, 0, 5):
+ case IP_VERSION(13, 0, 6):
+ case IP_VERSION(13, 0, 10):
+ case IP_VERSION(13, 0, 12):
+ case IP_VERSION(13, 0, 14):
+ control->i2c_address = EEPROM_I2C_MADDR_4;
+ return 0;
+ default:
+ return -ENODATA;
+ }
+ return -ENODATA;
+}
+
+static int ras_eeprom_i2c_xfer(struct ras_core_context *ras_core, u32 eeprom_addr,
+ u8 *eeprom_buf, u32 buf_size, bool read)
+{
+ struct i2c_adapter *i2c_adap = ras_core->ras_eeprom.i2c_adapter;
+ u8 eeprom_offset_buf[EEPROM_OFFSET_SIZE];
+ struct i2c_msg msgs[] = {
+ {
+ .flags = 0,
+ .len = EEPROM_OFFSET_SIZE,
+ .buf = eeprom_offset_buf,
+ },
+ {
+ .flags = read ? I2C_M_RD : 0,
+ },
+ };
+ const u8 *p = eeprom_buf;
+ int r;
+ u16 len;
+
+ for (r = 0; buf_size > 0;
+ buf_size -= len, eeprom_addr += len, eeprom_buf += len) {
+ /* Set the EEPROM address we want to write to/read from.
+ */
+ msgs[0].addr = MAKE_I2C_ADDR(eeprom_addr);
+ msgs[1].addr = msgs[0].addr;
+ msgs[0].buf[0] = (eeprom_addr >> 8) & 0xff;
+ msgs[0].buf[1] = eeprom_addr & 0xff;
+
+ if (!read) {
+ /* Write the maximum amount of data, without
+ * crossing the device's page boundary, as per
+ * its spec. Partial page writes are allowed,
+ * starting at any location within the page,
+ * so long as the page boundary isn't crossed
+ * over (actually the page pointer rolls
+ * over).
+ *
+ * As per the AT24CM02 EEPROM spec, after
+ * writing into a page, the I2C driver should
+ * terminate the transfer, i.e. in
+ * "i2c_transfer()" below, with a STOP
+ * condition, so that the self-timed write
+ * cycle begins. This is implied for the
+ * "i2c_transfer()" abstraction.
+ */
+ len = min(EEPROM_PAGE_SIZE - (eeprom_addr & EEPROM_PAGE_MASK),
+ buf_size);
+ } else {
+ /* Reading from the EEPROM has no limitation
+ * on the number of bytes read from the EEPROM
+ * device--they are simply sequenced out.
+ * Keep in mind that i2c_msg.len is u16 type.
+ */
+ len = min(U16_MAX, buf_size);
+ }
+ msgs[1].len = len;
+ msgs[1].buf = eeprom_buf;
+
+
+ /* This constitutes a START-STOP transaction.
+ */
+ r = i2c_transfer(i2c_adap, msgs, ARRAY_SIZE(msgs));
+ if (r != ARRAY_SIZE(msgs))
+ break;
+
+ if (!read) {
+ /* According to EEPROM specs the length of the
+ * self-writing cycle, tWR (tW), is 10 ms.
+ *
+ * TODO: Use polling on ACK, aka Acknowledge
+ * Polling, to minimize waiting for the
+ * internal write cycle to complete, as it is
+ * usually smaller than tWR (tW).
+ */
+ msleep(10);
+ }
+ }
+
+ return r < 0 ? r : eeprom_buf - p;
+}
+
+const struct ras_eeprom_sys_func amdgpu_ras_eeprom_i2c_sys_func = {
+ .eeprom_i2c_xfer = ras_eeprom_i2c_xfer,
+ .update_eeprom_i2c_config = ras_eeprom_i2c_config,
+};
diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_eeprom_i2c.h b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_eeprom_i2c.h
new file mode 100644
index 000000000000..3b5878605411
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_eeprom_i2c.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright (C) 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef __AMDGPU_RAS_EEPROM_I2C_H__
+#define __AMDGPU_RAS_EEPROM_I2C_H__
+#include "ras.h"
+
+extern const struct ras_eeprom_sys_func amdgpu_ras_eeprom_i2c_sys_func;
+#endif
diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c
new file mode 100644
index 000000000000..afe8135b6258
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c
@@ -0,0 +1,648 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#include "amdgpu.h"
+#include "amdgpu_reset.h"
+#include "amdgpu_xgmi.h"
+#include "ras_sys.h"
+#include "amdgpu_ras_mgr.h"
+#include "amdgpu_ras_cmd.h"
+#include "amdgpu_ras_process.h"
+#include "amdgpu_ras_eeprom_i2c.h"
+#include "amdgpu_ras_mp1_v13_0.h"
+#include "amdgpu_ras_nbio_v7_9.h"
+
+#define MAX_SOCKET_NUM_PER_HIVE 8
+#define MAX_AID_NUM_PER_SOCKET 4
+#define MAX_XCD_NUM_PER_AID 2
+
+/* typical ECC bad page rate is 1 bad page per 100MB VRAM */
+#define TYPICAL_ECC_BAD_PAGE_RATE (100ULL * SZ_1M)
+
+#define COUNT_BAD_PAGE_THRESHOLD(size) (((size) >> 21) << 4)
+
+/* Reserve 8 physical dram row for possible retirement.
+ * In worst cases, it will lose 8 * 2MB memory in vram domain
+ */
+#define RAS_RESERVED_VRAM_SIZE_DEFAULT (16ULL << 20)
+
+
+static void ras_mgr_init_event_mgr(struct ras_event_manager *mgr)
+{
+ struct ras_event_state *event_state;
+ int i;
+
+ memset(mgr, 0, sizeof(*mgr));
+ atomic64_set(&mgr->seqno, 0);
+
+ for (i = 0; i < ARRAY_SIZE(mgr->event_state); i++) {
+ event_state = &mgr->event_state[i];
+ event_state->last_seqno = RAS_EVENT_INVALID_ID;
+ atomic64_set(&event_state->count, 0);
+ }
+}
+
+static void amdgpu_ras_mgr_init_event_mgr(struct ras_core_context *ras_core)
+{
+ struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
+ struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+ struct ras_event_manager *event_mgr;
+ struct amdgpu_hive_info *hive;
+
+ hive = amdgpu_get_xgmi_hive(adev);
+ event_mgr = hive ? &hive->event_mgr : &ras_mgr->ras_event_mgr;
+
+ /* init event manager with node 0 on xgmi system */
+ if (!amdgpu_reset_in_recovery(adev)) {
+ if (!hive || adev->gmc.xgmi.node_id == 0)
+ ras_mgr_init_event_mgr(event_mgr);
+ }
+
+ if (hive)
+ amdgpu_put_xgmi_hive(hive);
+}
+
+static int amdgpu_ras_mgr_init_aca_config(struct amdgpu_device *adev,
+ struct ras_core_config *config)
+{
+ struct ras_aca_config *aca_cfg = &config->aca_cfg;
+
+ aca_cfg->socket_num_per_hive = MAX_SOCKET_NUM_PER_HIVE;
+ aca_cfg->aid_num_per_socket = MAX_AID_NUM_PER_SOCKET;
+ aca_cfg->xcd_num_per_aid = MAX_XCD_NUM_PER_AID;
+
+ return 0;
+}
+
+static int amdgpu_ras_mgr_init_eeprom_config(struct amdgpu_device *adev,
+ struct ras_core_config *config)
+{
+ struct ras_eeprom_config *eeprom_cfg = &config->eeprom_cfg;
+
+ eeprom_cfg->eeprom_sys_fn = &amdgpu_ras_eeprom_i2c_sys_func;
+ eeprom_cfg->eeprom_i2c_adapter = adev->pm.ras_eeprom_i2c_bus;
+ if (eeprom_cfg->eeprom_i2c_adapter) {
+ const struct i2c_adapter_quirks *quirks =
+ ((struct i2c_adapter *)eeprom_cfg->eeprom_i2c_adapter)->quirks;
+
+ if (quirks) {
+ eeprom_cfg->max_i2c_read_len = quirks->max_read_len;
+ eeprom_cfg->max_i2c_write_len = quirks->max_write_len;
+ }
+ }
+
+ /*
+ * amdgpu_bad_page_threshold is used to config
+ * the threshold for the number of bad pages.
+ * -1: Threshold is set to default value
+ * Driver will issue a warning message when threshold is reached
+ * and continue runtime services.
+ * 0: Disable bad page retirement
+ * Driver will not retire bad pages
+ * which is intended for debugging purpose.
+ * -2: Threshold is determined by a formula
+ * that assumes 1 bad page per 100M of local memory.
+ * Driver will continue runtime services when threhold is reached.
+ * 0 < threshold < max number of bad page records in EEPROM,
+ * A user-defined threshold is set
+ * Driver will halt runtime services when this custom threshold is reached.
+ */
+ if (amdgpu_bad_page_threshold == NONSTOP_OVER_THRESHOLD)
+ eeprom_cfg->eeprom_record_threshold_count =
+ div64_u64(adev->gmc.mc_vram_size, TYPICAL_ECC_BAD_PAGE_RATE);
+ else if (amdgpu_bad_page_threshold == WARN_NONSTOP_OVER_THRESHOLD)
+ eeprom_cfg->eeprom_record_threshold_count =
+ COUNT_BAD_PAGE_THRESHOLD(RAS_RESERVED_VRAM_SIZE_DEFAULT);
+ else
+ eeprom_cfg->eeprom_record_threshold_count = amdgpu_bad_page_threshold;
+
+ eeprom_cfg->eeprom_record_threshold_config = amdgpu_bad_page_threshold;
+
+ return 0;
+}
+
+static int amdgpu_ras_mgr_init_mp1_config(struct amdgpu_device *adev,
+ struct ras_core_config *config)
+{
+ struct ras_mp1_config *mp1_cfg = &config->mp1_cfg;
+ int ret = 0;
+
+ switch (config->mp1_ip_version) {
+ case IP_VERSION(13, 0, 6):
+ case IP_VERSION(13, 0, 14):
+ case IP_VERSION(13, 0, 12):
+ mp1_cfg->mp1_sys_fn = &amdgpu_ras_mp1_sys_func_v13_0;
+ break;
+ default:
+ RAS_DEV_ERR(adev,
+ "The mp1(0x%x) ras config is not right!\n",
+ config->mp1_ip_version);
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
+static int amdgpu_ras_mgr_init_nbio_config(struct amdgpu_device *adev,
+ struct ras_core_config *config)
+{
+ struct ras_nbio_config *nbio_cfg = &config->nbio_cfg;
+ int ret = 0;
+
+ switch (config->nbio_ip_version) {
+ case IP_VERSION(7, 9, 0):
+ case IP_VERSION(7, 9, 1):
+ nbio_cfg->nbio_sys_fn = &amdgpu_ras_nbio_sys_func_v7_9;
+ break;
+ default:
+ RAS_DEV_ERR(adev,
+ "The nbio(0x%x) ras config is not right!\n",
+ config->nbio_ip_version);
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
+static int amdgpu_ras_mgr_get_ras_psp_system_status(struct ras_core_context *ras_core,
+ struct ras_psp_sys_status *status)
+{
+ struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
+ struct ta_context *context = &adev->psp.ras_context.context;
+
+ status->initialized = context->initialized;
+ status->session_id = context->session_id;
+ status->psp_cmd_mutex = &adev->psp.mutex;
+
+ return 0;
+}
+
+static int amdgpu_ras_mgr_get_ras_ta_init_param(struct ras_core_context *ras_core,
+ struct ras_ta_init_param *ras_ta_param)
+{
+ struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
+ uint32_t nps_mode;
+
+ if (amdgpu_ras_is_poison_mode_supported(adev))
+ ras_ta_param->poison_mode_en = 1;
+
+ if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu)
+ ras_ta_param->dgpu_mode = 1;
+
+ ras_ta_param->xcc_mask = adev->gfx.xcc_mask;
+ ras_ta_param->channel_dis_num = hweight32(adev->gmc.m_half_use) * 2;
+
+ ras_ta_param->active_umc_mask = adev->umc.active_mask;
+
+ if (!amdgpu_ras_mgr_get_curr_nps_mode(adev, &nps_mode))
+ ras_ta_param->nps_mode = nps_mode;
+
+ return 0;
+}
+
+const struct ras_psp_sys_func amdgpu_ras_psp_sys_func = {
+ .get_ras_psp_system_status = amdgpu_ras_mgr_get_ras_psp_system_status,
+ .get_ras_ta_init_param = amdgpu_ras_mgr_get_ras_ta_init_param,
+};
+
+static int amdgpu_ras_mgr_init_psp_config(struct amdgpu_device *adev,
+ struct ras_core_config *config)
+{
+ struct ras_psp_config *psp_cfg = &config->psp_cfg;
+
+ psp_cfg->psp_sys_fn = &amdgpu_ras_psp_sys_func;
+
+ return 0;
+}
+
+static int amdgpu_ras_mgr_init_umc_config(struct amdgpu_device *adev,
+ struct ras_core_config *config)
+{
+ struct ras_umc_config *umc_cfg = &config->umc_cfg;
+
+ umc_cfg->umc_vram_type = adev->gmc.vram_type;
+
+ return 0;
+}
+
+static struct ras_core_context *amdgpu_ras_mgr_create_ras_core(struct amdgpu_device *adev)
+{
+ struct ras_core_config init_config;
+
+ memset(&init_config, 0, sizeof(init_config));
+
+ init_config.umc_ip_version = amdgpu_ip_version(adev, UMC_HWIP, 0);
+ init_config.mp1_ip_version = amdgpu_ip_version(adev, MP1_HWIP, 0);
+ init_config.gfx_ip_version = amdgpu_ip_version(adev, GC_HWIP, 0);
+ init_config.nbio_ip_version = amdgpu_ip_version(adev, NBIO_HWIP, 0);
+ init_config.psp_ip_version = amdgpu_ip_version(adev, MP1_HWIP, 0);
+
+ if (init_config.umc_ip_version == IP_VERSION(12, 0, 0) ||
+ init_config.umc_ip_version == IP_VERSION(12, 5, 0))
+ init_config.aca_ip_version = IP_VERSION(1, 0, 0);
+
+ init_config.sys_fn = &amdgpu_ras_sys_fn;
+ init_config.ras_eeprom_supported = true;
+ init_config.poison_supported =
+ amdgpu_ras_is_poison_mode_supported(adev);
+
+ amdgpu_ras_mgr_init_aca_config(adev, &init_config);
+ amdgpu_ras_mgr_init_eeprom_config(adev, &init_config);
+ amdgpu_ras_mgr_init_mp1_config(adev, &init_config);
+ amdgpu_ras_mgr_init_nbio_config(adev, &init_config);
+ amdgpu_ras_mgr_init_psp_config(adev, &init_config);
+ amdgpu_ras_mgr_init_umc_config(adev, &init_config);
+
+ return ras_core_create(&init_config);
+}
+
+static int amdgpu_ras_mgr_sw_init(struct amdgpu_ip_block *ip_block)
+{
+ struct amdgpu_device *adev = ip_block->adev;
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ struct amdgpu_ras_mgr *ras_mgr;
+ int ret = 0;
+
+ /* Disabled by default */
+ con->uniras_enabled = false;
+
+ /* Enabled only in debug mode */
+ if (adev->debug_enable_ras_aca) {
+ con->uniras_enabled = true;
+ RAS_DEV_INFO(adev, "Debug amdgpu uniras!");
+ }
+
+ if (!con->uniras_enabled)
+ return 0;
+
+ ras_mgr = kzalloc(sizeof(*ras_mgr), GFP_KERNEL);
+ if (!ras_mgr)
+ return -EINVAL;
+
+ con->ras_mgr = ras_mgr;
+ ras_mgr->adev = adev;
+
+ ras_mgr->ras_core = amdgpu_ras_mgr_create_ras_core(adev);
+ if (!ras_mgr->ras_core) {
+ RAS_DEV_ERR(adev, "Failed to create ras core!\n");
+ ret = -EINVAL;
+ goto err;
+ }
+
+ ras_mgr->ras_core->dev = adev;
+
+ amdgpu_ras_process_init(adev);
+ ras_core_sw_init(ras_mgr->ras_core);
+ amdgpu_ras_mgr_init_event_mgr(ras_mgr->ras_core);
+ return 0;
+
+err:
+ kfree(ras_mgr);
+ return ret;
+}
+
+static int amdgpu_ras_mgr_sw_fini(struct amdgpu_ip_block *ip_block)
+{
+ struct amdgpu_device *adev = ip_block->adev;
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ struct amdgpu_ras_mgr *ras_mgr = (struct amdgpu_ras_mgr *)con->ras_mgr;
+
+ if (!con->uniras_enabled)
+ return 0;
+
+ if (!ras_mgr)
+ return 0;
+
+ amdgpu_ras_process_fini(adev);
+ ras_core_sw_fini(ras_mgr->ras_core);
+ ras_core_destroy(ras_mgr->ras_core);
+ ras_mgr->ras_core = NULL;
+
+ kfree(con->ras_mgr);
+ con->ras_mgr = NULL;
+
+ return 0;
+}
+
+static int amdgpu_ras_mgr_hw_init(struct amdgpu_ip_block *ip_block)
+{
+ struct amdgpu_device *adev = ip_block->adev;
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+ int ret;
+
+ if (!con->uniras_enabled)
+ return 0;
+
+ if (!ras_mgr || !ras_mgr->ras_core)
+ return -EINVAL;
+
+ ret = ras_core_hw_init(ras_mgr->ras_core);
+ if (ret) {
+ RAS_DEV_ERR(adev, "Failed to initialize ras core!\n");
+ return ret;
+ }
+
+ ras_mgr->ras_is_ready = true;
+
+ amdgpu_enable_uniras(adev, true);
+
+ RAS_DEV_INFO(adev, "AMDGPU RAS Is Ready.\n");
+ return 0;
+}
+
+static int amdgpu_ras_mgr_hw_fini(struct amdgpu_ip_block *ip_block)
+{
+ struct amdgpu_device *adev = ip_block->adev;
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+
+ if (!con->uniras_enabled)
+ return 0;
+
+ if (!ras_mgr || !ras_mgr->ras_core)
+ return -EINVAL;
+
+ ras_core_hw_fini(ras_mgr->ras_core);
+
+ ras_mgr->ras_is_ready = false;
+
+ return 0;
+}
+
+struct amdgpu_ras_mgr *amdgpu_ras_mgr_get_context(struct amdgpu_device *adev)
+{
+ if (!adev || !adev->psp.ras_context.ras)
+ return NULL;
+
+ return (struct amdgpu_ras_mgr *)adev->psp.ras_context.ras->ras_mgr;
+}
+
+static const struct amd_ip_funcs __maybe_unused ras_v1_0_ip_funcs = {
+ .name = "ras_v1_0",
+ .sw_init = amdgpu_ras_mgr_sw_init,
+ .sw_fini = amdgpu_ras_mgr_sw_fini,
+ .hw_init = amdgpu_ras_mgr_hw_init,
+ .hw_fini = amdgpu_ras_mgr_hw_fini,
+};
+
+const struct amdgpu_ip_block_version ras_v1_0_ip_block = {
+ .type = AMD_IP_BLOCK_TYPE_RAS,
+ .major = 1,
+ .minor = 0,
+ .rev = 0,
+ .funcs = &ras_v1_0_ip_funcs,
+};
+
+int amdgpu_enable_uniras(struct amdgpu_device *adev, bool enable)
+{
+ struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+
+ if (!ras_mgr || !ras_mgr->ras_core)
+ return -EPERM;
+
+ if (amdgpu_sriov_vf(adev))
+ return -EPERM;
+
+ RAS_DEV_INFO(adev, "Enable amdgpu unified ras!");
+ return ras_core_set_status(ras_mgr->ras_core, enable);
+}
+
+bool amdgpu_uniras_enabled(struct amdgpu_device *adev)
+{
+ struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+
+ if (!ras_mgr || !ras_mgr->ras_core)
+ return false;
+
+ if (amdgpu_sriov_vf(adev))
+ return false;
+
+ return ras_core_is_enabled(ras_mgr->ras_core);
+}
+
+static bool amdgpu_ras_mgr_is_ready(struct amdgpu_device *adev)
+{
+ struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+
+ if (ras_mgr && ras_mgr->ras_core && ras_mgr->ras_is_ready &&
+ ras_core_is_ready(ras_mgr->ras_core))
+ return true;
+
+ return false;
+}
+
+int amdgpu_ras_mgr_handle_fatal_interrupt(struct amdgpu_device *adev, void *data)
+{
+ struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+
+ if (!amdgpu_ras_mgr_is_ready(adev))
+ return -EPERM;
+
+ return ras_core_handle_nbio_irq(ras_mgr->ras_core, data);
+}
+
+uint64_t amdgpu_ras_mgr_gen_ras_event_seqno(struct amdgpu_device *adev,
+ enum ras_seqno_type seqno_type)
+{
+ struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+ int ret;
+ uint64_t seq_no;
+
+ if (!amdgpu_ras_mgr_is_ready(adev) ||
+ (seqno_type >= RAS_SEQNO_TYPE_COUNT_MAX))
+ return 0;
+
+ seq_no = ras_core_gen_seqno(ras_mgr->ras_core, seqno_type);
+
+ if ((seqno_type == RAS_SEQNO_TYPE_DE) ||
+ (seqno_type == RAS_SEQNO_TYPE_POISON_CONSUMPTION)) {
+ ret = ras_core_put_seqno(ras_mgr->ras_core, seqno_type, seq_no);
+ if (ret)
+ RAS_DEV_WARN(adev, "There are too many ras interrupts!");
+ }
+
+ return seq_no;
+}
+
+int amdgpu_ras_mgr_handle_controller_interrupt(struct amdgpu_device *adev, void *data)
+{
+ struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+ struct ras_ih_info *ih_info = (struct ras_ih_info *)data;
+ uint64_t seq_no = 0;
+ int ret = 0;
+
+ if (!amdgpu_ras_mgr_is_ready(adev))
+ return -EPERM;
+
+ if (ih_info && (ih_info->block == AMDGPU_RAS_BLOCK__UMC)) {
+ if (ras_mgr->ras_core->poison_supported) {
+ seq_no = amdgpu_ras_mgr_gen_ras_event_seqno(adev, RAS_SEQNO_TYPE_DE);
+ RAS_DEV_INFO(adev,
+ "{%llu} RAS poison is created, no user action is needed.\n",
+ seq_no);
+ }
+
+ ret = amdgpu_ras_process_handle_umc_interrupt(adev, ih_info);
+ } else if (ras_mgr->ras_core->poison_supported) {
+ ret = amdgpu_ras_process_handle_unexpected_interrupt(adev, ih_info);
+ } else {
+ RAS_DEV_WARN(adev,
+ "No RAS interrupt handler for non-UMC block with poison disabled.\n");
+ }
+
+ return ret;
+}
+
+int amdgpu_ras_mgr_handle_consumer_interrupt(struct amdgpu_device *adev, void *data)
+{
+ if (!amdgpu_ras_mgr_is_ready(adev))
+ return -EPERM;
+
+ return amdgpu_ras_process_handle_consumption_interrupt(adev, data);
+}
+
+int amdgpu_ras_mgr_update_ras_ecc(struct amdgpu_device *adev)
+{
+ struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+
+ if (!amdgpu_ras_mgr_is_ready(adev))
+ return -EPERM;
+
+ return ras_core_update_ecc_info(ras_mgr->ras_core);
+}
+
+int amdgpu_ras_mgr_reset_gpu(struct amdgpu_device *adev, uint32_t flags)
+{
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+ if (!amdgpu_ras_mgr_is_ready(adev))
+ return -EPERM;
+
+ con->gpu_reset_flags |= flags;
+ return amdgpu_ras_reset_gpu(adev);
+}
+
+bool amdgpu_ras_mgr_check_eeprom_safety_watermark(struct amdgpu_device *adev)
+{
+ struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+
+ if (!amdgpu_ras_mgr_is_ready(adev))
+ return false;
+
+ return ras_eeprom_check_safety_watermark(ras_mgr->ras_core);
+}
+
+int amdgpu_ras_mgr_get_curr_nps_mode(struct amdgpu_device *adev,
+ uint32_t *nps_mode)
+{
+ struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+ uint32_t mode;
+
+ if (!amdgpu_ras_mgr_is_ready(adev))
+ return -EINVAL;
+
+ mode = ras_core_get_curr_nps_mode(ras_mgr->ras_core);
+ if (!mode || mode > AMDGPU_NPS8_PARTITION_MODE)
+ return -EINVAL;
+
+ *nps_mode = mode;
+
+ return 0;
+}
+
+bool amdgpu_ras_mgr_check_retired_addr(struct amdgpu_device *adev,
+ uint64_t addr)
+{
+ struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+
+ if (!amdgpu_ras_mgr_is_ready(adev))
+ return false;
+
+ return ras_umc_check_retired_addr(ras_mgr->ras_core, addr);
+}
+
+bool amdgpu_ras_mgr_is_rma(struct amdgpu_device *adev)
+{
+ struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+
+ if (!ras_mgr || !ras_mgr->ras_core || !ras_mgr->ras_is_ready)
+ return false;
+
+ return ras_core_gpu_is_rma(ras_mgr->ras_core);
+}
+
+int amdgpu_ras_mgr_handle_ras_cmd(struct amdgpu_device *adev,
+ uint32_t cmd_id, void *input, uint32_t input_size,
+ void *output, uint32_t out_size)
+{
+ struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+ struct ras_cmd_ctx *cmd_ctx;
+ uint32_t ctx_buf_size = PAGE_SIZE;
+ int ret;
+
+ if (!amdgpu_ras_mgr_is_ready(adev))
+ return -EPERM;
+
+ cmd_ctx = kzalloc(ctx_buf_size, GFP_KERNEL);
+ if (!cmd_ctx)
+ return -ENOMEM;
+
+ cmd_ctx->cmd_id = cmd_id;
+
+ memcpy(cmd_ctx->input_buff_raw, input, input_size);
+ cmd_ctx->input_size = input_size;
+ cmd_ctx->output_buf_size = ctx_buf_size - sizeof(*cmd_ctx);
+
+ ret = amdgpu_ras_submit_cmd(ras_mgr->ras_core, cmd_ctx);
+ if (!ret && !cmd_ctx->cmd_res && output && (out_size == cmd_ctx->output_size))
+ memcpy(output, cmd_ctx->output_buff_raw, cmd_ctx->output_size);
+
+ kfree(cmd_ctx);
+
+ return ret;
+}
+
+int amdgpu_ras_mgr_pre_reset(struct amdgpu_device *adev)
+{
+ if (!amdgpu_ras_mgr_is_ready(adev)) {
+ RAS_DEV_ERR(adev, "Invalid ras suspend!\n");
+ return -EPERM;
+ }
+
+ amdgpu_ras_process_pre_reset(adev);
+ return 0;
+}
+
+int amdgpu_ras_mgr_post_reset(struct amdgpu_device *adev)
+{
+ if (!amdgpu_ras_mgr_is_ready(adev)) {
+ RAS_DEV_ERR(adev, "Invalid ras resume!\n");
+ return -EPERM;
+ }
+
+ amdgpu_ras_process_post_reset(adev);
+ return 0;
+}
diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.h b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.h
new file mode 100644
index 000000000000..8fb7eb4b8f13
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.h
@@ -0,0 +1,83 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright (c) 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#ifndef __AMDGPU_RAS_MGR_H__
+#define __AMDGPU_RAS_MGR_H__
+#include "ras.h"
+#include "amdgpu_ras_process.h"
+
+enum ras_ih_type {
+ RAS_IH_NONE,
+ RAS_IH_FROM_BLOCK_CONTROLLER,
+ RAS_IH_FROM_CONSUMER_CLIENT,
+ RAS_IH_FROM_FATAL_ERROR,
+};
+
+struct ras_ih_info {
+ uint32_t block;
+ union {
+ struct amdgpu_iv_entry iv_entry;
+ struct {
+ uint16_t pasid;
+ uint32_t reset;
+ pasid_notify pasid_fn;
+ void *data;
+ };
+ };
+};
+
+struct amdgpu_ras_mgr {
+ struct amdgpu_device *adev;
+ struct ras_core_context *ras_core;
+ struct delayed_work retire_page_dwork;
+ struct ras_event_manager ras_event_mgr;
+ uint64_t last_poison_consumption_seqno;
+ bool ras_is_ready;
+
+ bool is_paused;
+ struct completion ras_event_done;
+};
+
+extern const struct amdgpu_ip_block_version ras_v1_0_ip_block;
+
+struct amdgpu_ras_mgr *amdgpu_ras_mgr_get_context(
+ struct amdgpu_device *adev);
+int amdgpu_enable_uniras(struct amdgpu_device *adev, bool enable);
+bool amdgpu_uniras_enabled(struct amdgpu_device *adev);
+int amdgpu_ras_mgr_handle_fatal_interrupt(struct amdgpu_device *adev, void *data);
+int amdgpu_ras_mgr_handle_controller_interrupt(struct amdgpu_device *adev, void *data);
+int amdgpu_ras_mgr_handle_consumer_interrupt(struct amdgpu_device *adev, void *data);
+int amdgpu_ras_mgr_update_ras_ecc(struct amdgpu_device *adev);
+int amdgpu_ras_mgr_reset_gpu(struct amdgpu_device *adev, uint32_t flags);
+uint64_t amdgpu_ras_mgr_gen_ras_event_seqno(struct amdgpu_device *adev,
+ enum ras_seqno_type seqno_type);
+bool amdgpu_ras_mgr_check_eeprom_safety_watermark(struct amdgpu_device *adev);
+int amdgpu_ras_mgr_get_curr_nps_mode(struct amdgpu_device *adev, uint32_t *nps_mode);
+bool amdgpu_ras_mgr_check_retired_addr(struct amdgpu_device *adev,
+ uint64_t addr);
+bool amdgpu_ras_mgr_is_rma(struct amdgpu_device *adev);
+int amdgpu_ras_mgr_handle_ras_cmd(struct amdgpu_device *adev,
+ uint32_t cmd_id, void *input, uint32_t input_size,
+ void *output, uint32_t out_size);
+int amdgpu_ras_mgr_pre_reset(struct amdgpu_device *adev);
+int amdgpu_ras_mgr_post_reset(struct amdgpu_device *adev);
+#endif
diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mp1_v13_0.c b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mp1_v13_0.c
new file mode 100644
index 000000000000..79a51b1603ac
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mp1_v13_0.c
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#include "amdgpu_smu.h"
+#include "amdgpu_reset.h"
+#include "amdgpu_ras_mp1_v13_0.h"
+
+#define RAS_MP1_MSG_QueryValidMcaCeCount 0x3A
+#define RAS_MP1_MSG_McaBankCeDumpDW 0x3B
+
+static int mp1_v13_0_get_valid_bank_count(struct ras_core_context *ras_core,
+ u32 msg, u32 *count)
+{
+ struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
+ u32 smu_msg;
+ int ret = 0;
+
+ if (!count)
+ return -EINVAL;
+
+ smu_msg = (msg == RAS_MP1_MSG_QueryValidMcaCeCount) ?
+ SMU_MSG_QueryValidMcaCeCount : SMU_MSG_QueryValidMcaCount;
+
+ if (down_read_trylock(&adev->reset_domain->sem)) {
+ ret = amdgpu_smu_ras_send_msg(adev, smu_msg, 0, count);
+ up_read(&adev->reset_domain->sem);
+ } else {
+ ret = -RAS_CORE_GPU_IN_MODE1_RESET;
+ }
+
+ if (ret)
+ *count = 0;
+
+ return ret;
+}
+
+static int mp1_v13_0_dump_valid_bank(struct ras_core_context *ras_core,
+ u32 msg, u32 idx, u32 reg_idx, u64 *val)
+{
+ struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
+ uint32_t data[2] = {0, 0};
+ uint32_t param;
+ int ret = 0;
+ int i, offset;
+ u32 smu_msg = (msg == RAS_MP1_MSG_McaBankCeDumpDW) ?
+ SMU_MSG_McaBankCeDumpDW : SMU_MSG_McaBankDumpDW;
+
+ if (down_read_trylock(&adev->reset_domain->sem)) {
+ offset = reg_idx * 8;
+ for (i = 0; i < ARRAY_SIZE(data); i++) {
+ param = ((idx & 0xffff) << 16) | ((offset + (i << 2)) & 0xfffc);
+ ret = amdgpu_smu_ras_send_msg(adev, smu_msg, param, &data[i]);
+ if (ret) {
+ RAS_DEV_ERR(adev, "ACA failed to read register[%d], offset:0x%x\n",
+ reg_idx, offset);
+ break;
+ }
+ }
+ up_read(&adev->reset_domain->sem);
+
+ if (!ret)
+ *val = (uint64_t)data[1] << 32 | data[0];
+ } else {
+ ret = -RAS_CORE_GPU_IN_MODE1_RESET;
+ }
+
+ return ret;
+}
+
+const struct ras_mp1_sys_func amdgpu_ras_mp1_sys_func_v13_0 = {
+ .mp1_get_valid_bank_count = mp1_v13_0_get_valid_bank_count,
+ .mp1_dump_valid_bank = mp1_v13_0_dump_valid_bank,
+};
+
diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mp1_v13_0.h b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mp1_v13_0.h
new file mode 100644
index 000000000000..71c614ae1ae4
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mp1_v13_0.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __AMDGPU_RAS_MP1_V13_0_H__
+#define __AMDGPU_RAS_MP1_V13_0_H__
+#include "ras.h"
+
+extern const struct ras_mp1_sys_func amdgpu_ras_mp1_sys_func_v13_0;
+
+#endif
diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_nbio_v7_9.c b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_nbio_v7_9.c
new file mode 100644
index 000000000000..2783f5875c7c
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_nbio_v7_9.c
@@ -0,0 +1,125 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "amdgpu_ras_mgr.h"
+#include "amdgpu_ras_nbio_v7_9.h"
+#include "nbio/nbio_7_9_0_offset.h"
+#include "nbio/nbio_7_9_0_sh_mask.h"
+#include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
+
+static int nbio_v7_9_set_ras_controller_irq_state(struct amdgpu_device *adev,
+ struct amdgpu_irq_src *src,
+ unsigned int type,
+ enum amdgpu_interrupt_state state)
+{
+ /* Dummy function, there is no initialization operation in driver */
+
+ return 0;
+}
+
+static int nbio_v7_9_process_ras_controller_irq(struct amdgpu_device *adev,
+ struct amdgpu_irq_src *source,
+ struct amdgpu_iv_entry *entry)
+{
+ /* By design, the ih cookie for ras_controller_irq should be written
+ * to BIFring instead of general iv ring. However, due to known bif ring
+ * hw bug, it has to be disabled. There is no chance the process function
+ * will be involked. Just left it as a dummy one.
+ */
+ return 0;
+}
+
+static int nbio_v7_9_set_ras_err_event_athub_irq_state(struct amdgpu_device *adev,
+ struct amdgpu_irq_src *src,
+ unsigned int type,
+ enum amdgpu_interrupt_state state)
+{
+ /* Dummy function, there is no initialization operation in driver */
+
+ return 0;
+}
+
+static int nbio_v7_9_process_err_event_athub_irq(struct amdgpu_device *adev,
+ struct amdgpu_irq_src *source,
+ struct amdgpu_iv_entry *entry)
+{
+ /* By design, the ih cookie for err_event_athub_irq should be written
+ * to BIFring instead of general iv ring. However, due to known bif ring
+ * hw bug, it has to be disabled. There is no chance the process function
+ * will be involked. Just left it as a dummy one.
+ */
+ return 0;
+}
+
+static const struct amdgpu_irq_src_funcs nbio_v7_9_ras_controller_irq_funcs = {
+ .set = nbio_v7_9_set_ras_controller_irq_state,
+ .process = nbio_v7_9_process_ras_controller_irq,
+};
+
+static const struct amdgpu_irq_src_funcs nbio_v7_9_ras_err_event_athub_irq_funcs = {
+ .set = nbio_v7_9_set_ras_err_event_athub_irq_state,
+ .process = nbio_v7_9_process_err_event_athub_irq,
+};
+
+static int nbio_v7_9_init_ras_controller_interrupt(struct ras_core_context *ras_core, bool state)
+{
+ struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
+ int r;
+
+ /* init the irq funcs */
+ adev->nbio.ras_controller_irq.funcs =
+ &nbio_v7_9_ras_controller_irq_funcs;
+ adev->nbio.ras_controller_irq.num_types = 1;
+
+ /* register ras controller interrupt */
+ r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_BIF,
+ NBIF_7_4__SRCID__RAS_CONTROLLER_INTERRUPT,
+ &adev->nbio.ras_controller_irq);
+
+ return r;
+}
+
+static int nbio_v7_9_init_ras_err_event_athub_interrupt(struct ras_core_context *ras_core,
+ bool state)
+{
+ struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
+ int r;
+
+ /* init the irq funcs */
+ adev->nbio.ras_err_event_athub_irq.funcs =
+ &nbio_v7_9_ras_err_event_athub_irq_funcs;
+ adev->nbio.ras_err_event_athub_irq.num_types = 1;
+
+ /* register ras err event athub interrupt */
+ r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_BIF,
+ NBIF_7_4__SRCID__ERREVENT_ATHUB_INTERRUPT,
+ &adev->nbio.ras_err_event_athub_irq);
+
+ return r;
+}
+
+const struct ras_nbio_sys_func amdgpu_ras_nbio_sys_func_v7_9 = {
+ .set_ras_controller_irq_state = nbio_v7_9_init_ras_controller_interrupt,
+ .set_ras_err_event_athub_irq_state = nbio_v7_9_init_ras_err_event_athub_interrupt,
+};
diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_nbio_v7_9.h b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_nbio_v7_9.h
new file mode 100644
index 000000000000..272259e9a0e7
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_nbio_v7_9.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __AMDGPU_RAS_NBIO_V7_9_H__
+#define __AMDGPU_RAS_NBIO_V7_9_H__
+
+extern const struct ras_nbio_sys_func amdgpu_ras_nbio_sys_func_v7_9;
+
+#endif
diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.c b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.c
new file mode 100644
index 000000000000..5782c007de71
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.c
@@ -0,0 +1,190 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "amdgpu.h"
+#include "amdgpu_reset.h"
+#include "amdgpu_xgmi.h"
+#include "ras_sys.h"
+#include "amdgpu_ras_mgr.h"
+#include "amdgpu_ras_process.h"
+
+#define RAS_MGR_RETIRE_PAGE_INTERVAL 100
+#define RAS_EVENT_PROCESS_TIMEOUT 1200
+
+static void ras_process_retire_page_dwork(struct work_struct *work)
+{
+ struct amdgpu_ras_mgr *ras_mgr =
+ container_of(work, struct amdgpu_ras_mgr, retire_page_dwork.work);
+ struct amdgpu_device *adev = ras_mgr->adev;
+ int ret;
+
+ if (amdgpu_ras_is_rma(adev))
+ return;
+
+ /* If gpu reset is ongoing, delay retiring the bad pages */
+ if (amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev)) {
+ schedule_delayed_work(&ras_mgr->retire_page_dwork,
+ msecs_to_jiffies(RAS_MGR_RETIRE_PAGE_INTERVAL * 3));
+ return;
+ }
+
+ ret = ras_umc_handle_bad_pages(ras_mgr->ras_core, NULL);
+ if (!ret)
+ schedule_delayed_work(&ras_mgr->retire_page_dwork,
+ msecs_to_jiffies(RAS_MGR_RETIRE_PAGE_INTERVAL));
+}
+
+int amdgpu_ras_process_init(struct amdgpu_device *adev)
+{
+ struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+
+ ras_mgr->is_paused = false;
+ init_completion(&ras_mgr->ras_event_done);
+
+ INIT_DELAYED_WORK(&ras_mgr->retire_page_dwork, ras_process_retire_page_dwork);
+
+ return 0;
+}
+
+int amdgpu_ras_process_fini(struct amdgpu_device *adev)
+{
+ struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+
+ ras_mgr->is_paused = false;
+ /* Save all cached bad pages to eeprom */
+ flush_delayed_work(&ras_mgr->retire_page_dwork);
+ cancel_delayed_work_sync(&ras_mgr->retire_page_dwork);
+ return 0;
+}
+
+int amdgpu_ras_process_handle_umc_interrupt(struct amdgpu_device *adev, void *data)
+{
+ struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+
+ if (!ras_mgr->ras_core)
+ return -EINVAL;
+
+ return ras_process_add_interrupt_req(ras_mgr->ras_core, NULL, true);
+}
+
+int amdgpu_ras_process_handle_unexpected_interrupt(struct amdgpu_device *adev, void *data)
+{
+ amdgpu_ras_set_fed(adev, true);
+ return amdgpu_ras_mgr_reset_gpu(adev, AMDGPU_RAS_GPU_RESET_MODE1_RESET);
+}
+
+int amdgpu_ras_process_handle_consumption_interrupt(struct amdgpu_device *adev, void *data)
+{
+ struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+ struct ras_ih_info *ih_info = (struct ras_ih_info *)data;
+ struct ras_event_req req;
+ uint64_t seqno;
+
+ if (!ih_info)
+ return -EINVAL;
+
+ memset(&req, 0, sizeof(req));
+ req.block = ih_info->block;
+ req.data = ih_info->data;
+ req.pasid = ih_info->pasid;
+ req.pasid_fn = ih_info->pasid_fn;
+ req.reset = ih_info->reset;
+
+ seqno = ras_core_get_seqno(ras_mgr->ras_core,
+ RAS_SEQNO_TYPE_POISON_CONSUMPTION, false);
+
+ /* When the ACA register cannot be read from FW, the poison
+ * consumption seqno in the fifo will not pop up, so it is
+ * necessary to check whether the seqno is the previous seqno.
+ */
+ if (seqno == ras_mgr->last_poison_consumption_seqno) {
+ /* Pop and discard the previous seqno */
+ ras_core_get_seqno(ras_mgr->ras_core,
+ RAS_SEQNO_TYPE_POISON_CONSUMPTION, true);
+ seqno = ras_core_get_seqno(ras_mgr->ras_core,
+ RAS_SEQNO_TYPE_POISON_CONSUMPTION, false);
+ }
+ ras_mgr->last_poison_consumption_seqno = seqno;
+ req.seqno = seqno;
+
+ return ras_process_add_interrupt_req(ras_mgr->ras_core, &req, false);
+}
+
+int amdgpu_ras_process_begin(struct amdgpu_device *adev)
+{
+ struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+
+ if (ras_mgr->is_paused)
+ return -EAGAIN;
+
+ reinit_completion(&ras_mgr->ras_event_done);
+ return 0;
+}
+
+int amdgpu_ras_process_end(struct amdgpu_device *adev)
+{
+ struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+
+ complete(&ras_mgr->ras_event_done);
+ return 0;
+}
+
+int amdgpu_ras_process_pre_reset(struct amdgpu_device *adev)
+{
+ struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+ long rc;
+
+ if (!ras_mgr || !ras_mgr->ras_core)
+ return -EINVAL;
+
+ if (!ras_mgr->ras_core->is_initialized)
+ return -EPERM;
+
+ ras_mgr->is_paused = true;
+
+ /* Wait for RAS event processing to complete */
+ rc = wait_for_completion_interruptible_timeout(&ras_mgr->ras_event_done,
+ msecs_to_jiffies(RAS_EVENT_PROCESS_TIMEOUT));
+ if (rc <= 0)
+ RAS_DEV_WARN(adev, "Waiting for ras process to complete %s\n",
+ rc ? "interrupted" : "timeout");
+
+ flush_delayed_work(&ras_mgr->retire_page_dwork);
+ return 0;
+}
+
+int amdgpu_ras_process_post_reset(struct amdgpu_device *adev)
+{
+ struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+
+ if (!ras_mgr || !ras_mgr->ras_core)
+ return -EINVAL;
+
+ if (!ras_mgr->ras_core->is_initialized)
+ return -EPERM;
+
+ ras_mgr->is_paused = false;
+
+ schedule_delayed_work(&ras_mgr->retire_page_dwork, 0);
+ return 0;
+}
diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.h b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.h
new file mode 100644
index 000000000000..d55cdaeac441
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright (c) 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#ifndef __AMDGPU_RAS_PROCESS_H__
+#define __AMDGPU_RAS_PROCESS_H__
+#include "ras_process.h"
+#include "amdgpu_ras_mgr.h"
+
+enum ras_ih_type;
+int amdgpu_ras_process_init(struct amdgpu_device *adev);
+int amdgpu_ras_process_fini(struct amdgpu_device *adev);
+int amdgpu_ras_process_handle_umc_interrupt(struct amdgpu_device *adev,
+ void *data);
+int amdgpu_ras_process_handle_unexpected_interrupt(struct amdgpu_device *adev,
+ void *data);
+int amdgpu_ras_process_handle_consumption_interrupt(struct amdgpu_device *adev,
+ void *data);
+int amdgpu_ras_process_begin(struct amdgpu_device *adev);
+int amdgpu_ras_process_end(struct amdgpu_device *adev);
+int amdgpu_ras_process_pre_reset(struct amdgpu_device *adev);
+int amdgpu_ras_process_post_reset(struct amdgpu_device *adev);
+#endif
diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c
new file mode 100644
index 000000000000..45ed8c3b5563
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c
@@ -0,0 +1,279 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#include "ras_sys.h"
+#include "amdgpu_ras_mgr.h"
+#include "amdgpu_ras.h"
+#include "amdgpu_reset.h"
+
+static int amdgpu_ras_sys_detect_fatal_event(struct ras_core_context *ras_core, void *data)
+{
+ struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
+ int ret;
+ uint64_t seq_no;
+
+ ret = amdgpu_ras_global_ras_isr(adev);
+ if (ret)
+ return ret;
+
+ seq_no = amdgpu_ras_mgr_gen_ras_event_seqno(adev, RAS_SEQNO_TYPE_UE);
+ RAS_DEV_INFO(adev,
+ "{%llu} Uncorrectable hardware error(ERREVENT_ATHUB_INTERRUPT) detected!\n",
+ seq_no);
+
+ return amdgpu_ras_process_handle_unexpected_interrupt(adev, data);
+}
+
+static int amdgpu_ras_sys_poison_consumption_event(struct ras_core_context *ras_core,
+ void *data)
+{
+ struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
+ struct ras_event_req *req = (struct ras_event_req *)data;
+ pasid_notify pasid_fn;
+
+ if (!req)
+ return -EINVAL;
+
+ if (req->pasid_fn) {
+ pasid_fn = (pasid_notify)req->pasid_fn;
+ pasid_fn(adev, req->pasid, req->data);
+ }
+
+ return 0;
+}
+
+static int amdgpu_ras_sys_gen_seqno(struct ras_core_context *ras_core,
+ enum ras_seqno_type seqno_type, uint64_t *seqno)
+{
+ struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
+ struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev);
+ struct ras_event_manager *event_mgr;
+ struct ras_event_state *event_state;
+ struct amdgpu_hive_info *hive;
+ enum ras_event_type event_type;
+ uint64_t seq_no;
+
+ if (!ras_mgr || !seqno ||
+ (seqno_type >= RAS_SEQNO_TYPE_COUNT_MAX))
+ return -EINVAL;
+
+ switch (seqno_type) {
+ case RAS_SEQNO_TYPE_UE:
+ event_type = RAS_EVENT_TYPE_FATAL;
+ break;
+ case RAS_SEQNO_TYPE_CE:
+ case RAS_SEQNO_TYPE_DE:
+ event_type = RAS_EVENT_TYPE_POISON_CREATION;
+ break;
+ case RAS_SEQNO_TYPE_POISON_CONSUMPTION:
+ event_type = RAS_EVENT_TYPE_POISON_CONSUMPTION;
+ break;
+ default:
+ event_type = RAS_EVENT_TYPE_INVALID;
+ break;
+ }
+
+ hive = amdgpu_get_xgmi_hive(adev);
+ event_mgr = hive ? &hive->event_mgr : &ras_mgr->ras_event_mgr;
+ event_state = &event_mgr->event_state[event_type];
+ if ((event_type == RAS_EVENT_TYPE_FATAL) && amdgpu_ras_in_recovery(adev)) {
+ seq_no = event_state->last_seqno;
+ } else {
+ seq_no = atomic64_inc_return(&event_mgr->seqno);
+ event_state->last_seqno = seq_no;
+ atomic64_inc(&event_state->count);
+ }
+ amdgpu_put_xgmi_hive(hive);
+
+ *seqno = seq_no;
+ return 0;
+
+}
+
+static int amdgpu_ras_sys_event_notifier(struct ras_core_context *ras_core,
+ enum ras_notify_event event_id, void *data)
+{
+ struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(ras_core->dev);
+ int ret = 0;
+
+ switch (event_id) {
+ case RAS_EVENT_ID__BAD_PAGE_DETECTED:
+ schedule_delayed_work(&ras_mgr->retire_page_dwork, 0);
+ break;
+ case RAS_EVENT_ID__POISON_CONSUMPTION:
+ amdgpu_ras_sys_poison_consumption_event(ras_core, data);
+ break;
+ case RAS_EVENT_ID__RESERVE_BAD_PAGE:
+ ret = amdgpu_ras_reserve_page(ras_core->dev, *(uint64_t *)data);
+ break;
+ case RAS_EVENT_ID__FATAL_ERROR_DETECTED:
+ ret = amdgpu_ras_sys_detect_fatal_event(ras_core, data);
+ break;
+ case RAS_EVENT_ID__UPDATE_BAD_PAGE_NUM:
+ ret = amdgpu_dpm_send_hbm_bad_pages_num(ras_core->dev, *(uint32_t *)data);
+ break;
+ case RAS_EVENT_ID__UPDATE_BAD_CHANNEL_BITMAP:
+ ret = amdgpu_dpm_send_hbm_bad_channel_flag(ras_core->dev, *(uint32_t *)data);
+ break;
+ case RAS_EVENT_ID__DEVICE_RMA:
+ ras_log_ring_add_log_event(ras_core, RAS_LOG_EVENT_RMA, NULL, NULL);
+ ret = amdgpu_dpm_send_rma_reason(ras_core->dev);
+ break;
+ case RAS_EVENT_ID__RESET_GPU:
+ ret = amdgpu_ras_mgr_reset_gpu(ras_core->dev, *(uint32_t *)data);
+ break;
+ case RAS_EVENT_ID__RAS_EVENT_PROC_BEGIN:
+ ret = amdgpu_ras_process_begin(ras_core->dev);
+ break;
+ case RAS_EVENT_ID__RAS_EVENT_PROC_END:
+ ret = amdgpu_ras_process_end(ras_core->dev);
+ break;
+ default:
+ RAS_DEV_WARN(ras_core->dev, "Invalid ras notify event:%d\n", event_id);
+ break;
+ }
+
+ return ret;
+}
+
+static u64 amdgpu_ras_sys_get_utc_second_timestamp(struct ras_core_context *ras_core)
+{
+ return ktime_get_real_seconds();
+}
+
+static int amdgpu_ras_sys_check_gpu_status(struct ras_core_context *ras_core,
+ uint32_t *status)
+{
+ struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
+ uint32_t gpu_status = 0;
+
+ if (amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev))
+ gpu_status |= RAS_GPU_STATUS__IN_RESET;
+
+ if (amdgpu_sriov_vf(adev))
+ gpu_status |= RAS_GPU_STATUS__IS_VF;
+
+ *status = gpu_status;
+
+ return 0;
+}
+
+static int amdgpu_ras_sys_get_device_system_info(struct ras_core_context *ras_core,
+ struct device_system_info *dev_info)
+{
+ struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
+
+ dev_info->device_id = adev->pdev->device;
+ dev_info->vendor_id = adev->pdev->vendor;
+ dev_info->socket_id = adev->smuio.funcs->get_socket_id(adev);
+
+ return 0;
+}
+
+static int amdgpu_ras_sys_gpu_reset_lock(struct ras_core_context *ras_core,
+ bool down, bool try)
+{
+ struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
+ int ret = 0;
+
+ if (down && try)
+ ret = down_read_trylock(&adev->reset_domain->sem);
+ else if (down)
+ down_read(&adev->reset_domain->sem);
+ else
+ up_read(&adev->reset_domain->sem);
+
+ return ret;
+}
+
+static bool amdgpu_ras_sys_detect_ras_interrupt(struct ras_core_context *ras_core)
+{
+ return !!atomic_read(&amdgpu_ras_in_intr);
+}
+
+static int amdgpu_ras_sys_get_gpu_mem(struct ras_core_context *ras_core,
+ enum gpu_mem_type mem_type, struct gpu_mem_block *gpu_mem)
+{
+ struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev;
+ struct psp_context *psp = &adev->psp;
+ struct psp_ring *psp_ring;
+ struct ta_mem_context *mem_ctx;
+
+ if (mem_type == GPU_MEM_TYPE_RAS_PSP_RING) {
+ psp_ring = &psp->km_ring;
+ gpu_mem->mem_bo = adev->firmware.rbuf;
+ gpu_mem->mem_size = psp_ring->ring_size;
+ gpu_mem->mem_mc_addr = psp_ring->ring_mem_mc_addr;
+ gpu_mem->mem_cpu_addr = psp_ring->ring_mem;
+ } else if (mem_type == GPU_MEM_TYPE_RAS_PSP_CMD) {
+ gpu_mem->mem_bo = psp->cmd_buf_bo;
+ gpu_mem->mem_size = PSP_CMD_BUFFER_SIZE;
+ gpu_mem->mem_mc_addr = psp->cmd_buf_mc_addr;
+ gpu_mem->mem_cpu_addr = psp->cmd_buf_mem;
+ } else if (mem_type == GPU_MEM_TYPE_RAS_PSP_FENCE) {
+ gpu_mem->mem_bo = psp->fence_buf_bo;
+ gpu_mem->mem_size = PSP_FENCE_BUFFER_SIZE;
+ gpu_mem->mem_mc_addr = psp->fence_buf_mc_addr;
+ gpu_mem->mem_cpu_addr = psp->fence_buf;
+ } else if (mem_type == GPU_MEM_TYPE_RAS_TA_FW) {
+ gpu_mem->mem_bo = psp->fw_pri_bo;
+ gpu_mem->mem_size = PSP_1_MEG;
+ gpu_mem->mem_mc_addr = psp->fw_pri_mc_addr;
+ gpu_mem->mem_cpu_addr = psp->fw_pri_buf;
+ } else if (mem_type == GPU_MEM_TYPE_RAS_TA_CMD) {
+ mem_ctx = &psp->ras_context.context.mem_context;
+ gpu_mem->mem_bo = mem_ctx->shared_bo;
+ gpu_mem->mem_size = mem_ctx->shared_mem_size;
+ gpu_mem->mem_mc_addr = mem_ctx->shared_mc_addr;
+ gpu_mem->mem_cpu_addr = mem_ctx->shared_buf;
+ } else {
+ return -EINVAL;
+ }
+
+ if (!gpu_mem->mem_bo || !gpu_mem->mem_size ||
+ !gpu_mem->mem_mc_addr || !gpu_mem->mem_cpu_addr) {
+ RAS_DEV_ERR(ras_core->dev, "The ras psp gpu memory is invalid!\n");
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static int amdgpu_ras_sys_put_gpu_mem(struct ras_core_context *ras_core,
+ enum gpu_mem_type mem_type, struct gpu_mem_block *gpu_mem)
+{
+
+ return 0;
+}
+
+const struct ras_sys_func amdgpu_ras_sys_fn = {
+ .ras_notifier = amdgpu_ras_sys_event_notifier,
+ .get_utc_second_timestamp = amdgpu_ras_sys_get_utc_second_timestamp,
+ .gen_seqno = amdgpu_ras_sys_gen_seqno,
+ .check_gpu_status = amdgpu_ras_sys_check_gpu_status,
+ .get_device_system_info = amdgpu_ras_sys_get_device_system_info,
+ .gpu_reset_lock = amdgpu_ras_sys_gpu_reset_lock,
+ .detect_ras_interrupt = amdgpu_ras_sys_detect_ras_interrupt,
+ .get_gpu_mem = amdgpu_ras_sys_get_gpu_mem,
+ .put_gpu_mem = amdgpu_ras_sys_put_gpu_mem,
+};
diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h b/drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h
new file mode 100644
index 000000000000..8156531a7b63
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h
@@ -0,0 +1,110 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __RAS_SYS_H__
+#define __RAS_SYS_H__
+#include <linux/stdarg.h>
+#include <linux/printk.h>
+#include <linux/dev_printk.h>
+#include <linux/mempool.h>
+#include "amdgpu.h"
+
+#define RAS_DEV_ERR(device, fmt, ...) \
+ do { \
+ if (device) \
+ dev_err(((struct amdgpu_device *)device)->dev, fmt, ##__VA_ARGS__); \
+ else \
+ printk(KERN_ERR fmt, ##__VA_ARGS__); \
+ } while (0)
+
+#define RAS_DEV_WARN(device, fmt, ...) \
+ do { \
+ if (device) \
+ dev_warn(((struct amdgpu_device *)device)->dev, fmt, ##__VA_ARGS__); \
+ else \
+ printk(KERN_WARNING fmt, ##__VA_ARGS__); \
+ } while (0)
+
+#define RAS_DEV_INFO(device, fmt, ...) \
+ do { \
+ if (device) \
+ dev_info(((struct amdgpu_device *)device)->dev, fmt, ##__VA_ARGS__); \
+ else \
+ printk(KERN_INFO fmt, ##__VA_ARGS__); \
+ } while (0)
+
+#define RAS_DEV_DBG(device, fmt, ...) \
+ do { \
+ if (device) \
+ dev_dbg(((struct amdgpu_device *)device)->dev, fmt, ##__VA_ARGS__); \
+ else \
+ printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
+ } while (0)
+
+#define RAS_INFO(fmt, ...) printk(KERN_INFO fmt, ##__VA_ARGS__)
+
+#define RAS_DEV_RREG32_SOC15(dev, ip, inst, reg) \
+({ \
+ struct amdgpu_device *adev = (struct amdgpu_device *)dev; \
+ __RREG32_SOC15_RLC__(adev->reg_offset[ip##_HWIP][inst][reg##_BASE_IDX] + reg, \
+ 0, ip##_HWIP, inst); \
+})
+
+#define RAS_DEV_WREG32_SOC15(dev, ip, inst, reg, value) \
+({ \
+ struct amdgpu_device *adev = (struct amdgpu_device *)dev; \
+ __WREG32_SOC15_RLC__((adev->reg_offset[ip##_HWIP][inst][reg##_BASE_IDX] + reg), \
+ value, 0, ip##_HWIP, inst); \
+})
+
+/* GET_INST returns the physical instance corresponding to a logical instance */
+#define RAS_GET_INST(dev, ip, inst) \
+({ \
+ struct amdgpu_device *adev = (struct amdgpu_device *)dev; \
+ adev->ip_map.logical_to_dev_inst ? \
+ adev->ip_map.logical_to_dev_inst(adev, ip##_HWIP, inst) : inst; \
+})
+
+#define RAS_GET_MASK(dev, ip, mask) \
+({ \
+ struct amdgpu_device *adev = (struct amdgpu_device *)dev; \
+ (adev->ip_map.logical_to_dev_mask ? \
+ adev->ip_map.logical_to_dev_mask(adev, ip##_HWIP, mask) : mask); \
+})
+
+static inline void *ras_radix_tree_delete_iter(struct radix_tree_root *root, void *iter)
+{
+ return radix_tree_delete(root, ((struct radix_tree_iter *)iter)->index);
+}
+
+static inline long ras_wait_event_interruptible_timeout(void *wq_head,
+ int (*condition)(void *param), void *param, unsigned int timeout)
+{
+ return wait_event_interruptible_timeout(*(wait_queue_head_t *)wq_head,
+ condition(param), timeout);
+}
+
+extern const struct ras_sys_func amdgpu_ras_sys_fn;
+
+#endif
diff --git a/drivers/gpu/drm/amd/ras/rascore/Makefile b/drivers/gpu/drm/amd/ras/rascore/Makefile
index e69de29bb2d1..e826a1f86424 100644
--- a/drivers/gpu/drm/amd/ras/rascore/Makefile
+++ b/drivers/gpu/drm/amd/ras/rascore/Makefile
@@ -0,0 +1,44 @@
+#
+# Copyright 2025 Advanced Micro Devices, Inc.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+# THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+RAS_CORE_FILES = ras_core.o \
+ ras_mp1.o \
+ ras_mp1_v13_0.o \
+ ras_aca.o \
+ ras_aca_v1_0.o \
+ ras_eeprom.o \
+ ras_umc.o \
+ ras_umc_v12_0.o \
+ ras_cmd.o \
+ ras_gfx.o \
+ ras_gfx_v9_0.o \
+ ras_process.o \
+ ras_nbio.o \
+ ras_nbio_v7_9.o \
+ ras_log_ring.o \
+ ras_cper.o \
+ ras_psp.o \
+ ras_psp_v13_0.o
+
+
+RAS_CORE = $(addprefix $(AMD_GPU_RAS_PATH)/rascore/,$(RAS_CORE_FILES))
+
+AMD_GPU_RAS_FILES += $(RAS_CORE)
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras.h b/drivers/gpu/drm/amd/ras/rascore/ras.h
new file mode 100644
index 000000000000..3396b2e0949d
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/rascore/ras.h
@@ -0,0 +1,370 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __RAS_H__
+#define __RAS_H__
+#include "ras_sys.h"
+#include "ras_umc.h"
+#include "ras_aca.h"
+#include "ras_eeprom.h"
+#include "ras_core_status.h"
+#include "ras_process.h"
+#include "ras_gfx.h"
+#include "ras_cmd.h"
+#include "ras_nbio.h"
+#include "ras_mp1.h"
+#include "ras_psp.h"
+#include "ras_log_ring.h"
+
+#define RAS_HW_ERR "[Hardware Error]: "
+
+#define RAS_GPU_PAGE_SHIFT 12
+#define RAS_ADDR_TO_PFN(addr) ((addr) >> RAS_GPU_PAGE_SHIFT)
+#define RAS_PFN_TO_ADDR(pfn) ((pfn) << RAS_GPU_PAGE_SHIFT)
+
+#define RAS_CORE_RESET_GPU 0x10000
+
+#define GPU_RESET_CAUSE_POISON (RAS_CORE_RESET_GPU | 0x0001)
+#define GPU_RESET_CAUSE_FATAL (RAS_CORE_RESET_GPU | 0x0002)
+#define GPU_RESET_CAUSE_RMA (RAS_CORE_RESET_GPU | 0x0004)
+
+enum ras_block_id {
+ RAS_BLOCK_ID__UMC = 0,
+ RAS_BLOCK_ID__SDMA,
+ RAS_BLOCK_ID__GFX,
+ RAS_BLOCK_ID__MMHUB,
+ RAS_BLOCK_ID__ATHUB,
+ RAS_BLOCK_ID__PCIE_BIF,
+ RAS_BLOCK_ID__HDP,
+ RAS_BLOCK_ID__XGMI_WAFL,
+ RAS_BLOCK_ID__DF,
+ RAS_BLOCK_ID__SMN,
+ RAS_BLOCK_ID__SEM,
+ RAS_BLOCK_ID__MP0,
+ RAS_BLOCK_ID__MP1,
+ RAS_BLOCK_ID__FUSE,
+ RAS_BLOCK_ID__MCA,
+ RAS_BLOCK_ID__VCN,
+ RAS_BLOCK_ID__JPEG,
+ RAS_BLOCK_ID__IH,
+ RAS_BLOCK_ID__MPIO,
+
+ RAS_BLOCK_ID__LAST
+};
+
+enum ras_ecc_err_type {
+ RAS_ECC_ERR__NONE = 0,
+ RAS_ECC_ERR__PARITY = 1,
+ RAS_ECC_ERR__SINGLE_CORRECTABLE = 2,
+ RAS_ECC_ERR__MULTI_UNCORRECTABLE = 4,
+ RAS_ECC_ERR__POISON = 8,
+};
+
+enum ras_err_type {
+ RAS_ERR_TYPE__UE = 0,
+ RAS_ERR_TYPE__CE,
+ RAS_ERR_TYPE__DE,
+ RAS_ERR_TYPE__LAST
+};
+
+enum ras_seqno_type {
+ RAS_SEQNO_TYPE_INVALID = 0,
+ RAS_SEQNO_TYPE_UE,
+ RAS_SEQNO_TYPE_CE,
+ RAS_SEQNO_TYPE_DE,
+ RAS_SEQNO_TYPE_POISON_CONSUMPTION,
+ RAS_SEQNO_TYPE_COUNT_MAX,
+};
+
+enum ras_seqno_fifo {
+ SEQNO_FIFO_INVALID = 0,
+ SEQNO_FIFO_POISON_CREATION,
+ SEQNO_FIFO_POISON_CONSUMPTION,
+ SEQNO_FIFO_COUNT_MAX
+};
+
+enum ras_notify_event {
+ RAS_EVENT_ID__NONE,
+ RAS_EVENT_ID__BAD_PAGE_DETECTED,
+ RAS_EVENT_ID__POISON_CONSUMPTION,
+ RAS_EVENT_ID__RESERVE_BAD_PAGE,
+ RAS_EVENT_ID__DEVICE_RMA,
+ RAS_EVENT_ID__UPDATE_BAD_PAGE_NUM,
+ RAS_EVENT_ID__UPDATE_BAD_CHANNEL_BITMAP,
+ RAS_EVENT_ID__FATAL_ERROR_DETECTED,
+ RAS_EVENT_ID__RESET_GPU,
+ RAS_EVENT_ID__RESET_VF,
+ RAS_EVENT_ID__RAS_EVENT_PROC_BEGIN,
+ RAS_EVENT_ID__RAS_EVENT_PROC_END,
+};
+
+enum ras_gpu_status {
+ RAS_GPU_STATUS__NOT_READY = 0,
+ RAS_GPU_STATUS__READY = 0x1,
+ RAS_GPU_STATUS__IN_RESET = 0x2,
+ RAS_GPU_STATUS__IS_RMA = 0x4,
+ RAS_GPU_STATUS__IS_VF = 0x8,
+};
+
+struct ras_core_context;
+struct ras_bank_ecc;
+struct ras_umc;
+struct ras_aca;
+struct ras_process;
+struct ras_nbio;
+struct ras_log_ring;
+struct ras_psp;
+
+struct ras_mp1_sys_func {
+ int (*mp1_get_valid_bank_count)(struct ras_core_context *ras_core,
+ u32 msg, u32 *count);
+ int (*mp1_dump_valid_bank)(struct ras_core_context *ras_core,
+ u32 msg, u32 idx, u32 reg_idx, u64 *val);
+};
+
+struct ras_eeprom_sys_func {
+ int (*eeprom_i2c_xfer)(struct ras_core_context *ras_core,
+ u32 eeprom_addr, u8 *eeprom_buf, u32 buf_size, bool read);
+ int (*update_eeprom_i2c_config)(struct ras_core_context *ras_core);
+};
+
+struct ras_nbio_sys_func {
+ int (*set_ras_controller_irq_state)(struct ras_core_context *ras_core,
+ bool state);
+ int (*set_ras_err_event_athub_irq_state)(struct ras_core_context *ras_core,
+ bool state);
+};
+
+struct ras_time {
+ int tm_sec;
+ int tm_min;
+ int tm_hour;
+ int tm_mday;
+ int tm_mon;
+ long tm_year;
+};
+
+struct device_system_info {
+ uint32_t device_id;
+ uint32_t vendor_id;
+ uint32_t socket_id;
+};
+
+enum gpu_mem_type {
+ GPU_MEM_TYPE_DEFAULT,
+ GPU_MEM_TYPE_RAS_PSP_RING,
+ GPU_MEM_TYPE_RAS_PSP_CMD,
+ GPU_MEM_TYPE_RAS_PSP_FENCE,
+ GPU_MEM_TYPE_RAS_TA_FW,
+ GPU_MEM_TYPE_RAS_TA_CMD,
+};
+
+struct ras_psp_sys_func {
+ int (*get_ras_psp_system_status)(struct ras_core_context *ras_core,
+ struct ras_psp_sys_status *status);
+ int (*get_ras_ta_init_param)(struct ras_core_context *ras_core,
+ struct ras_ta_init_param *ras_ta_param);
+};
+
+struct ras_sys_func {
+ int (*gpu_reset_lock)(struct ras_core_context *ras_core,
+ bool down, bool try);
+ int (*check_gpu_status)(struct ras_core_context *ras_core,
+ uint32_t *status);
+ int (*gen_seqno)(struct ras_core_context *ras_core,
+ enum ras_seqno_type seqno_type, uint64_t *seqno);
+ int (*async_handle_ras_event)(struct ras_core_context *ras_core, void *data);
+ int (*ras_notifier)(struct ras_core_context *ras_core,
+ enum ras_notify_event event_id, void *data);
+ u64 (*get_utc_second_timestamp)(struct ras_core_context *ras_core);
+ int (*get_device_system_info)(struct ras_core_context *ras_core,
+ struct device_system_info *dev_info);
+ bool (*detect_ras_interrupt)(struct ras_core_context *ras_core);
+ int (*get_gpu_mem)(struct ras_core_context *ras_core,
+ enum gpu_mem_type mem_type, struct gpu_mem_block *gpu_mem);
+ int (*put_gpu_mem)(struct ras_core_context *ras_core,
+ enum gpu_mem_type mem_type, struct gpu_mem_block *gpu_mem);
+};
+
+struct ras_ecc_count {
+ uint64_t new_ce_count;
+ uint64_t total_ce_count;
+ uint64_t new_ue_count;
+ uint64_t total_ue_count;
+ uint64_t new_de_count;
+ uint64_t total_de_count;
+};
+
+struct ras_bank_ecc {
+ uint32_t nps;
+ uint64_t seq_no;
+ uint64_t status;
+ uint64_t ipid;
+ uint64_t addr;
+};
+
+struct ras_bank_ecc_node {
+ struct list_head node;
+ struct ras_bank_ecc ecc;
+};
+
+struct ras_aca_config {
+ u32 socket_num_per_hive;
+ u32 aid_num_per_socket;
+ u32 xcd_num_per_aid;
+};
+
+struct ras_mp1_config {
+ const struct ras_mp1_sys_func *mp1_sys_fn;
+};
+
+struct ras_nbio_config {
+ const struct ras_nbio_sys_func *nbio_sys_fn;
+};
+
+struct ras_psp_config {
+ const struct ras_psp_sys_func *psp_sys_fn;
+};
+
+struct ras_umc_config {
+ uint32_t umc_vram_type;
+};
+
+struct ras_eeprom_config {
+ const struct ras_eeprom_sys_func *eeprom_sys_fn;
+ int eeprom_record_threshold_config;
+ uint32_t eeprom_record_threshold_count;
+ void *eeprom_i2c_adapter;
+ u32 eeprom_i2c_addr;
+ u32 eeprom_i2c_port;
+ u16 max_i2c_read_len;
+ u16 max_i2c_write_len;
+};
+
+struct ras_core_config {
+ u32 aca_ip_version;
+ u32 umc_ip_version;
+ u32 mp1_ip_version;
+ u32 gfx_ip_version;
+ u32 nbio_ip_version;
+ u32 psp_ip_version;
+
+ bool poison_supported;
+ bool ras_eeprom_supported;
+ const struct ras_sys_func *sys_fn;
+
+ struct ras_aca_config aca_cfg;
+ struct ras_mp1_config mp1_cfg;
+ struct ras_nbio_config nbio_cfg;
+ struct ras_psp_config psp_cfg;
+ struct ras_eeprom_config eeprom_cfg;
+ struct ras_umc_config umc_cfg;
+};
+
+struct ras_core_context {
+ void *dev;
+ struct ras_core_config *config;
+ u32 socket_num_per_hive;
+ u32 aid_num_per_socket;
+ u32 xcd_num_per_aid;
+ int max_ue_banks_per_query;
+ int max_ce_banks_per_query;
+ struct ras_aca ras_aca;
+
+ bool ras_eeprom_supported;
+ struct ras_eeprom_control ras_eeprom;
+
+ struct ras_psp ras_psp;
+ struct ras_umc ras_umc;
+ struct ras_nbio ras_nbio;
+ struct ras_gfx ras_gfx;
+ struct ras_mp1 ras_mp1;
+ struct ras_process ras_proc;
+ struct ras_cmd_mgr ras_cmd;
+ struct ras_log_ring ras_log_ring;
+
+ const struct ras_sys_func *sys_fn;
+
+ /* is poison mode supported */
+ bool poison_supported;
+
+ bool is_rma;
+ bool is_initialized;
+
+ struct kfifo de_seqno_fifo;
+ struct kfifo consumption_seqno_fifo;
+ spinlock_t seqno_lock;
+
+ bool ras_core_enabled;
+};
+
+struct ras_core_context *ras_core_create(struct ras_core_config *init_config);
+void ras_core_destroy(struct ras_core_context *ras_core);
+int ras_core_sw_init(struct ras_core_context *ras_core);
+int ras_core_sw_fini(struct ras_core_context *ras_core);
+int ras_core_hw_init(struct ras_core_context *ras_core);
+int ras_core_hw_fini(struct ras_core_context *ras_core);
+bool ras_core_is_ready(struct ras_core_context *ras_core);
+uint64_t ras_core_gen_seqno(struct ras_core_context *ras_core,
+ enum ras_seqno_type seqno_type);
+uint64_t ras_core_get_seqno(struct ras_core_context *ras_core,
+ enum ras_seqno_type seqno_type, bool pop);
+
+int ras_core_put_seqno(struct ras_core_context *ras_core,
+ enum ras_seqno_type seqno_type, uint64_t seqno);
+
+int ras_core_update_ecc_info(struct ras_core_context *ras_core);
+int ras_core_query_block_ecc_data(struct ras_core_context *ras_core,
+ enum ras_block_id block, struct ras_ecc_count *ecc_count);
+
+bool ras_core_gpu_in_reset(struct ras_core_context *ras_core);
+bool ras_core_gpu_is_rma(struct ras_core_context *ras_core);
+bool ras_core_gpu_is_vf(struct ras_core_context *ras_core);
+bool ras_core_handle_nbio_irq(struct ras_core_context *ras_core, void *data);
+int ras_core_handle_fatal_error(struct ras_core_context *ras_core);
+
+uint32_t ras_core_get_curr_nps_mode(struct ras_core_context *ras_core);
+const char *ras_core_get_ras_block_name(enum ras_block_id block_id);
+int ras_core_convert_timestamp_to_time(struct ras_core_context *ras_core,
+ uint64_t timestamp, struct ras_time *tm);
+
+int ras_core_set_status(struct ras_core_context *ras_core, bool enable);
+bool ras_core_is_enabled(struct ras_core_context *ras_core);
+uint64_t ras_core_get_utc_second_timestamp(struct ras_core_context *ras_core);
+int ras_core_translate_soc_pa_and_bank(struct ras_core_context *ras_core,
+ uint64_t *soc_pa, struct umc_bank_addr *bank_addr, bool bank_to_pa);
+bool ras_core_ras_interrupt_detected(struct ras_core_context *ras_core);
+int ras_core_get_gpu_mem(struct ras_core_context *ras_core,
+ enum gpu_mem_type mem_type, struct gpu_mem_block *gpu_mem);
+int ras_core_put_gpu_mem(struct ras_core_context *ras_core,
+ enum gpu_mem_type mem_type, struct gpu_mem_block *gpu_mem);
+bool ras_core_check_safety_watermark(struct ras_core_context *ras_core);
+int ras_core_down_trylock_gpu_reset_lock(struct ras_core_context *ras_core);
+void ras_core_down_gpu_reset_lock(struct ras_core_context *ras_core);
+void ras_core_up_gpu_reset_lock(struct ras_core_context *ras_core);
+int ras_core_event_notify(struct ras_core_context *ras_core,
+ enum ras_notify_event event_id, void *data);
+int ras_core_get_device_system_info(struct ras_core_context *ras_core,
+ struct device_system_info *dev_info);
+#endif
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_aca.c b/drivers/gpu/drm/amd/ras/rascore/ras_aca.c
new file mode 100644
index 000000000000..e433c70d2989
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_aca.c
@@ -0,0 +1,672 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#include "ras.h"
+#include "ras_aca.h"
+#include "ras_aca_v1_0.h"
+#include "ras_mp1_v13_0.h"
+
+#define ACA_MARK_FATAL_FLAG 0x100
+#define ACA_MARK_UE_READ_FLAG 0x1
+
+#define blk_name(block_id) ras_core_get_ras_block_name(block_id)
+
+static struct aca_regs_dump {
+ const char *name;
+ int reg_idx;
+} aca_regs[] = {
+ {"CONTROL", ACA_REG_IDX__CTL},
+ {"STATUS", ACA_REG_IDX__STATUS},
+ {"ADDR", ACA_REG_IDX__ADDR},
+ {"MISC", ACA_REG_IDX__MISC0},
+ {"CONFIG", ACA_REG_IDX__CONFG},
+ {"IPID", ACA_REG_IDX__IPID},
+ {"SYND", ACA_REG_IDX__SYND},
+ {"DESTAT", ACA_REG_IDX__DESTAT},
+ {"DEADDR", ACA_REG_IDX__DEADDR},
+ {"CONTROL_MASK", ACA_REG_IDX__CTL_MASK},
+};
+
+
+static void aca_report_ecc_info(struct ras_core_context *ras_core,
+ u64 seq_no, u32 blk, u32 skt, u32 aid,
+ struct aca_aid_ecc *aid_ecc,
+ struct aca_bank_ecc *new_ecc)
+{
+ struct aca_ecc_count ecc_count = {0};
+
+ ecc_count.new_ue_count = new_ecc->ue_count;
+ ecc_count.new_de_count = new_ecc->de_count;
+ ecc_count.new_ce_count = new_ecc->ce_count;
+ if (blk == RAS_BLOCK_ID__GFX) {
+ struct aca_ecc_count *xcd_ecc;
+ int xcd_id;
+
+ for (xcd_id = 0; xcd_id < aid_ecc->xcd.xcd_num; xcd_id++) {
+ xcd_ecc = &aid_ecc->xcd.xcd[xcd_id].ecc_err;
+ ecc_count.total_ue_count += xcd_ecc->total_ue_count;
+ ecc_count.total_de_count += xcd_ecc->total_de_count;
+ ecc_count.total_ce_count += xcd_ecc->total_ce_count;
+ }
+ } else {
+ ecc_count.total_ue_count = aid_ecc->ecc_err.total_ue_count;
+ ecc_count.total_de_count = aid_ecc->ecc_err.total_de_count;
+ ecc_count.total_ce_count = aid_ecc->ecc_err.total_ce_count;
+ }
+
+ if (ecc_count.new_ue_count) {
+ RAS_DEV_INFO(ras_core->dev,
+ "{%llu} socket: %d, die: %d, %u new uncorrectable hardware errors detected in %s block\n",
+ seq_no, skt, aid, ecc_count.new_ue_count, blk_name(blk));
+ RAS_DEV_INFO(ras_core->dev,
+ "{%llu} socket: %d, die: %d, %u uncorrectable hardware errors detected in total in %s block\n",
+ seq_no, skt, aid, ecc_count.total_ue_count, blk_name(blk));
+ }
+
+ if (ecc_count.new_de_count) {
+ RAS_DEV_INFO(ras_core->dev,
+ "{%llu} socket: %d, die: %d, %u new %s detected in %s block\n",
+ seq_no, skt, aid, ecc_count.new_de_count,
+ (blk == RAS_BLOCK_ID__UMC) ?
+ "deferred hardware errors" : "poison consumption",
+ blk_name(blk));
+ RAS_DEV_INFO(ras_core->dev,
+ "{%llu} socket: %d, die: %d, %u %s detected in total in %s block\n",
+ seq_no, skt, aid, ecc_count.total_de_count,
+ (blk == RAS_BLOCK_ID__UMC) ?
+ "deferred hardware errors" : "poison consumption",
+ blk_name(blk));
+ }
+
+ if (ecc_count.new_ce_count) {
+ RAS_DEV_INFO(ras_core->dev,
+ "{%llu} socket: %d, die: %d, %u new correctable hardware errors detected in %s block\n",
+ seq_no, skt, aid, ecc_count.new_ce_count, blk_name(blk));
+ RAS_DEV_INFO(ras_core->dev,
+ "{%llu} socket: %d, die: %d, %u correctable hardware errors detected in total in %s block\n",
+ seq_no, skt, aid, ecc_count.total_ce_count, blk_name(blk));
+ }
+}
+
+static void aca_bank_log(struct ras_core_context *ras_core,
+ int idx, int total, struct aca_bank_reg *bank,
+ struct aca_bank_ecc *bank_ecc)
+{
+ int i;
+
+ RAS_DEV_INFO(ras_core->dev,
+ "{%llu}" RAS_HW_ERR "Accelerator Check Architecture events logged\n",
+ bank->seq_no);
+ /* plus 1 for output format, e.g: ACA[08/08]: xxxx */
+ for (i = 0; i < ARRAY_SIZE(aca_regs); i++)
+ RAS_DEV_INFO(ras_core->dev,
+ "{%llu}" RAS_HW_ERR "ACA[%02d/%02d].%s=0x%016llx\n",
+ bank->seq_no, idx + 1, total,
+ aca_regs[i].name, bank->regs[aca_regs[i].reg_idx]);
+}
+
+static void aca_log_bank_data(struct ras_core_context *ras_core,
+ struct aca_bank_reg *bank, struct aca_bank_ecc *bank_ecc,
+ struct ras_log_batch_tag *batch)
+{
+ if (bank_ecc->ue_count)
+ ras_log_ring_add_log_event(ras_core, RAS_LOG_EVENT_UE, bank->regs, batch);
+ else if (bank_ecc->de_count)
+ ras_log_ring_add_log_event(ras_core, RAS_LOG_EVENT_DE, bank->regs, batch);
+ else
+ ras_log_ring_add_log_event(ras_core, RAS_LOG_EVENT_CE, bank->regs, batch);
+}
+
+static int aca_get_bank_count(struct ras_core_context *ras_core,
+ enum ras_err_type type, u32 *count)
+{
+ return ras_mp1_get_bank_count(ras_core, type, count);
+}
+
+static bool aca_match_bank(struct aca_block *aca_blk, struct aca_bank_reg *bank)
+{
+ const struct aca_bank_hw_ops *bank_ops;
+
+ if (!aca_blk->blk_info)
+ return false;
+
+ bank_ops = &aca_blk->blk_info->bank_ops;
+ if (!bank_ops->bank_match)
+ return false;
+
+ return bank_ops->bank_match(aca_blk, bank);
+}
+
+static int aca_parse_bank(struct ras_core_context *ras_core,
+ struct aca_block *aca_blk,
+ struct aca_bank_reg *bank,
+ struct aca_bank_ecc *ecc)
+{
+ const struct aca_bank_hw_ops *bank_ops = &aca_blk->blk_info->bank_ops;
+
+ if (!bank_ops || !bank_ops->bank_parse)
+ return -RAS_CORE_NOT_SUPPORTED;
+
+ return bank_ops->bank_parse(ras_core, aca_blk, bank, ecc);
+}
+
+static int aca_check_block_ecc_info(struct ras_core_context *ras_core,
+ struct aca_block *aca_blk, struct aca_ecc_info *info)
+{
+ if (info->socket_id >= aca_blk->ecc.socket_num_per_hive) {
+ RAS_DEV_ERR(ras_core->dev,
+ "Socket id (%d) is out of config! max:%u\n",
+ info->socket_id, aca_blk->ecc.socket_num_per_hive);
+ return -ENODATA;
+ }
+
+ if (info->die_id >= aca_blk->ecc.socket[info->socket_id].aid_num) {
+ RAS_DEV_ERR(ras_core->dev,
+ "Die id (%d) is out of config! max:%u\n",
+ info->die_id, aca_blk->ecc.socket[info->socket_id].aid_num);
+ return -ENODATA;
+ }
+
+ if ((aca_blk->blk_info->ras_block_id == RAS_BLOCK_ID__GFX) &&
+ (info->xcd_id >=
+ aca_blk->ecc.socket[info->socket_id].aid[info->die_id].xcd.xcd_num)) {
+ RAS_DEV_ERR(ras_core->dev,
+ "Xcd id (%d) is out of config! max:%u\n",
+ info->xcd_id,
+ aca_blk->ecc.socket[info->socket_id].aid[info->die_id].xcd.xcd_num);
+ return -ENODATA;
+ }
+
+ return 0;
+}
+
+static int aca_log_bad_bank(struct ras_core_context *ras_core,
+ struct aca_block *aca_blk, struct aca_bank_reg *bank,
+ struct aca_bank_ecc *bank_ecc)
+{
+ struct aca_ecc_info *info;
+ struct aca_ecc_count *ecc_err;
+ struct aca_aid_ecc *aid_ecc;
+ int ret;
+
+ info = &bank_ecc->bank_info;
+
+ ret = aca_check_block_ecc_info(ras_core, aca_blk, info);
+ if (ret)
+ return ret;
+
+ mutex_lock(&ras_core->ras_aca.aca_lock);
+ aid_ecc = &aca_blk->ecc.socket[info->socket_id].aid[info->die_id];
+ if (aca_blk->blk_info->ras_block_id == RAS_BLOCK_ID__GFX)
+ ecc_err = &aid_ecc->xcd.xcd[info->xcd_id].ecc_err;
+ else
+ ecc_err = &aid_ecc->ecc_err;
+
+ ecc_err->new_ce_count += bank_ecc->ce_count;
+ ecc_err->total_ce_count += bank_ecc->ce_count;
+ ecc_err->new_ue_count += bank_ecc->ue_count;
+ ecc_err->total_ue_count += bank_ecc->ue_count;
+ ecc_err->new_de_count += bank_ecc->de_count;
+ ecc_err->total_de_count += bank_ecc->de_count;
+ mutex_unlock(&ras_core->ras_aca.aca_lock);
+
+ if ((aca_blk->blk_info->ras_block_id == RAS_BLOCK_ID__UMC) &&
+ bank_ecc->de_count) {
+ struct ras_bank_ecc ras_ecc = {0};
+
+ ras_ecc.nps = ras_core_get_curr_nps_mode(ras_core);
+ ras_ecc.addr = bank_ecc->bank_info.addr;
+ ras_ecc.ipid = bank_ecc->bank_info.ipid;
+ ras_ecc.status = bank_ecc->bank_info.status;
+ ras_ecc.seq_no = bank->seq_no;
+
+ if (ras_core_gpu_in_reset(ras_core))
+ ras_umc_log_bad_bank_pending(ras_core, &ras_ecc);
+ else
+ ras_umc_log_bad_bank(ras_core, &ras_ecc);
+ }
+
+ aca_report_ecc_info(ras_core,
+ bank->seq_no, aca_blk->blk_info->ras_block_id, info->socket_id, info->die_id,
+ &aca_blk->ecc.socket[info->socket_id].aid[info->die_id], bank_ecc);
+
+ return 0;
+}
+
+static struct aca_block *aca_get_bank_aca_block(struct ras_core_context *ras_core,
+ struct aca_bank_reg *bank)
+{
+ int i = 0;
+
+ for (i = 0; i < RAS_BLOCK_ID__LAST; i++)
+ if (aca_match_bank(&ras_core->ras_aca.aca_blk[i], bank))
+ return &ras_core->ras_aca.aca_blk[i];
+
+ return NULL;
+}
+
+static int aca_dump_bank(struct ras_core_context *ras_core, u32 ecc_type,
+ int idx, void *data)
+{
+ struct aca_bank_reg *bank = (struct aca_bank_reg *)data;
+ int i, ret, reg_cnt;
+
+ reg_cnt = min_t(int, 16, ARRAY_SIZE(bank->regs));
+ for (i = 0; i < reg_cnt; i++) {
+ ret = ras_mp1_dump_bank(ras_core, ecc_type, idx, i, &bank->regs[i]);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static uint64_t aca_get_bank_seqno(struct ras_core_context *ras_core,
+ enum ras_err_type err_type, struct aca_block *aca_blk,
+ struct aca_bank_ecc *bank_ecc)
+{
+ uint64_t seq_no = 0;
+
+ if (bank_ecc->de_count) {
+ if (aca_blk->blk_info->ras_block_id == RAS_BLOCK_ID__UMC)
+ seq_no = ras_core_get_seqno(ras_core, RAS_SEQNO_TYPE_DE, true);
+ else
+ seq_no = ras_core_get_seqno(ras_core,
+ RAS_SEQNO_TYPE_POISON_CONSUMPTION, true);
+ } else if (bank_ecc->ue_count) {
+ seq_no = ras_core_get_seqno(ras_core, RAS_SEQNO_TYPE_UE, true);
+ } else {
+ seq_no = ras_core_get_seqno(ras_core, RAS_SEQNO_TYPE_CE, true);
+ }
+
+ return seq_no;
+}
+
+static bool aca_dup_update_ue_in_fatal(struct ras_core_context *ras_core,
+ u32 ecc_type)
+{
+ struct ras_aca *aca = &ras_core->ras_aca;
+
+ if (ecc_type != RAS_ERR_TYPE__UE)
+ return false;
+
+ if (aca->ue_updated_mark & ACA_MARK_FATAL_FLAG) {
+ if (aca->ue_updated_mark & ACA_MARK_UE_READ_FLAG)
+ return true;
+
+ aca->ue_updated_mark |= ACA_MARK_UE_READ_FLAG;
+ }
+
+ return false;
+}
+
+void ras_aca_mark_fatal_flag(struct ras_core_context *ras_core)
+{
+ struct ras_aca *aca = &ras_core->ras_aca;
+
+ if (!aca)
+ return;
+
+ aca->ue_updated_mark |= ACA_MARK_FATAL_FLAG;
+}
+
+void ras_aca_clear_fatal_flag(struct ras_core_context *ras_core)
+{
+ struct ras_aca *aca = &ras_core->ras_aca;
+
+ if (!aca)
+ return;
+
+ if ((aca->ue_updated_mark & ACA_MARK_FATAL_FLAG) &&
+ (aca->ue_updated_mark & ACA_MARK_UE_READ_FLAG))
+ aca->ue_updated_mark = 0;
+}
+
+static int aca_banks_update(struct ras_core_context *ras_core,
+ u32 ecc_type, void *data)
+{
+ struct aca_bank_reg bank;
+ struct aca_block *aca_blk;
+ struct aca_bank_ecc bank_ecc;
+ struct ras_log_batch_tag *batch_tag = NULL;
+ u32 count = 0;
+ int ret = 0;
+ int i;
+
+ mutex_lock(&ras_core->ras_aca.bank_op_lock);
+
+ if (aca_dup_update_ue_in_fatal(ras_core, ecc_type))
+ goto out;
+
+ ret = aca_get_bank_count(ras_core, ecc_type, &count);
+ if (ret)
+ goto out;
+
+ if (!count)
+ goto out;
+
+ batch_tag = ras_log_ring_create_batch_tag(ras_core);
+ for (i = 0; i < count; i++) {
+ memset(&bank, 0, sizeof(bank));
+ ret = aca_dump_bank(ras_core, ecc_type, i, &bank);
+ if (ret)
+ break;
+
+ bank.ecc_type = ecc_type;
+
+ memset(&bank_ecc, 0, sizeof(bank_ecc));
+ aca_blk = aca_get_bank_aca_block(ras_core, &bank);
+ if (aca_blk)
+ ret = aca_parse_bank(ras_core, aca_blk, &bank, &bank_ecc);
+
+ bank.seq_no = aca_get_bank_seqno(ras_core, ecc_type, aca_blk, &bank_ecc);
+
+ aca_log_bank_data(ras_core, &bank, &bank_ecc, batch_tag);
+ aca_bank_log(ras_core, i, count, &bank, &bank_ecc);
+
+ if (!ret && aca_blk)
+ ret = aca_log_bad_bank(ras_core, aca_blk, &bank, &bank_ecc);
+
+ if (ret)
+ break;
+ }
+ ras_log_ring_destroy_batch_tag(ras_core, batch_tag);
+
+out:
+ mutex_unlock(&ras_core->ras_aca.bank_op_lock);
+ return ret;
+}
+
+int ras_aca_update_ecc(struct ras_core_context *ras_core, u32 type, void *data)
+{
+ /* Update aca bank to aca source error_cache first */
+ return aca_banks_update(ras_core, type, data);
+}
+
+static struct aca_block *ras_aca_get_block_handle(struct ras_core_context *ras_core, uint32_t blk)
+{
+ return &ras_core->ras_aca.aca_blk[blk];
+}
+
+static int ras_aca_clear_block_ecc_count(struct ras_core_context *ras_core, u32 blk)
+{
+ struct aca_block *aca_blk;
+ struct aca_aid_ecc *aid_ecc;
+ int skt, aid, xcd;
+
+ mutex_lock(&ras_core->ras_aca.aca_lock);
+ aca_blk = ras_aca_get_block_handle(ras_core, blk);
+ for (skt = 0; skt < aca_blk->ecc.socket_num_per_hive; skt++) {
+ for (aid = 0; aid < aca_blk->ecc.socket[skt].aid_num; aid++) {
+ aid_ecc = &aca_blk->ecc.socket[skt].aid[aid];
+ if (blk == RAS_BLOCK_ID__GFX) {
+ for (xcd = 0; xcd < aid_ecc->xcd.xcd_num; xcd++)
+ memset(&aid_ecc->xcd.xcd[xcd],
+ 0, sizeof(struct aca_xcd_ecc));
+ } else {
+ memset(&aid_ecc->ecc_err, 0, sizeof(aid_ecc->ecc_err));
+ }
+ }
+ }
+ mutex_unlock(&ras_core->ras_aca.aca_lock);
+
+ return 0;
+}
+
+int ras_aca_clear_all_blocks_ecc_count(struct ras_core_context *ras_core)
+{
+ enum ras_block_id blk;
+ int ret;
+
+ for (blk = RAS_BLOCK_ID__UMC; blk < RAS_BLOCK_ID__LAST; blk++) {
+ ret = ras_aca_clear_block_ecc_count(ras_core, blk);
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+int ras_aca_clear_block_new_ecc_count(struct ras_core_context *ras_core, u32 blk)
+{
+ struct aca_block *aca_blk;
+ int skt, aid, xcd;
+ struct aca_ecc_count *ecc_err;
+ struct aca_aid_ecc *aid_ecc;
+
+ mutex_lock(&ras_core->ras_aca.aca_lock);
+ aca_blk = ras_aca_get_block_handle(ras_core, blk);
+ for (skt = 0; skt < aca_blk->ecc.socket_num_per_hive; skt++) {
+ for (aid = 0; aid < aca_blk->ecc.socket[skt].aid_num; aid++) {
+ aid_ecc = &aca_blk->ecc.socket[skt].aid[aid];
+ if (blk == RAS_BLOCK_ID__GFX) {
+ for (xcd = 0; xcd < aid_ecc->xcd.xcd_num; xcd++) {
+ ecc_err = &aid_ecc->xcd.xcd[xcd].ecc_err;
+ ecc_err->new_ce_count = 0;
+ ecc_err->new_ue_count = 0;
+ ecc_err->new_de_count = 0;
+ }
+ } else {
+ ecc_err = &aid_ecc->ecc_err;
+ ecc_err->new_ce_count = 0;
+ ecc_err->new_ue_count = 0;
+ ecc_err->new_de_count = 0;
+ }
+ }
+ }
+ mutex_unlock(&ras_core->ras_aca.aca_lock);
+
+ return 0;
+}
+
+static int ras_aca_get_block_each_aid_ecc_count(struct ras_core_context *ras_core,
+ u32 blk, u32 skt, u32 aid, u32 xcd,
+ struct aca_ecc_count *ecc_count)
+{
+ struct aca_block *aca_blk;
+ struct aca_ecc_count *ecc_err;
+
+ aca_blk = ras_aca_get_block_handle(ras_core, blk);
+ if (blk == RAS_BLOCK_ID__GFX)
+ ecc_err = &aca_blk->ecc.socket[skt].aid[aid].xcd.xcd[xcd].ecc_err;
+ else
+ ecc_err = &aca_blk->ecc.socket[skt].aid[aid].ecc_err;
+
+ ecc_count->new_ce_count = ecc_err->new_ce_count;
+ ecc_count->total_ce_count = ecc_err->total_ce_count;
+ ecc_count->new_ue_count = ecc_err->new_ue_count;
+ ecc_count->total_ue_count = ecc_err->total_ue_count;
+ ecc_count->new_de_count = ecc_err->new_de_count;
+ ecc_count->total_de_count = ecc_err->total_de_count;
+
+ return 0;
+}
+
+static inline void _add_ecc_count(struct aca_ecc_count *des, struct aca_ecc_count *src)
+{
+ des->new_ce_count += src->new_ce_count;
+ des->total_ce_count += src->total_ce_count;
+ des->new_ue_count += src->new_ue_count;
+ des->total_ue_count += src->total_ue_count;
+ des->new_de_count += src->new_de_count;
+ des->total_de_count += src->total_de_count;
+}
+
+static const struct ras_aca_ip_func *aca_get_ip_func(
+ struct ras_core_context *ras_core, uint32_t ip_version)
+{
+ switch (ip_version) {
+ case IP_VERSION(1, 0, 0):
+ return &ras_aca_func_v1_0;
+ default:
+ RAS_DEV_ERR(ras_core->dev,
+ "ACA ip version(0x%x) is not supported!\n", ip_version);
+ break;
+ }
+
+ return NULL;
+}
+
+int ras_aca_get_block_ecc_count(struct ras_core_context *ras_core,
+ u32 blk, void *data)
+{
+ struct ras_ecc_count *err_data = (struct ras_ecc_count *)data;
+ struct aca_block *aca_blk;
+ int skt, aid, xcd;
+ struct aca_ecc_count ecc_xcd;
+ struct aca_ecc_count ecc_aid;
+ struct aca_ecc_count ecc;
+
+ if (blk >= RAS_BLOCK_ID__LAST)
+ return -EINVAL;
+
+ if (!err_data)
+ return -EINVAL;
+
+ aca_blk = ras_aca_get_block_handle(ras_core, blk);
+ memset(&ecc, 0, sizeof(ecc));
+
+ mutex_lock(&ras_core->ras_aca.aca_lock);
+ if (blk == RAS_BLOCK_ID__GFX) {
+ for (skt = 0; skt < aca_blk->ecc.socket_num_per_hive; skt++) {
+ for (aid = 0; aid < aca_blk->ecc.socket[skt].aid_num; aid++) {
+ memset(&ecc_aid, 0, sizeof(ecc_aid));
+ for (xcd = 0;
+ xcd < aca_blk->ecc.socket[skt].aid[aid].xcd.xcd_num;
+ xcd++) {
+ memset(&ecc_xcd, 0, sizeof(ecc_xcd));
+ if (ras_aca_get_block_each_aid_ecc_count(ras_core,
+ blk, skt, aid, xcd, &ecc_xcd))
+ continue;
+ _add_ecc_count(&ecc_aid, &ecc_xcd);
+ }
+ _add_ecc_count(&ecc, &ecc_aid);
+ }
+ }
+ } else {
+ for (skt = 0; skt < aca_blk->ecc.socket_num_per_hive; skt++) {
+ for (aid = 0; aid < aca_blk->ecc.socket[skt].aid_num; aid++) {
+ memset(&ecc_aid, 0, sizeof(ecc_aid));
+ if (ras_aca_get_block_each_aid_ecc_count(ras_core,
+ blk, skt, aid, 0, &ecc_aid))
+ continue;
+ _add_ecc_count(&ecc, &ecc_aid);
+ }
+ }
+ }
+
+ err_data->new_ce_count = ecc.new_ce_count;
+ err_data->total_ce_count = ecc.total_ce_count;
+ err_data->new_ue_count = ecc.new_ue_count;
+ err_data->total_ue_count = ecc.total_ue_count;
+ err_data->new_de_count = ecc.new_de_count;
+ err_data->total_de_count = ecc.total_de_count;
+ mutex_unlock(&ras_core->ras_aca.aca_lock);
+
+ return 0;
+}
+
+int ras_aca_sw_init(struct ras_core_context *ras_core)
+{
+ struct ras_aca *ras_aca = &ras_core->ras_aca;
+ struct ras_aca_config *aca_cfg = &ras_core->config->aca_cfg;
+ struct aca_block *aca_blk;
+ uint32_t socket_num_per_hive;
+ uint32_t aid_num_per_socket;
+ uint32_t xcd_num_per_aid;
+ int blk, skt, aid;
+
+ socket_num_per_hive = aca_cfg->socket_num_per_hive;
+ aid_num_per_socket = aca_cfg->aid_num_per_socket;
+ xcd_num_per_aid = aca_cfg->xcd_num_per_aid;
+
+ if (!xcd_num_per_aid || !aid_num_per_socket ||
+ (socket_num_per_hive > MAX_SOCKET_NUM_PER_HIVE) ||
+ (aid_num_per_socket > MAX_AID_NUM_PER_SOCKET) ||
+ (xcd_num_per_aid > MAX_XCD_NUM_PER_AID)) {
+ RAS_DEV_ERR(ras_core->dev, "Invalid ACA system configuration: %d, %d, %d\n",
+ socket_num_per_hive, aid_num_per_socket, xcd_num_per_aid);
+ return -EINVAL;
+ }
+
+ memset(ras_aca, 0, sizeof(*ras_aca));
+
+ for (blk = 0; blk < RAS_BLOCK_ID__LAST; blk++) {
+ aca_blk = &ras_aca->aca_blk[blk];
+ aca_blk->ecc.socket_num_per_hive = socket_num_per_hive;
+ for (skt = 0; skt < aca_blk->ecc.socket_num_per_hive; skt++) {
+ aca_blk->ecc.socket[skt].aid_num = aid_num_per_socket;
+ if (blk == RAS_BLOCK_ID__GFX) {
+ for (aid = 0; aid < aca_blk->ecc.socket[skt].aid_num; aid++)
+ aca_blk->ecc.socket[skt].aid[aid].xcd.xcd_num =
+ xcd_num_per_aid;
+ }
+ }
+ }
+
+ mutex_init(&ras_aca->aca_lock);
+ mutex_init(&ras_aca->bank_op_lock);
+
+ return 0;
+}
+
+int ras_aca_sw_fini(struct ras_core_context *ras_core)
+{
+ struct ras_aca *ras_aca = &ras_core->ras_aca;
+
+ mutex_destroy(&ras_aca->aca_lock);
+ mutex_destroy(&ras_aca->bank_op_lock);
+
+ return 0;
+}
+
+int ras_aca_hw_init(struct ras_core_context *ras_core)
+{
+ struct ras_aca *ras_aca = &ras_core->ras_aca;
+ struct aca_block *aca_blk;
+ const struct ras_aca_ip_func *ip_func;
+ int i;
+
+ ras_aca->aca_ip_version = ras_core->config->aca_ip_version;
+ ip_func = aca_get_ip_func(ras_core, ras_aca->aca_ip_version);
+ if (!ip_func)
+ return -EINVAL;
+
+ for (i = 0; i < ip_func->block_num; i++) {
+ aca_blk = &ras_aca->aca_blk[ip_func->block_info[i]->ras_block_id];
+ aca_blk->blk_info = ip_func->block_info[i];
+ }
+
+ ras_aca->ue_updated_mark = 0;
+
+ return 0;
+}
+
+int ras_aca_hw_fini(struct ras_core_context *ras_core)
+{
+ struct ras_aca *ras_aca = &ras_core->ras_aca;
+
+ ras_aca->ue_updated_mark = 0;
+
+ return 0;
+}
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_aca.h b/drivers/gpu/drm/amd/ras/rascore/ras_aca.h
new file mode 100644
index 000000000000..f61b02a5f0fc
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_aca.h
@@ -0,0 +1,164 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __RAS_ACA_H__
+#define __RAS_ACA_H__
+#include "ras.h"
+
+#define MAX_SOCKET_NUM_PER_HIVE 8
+#define MAX_AID_NUM_PER_SOCKET 4
+#define MAX_XCD_NUM_PER_AID 2
+#define MAX_ACA_RAS_BLOCK 20
+
+#define ACA_ERROR__UE_MASK (0x1 << RAS_ERR_TYPE__UE)
+#define ACA_ERROR__CE_MASK (0x1 << RAS_ERR_TYPE__CE)
+#define ACA_ERROR__DE_MASK (0x1 << RAS_ERR_TYPE__DE)
+
+enum ras_aca_reg_idx {
+ ACA_REG_IDX__CTL = 0,
+ ACA_REG_IDX__STATUS = 1,
+ ACA_REG_IDX__ADDR = 2,
+ ACA_REG_IDX__MISC0 = 3,
+ ACA_REG_IDX__CONFG = 4,
+ ACA_REG_IDX__IPID = 5,
+ ACA_REG_IDX__SYND = 6,
+ ACA_REG_IDX__DESTAT = 8,
+ ACA_REG_IDX__DEADDR = 9,
+ ACA_REG_IDX__CTL_MASK = 10,
+ ACA_REG_MAX_COUNT = 16,
+};
+
+struct ras_core_context;
+struct aca_block;
+
+struct aca_bank_reg {
+ u32 ecc_type;
+ u64 seq_no;
+ u64 regs[ACA_REG_MAX_COUNT];
+};
+
+enum aca_ecc_hwip {
+ ACA_ECC_HWIP__UNKNOWN = -1,
+ ACA_ECC_HWIP__PSP = 0,
+ ACA_ECC_HWIP__UMC,
+ ACA_ECC_HWIP__SMU,
+ ACA_ECC_HWIP__PCS_XGMI,
+ ACA_ECC_HWIP_COUNT,
+};
+
+struct aca_ecc_info {
+ int die_id;
+ int socket_id;
+ int xcd_id;
+ int hwid;
+ int mcatype;
+ uint64_t status;
+ uint64_t ipid;
+ uint64_t addr;
+};
+
+struct aca_bank_ecc {
+ struct aca_ecc_info bank_info;
+ u32 ce_count;
+ u32 ue_count;
+ u32 de_count;
+};
+
+struct aca_ecc_count {
+ u32 new_ce_count;
+ u32 total_ce_count;
+ u32 new_ue_count;
+ u32 total_ue_count;
+ u32 new_de_count;
+ u32 total_de_count;
+};
+
+struct aca_xcd_ecc {
+ struct aca_ecc_count ecc_err;
+};
+
+struct aca_aid_ecc {
+ union {
+ struct aca_xcd {
+ struct aca_xcd_ecc xcd[MAX_XCD_NUM_PER_AID];
+ u32 xcd_num;
+ } xcd;
+ struct aca_ecc_count ecc_err;
+ };
+};
+
+struct aca_socket_ecc {
+ struct aca_aid_ecc aid[MAX_AID_NUM_PER_SOCKET];
+ u32 aid_num;
+};
+
+struct aca_block_ecc {
+ struct aca_socket_ecc socket[MAX_SOCKET_NUM_PER_HIVE];
+ u32 socket_num_per_hive;
+};
+
+struct aca_bank_hw_ops {
+ bool (*bank_match)(struct aca_block *ras_blk, void *data);
+ int (*bank_parse)(struct ras_core_context *ras_core,
+ struct aca_block *aca_blk, void *data, void *buf);
+};
+
+struct aca_block_info {
+ char name[32];
+ u32 ras_block_id;
+ enum aca_ecc_hwip hwip;
+ struct aca_bank_hw_ops bank_ops;
+ u32 mask;
+};
+
+struct aca_block {
+ const struct aca_block_info *blk_info;
+ struct aca_block_ecc ecc;
+};
+
+struct ras_aca_ip_func {
+ uint32_t block_num;
+ const struct aca_block_info **block_info;
+};
+
+struct ras_aca {
+ uint32_t aca_ip_version;
+ const struct ras_aca_ip_func *ip_func;
+ struct mutex aca_lock;
+ struct mutex bank_op_lock;
+ struct aca_block aca_blk[MAX_ACA_RAS_BLOCK];
+ uint32_t ue_updated_mark;
+};
+
+int ras_aca_sw_init(struct ras_core_context *ras_core);
+int ras_aca_sw_fini(struct ras_core_context *ras_core);
+int ras_aca_hw_init(struct ras_core_context *ras_core);
+int ras_aca_hw_fini(struct ras_core_context *ras_core);
+int ras_aca_get_block_ecc_count(struct ras_core_context *ras_core, u32 blk, void *data);
+int ras_aca_clear_block_new_ecc_count(struct ras_core_context *ras_core, u32 blk);
+int ras_aca_clear_all_blocks_ecc_count(struct ras_core_context *ras_core);
+int ras_aca_update_ecc(struct ras_core_context *ras_core, u32 ecc_type, void *data);
+void ras_aca_mark_fatal_flag(struct ras_core_context *ras_core);
+void ras_aca_clear_fatal_flag(struct ras_core_context *ras_core);
+#endif
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_aca_v1_0.c b/drivers/gpu/drm/amd/ras/rascore/ras_aca_v1_0.c
new file mode 100644
index 000000000000..29df98948703
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_aca_v1_0.c
@@ -0,0 +1,379 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#include "ras.h"
+#include "ras_aca.h"
+#include "ras_core_status.h"
+#include "ras_aca_v1_0.h"
+
+struct ras_aca_hwip {
+ int hwid;
+ int mcatype;
+};
+
+static struct ras_aca_hwip aca_hwid_mcatypes[ACA_ECC_HWIP_COUNT] = {
+ [ACA_ECC_HWIP__SMU] = {0x01, 0x01},
+ [ACA_ECC_HWIP__PCS_XGMI] = {0x50, 0x00},
+ [ACA_ECC_HWIP__UMC] = {0x96, 0x00},
+};
+
+static int aca_decode_bank_info(struct aca_block *aca_blk,
+ struct aca_bank_reg *bank, struct aca_ecc_info *info)
+{
+ u64 ipid;
+ u32 instidhi, instidlo;
+
+ ipid = bank->regs[ACA_REG_IDX__IPID];
+ info->hwid = ACA_REG_IPID_HARDWAREID(ipid);
+ info->mcatype = ACA_REG_IPID_MCATYPE(ipid);
+ /*
+ * Unified DieID Format: SAASS. A:AID, S:Socket.
+ * Unified DieID[4:4] = InstanceId[0:0]
+ * Unified DieID[0:3] = InstanceIdHi[0:3]
+ */
+ instidhi = ACA_REG_IPID_INSTANCEIDHI(ipid);
+ instidlo = ACA_REG_IPID_INSTANCEIDLO(ipid);
+ info->die_id = ((instidhi >> 2) & 0x03);
+ info->socket_id = ((instidlo & 0x1) << 2) | (instidhi & 0x03);
+
+ if ((aca_blk->blk_info->hwip == ACA_ECC_HWIP__SMU) &&
+ (aca_blk->blk_info->ras_block_id == RAS_BLOCK_ID__GFX))
+ info->xcd_id =
+ ((instidlo & GENMASK_ULL(31, 1)) == mmSMNAID_XCD0_MCA_SMU) ? 0 : 1;
+
+ return 0;
+}
+
+static bool aca_check_bank_hwip(struct aca_bank_reg *bank, enum aca_ecc_hwip type)
+{
+ struct ras_aca_hwip *hwip;
+ int hwid, mcatype;
+ u64 ipid;
+
+ if (!bank || (type == ACA_ECC_HWIP__UNKNOWN))
+ return false;
+
+ hwip = &aca_hwid_mcatypes[type];
+ if (!hwip->hwid)
+ return false;
+
+ ipid = bank->regs[ACA_REG_IDX__IPID];
+ hwid = ACA_REG_IPID_HARDWAREID(ipid);
+ mcatype = ACA_REG_IPID_MCATYPE(ipid);
+
+ return hwip->hwid == hwid && hwip->mcatype == mcatype;
+}
+
+static bool aca_match_bank_default(struct aca_block *aca_blk, void *data)
+{
+ return aca_check_bank_hwip((struct aca_bank_reg *)data, aca_blk->blk_info->hwip);
+}
+
+static bool aca_match_gfx_bank(struct aca_block *aca_blk, void *data)
+{
+ struct aca_bank_reg *bank = (struct aca_bank_reg *)data;
+ u32 instlo;
+
+ if (!aca_check_bank_hwip(bank, aca_blk->blk_info->hwip))
+ return false;
+
+ instlo = ACA_REG_IPID_INSTANCEIDLO(bank->regs[ACA_REG_IDX__IPID]);
+ instlo &= GENMASK_ULL(31, 1);
+ switch (instlo) {
+ case mmSMNAID_XCD0_MCA_SMU:
+ case mmSMNAID_XCD1_MCA_SMU:
+ case mmSMNXCD_XCD0_MCA_SMU:
+ return true;
+ default:
+ break;
+ }
+
+ return false;
+}
+
+static bool aca_match_sdma_bank(struct aca_block *aca_blk, void *data)
+{
+ struct aca_bank_reg *bank = (struct aca_bank_reg *)data;
+ /* CODE_SDMA0 - CODE_SDMA4, reference to smu driver if header file */
+ static int sdma_err_codes[] = { 33, 34, 35, 36 };
+ u32 instlo;
+ int errcode, i;
+
+ if (!aca_check_bank_hwip(bank, aca_blk->blk_info->hwip))
+ return false;
+
+ instlo = ACA_REG_IPID_INSTANCEIDLO(bank->regs[ACA_REG_IDX__IPID]);
+ instlo &= GENMASK_ULL(31, 1);
+ if (instlo != mmSMNAID_AID0_MCA_SMU)
+ return false;
+
+ errcode = ACA_REG_SYND_ERRORINFORMATION(bank->regs[ACA_REG_IDX__SYND]);
+ errcode &= 0xff;
+
+ /* Check SDMA error codes */
+ for (i = 0; i < ARRAY_SIZE(sdma_err_codes); i++) {
+ if (errcode == sdma_err_codes[i])
+ return true;
+ }
+
+ return false;
+}
+
+static bool aca_match_mmhub_bank(struct aca_block *aca_blk, void *data)
+{
+ struct aca_bank_reg *bank = (struct aca_bank_reg *)data;
+ /* reference to smu driver if header file */
+ const int mmhub_err_codes[] = {
+ 0, 1, 2, 3, 4, /* CODE_DAGB0 - 4 */
+ 5, 6, 7, 8, 9, /* CODE_EA0 - 4 */
+ 10, /* CODE_UTCL2_ROUTER */
+ 11, /* CODE_VML2 */
+ 12, /* CODE_VML2_WALKER */
+ 13, /* CODE_MMCANE */
+ };
+ u32 instlo;
+ int errcode, i;
+
+ if (!aca_check_bank_hwip(bank, aca_blk->blk_info->hwip))
+ return false;
+
+ instlo = ACA_REG_IPID_INSTANCEIDLO(bank->regs[ACA_REG_IDX__IPID]);
+ instlo &= GENMASK_ULL(31, 1);
+ if (instlo != mmSMNAID_AID0_MCA_SMU)
+ return false;
+
+ errcode = ACA_REG_SYND_ERRORINFORMATION(bank->regs[ACA_REG_IDX__SYND]);
+ errcode &= 0xff;
+
+ /* Check MMHUB error codes */
+ for (i = 0; i < ARRAY_SIZE(mmhub_err_codes); i++) {
+ if (errcode == mmhub_err_codes[i])
+ return true;
+ }
+
+ return false;
+}
+
+static bool aca_check_umc_de(struct ras_core_context *ras_core, uint64_t mc_umc_status)
+{
+ return (ras_core->poison_supported &&
+ ACA_REG_STATUS_VAL(mc_umc_status) &&
+ ACA_REG_STATUS_DEFERRED(mc_umc_status));
+}
+
+static bool aca_check_umc_ue(struct ras_core_context *ras_core, uint64_t mc_umc_status)
+{
+ if (aca_check_umc_de(ras_core, mc_umc_status))
+ return false;
+
+ return (ACA_REG_STATUS_VAL(mc_umc_status) &&
+ (ACA_REG_STATUS_PCC(mc_umc_status) ||
+ ACA_REG_STATUS_UC(mc_umc_status) ||
+ ACA_REG_STATUS_TCC(mc_umc_status)));
+}
+
+static bool aca_check_umc_ce(struct ras_core_context *ras_core, uint64_t mc_umc_status)
+{
+ if (aca_check_umc_de(ras_core, mc_umc_status))
+ return false;
+
+ return (ACA_REG_STATUS_VAL(mc_umc_status) &&
+ (ACA_REG_STATUS_CECC(mc_umc_status) ||
+ (ACA_REG_STATUS_UECC(mc_umc_status) &&
+ ACA_REG_STATUS_UC(mc_umc_status) == 0) ||
+ /* Identify data parity error in replay mode */
+ ((ACA_REG_STATUS_ERRORCODEEXT(mc_umc_status) == 0x5 ||
+ ACA_REG_STATUS_ERRORCODEEXT(mc_umc_status) == 0xb) &&
+ !(aca_check_umc_ue(ras_core, mc_umc_status)))));
+}
+
+static int aca_parse_umc_bank(struct ras_core_context *ras_core,
+ struct aca_block *ras_blk, void *data, void *buf)
+{
+ struct aca_bank_reg *bank = (struct aca_bank_reg *)data;
+ struct aca_bank_ecc *ecc = (struct aca_bank_ecc *)buf;
+ struct aca_ecc_info bank_info;
+ uint32_t ext_error_code;
+ uint64_t status0;
+
+ status0 = bank->regs[ACA_REG_IDX__STATUS];
+ if (!ACA_REG_STATUS_VAL(status0))
+ return 0;
+
+ memset(&bank_info, 0, sizeof(bank_info));
+ aca_decode_bank_info(ras_blk, bank, &bank_info);
+ memcpy(&ecc->bank_info, &bank_info, sizeof(bank_info));
+ ecc->bank_info.status = bank->regs[ACA_REG_IDX__STATUS];
+ ecc->bank_info.ipid = bank->regs[ACA_REG_IDX__IPID];
+ ecc->bank_info.addr = bank->regs[ACA_REG_IDX__ADDR];
+
+ ext_error_code = ACA_REG_STATUS_ERRORCODEEXT(status0);
+
+ if (aca_check_umc_de(ras_core, status0))
+ ecc->de_count = 1;
+ else if (aca_check_umc_ue(ras_core, status0))
+ ecc->ue_count = ext_error_code ?
+ 1 : ACA_REG_MISC0_ERRCNT(bank->regs[ACA_REG_IDX__MISC0]);
+ else if (aca_check_umc_ce(ras_core, status0))
+ ecc->ce_count = ext_error_code ?
+ 1 : ACA_REG_MISC0_ERRCNT(bank->regs[ACA_REG_IDX__MISC0]);
+
+ return 0;
+}
+
+static bool aca_check_bank_is_de(struct ras_core_context *ras_core,
+ uint64_t status)
+{
+ return (ACA_REG_STATUS_POISON(status) ||
+ ACA_REG_STATUS_DEFERRED(status));
+}
+
+static int aca_parse_bank_default(struct ras_core_context *ras_core,
+ struct aca_block *ras_blk,
+ void *data, void *buf)
+{
+ struct aca_bank_reg *bank = (struct aca_bank_reg *)data;
+ struct aca_bank_ecc *ecc = (struct aca_bank_ecc *)buf;
+ struct aca_ecc_info bank_info;
+ u64 misc0 = bank->regs[ACA_REG_IDX__MISC0];
+ u64 status = bank->regs[ACA_REG_IDX__STATUS];
+
+ memset(&bank_info, 0, sizeof(bank_info));
+ aca_decode_bank_info(ras_blk, bank, &bank_info);
+ memcpy(&ecc->bank_info, &bank_info, sizeof(bank_info));
+ ecc->bank_info.status = status;
+ ecc->bank_info.ipid = bank->regs[ACA_REG_IDX__IPID];
+ ecc->bank_info.addr = bank->regs[ACA_REG_IDX__ADDR];
+
+ if (aca_check_bank_is_de(ras_core, status)) {
+ ecc->de_count = 1;
+ } else {
+ if (bank->ecc_type == RAS_ERR_TYPE__UE)
+ ecc->ue_count = 1;
+ else if (bank->ecc_type == RAS_ERR_TYPE__CE)
+ ecc->ce_count = ACA_REG_MISC0_ERRCNT(misc0);
+ }
+
+ return 0;
+}
+
+static int aca_parse_xgmi_bank(struct ras_core_context *ras_core,
+ struct aca_block *ras_blk,
+ void *data, void *buf)
+{
+ struct aca_bank_reg *bank = (struct aca_bank_reg *)data;
+ struct aca_bank_ecc *ecc = (struct aca_bank_ecc *)buf;
+ struct aca_ecc_info bank_info;
+ u64 status, count;
+ int ext_error_code;
+
+ memset(&bank_info, 0, sizeof(bank_info));
+ aca_decode_bank_info(ras_blk, bank, &bank_info);
+ memcpy(&ecc->bank_info, &bank_info, sizeof(bank_info));
+ ecc->bank_info.status = bank->regs[ACA_REG_IDX__STATUS];
+ ecc->bank_info.ipid = bank->regs[ACA_REG_IDX__IPID];
+ ecc->bank_info.addr = bank->regs[ACA_REG_IDX__ADDR];
+
+ status = bank->regs[ACA_REG_IDX__STATUS];
+ ext_error_code = ACA_REG_STATUS_ERRORCODEEXT(status);
+
+ count = ACA_REG_MISC0_ERRCNT(bank->regs[ACA_REG_IDX__MISC0]);
+ if (bank->ecc_type == RAS_ERR_TYPE__UE) {
+ if (ext_error_code != 0 && ext_error_code != 9)
+ count = 0ULL;
+ ecc->ue_count = count;
+ } else if (bank->ecc_type == RAS_ERR_TYPE__CE) {
+ count = ext_error_code == 6 ? count : 0ULL;
+ ecc->ce_count = count;
+ }
+
+ return 0;
+}
+
+static const struct aca_block_info aca_v1_0_umc = {
+ .name = "umc",
+ .ras_block_id = RAS_BLOCK_ID__UMC,
+ .hwip = ACA_ECC_HWIP__UMC,
+ .mask = ACA_ERROR__UE_MASK | ACA_ERROR__CE_MASK | ACA_ERROR__DE_MASK,
+ .bank_ops = {
+ .bank_match = aca_match_bank_default,
+ .bank_parse = aca_parse_umc_bank,
+ },
+};
+
+static const struct aca_block_info aca_v1_0_gfx = {
+ .name = "gfx",
+ .ras_block_id = RAS_BLOCK_ID__GFX,
+ .hwip = ACA_ECC_HWIP__SMU,
+ .mask = ACA_ERROR__UE_MASK | ACA_ERROR__CE_MASK,
+ .bank_ops = {
+ .bank_match = aca_match_gfx_bank,
+ .bank_parse = aca_parse_bank_default,
+ },
+};
+
+static const struct aca_block_info aca_v1_0_sdma = {
+ .name = "sdma",
+ .ras_block_id = RAS_BLOCK_ID__SDMA,
+ .hwip = ACA_ECC_HWIP__SMU,
+ .mask = ACA_ERROR__UE_MASK,
+ .bank_ops = {
+ .bank_match = aca_match_sdma_bank,
+ .bank_parse = aca_parse_bank_default,
+ },
+};
+
+static const struct aca_block_info aca_v1_0_mmhub = {
+ .name = "mmhub",
+ .ras_block_id = RAS_BLOCK_ID__MMHUB,
+ .hwip = ACA_ECC_HWIP__SMU,
+ .mask = ACA_ERROR__UE_MASK,
+ .bank_ops = {
+ .bank_match = aca_match_mmhub_bank,
+ .bank_parse = aca_parse_bank_default,
+ },
+};
+
+static const struct aca_block_info aca_v1_0_xgmi = {
+ .name = "xgmi",
+ .ras_block_id = RAS_BLOCK_ID__XGMI_WAFL,
+ .hwip = ACA_ECC_HWIP__PCS_XGMI,
+ .mask = ACA_ERROR__UE_MASK | ACA_ERROR__CE_MASK,
+ .bank_ops = {
+ .bank_match = aca_match_bank_default,
+ .bank_parse = aca_parse_xgmi_bank,
+ },
+};
+
+static const struct aca_block_info *aca_block_info_v1_0[] = {
+ &aca_v1_0_umc,
+ &aca_v1_0_gfx,
+ &aca_v1_0_sdma,
+ &aca_v1_0_mmhub,
+ &aca_v1_0_xgmi,
+};
+
+const struct ras_aca_ip_func ras_aca_func_v1_0 = {
+ .block_num = ARRAY_SIZE(aca_block_info_v1_0),
+ .block_info = aca_block_info_v1_0,
+};
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_aca_v1_0.h b/drivers/gpu/drm/amd/ras/rascore/ras_aca_v1_0.h
new file mode 100644
index 000000000000..40e5d94b037f
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_aca_v1_0.h
@@ -0,0 +1,71 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __RAS_ACA_V1_0_H__
+#define __RAS_ACA_V1_0_H__
+#include "ras.h"
+
+#define ACA__REG__FIELD(x, h, l) (((x) & GENMASK_ULL(h, l)) >> l)
+#define ACA_REG_STATUS_VAL(x) ACA__REG__FIELD(x, 63, 63)
+#define ACA_REG_STATUS_OVERFLOW(x) ACA__REG__FIELD(x, 62, 62)
+#define ACA_REG_STATUS_UC(x) ACA__REG__FIELD(x, 61, 61)
+#define ACA_REG_STATUS_EN(x) ACA__REG__FIELD(x, 60, 60)
+#define ACA_REG_STATUS_MISCV(x) ACA__REG__FIELD(x, 59, 59)
+#define ACA_REG_STATUS_ADDRV(x) ACA__REG__FIELD(x, 58, 58)
+#define ACA_REG_STATUS_PCC(x) ACA__REG__FIELD(x, 57, 57)
+#define ACA_REG_STATUS_ERRCOREIDVAL(x) ACA__REG__FIELD(x, 56, 56)
+#define ACA_REG_STATUS_TCC(x) ACA__REG__FIELD(x, 55, 55)
+#define ACA_REG_STATUS_SYNDV(x) ACA__REG__FIELD(x, 53, 53)
+#define ACA_REG_STATUS_CECC(x) ACA__REG__FIELD(x, 46, 46)
+#define ACA_REG_STATUS_UECC(x) ACA__REG__FIELD(x, 45, 45)
+#define ACA_REG_STATUS_DEFERRED(x) ACA__REG__FIELD(x, 44, 44)
+#define ACA_REG_STATUS_POISON(x) ACA__REG__FIELD(x, 43, 43)
+#define ACA_REG_STATUS_SCRUB(x) ACA__REG__FIELD(x, 40, 40)
+#define ACA_REG_STATUS_ERRCOREID(x) ACA__REG__FIELD(x, 37, 32)
+#define ACA_REG_STATUS_ADDRLSB(x) ACA__REG__FIELD(x, 29, 24)
+#define ACA_REG_STATUS_ERRORCODEEXT(x) ACA__REG__FIELD(x, 21, 16)
+#define ACA_REG_STATUS_ERRORCODE(x) ACA__REG__FIELD(x, 15, 0)
+
+#define ACA_REG_IPID_MCATYPE(x) ACA__REG__FIELD(x, 63, 48)
+#define ACA_REG_IPID_INSTANCEIDHI(x) ACA__REG__FIELD(x, 47, 44)
+#define ACA_REG_IPID_HARDWAREID(x) ACA__REG__FIELD(x, 43, 32)
+#define ACA_REG_IPID_INSTANCEIDLO(x) ACA__REG__FIELD(x, 31, 0)
+
+#define ACA_REG_MISC0_VALID(x) ACA__REG__FIELD(x, 63, 63)
+#define ACA_REG_MISC0_OVRFLW(x) ACA__REG__FIELD(x, 48, 48)
+#define ACA_REG_MISC0_ERRCNT(x) ACA__REG__FIELD(x, 43, 32)
+
+#define ACA_REG_SYND_ERRORINFORMATION(x) ACA__REG__FIELD(x, 17, 0)
+
+/* NOTE: The following codes refers to the smu header file */
+#define ACA_EXTERROR_CODE_CE 0x3a
+#define ACA_EXTERROR_CODE_FAULT 0x3b
+
+#define mmSMNAID_XCD0_MCA_SMU 0x36430400 /* SMN AID XCD0 */
+#define mmSMNAID_XCD1_MCA_SMU 0x38430400 /* SMN AID XCD1 */
+#define mmSMNXCD_XCD0_MCA_SMU 0x40430400 /* SMN XCD XCD0 */
+#define mmSMNAID_AID0_MCA_SMU 0x03b30400 /* SMN AID AID0 */
+
+extern const struct ras_aca_ip_func ras_aca_func_v1_0;
+#endif
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_cmd.c b/drivers/gpu/drm/amd/ras/rascore/ras_cmd.c
new file mode 100644
index 000000000000..94e6d7420d94
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_cmd.c
@@ -0,0 +1,522 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#include "ras.h"
+#include "ras_cmd.h"
+
+#define RAS_CMD_MAJOR_VERSION 6
+#define RAS_CMD_MINOR_VERSION 0
+#define RAS_CMD_VERSION (((RAS_CMD_MAJOR_VERSION) << 10) | (RAS_CMD_MINOR_VERSION))
+
+static int ras_cmd_add_device(struct ras_core_context *ras_core)
+{
+ INIT_LIST_HEAD(&ras_core->ras_cmd.head);
+ ras_core->ras_cmd.ras_core = ras_core;
+ ras_core->ras_cmd.dev_handle = (uintptr_t)ras_core ^ RAS_CMD_DEV_HANDLE_MAGIC;
+ return 0;
+}
+
+static int ras_cmd_remove_device(struct ras_core_context *ras_core)
+{
+ memset(&ras_core->ras_cmd, 0, sizeof(ras_core->ras_cmd));
+ return 0;
+}
+
+static int ras_get_block_ecc_info(struct ras_core_context *ras_core,
+ struct ras_cmd_ctx *cmd, void *data)
+{
+ struct ras_cmd_block_ecc_info_req *input_data =
+ (struct ras_cmd_block_ecc_info_req *)cmd->input_buff_raw;
+ struct ras_cmd_block_ecc_info_rsp *output_data =
+ (struct ras_cmd_block_ecc_info_rsp *)cmd->output_buff_raw;
+ struct ras_ecc_count err_data;
+ int ret;
+
+ if (cmd->input_size != sizeof(struct ras_cmd_block_ecc_info_req))
+ return RAS_CMD__ERROR_INVALID_INPUT_SIZE;
+
+ memset(&err_data, 0, sizeof(err_data));
+ ret = ras_aca_get_block_ecc_count(ras_core, input_data->block_id, &err_data);
+ if (ret)
+ return RAS_CMD__ERROR_GENERIC;
+
+ output_data->ce_count = err_data.total_ce_count;
+ output_data->ue_count = err_data.total_ue_count;
+ output_data->de_count = err_data.total_de_count;
+
+ cmd->output_size = sizeof(struct ras_cmd_block_ecc_info_rsp);
+ return RAS_CMD__SUCCESS;
+}
+
+static void ras_cmd_update_bad_page_info(struct ras_cmd_bad_page_record *ras_cmd_record,
+ struct eeprom_umc_record *record)
+{
+ ras_cmd_record->retired_page = record->cur_nps_retired_row_pfn;
+ ras_cmd_record->ts = record->ts;
+ ras_cmd_record->err_type = record->err_type;
+ ras_cmd_record->mem_channel = record->mem_channel;
+ ras_cmd_record->mcumc_id = record->mcumc_id;
+ ras_cmd_record->address = record->address;
+ ras_cmd_record->bank = record->bank;
+ ras_cmd_record->valid = 1;
+}
+
+static int ras_cmd_get_group_bad_pages(struct ras_core_context *ras_core,
+ uint32_t group_index, struct ras_cmd_bad_pages_info_rsp *output_data)
+{
+ struct eeprom_umc_record record;
+ struct ras_cmd_bad_page_record *ras_cmd_record;
+ uint32_t i = 0, bp_cnt = 0, group_cnt = 0;
+
+ output_data->bp_in_group = 0;
+ output_data->group_index = 0;
+
+ bp_cnt = ras_umc_get_badpage_count(ras_core);
+ if (bp_cnt) {
+ output_data->group_index = group_index;
+ group_cnt = bp_cnt / RAS_CMD_MAX_BAD_PAGES_PER_GROUP
+ + ((bp_cnt % RAS_CMD_MAX_BAD_PAGES_PER_GROUP) ? 1 : 0);
+
+ if (group_index >= group_cnt)
+ return RAS_CMD__ERROR_INVALID_INPUT_DATA;
+
+ i = group_index * RAS_CMD_MAX_BAD_PAGES_PER_GROUP;
+ for (;
+ i < bp_cnt && output_data->bp_in_group < RAS_CMD_MAX_BAD_PAGES_PER_GROUP;
+ i++) {
+ if (ras_umc_get_badpage_record(ras_core, i, &record))
+ return RAS_CMD__ERROR_GENERIC;
+
+ ras_cmd_record = &output_data->records[i % RAS_CMD_MAX_BAD_PAGES_PER_GROUP];
+
+ memset(ras_cmd_record, 0, sizeof(*ras_cmd_record));
+ ras_cmd_update_bad_page_info(ras_cmd_record, &record);
+ output_data->bp_in_group++;
+ }
+ }
+ output_data->bp_total_cnt = bp_cnt;
+ return RAS_CMD__SUCCESS;
+}
+
+static int ras_cmd_get_bad_pages(struct ras_core_context *ras_core,
+ struct ras_cmd_ctx *cmd, void *data)
+{
+ struct ras_cmd_bad_pages_info_req *input_data =
+ (struct ras_cmd_bad_pages_info_req *)cmd->input_buff_raw;
+ struct ras_cmd_bad_pages_info_rsp *output_data =
+ (struct ras_cmd_bad_pages_info_rsp *)cmd->output_buff_raw;
+ int ret;
+
+ if (cmd->input_size != sizeof(struct ras_cmd_bad_pages_info_req))
+ return RAS_CMD__ERROR_INVALID_INPUT_SIZE;
+
+ ret = ras_cmd_get_group_bad_pages(ras_core, input_data->group_index, output_data);
+ if (ret)
+ return RAS_CMD__ERROR_GENERIC;
+
+ output_data->version = 0;
+
+ cmd->output_size = sizeof(struct ras_cmd_bad_pages_info_rsp);
+ return RAS_CMD__SUCCESS;
+}
+
+static int ras_cmd_clear_bad_page_info(struct ras_core_context *ras_core,
+ struct ras_cmd_ctx *cmd, void *data)
+{
+ if (cmd->input_size != sizeof(struct ras_cmd_dev_handle))
+ return RAS_CMD__ERROR_INVALID_INPUT_SIZE;
+
+ if (ras_eeprom_reset_table(ras_core))
+ return RAS_CMD__ERROR_GENERIC;
+
+ if (ras_umc_clean_badpage_data(ras_core))
+ return RAS_CMD__ERROR_GENERIC;
+
+ return RAS_CMD__SUCCESS;
+}
+
+static int ras_cmd_reset_all_error_counts(struct ras_core_context *ras_core,
+ struct ras_cmd_ctx *cmd, void *data)
+{
+ if (cmd->input_size != sizeof(struct ras_cmd_dev_handle))
+ return RAS_CMD__ERROR_INVALID_INPUT_SIZE;
+
+ if (ras_aca_clear_all_blocks_ecc_count(ras_core))
+ return RAS_CMD__ERROR_GENERIC;
+
+ if (ras_umc_clear_logged_ecc(ras_core))
+ return RAS_CMD__ERROR_GENERIC;
+
+ return RAS_CMD__SUCCESS;
+}
+
+static int ras_cmd_get_cper_snapshot(struct ras_core_context *ras_core,
+ struct ras_cmd_ctx *cmd, void *data)
+{
+ struct ras_cmd_cper_snapshot_rsp *output_data =
+ (struct ras_cmd_cper_snapshot_rsp *)cmd->output_buff_raw;
+ struct ras_log_batch_overview overview;
+
+ if (cmd->input_size != sizeof(struct ras_cmd_cper_snapshot_req))
+ return RAS_CMD__ERROR_INVALID_INPUT_SIZE;
+
+ ras_log_ring_get_batch_overview(ras_core, &overview);
+
+ output_data->total_cper_num = overview.logged_batch_count;
+ output_data->start_cper_id = overview.first_batch_id;
+ output_data->latest_cper_id = overview.last_batch_id;
+
+ output_data->version = 0;
+
+ cmd->output_size = sizeof(struct ras_cmd_cper_snapshot_rsp);
+ return RAS_CMD__SUCCESS;
+}
+
+static int ras_cmd_get_cper_records(struct ras_core_context *ras_core,
+ struct ras_cmd_ctx *cmd, void *data)
+{
+ struct ras_cmd_cper_record_req *req =
+ (struct ras_cmd_cper_record_req *)cmd->input_buff_raw;
+ struct ras_cmd_cper_record_rsp *rsp =
+ (struct ras_cmd_cper_record_rsp *)cmd->output_buff_raw;
+ struct ras_log_info *trace[MAX_RECORD_PER_BATCH] = {0};
+ struct ras_log_batch_overview overview;
+ uint32_t offset = 0, real_data_len = 0;
+ uint64_t batch_id;
+ uint8_t *buffer;
+ int ret = 0, i, count;
+
+ if (cmd->input_size != sizeof(struct ras_cmd_cper_record_req))
+ return RAS_CMD__ERROR_INVALID_INPUT_SIZE;
+
+ if (!req->buf_size || !req->buf_ptr || !req->cper_num)
+ return RAS_CMD__ERROR_INVALID_INPUT_DATA;
+
+ buffer = kzalloc(req->buf_size, GFP_KERNEL);
+ if (!buffer)
+ return RAS_CMD__ERROR_GENERIC;
+
+ ras_log_ring_get_batch_overview(ras_core, &overview);
+ for (i = 0; i < req->cper_num; i++) {
+ batch_id = req->cper_start_id + i;
+ if (batch_id >= overview.last_batch_id)
+ break;
+
+ count = ras_log_ring_get_batch_records(ras_core, batch_id, trace,
+ ARRAY_SIZE(trace));
+ if (count > 0) {
+ ret = ras_cper_generate_cper(ras_core, trace, count,
+ &buffer[offset], req->buf_size - offset, &real_data_len);
+ if (ret)
+ break;
+
+ offset += real_data_len;
+ }
+ }
+
+ if ((ret && (ret != -ENOMEM)) ||
+ copy_to_user(u64_to_user_ptr(req->buf_ptr), buffer, offset)) {
+ kfree(buffer);
+ return RAS_CMD__ERROR_GENERIC;
+ }
+
+ rsp->real_data_size = offset;
+ rsp->real_cper_num = i;
+ rsp->remain_num = (ret == -ENOMEM) ? (req->cper_num - i) : 0;
+ rsp->version = 0;
+
+ cmd->output_size = sizeof(struct ras_cmd_cper_record_rsp);
+
+ kfree(buffer);
+
+ return RAS_CMD__SUCCESS;
+}
+
+static int ras_cmd_get_batch_trace_snapshot(struct ras_core_context *ras_core,
+ struct ras_cmd_ctx *cmd, void *data)
+{
+ struct ras_cmd_batch_trace_snapshot_rsp *rsp =
+ (struct ras_cmd_batch_trace_snapshot_rsp *)cmd->output_buff_raw;
+ struct ras_log_batch_overview overview;
+
+
+ if (cmd->input_size != sizeof(struct ras_cmd_batch_trace_snapshot_req))
+ return RAS_CMD__ERROR_INVALID_INPUT_SIZE;
+
+ ras_log_ring_get_batch_overview(ras_core, &overview);
+
+ rsp->total_batch_num = overview.logged_batch_count;
+ rsp->start_batch_id = overview.first_batch_id;
+ rsp->latest_batch_id = overview.last_batch_id;
+ rsp->version = 0;
+
+ cmd->output_size = sizeof(struct ras_cmd_batch_trace_snapshot_rsp);
+ return RAS_CMD__SUCCESS;
+}
+
+static int ras_cmd_get_batch_trace_records(struct ras_core_context *ras_core,
+ struct ras_cmd_ctx *cmd, void *data)
+{
+ struct ras_cmd_batch_trace_record_req *input_data =
+ (struct ras_cmd_batch_trace_record_req *)cmd->input_buff_raw;
+ struct ras_cmd_batch_trace_record_rsp *output_data =
+ (struct ras_cmd_batch_trace_record_rsp *)cmd->output_buff_raw;
+ struct ras_log_batch_overview overview;
+ struct ras_log_info *trace_arry[MAX_RECORD_PER_BATCH] = {0};
+ struct ras_log_info *record;
+ int i, j, count = 0, offset = 0;
+ uint64_t id;
+ bool completed = false;
+
+ if (cmd->input_size != sizeof(struct ras_cmd_batch_trace_record_req))
+ return RAS_CMD__ERROR_INVALID_INPUT_SIZE;
+
+ if ((!input_data->batch_num) || (input_data->batch_num > RAS_CMD_MAX_BATCH_NUM))
+ return RAS_CMD__ERROR_INVALID_INPUT_DATA;
+
+ ras_log_ring_get_batch_overview(ras_core, &overview);
+ if ((input_data->start_batch_id < overview.first_batch_id) ||
+ (input_data->start_batch_id >= overview.last_batch_id))
+ return RAS_CMD__ERROR_INVALID_INPUT_SIZE;
+
+ for (i = 0; i < input_data->batch_num; i++) {
+ id = input_data->start_batch_id + i;
+ if (id >= overview.last_batch_id) {
+ completed = true;
+ break;
+ }
+
+ count = ras_log_ring_get_batch_records(ras_core,
+ id, trace_arry, ARRAY_SIZE(trace_arry));
+ if (count > 0) {
+ if ((offset + count) > RAS_CMD_MAX_TRACE_NUM)
+ break;
+ for (j = 0; j < count; j++) {
+ record = &output_data->records[offset + j];
+ record->seqno = trace_arry[j]->seqno;
+ record->timestamp = trace_arry[j]->timestamp;
+ record->event = trace_arry[j]->event;
+ memcpy(&record->aca_reg,
+ &trace_arry[j]->aca_reg, sizeof(trace_arry[j]->aca_reg));
+ }
+ } else {
+ count = 0;
+ }
+
+ output_data->batchs[i].batch_id = id;
+ output_data->batchs[i].offset = offset;
+ output_data->batchs[i].trace_num = count;
+ offset += count;
+ }
+
+ output_data->start_batch_id = input_data->start_batch_id;
+ output_data->real_batch_num = i;
+ output_data->remain_num = completed ? 0 : (input_data->batch_num - i);
+ output_data->version = 0;
+
+ cmd->output_size = sizeof(struct ras_cmd_batch_trace_record_rsp);
+
+ return RAS_CMD__SUCCESS;
+}
+
+static enum ras_ta_block __get_ras_ta_block(enum ras_block_id block)
+{
+ switch (block) {
+ case RAS_BLOCK_ID__UMC:
+ return RAS_TA_BLOCK__UMC;
+ case RAS_BLOCK_ID__SDMA:
+ return RAS_TA_BLOCK__SDMA;
+ case RAS_BLOCK_ID__GFX:
+ return RAS_TA_BLOCK__GFX;
+ case RAS_BLOCK_ID__MMHUB:
+ return RAS_TA_BLOCK__MMHUB;
+ case RAS_BLOCK_ID__ATHUB:
+ return RAS_TA_BLOCK__ATHUB;
+ case RAS_BLOCK_ID__PCIE_BIF:
+ return RAS_TA_BLOCK__PCIE_BIF;
+ case RAS_BLOCK_ID__HDP:
+ return RAS_TA_BLOCK__HDP;
+ case RAS_BLOCK_ID__XGMI_WAFL:
+ return RAS_TA_BLOCK__XGMI_WAFL;
+ case RAS_BLOCK_ID__DF:
+ return RAS_TA_BLOCK__DF;
+ case RAS_BLOCK_ID__SMN:
+ return RAS_TA_BLOCK__SMN;
+ case RAS_BLOCK_ID__SEM:
+ return RAS_TA_BLOCK__SEM;
+ case RAS_BLOCK_ID__MP0:
+ return RAS_TA_BLOCK__MP0;
+ case RAS_BLOCK_ID__MP1:
+ return RAS_TA_BLOCK__MP1;
+ case RAS_BLOCK_ID__FUSE:
+ return RAS_TA_BLOCK__FUSE;
+ case RAS_BLOCK_ID__MCA:
+ return RAS_TA_BLOCK__MCA;
+ case RAS_BLOCK_ID__VCN:
+ return RAS_TA_BLOCK__VCN;
+ case RAS_BLOCK_ID__JPEG:
+ return RAS_TA_BLOCK__JPEG;
+ default:
+ return RAS_TA_BLOCK__UMC;
+ }
+}
+
+static enum ras_ta_error_type __get_ras_ta_err_type(enum ras_ecc_err_type error)
+{
+ switch (error) {
+ case RAS_ECC_ERR__NONE:
+ return RAS_TA_ERROR__NONE;
+ case RAS_ECC_ERR__PARITY:
+ return RAS_TA_ERROR__PARITY;
+ case RAS_ECC_ERR__SINGLE_CORRECTABLE:
+ return RAS_TA_ERROR__SINGLE_CORRECTABLE;
+ case RAS_ECC_ERR__MULTI_UNCORRECTABLE:
+ return RAS_TA_ERROR__MULTI_UNCORRECTABLE;
+ case RAS_ECC_ERR__POISON:
+ return RAS_TA_ERROR__POISON;
+ default:
+ return RAS_TA_ERROR__NONE;
+ }
+}
+
+static int ras_cmd_inject_error(struct ras_core_context *ras_core,
+ struct ras_cmd_ctx *cmd, void *data)
+{
+ struct ras_cmd_inject_error_req *req =
+ (struct ras_cmd_inject_error_req *)cmd->input_buff_raw;
+ struct ras_cmd_inject_error_rsp *output_data =
+ (struct ras_cmd_inject_error_rsp *)cmd->output_buff_raw;
+ int ret = 0;
+ struct ras_ta_trigger_error_input block_info = {
+ .block_id = __get_ras_ta_block(req->block_id),
+ .sub_block_index = req->subblock_id,
+ .inject_error_type = __get_ras_ta_err_type(req->error_type),
+ .address = req->address,
+ .value = req->method,
+ };
+
+ ret = ras_psp_trigger_error(ras_core, &block_info, req->instance_mask);
+ if (!ret) {
+ output_data->version = 0;
+ output_data->address = block_info.address;
+ cmd->output_size = sizeof(struct ras_cmd_inject_error_rsp);
+ } else {
+ RAS_DEV_ERR(ras_core->dev, "ras inject block %u failed %d\n", req->block_id, ret);
+ ret = RAS_CMD__ERROR_ACCESS_DENIED;
+ }
+
+ return ret;
+}
+
+static struct ras_cmd_func_map ras_cmd_maps[] = {
+ {RAS_CMD__INJECT_ERROR, ras_cmd_inject_error},
+ {RAS_CMD__GET_BLOCK_ECC_STATUS, ras_get_block_ecc_info},
+ {RAS_CMD__GET_BAD_PAGES, ras_cmd_get_bad_pages},
+ {RAS_CMD__CLEAR_BAD_PAGE_INFO, ras_cmd_clear_bad_page_info},
+ {RAS_CMD__RESET_ALL_ERROR_COUNTS, ras_cmd_reset_all_error_counts},
+ {RAS_CMD__GET_CPER_SNAPSHOT, ras_cmd_get_cper_snapshot},
+ {RAS_CMD__GET_CPER_RECORD, ras_cmd_get_cper_records},
+ {RAS_CMD__GET_BATCH_TRACE_SNAPSHOT, ras_cmd_get_batch_trace_snapshot},
+ {RAS_CMD__GET_BATCH_TRACE_RECORD, ras_cmd_get_batch_trace_records},
+};
+
+int rascore_handle_cmd(struct ras_core_context *ras_core,
+ struct ras_cmd_ctx *cmd, void *data)
+{
+ struct ras_cmd_func_map *ras_cmd = NULL;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(ras_cmd_maps); i++) {
+ if (cmd->cmd_id == ras_cmd_maps[i].cmd_id) {
+ ras_cmd = &ras_cmd_maps[i];
+ break;
+ }
+ }
+
+ if (!ras_cmd)
+ return RAS_CMD__ERROR_UKNOWN_CMD;
+
+ return ras_cmd->func(ras_core, cmd, data);
+}
+
+int ras_cmd_init(struct ras_core_context *ras_core)
+{
+ return ras_cmd_add_device(ras_core);
+}
+
+int ras_cmd_fini(struct ras_core_context *ras_core)
+{
+ ras_cmd_remove_device(ras_core);
+ return 0;
+}
+
+int ras_cmd_query_interface_info(struct ras_core_context *ras_core,
+ struct ras_query_interface_info_rsp *rsp)
+{
+ rsp->ras_cmd_major_ver = RAS_CMD_MAJOR_VERSION;
+ rsp->ras_cmd_minor_ver = RAS_CMD_MINOR_VERSION;
+
+ return 0;
+}
+
+int ras_cmd_translate_soc_pa_to_bank(struct ras_core_context *ras_core,
+ uint64_t soc_pa, struct ras_fb_bank_addr *bank_addr)
+{
+ struct umc_bank_addr umc_bank = {0};
+ int ret;
+
+ ret = ras_umc_translate_soc_pa_and_bank(ras_core, &soc_pa, &umc_bank, false);
+ if (ret)
+ return RAS_CMD__ERROR_GENERIC;
+
+ bank_addr->stack_id = umc_bank.stack_id;
+ bank_addr->bank_group = umc_bank.bank_group;
+ bank_addr->bank = umc_bank.bank;
+ bank_addr->row = umc_bank.row;
+ bank_addr->column = umc_bank.column;
+ bank_addr->channel = umc_bank.channel;
+ bank_addr->subchannel = umc_bank.subchannel;
+
+ return 0;
+}
+
+int ras_cmd_translate_bank_to_soc_pa(struct ras_core_context *ras_core,
+ struct ras_fb_bank_addr bank_addr, uint64_t *soc_pa)
+{
+ struct umc_bank_addr umc_bank = {0};
+
+ umc_bank.stack_id = bank_addr.stack_id;
+ umc_bank.bank_group = bank_addr.bank_group;
+ umc_bank.bank = bank_addr.bank;
+ umc_bank.row = bank_addr.row;
+ umc_bank.column = bank_addr.column;
+ umc_bank.channel = bank_addr.channel;
+ umc_bank.subchannel = bank_addr.subchannel;
+
+ return ras_umc_translate_soc_pa_and_bank(ras_core, soc_pa, &umc_bank, true);
+}
+
+uint64_t ras_cmd_get_dev_handle(struct ras_core_context *ras_core)
+{
+ return ras_core->ras_cmd.dev_handle;
+}
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_cmd.h b/drivers/gpu/drm/amd/ras/rascore/ras_cmd.h
new file mode 100644
index 000000000000..48a0715eb821
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_cmd.h
@@ -0,0 +1,426 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __RAS_CMD_H__
+#define __RAS_CMD_H__
+#include "ras.h"
+#include "ras_eeprom.h"
+#include "ras_log_ring.h"
+#include "ras_cper.h"
+
+#define RAS_CMD_DEV_HANDLE_MAGIC 0xFEEDAD00UL
+
+#define RAS_CMD_MAX_IN_SIZE 256
+#define RAS_CMD_MAX_GPU_NUM 32
+#define RAS_CMD_MAX_BAD_PAGES_PER_GROUP 32
+
+/* position of instance value in sub_block_index of
+ * ta_ras_trigger_error_input, the sub block uses lower 12 bits
+ */
+#define RAS_TA_INST_MASK 0xfffff000
+#define RAS_TA_INST_SHIFT 0xc
+
+enum ras_cmd_interface_type {
+ RAS_CMD_INTERFACE_TYPE_NONE,
+ RAS_CMD_INTERFACE_TYPE_AMDGPU,
+ RAS_CMD_INTERFACE_TYPE_VF,
+ RAS_CMD_INTERFACE_TYPE_PF,
+};
+
+enum ras_cmd_id_range {
+ RAS_CMD_ID_COMMON_START = 0,
+ RAS_CMD_ID_COMMON_END = 0x10000,
+ RAS_CMD_ID_AMDGPU_START = RAS_CMD_ID_COMMON_END,
+ RAS_CMD_ID_AMDGPU_END = 0x20000,
+ RAS_CMD_ID_MXGPU_START = RAS_CMD_ID_AMDGPU_END,
+ RAS_CMD_ID_MXGPU_END = 0x30000,
+ RAS_CMD_ID_MXGPU_VF_START = RAS_CMD_ID_MXGPU_END,
+ RAS_CMD_ID_MXGPU_VF_END = 0x40000,
+};
+
+enum ras_cmd_id {
+ RAS_CMD__BEGIN = RAS_CMD_ID_COMMON_START,
+ RAS_CMD__QUERY_INTERFACE_INFO,
+ RAS_CMD__GET_DEVICES_INFO,
+ RAS_CMD__GET_BLOCK_ECC_STATUS,
+ RAS_CMD__INJECT_ERROR,
+ RAS_CMD__GET_BAD_PAGES,
+ RAS_CMD__CLEAR_BAD_PAGE_INFO,
+ RAS_CMD__RESET_ALL_ERROR_COUNTS,
+ RAS_CMD__GET_SAFE_FB_ADDRESS_RANGES,
+ RAS_CMD__TRANSLATE_FB_ADDRESS,
+ RAS_CMD__GET_LINK_TOPOLOGY,
+ RAS_CMD__GET_CPER_SNAPSHOT,
+ RAS_CMD__GET_CPER_RECORD,
+ RAS_CMD__GET_BATCH_TRACE_SNAPSHOT,
+ RAS_CMD__GET_BATCH_TRACE_RECORD,
+ RAS_CMD__SUPPORTED_MAX = RAS_CMD_ID_COMMON_END,
+};
+
+enum ras_cmd_response {
+ RAS_CMD__SUCCESS = 0,
+ RAS_CMD__SUCCESS_EXEED_BUFFER,
+ RAS_CMD__ERROR_UKNOWN_CMD,
+ RAS_CMD__ERROR_INVALID_CMD,
+ RAS_CMD__ERROR_VERSION,
+ RAS_CMD__ERROR_INVALID_INPUT_SIZE,
+ RAS_CMD__ERROR_INVALID_INPUT_DATA,
+ RAS_CMD__ERROR_DRV_INIT_FAIL,
+ RAS_CMD__ERROR_ACCESS_DENIED,
+ RAS_CMD__ERROR_GENERIC,
+ RAS_CMD__ERROR_TIMEOUT,
+};
+
+enum ras_error_type {
+ RAS_TYPE_ERROR__NONE = 0,
+ RAS_TYPE_ERROR__PARITY = 1,
+ RAS_TYPE_ERROR__SINGLE_CORRECTABLE = 2,
+ RAS_TYPE_ERROR__MULTI_UNCORRECTABLE = 4,
+ RAS_TYPE_ERROR__POISON = 8,
+};
+
+struct ras_core_context;
+struct ras_cmd_ctx;
+
+struct ras_cmd_mgr {
+ struct list_head head;
+ struct ras_core_context *ras_core;
+ uint64_t dev_handle;
+};
+
+struct ras_cmd_func_map {
+ uint32_t cmd_id;
+ int (*func)(struct ras_core_context *ras_core,
+ struct ras_cmd_ctx *cmd, void *data);
+};
+
+struct ras_device_bdf {
+ union {
+ struct {
+ uint32_t function : 3;
+ uint32_t device : 5;
+ uint32_t bus : 8;
+ uint32_t domain : 16;
+ };
+ uint32_t u32_all;
+ };
+};
+
+struct ras_cmd_param {
+ uint32_t idx_vf;
+ void *data;
+};
+
+#pragma pack(push, 8)
+struct ras_cmd_ctx {
+ uint32_t magic;
+ union {
+ struct {
+ uint16_t ras_cmd_minor_ver : 10;
+ uint16_t ras_cmd_major_ver : 6;
+ };
+ uint16_t ras_cmd_ver;
+ };
+ union {
+ struct {
+ uint16_t plat_major_ver : 10;
+ uint16_t plat_minor_ver : 6;
+ };
+ uint16_t plat_ver;
+ };
+ uint32_t cmd_id;
+ uint32_t cmd_res;
+ uint32_t input_size;
+ uint32_t output_size;
+ uint32_t output_buf_size;
+ uint32_t reserved[5];
+ uint8_t input_buff_raw[RAS_CMD_MAX_IN_SIZE];
+ uint8_t output_buff_raw[];
+};
+
+struct ras_cmd_dev_handle {
+ uint64_t dev_handle;
+};
+
+struct ras_cmd_block_ecc_info_req {
+ struct ras_cmd_dev_handle dev;
+ uint32_t block_id;
+ uint32_t subblock_id;
+ uint32_t reserved[4];
+};
+
+struct ras_cmd_block_ecc_info_rsp {
+ uint32_t version;
+ uint32_t ce_count;
+ uint32_t ue_count;
+ uint32_t de_count;
+ uint32_t reserved[6];
+};
+
+struct ras_cmd_inject_error_req {
+ struct ras_cmd_dev_handle dev;
+ uint32_t block_id;
+ uint32_t subblock_id;
+ uint64_t address;
+ uint32_t error_type;
+ uint32_t instance_mask;
+ union {
+ struct {
+ /* vf index */
+ uint64_t vf_idx : 6;
+ /* method of error injection. i.e persistent, coherent etc */
+ uint64_t method : 10;
+ uint64_t rsv : 48;
+ };
+ uint64_t value;
+ };
+ uint32_t reserved[8];
+};
+
+struct ras_cmd_inject_error_rsp {
+ uint32_t version;
+ uint32_t reserved[5];
+ uint64_t address;
+};
+
+struct ras_cmd_dev_info {
+ uint64_t dev_handle;
+ uint32_t location_id;
+ uint32_t ecc_enabled;
+ uint32_t ecc_supported;
+ uint32_t vf_num;
+ uint32_t asic_type;
+ uint32_t oam_id;
+ uint32_t reserved[8];
+};
+
+struct ras_cmd_devices_info_rsp {
+ uint32_t version;
+ uint32_t dev_num;
+ uint32_t reserved[6];
+ struct ras_cmd_dev_info devs[RAS_CMD_MAX_GPU_NUM];
+};
+
+struct ras_cmd_bad_page_record {
+ union {
+ uint64_t address;
+ uint64_t offset;
+ };
+ uint64_t retired_page;
+ uint64_t ts;
+
+ uint32_t err_type;
+
+ union {
+ unsigned char bank;
+ unsigned char cu;
+ };
+
+ unsigned char mem_channel;
+ unsigned char mcumc_id;
+
+ unsigned char valid;
+ unsigned char reserved[8];
+};
+
+struct ras_cmd_bad_pages_info_req {
+ struct ras_cmd_dev_handle device;
+ uint32_t group_index;
+ uint32_t reserved[5];
+};
+
+struct ras_cmd_bad_pages_info_rsp {
+ uint32_t version;
+ uint32_t group_index;
+ uint32_t bp_in_group;
+ uint32_t bp_total_cnt;
+ uint32_t reserved[4];
+ struct ras_cmd_bad_page_record records[RAS_CMD_MAX_BAD_PAGES_PER_GROUP];
+};
+
+struct ras_query_interface_info_req {
+ uint32_t reserved[8];
+};
+
+struct ras_query_interface_info_rsp {
+ uint32_t version;
+ uint32_t ras_cmd_major_ver;
+ uint32_t ras_cmd_minor_ver;
+ uint32_t plat_major_ver;
+ uint32_t plat_minor_ver;
+ uint8_t interface_type;
+ uint8_t rsv[3];
+ uint32_t reserved[8];
+};
+
+#define RAS_MAX_NUM_SAFE_RANGES 64
+struct ras_cmd_ras_safe_fb_address_ranges_rsp {
+ uint32_t version;
+ uint32_t num_ranges;
+ uint32_t reserved[4];
+ struct {
+ uint64_t start;
+ uint64_t size;
+ uint32_t idx;
+ uint32_t reserved[3];
+ } range[RAS_MAX_NUM_SAFE_RANGES];
+};
+
+enum ras_fb_addr_type {
+ RAS_FB_ADDR_SOC_PHY, /* SPA */
+ RAS_FB_ADDR_BANK,
+ RAS_FB_ADDR_VF_PHY, /* GPA */
+ RAS_FB_ADDR_UNKNOWN
+};
+
+struct ras_fb_bank_addr {
+ uint32_t stack_id; /* SID */
+ uint32_t bank_group;
+ uint32_t bank;
+ uint32_t row;
+ uint32_t column;
+ uint32_t channel;
+ uint32_t subchannel; /* Also called Pseudochannel (PC) */
+ uint32_t reserved[3];
+};
+
+struct ras_fb_vf_phy_addr {
+ uint32_t vf_idx;
+ uint32_t reserved;
+ uint64_t addr;
+};
+
+union ras_translate_fb_address {
+ struct ras_fb_bank_addr bank_addr;
+ uint64_t soc_phy_addr;
+ struct ras_fb_vf_phy_addr vf_phy_addr;
+};
+
+struct ras_cmd_translate_fb_address_req {
+ struct ras_cmd_dev_handle dev;
+ enum ras_fb_addr_type src_addr_type;
+ enum ras_fb_addr_type dest_addr_type;
+ union ras_translate_fb_address trans_addr;
+};
+
+struct ras_cmd_translate_fb_address_rsp {
+ uint32_t version;
+ uint32_t reserved[5];
+ union ras_translate_fb_address trans_addr;
+};
+
+struct ras_dev_link_topology_req {
+ struct ras_cmd_dev_handle src;
+ struct ras_cmd_dev_handle dst;
+};
+
+struct ras_dev_link_topology_rsp {
+ uint32_t version;
+ uint32_t link_status; /* HW status of the link */
+ uint32_t link_type; /* type of the link */
+ uint32_t num_hops; /* number of hops */
+ uint32_t reserved[8];
+};
+
+struct ras_cmd_cper_snapshot_req {
+ struct ras_cmd_dev_handle dev;
+};
+
+struct ras_cmd_cper_snapshot_rsp {
+ uint32_t version;
+ uint32_t reserved[4];
+ uint32_t total_cper_num;
+ uint64_t start_cper_id;
+ uint64_t latest_cper_id;
+};
+
+struct ras_cmd_cper_record_req {
+ struct ras_cmd_dev_handle dev;
+ uint64_t cper_start_id;
+ uint32_t cper_num;
+ uint32_t buf_size;
+ uint64_t buf_ptr;
+ uint32_t reserved[4];
+};
+
+struct ras_cmd_cper_record_rsp {
+ uint32_t version;
+ uint32_t real_data_size;
+ uint32_t real_cper_num;
+ uint32_t remain_num;
+ uint32_t reserved[4];
+};
+
+struct ras_cmd_batch_trace_snapshot_req {
+ struct ras_cmd_dev_handle dev;
+};
+
+struct ras_cmd_batch_trace_snapshot_rsp {
+ uint32_t version;
+ uint32_t reserved[4];
+ uint32_t total_batch_num;
+ uint64_t start_batch_id;
+ uint64_t latest_batch_id;
+};
+
+struct ras_cmd_batch_trace_record_req {
+ struct ras_cmd_dev_handle dev;
+ uint64_t start_batch_id;
+ uint32_t batch_num;
+ uint32_t reserved[5];
+};
+
+struct batch_ras_trace_info {
+ uint64_t batch_id;
+ uint16_t offset;
+ uint8_t trace_num;
+ uint8_t rsv;
+ uint32_t reserved;
+};
+
+#define RAS_CMD_MAX_BATCH_NUM 300
+#define RAS_CMD_MAX_TRACE_NUM 300
+struct ras_cmd_batch_trace_record_rsp {
+ uint32_t version;
+ uint16_t real_batch_num;
+ uint16_t remain_num;
+ uint64_t start_batch_id;
+ uint32_t reserved[2];
+ struct batch_ras_trace_info batchs[RAS_CMD_MAX_BATCH_NUM];
+ struct ras_log_info records[RAS_CMD_MAX_TRACE_NUM];
+};
+
+#pragma pack(pop)
+
+int ras_cmd_init(struct ras_core_context *ras_core);
+int ras_cmd_fini(struct ras_core_context *ras_core);
+int rascore_handle_cmd(struct ras_core_context *ras_core, struct ras_cmd_ctx *cmd, void *data);
+uint64_t ras_cmd_get_dev_handle(struct ras_core_context *ras_core);
+int ras_cmd_query_interface_info(struct ras_core_context *ras_core,
+ struct ras_query_interface_info_rsp *rsp);
+int ras_cmd_translate_soc_pa_to_bank(struct ras_core_context *ras_core,
+ uint64_t soc_pa, struct ras_fb_bank_addr *bank_addr);
+int ras_cmd_translate_bank_to_soc_pa(struct ras_core_context *ras_core,
+ struct ras_fb_bank_addr bank_addr, uint64_t *soc_pa);
+#endif
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_core.c b/drivers/gpu/drm/amd/ras/rascore/ras_core.c
new file mode 100644
index 000000000000..01122b55c98a
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_core.c
@@ -0,0 +1,603 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#include "ras.h"
+#include "ras_core_status.h"
+
+#define RAS_SEQNO_FIFO_SIZE (128 * sizeof(uint64_t))
+
+#define IS_LEAP_YEAR(x) ((x % 4 == 0 && x % 100 != 0) || x % 400 == 0)
+
+static const char * const ras_block_name[] = {
+ "umc",
+ "sdma",
+ "gfx",
+ "mmhub",
+ "athub",
+ "pcie_bif",
+ "hdp",
+ "xgmi_wafl",
+ "df",
+ "smn",
+ "sem",
+ "mp0",
+ "mp1",
+ "fuse",
+ "mca",
+ "vcn",
+ "jpeg",
+ "ih",
+ "mpio",
+};
+
+const char *ras_core_get_ras_block_name(enum ras_block_id block_id)
+{
+ if (block_id >= ARRAY_SIZE(ras_block_name))
+ return "";
+
+ return ras_block_name[block_id];
+}
+
+int ras_core_convert_timestamp_to_time(struct ras_core_context *ras_core,
+ uint64_t timestamp, struct ras_time *tm)
+{
+ int days_in_month[] = {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31};
+ uint64_t month = 0, day = 0, hour = 0, minute = 0, second = 0;
+ uint32_t year = 0;
+ int seconds_per_day = 24 * 60 * 60;
+ int seconds_per_hour = 60 * 60;
+ int seconds_per_minute = 60;
+ int days, remaining_seconds;
+
+ days = div64_u64_rem(timestamp, seconds_per_day, (uint64_t *)&remaining_seconds);
+
+ /* utc_timestamp follows the Unix epoch */
+ year = 1970;
+ while (days >= 365) {
+ if (IS_LEAP_YEAR(year)) {
+ if (days < 366)
+ break;
+ days -= 366;
+ } else {
+ days -= 365;
+ }
+ year++;
+ }
+
+ days_in_month[1] += IS_LEAP_YEAR(year);
+
+ month = 0;
+ while (days >= days_in_month[month]) {
+ days -= days_in_month[month];
+ month++;
+ }
+ month++;
+ day = days + 1;
+
+ if (remaining_seconds) {
+ hour = remaining_seconds / seconds_per_hour;
+ minute = (remaining_seconds % seconds_per_hour) / seconds_per_minute;
+ second = remaining_seconds % seconds_per_minute;
+ }
+
+ tm->tm_year = year;
+ tm->tm_mon = month;
+ tm->tm_mday = day;
+ tm->tm_hour = hour;
+ tm->tm_min = minute;
+ tm->tm_sec = second;
+
+ return 0;
+}
+
+bool ras_core_gpu_in_reset(struct ras_core_context *ras_core)
+{
+ uint32_t status = 0;
+
+ if (ras_core->sys_fn &&
+ ras_core->sys_fn->check_gpu_status)
+ ras_core->sys_fn->check_gpu_status(ras_core, &status);
+
+ return (status & RAS_GPU_STATUS__IN_RESET) ? true : false;
+}
+
+bool ras_core_gpu_is_vf(struct ras_core_context *ras_core)
+{
+ uint32_t status = 0;
+
+ if (ras_core->sys_fn &&
+ ras_core->sys_fn->check_gpu_status)
+ ras_core->sys_fn->check_gpu_status(ras_core, &status);
+
+ return (status & RAS_GPU_STATUS__IS_VF) ? true : false;
+}
+
+bool ras_core_gpu_is_rma(struct ras_core_context *ras_core)
+{
+ if (!ras_core)
+ return false;
+
+ return ras_core->is_rma;
+}
+
+static int ras_core_seqno_fifo_write(struct ras_core_context *ras_core,
+ enum ras_seqno_fifo fifo_type, uint64_t seqno)
+{
+ int ret = 0;
+ struct kfifo *seqno_fifo = NULL;
+
+ if (fifo_type == SEQNO_FIFO_POISON_CREATION)
+ seqno_fifo = &ras_core->de_seqno_fifo;
+ else if (fifo_type == SEQNO_FIFO_POISON_CONSUMPTION)
+ seqno_fifo = &ras_core->consumption_seqno_fifo;
+
+ if (seqno_fifo)
+ ret = kfifo_in_spinlocked(seqno_fifo,
+ &seqno, sizeof(seqno), &ras_core->seqno_lock);
+
+ return ret ? 0 : -EINVAL;
+}
+
+static int ras_core_seqno_fifo_read(struct ras_core_context *ras_core,
+ enum ras_seqno_fifo fifo_type, uint64_t *seqno, bool pop)
+{
+ int ret = 0;
+ struct kfifo *seqno_fifo = NULL;
+
+ if (fifo_type == SEQNO_FIFO_POISON_CREATION)
+ seqno_fifo = &ras_core->de_seqno_fifo;
+ else if (fifo_type == SEQNO_FIFO_POISON_CONSUMPTION)
+ seqno_fifo = &ras_core->consumption_seqno_fifo;
+
+ if (seqno_fifo) {
+ if (pop)
+ ret = kfifo_out_spinlocked(seqno_fifo,
+ seqno, sizeof(*seqno), &ras_core->seqno_lock);
+ else
+ ret = kfifo_out_peek(seqno_fifo, seqno, sizeof(*seqno));
+ }
+
+ return ret ? 0 : -EINVAL;
+}
+
+uint64_t ras_core_gen_seqno(struct ras_core_context *ras_core,
+ enum ras_seqno_type type)
+{
+ uint64_t seqno = 0;
+
+ if (ras_core->sys_fn &&
+ ras_core->sys_fn->gen_seqno)
+ ras_core->sys_fn->gen_seqno(ras_core, type, &seqno);
+
+ return seqno;
+}
+
+int ras_core_put_seqno(struct ras_core_context *ras_core,
+ enum ras_seqno_type seqno_type, uint64_t seqno)
+{
+ int ret = 0;
+
+ if (seqno_type >= RAS_SEQNO_TYPE_COUNT_MAX)
+ return -EINVAL;
+
+ if (seqno_type == RAS_SEQNO_TYPE_DE)
+ ret = ras_core_seqno_fifo_write(ras_core,
+ SEQNO_FIFO_POISON_CREATION, seqno);
+ else if (seqno_type == RAS_SEQNO_TYPE_POISON_CONSUMPTION)
+ ret = ras_core_seqno_fifo_write(ras_core,
+ SEQNO_FIFO_POISON_CONSUMPTION, seqno);
+ else
+ ret = -EINVAL;
+
+ return ret;
+}
+
+uint64_t ras_core_get_seqno(struct ras_core_context *ras_core,
+ enum ras_seqno_type seqno_type, bool pop)
+{
+ uint64_t seq_no;
+ int ret = -ENODATA;
+
+ if (seqno_type >= RAS_SEQNO_TYPE_COUNT_MAX)
+ return 0;
+
+ if (seqno_type == RAS_SEQNO_TYPE_DE)
+ ret = ras_core_seqno_fifo_read(ras_core,
+ SEQNO_FIFO_POISON_CREATION, &seq_no, pop);
+ else if (seqno_type == RAS_SEQNO_TYPE_POISON_CONSUMPTION)
+ ret = ras_core_seqno_fifo_read(ras_core,
+ SEQNO_FIFO_POISON_CONSUMPTION, &seq_no, pop);
+
+ if (ret)
+ seq_no = ras_core_gen_seqno(ras_core, seqno_type);
+
+ return seq_no;
+}
+
+static int ras_core_eeprom_recovery(struct ras_core_context *ras_core)
+{
+ int count;
+ int ret;
+
+ count = ras_eeprom_get_record_count(ras_core);
+ if (!count)
+ return 0;
+
+ /* Avoid bad page to be loaded again after gpu reset */
+ if (ras_umc_get_saved_eeprom_count(ras_core) >= count)
+ return 0;
+
+ ret = ras_umc_load_bad_pages(ras_core);
+ if (ret) {
+ RAS_DEV_ERR(ras_core->dev, "ras_umc_load_bad_pages failed: %d\n", ret);
+ return ret;
+ }
+
+ ras_eeprom_sync_info(ras_core);
+
+ return ret;
+}
+
+struct ras_core_context *ras_core_create(struct ras_core_config *init_config)
+{
+ struct ras_core_context *ras_core;
+ struct ras_core_config *config;
+
+ ras_core = kzalloc(sizeof(*ras_core), GFP_KERNEL);
+ if (!ras_core)
+ return NULL;
+
+ config = kzalloc(sizeof(*config), GFP_KERNEL);
+ if (!config) {
+ kfree(ras_core);
+ return NULL;
+ }
+
+ memcpy(config, init_config, sizeof(*config));
+ ras_core->config = config;
+
+ return ras_core;
+}
+
+void ras_core_destroy(struct ras_core_context *ras_core)
+{
+ if (ras_core)
+ kfree(ras_core->config);
+
+ kfree(ras_core);
+}
+
+int ras_core_sw_init(struct ras_core_context *ras_core)
+{
+ int ret;
+
+ if (!ras_core->config) {
+ RAS_DEV_ERR(ras_core->dev, "No ras core config!\n");
+ return -EINVAL;
+ }
+
+ ras_core->sys_fn = ras_core->config->sys_fn;
+ if (!ras_core->sys_fn)
+ return -EINVAL;
+
+ ret = kfifo_alloc(&ras_core->de_seqno_fifo,
+ RAS_SEQNO_FIFO_SIZE, GFP_KERNEL);
+ if (ret)
+ return ret;
+
+ ret = kfifo_alloc(&ras_core->consumption_seqno_fifo,
+ RAS_SEQNO_FIFO_SIZE, GFP_KERNEL);
+ if (ret)
+ return ret;
+
+ spin_lock_init(&ras_core->seqno_lock);
+
+ ret = ras_aca_sw_init(ras_core);
+ if (ret)
+ return ret;
+
+ ret = ras_umc_sw_init(ras_core);
+ if (ret)
+ return ret;
+
+ ret = ras_cmd_init(ras_core);
+ if (ret)
+ return ret;
+
+ ret = ras_log_ring_sw_init(ras_core);
+ if (ret)
+ return ret;
+
+ ret = ras_psp_sw_init(ras_core);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+int ras_core_sw_fini(struct ras_core_context *ras_core)
+{
+ kfifo_free(&ras_core->de_seqno_fifo);
+ kfifo_free(&ras_core->consumption_seqno_fifo);
+
+ ras_psp_sw_fini(ras_core);
+ ras_log_ring_sw_fini(ras_core);
+ ras_cmd_fini(ras_core);
+ ras_umc_sw_fini(ras_core);
+ ras_aca_sw_fini(ras_core);
+
+ return 0;
+}
+
+int ras_core_hw_init(struct ras_core_context *ras_core)
+{
+ int ret;
+
+ ras_core->ras_eeprom_supported =
+ ras_core->config->ras_eeprom_supported;
+
+ ras_core->poison_supported = ras_core->config->poison_supported;
+
+ ret = ras_psp_hw_init(ras_core);
+ if (ret)
+ return ret;
+
+ ret = ras_aca_hw_init(ras_core);
+ if (ret)
+ goto init_err1;
+
+ ret = ras_mp1_hw_init(ras_core);
+ if (ret)
+ goto init_err2;
+
+ ret = ras_nbio_hw_init(ras_core);
+ if (ret)
+ goto init_err3;
+
+ ret = ras_umc_hw_init(ras_core);
+ if (ret)
+ goto init_err4;
+
+ ret = ras_gfx_hw_init(ras_core);
+ if (ret)
+ goto init_err5;
+
+ ret = ras_eeprom_hw_init(ras_core);
+ if (ret)
+ goto init_err6;
+
+ ret = ras_core_eeprom_recovery(ras_core);
+ if (ret) {
+ RAS_DEV_ERR(ras_core->dev,
+ "Failed to recovery ras core, ret:%d\n", ret);
+ goto init_err6;
+ }
+
+ ret = ras_eeprom_check_storage_status(ras_core);
+ if (ret)
+ goto init_err6;
+
+ ret = ras_process_init(ras_core);
+ if (ret)
+ goto init_err7;
+
+ ras_core->is_initialized = true;
+
+ return 0;
+
+init_err7:
+ ras_eeprom_hw_fini(ras_core);
+init_err6:
+ ras_gfx_hw_fini(ras_core);
+init_err5:
+ ras_umc_hw_fini(ras_core);
+init_err4:
+ ras_nbio_hw_fini(ras_core);
+init_err3:
+ ras_mp1_hw_fini(ras_core);
+init_err2:
+ ras_aca_hw_fini(ras_core);
+init_err1:
+ ras_psp_hw_fini(ras_core);
+ return ret;
+}
+
+int ras_core_hw_fini(struct ras_core_context *ras_core)
+{
+ ras_core->is_initialized = false;
+
+ ras_process_fini(ras_core);
+ ras_eeprom_hw_fini(ras_core);
+ ras_gfx_hw_fini(ras_core);
+ ras_nbio_hw_fini(ras_core);
+ ras_umc_hw_fini(ras_core);
+ ras_mp1_hw_fini(ras_core);
+ ras_aca_hw_fini(ras_core);
+ ras_psp_hw_fini(ras_core);
+
+ return 0;
+}
+
+bool ras_core_handle_nbio_irq(struct ras_core_context *ras_core, void *data)
+{
+ return ras_nbio_handle_irq_error(ras_core, data);
+}
+
+int ras_core_handle_fatal_error(struct ras_core_context *ras_core)
+{
+ int ret = 0;
+
+ ras_aca_mark_fatal_flag(ras_core);
+
+ ret = ras_core_event_notify(ras_core,
+ RAS_EVENT_ID__FATAL_ERROR_DETECTED, NULL);
+
+ return ret;
+}
+
+uint32_t ras_core_get_curr_nps_mode(struct ras_core_context *ras_core)
+{
+ if (ras_core->ras_nbio.ip_func &&
+ ras_core->ras_nbio.ip_func->get_memory_partition_mode)
+ return ras_core->ras_nbio.ip_func->get_memory_partition_mode(ras_core);
+
+ RAS_DEV_ERR(ras_core->dev, "Failed to get gpu memory nps mode!\n");
+ return 0;
+}
+
+int ras_core_update_ecc_info(struct ras_core_context *ras_core)
+{
+ int ret;
+
+ ret = ras_aca_update_ecc(ras_core, RAS_ERR_TYPE__CE, NULL);
+ if (!ret)
+ ret = ras_aca_update_ecc(ras_core, RAS_ERR_TYPE__UE, NULL);
+
+ return ret;
+}
+
+int ras_core_query_block_ecc_data(struct ras_core_context *ras_core,
+ enum ras_block_id block, struct ras_ecc_count *ecc_count)
+{
+ int ret;
+
+ if (!ecc_count || (block >= RAS_BLOCK_ID__LAST) || !ras_core)
+ return -EINVAL;
+
+ ret = ras_aca_get_block_ecc_count(ras_core, block, ecc_count);
+ if (!ret)
+ ras_aca_clear_block_new_ecc_count(ras_core, block);
+
+ return ret;
+}
+
+int ras_core_set_status(struct ras_core_context *ras_core, bool enable)
+{
+ ras_core->ras_core_enabled = enable;
+
+ return 0;
+}
+
+bool ras_core_is_enabled(struct ras_core_context *ras_core)
+{
+ return ras_core->ras_core_enabled;
+}
+
+uint64_t ras_core_get_utc_second_timestamp(struct ras_core_context *ras_core)
+{
+ if (ras_core && ras_core->sys_fn &&
+ ras_core->sys_fn->get_utc_second_timestamp)
+ return ras_core->sys_fn->get_utc_second_timestamp(ras_core);
+
+ RAS_DEV_ERR(ras_core->dev, "Failed to get system timestamp!\n");
+ return 0;
+}
+
+int ras_core_translate_soc_pa_and_bank(struct ras_core_context *ras_core,
+ uint64_t *soc_pa, struct umc_bank_addr *bank_addr, bool bank_to_pa)
+{
+ if (!ras_core || !soc_pa || !bank_addr)
+ return -EINVAL;
+
+ return ras_umc_translate_soc_pa_and_bank(ras_core, soc_pa, bank_addr, bank_to_pa);
+}
+
+bool ras_core_ras_interrupt_detected(struct ras_core_context *ras_core)
+{
+ if (ras_core && ras_core->sys_fn &&
+ ras_core->sys_fn->detect_ras_interrupt)
+ return ras_core->sys_fn->detect_ras_interrupt(ras_core);
+
+ RAS_DEV_ERR(ras_core->dev, "Failed to detect ras interrupt!\n");
+ return false;
+}
+
+int ras_core_get_gpu_mem(struct ras_core_context *ras_core,
+ enum gpu_mem_type mem_type, struct gpu_mem_block *gpu_mem)
+{
+ if (ras_core->sys_fn && ras_core->sys_fn->get_gpu_mem)
+ return ras_core->sys_fn->get_gpu_mem(ras_core, mem_type, gpu_mem);
+
+ RAS_DEV_ERR(ras_core->dev, "Not config get gpu memory API!\n");
+ return -EACCES;
+}
+
+int ras_core_put_gpu_mem(struct ras_core_context *ras_core,
+ enum gpu_mem_type mem_type, struct gpu_mem_block *gpu_mem)
+{
+ if (ras_core->sys_fn && ras_core->sys_fn->put_gpu_mem)
+ return ras_core->sys_fn->put_gpu_mem(ras_core, mem_type, gpu_mem);
+
+ RAS_DEV_ERR(ras_core->dev, "Not config put gpu memory API!!\n");
+ return -EACCES;
+}
+
+bool ras_core_is_ready(struct ras_core_context *ras_core)
+{
+ return ras_core ? ras_core->is_initialized : false;
+}
+
+bool ras_core_check_safety_watermark(struct ras_core_context *ras_core)
+{
+ return ras_eeprom_check_safety_watermark(ras_core);
+}
+
+int ras_core_down_trylock_gpu_reset_lock(struct ras_core_context *ras_core)
+{
+ if (ras_core->sys_fn && ras_core->sys_fn->gpu_reset_lock)
+ return ras_core->sys_fn->gpu_reset_lock(ras_core, true, true);
+
+ return 1;
+}
+
+void ras_core_down_gpu_reset_lock(struct ras_core_context *ras_core)
+{
+ if (ras_core->sys_fn && ras_core->sys_fn->gpu_reset_lock)
+ ras_core->sys_fn->gpu_reset_lock(ras_core, true, false);
+}
+
+void ras_core_up_gpu_reset_lock(struct ras_core_context *ras_core)
+{
+ if (ras_core->sys_fn && ras_core->sys_fn->gpu_reset_lock)
+ ras_core->sys_fn->gpu_reset_lock(ras_core, false, false);
+}
+
+int ras_core_event_notify(struct ras_core_context *ras_core,
+ enum ras_notify_event event_id, void *data)
+{
+ if (ras_core && ras_core->sys_fn &&
+ ras_core->sys_fn->ras_notifier)
+ return ras_core->sys_fn->ras_notifier(ras_core, event_id, data);
+
+ return -RAS_CORE_NOT_SUPPORTED;
+}
+
+int ras_core_get_device_system_info(struct ras_core_context *ras_core,
+ struct device_system_info *dev_info)
+{
+ if (ras_core && ras_core->sys_fn &&
+ ras_core->sys_fn->get_device_system_info)
+ return ras_core->sys_fn->get_device_system_info(ras_core, dev_info);
+
+ return -RAS_CORE_NOT_SUPPORTED;
+}
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_cper.c b/drivers/gpu/drm/amd/ras/rascore/ras_cper.c
new file mode 100644
index 000000000000..0fc7522b7ab6
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_cper.c
@@ -0,0 +1,315 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#include "ras.h"
+#include "ras_core_status.h"
+#include "ras_log_ring.h"
+#include "ras_cper.h"
+
+static const struct ras_cper_guid MCE = CPER_NOTIFY__MCE;
+static const struct ras_cper_guid CMC = CPER_NOTIFY__CMC;
+static const struct ras_cper_guid BOOT = BOOT__TYPE;
+
+static const struct ras_cper_guid CRASHDUMP = GPU__CRASHDUMP;
+static const struct ras_cper_guid RUNTIME = GPU__NONSTANDARD_ERROR;
+
+static void cper_get_timestamp(struct ras_core_context *ras_core,
+ struct ras_cper_timestamp *timestamp, uint64_t utc_second_timestamp)
+{
+ struct ras_time tm = {0};
+
+ ras_core_convert_timestamp_to_time(ras_core, utc_second_timestamp, &tm);
+ timestamp->seconds = tm.tm_sec;
+ timestamp->minutes = tm.tm_min;
+ timestamp->hours = tm.tm_hour;
+ timestamp->flag = 0;
+ timestamp->day = tm.tm_mday;
+ timestamp->month = tm.tm_mon;
+ timestamp->year = tm.tm_year % 100;
+ timestamp->century = tm.tm_year / 100;
+}
+
+static void fill_section_hdr(struct ras_core_context *ras_core,
+ struct cper_section_hdr *hdr, enum ras_cper_type type,
+ enum ras_cper_severity sev, struct ras_log_info *trace)
+{
+ struct device_system_info dev_info = {0};
+ char record_id[32];
+
+ hdr->signature[0] = 'C';
+ hdr->signature[1] = 'P';
+ hdr->signature[2] = 'E';
+ hdr->signature[3] = 'R';
+ hdr->revision = CPER_HDR__REV_1;
+ hdr->signature_end = 0xFFFFFFFF;
+ hdr->error_severity = (sev == RAS_CPER_SEV_RMA ? RAS_CPER_SEV_FATAL_UE : sev);
+
+ hdr->valid_bits.platform_id = 1;
+ hdr->valid_bits.timestamp = 1;
+
+ ras_core_get_device_system_info(ras_core, &dev_info);
+
+ cper_get_timestamp(ras_core, &hdr->timestamp, trace->timestamp);
+
+ snprintf(record_id, sizeof(record_id), "%d:%llX", dev_info.socket_id,
+ RAS_LOG_SEQNO_TO_BATCH_IDX(trace->seqno));
+ memcpy(hdr->record_id, record_id, 8);
+
+ snprintf(hdr->platform_id, 16, "0x%04X:0x%04X",
+ dev_info.vendor_id, dev_info.device_id);
+ /* pmfw version should be part of creator_id according to CPER spec */
+ snprintf(hdr->creator_id, 16, "%s", CPER_CREATOR_ID__AMDGPU);
+
+ switch (type) {
+ case RAS_CPER_TYPE_BOOT:
+ hdr->notify_type = BOOT;
+ break;
+ case RAS_CPER_TYPE_FATAL:
+ case RAS_CPER_TYPE_RMA:
+ hdr->notify_type = MCE;
+ break;
+ case RAS_CPER_TYPE_RUNTIME:
+ if (sev == RAS_CPER_SEV_NON_FATAL_CE)
+ hdr->notify_type = CMC;
+ else
+ hdr->notify_type = MCE;
+ break;
+ default:
+ RAS_DEV_ERR(ras_core->dev, "Unknown CPER Type\n");
+ break;
+ }
+}
+
+static int fill_section_descriptor(struct ras_core_context *ras_core,
+ struct cper_section_descriptor *descriptor,
+ enum ras_cper_severity sev,
+ struct ras_cper_guid sec_type,
+ uint32_t section_offset,
+ uint32_t section_length)
+{
+ struct device_system_info dev_info = {0};
+
+ descriptor->revision_minor = CPER_SEC__MINOR_REV_1;
+ descriptor->revision_major = CPER_SEC__MAJOR_REV_22;
+ descriptor->sec_offset = section_offset;
+ descriptor->sec_length = section_length;
+ descriptor->valid_bits.fru_text = 1;
+ descriptor->flag_bits.primary = 1;
+ descriptor->severity = (sev == RAS_CPER_SEV_RMA ? RAS_CPER_SEV_FATAL_UE : sev);
+ descriptor->sec_type = sec_type;
+
+ ras_core_get_device_system_info(ras_core, &dev_info);
+
+ snprintf(descriptor->fru_text, 20, "OAM%d", dev_info.socket_id);
+
+ if (sev == RAS_CPER_SEV_RMA)
+ descriptor->flag_bits.exceed_err_threshold = 1;
+
+ if (sev == RAS_CPER_SEV_NON_FATAL_UE)
+ descriptor->flag_bits.latent_err = 1;
+
+ return 0;
+}
+
+static int fill_section_fatal(struct ras_core_context *ras_core,
+ struct cper_section_fatal *fatal, struct ras_log_info *trace)
+{
+ fatal->data.reg_ctx_type = CPER_CTX_TYPE__CRASH;
+ fatal->data.reg_arr_size = sizeof(fatal->data.reg);
+
+ fatal->data.reg.status = trace->aca_reg.regs[RAS_CPER_ACA_REG_STATUS];
+ fatal->data.reg.addr = trace->aca_reg.regs[RAS_CPER_ACA_REG_ADDR];
+ fatal->data.reg.ipid = trace->aca_reg.regs[RAS_CPER_ACA_REG_IPID];
+ fatal->data.reg.synd = trace->aca_reg.regs[RAS_CPER_ACA_REG_SYND];
+
+ return 0;
+}
+
+static int fill_section_runtime(struct ras_core_context *ras_core,
+ struct cper_section_runtime *runtime, struct ras_log_info *trace,
+ enum ras_cper_severity sev)
+{
+ runtime->hdr.valid_bits.err_info_cnt = 1;
+ runtime->hdr.valid_bits.err_context_cnt = 1;
+
+ runtime->descriptor.error_type = RUNTIME;
+ runtime->descriptor.ms_chk_bits.err_type_valid = 1;
+ if (sev == RAS_CPER_SEV_RMA) {
+ runtime->descriptor.valid_bits.ms_chk = 1;
+ runtime->descriptor.ms_chk_bits.err_type = 1;
+ runtime->descriptor.ms_chk_bits.pcc = 1;
+ }
+
+ runtime->reg.reg_ctx_type = CPER_CTX_TYPE__CRASH;
+ runtime->reg.reg_arr_size = sizeof(runtime->reg.reg_dump);
+
+ runtime->reg.reg_dump[RAS_CPER_ACA_REG_CTL] = trace->aca_reg.regs[ACA_REG_IDX__CTL];
+ runtime->reg.reg_dump[RAS_CPER_ACA_REG_STATUS] = trace->aca_reg.regs[ACA_REG_IDX__STATUS];
+ runtime->reg.reg_dump[RAS_CPER_ACA_REG_ADDR] = trace->aca_reg.regs[ACA_REG_IDX__ADDR];
+ runtime->reg.reg_dump[RAS_CPER_ACA_REG_MISC0] = trace->aca_reg.regs[ACA_REG_IDX__MISC0];
+ runtime->reg.reg_dump[RAS_CPER_ACA_REG_CONFIG] = trace->aca_reg.regs[ACA_REG_IDX__CONFG];
+ runtime->reg.reg_dump[RAS_CPER_ACA_REG_IPID] = trace->aca_reg.regs[ACA_REG_IDX__IPID];
+ runtime->reg.reg_dump[RAS_CPER_ACA_REG_SYND] = trace->aca_reg.regs[ACA_REG_IDX__SYND];
+
+ return 0;
+}
+
+static int cper_generate_runtime_record(struct ras_core_context *ras_core,
+ struct cper_section_hdr *hdr, struct ras_log_info **trace_arr, uint32_t arr_num,
+ enum ras_cper_severity sev)
+{
+ struct cper_section_descriptor *descriptor;
+ struct cper_section_runtime *runtime;
+ int i;
+
+ fill_section_hdr(ras_core, hdr, RAS_CPER_TYPE_RUNTIME, sev, trace_arr[0]);
+ hdr->record_length = RAS_HDR_LEN + ((RAS_SEC_DESC_LEN + RAS_NONSTD_SEC_LEN) * arr_num);
+ hdr->sec_cnt = arr_num;
+ for (i = 0; i < arr_num; i++) {
+ descriptor = (struct cper_section_descriptor *)((uint8_t *)hdr +
+ RAS_SEC_DESC_OFFSET(i));
+ runtime = (struct cper_section_runtime *)((uint8_t *)hdr +
+ RAS_NONSTD_SEC_OFFSET(hdr->sec_cnt, i));
+
+ fill_section_descriptor(ras_core, descriptor, sev, RUNTIME,
+ RAS_NONSTD_SEC_OFFSET(hdr->sec_cnt, i),
+ sizeof(struct cper_section_runtime));
+ fill_section_runtime(ras_core, runtime, trace_arr[i], sev);
+ }
+
+ return 0;
+}
+
+static int cper_generate_fatal_record(struct ras_core_context *ras_core,
+ uint8_t *buffer, struct ras_log_info **trace_arr, uint32_t arr_num)
+{
+ struct ras_cper_fatal_record record = {0};
+ int i = 0;
+
+ for (i = 0; i < arr_num; i++) {
+ fill_section_hdr(ras_core, &record.hdr, RAS_CPER_TYPE_FATAL,
+ RAS_CPER_SEV_FATAL_UE, trace_arr[i]);
+ record.hdr.record_length = RAS_HDR_LEN + RAS_SEC_DESC_LEN + RAS_FATAL_SEC_LEN;
+ record.hdr.sec_cnt = 1;
+
+ fill_section_descriptor(ras_core, &record.descriptor, RAS_CPER_SEV_FATAL_UE,
+ CRASHDUMP, offsetof(struct ras_cper_fatal_record, fatal),
+ sizeof(struct cper_section_fatal));
+
+ fill_section_fatal(ras_core, &record.fatal, trace_arr[i]);
+
+ memcpy(buffer + (i * record.hdr.record_length),
+ &record, record.hdr.record_length);
+ }
+
+ return 0;
+}
+
+static int cper_get_record_size(enum ras_cper_type type, uint16_t section_count)
+{
+ int size = 0;
+
+ size += RAS_HDR_LEN;
+ size += (RAS_SEC_DESC_LEN * section_count);
+
+ switch (type) {
+ case RAS_CPER_TYPE_RUNTIME:
+ case RAS_CPER_TYPE_RMA:
+ size += (RAS_NONSTD_SEC_LEN * section_count);
+ break;
+ case RAS_CPER_TYPE_FATAL:
+ size += (RAS_FATAL_SEC_LEN * section_count);
+ size += (RAS_HDR_LEN * (section_count - 1));
+ break;
+ case RAS_CPER_TYPE_BOOT:
+ size += (RAS_BOOT_SEC_LEN * section_count);
+ break;
+ default:
+ /* should never reach here */
+ break;
+ }
+
+ return size;
+}
+
+static enum ras_cper_type cper_ras_log_event_to_cper_type(enum ras_log_event event)
+{
+ switch (event) {
+ case RAS_LOG_EVENT_UE:
+ return RAS_CPER_TYPE_FATAL;
+ case RAS_LOG_EVENT_DE:
+ case RAS_LOG_EVENT_CE:
+ case RAS_LOG_EVENT_POISON_CREATION:
+ case RAS_LOG_EVENT_POISON_CONSUMPTION:
+ return RAS_CPER_TYPE_RUNTIME;
+ case RAS_LOG_EVENT_RMA:
+ return RAS_CPER_TYPE_RMA;
+ default:
+ /* should never reach here */
+ return RAS_CPER_TYPE_RUNTIME;
+ }
+}
+
+int ras_cper_generate_cper(struct ras_core_context *ras_core,
+ struct ras_log_info **trace_list, uint32_t count,
+ uint8_t *buf, uint32_t buf_len, uint32_t *real_data_len)
+{
+ uint8_t *buffer = buf;
+ uint64_t buf_size = buf_len;
+ int record_size, saved_size = 0;
+ struct cper_section_hdr *hdr;
+
+ /* All the batch traces share the same event */
+ record_size = cper_get_record_size(
+ cper_ras_log_event_to_cper_type(trace_list[0]->event), count);
+
+ if ((record_size + saved_size) > buf_size)
+ return -ENOMEM;
+
+ hdr = (struct cper_section_hdr *)(buffer + saved_size);
+
+ switch (trace_list[0]->event) {
+ case RAS_LOG_EVENT_RMA:
+ cper_generate_runtime_record(ras_core, hdr, trace_list, count, RAS_CPER_SEV_RMA);
+ break;
+ case RAS_LOG_EVENT_DE:
+ cper_generate_runtime_record(ras_core,
+ hdr, trace_list, count, RAS_CPER_SEV_NON_FATAL_UE);
+ break;
+ case RAS_LOG_EVENT_CE:
+ cper_generate_runtime_record(ras_core,
+ hdr, trace_list, count, RAS_CPER_SEV_NON_FATAL_CE);
+ break;
+ case RAS_LOG_EVENT_UE:
+ cper_generate_fatal_record(ras_core, buffer + saved_size, trace_list, count);
+ break;
+ default:
+ RAS_DEV_WARN(ras_core->dev, "Unprocessed trace event: %d\n", trace_list[0]->event);
+ break;
+ }
+
+ saved_size += record_size;
+
+ *real_data_len = saved_size;
+ return 0;
+}
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_cper.h b/drivers/gpu/drm/amd/ras/rascore/ras_cper.h
new file mode 100644
index 000000000000..076c1883c1ce
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_cper.h
@@ -0,0 +1,304 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#ifndef __RAS_CPER_H__
+#define __RAS_CPER_H__
+
+#define CPER_UUID_MAX_SIZE 16
+struct ras_cper_guid {
+ uint8_t b[CPER_UUID_MAX_SIZE];
+};
+
+#define CPER_GUID__INIT(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) \
+ ((struct ras_cper_guid) \
+ {{ (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff, \
+ (b) & 0xff, ((b) >> 8) & 0xff, \
+ (c) & 0xff, ((c) >> 8) & 0xff, \
+ (d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7) }})
+
+#define CPER_HDR__REV_1 (0x100)
+#define CPER_SEC__MINOR_REV_1 (0x01)
+#define CPER_SEC__MAJOR_REV_22 (0x22)
+#define CPER_OAM_MAX_COUNT (8)
+
+#define CPER_CTX_TYPE__CRASH (1)
+#define CPER_CTX_TYPE__BOOT (9)
+
+#define CPER_CREATOR_ID__AMDGPU "amdgpu"
+
+#define CPER_NOTIFY__MCE \
+ CPER_GUID__INIT(0xE8F56FFE, 0x919C, 0x4cc5, 0xBA, 0x88, 0x65, 0xAB, \
+ 0xE1, 0x49, 0x13, 0xBB)
+#define CPER_NOTIFY__CMC \
+ CPER_GUID__INIT(0x2DCE8BB1, 0xBDD7, 0x450e, 0xB9, 0xAD, 0x9C, 0xF4, \
+ 0xEB, 0xD4, 0xF8, 0x90)
+#define BOOT__TYPE \
+ CPER_GUID__INIT(0x3D61A466, 0xAB40, 0x409a, 0xA6, 0x98, 0xF3, 0x62, \
+ 0xD4, 0x64, 0xB3, 0x8F)
+
+#define GPU__CRASHDUMP \
+ CPER_GUID__INIT(0x32AC0C78, 0x2623, 0x48F6, 0xB0, 0xD0, 0x73, 0x65, \
+ 0x72, 0x5F, 0xD6, 0xAE)
+#define GPU__NONSTANDARD_ERROR \
+ CPER_GUID__INIT(0x32AC0C78, 0x2623, 0x48F6, 0x81, 0xA2, 0xAC, 0x69, \
+ 0x17, 0x80, 0x55, 0x1D)
+#define PROC_ERR__SECTION_TYPE \
+ CPER_GUID__INIT(0xDC3EA0B0, 0xA144, 0x4797, 0xB9, 0x5B, 0x53, 0xFA, \
+ 0x24, 0x2B, 0x6E, 0x1D)
+
+enum ras_cper_type {
+ RAS_CPER_TYPE_RUNTIME,
+ RAS_CPER_TYPE_FATAL,
+ RAS_CPER_TYPE_BOOT,
+ RAS_CPER_TYPE_RMA,
+};
+
+enum ras_cper_severity {
+ RAS_CPER_SEV_NON_FATAL_UE = 0,
+ RAS_CPER_SEV_FATAL_UE = 1,
+ RAS_CPER_SEV_NON_FATAL_CE = 2,
+ RAS_CPER_SEV_RMA = 3,
+
+ RAS_CPER_SEV_UNUSED = 10,
+};
+
+enum ras_cper_aca_reg {
+ RAS_CPER_ACA_REG_CTL = 0,
+ RAS_CPER_ACA_REG_STATUS = 1,
+ RAS_CPER_ACA_REG_ADDR = 2,
+ RAS_CPER_ACA_REG_MISC0 = 3,
+ RAS_CPER_ACA_REG_CONFIG = 4,
+ RAS_CPER_ACA_REG_IPID = 5,
+ RAS_CPER_ACA_REG_SYND = 6,
+ RAS_CPER_ACA_REG_DESTAT = 8,
+ RAS_CPER_ACA_REG_DEADDR = 9,
+ RAS_CPER_ACA_REG_MASK = 10,
+
+ RAS_CPER_ACA_REG_COUNT = 16,
+};
+
+#pragma pack(push, 1)
+
+struct ras_cper_timestamp {
+ uint8_t seconds;
+ uint8_t minutes;
+ uint8_t hours;
+ uint8_t flag;
+ uint8_t day;
+ uint8_t month;
+ uint8_t year;
+ uint8_t century;
+};
+
+struct cper_section_hdr {
+ char signature[4]; /* "CPER" */
+ uint16_t revision;
+ uint32_t signature_end; /* 0xFFFFFFFF */
+ uint16_t sec_cnt;
+ enum ras_cper_severity error_severity;
+ union {
+ struct {
+ uint32_t platform_id : 1;
+ uint32_t timestamp : 1;
+ uint32_t partition_id : 1;
+ uint32_t reserved : 29;
+ } valid_bits;
+ uint32_t valid_mask;
+ };
+ uint32_t record_length; /* Total size of CPER Entry */
+ struct ras_cper_timestamp timestamp;
+ char platform_id[16];
+ struct ras_cper_guid partition_id; /* Reserved */
+ char creator_id[16];
+ struct ras_cper_guid notify_type; /* CMC, MCE */
+ char record_id[8]; /* Unique CPER Entry ID */
+ uint32_t flags; /* Reserved */
+ uint64_t persistence_info; /* Reserved */
+ uint8_t reserved[12]; /* Reserved */
+};
+
+struct cper_section_descriptor {
+ uint32_t sec_offset; /* Offset from the start of CPER entry */
+ uint32_t sec_length;
+ uint8_t revision_minor; /* CPER_SEC_MINOR_REV_1 */
+ uint8_t revision_major; /* CPER_SEC_MAJOR_REV_22 */
+ union {
+ struct {
+ uint8_t fru_id : 1;
+ uint8_t fru_text : 1;
+ uint8_t reserved : 6;
+ } valid_bits;
+ uint8_t valid_mask;
+ };
+ uint8_t reserved;
+ union {
+ struct {
+ uint32_t primary : 1;
+ uint32_t reserved1 : 2;
+ uint32_t exceed_err_threshold : 1;
+ uint32_t latent_err : 1;
+ uint32_t reserved2 : 27;
+ } flag_bits;
+ uint32_t flag_mask;
+ };
+ struct ras_cper_guid sec_type;
+ char fru_id[16];
+ enum ras_cper_severity severity;
+ char fru_text[20];
+};
+
+struct runtime_hdr {
+ union {
+ struct {
+ uint64_t apic_id : 1;
+ uint64_t fw_id : 1;
+ uint64_t err_info_cnt : 6;
+ uint64_t err_context_cnt : 6;
+ } valid_bits;
+ uint64_t valid_mask;
+ };
+ uint64_t apic_id;
+ char fw_id[48];
+};
+
+struct runtime_descriptor {
+ struct ras_cper_guid error_type;
+ union {
+ struct {
+ uint64_t ms_chk : 1;
+ uint64_t target_addr_id : 1;
+ uint64_t req_id : 1;
+ uint64_t resp_id : 1;
+ uint64_t instr_ptr : 1;
+ uint64_t reserved : 59;
+ } valid_bits;
+ uint64_t valid_mask;
+ };
+ union {
+ struct {
+ uint64_t err_type_valid : 1;
+ uint64_t pcc_valid : 1;
+ uint64_t uncorr_valid : 1;
+ uint64_t precise_ip_valid : 1;
+ uint64_t restartable_ip_valid : 1;
+ uint64_t overflow_valid : 1;
+ uint64_t reserved1 : 10;
+ uint64_t err_type : 2;
+ uint64_t pcc : 1;
+ uint64_t uncorr : 1;
+ uint64_t precised_ip : 1;
+ uint64_t restartable_ip : 1;
+ uint64_t overflow : 1;
+ uint64_t reserved2 : 41;
+ } ms_chk_bits;
+ uint64_t ms_chk_mask;
+ };
+ uint64_t target_addr_id;
+ uint64_t req_id;
+ uint64_t resp_id;
+ uint64_t instr_ptr;
+};
+
+struct runtime_error_reg {
+ uint16_t reg_ctx_type;
+ uint16_t reg_arr_size;
+ uint32_t msr_addr;
+ uint64_t mm_reg_addr;
+ uint64_t reg_dump[RAS_CPER_ACA_REG_COUNT];
+};
+
+struct cper_section_runtime {
+ struct runtime_hdr hdr;
+ struct runtime_descriptor descriptor;
+ struct runtime_error_reg reg;
+};
+
+struct crashdump_hdr {
+ uint64_t reserved1;
+ uint64_t reserved2;
+ char fw_id[48];
+ uint64_t reserved3[8];
+};
+
+struct fatal_reg_info {
+ uint64_t status;
+ uint64_t addr;
+ uint64_t ipid;
+ uint64_t synd;
+};
+
+struct crashdump_fatal {
+ uint16_t reg_ctx_type;
+ uint16_t reg_arr_size;
+ uint32_t reserved1;
+ uint64_t reserved2;
+ struct fatal_reg_info reg;
+};
+
+struct crashdump_boot {
+ uint16_t reg_ctx_type;
+ uint16_t reg_arr_size;
+ uint32_t reserved1;
+ uint64_t reserved2;
+ uint64_t msg[CPER_OAM_MAX_COUNT];
+};
+
+struct cper_section_fatal {
+ struct crashdump_hdr hdr;
+ struct crashdump_fatal data;
+};
+
+struct cper_section_boot {
+ struct crashdump_hdr hdr;
+ struct crashdump_boot data;
+};
+
+struct ras_cper_fatal_record {
+ struct cper_section_hdr hdr;
+ struct cper_section_descriptor descriptor;
+ struct cper_section_fatal fatal;
+};
+#pragma pack(pop)
+
+#define RAS_HDR_LEN (sizeof(struct cper_section_hdr))
+#define RAS_SEC_DESC_LEN (sizeof(struct cper_sec_desc))
+
+#define RAS_BOOT_SEC_LEN (sizeof(struct cper_sec_crashdump_boot))
+#define RAS_FATAL_SEC_LEN (sizeof(struct cper_sec_crashdump_fatal))
+#define RAS_NONSTD_SEC_LEN (sizeof(struct cper_sec_nonstd_err))
+
+#define RAS_SEC_DESC_OFFSET(idx) (RAS_HDR_LEN + (RAS_SEC_DESC_LEN * idx))
+
+#define RAS_BOOT_SEC_OFFSET(count, idx) \
+ (RAS_HDR_LEN + (RAS_SEC_DESC_LEN * count) + (RAS_BOOT_SEC_LEN * idx))
+#define RAS_FATAL_SEC_OFFSET(count, idx) \
+ (RAS_HDR_LEN + (RAS_SEC_DESC_LEN * count) + (RAS_FATAL_SEC_LEN * idx))
+#define RAS_NONSTD_SEC_OFFSET(count, idx) \
+ (RAS_HDR_LEN + (RAS_SEC_DESC_LEN * count) + (RAS_NONSTD_SEC_LEN * idx))
+
+struct ras_core_context;
+struct ras_log_info;
+int ras_cper_generate_cper(struct ras_core_context *ras_core,
+ struct ras_log_info **trace_list, uint32_t count,
+ uint8_t *buf, uint32_t buf_len, uint32_t *real_data_len);
+#endif
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_eeprom.c b/drivers/gpu/drm/amd/ras/rascore/ras_eeprom.c
new file mode 100644
index 000000000000..cd6b057bdaf3
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_eeprom.c
@@ -0,0 +1,1339 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "ras_eeprom.h"
+#include "ras.h"
+
+/* These are memory addresses as would be seen by one or more EEPROM
+ * chips strung on the I2C bus, usually by manipulating pins 1-3 of a
+ * set of EEPROM devices. They form a continuous memory space.
+ *
+ * The I2C device address includes the device type identifier, 1010b,
+ * which is a reserved value and indicates that this is an I2C EEPROM
+ * device. It also includes the top 3 bits of the 19 bit EEPROM memory
+ * address, namely bits 18, 17, and 16. This makes up the 7 bit
+ * address sent on the I2C bus with bit 0 being the direction bit,
+ * which is not represented here, and sent by the hardware directly.
+ *
+ * For instance,
+ * 50h = 1010000b => device type identifier 1010b, bits 18:16 = 000b, address 0.
+ * 54h = 1010100b => --"--, bits 18:16 = 100b, address 40000h.
+ * 56h = 1010110b => --"--, bits 18:16 = 110b, address 60000h.
+ * Depending on the size of the I2C EEPROM device(s), bits 18:16 may
+ * address memory in a device or a device on the I2C bus, depending on
+ * the status of pins 1-3.
+ *
+ * The RAS table lives either at address 0 or address 40000h of EEPROM.
+ */
+#define EEPROM_I2C_MADDR_0 0x0
+#define EEPROM_I2C_MADDR_4 0x40000
+
+#define EEPROM_PAGE_BITS 8
+#define EEPROM_PAGE_SIZE (1U << EEPROM_PAGE_BITS)
+#define EEPROM_PAGE_MASK (EEPROM_PAGE_SIZE - 1)
+
+#define EEPROM_OFFSET_SIZE 2
+#define MAKE_I2C_ADDR(_aa) ((0xA << 3) | (((_aa) >> 16) & 0xF))
+
+/*
+ * The 2 macros bellow represent the actual size in bytes that
+ * those entities occupy in the EEPROM memory.
+ * RAS_TABLE_RECORD_SIZE is different than sizeof(eeprom_umc_record) which
+ * uses uint64 to store 6b fields such as retired_page.
+ */
+#define RAS_TABLE_HEADER_SIZE 20
+#define RAS_TABLE_RECORD_SIZE 24
+
+/* Table hdr is 'AMDR' */
+#define RAS_TABLE_HDR_VAL 0x414d4452
+
+/* Bad GPU tag ‘BADG’ */
+#define RAS_TABLE_HDR_BAD 0x42414447
+
+/*
+ * EEPROM Table structure v1
+ * ---------------------------------
+ * | |
+ * | EEPROM TABLE HEADER |
+ * | ( size 20 Bytes ) |
+ * | |
+ * ---------------------------------
+ * | |
+ * | BAD PAGE RECORD AREA |
+ * | |
+ * ---------------------------------
+ */
+
+/* Assume 2-Mbit size EEPROM and take up the whole space. */
+#define RAS_TBL_SIZE_BYTES (256 * 1024)
+#define RAS_TABLE_START 0
+#define RAS_HDR_START RAS_TABLE_START
+#define RAS_RECORD_START (RAS_HDR_START + RAS_TABLE_HEADER_SIZE)
+#define RAS_MAX_RECORD_COUNT ((RAS_TBL_SIZE_BYTES - RAS_TABLE_HEADER_SIZE) \
+ / RAS_TABLE_RECORD_SIZE)
+
+/*
+ * EEPROM Table structrue v2.1
+ * ---------------------------------
+ * | |
+ * | EEPROM TABLE HEADER |
+ * | ( size 20 Bytes ) |
+ * | |
+ * ---------------------------------
+ * | |
+ * | EEPROM TABLE RAS INFO |
+ * | (available info size 4 Bytes) |
+ * | ( reserved size 252 Bytes ) |
+ * | |
+ * ---------------------------------
+ * | |
+ * | BAD PAGE RECORD AREA |
+ * | |
+ * ---------------------------------
+ */
+
+/* EEPROM Table V2_1 */
+#define RAS_TABLE_V2_1_INFO_SIZE 256
+#define RAS_TABLE_V2_1_INFO_START RAS_TABLE_HEADER_SIZE
+#define RAS_RECORD_START_V2_1 (RAS_HDR_START + RAS_TABLE_HEADER_SIZE + \
+ RAS_TABLE_V2_1_INFO_SIZE)
+#define RAS_MAX_RECORD_COUNT_V2_1 ((RAS_TBL_SIZE_BYTES - RAS_TABLE_HEADER_SIZE - \
+ RAS_TABLE_V2_1_INFO_SIZE) \
+ / RAS_TABLE_RECORD_SIZE)
+
+/* Given a zero-based index of an EEPROM RAS record, yields the EEPROM
+ * offset off of RAS_TABLE_START. That is, this is something you can
+ * add to control->i2c_address, and then tell I2C layer to read
+ * from/write to there. _N is the so called absolute index,
+ * because it starts right after the table header.
+ */
+#define RAS_INDEX_TO_OFFSET(_C, _N) ((_C)->ras_record_offset + \
+ (_N) * RAS_TABLE_RECORD_SIZE)
+
+#define RAS_OFFSET_TO_INDEX(_C, _O) (((_O) - \
+ (_C)->ras_record_offset) / RAS_TABLE_RECORD_SIZE)
+
+/* Given a 0-based relative record index, 0, 1, 2, ..., etc., off
+ * of "fri", return the absolute record index off of the end of
+ * the table header.
+ */
+#define RAS_RI_TO_AI(_C, _I) (((_I) + (_C)->ras_fri) % \
+ (_C)->ras_max_record_count)
+
+#define RAS_NUM_RECS(_tbl_hdr) (((_tbl_hdr)->tbl_size - \
+ RAS_TABLE_HEADER_SIZE) / RAS_TABLE_RECORD_SIZE)
+
+#define RAS_NUM_RECS_V2_1(_tbl_hdr) (((_tbl_hdr)->tbl_size - \
+ RAS_TABLE_HEADER_SIZE - \
+ RAS_TABLE_V2_1_INFO_SIZE) / RAS_TABLE_RECORD_SIZE)
+
+#define to_ras_core_context(x) (container_of(x, struct ras_core_context, ras_eeprom))
+
+static bool __is_ras_eeprom_supported(struct ras_core_context *ras_core)
+{
+ return ras_core->ras_eeprom_supported;
+}
+
+static bool __get_eeprom_i2c_addr(struct ras_core_context *ras_core,
+ struct ras_eeprom_control *control)
+{
+ int ret = -EINVAL;
+
+ if (control->sys_func &&
+ control->sys_func->update_eeprom_i2c_config)
+ ret = control->sys_func->update_eeprom_i2c_config(ras_core);
+ else
+ RAS_DEV_WARN(ras_core->dev,
+ "No eeprom i2c system config!\n");
+
+ return !ret ? true : false;
+}
+
+static int __ras_eeprom_xfer(struct ras_core_context *ras_core, u32 eeprom_addr,
+ u8 *eeprom_buf, u32 buf_size, bool read)
+{
+ struct ras_eeprom_control *control = &ras_core->ras_eeprom;
+ int ret;
+
+ if (control->sys_func && control->sys_func->eeprom_i2c_xfer) {
+ ret = control->sys_func->eeprom_i2c_xfer(ras_core,
+ eeprom_addr, eeprom_buf, buf_size, read);
+
+ if ((ret > 0) && !read) {
+ /* According to EEPROM specs the length of the
+ * self-writing cycle, tWR (tW), is 10 ms.
+ *
+ * TODO: Use polling on ACK, aka Acknowledge
+ * Polling, to minimize waiting for the
+ * internal write cycle to complete, as it is
+ * usually smaller than tWR (tW).
+ */
+ msleep(10);
+ }
+
+ return ret;
+ }
+
+ RAS_DEV_ERR(ras_core->dev, "Error: No eeprom i2c system xfer function!\n");
+ return -EINVAL;
+}
+
+static int __eeprom_xfer(struct ras_core_context *ras_core, u32 eeprom_addr,
+ u8 *eeprom_buf, u32 buf_size, bool read)
+{
+ u16 limit;
+ u16 ps; /* Partial size */
+ int res = 0, r;
+
+ if (read)
+ limit = ras_core->ras_eeprom.max_read_len;
+ else
+ limit = ras_core->ras_eeprom.max_write_len;
+
+ if (limit && (limit <= EEPROM_OFFSET_SIZE)) {
+ RAS_DEV_ERR(ras_core->dev,
+ "maddr:0x%04X size:0x%02X:quirk max_%s_len must be > %d",
+ eeprom_addr, buf_size,
+ read ? "read" : "write", EEPROM_OFFSET_SIZE);
+ return -EINVAL;
+ }
+
+ ras_core_down_gpu_reset_lock(ras_core);
+
+ if (limit == 0) {
+ res = __ras_eeprom_xfer(ras_core, eeprom_addr,
+ eeprom_buf, buf_size, read);
+ } else {
+ /* The "limit" includes all data bytes sent/received,
+ * which would include the EEPROM_OFFSET_SIZE bytes.
+ * Account for them here.
+ */
+ limit -= EEPROM_OFFSET_SIZE;
+ for ( ; buf_size > 0;
+ buf_size -= ps, eeprom_addr += ps, eeprom_buf += ps) {
+ ps = (buf_size < limit) ? buf_size : limit;
+
+ r = __ras_eeprom_xfer(ras_core, eeprom_addr,
+ eeprom_buf, ps, read);
+ if (r < 0)
+ break;
+
+ res += r;
+ }
+ }
+
+ ras_core_up_gpu_reset_lock(ras_core);
+
+ return res;
+}
+
+static int __eeprom_read(struct ras_core_context *ras_core,
+ u32 eeprom_addr, u8 *eeprom_buf, u32 bytes)
+{
+ return __eeprom_xfer(ras_core, eeprom_addr,
+ eeprom_buf, bytes, true);
+}
+
+static int __eeprom_write(struct ras_core_context *ras_core,
+ u32 eeprom_addr, u8 *eeprom_buf, u32 bytes)
+{
+ return __eeprom_xfer(ras_core, eeprom_addr,
+ eeprom_buf, bytes, false);
+}
+
+static void
+__encode_table_header_to_buf(struct ras_eeprom_table_header *hdr,
+ unsigned char *buf)
+{
+ u32 *pp = (uint32_t *)buf;
+
+ pp[0] = cpu_to_le32(hdr->header);
+ pp[1] = cpu_to_le32(hdr->version);
+ pp[2] = cpu_to_le32(hdr->first_rec_offset);
+ pp[3] = cpu_to_le32(hdr->tbl_size);
+ pp[4] = cpu_to_le32(hdr->checksum);
+}
+
+static void
+__decode_table_header_from_buf(struct ras_eeprom_table_header *hdr,
+ unsigned char *buf)
+{
+ u32 *pp = (uint32_t *)buf;
+
+ hdr->header = le32_to_cpu(pp[0]);
+ hdr->version = le32_to_cpu(pp[1]);
+ hdr->first_rec_offset = le32_to_cpu(pp[2]);
+ hdr->tbl_size = le32_to_cpu(pp[3]);
+ hdr->checksum = le32_to_cpu(pp[4]);
+}
+
+static int __write_table_header(struct ras_eeprom_control *control)
+{
+ u8 buf[RAS_TABLE_HEADER_SIZE];
+ struct ras_core_context *ras_core = to_ras_core_context(control);
+ int res;
+
+ memset(buf, 0, sizeof(buf));
+ __encode_table_header_to_buf(&control->tbl_hdr, buf);
+
+ /* i2c may be unstable in gpu reset */
+ res = __eeprom_write(ras_core,
+ control->i2c_address +
+ control->ras_header_offset,
+ buf, RAS_TABLE_HEADER_SIZE);
+
+ if (res < 0) {
+ RAS_DEV_ERR(ras_core->dev,
+ "Failed to write EEPROM table header:%d\n", res);
+ } else if (res < RAS_TABLE_HEADER_SIZE) {
+ RAS_DEV_ERR(ras_core->dev,
+ "Short write:%d out of %d\n", res, RAS_TABLE_HEADER_SIZE);
+ res = -EIO;
+ } else {
+ res = 0;
+ }
+
+ return res;
+}
+
+static void
+__encode_table_ras_info_to_buf(struct ras_eeprom_table_ras_info *rai,
+ unsigned char *buf)
+{
+ u32 *pp = (uint32_t *)buf;
+ u32 tmp;
+
+ tmp = ((uint32_t)(rai->rma_status) & 0xFF) |
+ (((uint32_t)(rai->health_percent) << 8) & 0xFF00) |
+ (((uint32_t)(rai->ecc_page_threshold) << 16) & 0xFFFF0000);
+ pp[0] = cpu_to_le32(tmp);
+}
+
+static void
+__decode_table_ras_info_from_buf(struct ras_eeprom_table_ras_info *rai,
+ unsigned char *buf)
+{
+ u32 *pp = (uint32_t *)buf;
+ u32 tmp;
+
+ tmp = le32_to_cpu(pp[0]);
+ rai->rma_status = tmp & 0xFF;
+ rai->health_percent = (tmp >> 8) & 0xFF;
+ rai->ecc_page_threshold = (tmp >> 16) & 0xFFFF;
+}
+
+static int __write_table_ras_info(struct ras_eeprom_control *control)
+{
+ struct ras_core_context *ras_core = to_ras_core_context(control);
+ u8 *buf;
+ int res;
+
+ buf = kzalloc(RAS_TABLE_V2_1_INFO_SIZE, GFP_KERNEL);
+ if (!buf) {
+ RAS_DEV_ERR(ras_core->dev,
+ "Failed to alloc buf to write table ras info\n");
+ return -ENOMEM;
+ }
+
+ __encode_table_ras_info_to_buf(&control->tbl_rai, buf);
+
+ /* i2c may be unstable in gpu reset */
+ res = __eeprom_write(ras_core,
+ control->i2c_address +
+ control->ras_info_offset,
+ buf, RAS_TABLE_V2_1_INFO_SIZE);
+
+ if (res < 0) {
+ RAS_DEV_ERR(ras_core->dev,
+ "Failed to write EEPROM table ras info:%d\n", res);
+ } else if (res < RAS_TABLE_V2_1_INFO_SIZE) {
+ RAS_DEV_ERR(ras_core->dev,
+ "Short write:%d out of %d\n", res, RAS_TABLE_V2_1_INFO_SIZE);
+ res = -EIO;
+ } else {
+ res = 0;
+ }
+
+ kfree(buf);
+
+ return res;
+}
+
+static u8 __calc_hdr_byte_sum(const struct ras_eeprom_control *control)
+{
+ int ii;
+ u8 *pp, csum;
+ u32 sz;
+
+ /* Header checksum, skip checksum field in the calculation */
+ sz = sizeof(control->tbl_hdr) - sizeof(control->tbl_hdr.checksum);
+ pp = (u8 *) &control->tbl_hdr;
+ csum = 0;
+ for (ii = 0; ii < sz; ii++, pp++)
+ csum += *pp;
+
+ return csum;
+}
+
+static u8 __calc_ras_info_byte_sum(const struct ras_eeprom_control *control)
+{
+ int ii;
+ u8 *pp, csum;
+ u32 sz;
+
+ sz = sizeof(control->tbl_rai);
+ pp = (u8 *) &control->tbl_rai;
+ csum = 0;
+ for (ii = 0; ii < sz; ii++, pp++)
+ csum += *pp;
+
+ return csum;
+}
+
+static int ras_eeprom_correct_header_tag(
+ struct ras_eeprom_control *control,
+ uint32_t header)
+{
+ struct ras_eeprom_table_header *hdr = &control->tbl_hdr;
+ u8 *hh;
+ int res;
+ u8 csum;
+
+ csum = -hdr->checksum;
+
+ hh = (void *) &hdr->header;
+ csum -= (hh[0] + hh[1] + hh[2] + hh[3]);
+ hh = (void *) &header;
+ csum += hh[0] + hh[1] + hh[2] + hh[3];
+ csum = -csum;
+ mutex_lock(&control->ras_tbl_mutex);
+ hdr->header = header;
+ hdr->checksum = csum;
+ res = __write_table_header(control);
+ mutex_unlock(&control->ras_tbl_mutex);
+
+ return res;
+}
+
+static void ras_set_eeprom_table_version(struct ras_eeprom_control *control)
+{
+ struct ras_eeprom_table_header *hdr = &control->tbl_hdr;
+
+ hdr->version = RAS_TABLE_VER_V3;
+}
+
+int ras_eeprom_reset_table(struct ras_core_context *ras_core)
+{
+ struct ras_eeprom_control *control = &ras_core->ras_eeprom;
+ struct ras_eeprom_table_header *hdr = &control->tbl_hdr;
+ struct ras_eeprom_table_ras_info *rai = &control->tbl_rai;
+ u8 csum;
+ int res;
+
+ mutex_lock(&control->ras_tbl_mutex);
+
+ hdr->header = RAS_TABLE_HDR_VAL;
+ ras_set_eeprom_table_version(control);
+
+ if (hdr->version >= RAS_TABLE_VER_V2_1) {
+ hdr->first_rec_offset = RAS_RECORD_START_V2_1;
+ hdr->tbl_size = RAS_TABLE_HEADER_SIZE +
+ RAS_TABLE_V2_1_INFO_SIZE;
+ rai->rma_status = RAS_GPU_HEALTH_USABLE;
+ /**
+ * GPU health represented as a percentage.
+ * 0 means worst health, 100 means fully health.
+ */
+ rai->health_percent = 100;
+ /* ecc_page_threshold = 0 means disable bad page retirement */
+ rai->ecc_page_threshold = control->record_threshold_count;
+ } else {
+ hdr->first_rec_offset = RAS_RECORD_START;
+ hdr->tbl_size = RAS_TABLE_HEADER_SIZE;
+ }
+
+ csum = __calc_hdr_byte_sum(control);
+ if (hdr->version >= RAS_TABLE_VER_V2_1)
+ csum += __calc_ras_info_byte_sum(control);
+ csum = -csum;
+ hdr->checksum = csum;
+ res = __write_table_header(control);
+ if (!res && hdr->version > RAS_TABLE_VER_V1)
+ res = __write_table_ras_info(control);
+
+ control->ras_num_recs = 0;
+ control->ras_fri = 0;
+
+ control->bad_channel_bitmap = 0;
+ ras_core_event_notify(ras_core, RAS_EVENT_ID__UPDATE_BAD_PAGE_NUM,
+ &control->ras_num_recs);
+ ras_core_event_notify(ras_core, RAS_EVENT_ID__UPDATE_BAD_CHANNEL_BITMAP,
+ &control->bad_channel_bitmap);
+ control->update_channel_flag = false;
+
+ mutex_unlock(&control->ras_tbl_mutex);
+
+ return res;
+}
+
+static void
+__encode_table_record_to_buf(struct ras_eeprom_control *control,
+ struct eeprom_umc_record *record,
+ unsigned char *buf)
+{
+ __le64 tmp = 0;
+ int i = 0;
+
+ /* Next are all record fields according to EEPROM page spec in LE foramt */
+ buf[i++] = record->err_type;
+
+ buf[i++] = record->bank;
+
+ tmp = cpu_to_le64(record->ts);
+ memcpy(buf + i, &tmp, 8);
+ i += 8;
+
+ tmp = cpu_to_le64((record->offset & 0xffffffffffff));
+ memcpy(buf + i, &tmp, 6);
+ i += 6;
+
+ buf[i++] = record->mem_channel;
+ buf[i++] = record->mcumc_id;
+
+ tmp = cpu_to_le64((record->retired_row_pfn & 0xffffffffffff));
+ memcpy(buf + i, &tmp, 6);
+}
+
+static void
+__decode_table_record_from_buf(struct ras_eeprom_control *control,
+ struct eeprom_umc_record *record,
+ unsigned char *buf)
+{
+ __le64 tmp = 0;
+ int i = 0;
+
+ /* Next are all record fields according to EEPROM page spec in LE foramt */
+ record->err_type = buf[i++];
+
+ record->bank = buf[i++];
+
+ memcpy(&tmp, buf + i, 8);
+ record->ts = le64_to_cpu(tmp);
+ i += 8;
+
+ memcpy(&tmp, buf + i, 6);
+ record->offset = (le64_to_cpu(tmp) & 0xffffffffffff);
+ i += 6;
+
+ record->mem_channel = buf[i++];
+ record->mcumc_id = buf[i++];
+
+ memcpy(&tmp, buf + i, 6);
+ record->retired_row_pfn = (le64_to_cpu(tmp) & 0xffffffffffff);
+}
+
+bool ras_eeprom_check_safety_watermark(struct ras_core_context *ras_core)
+{
+ struct ras_eeprom_control *control = &ras_core->ras_eeprom;
+ bool ret = false;
+ int bad_page_count;
+
+ if (!__is_ras_eeprom_supported(ras_core) ||
+ !control->record_threshold_config)
+ return false;
+
+ bad_page_count = ras_umc_get_badpage_count(ras_core);
+ if (control->tbl_hdr.header == RAS_TABLE_HDR_BAD) {
+ if (bad_page_count > control->record_threshold_count)
+ RAS_DEV_WARN(ras_core->dev, "RAS records:%d exceed threshold:%d",
+ bad_page_count, control->record_threshold_count);
+
+ if ((control->record_threshold_config == WARN_NONSTOP_OVER_THRESHOLD) ||
+ (control->record_threshold_config == NONSTOP_OVER_THRESHOLD)) {
+ RAS_DEV_WARN(ras_core->dev,
+ "Please consult AMD Service Action Guide (SAG) for appropriate service procedures.\n");
+ ret = false;
+ } else {
+ ras_core->is_rma = true;
+ RAS_DEV_WARN(ras_core->dev,
+ "Please consider adjusting the customized threshold.\n");
+ ret = true;
+ }
+ }
+
+ return ret;
+}
+
+/**
+ * __ras_eeprom_write -- write indexed from buffer to EEPROM
+ * @control: pointer to control structure
+ * @buf: pointer to buffer containing data to write
+ * @fri: start writing at this index
+ * @num: number of records to write
+ *
+ * The caller must hold the table mutex in @control.
+ * Return 0 on success, -errno otherwise.
+ */
+static int __ras_eeprom_write(struct ras_eeprom_control *control,
+ u8 *buf, const u32 fri, const u32 num)
+{
+ struct ras_core_context *ras_core = to_ras_core_context(control);
+ u32 buf_size;
+ int res;
+
+ /* i2c may be unstable in gpu reset */
+ buf_size = num * RAS_TABLE_RECORD_SIZE;
+ res = __eeprom_write(ras_core,
+ control->i2c_address + RAS_INDEX_TO_OFFSET(control, fri),
+ buf, buf_size);
+ if (res < 0) {
+ RAS_DEV_ERR(ras_core->dev,
+ "Writing %d EEPROM table records error:%d\n", num, res);
+ } else if (res < buf_size) {
+ /* Short write, return error.*/
+ RAS_DEV_ERR(ras_core->dev,
+ "Wrote %d records out of %d\n",
+ (res/RAS_TABLE_RECORD_SIZE), num);
+ res = -EIO;
+ } else {
+ res = 0;
+ }
+
+ return res;
+}
+
+static int ras_eeprom_append_table(struct ras_eeprom_control *control,
+ struct eeprom_umc_record *record,
+ const u32 num)
+{
+ u32 a, b, i;
+ u8 *buf, *pp;
+ int res;
+
+ buf = kcalloc(num, RAS_TABLE_RECORD_SIZE, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ /* Encode all of them in one go.
+ */
+ pp = buf;
+ for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) {
+ __encode_table_record_to_buf(control, &record[i], pp);
+
+ /* update bad channel bitmap */
+ if ((record[i].mem_channel < BITS_PER_TYPE(control->bad_channel_bitmap)) &&
+ !(control->bad_channel_bitmap & (1 << record[i].mem_channel))) {
+ control->bad_channel_bitmap |= 1 << record[i].mem_channel;
+ control->update_channel_flag = true;
+ }
+ }
+
+ /* a, first record index to write into.
+ * b, last record index to write into.
+ * a = first index to read (fri) + number of records in the table,
+ * b = a + @num - 1.
+ * Let N = control->ras_max_num_record_count, then we have,
+ * case 0: 0 <= a <= b < N,
+ * just append @num records starting at a;
+ * case 1: 0 <= a < N <= b,
+ * append (N - a) records starting at a, and
+ * append the remainder, b % N + 1, starting at 0.
+ * case 2: 0 <= fri < N <= a <= b, then modulo N we get two subcases,
+ * case 2a: 0 <= a <= b < N
+ * append num records starting at a; and fix fri if b overwrote it,
+ * and since a <= b, if b overwrote it then a must've also,
+ * and if b didn't overwrite it, then a didn't also.
+ * case 2b: 0 <= b < a < N
+ * write num records starting at a, which wraps around 0=N
+ * and overwrite fri unconditionally. Now from case 2a,
+ * this means that b eclipsed fri to overwrite it and wrap
+ * around 0 again, i.e. b = 2N+r pre modulo N, so we unconditionally
+ * set fri = b + 1 (mod N).
+ * Now, since fri is updated in every case, except the trivial case 0,
+ * the number of records present in the table after writing, is,
+ * num_recs - 1 = b - fri (mod N), and we take the positive value,
+ * by adding an arbitrary multiple of N before taking the modulo N
+ * as shown below.
+ */
+ a = control->ras_fri + control->ras_num_recs;
+ b = a + num - 1;
+ if (b < control->ras_max_record_count) {
+ res = __ras_eeprom_write(control, buf, a, num);
+ } else if (a < control->ras_max_record_count) {
+ u32 g0, g1;
+
+ g0 = control->ras_max_record_count - a;
+ g1 = b % control->ras_max_record_count + 1;
+ res = __ras_eeprom_write(control, buf, a, g0);
+ if (res)
+ goto Out;
+ res = __ras_eeprom_write(control,
+ buf + g0 * RAS_TABLE_RECORD_SIZE,
+ 0, g1);
+ if (res)
+ goto Out;
+ if (g1 > control->ras_fri)
+ control->ras_fri = g1 % control->ras_max_record_count;
+ } else {
+ a %= control->ras_max_record_count;
+ b %= control->ras_max_record_count;
+
+ if (a <= b) {
+ /* Note that, b - a + 1 = num. */
+ res = __ras_eeprom_write(control, buf, a, num);
+ if (res)
+ goto Out;
+ if (b >= control->ras_fri)
+ control->ras_fri = (b + 1) % control->ras_max_record_count;
+ } else {
+ u32 g0, g1;
+
+ /* b < a, which means, we write from
+ * a to the end of the table, and from
+ * the start of the table to b.
+ */
+ g0 = control->ras_max_record_count - a;
+ g1 = b + 1;
+ res = __ras_eeprom_write(control, buf, a, g0);
+ if (res)
+ goto Out;
+ res = __ras_eeprom_write(control,
+ buf + g0 * RAS_TABLE_RECORD_SIZE, 0, g1);
+ if (res)
+ goto Out;
+ control->ras_fri = g1 % control->ras_max_record_count;
+ }
+ }
+ control->ras_num_recs = 1 +
+ (control->ras_max_record_count + b - control->ras_fri)
+ % control->ras_max_record_count;
+Out:
+ kfree(buf);
+ return res;
+}
+
+static int ras_eeprom_update_header(struct ras_eeprom_control *control)
+{
+ struct ras_core_context *ras_core = to_ras_core_context(control);
+ int threshold_config = control->record_threshold_config;
+ u8 *buf, *pp, csum;
+ u32 buf_size;
+ int bad_page_count;
+ int res;
+
+ bad_page_count = ras_umc_get_badpage_count(ras_core);
+ /* Modify the header if it exceeds.
+ */
+ if (threshold_config != 0 &&
+ bad_page_count > control->record_threshold_count) {
+ RAS_DEV_WARN(ras_core->dev,
+ "Saved bad pages %d reaches threshold value %d\n",
+ bad_page_count, control->record_threshold_count);
+ control->tbl_hdr.header = RAS_TABLE_HDR_BAD;
+ if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) {
+ control->tbl_rai.rma_status = RAS_GPU_RETIRED__ECC_REACH_THRESHOLD;
+ control->tbl_rai.health_percent = 0;
+ }
+
+ if ((threshold_config != WARN_NONSTOP_OVER_THRESHOLD) &&
+ (threshold_config != NONSTOP_OVER_THRESHOLD))
+ ras_core->is_rma = true;
+
+ /* ignore the -ENOTSUPP return value */
+ ras_core_event_notify(ras_core, RAS_EVENT_ID__DEVICE_RMA, NULL);
+ }
+
+ if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
+ control->tbl_hdr.tbl_size = RAS_TABLE_HEADER_SIZE +
+ RAS_TABLE_V2_1_INFO_SIZE +
+ control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
+ else
+ control->tbl_hdr.tbl_size = RAS_TABLE_HEADER_SIZE +
+ control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
+ control->tbl_hdr.checksum = 0;
+
+ buf_size = control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
+ buf = kcalloc(control->ras_num_recs, RAS_TABLE_RECORD_SIZE, GFP_KERNEL);
+ if (!buf) {
+ RAS_DEV_ERR(ras_core->dev,
+ "allocating memory for table of size %d bytes failed\n",
+ control->tbl_hdr.tbl_size);
+ res = -ENOMEM;
+ goto Out;
+ }
+
+ res = __eeprom_read(ras_core,
+ control->i2c_address +
+ control->ras_record_offset,
+ buf, buf_size);
+ if (res < 0) {
+ RAS_DEV_ERR(ras_core->dev,
+ "EEPROM failed reading records:%d\n", res);
+ goto Out;
+ } else if (res < buf_size) {
+ RAS_DEV_ERR(ras_core->dev,
+ "EEPROM read %d out of %d bytes\n", res, buf_size);
+ res = -EIO;
+ goto Out;
+ }
+
+ /**
+ * bad page records have been stored in eeprom,
+ * now calculate gpu health percent
+ */
+ if (threshold_config != 0 &&
+ control->tbl_hdr.version >= RAS_TABLE_VER_V2_1 &&
+ bad_page_count <= control->record_threshold_count)
+ control->tbl_rai.health_percent = ((control->record_threshold_count -
+ bad_page_count) * 100) / control->record_threshold_count;
+
+ /* Recalc the checksum.
+ */
+ csum = 0;
+ for (pp = buf; pp < buf + buf_size; pp++)
+ csum += *pp;
+
+ csum += __calc_hdr_byte_sum(control);
+ if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
+ csum += __calc_ras_info_byte_sum(control);
+ /* avoid sign extension when assigning to "checksum" */
+ csum = -csum;
+ control->tbl_hdr.checksum = csum;
+ res = __write_table_header(control);
+ if (!res && control->tbl_hdr.version > RAS_TABLE_VER_V1)
+ res = __write_table_ras_info(control);
+Out:
+ kfree(buf);
+ return res;
+}
+
+/**
+ * ras_core_eeprom_append -- append records to the EEPROM RAS table
+ * @control: pointer to control structure
+ * @record: array of records to append
+ * @num: number of records in @record array
+ *
+ * Append @num records to the table, calculate the checksum and write
+ * the table back to EEPROM. The maximum number of records that
+ * can be appended is between 1 and control->ras_max_record_count,
+ * regardless of how many records are already stored in the table.
+ *
+ * Return 0 on success or if EEPROM is not supported, -errno on error.
+ */
+int ras_eeprom_append(struct ras_core_context *ras_core,
+ struct eeprom_umc_record *record, const u32 num)
+{
+ struct ras_eeprom_control *control = &ras_core->ras_eeprom;
+ int res;
+
+ if (!__is_ras_eeprom_supported(ras_core))
+ return 0;
+
+ if (num == 0) {
+ RAS_DEV_ERR(ras_core->dev, "will not append 0 records\n");
+ return -EINVAL;
+ } else if ((num + control->ras_num_recs) > control->ras_max_record_count) {
+ RAS_DEV_ERR(ras_core->dev,
+ "cannot append %d records than the size of table %d\n",
+ num, control->ras_max_record_count);
+ return -EINVAL;
+ }
+
+ mutex_lock(&control->ras_tbl_mutex);
+ res = ras_eeprom_append_table(control, record, num);
+ if (!res)
+ res = ras_eeprom_update_header(control);
+
+ mutex_unlock(&control->ras_tbl_mutex);
+
+ return res;
+}
+
+/**
+ * __ras_eeprom_read -- read indexed from EEPROM into buffer
+ * @control: pointer to control structure
+ * @buf: pointer to buffer to read into
+ * @fri: first record index, start reading at this index, absolute index
+ * @num: number of records to read
+ *
+ * The caller must hold the table mutex in @control.
+ * Return 0 on success, -errno otherwise.
+ */
+static int __ras_eeprom_read(struct ras_eeprom_control *control,
+ u8 *buf, const u32 fri, const u32 num)
+{
+ struct ras_core_context *ras_core = to_ras_core_context(control);
+ u32 buf_size;
+ int res;
+
+ /* i2c may be unstable in gpu reset */
+ buf_size = num * RAS_TABLE_RECORD_SIZE;
+ res = __eeprom_read(ras_core,
+ control->i2c_address +
+ RAS_INDEX_TO_OFFSET(control, fri),
+ buf, buf_size);
+ if (res < 0) {
+ RAS_DEV_ERR(ras_core->dev,
+ "Reading %d EEPROM table records error:%d\n", num, res);
+ } else if (res < buf_size) {
+ /* Short read, return error.
+ */
+ RAS_DEV_ERR(ras_core->dev,
+ "Read %d records out of %d\n",
+ (res/RAS_TABLE_RECORD_SIZE), num);
+ res = -EIO;
+ } else {
+ res = 0;
+ }
+
+ return res;
+}
+
+int ras_eeprom_read(struct ras_core_context *ras_core,
+ struct eeprom_umc_record *record, const u32 num)
+{
+ struct ras_eeprom_control *control = &ras_core->ras_eeprom;
+ int i, res;
+ u8 *buf, *pp;
+ u32 g0, g1;
+
+ if (!__is_ras_eeprom_supported(ras_core))
+ return 0;
+
+ if (num == 0) {
+ RAS_DEV_ERR(ras_core->dev, "will not read 0 records\n");
+ return -EINVAL;
+ } else if (num > control->ras_num_recs) {
+ RAS_DEV_ERR(ras_core->dev,
+ "too many records to read:%d available:%d\n",
+ num, control->ras_num_recs);
+ return -EINVAL;
+ }
+
+ buf = kcalloc(num, RAS_TABLE_RECORD_SIZE, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ /* Determine how many records to read, from the first record
+ * index, fri, to the end of the table, and from the beginning
+ * of the table, such that the total number of records is
+ * @num, and we handle wrap around when fri > 0 and
+ * fri + num > RAS_MAX_RECORD_COUNT.
+ *
+ * First we compute the index of the last element
+ * which would be fetched from each region,
+ * g0 is in [fri, fri + num - 1], and
+ * g1 is in [0, RAS_MAX_RECORD_COUNT - 1].
+ * Then, if g0 < RAS_MAX_RECORD_COUNT, the index of
+ * the last element to fetch, we set g0 to _the number_
+ * of elements to fetch, @num, since we know that the last
+ * indexed to be fetched does not exceed the table.
+ *
+ * If, however, g0 >= RAS_MAX_RECORD_COUNT, then
+ * we set g0 to the number of elements to read
+ * until the end of the table, and g1 to the number of
+ * elements to read from the beginning of the table.
+ */
+ g0 = control->ras_fri + num - 1;
+ g1 = g0 % control->ras_max_record_count;
+ if (g0 < control->ras_max_record_count) {
+ g0 = num;
+ g1 = 0;
+ } else {
+ g0 = control->ras_max_record_count - control->ras_fri;
+ g1 += 1;
+ }
+
+ mutex_lock(&control->ras_tbl_mutex);
+ res = __ras_eeprom_read(control, buf, control->ras_fri, g0);
+ if (res)
+ goto Out;
+ if (g1) {
+ res = __ras_eeprom_read(control,
+ buf + g0 * RAS_TABLE_RECORD_SIZE, 0, g1);
+ if (res)
+ goto Out;
+ }
+
+ res = 0;
+
+ /* Read up everything? Then transform.
+ */
+ pp = buf;
+ for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) {
+ __decode_table_record_from_buf(control, &record[i], pp);
+
+ /* update bad channel bitmap */
+ if ((record[i].mem_channel < BITS_PER_TYPE(control->bad_channel_bitmap)) &&
+ !(control->bad_channel_bitmap & (1 << record[i].mem_channel))) {
+ control->bad_channel_bitmap |= 1 << record[i].mem_channel;
+ control->update_channel_flag = true;
+ }
+ }
+Out:
+ kfree(buf);
+ mutex_unlock(&control->ras_tbl_mutex);
+
+ return res;
+}
+
+uint32_t ras_eeprom_max_record_count(struct ras_core_context *ras_core)
+{
+ struct ras_eeprom_control *control = &ras_core->ras_eeprom;
+
+ /* get available eeprom table version first before eeprom table init */
+ ras_set_eeprom_table_version(control);
+
+ if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
+ return RAS_MAX_RECORD_COUNT_V2_1;
+ else
+ return RAS_MAX_RECORD_COUNT;
+}
+
+/**
+ * __verify_ras_table_checksum -- verify the RAS EEPROM table checksum
+ * @control: pointer to control structure
+ *
+ * Check the checksum of the stored in EEPROM RAS table.
+ *
+ * Return 0 if the checksum is correct,
+ * positive if it is not correct, and
+ * -errno on I/O error.
+ */
+static int __verify_ras_table_checksum(struct ras_eeprom_control *control)
+{
+ struct ras_core_context *ras_core = to_ras_core_context(control);
+ int buf_size, res;
+ u8 csum, *buf, *pp;
+
+ if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
+ buf_size = RAS_TABLE_HEADER_SIZE +
+ RAS_TABLE_V2_1_INFO_SIZE +
+ control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
+ else
+ buf_size = RAS_TABLE_HEADER_SIZE +
+ control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
+
+ buf = kzalloc(buf_size, GFP_KERNEL);
+ if (!buf) {
+ RAS_DEV_ERR(ras_core->dev,
+ "Out of memory checking RAS table checksum.\n");
+ return -ENOMEM;
+ }
+
+ res = __eeprom_read(ras_core,
+ control->i2c_address +
+ control->ras_header_offset,
+ buf, buf_size);
+ if (res < buf_size) {
+ RAS_DEV_ERR(ras_core->dev,
+ "Partial read for checksum, res:%d\n", res);
+ /* On partial reads, return -EIO.
+ */
+ if (res >= 0)
+ res = -EIO;
+ goto Out;
+ }
+
+ csum = 0;
+ for (pp = buf; pp < buf + buf_size; pp++)
+ csum += *pp;
+Out:
+ kfree(buf);
+ return res < 0 ? res : csum;
+}
+
+static int __read_table_ras_info(struct ras_eeprom_control *control)
+{
+ struct ras_eeprom_table_ras_info *rai = &control->tbl_rai;
+ struct ras_core_context *ras_core = to_ras_core_context(control);
+ unsigned char *buf;
+ int res;
+
+ buf = kzalloc(RAS_TABLE_V2_1_INFO_SIZE, GFP_KERNEL);
+ if (!buf) {
+ RAS_DEV_ERR(ras_core->dev,
+ "Failed to alloc buf to read EEPROM table ras info\n");
+ return -ENOMEM;
+ }
+
+ /**
+ * EEPROM table V2_1 supports ras info,
+ * read EEPROM table ras info
+ */
+ res = __eeprom_read(ras_core,
+ control->i2c_address + control->ras_info_offset,
+ buf, RAS_TABLE_V2_1_INFO_SIZE);
+ if (res < RAS_TABLE_V2_1_INFO_SIZE) {
+ RAS_DEV_ERR(ras_core->dev,
+ "Failed to read EEPROM table ras info, res:%d\n", res);
+ res = res >= 0 ? -EIO : res;
+ goto Out;
+ }
+
+ __decode_table_ras_info_from_buf(rai, buf);
+
+Out:
+ kfree(buf);
+ return res == RAS_TABLE_V2_1_INFO_SIZE ? 0 : res;
+}
+
+static int __check_ras_table_status(struct ras_core_context *ras_core)
+{
+ struct ras_eeprom_control *control = &ras_core->ras_eeprom;
+ unsigned char buf[RAS_TABLE_HEADER_SIZE] = { 0 };
+ struct ras_eeprom_table_header *hdr;
+ int res;
+
+ hdr = &control->tbl_hdr;
+
+ if (!__is_ras_eeprom_supported(ras_core))
+ return 0;
+
+ if (!__get_eeprom_i2c_addr(ras_core, control))
+ return -EINVAL;
+
+ control->ras_header_offset = RAS_HDR_START;
+ control->ras_info_offset = RAS_TABLE_V2_1_INFO_START;
+ mutex_init(&control->ras_tbl_mutex);
+
+ /* Read the table header from EEPROM address */
+ res = __eeprom_read(ras_core,
+ control->i2c_address + control->ras_header_offset,
+ buf, RAS_TABLE_HEADER_SIZE);
+ if (res < RAS_TABLE_HEADER_SIZE) {
+ RAS_DEV_ERR(ras_core->dev,
+ "Failed to read EEPROM table header, res:%d\n", res);
+ return res >= 0 ? -EIO : res;
+ }
+
+ __decode_table_header_from_buf(hdr, buf);
+
+ if (hdr->header != RAS_TABLE_HDR_VAL &&
+ hdr->header != RAS_TABLE_HDR_BAD) {
+ RAS_DEV_INFO(ras_core->dev, "Creating a new EEPROM table");
+ return ras_eeprom_reset_table(ras_core);
+ }
+
+ switch (hdr->version) {
+ case RAS_TABLE_VER_V2_1:
+ case RAS_TABLE_VER_V3:
+ control->ras_num_recs = RAS_NUM_RECS_V2_1(hdr);
+ control->ras_record_offset = RAS_RECORD_START_V2_1;
+ control->ras_max_record_count = RAS_MAX_RECORD_COUNT_V2_1;
+ break;
+ case RAS_TABLE_VER_V1:
+ control->ras_num_recs = RAS_NUM_RECS(hdr);
+ control->ras_record_offset = RAS_RECORD_START;
+ control->ras_max_record_count = RAS_MAX_RECORD_COUNT;
+ break;
+ default:
+ RAS_DEV_ERR(ras_core->dev,
+ "RAS header invalid, unsupported version: %u",
+ hdr->version);
+ return -EINVAL;
+ }
+
+ if (control->ras_num_recs > control->ras_max_record_count) {
+ RAS_DEV_ERR(ras_core->dev,
+ "RAS header invalid, records in header: %u max allowed :%u",
+ control->ras_num_recs, control->ras_max_record_count);
+ return -EINVAL;
+ }
+
+ control->ras_fri = RAS_OFFSET_TO_INDEX(control, hdr->first_rec_offset);
+
+ return 0;
+}
+
+int ras_eeprom_check_storage_status(struct ras_core_context *ras_core)
+{
+ struct ras_eeprom_control *control = &ras_core->ras_eeprom;
+ struct ras_eeprom_table_header *hdr;
+ int bad_page_count;
+ int res = 0;
+
+ if (!__is_ras_eeprom_supported(ras_core))
+ return 0;
+
+ if (!__get_eeprom_i2c_addr(ras_core, control))
+ return -EINVAL;
+
+ hdr = &control->tbl_hdr;
+
+ bad_page_count = ras_umc_get_badpage_count(ras_core);
+ if (hdr->header == RAS_TABLE_HDR_VAL) {
+ RAS_DEV_INFO(ras_core->dev,
+ "Found existing EEPROM table with %d records\n",
+ bad_page_count);
+
+ if (hdr->version >= RAS_TABLE_VER_V2_1) {
+ res = __read_table_ras_info(control);
+ if (res)
+ return res;
+ }
+
+ res = __verify_ras_table_checksum(control);
+ if (res)
+ RAS_DEV_ERR(ras_core->dev,
+ "RAS table incorrect checksum or error:%d\n", res);
+
+ /* Warn if we are at 90% of the threshold or above
+ */
+ if (10 * bad_page_count >= 9 * control->record_threshold_count)
+ RAS_DEV_WARN(ras_core->dev,
+ "RAS records:%u exceeds 90%% of threshold:%d\n",
+ bad_page_count,
+ control->record_threshold_count);
+
+ } else if (hdr->header == RAS_TABLE_HDR_BAD &&
+ control->record_threshold_config != 0) {
+ if (hdr->version >= RAS_TABLE_VER_V2_1) {
+ res = __read_table_ras_info(control);
+ if (res)
+ return res;
+ }
+
+ res = __verify_ras_table_checksum(control);
+ if (res)
+ RAS_DEV_ERR(ras_core->dev,
+ "RAS Table incorrect checksum or error:%d\n", res);
+
+ if (control->record_threshold_count >= bad_page_count) {
+ /* This means that, the threshold was increased since
+ * the last time the system was booted, and now,
+ * ras->record_threshold_count - control->num_recs > 0,
+ * so that at least one more record can be saved,
+ * before the page count threshold is reached.
+ */
+ RAS_DEV_INFO(ras_core->dev,
+ "records:%d threshold:%d, resetting RAS table header signature",
+ bad_page_count,
+ control->record_threshold_count);
+ res = ras_eeprom_correct_header_tag(control, RAS_TABLE_HDR_VAL);
+ } else {
+ RAS_DEV_ERR(ras_core->dev, "RAS records:%d exceed threshold:%d",
+ bad_page_count, control->record_threshold_count);
+ if ((control->record_threshold_config == WARN_NONSTOP_OVER_THRESHOLD) ||
+ (control->record_threshold_config == NONSTOP_OVER_THRESHOLD)) {
+ RAS_DEV_WARN(ras_core->dev,
+ "Please consult AMD Service Action Guide (SAG) for appropriate service procedures\n");
+ res = 0;
+ } else {
+ ras_core->is_rma = true;
+ RAS_DEV_ERR(ras_core->dev,
+ "User defined threshold is set, runtime service will be halt when threshold is reached\n");
+ }
+ }
+ }
+
+ return res < 0 ? res : 0;
+}
+
+int ras_eeprom_hw_init(struct ras_core_context *ras_core)
+{
+ struct ras_eeprom_control *control;
+ struct ras_eeprom_config *eeprom_cfg;
+
+ if (!ras_core)
+ return -EINVAL;
+
+ ras_core->is_rma = false;
+
+ control = &ras_core->ras_eeprom;
+
+ memset(control, 0, sizeof(*control));
+
+ eeprom_cfg = &ras_core->config->eeprom_cfg;
+ control->record_threshold_config =
+ eeprom_cfg->eeprom_record_threshold_config;
+
+ control->record_threshold_count = ras_eeprom_max_record_count(ras_core);
+ if (eeprom_cfg->eeprom_record_threshold_count <
+ control->record_threshold_count)
+ control->record_threshold_count =
+ eeprom_cfg->eeprom_record_threshold_count;
+
+ control->sys_func = eeprom_cfg->eeprom_sys_fn;
+ control->max_read_len = eeprom_cfg->max_i2c_read_len;
+ control->max_write_len = eeprom_cfg->max_i2c_write_len;
+ control->i2c_adapter = eeprom_cfg->eeprom_i2c_adapter;
+ control->i2c_port = eeprom_cfg->eeprom_i2c_port;
+ control->i2c_address = eeprom_cfg->eeprom_i2c_addr;
+
+ control->update_channel_flag = false;
+
+ return __check_ras_table_status(ras_core);
+}
+
+int ras_eeprom_hw_fini(struct ras_core_context *ras_core)
+{
+ struct ras_eeprom_control *control;
+
+ if (!ras_core)
+ return -EINVAL;
+
+ control = &ras_core->ras_eeprom;
+ mutex_destroy(&control->ras_tbl_mutex);
+
+ return 0;
+}
+
+uint32_t ras_eeprom_get_record_count(struct ras_core_context *ras_core)
+{
+ if (!ras_core)
+ return 0;
+
+ return ras_core->ras_eeprom.ras_num_recs;
+}
+
+void ras_eeprom_sync_info(struct ras_core_context *ras_core)
+{
+ struct ras_eeprom_control *control;
+
+ if (!ras_core)
+ return;
+
+ control = &ras_core->ras_eeprom;
+ ras_core_event_notify(ras_core, RAS_EVENT_ID__UPDATE_BAD_PAGE_NUM,
+ &control->ras_num_recs);
+ ras_core_event_notify(ras_core, RAS_EVENT_ID__UPDATE_BAD_CHANNEL_BITMAP,
+ &control->bad_channel_bitmap);
+}
+
+enum ras_gpu_health_status
+ ras_eeprom_check_gpu_status(struct ras_core_context *ras_core)
+{
+ struct ras_eeprom_control *control = &ras_core->ras_eeprom;
+ struct ras_eeprom_table_ras_info *rai = &control->tbl_rai;
+
+ if (!__is_ras_eeprom_supported(ras_core) ||
+ !control->record_threshold_config)
+ return RAS_GPU_HEALTH_NONE;
+
+ if (control->tbl_hdr.header == RAS_TABLE_HDR_BAD)
+ return RAS_GPU_IN_BAD_STATUS;
+
+ return rai->rma_status;
+}
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_eeprom.h b/drivers/gpu/drm/amd/ras/rascore/ras_eeprom.h
new file mode 100644
index 000000000000..2abe566c18b6
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_eeprom.h
@@ -0,0 +1,197 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __RAS_EEPROM_H__
+#define __RAS_EEPROM_H__
+#include "ras_sys.h"
+
+#define RAS_TABLE_VER_V1 0x00010000
+#define RAS_TABLE_VER_V2_1 0x00021000
+#define RAS_TABLE_VER_V3 0x00030000
+
+#define NONSTOP_OVER_THRESHOLD -2
+#define WARN_NONSTOP_OVER_THRESHOLD -1
+#define DISABLE_RETIRE_PAGE 0
+
+/*
+ * Bad address pfn : eeprom_umc_record.retired_row_pfn[39:0],
+ * nps mode: eeprom_umc_record.retired_row_pfn[47:40]
+ */
+#define EEPROM_RECORD_UMC_ADDR_MASK 0xFFFFFFFFFFULL
+#define EEPROM_RECORD_UMC_NPS_MASK 0xFF0000000000ULL
+#define EEPROM_RECORD_UMC_NPS_SHIFT 40
+
+#define EEPROM_RECORD_UMC_NPS_MODE(RECORD) \
+ (((RECORD)->retired_row_pfn & EEPROM_RECORD_UMC_NPS_MASK) >> \
+ EEPROM_RECORD_UMC_NPS_SHIFT)
+
+#define EEPROM_RECORD_UMC_ADDR_PFN(RECORD) \
+ ((RECORD)->retired_row_pfn & EEPROM_RECORD_UMC_ADDR_MASK)
+
+#define EEPROM_RECORD_SETUP_UMC_ADDR_AND_NPS(RECORD, ADDR, NPS) \
+do { \
+ uint64_t tmp = (NPS); \
+ tmp = ((tmp << EEPROM_RECORD_UMC_NPS_SHIFT) & EEPROM_RECORD_UMC_NPS_MASK); \
+ tmp |= (ADDR) & EEPROM_RECORD_UMC_ADDR_MASK; \
+ (RECORD)->retired_row_pfn = tmp; \
+} while (0)
+
+enum ras_gpu_health_status {
+ RAS_GPU_HEALTH_NONE = 0,
+ RAS_GPU_HEALTH_USABLE = 1,
+ RAS_GPU_RETIRED__ECC_REACH_THRESHOLD = 2,
+ RAS_GPU_IN_BAD_STATUS = 3,
+};
+
+enum ras_eeprom_err_type {
+ RAS_EEPROM_ERR_NA,
+ RAS_EEPROM_ERR_RECOVERABLE,
+ RAS_EEPROM_ERR_NON_RECOVERABLE,
+ RAS_EEPROM_ERR_COUNT,
+};
+
+struct ras_eeprom_table_header {
+ uint32_t header;
+ uint32_t version;
+ uint32_t first_rec_offset;
+ uint32_t tbl_size;
+ uint32_t checksum;
+} __packed;
+
+struct ras_eeprom_table_ras_info {
+ u8 rma_status;
+ u8 health_percent;
+ u16 ecc_page_threshold;
+ u32 padding[64 - 1];
+} __packed;
+
+struct ras_eeprom_control {
+ struct ras_eeprom_table_header tbl_hdr;
+ struct ras_eeprom_table_ras_info tbl_rai;
+
+ /* record threshold */
+ int record_threshold_config;
+ uint32_t record_threshold_count;
+ bool update_channel_flag;
+
+ const struct ras_eeprom_sys_func *sys_func;
+ void *i2c_adapter;
+ u32 i2c_port;
+ u16 max_read_len;
+ u16 max_write_len;
+
+ /* Base I2C EEPPROM 19-bit memory address,
+ * where the table is located. For more information,
+ * see top of amdgpu_eeprom.c.
+ */
+ u32 i2c_address;
+
+ /* The byte offset off of @i2c_address
+ * where the table header is found,
+ * and where the records start--always
+ * right after the header.
+ */
+ u32 ras_header_offset;
+ u32 ras_info_offset;
+ u32 ras_record_offset;
+
+ /* Number of records in the table.
+ */
+ u32 ras_num_recs;
+
+ /* First record index to read, 0-based.
+ * Range is [0, num_recs-1]. This is
+ * an absolute index, starting right after
+ * the table header.
+ */
+ u32 ras_fri;
+
+ /* Maximum possible number of records
+ * we could store, i.e. the maximum capacity
+ * of the table.
+ */
+ u32 ras_max_record_count;
+
+ /* Protect table access via this mutex.
+ */
+ struct mutex ras_tbl_mutex;
+
+ /* Record channel info which occurred bad pages
+ */
+ u32 bad_channel_bitmap;
+};
+
+/*
+ * Represents single table record. Packed to be easily serialized into byte
+ * stream.
+ */
+struct eeprom_umc_record {
+
+ union {
+ uint64_t address;
+ uint64_t offset;
+ };
+
+ uint64_t retired_row_pfn;
+ uint64_t ts;
+
+ enum ras_eeprom_err_type err_type;
+
+ union {
+ unsigned char bank;
+ unsigned char cu;
+ };
+
+ unsigned char mem_channel;
+ unsigned char mcumc_id;
+
+ /* The following variables will not be saved to eeprom.
+ */
+ uint64_t cur_nps_retired_row_pfn;
+ uint32_t cur_nps_bank;
+ uint32_t cur_nps;
+};
+
+struct ras_core_context;
+int ras_eeprom_hw_init(struct ras_core_context *ras_core);
+int ras_eeprom_hw_fini(struct ras_core_context *ras_core);
+
+int ras_eeprom_reset_table(struct ras_core_context *ras_core);
+
+bool ras_eeprom_check_safety_watermark(struct ras_core_context *ras_core);
+
+int ras_eeprom_read(struct ras_core_context *ras_core,
+ struct eeprom_umc_record *records, const u32 num);
+
+int ras_eeprom_append(struct ras_core_context *ras_core,
+ struct eeprom_umc_record *records, const u32 num);
+
+uint32_t ras_eeprom_max_record_count(struct ras_core_context *ras_core);
+uint32_t ras_eeprom_get_record_count(struct ras_core_context *ras_core);
+void ras_eeprom_sync_info(struct ras_core_context *ras_core);
+
+int ras_eeprom_check_storage_status(struct ras_core_context *ras_core);
+enum ras_gpu_health_status
+ ras_eeprom_check_gpu_status(struct ras_core_context *ras_core);
+#endif
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_gfx.c b/drivers/gpu/drm/amd/ras/rascore/ras_gfx.c
new file mode 100644
index 000000000000..f5ce28777705
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_gfx.c
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "ras.h"
+#include "ras_gfx_v9_0.h"
+#include "ras_gfx.h"
+#include "ras_core_status.h"
+
+static const struct ras_gfx_ip_func *ras_gfx_get_ip_funcs(
+ struct ras_core_context *ras_core, uint32_t ip_version)
+{
+ switch (ip_version) {
+ case IP_VERSION(9, 4, 3):
+ case IP_VERSION(9, 4, 4):
+ case IP_VERSION(9, 5, 0):
+ return &gfx_ras_func_v9_0;
+ default:
+ RAS_DEV_ERR(ras_core->dev,
+ "GFX ip version(0x%x) is not supported!\n", ip_version);
+ break;
+ }
+
+ return NULL;
+}
+
+int ras_gfx_get_ta_subblock(struct ras_core_context *ras_core,
+ uint32_t error_type, uint32_t subblock, uint32_t *ta_subblock)
+{
+ struct ras_gfx *gfx = &ras_core->ras_gfx;
+
+ return gfx->ip_func->get_ta_subblock(ras_core,
+ error_type, subblock, ta_subblock);
+}
+
+int ras_gfx_hw_init(struct ras_core_context *ras_core)
+{
+ struct ras_gfx *gfx = &ras_core->ras_gfx;
+
+ gfx->gfx_ip_version = ras_core->config->gfx_ip_version;
+
+ gfx->ip_func = ras_gfx_get_ip_funcs(ras_core, gfx->gfx_ip_version);
+
+ return gfx->ip_func ? RAS_CORE_OK : -EINVAL;
+}
+
+int ras_gfx_hw_fini(struct ras_core_context *ras_core)
+{
+ return 0;
+}
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_gfx.h b/drivers/gpu/drm/amd/ras/rascore/ras_gfx.h
new file mode 100644
index 000000000000..8a42d69fb0ad
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_gfx.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#ifndef __RAS_GFX_H__
+#define __RAS_GFX_H__
+
+struct ras_gfx_ip_func {
+ int (*get_ta_subblock)(struct ras_core_context *ras_core,
+ uint32_t error_type, uint32_t subblock, uint32_t *ta_subblock);
+};
+
+struct ras_gfx {
+ uint32_t gfx_ip_version;
+ const struct ras_gfx_ip_func *ip_func;
+};
+
+int ras_gfx_hw_init(struct ras_core_context *ras_core);
+int ras_gfx_hw_fini(struct ras_core_context *ras_core);
+
+int ras_gfx_get_ta_subblock(struct ras_core_context *ras_core,
+ uint32_t error_type, uint32_t subblock, uint32_t *ta_subblock);
+
+#endif
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_gfx_v9_0.c b/drivers/gpu/drm/amd/ras/rascore/ras_gfx_v9_0.c
new file mode 100644
index 000000000000..6213d3f125be
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_gfx_v9_0.c
@@ -0,0 +1,426 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#include "ras.h"
+#include "ras_gfx_v9_0.h"
+#include "ras_core_status.h"
+
+enum ta_gfx_v9_subblock {
+ /*CPC*/
+ TA_GFX_V9__GFX_CPC_INDEX_START = 0,
+ TA_GFX_V9__GFX_CPC_SCRATCH = TA_GFX_V9__GFX_CPC_INDEX_START,
+ TA_GFX_V9__GFX_CPC_UCODE,
+ TA_GFX_V9__GFX_DC_STATE_ME1,
+ TA_GFX_V9__GFX_DC_CSINVOC_ME1,
+ TA_GFX_V9__GFX_DC_RESTORE_ME1,
+ TA_GFX_V9__GFX_DC_STATE_ME2,
+ TA_GFX_V9__GFX_DC_CSINVOC_ME2,
+ TA_GFX_V9__GFX_DC_RESTORE_ME2,
+ TA_GFX_V9__GFX_CPC_INDEX_END = TA_GFX_V9__GFX_DC_RESTORE_ME2,
+ /* CPF*/
+ TA_GFX_V9__GFX_CPF_INDEX_START,
+ TA_GFX_V9__GFX_CPF_ROQ_ME2 = TA_GFX_V9__GFX_CPF_INDEX_START,
+ TA_GFX_V9__GFX_CPF_ROQ_ME1,
+ TA_GFX_V9__GFX_CPF_TAG,
+ TA_GFX_V9__GFX_CPF_INDEX_END = TA_GFX_V9__GFX_CPF_TAG,
+ /* CPG*/
+ TA_GFX_V9__GFX_CPG_INDEX_START,
+ TA_GFX_V9__GFX_CPG_DMA_ROQ = TA_GFX_V9__GFX_CPG_INDEX_START,
+ TA_GFX_V9__GFX_CPG_DMA_TAG,
+ TA_GFX_V9__GFX_CPG_TAG,
+ TA_GFX_V9__GFX_CPG_INDEX_END = TA_GFX_V9__GFX_CPG_TAG,
+ /* GDS*/
+ TA_GFX_V9__GFX_GDS_INDEX_START,
+ TA_GFX_V9__GFX_GDS_MEM = TA_GFX_V9__GFX_GDS_INDEX_START,
+ TA_GFX_V9__GFX_GDS_INPUT_QUEUE,
+ TA_GFX_V9__GFX_GDS_OA_PHY_CMD_RAM_MEM,
+ TA_GFX_V9__GFX_GDS_OA_PHY_DATA_RAM_MEM,
+ TA_GFX_V9__GFX_GDS_OA_PIPE_MEM,
+ TA_GFX_V9__GFX_GDS_INDEX_END = TA_GFX_V9__GFX_GDS_OA_PIPE_MEM,
+ /* SPI*/
+ TA_GFX_V9__GFX_SPI_SR_MEM,
+ /* SQ*/
+ TA_GFX_V9__GFX_SQ_INDEX_START,
+ TA_GFX_V9__GFX_SQ_SGPR = TA_GFX_V9__GFX_SQ_INDEX_START,
+ TA_GFX_V9__GFX_SQ_LDS_D,
+ TA_GFX_V9__GFX_SQ_LDS_I,
+ TA_GFX_V9__GFX_SQ_VGPR, /* VGPR = SP*/
+ TA_GFX_V9__GFX_SQ_INDEX_END = TA_GFX_V9__GFX_SQ_VGPR,
+ /* SQC (3 ranges)*/
+ TA_GFX_V9__GFX_SQC_INDEX_START,
+ /* SQC range 0*/
+ TA_GFX_V9__GFX_SQC_INDEX0_START = TA_GFX_V9__GFX_SQC_INDEX_START,
+ TA_GFX_V9__GFX_SQC_INST_UTCL1_LFIFO =
+ TA_GFX_V9__GFX_SQC_INDEX0_START,
+ TA_GFX_V9__GFX_SQC_DATA_CU0_WRITE_DATA_BUF,
+ TA_GFX_V9__GFX_SQC_DATA_CU0_UTCL1_LFIFO,
+ TA_GFX_V9__GFX_SQC_DATA_CU1_WRITE_DATA_BUF,
+ TA_GFX_V9__GFX_SQC_DATA_CU1_UTCL1_LFIFO,
+ TA_GFX_V9__GFX_SQC_DATA_CU2_WRITE_DATA_BUF,
+ TA_GFX_V9__GFX_SQC_DATA_CU2_UTCL1_LFIFO,
+ TA_GFX_V9__GFX_SQC_INDEX0_END =
+ TA_GFX_V9__GFX_SQC_DATA_CU2_UTCL1_LFIFO,
+ /* SQC range 1*/
+ TA_GFX_V9__GFX_SQC_INDEX1_START,
+ TA_GFX_V9__GFX_SQC_INST_BANKA_TAG_RAM =
+ TA_GFX_V9__GFX_SQC_INDEX1_START,
+ TA_GFX_V9__GFX_SQC_INST_BANKA_UTCL1_MISS_FIFO,
+ TA_GFX_V9__GFX_SQC_INST_BANKA_MISS_FIFO,
+ TA_GFX_V9__GFX_SQC_INST_BANKA_BANK_RAM,
+ TA_GFX_V9__GFX_SQC_DATA_BANKA_TAG_RAM,
+ TA_GFX_V9__GFX_SQC_DATA_BANKA_HIT_FIFO,
+ TA_GFX_V9__GFX_SQC_DATA_BANKA_MISS_FIFO,
+ TA_GFX_V9__GFX_SQC_DATA_BANKA_DIRTY_BIT_RAM,
+ TA_GFX_V9__GFX_SQC_DATA_BANKA_BANK_RAM,
+ TA_GFX_V9__GFX_SQC_INDEX1_END =
+ TA_GFX_V9__GFX_SQC_DATA_BANKA_BANK_RAM,
+ /* SQC range 2*/
+ TA_GFX_V9__GFX_SQC_INDEX2_START,
+ TA_GFX_V9__GFX_SQC_INST_BANKB_TAG_RAM =
+ TA_GFX_V9__GFX_SQC_INDEX2_START,
+ TA_GFX_V9__GFX_SQC_INST_BANKB_UTCL1_MISS_FIFO,
+ TA_GFX_V9__GFX_SQC_INST_BANKB_MISS_FIFO,
+ TA_GFX_V9__GFX_SQC_INST_BANKB_BANK_RAM,
+ TA_GFX_V9__GFX_SQC_DATA_BANKB_TAG_RAM,
+ TA_GFX_V9__GFX_SQC_DATA_BANKB_HIT_FIFO,
+ TA_GFX_V9__GFX_SQC_DATA_BANKB_MISS_FIFO,
+ TA_GFX_V9__GFX_SQC_DATA_BANKB_DIRTY_BIT_RAM,
+ TA_GFX_V9__GFX_SQC_DATA_BANKB_BANK_RAM,
+ TA_GFX_V9__GFX_SQC_INDEX2_END =
+ TA_GFX_V9__GFX_SQC_DATA_BANKB_BANK_RAM,
+ TA_GFX_V9__GFX_SQC_INDEX_END = TA_GFX_V9__GFX_SQC_INDEX2_END,
+ /* TA*/
+ TA_GFX_V9__GFX_TA_INDEX_START,
+ TA_GFX_V9__GFX_TA_FS_DFIFO = TA_GFX_V9__GFX_TA_INDEX_START,
+ TA_GFX_V9__GFX_TA_FS_AFIFO,
+ TA_GFX_V9__GFX_TA_FL_LFIFO,
+ TA_GFX_V9__GFX_TA_FX_LFIFO,
+ TA_GFX_V9__GFX_TA_FS_CFIFO,
+ TA_GFX_V9__GFX_TA_INDEX_END = TA_GFX_V9__GFX_TA_FS_CFIFO,
+ /* TCA*/
+ TA_GFX_V9__GFX_TCA_INDEX_START,
+ TA_GFX_V9__GFX_TCA_HOLE_FIFO = TA_GFX_V9__GFX_TCA_INDEX_START,
+ TA_GFX_V9__GFX_TCA_REQ_FIFO,
+ TA_GFX_V9__GFX_TCA_INDEX_END = TA_GFX_V9__GFX_TCA_REQ_FIFO,
+ /* TCC (5 sub-ranges)*/
+ TA_GFX_V9__GFX_TCC_INDEX_START,
+ /* TCC range 0*/
+ TA_GFX_V9__GFX_TCC_INDEX0_START = TA_GFX_V9__GFX_TCC_INDEX_START,
+ TA_GFX_V9__GFX_TCC_CACHE_DATA = TA_GFX_V9__GFX_TCC_INDEX0_START,
+ TA_GFX_V9__GFX_TCC_CACHE_DATA_BANK_0_1,
+ TA_GFX_V9__GFX_TCC_CACHE_DATA_BANK_1_0,
+ TA_GFX_V9__GFX_TCC_CACHE_DATA_BANK_1_1,
+ TA_GFX_V9__GFX_TCC_CACHE_DIRTY_BANK_0,
+ TA_GFX_V9__GFX_TCC_CACHE_DIRTY_BANK_1,
+ TA_GFX_V9__GFX_TCC_HIGH_RATE_TAG,
+ TA_GFX_V9__GFX_TCC_LOW_RATE_TAG,
+ TA_GFX_V9__GFX_TCC_INDEX0_END = TA_GFX_V9__GFX_TCC_LOW_RATE_TAG,
+ /* TCC range 1*/
+ TA_GFX_V9__GFX_TCC_INDEX1_START,
+ TA_GFX_V9__GFX_TCC_IN_USE_DEC = TA_GFX_V9__GFX_TCC_INDEX1_START,
+ TA_GFX_V9__GFX_TCC_IN_USE_TRANSFER,
+ TA_GFX_V9__GFX_TCC_INDEX1_END =
+ TA_GFX_V9__GFX_TCC_IN_USE_TRANSFER,
+ /* TCC range 2*/
+ TA_GFX_V9__GFX_TCC_INDEX2_START,
+ TA_GFX_V9__GFX_TCC_RETURN_DATA = TA_GFX_V9__GFX_TCC_INDEX2_START,
+ TA_GFX_V9__GFX_TCC_RETURN_CONTROL,
+ TA_GFX_V9__GFX_TCC_UC_ATOMIC_FIFO,
+ TA_GFX_V9__GFX_TCC_WRITE_RETURN,
+ TA_GFX_V9__GFX_TCC_WRITE_CACHE_READ,
+ TA_GFX_V9__GFX_TCC_SRC_FIFO,
+ TA_GFX_V9__GFX_TCC_SRC_FIFO_NEXT_RAM,
+ TA_GFX_V9__GFX_TCC_CACHE_TAG_PROBE_FIFO,
+ TA_GFX_V9__GFX_TCC_INDEX2_END =
+ TA_GFX_V9__GFX_TCC_CACHE_TAG_PROBE_FIFO,
+ /* TCC range 3*/
+ TA_GFX_V9__GFX_TCC_INDEX3_START,
+ TA_GFX_V9__GFX_TCC_LATENCY_FIFO = TA_GFX_V9__GFX_TCC_INDEX3_START,
+ TA_GFX_V9__GFX_TCC_LATENCY_FIFO_NEXT_RAM,
+ TA_GFX_V9__GFX_TCC_INDEX3_END =
+ TA_GFX_V9__GFX_TCC_LATENCY_FIFO_NEXT_RAM,
+ /* TCC range 4*/
+ TA_GFX_V9__GFX_TCC_INDEX4_START,
+ TA_GFX_V9__GFX_TCC_WRRET_TAG_WRITE_RETURN =
+ TA_GFX_V9__GFX_TCC_INDEX4_START,
+ TA_GFX_V9__GFX_TCC_ATOMIC_RETURN_BUFFER,
+ TA_GFX_V9__GFX_TCC_INDEX4_END =
+ TA_GFX_V9__GFX_TCC_ATOMIC_RETURN_BUFFER,
+ TA_GFX_V9__GFX_TCC_INDEX_END = TA_GFX_V9__GFX_TCC_INDEX4_END,
+ /* TCI*/
+ TA_GFX_V9__GFX_TCI_WRITE_RAM,
+ /* TCP*/
+ TA_GFX_V9__GFX_TCP_INDEX_START,
+ TA_GFX_V9__GFX_TCP_CACHE_RAM = TA_GFX_V9__GFX_TCP_INDEX_START,
+ TA_GFX_V9__GFX_TCP_LFIFO_RAM,
+ TA_GFX_V9__GFX_TCP_CMD_FIFO,
+ TA_GFX_V9__GFX_TCP_VM_FIFO,
+ TA_GFX_V9__GFX_TCP_DB_RAM,
+ TA_GFX_V9__GFX_TCP_UTCL1_LFIFO0,
+ TA_GFX_V9__GFX_TCP_UTCL1_LFIFO1,
+ TA_GFX_V9__GFX_TCP_INDEX_END = TA_GFX_V9__GFX_TCP_UTCL1_LFIFO1,
+ /* TD*/
+ TA_GFX_V9__GFX_TD_INDEX_START,
+ TA_GFX_V9__GFX_TD_SS_FIFO_LO = TA_GFX_V9__GFX_TD_INDEX_START,
+ TA_GFX_V9__GFX_TD_SS_FIFO_HI,
+ TA_GFX_V9__GFX_TD_CS_FIFO,
+ TA_GFX_V9__GFX_TD_INDEX_END = TA_GFX_V9__GFX_TD_CS_FIFO,
+ /* EA (3 sub-ranges)*/
+ TA_GFX_V9__GFX_EA_INDEX_START,
+ /* EA range 0*/
+ TA_GFX_V9__GFX_EA_INDEX0_START = TA_GFX_V9__GFX_EA_INDEX_START,
+ TA_GFX_V9__GFX_EA_DRAMRD_CMDMEM = TA_GFX_V9__GFX_EA_INDEX0_START,
+ TA_GFX_V9__GFX_EA_DRAMWR_CMDMEM,
+ TA_GFX_V9__GFX_EA_DRAMWR_DATAMEM,
+ TA_GFX_V9__GFX_EA_RRET_TAGMEM,
+ TA_GFX_V9__GFX_EA_WRET_TAGMEM,
+ TA_GFX_V9__GFX_EA_GMIRD_CMDMEM,
+ TA_GFX_V9__GFX_EA_GMIWR_CMDMEM,
+ TA_GFX_V9__GFX_EA_GMIWR_DATAMEM,
+ TA_GFX_V9__GFX_EA_INDEX0_END = TA_GFX_V9__GFX_EA_GMIWR_DATAMEM,
+ /* EA range 1*/
+ TA_GFX_V9__GFX_EA_INDEX1_START,
+ TA_GFX_V9__GFX_EA_DRAMRD_PAGEMEM = TA_GFX_V9__GFX_EA_INDEX1_START,
+ TA_GFX_V9__GFX_EA_DRAMWR_PAGEMEM,
+ TA_GFX_V9__GFX_EA_IORD_CMDMEM,
+ TA_GFX_V9__GFX_EA_IOWR_CMDMEM,
+ TA_GFX_V9__GFX_EA_IOWR_DATAMEM,
+ TA_GFX_V9__GFX_EA_GMIRD_PAGEMEM,
+ TA_GFX_V9__GFX_EA_GMIWR_PAGEMEM,
+ TA_GFX_V9__GFX_EA_INDEX1_END = TA_GFX_V9__GFX_EA_GMIWR_PAGEMEM,
+ /* EA range 2*/
+ TA_GFX_V9__GFX_EA_INDEX2_START,
+ TA_GFX_V9__GFX_EA_MAM_D0MEM = TA_GFX_V9__GFX_EA_INDEX2_START,
+ TA_GFX_V9__GFX_EA_MAM_D1MEM,
+ TA_GFX_V9__GFX_EA_MAM_D2MEM,
+ TA_GFX_V9__GFX_EA_MAM_D3MEM,
+ TA_GFX_V9__GFX_EA_INDEX2_END = TA_GFX_V9__GFX_EA_MAM_D3MEM,
+ TA_GFX_V9__GFX_EA_INDEX_END = TA_GFX_V9__GFX_EA_INDEX2_END,
+ /* UTC VM L2 bank*/
+ TA_GFX_V9__UTC_VML2_BANK_CACHE,
+ /* UTC VM walker*/
+ TA_GFX_V9__UTC_VML2_WALKER,
+ /* UTC ATC L2 2MB cache*/
+ TA_GFX_V9__UTC_ATCL2_CACHE_2M_BANK,
+ /* UTC ATC L2 4KB cache*/
+ TA_GFX_V9__UTC_ATCL2_CACHE_4K_BANK,
+ TA_GFX_V9__GFX_MAX
+};
+
+struct ras_gfx_subblock_t {
+ unsigned char *name;
+ int ta_subblock;
+ int hw_supported_error_type;
+ int sw_supported_error_type;
+};
+
+#define RAS_GFX_SUB_BLOCK(subblock, a, b, c, d, e, f, g, h) \
+ [RAS_GFX_V9__##subblock] = { \
+ #subblock, \
+ TA_GFX_V9__##subblock, \
+ ((a) | ((b) << 1) | ((c) << 2) | ((d) << 3)), \
+ (((e) << 1) | ((f) << 3) | (g) | ((h) << 2)), \
+ }
+
+const struct ras_gfx_subblock_t ras_gfx_v9_0_subblocks[] = {
+ RAS_GFX_SUB_BLOCK(GFX_CPC_SCRATCH, 0, 1, 1, 1, 1, 0, 0, 1),
+ RAS_GFX_SUB_BLOCK(GFX_CPC_UCODE, 0, 1, 1, 1, 1, 0, 0, 1),
+ RAS_GFX_SUB_BLOCK(GFX_DC_STATE_ME1, 1, 0, 0, 1, 0, 0, 1, 0),
+ RAS_GFX_SUB_BLOCK(GFX_DC_CSINVOC_ME1, 1, 0, 0, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_DC_RESTORE_ME1, 1, 0, 0, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_DC_STATE_ME2, 1, 0, 0, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_DC_CSINVOC_ME2, 1, 0, 0, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_DC_RESTORE_ME2, 1, 0, 0, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_CPF_ROQ_ME2, 1, 0, 0, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_CPF_ROQ_ME1, 1, 0, 0, 1, 0, 0, 1, 0),
+ RAS_GFX_SUB_BLOCK(GFX_CPF_TAG, 0, 1, 1, 1, 1, 0, 0, 1),
+ RAS_GFX_SUB_BLOCK(GFX_CPG_DMA_ROQ, 1, 0, 0, 1, 0, 0, 1, 0),
+ RAS_GFX_SUB_BLOCK(GFX_CPG_DMA_TAG, 0, 1, 1, 1, 0, 1, 0, 1),
+ RAS_GFX_SUB_BLOCK(GFX_CPG_TAG, 0, 1, 1, 1, 1, 1, 0, 1),
+ RAS_GFX_SUB_BLOCK(GFX_GDS_MEM, 0, 1, 1, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_GDS_INPUT_QUEUE, 1, 0, 0, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_GDS_OA_PHY_CMD_RAM_MEM, 0, 1, 1, 1, 0, 0, 0,
+ 0),
+ RAS_GFX_SUB_BLOCK(GFX_GDS_OA_PHY_DATA_RAM_MEM, 1, 0, 0, 1, 0, 0, 0,
+ 0),
+ RAS_GFX_SUB_BLOCK(GFX_GDS_OA_PIPE_MEM, 0, 1, 1, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_SPI_SR_MEM, 1, 0, 0, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_SQ_SGPR, 0, 1, 1, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_SQ_LDS_D, 0, 1, 1, 1, 1, 0, 0, 1),
+ RAS_GFX_SUB_BLOCK(GFX_SQ_LDS_I, 0, 1, 1, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_SQ_VGPR, 0, 1, 1, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_SQC_INST_UTCL1_LFIFO, 0, 1, 1, 1, 0, 0, 0, 1),
+ RAS_GFX_SUB_BLOCK(GFX_SQC_DATA_CU0_WRITE_DATA_BUF, 0, 1, 1, 1, 0, 0,
+ 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_SQC_DATA_CU0_UTCL1_LFIFO, 0, 1, 1, 1, 0, 0, 0,
+ 0),
+ RAS_GFX_SUB_BLOCK(GFX_SQC_DATA_CU1_WRITE_DATA_BUF, 0, 1, 1, 1, 0, 0,
+ 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_SQC_DATA_CU1_UTCL1_LFIFO, 0, 1, 1, 1, 1, 0, 0,
+ 0),
+ RAS_GFX_SUB_BLOCK(GFX_SQC_DATA_CU2_WRITE_DATA_BUF, 0, 1, 1, 1, 0, 0,
+ 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_SQC_DATA_CU2_UTCL1_LFIFO, 0, 1, 1, 1, 0, 0, 0,
+ 0),
+ RAS_GFX_SUB_BLOCK(GFX_SQC_INST_BANKA_TAG_RAM, 0, 1, 1, 1, 1, 0, 0,
+ 1),
+ RAS_GFX_SUB_BLOCK(GFX_SQC_INST_BANKA_UTCL1_MISS_FIFO, 1, 0, 0, 1, 0,
+ 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_SQC_INST_BANKA_MISS_FIFO, 1, 0, 0, 1, 0, 0, 0,
+ 0),
+ RAS_GFX_SUB_BLOCK(GFX_SQC_INST_BANKA_BANK_RAM, 0, 1, 1, 1, 0, 0, 0,
+ 0),
+ RAS_GFX_SUB_BLOCK(GFX_SQC_DATA_BANKA_TAG_RAM, 0, 1, 1, 1, 0, 0, 0,
+ 0),
+ RAS_GFX_SUB_BLOCK(GFX_SQC_DATA_BANKA_HIT_FIFO, 1, 0, 0, 1, 0, 0, 0,
+ 0),
+ RAS_GFX_SUB_BLOCK(GFX_SQC_DATA_BANKA_MISS_FIFO, 1, 0, 0, 1, 0, 0, 0,
+ 0),
+ RAS_GFX_SUB_BLOCK(GFX_SQC_DATA_BANKA_DIRTY_BIT_RAM, 1, 0, 0, 1, 0, 0,
+ 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_SQC_DATA_BANKA_BANK_RAM, 0, 1, 1, 1, 0, 0, 0,
+ 0),
+ RAS_GFX_SUB_BLOCK(GFX_SQC_INST_BANKB_TAG_RAM, 0, 1, 1, 1, 1, 0, 0,
+ 0),
+ RAS_GFX_SUB_BLOCK(GFX_SQC_INST_BANKB_UTCL1_MISS_FIFO, 1, 0, 0, 1, 0,
+ 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_SQC_INST_BANKB_MISS_FIFO, 1, 0, 0, 1, 0, 0, 0,
+ 0),
+ RAS_GFX_SUB_BLOCK(GFX_SQC_INST_BANKB_BANK_RAM, 0, 1, 1, 1, 0, 0, 0,
+ 0),
+ RAS_GFX_SUB_BLOCK(GFX_SQC_DATA_BANKB_TAG_RAM, 0, 1, 1, 1, 0, 0, 0,
+ 0),
+ RAS_GFX_SUB_BLOCK(GFX_SQC_DATA_BANKB_HIT_FIFO, 1, 0, 0, 1, 0, 0, 0,
+ 0),
+ RAS_GFX_SUB_BLOCK(GFX_SQC_DATA_BANKB_MISS_FIFO, 1, 0, 0, 1, 0, 0, 0,
+ 0),
+ RAS_GFX_SUB_BLOCK(GFX_SQC_DATA_BANKB_DIRTY_BIT_RAM, 1, 0, 0, 1, 0, 0,
+ 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_SQC_DATA_BANKB_BANK_RAM, 0, 1, 1, 1, 0, 0, 0,
+ 0),
+ RAS_GFX_SUB_BLOCK(GFX_TA_FS_DFIFO, 0, 1, 1, 1, 1, 0, 0, 1),
+ RAS_GFX_SUB_BLOCK(GFX_TA_FS_AFIFO, 1, 0, 0, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_TA_FL_LFIFO, 1, 0, 0, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_TA_FX_LFIFO, 1, 0, 0, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_TA_FS_CFIFO, 1, 0, 0, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_TCA_HOLE_FIFO, 1, 0, 0, 1, 0, 1, 1, 0),
+ RAS_GFX_SUB_BLOCK(GFX_TCA_REQ_FIFO, 1, 0, 0, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_TCC_CACHE_DATA, 0, 1, 1, 1, 1, 0, 0, 1),
+ RAS_GFX_SUB_BLOCK(GFX_TCC_CACHE_DATA_BANK_0_1, 0, 1, 1, 1, 1, 0, 0,
+ 1),
+ RAS_GFX_SUB_BLOCK(GFX_TCC_CACHE_DATA_BANK_1_0, 0, 1, 1, 1, 1, 0, 0,
+ 1),
+ RAS_GFX_SUB_BLOCK(GFX_TCC_CACHE_DATA_BANK_1_1, 0, 1, 1, 1, 1, 0, 0,
+ 1),
+ RAS_GFX_SUB_BLOCK(GFX_TCC_CACHE_DIRTY_BANK_0, 0, 1, 1, 1, 0, 0, 0,
+ 0),
+ RAS_GFX_SUB_BLOCK(GFX_TCC_CACHE_DIRTY_BANK_1, 0, 1, 1, 1, 0, 0, 0,
+ 0),
+ RAS_GFX_SUB_BLOCK(GFX_TCC_HIGH_RATE_TAG, 0, 1, 1, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_TCC_LOW_RATE_TAG, 0, 1, 1, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_TCC_IN_USE_DEC, 1, 0, 0, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_TCC_IN_USE_TRANSFER, 1, 0, 0, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_TCC_RETURN_DATA, 1, 0, 0, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_TCC_RETURN_CONTROL, 1, 0, 0, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_TCC_UC_ATOMIC_FIFO, 1, 0, 0, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_TCC_WRITE_RETURN, 1, 0, 0, 1, 0, 1, 1, 0),
+ RAS_GFX_SUB_BLOCK(GFX_TCC_WRITE_CACHE_READ, 1, 0, 0, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_TCC_SRC_FIFO, 0, 1, 1, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_TCC_SRC_FIFO_NEXT_RAM, 1, 0, 0, 1, 0, 0, 1, 0),
+ RAS_GFX_SUB_BLOCK(GFX_TCC_CACHE_TAG_PROBE_FIFO, 1, 0, 0, 1, 0, 0, 0,
+ 0),
+ RAS_GFX_SUB_BLOCK(GFX_TCC_LATENCY_FIFO, 1, 0, 0, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_TCC_LATENCY_FIFO_NEXT_RAM, 1, 0, 0, 1, 0, 0, 0,
+ 0),
+ RAS_GFX_SUB_BLOCK(GFX_TCC_WRRET_TAG_WRITE_RETURN, 1, 0, 0, 1, 0, 0,
+ 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_TCC_ATOMIC_RETURN_BUFFER, 1, 0, 0, 1, 0, 0, 0,
+ 0),
+ RAS_GFX_SUB_BLOCK(GFX_TCI_WRITE_RAM, 1, 0, 0, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_TCP_CACHE_RAM, 0, 1, 1, 1, 1, 0, 0, 1),
+ RAS_GFX_SUB_BLOCK(GFX_TCP_LFIFO_RAM, 0, 1, 1, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_TCP_CMD_FIFO, 1, 0, 0, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_TCP_VM_FIFO, 0, 1, 1, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_TCP_DB_RAM, 1, 0, 0, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_TCP_UTCL1_LFIFO0, 0, 1, 1, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_TCP_UTCL1_LFIFO1, 0, 1, 1, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_TD_SS_FIFO_LO, 0, 1, 1, 1, 1, 0, 0, 1),
+ RAS_GFX_SUB_BLOCK(GFX_TD_SS_FIFO_HI, 0, 1, 1, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_TD_CS_FIFO, 1, 0, 0, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_EA_DRAMRD_CMDMEM, 0, 1, 1, 1, 1, 0, 0, 1),
+ RAS_GFX_SUB_BLOCK(GFX_EA_DRAMWR_CMDMEM, 0, 1, 1, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_EA_DRAMWR_DATAMEM, 0, 1, 1, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_EA_RRET_TAGMEM, 0, 1, 1, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_EA_WRET_TAGMEM, 0, 1, 1, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_EA_GMIRD_CMDMEM, 0, 1, 1, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_EA_GMIWR_CMDMEM, 0, 1, 1, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_EA_GMIWR_DATAMEM, 0, 1, 1, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_EA_DRAMRD_PAGEMEM, 1, 0, 0, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_EA_DRAMWR_PAGEMEM, 1, 0, 0, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_EA_IORD_CMDMEM, 1, 0, 0, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_EA_IOWR_CMDMEM, 1, 0, 0, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_EA_IOWR_DATAMEM, 1, 0, 0, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_EA_GMIRD_PAGEMEM, 1, 0, 0, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_EA_GMIWR_PAGEMEM, 1, 0, 0, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_EA_MAM_D0MEM, 1, 0, 0, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_EA_MAM_D1MEM, 1, 0, 0, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_EA_MAM_D2MEM, 1, 0, 0, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(GFX_EA_MAM_D3MEM, 1, 0, 0, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(UTC_VML2_BANK_CACHE, 0, 1, 1, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(UTC_VML2_WALKER, 0, 1, 1, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(UTC_ATCL2_CACHE_2M_BANK, 1, 0, 0, 1, 0, 0, 0, 0),
+ RAS_GFX_SUB_BLOCK(UTC_ATCL2_CACHE_4K_BANK, 0, 1, 1, 1, 0, 0, 0, 0),
+};
+
+static int gfx_v9_0_get_ta_subblock(struct ras_core_context *ras_core,
+ uint32_t error_type, uint32_t subblock, uint32_t *ta_subblock)
+{
+ const struct ras_gfx_subblock_t *gfx_subblock;
+
+ if (subblock >= ARRAY_SIZE(ras_gfx_v9_0_subblocks))
+ return -EINVAL;
+
+ gfx_subblock = &ras_gfx_v9_0_subblocks[subblock];
+ if (!gfx_subblock->name)
+ return -EPERM;
+
+ if (!(gfx_subblock->hw_supported_error_type & error_type)) {
+ RAS_DEV_ERR(ras_core->dev, "GFX Subblock %s, hardware do not support type 0x%x\n",
+ gfx_subblock->name, error_type);
+ return -EPERM;
+ }
+
+ if (!(gfx_subblock->sw_supported_error_type & error_type)) {
+ RAS_DEV_ERR(ras_core->dev, "GFX Subblock %s, driver do not support type 0x%x\n",
+ gfx_subblock->name, error_type);
+ return -EPERM;
+ }
+
+ *ta_subblock = gfx_subblock->ta_subblock;
+
+ return 0;
+}
+
+const struct ras_gfx_ip_func gfx_ras_func_v9_0 = {
+ .get_ta_subblock = gfx_v9_0_get_ta_subblock,
+};
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_gfx_v9_0.h b/drivers/gpu/drm/amd/ras/rascore/ras_gfx_v9_0.h
new file mode 100644
index 000000000000..659b56619747
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_gfx_v9_0.h
@@ -0,0 +1,259 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#ifndef __RAS_GFX_V9_0_H__
+#define __RAS_GFX_V9_0_H__
+
+enum ras_gfx_v9_subblock {
+ /* CPC */
+ RAS_GFX_V9__GFX_CPC_INDEX_START = 0,
+ RAS_GFX_V9__GFX_CPC_SCRATCH =
+ RAS_GFX_V9__GFX_CPC_INDEX_START,
+ RAS_GFX_V9__GFX_CPC_UCODE,
+ RAS_GFX_V9__GFX_DC_STATE_ME1,
+ RAS_GFX_V9__GFX_DC_CSINVOC_ME1,
+ RAS_GFX_V9__GFX_DC_RESTORE_ME1,
+ RAS_GFX_V9__GFX_DC_STATE_ME2,
+ RAS_GFX_V9__GFX_DC_CSINVOC_ME2,
+ RAS_GFX_V9__GFX_DC_RESTORE_ME2,
+ RAS_GFX_V9__GFX_CPC_INDEX_END =
+ RAS_GFX_V9__GFX_DC_RESTORE_ME2,
+ /* CPF */
+ RAS_GFX_V9__GFX_CPF_INDEX_START,
+ RAS_GFX_V9__GFX_CPF_ROQ_ME2 =
+ RAS_GFX_V9__GFX_CPF_INDEX_START,
+ RAS_GFX_V9__GFX_CPF_ROQ_ME1,
+ RAS_GFX_V9__GFX_CPF_TAG,
+ RAS_GFX_V9__GFX_CPF_INDEX_END = RAS_GFX_V9__GFX_CPF_TAG,
+ /* CPG */
+ RAS_GFX_V9__GFX_CPG_INDEX_START,
+ RAS_GFX_V9__GFX_CPG_DMA_ROQ =
+ RAS_GFX_V9__GFX_CPG_INDEX_START,
+ RAS_GFX_V9__GFX_CPG_DMA_TAG,
+ RAS_GFX_V9__GFX_CPG_TAG,
+ RAS_GFX_V9__GFX_CPG_INDEX_END = RAS_GFX_V9__GFX_CPG_TAG,
+ /* GDS */
+ RAS_GFX_V9__GFX_GDS_INDEX_START,
+ RAS_GFX_V9__GFX_GDS_MEM = RAS_GFX_V9__GFX_GDS_INDEX_START,
+ RAS_GFX_V9__GFX_GDS_INPUT_QUEUE,
+ RAS_GFX_V9__GFX_GDS_OA_PHY_CMD_RAM_MEM,
+ RAS_GFX_V9__GFX_GDS_OA_PHY_DATA_RAM_MEM,
+ RAS_GFX_V9__GFX_GDS_OA_PIPE_MEM,
+ RAS_GFX_V9__GFX_GDS_INDEX_END =
+ RAS_GFX_V9__GFX_GDS_OA_PIPE_MEM,
+ /* SPI */
+ RAS_GFX_V9__GFX_SPI_SR_MEM,
+ /* SQ */
+ RAS_GFX_V9__GFX_SQ_INDEX_START,
+ RAS_GFX_V9__GFX_SQ_SGPR = RAS_GFX_V9__GFX_SQ_INDEX_START,
+ RAS_GFX_V9__GFX_SQ_LDS_D,
+ RAS_GFX_V9__GFX_SQ_LDS_I,
+ RAS_GFX_V9__GFX_SQ_VGPR,
+ RAS_GFX_V9__GFX_SQ_INDEX_END = RAS_GFX_V9__GFX_SQ_VGPR,
+ /* SQC (3 ranges) */
+ RAS_GFX_V9__GFX_SQC_INDEX_START,
+ /* SQC range 0 */
+ RAS_GFX_V9__GFX_SQC_INDEX0_START =
+ RAS_GFX_V9__GFX_SQC_INDEX_START,
+ RAS_GFX_V9__GFX_SQC_INST_UTCL1_LFIFO =
+ RAS_GFX_V9__GFX_SQC_INDEX0_START,
+ RAS_GFX_V9__GFX_SQC_DATA_CU0_WRITE_DATA_BUF,
+ RAS_GFX_V9__GFX_SQC_DATA_CU0_UTCL1_LFIFO,
+ RAS_GFX_V9__GFX_SQC_DATA_CU1_WRITE_DATA_BUF,
+ RAS_GFX_V9__GFX_SQC_DATA_CU1_UTCL1_LFIFO,
+ RAS_GFX_V9__GFX_SQC_DATA_CU2_WRITE_DATA_BUF,
+ RAS_GFX_V9__GFX_SQC_DATA_CU2_UTCL1_LFIFO,
+ RAS_GFX_V9__GFX_SQC_INDEX0_END =
+ RAS_GFX_V9__GFX_SQC_DATA_CU2_UTCL1_LFIFO,
+ /* SQC range 1 */
+ RAS_GFX_V9__GFX_SQC_INDEX1_START,
+ RAS_GFX_V9__GFX_SQC_INST_BANKA_TAG_RAM =
+ RAS_GFX_V9__GFX_SQC_INDEX1_START,
+ RAS_GFX_V9__GFX_SQC_INST_BANKA_UTCL1_MISS_FIFO,
+ RAS_GFX_V9__GFX_SQC_INST_BANKA_MISS_FIFO,
+ RAS_GFX_V9__GFX_SQC_INST_BANKA_BANK_RAM,
+ RAS_GFX_V9__GFX_SQC_DATA_BANKA_TAG_RAM,
+ RAS_GFX_V9__GFX_SQC_DATA_BANKA_HIT_FIFO,
+ RAS_GFX_V9__GFX_SQC_DATA_BANKA_MISS_FIFO,
+ RAS_GFX_V9__GFX_SQC_DATA_BANKA_DIRTY_BIT_RAM,
+ RAS_GFX_V9__GFX_SQC_DATA_BANKA_BANK_RAM,
+ RAS_GFX_V9__GFX_SQC_INDEX1_END =
+ RAS_GFX_V9__GFX_SQC_DATA_BANKA_BANK_RAM,
+ /* SQC range 2 */
+ RAS_GFX_V9__GFX_SQC_INDEX2_START,
+ RAS_GFX_V9__GFX_SQC_INST_BANKB_TAG_RAM =
+ RAS_GFX_V9__GFX_SQC_INDEX2_START,
+ RAS_GFX_V9__GFX_SQC_INST_BANKB_UTCL1_MISS_FIFO,
+ RAS_GFX_V9__GFX_SQC_INST_BANKB_MISS_FIFO,
+ RAS_GFX_V9__GFX_SQC_INST_BANKB_BANK_RAM,
+ RAS_GFX_V9__GFX_SQC_DATA_BANKB_TAG_RAM,
+ RAS_GFX_V9__GFX_SQC_DATA_BANKB_HIT_FIFO,
+ RAS_GFX_V9__GFX_SQC_DATA_BANKB_MISS_FIFO,
+ RAS_GFX_V9__GFX_SQC_DATA_BANKB_DIRTY_BIT_RAM,
+ RAS_GFX_V9__GFX_SQC_DATA_BANKB_BANK_RAM,
+ RAS_GFX_V9__GFX_SQC_INDEX2_END =
+ RAS_GFX_V9__GFX_SQC_DATA_BANKB_BANK_RAM,
+ RAS_GFX_V9__GFX_SQC_INDEX_END =
+ RAS_GFX_V9__GFX_SQC_INDEX2_END,
+ /* TA */
+ RAS_GFX_V9__GFX_TA_INDEX_START,
+ RAS_GFX_V9__GFX_TA_FS_DFIFO =
+ RAS_GFX_V9__GFX_TA_INDEX_START,
+ RAS_GFX_V9__GFX_TA_FS_AFIFO,
+ RAS_GFX_V9__GFX_TA_FL_LFIFO,
+ RAS_GFX_V9__GFX_TA_FX_LFIFO,
+ RAS_GFX_V9__GFX_TA_FS_CFIFO,
+ RAS_GFX_V9__GFX_TA_INDEX_END = RAS_GFX_V9__GFX_TA_FS_CFIFO,
+ /* TCA */
+ RAS_GFX_V9__GFX_TCA_INDEX_START,
+ RAS_GFX_V9__GFX_TCA_HOLE_FIFO =
+ RAS_GFX_V9__GFX_TCA_INDEX_START,
+ RAS_GFX_V9__GFX_TCA_REQ_FIFO,
+ RAS_GFX_V9__GFX_TCA_INDEX_END =
+ RAS_GFX_V9__GFX_TCA_REQ_FIFO,
+ /* TCC (5 sub-ranges) */
+ RAS_GFX_V9__GFX_TCC_INDEX_START,
+ /* TCC range 0 */
+ RAS_GFX_V9__GFX_TCC_INDEX0_START =
+ RAS_GFX_V9__GFX_TCC_INDEX_START,
+ RAS_GFX_V9__GFX_TCC_CACHE_DATA =
+ RAS_GFX_V9__GFX_TCC_INDEX0_START,
+ RAS_GFX_V9__GFX_TCC_CACHE_DATA_BANK_0_1,
+ RAS_GFX_V9__GFX_TCC_CACHE_DATA_BANK_1_0,
+ RAS_GFX_V9__GFX_TCC_CACHE_DATA_BANK_1_1,
+ RAS_GFX_V9__GFX_TCC_CACHE_DIRTY_BANK_0,
+ RAS_GFX_V9__GFX_TCC_CACHE_DIRTY_BANK_1,
+ RAS_GFX_V9__GFX_TCC_HIGH_RATE_TAG,
+ RAS_GFX_V9__GFX_TCC_LOW_RATE_TAG,
+ RAS_GFX_V9__GFX_TCC_INDEX0_END =
+ RAS_GFX_V9__GFX_TCC_LOW_RATE_TAG,
+ /* TCC range 1 */
+ RAS_GFX_V9__GFX_TCC_INDEX1_START,
+ RAS_GFX_V9__GFX_TCC_IN_USE_DEC =
+ RAS_GFX_V9__GFX_TCC_INDEX1_START,
+ RAS_GFX_V9__GFX_TCC_IN_USE_TRANSFER,
+ RAS_GFX_V9__GFX_TCC_INDEX1_END =
+ RAS_GFX_V9__GFX_TCC_IN_USE_TRANSFER,
+ /* TCC range 2 */
+ RAS_GFX_V9__GFX_TCC_INDEX2_START,
+ RAS_GFX_V9__GFX_TCC_RETURN_DATA =
+ RAS_GFX_V9__GFX_TCC_INDEX2_START,
+ RAS_GFX_V9__GFX_TCC_RETURN_CONTROL,
+ RAS_GFX_V9__GFX_TCC_UC_ATOMIC_FIFO,
+ RAS_GFX_V9__GFX_TCC_WRITE_RETURN,
+ RAS_GFX_V9__GFX_TCC_WRITE_CACHE_READ,
+ RAS_GFX_V9__GFX_TCC_SRC_FIFO,
+ RAS_GFX_V9__GFX_TCC_SRC_FIFO_NEXT_RAM,
+ RAS_GFX_V9__GFX_TCC_CACHE_TAG_PROBE_FIFO,
+ RAS_GFX_V9__GFX_TCC_INDEX2_END =
+ RAS_GFX_V9__GFX_TCC_CACHE_TAG_PROBE_FIFO,
+ /* TCC range 3 */
+ RAS_GFX_V9__GFX_TCC_INDEX3_START,
+ RAS_GFX_V9__GFX_TCC_LATENCY_FIFO =
+ RAS_GFX_V9__GFX_TCC_INDEX3_START,
+ RAS_GFX_V9__GFX_TCC_LATENCY_FIFO_NEXT_RAM,
+ RAS_GFX_V9__GFX_TCC_INDEX3_END =
+ RAS_GFX_V9__GFX_TCC_LATENCY_FIFO_NEXT_RAM,
+ /* TCC range 4 */
+ RAS_GFX_V9__GFX_TCC_INDEX4_START,
+ RAS_GFX_V9__GFX_TCC_WRRET_TAG_WRITE_RETURN =
+ RAS_GFX_V9__GFX_TCC_INDEX4_START,
+ RAS_GFX_V9__GFX_TCC_ATOMIC_RETURN_BUFFER,
+ RAS_GFX_V9__GFX_TCC_INDEX4_END =
+ RAS_GFX_V9__GFX_TCC_ATOMIC_RETURN_BUFFER,
+ RAS_GFX_V9__GFX_TCC_INDEX_END =
+ RAS_GFX_V9__GFX_TCC_INDEX4_END,
+ /* TCI */
+ RAS_GFX_V9__GFX_TCI_WRITE_RAM,
+ /* TCP */
+ RAS_GFX_V9__GFX_TCP_INDEX_START,
+ RAS_GFX_V9__GFX_TCP_CACHE_RAM =
+ RAS_GFX_V9__GFX_TCP_INDEX_START,
+ RAS_GFX_V9__GFX_TCP_LFIFO_RAM,
+ RAS_GFX_V9__GFX_TCP_CMD_FIFO,
+ RAS_GFX_V9__GFX_TCP_VM_FIFO,
+ RAS_GFX_V9__GFX_TCP_DB_RAM,
+ RAS_GFX_V9__GFX_TCP_UTCL1_LFIFO0,
+ RAS_GFX_V9__GFX_TCP_UTCL1_LFIFO1,
+ RAS_GFX_V9__GFX_TCP_INDEX_END =
+ RAS_GFX_V9__GFX_TCP_UTCL1_LFIFO1,
+ /* TD */
+ RAS_GFX_V9__GFX_TD_INDEX_START,
+ RAS_GFX_V9__GFX_TD_SS_FIFO_LO =
+ RAS_GFX_V9__GFX_TD_INDEX_START,
+ RAS_GFX_V9__GFX_TD_SS_FIFO_HI,
+ RAS_GFX_V9__GFX_TD_CS_FIFO,
+ RAS_GFX_V9__GFX_TD_INDEX_END = RAS_GFX_V9__GFX_TD_CS_FIFO,
+ /* EA (3 sub-ranges) */
+ RAS_GFX_V9__GFX_EA_INDEX_START,
+ /* EA range 0 */
+ RAS_GFX_V9__GFX_EA_INDEX0_START =
+ RAS_GFX_V9__GFX_EA_INDEX_START,
+ RAS_GFX_V9__GFX_EA_DRAMRD_CMDMEM =
+ RAS_GFX_V9__GFX_EA_INDEX0_START,
+ RAS_GFX_V9__GFX_EA_DRAMWR_CMDMEM,
+ RAS_GFX_V9__GFX_EA_DRAMWR_DATAMEM,
+ RAS_GFX_V9__GFX_EA_RRET_TAGMEM,
+ RAS_GFX_V9__GFX_EA_WRET_TAGMEM,
+ RAS_GFX_V9__GFX_EA_GMIRD_CMDMEM,
+ RAS_GFX_V9__GFX_EA_GMIWR_CMDMEM,
+ RAS_GFX_V9__GFX_EA_GMIWR_DATAMEM,
+ RAS_GFX_V9__GFX_EA_INDEX0_END =
+ RAS_GFX_V9__GFX_EA_GMIWR_DATAMEM,
+ /* EA range 1 */
+ RAS_GFX_V9__GFX_EA_INDEX1_START,
+ RAS_GFX_V9__GFX_EA_DRAMRD_PAGEMEM =
+ RAS_GFX_V9__GFX_EA_INDEX1_START,
+ RAS_GFX_V9__GFX_EA_DRAMWR_PAGEMEM,
+ RAS_GFX_V9__GFX_EA_IORD_CMDMEM,
+ RAS_GFX_V9__GFX_EA_IOWR_CMDMEM,
+ RAS_GFX_V9__GFX_EA_IOWR_DATAMEM,
+ RAS_GFX_V9__GFX_EA_GMIRD_PAGEMEM,
+ RAS_GFX_V9__GFX_EA_GMIWR_PAGEMEM,
+ RAS_GFX_V9__GFX_EA_INDEX1_END =
+ RAS_GFX_V9__GFX_EA_GMIWR_PAGEMEM,
+ /* EA range 2 */
+ RAS_GFX_V9__GFX_EA_INDEX2_START,
+ RAS_GFX_V9__GFX_EA_MAM_D0MEM =
+ RAS_GFX_V9__GFX_EA_INDEX2_START,
+ RAS_GFX_V9__GFX_EA_MAM_D1MEM,
+ RAS_GFX_V9__GFX_EA_MAM_D2MEM,
+ RAS_GFX_V9__GFX_EA_MAM_D3MEM,
+ RAS_GFX_V9__GFX_EA_INDEX2_END =
+ RAS_GFX_V9__GFX_EA_MAM_D3MEM,
+ RAS_GFX_V9__GFX_EA_INDEX_END =
+ RAS_GFX_V9__GFX_EA_INDEX2_END,
+ /* UTC VM L2 bank */
+ RAS_GFX_V9__UTC_VML2_BANK_CACHE,
+ /* UTC VM walker */
+ RAS_GFX_V9__UTC_VML2_WALKER,
+ /* UTC ATC L2 2MB cache */
+ RAS_GFX_V9__UTC_ATCL2_CACHE_2M_BANK,
+ /* UTC ATC L2 4KB cache */
+ RAS_GFX_V9__UTC_ATCL2_CACHE_4K_BANK,
+ RAS_GFX_V9__GFX_MAX
+};
+
+extern const struct ras_gfx_ip_func gfx_ras_func_v9_0;
+
+#endif
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_log_ring.c b/drivers/gpu/drm/amd/ras/rascore/ras_log_ring.c
new file mode 100644
index 000000000000..0a838fdcb2f6
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_log_ring.c
@@ -0,0 +1,317 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#include "ras.h"
+#include "ras_core_status.h"
+#include "ras_log_ring.h"
+
+#define RAS_LOG_MAX_QUERY_SIZE 0xC000
+#define RAS_LOG_MEM_TEMP_SIZE 0x200
+#define RAS_LOG_MEMPOOL_SIZE \
+ (RAS_LOG_MAX_QUERY_SIZE + RAS_LOG_MEM_TEMP_SIZE)
+
+#define BATCH_IDX_TO_TREE_IDX(batch_idx, sn) (((batch_idx) << 8) | (sn))
+
+static const uint64_t ras_rma_aca_reg[ACA_REG_MAX_COUNT] = {
+ [ACA_REG_IDX__CTL] = 0x1,
+ [ACA_REG_IDX__STATUS] = 0xB000000000000137,
+ [ACA_REG_IDX__ADDR] = 0x0,
+ [ACA_REG_IDX__MISC0] = 0x0,
+ [ACA_REG_IDX__CONFG] = 0x1ff00000002,
+ [ACA_REG_IDX__IPID] = 0x9600000000,
+ [ACA_REG_IDX__SYND] = 0x0,
+};
+
+static uint64_t ras_log_ring_get_logged_ecc_count(struct ras_core_context *ras_core)
+{
+ struct ras_log_ring *log_ring = &ras_core->ras_log_ring;
+ uint64_t count = 0;
+
+ if (log_ring->logged_ecc_count < 0) {
+ RAS_DEV_WARN(ras_core->dev,
+ "Error: the logged ras count should not less than 0!\n");
+ count = 0;
+ } else {
+ count = log_ring->logged_ecc_count;
+ }
+
+ if (count > RAS_LOG_MEMPOOL_SIZE)
+ RAS_DEV_WARN(ras_core->dev,
+ "Error: the logged ras count is out of range!\n");
+
+ return count;
+}
+
+static int ras_log_ring_add_data(struct ras_core_context *ras_core,
+ struct ras_log_info *log, struct ras_log_batch_tag *batch_tag)
+{
+ struct ras_log_ring *log_ring = &ras_core->ras_log_ring;
+ unsigned long flags = 0;
+ int ret = 0;
+
+ if (batch_tag && (batch_tag->sub_seqno >= MAX_RECORD_PER_BATCH)) {
+ RAS_DEV_ERR(ras_core->dev,
+ "Invalid batch sub seqno:%d, batch:0x%llx\n",
+ batch_tag->sub_seqno, batch_tag->batch_id);
+ return -EINVAL;
+ }
+
+ spin_lock_irqsave(&log_ring->spin_lock, flags);
+ if (batch_tag) {
+ log->seqno =
+ BATCH_IDX_TO_TREE_IDX(batch_tag->batch_id, batch_tag->sub_seqno);
+ batch_tag->sub_seqno++;
+ } else {
+ log->seqno = BATCH_IDX_TO_TREE_IDX(log_ring->mono_upward_batch_id, 0);
+ log_ring->mono_upward_batch_id++;
+ }
+ ret = radix_tree_insert(&log_ring->ras_log_root, log->seqno, log);
+ if (!ret)
+ log_ring->logged_ecc_count++;
+ spin_unlock_irqrestore(&log_ring->spin_lock, flags);
+
+ if (ret) {
+ RAS_DEV_ERR(ras_core->dev,
+ "Failed to add ras log! seqno:0x%llx, ret:%d\n",
+ log->seqno, ret);
+ mempool_free(log, log_ring->ras_log_mempool);
+ }
+
+ return ret;
+}
+
+static int ras_log_ring_delete_data(struct ras_core_context *ras_core, uint32_t count)
+{
+ struct ras_log_ring *log_ring = &ras_core->ras_log_ring;
+ unsigned long flags = 0;
+ uint32_t i = 0, j = 0;
+ uint64_t batch_id, idx;
+ void *data;
+ int ret = -ENODATA;
+
+ if (count > ras_log_ring_get_logged_ecc_count(ras_core))
+ return -EINVAL;
+
+ spin_lock_irqsave(&log_ring->spin_lock, flags);
+ batch_id = log_ring->last_del_batch_id;
+ while (batch_id < log_ring->mono_upward_batch_id) {
+ for (j = 0; j < MAX_RECORD_PER_BATCH; j++) {
+ idx = BATCH_IDX_TO_TREE_IDX(batch_id, j);
+ data = radix_tree_delete(&log_ring->ras_log_root, idx);
+ if (data) {
+ mempool_free(data, log_ring->ras_log_mempool);
+ log_ring->logged_ecc_count--;
+ i++;
+ }
+ }
+ batch_id = ++log_ring->last_del_batch_id;
+ if (i >= count) {
+ ret = 0;
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&log_ring->spin_lock, flags);
+
+ return ret;
+}
+
+static void ras_log_ring_clear_log_tree(struct ras_core_context *ras_core)
+{
+ struct ras_log_ring *log_ring = &ras_core->ras_log_ring;
+ uint64_t batch_id, idx;
+ unsigned long flags = 0;
+ void *data;
+ int j;
+
+ if ((log_ring->mono_upward_batch_id <= log_ring->last_del_batch_id) &&
+ !log_ring->logged_ecc_count)
+ return;
+
+ spin_lock_irqsave(&log_ring->spin_lock, flags);
+ batch_id = log_ring->last_del_batch_id;
+ while (batch_id < log_ring->mono_upward_batch_id) {
+ for (j = 0; j < MAX_RECORD_PER_BATCH; j++) {
+ idx = BATCH_IDX_TO_TREE_IDX(batch_id, j);
+ data = radix_tree_delete(&log_ring->ras_log_root, idx);
+ if (data) {
+ mempool_free(data, log_ring->ras_log_mempool);
+ log_ring->logged_ecc_count--;
+ }
+ }
+ batch_id++;
+ }
+ spin_unlock_irqrestore(&log_ring->spin_lock, flags);
+
+}
+
+int ras_log_ring_sw_init(struct ras_core_context *ras_core)
+{
+ struct ras_log_ring *log_ring = &ras_core->ras_log_ring;
+
+ memset(log_ring, 0, sizeof(*log_ring));
+
+ log_ring->ras_log_mempool = mempool_create_kmalloc_pool(
+ RAS_LOG_MEMPOOL_SIZE, sizeof(struct ras_log_info));
+ if (!log_ring->ras_log_mempool)
+ return -ENOMEM;
+
+ INIT_RADIX_TREE(&log_ring->ras_log_root, GFP_KERNEL);
+
+ spin_lock_init(&log_ring->spin_lock);
+
+ return 0;
+}
+
+int ras_log_ring_sw_fini(struct ras_core_context *ras_core)
+{
+ struct ras_log_ring *log_ring = &ras_core->ras_log_ring;
+
+ ras_log_ring_clear_log_tree(ras_core);
+ log_ring->logged_ecc_count = 0;
+ log_ring->last_del_batch_id = 0;
+ log_ring->mono_upward_batch_id = 0;
+
+ mempool_destroy(log_ring->ras_log_mempool);
+
+ return 0;
+}
+
+struct ras_log_batch_tag *ras_log_ring_create_batch_tag(struct ras_core_context *ras_core)
+{
+ struct ras_log_ring *log_ring = &ras_core->ras_log_ring;
+ struct ras_log_batch_tag *batch_tag;
+ unsigned long flags = 0;
+
+ batch_tag = kzalloc(sizeof(*batch_tag), GFP_KERNEL);
+ if (!batch_tag)
+ return NULL;
+
+ spin_lock_irqsave(&log_ring->spin_lock, flags);
+ batch_tag->batch_id = log_ring->mono_upward_batch_id;
+ log_ring->mono_upward_batch_id++;
+ spin_unlock_irqrestore(&log_ring->spin_lock, flags);
+
+ batch_tag->sub_seqno = 0;
+ batch_tag->timestamp = ras_core_get_utc_second_timestamp(ras_core);
+ return batch_tag;
+}
+
+void ras_log_ring_destroy_batch_tag(struct ras_core_context *ras_core,
+ struct ras_log_batch_tag *batch_tag)
+{
+ kfree(batch_tag);
+}
+
+void ras_log_ring_add_log_event(struct ras_core_context *ras_core,
+ enum ras_log_event event, void *data, struct ras_log_batch_tag *batch_tag)
+{
+ struct ras_log_ring *log_ring = &ras_core->ras_log_ring;
+ struct device_system_info dev_info = {0};
+ struct ras_log_info *log;
+ uint64_t socket_id;
+ void *obj;
+
+ obj = mempool_alloc_preallocated(log_ring->ras_log_mempool);
+ if (!obj ||
+ (ras_log_ring_get_logged_ecc_count(ras_core) >= RAS_LOG_MEMPOOL_SIZE)) {
+ ras_log_ring_delete_data(ras_core, RAS_LOG_MEM_TEMP_SIZE);
+ if (!obj)
+ obj = mempool_alloc_preallocated(log_ring->ras_log_mempool);
+ }
+
+ if (!obj) {
+ RAS_DEV_ERR(ras_core->dev, "ERROR: Failed to alloc ras log buffer!\n");
+ return;
+ }
+
+ log = (struct ras_log_info *)obj;
+
+ memset(log, 0, sizeof(*log));
+ log->timestamp =
+ batch_tag ? batch_tag->timestamp : ras_core_get_utc_second_timestamp(ras_core);
+ log->event = event;
+
+ if (data)
+ memcpy(&log->aca_reg, data, sizeof(log->aca_reg));
+
+ if (event == RAS_LOG_EVENT_RMA) {
+ memcpy(&log->aca_reg, ras_rma_aca_reg, sizeof(log->aca_reg));
+ ras_core_get_device_system_info(ras_core, &dev_info);
+ socket_id = dev_info.socket_id;
+ log->aca_reg.regs[ACA_REG_IDX__IPID] |= ((socket_id / 4) & 0x01);
+ log->aca_reg.regs[ACA_REG_IDX__IPID] |= (((socket_id % 4) & 0x3) << 44);
+ }
+
+ ras_log_ring_add_data(ras_core, log, batch_tag);
+}
+
+static struct ras_log_info *ras_log_ring_lookup_data(struct ras_core_context *ras_core,
+ uint64_t idx)
+{
+ struct ras_log_ring *log_ring = &ras_core->ras_log_ring;
+ unsigned long flags = 0;
+ void *data;
+
+ spin_lock_irqsave(&log_ring->spin_lock, flags);
+ data = radix_tree_lookup(&log_ring->ras_log_root, idx);
+ spin_unlock_irqrestore(&log_ring->spin_lock, flags);
+
+ return (struct ras_log_info *)data;
+}
+
+int ras_log_ring_get_batch_records(struct ras_core_context *ras_core, uint64_t batch_id,
+ struct ras_log_info **log_arr, uint32_t arr_num)
+{
+ struct ras_log_ring *log_ring = &ras_core->ras_log_ring;
+ uint32_t i, idx, count = 0;
+ void *data;
+
+ if ((batch_id >= log_ring->mono_upward_batch_id) ||
+ (batch_id < log_ring->last_del_batch_id))
+ return -EINVAL;
+
+ for (i = 0; i < MAX_RECORD_PER_BATCH; i++) {
+ idx = BATCH_IDX_TO_TREE_IDX(batch_id, i);
+ data = ras_log_ring_lookup_data(ras_core, idx);
+ if (data) {
+ log_arr[count++] = data;
+ if (count >= arr_num)
+ break;
+ }
+ }
+
+ return count;
+}
+
+int ras_log_ring_get_batch_overview(struct ras_core_context *ras_core,
+ struct ras_log_batch_overview *overview)
+{
+ struct ras_log_ring *log_ring = &ras_core->ras_log_ring;
+
+ overview->logged_batch_count =
+ log_ring->mono_upward_batch_id - log_ring->last_del_batch_id;
+ overview->last_batch_id = log_ring->mono_upward_batch_id;
+ overview->first_batch_id = log_ring->last_del_batch_id;
+
+ return 0;
+}
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_log_ring.h b/drivers/gpu/drm/amd/ras/rascore/ras_log_ring.h
new file mode 100644
index 000000000000..0ff6cc35678d
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_log_ring.h
@@ -0,0 +1,93 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#ifndef __RAS_LOG_RING_H__
+#define __RAS_LOG_RING_H__
+#include "ras_aca.h"
+
+#define MAX_RECORD_PER_BATCH 32
+
+#define RAS_LOG_SEQNO_TO_BATCH_IDX(seqno) ((seqno) >> 8)
+
+enum ras_log_event {
+ RAS_LOG_EVENT_NONE,
+ RAS_LOG_EVENT_UE,
+ RAS_LOG_EVENT_DE,
+ RAS_LOG_EVENT_CE,
+ RAS_LOG_EVENT_POISON_CREATION,
+ RAS_LOG_EVENT_POISON_CONSUMPTION,
+ RAS_LOG_EVENT_RMA,
+ RAS_LOG_EVENT_COUNT_MAX,
+};
+
+struct ras_aca_reg {
+ uint64_t regs[ACA_REG_MAX_COUNT];
+};
+
+struct ras_log_info {
+ uint64_t seqno;
+ uint64_t timestamp;
+ enum ras_log_event event;
+ union {
+ struct ras_aca_reg aca_reg;
+ };
+};
+
+struct ras_log_batch_tag {
+ uint64_t batch_id;
+ uint64_t timestamp;
+ uint32_t sub_seqno;
+};
+
+struct ras_log_ring {
+ void *ras_log_mempool;
+ struct radix_tree_root ras_log_root;
+ spinlock_t spin_lock;
+ uint64_t mono_upward_batch_id;
+ uint64_t last_del_batch_id;
+ int logged_ecc_count;
+};
+
+struct ras_log_batch_overview {
+ uint64_t first_batch_id;
+ uint64_t last_batch_id;
+ uint32_t logged_batch_count;
+};
+
+struct ras_core_context;
+
+int ras_log_ring_sw_init(struct ras_core_context *ras_core);
+int ras_log_ring_sw_fini(struct ras_core_context *ras_core);
+
+struct ras_log_batch_tag *ras_log_ring_create_batch_tag(struct ras_core_context *ras_core);
+void ras_log_ring_destroy_batch_tag(struct ras_core_context *ras_core,
+ struct ras_log_batch_tag *tag);
+void ras_log_ring_add_log_event(struct ras_core_context *ras_core,
+ enum ras_log_event event, void *data, struct ras_log_batch_tag *tag);
+
+int ras_log_ring_get_batch_records(struct ras_core_context *ras_core, uint64_t batch_idx,
+ struct ras_log_info **log_arr, uint32_t arr_num);
+
+int ras_log_ring_get_batch_overview(struct ras_core_context *ras_core,
+ struct ras_log_batch_overview *overview);
+#endif
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_mp1.c b/drivers/gpu/drm/amd/ras/rascore/ras_mp1.c
new file mode 100644
index 000000000000..f3321df85021
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_mp1.c
@@ -0,0 +1,81 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "ras.h"
+#include "ras_mp1.h"
+#include "ras_mp1_v13_0.h"
+
+static const struct ras_mp1_ip_func *ras_mp1_get_ip_funcs(
+ struct ras_core_context *ras_core, uint32_t ip_version)
+{
+ switch (ip_version) {
+ case IP_VERSION(13, 0, 6):
+ case IP_VERSION(13, 0, 14):
+ case IP_VERSION(13, 0, 12):
+ return &mp1_ras_func_v13_0;
+ default:
+ RAS_DEV_ERR(ras_core->dev,
+ "MP1 ip version(0x%x) is not supported!\n", ip_version);
+ break;
+ }
+
+ return NULL;
+}
+
+int ras_mp1_get_bank_count(struct ras_core_context *ras_core,
+ enum ras_err_type type, u32 *count)
+{
+ struct ras_mp1 *mp1 = &ras_core->ras_mp1;
+
+ return mp1->ip_func->get_valid_bank_count(ras_core, type, count);
+}
+
+int ras_mp1_dump_bank(struct ras_core_context *ras_core,
+ u32 type, u32 idx, u32 reg_idx, u64 *val)
+{
+ struct ras_mp1 *mp1 = &ras_core->ras_mp1;
+
+ return mp1->ip_func->dump_valid_bank(ras_core, type, idx, reg_idx, val);
+}
+
+int ras_mp1_hw_init(struct ras_core_context *ras_core)
+{
+ struct ras_mp1 *mp1 = &ras_core->ras_mp1;
+
+ mp1->mp1_ip_version = ras_core->config->mp1_ip_version;
+ mp1->sys_func = ras_core->config->mp1_cfg.mp1_sys_fn;
+ if (!mp1->sys_func) {
+ RAS_DEV_ERR(ras_core->dev, "RAS mp1 sys function not configured!\n");
+ return -EINVAL;
+ }
+
+ mp1->ip_func = ras_mp1_get_ip_funcs(ras_core, mp1->mp1_ip_version);
+
+ return mp1->ip_func ? RAS_CORE_OK : -EINVAL;
+}
+
+int ras_mp1_hw_fini(struct ras_core_context *ras_core)
+{
+ return 0;
+}
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_mp1.h b/drivers/gpu/drm/amd/ras/rascore/ras_mp1.h
new file mode 100644
index 000000000000..de1d08286f41
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_mp1.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#ifndef __RAS_MP1_H__
+#define __RAS_MP1_H__
+#include "ras.h"
+
+enum ras_err_type;
+struct ras_mp1_ip_func {
+ int (*get_valid_bank_count)(struct ras_core_context *ras_core,
+ enum ras_err_type type, u32 *count);
+ int (*dump_valid_bank)(struct ras_core_context *ras_core,
+ enum ras_err_type type, u32 idx, u32 reg_idx, u64 *val);
+};
+
+struct ras_mp1 {
+ uint32_t mp1_ip_version;
+ const struct ras_mp1_ip_func *ip_func;
+ const struct ras_mp1_sys_func *sys_func;
+};
+
+int ras_mp1_hw_init(struct ras_core_context *ras_core);
+int ras_mp1_hw_fini(struct ras_core_context *ras_core);
+
+int ras_mp1_get_bank_count(struct ras_core_context *ras_core,
+ enum ras_err_type type, u32 *count);
+
+int ras_mp1_dump_bank(struct ras_core_context *ras_core,
+ u32 ecc_type, u32 idx, u32 reg_idx, u64 *val);
+#endif
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_mp1_v13_0.c b/drivers/gpu/drm/amd/ras/rascore/ras_mp1_v13_0.c
new file mode 100644
index 000000000000..310d39fc816b
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_mp1_v13_0.c
@@ -0,0 +1,105 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#include "ras.h"
+#include "ras_mp1.h"
+#include "ras_core_status.h"
+#include "ras_mp1_v13_0.h"
+
+#define RAS_MP1_MSG_QueryValidMcaCount 0x36
+#define RAS_MP1_MSG_McaBankDumpDW 0x37
+#define RAS_MP1_MSG_ClearMcaOnRead 0x39
+#define RAS_MP1_MSG_QueryValidMcaCeCount 0x3A
+#define RAS_MP1_MSG_McaBankCeDumpDW 0x3B
+
+#define MAX_UE_BANKS_PER_QUERY 12
+#define MAX_CE_BANKS_PER_QUERY 12
+
+static int mp1_v13_0_get_bank_count(struct ras_core_context *ras_core,
+ enum ras_err_type type, u32 *count)
+{
+ struct ras_mp1 *mp1 = &ras_core->ras_mp1;
+ const struct ras_mp1_sys_func *sys_func = mp1->sys_func;
+ uint32_t bank_count = 0;
+ u32 msg;
+ int ret;
+
+ if (!count)
+ return -EINVAL;
+
+ if (!sys_func || !sys_func->mp1_get_valid_bank_count)
+ return -RAS_CORE_NOT_SUPPORTED;
+
+ switch (type) {
+ case RAS_ERR_TYPE__UE:
+ msg = RAS_MP1_MSG_QueryValidMcaCount;
+ break;
+ case RAS_ERR_TYPE__CE:
+ case RAS_ERR_TYPE__DE:
+ msg = RAS_MP1_MSG_QueryValidMcaCeCount;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ ret = sys_func->mp1_get_valid_bank_count(ras_core, msg, &bank_count);
+ if (!ret) {
+ if (((type == RAS_ERR_TYPE__UE) && (bank_count >= MAX_UE_BANKS_PER_QUERY)) ||
+ ((type == RAS_ERR_TYPE__CE) && (bank_count >= MAX_CE_BANKS_PER_QUERY)))
+ return -EINVAL;
+
+ *count = bank_count;
+ }
+
+ return ret;
+}
+
+static int mp1_v13_0_dump_bank(struct ras_core_context *ras_core,
+ enum ras_err_type type, u32 idx, u32 reg_idx, u64 *val)
+{
+ struct ras_mp1 *mp1 = &ras_core->ras_mp1;
+ const struct ras_mp1_sys_func *sys_func = mp1->sys_func;
+ u32 msg;
+
+ if (!sys_func || !sys_func->mp1_dump_valid_bank)
+ return -RAS_CORE_NOT_SUPPORTED;
+
+ switch (type) {
+ case RAS_ERR_TYPE__UE:
+ msg = RAS_MP1_MSG_McaBankDumpDW;
+ break;
+ case RAS_ERR_TYPE__CE:
+ case RAS_ERR_TYPE__DE:
+ msg = RAS_MP1_MSG_McaBankCeDumpDW;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return sys_func->mp1_dump_valid_bank(ras_core, msg, idx, reg_idx, val);
+}
+
+const struct ras_mp1_ip_func mp1_ras_func_v13_0 = {
+ .get_valid_bank_count = mp1_v13_0_get_bank_count,
+ .dump_valid_bank = mp1_v13_0_dump_bank,
+};
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_mp1_v13_0.h b/drivers/gpu/drm/amd/ras/rascore/ras_mp1_v13_0.h
new file mode 100644
index 000000000000..2edfdb5f6a75
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_mp1_v13_0.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#ifndef __RAS_MP1_V13_0_H__
+#define __RAS_MP1_V13_0_H__
+#include "ras_mp1.h"
+
+extern const struct ras_mp1_ip_func mp1_ras_func_v13_0;
+
+#endif
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_nbio.c b/drivers/gpu/drm/amd/ras/rascore/ras_nbio.c
new file mode 100644
index 000000000000..bfddd104d548
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_nbio.c
@@ -0,0 +1,96 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "ras.h"
+#include "ras_nbio.h"
+#include "ras_nbio_v7_9.h"
+
+static const struct ras_nbio_ip_func *ras_nbio_get_ip_funcs(
+ struct ras_core_context *ras_core, uint32_t ip_version)
+{
+ switch (ip_version) {
+ case IP_VERSION(7, 9, 0):
+ case IP_VERSION(7, 9, 1):
+ return &ras_nbio_v7_9;
+ default:
+ RAS_DEV_ERR(ras_core->dev,
+ "NBIO ip version(0x%x) is not supported!\n", ip_version);
+ break;
+ }
+
+ return NULL;
+}
+
+int ras_nbio_hw_init(struct ras_core_context *ras_core)
+{
+ struct ras_nbio *nbio = &ras_core->ras_nbio;
+
+ nbio->nbio_ip_version = ras_core->config->nbio_ip_version;
+ nbio->sys_func = ras_core->config->nbio_cfg.nbio_sys_fn;
+ if (!nbio->sys_func) {
+ RAS_DEV_ERR(ras_core->dev, "RAS nbio sys function not configured!\n");
+ return -EINVAL;
+ }
+
+ nbio->ip_func = ras_nbio_get_ip_funcs(ras_core, nbio->nbio_ip_version);
+ if (!nbio->ip_func)
+ return -EINVAL;
+
+ if (nbio->sys_func) {
+ if (nbio->sys_func->set_ras_controller_irq_state)
+ nbio->sys_func->set_ras_controller_irq_state(ras_core, true);
+ if (nbio->sys_func->set_ras_err_event_athub_irq_state)
+ nbio->sys_func->set_ras_err_event_athub_irq_state(ras_core, true);
+ }
+
+ return 0;
+}
+
+int ras_nbio_hw_fini(struct ras_core_context *ras_core)
+{
+ struct ras_nbio *nbio = &ras_core->ras_nbio;
+
+ if (nbio->sys_func) {
+ if (nbio->sys_func->set_ras_controller_irq_state)
+ nbio->sys_func->set_ras_controller_irq_state(ras_core, false);
+ if (nbio->sys_func->set_ras_err_event_athub_irq_state)
+ nbio->sys_func->set_ras_err_event_athub_irq_state(ras_core, false);
+ }
+
+ return 0;
+}
+
+bool ras_nbio_handle_irq_error(struct ras_core_context *ras_core, void *data)
+{
+ struct ras_nbio *nbio = &ras_core->ras_nbio;
+
+ if (nbio->ip_func) {
+ if (nbio->ip_func->handle_ras_controller_intr_no_bifring)
+ nbio->ip_func->handle_ras_controller_intr_no_bifring(ras_core);
+ if (nbio->ip_func->handle_ras_err_event_athub_intr_no_bifring)
+ nbio->ip_func->handle_ras_err_event_athub_intr_no_bifring(ras_core);
+ }
+
+ return true;
+}
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_nbio.h b/drivers/gpu/drm/amd/ras/rascore/ras_nbio.h
new file mode 100644
index 000000000000..0a1313e59a02
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_nbio.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __RAS_NBIO_H__
+#define __RAS_NBIO_H__
+#include "ras.h"
+
+struct ras_core_context;
+
+struct ras_nbio_ip_func {
+ int (*handle_ras_controller_intr_no_bifring)(struct ras_core_context *ras_core);
+ int (*handle_ras_err_event_athub_intr_no_bifring)(struct ras_core_context *ras_core);
+ uint32_t (*get_memory_partition_mode)(struct ras_core_context *ras_core);
+};
+
+struct ras_nbio {
+ uint32_t nbio_ip_version;
+ const struct ras_nbio_ip_func *ip_func;
+ const struct ras_nbio_sys_func *sys_func;
+};
+
+int ras_nbio_hw_init(struct ras_core_context *ras_core);
+int ras_nbio_hw_fini(struct ras_core_context *ras_core);
+bool ras_nbio_handle_irq_error(struct ras_core_context *ras_core, void *data);
+#endif
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_nbio_v7_9.c b/drivers/gpu/drm/amd/ras/rascore/ras_nbio_v7_9.c
new file mode 100644
index 000000000000..f17d708ec668
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_nbio_v7_9.c
@@ -0,0 +1,123 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "ras.h"
+#include "ras_nbio_v7_9.h"
+
+#define BIF_BX0_BIF_DOORBELL_INT_CNTL__RAS_ATHUB_ERR_EVENT_INTERRUPT_CLEAR__SHIFT 0x12
+#define BIF_BX0_BIF_DOORBELL_INT_CNTL__RAS_ATHUB_ERR_EVENT_INTERRUPT_CLEAR_MASK 0x00040000L
+#define BIF_BX0_BIF_DOORBELL_INT_CNTL__RAS_ATHUB_ERR_EVENT_INTERRUPT_STATUS__SHIFT 0x2
+#define BIF_BX0_BIF_DOORBELL_INT_CNTL__RAS_ATHUB_ERR_EVENT_INTERRUPT_STATUS_MASK 0x00000004L
+#define BIF_BX0_BIF_DOORBELL_INT_CNTL__RAS_CNTLR_INTERRUPT_CLEAR__SHIFT 0x11
+#define BIF_BX0_BIF_DOORBELL_INT_CNTL__RAS_CNTLR_INTERRUPT_CLEAR_MASK 0x00020000L
+#define BIF_BX0_BIF_DOORBELL_INT_CNTL__RAS_CNTLR_INTERRUPT_STATUS__SHIFT 0x1
+#define BIF_BX0_BIF_DOORBELL_INT_CNTL__RAS_CNTLR_INTERRUPT_STATUS_MASK 0x00000002L
+
+#define regBIF_BX0_BIF_DOORBELL_INT_CNTL_BASE_IDX 2
+#define regBIF_BX0_BIF_DOORBELL_INT_CNTL 0x00fe
+
+#define regBIF_BX0_BIF_INTR_CNTL 0x0101
+#define regBIF_BX0_BIF_INTR_CNTL_BASE_IDX 2
+
+/* BIF_BX0_BIF_INTR_CNTL */
+#define BIF_BX0_BIF_INTR_CNTL__RAS_INTR_VEC_SEL__SHIFT 0x0
+#define BIF_BX0_BIF_INTR_CNTL__RAS_INTR_VEC_SEL_MASK 0x00000001L
+
+#define regBIF_BX_PF0_PARTITION_MEM_STATUS 0x0164
+#define regBIF_BX_PF0_PARTITION_MEM_STATUS_BASE_IDX 2
+/* BIF_BX_PF0_PARTITION_MEM_STATUS */
+#define BIF_BX_PF0_PARTITION_MEM_STATUS__CHANGE_STATUE__SHIFT 0x0
+#define BIF_BX_PF0_PARTITION_MEM_STATUS__NPS_MODE__SHIFT 0x4
+#define BIF_BX_PF0_PARTITION_MEM_STATUS__CHANGE_STATUE_MASK 0x0000000FL
+#define BIF_BX_PF0_PARTITION_MEM_STATUS__NPS_MODE_MASK 0x00000FF0L
+
+
+static int nbio_v7_9_handle_ras_controller_intr_no_bifring(struct ras_core_context *ras_core)
+{
+ uint32_t bif_doorbell_intr_cntl = 0;
+
+ bif_doorbell_intr_cntl =
+ RAS_DEV_RREG32_SOC15(ras_core->dev, NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL);
+
+ if (REG_GET_FIELD(bif_doorbell_intr_cntl,
+ BIF_BX0_BIF_DOORBELL_INT_CNTL, RAS_CNTLR_INTERRUPT_STATUS)) {
+ /* driver has to clear the interrupt status when bif ring is disabled */
+ bif_doorbell_intr_cntl = REG_SET_FIELD(bif_doorbell_intr_cntl,
+ BIF_BX0_BIF_DOORBELL_INT_CNTL,
+ RAS_CNTLR_INTERRUPT_CLEAR, 1);
+
+ RAS_DEV_WREG32_SOC15(ras_core->dev,
+ NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl);
+
+ /* TODO: handle ras controller interrupt */
+ }
+
+ return 0;
+}
+
+static int nbio_v7_9_handle_ras_err_event_athub_intr_no_bifring(struct ras_core_context *ras_core)
+{
+ uint32_t bif_doorbell_intr_cntl = 0;
+ int ret = 0;
+
+ bif_doorbell_intr_cntl =
+ RAS_DEV_RREG32_SOC15(ras_core->dev, NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL);
+
+ if (REG_GET_FIELD(bif_doorbell_intr_cntl,
+ BIF_BX0_BIF_DOORBELL_INT_CNTL, RAS_ATHUB_ERR_EVENT_INTERRUPT_STATUS)) {
+ /* driver has to clear the interrupt status when bif ring is disabled */
+ bif_doorbell_intr_cntl = REG_SET_FIELD(bif_doorbell_intr_cntl,
+ BIF_BX0_BIF_DOORBELL_INT_CNTL,
+ RAS_ATHUB_ERR_EVENT_INTERRUPT_CLEAR, 1);
+
+ RAS_DEV_WREG32_SOC15(ras_core->dev,
+ NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl);
+
+ ret = ras_core_handle_fatal_error(ras_core);
+ }
+
+ return ret;
+}
+
+static uint32_t nbio_v7_9_get_memory_partition_mode(struct ras_core_context *ras_core)
+{
+ uint32_t mem_status;
+ uint32_t mem_mode;
+
+ mem_status =
+ RAS_DEV_RREG32_SOC15(ras_core->dev, NBIO, 0, regBIF_BX_PF0_PARTITION_MEM_STATUS);
+
+ /* Each bit represents a mode 1-8*/
+ mem_mode = REG_GET_FIELD(mem_status, BIF_BX_PF0_PARTITION_MEM_STATUS, NPS_MODE);
+
+ return ffs(mem_mode);
+}
+
+const struct ras_nbio_ip_func ras_nbio_v7_9 = {
+ .handle_ras_controller_intr_no_bifring =
+ nbio_v7_9_handle_ras_controller_intr_no_bifring,
+ .handle_ras_err_event_athub_intr_no_bifring =
+ nbio_v7_9_handle_ras_err_event_athub_intr_no_bifring,
+ .get_memory_partition_mode = nbio_v7_9_get_memory_partition_mode,
+};
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_nbio_v7_9.h b/drivers/gpu/drm/amd/ras/rascore/ras_nbio_v7_9.h
new file mode 100644
index 000000000000..8711c82a927f
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_nbio_v7_9.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __RAS_NBIO_V7_9_H__
+#define __RAS_NBIO_V7_9_H__
+#include "ras_nbio.h"
+
+extern const struct ras_nbio_ip_func ras_nbio_v7_9;
+
+#endif
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_process.c b/drivers/gpu/drm/amd/ras/rascore/ras_process.c
new file mode 100644
index 000000000000..3267dcdb169c
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_process.c
@@ -0,0 +1,322 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#include "ras.h"
+#include "ras_process.h"
+
+#define RAS_EVENT_FIFO_SIZE (128 * sizeof(struct ras_event_req))
+
+#define RAS_POLLING_ECC_TIMEOUT 300
+
+static int ras_process_put_event(struct ras_core_context *ras_core,
+ struct ras_event_req *req)
+{
+ struct ras_process *ras_proc = &ras_core->ras_proc;
+ int ret;
+
+ ret = kfifo_in_spinlocked(&ras_proc->event_fifo,
+ req, sizeof(*req), &ras_proc->fifo_spinlock);
+ if (!ret) {
+ RAS_DEV_ERR(ras_core->dev, "Poison message fifo is full!\n");
+ return -ENOSPC;
+ }
+
+ return 0;
+}
+
+static int ras_process_add_reset_gpu_event(struct ras_core_context *ras_core,
+ uint32_t reset_cause)
+{
+ struct ras_event_req req = {0};
+
+ req.reset = reset_cause;
+
+ return ras_process_put_event(ras_core, &req);
+}
+
+static int ras_process_get_event(struct ras_core_context *ras_core,
+ struct ras_event_req *req)
+{
+ struct ras_process *ras_proc = &ras_core->ras_proc;
+
+ return kfifo_out_spinlocked(&ras_proc->event_fifo,
+ req, sizeof(*req), &ras_proc->fifo_spinlock);
+}
+
+static void ras_process_clear_event_fifo(struct ras_core_context *ras_core)
+{
+ struct ras_event_req req;
+ int ret;
+
+ do {
+ ret = ras_process_get_event(ras_core, &req);
+ } while (ret);
+}
+
+#define AMDGPU_RAS_WAITING_DATA_READY 200
+static int ras_process_umc_event(struct ras_core_context *ras_core,
+ uint32_t event_count)
+{
+ struct ras_ecc_count ecc_data;
+ int ret = 0;
+ uint32_t timeout = 0;
+ uint32_t detected_de_count = 0;
+
+ do {
+ memset(&ecc_data, 0, sizeof(ecc_data));
+ ret = ras_core_update_ecc_info(ras_core);
+ if (ret)
+ return ret;
+
+ ret = ras_core_query_block_ecc_data(ras_core, RAS_BLOCK_ID__UMC, &ecc_data);
+ if (ret)
+ return ret;
+
+ if (ecc_data.new_de_count) {
+ detected_de_count += ecc_data.new_de_count;
+ timeout = 0;
+ } else {
+ if (!timeout && event_count)
+ timeout = AMDGPU_RAS_WAITING_DATA_READY;
+
+ if (timeout) {
+ if (!--timeout)
+ break;
+
+ msleep(1);
+ }
+ }
+ } while (detected_de_count < event_count);
+
+ if (detected_de_count && ras_core_gpu_is_rma(ras_core))
+ ras_process_add_reset_gpu_event(ras_core, GPU_RESET_CAUSE_RMA);
+
+ return 0;
+}
+
+static int ras_process_non_umc_event(struct ras_core_context *ras_core)
+{
+ struct ras_process *ras_proc = &ras_core->ras_proc;
+ struct ras_event_req req;
+ uint32_t event_count = kfifo_len(&ras_proc->event_fifo);
+ uint32_t reset_flags = 0;
+ int ret = 0, i;
+
+ for (i = 0; i < event_count; i++) {
+ memset(&req, 0, sizeof(req));
+ ret = ras_process_get_event(ras_core, &req);
+ if (!ret)
+ continue;
+
+ ras_core_event_notify(ras_core,
+ RAS_EVENT_ID__POISON_CONSUMPTION, &req);
+
+ reset_flags |= req.reset;
+
+ if (req.reset == GPU_RESET_CAUSE_RMA)
+ continue;
+
+ if (req.reset)
+ RAS_DEV_INFO(ras_core->dev,
+ "{%llu} GPU reset for %s RAS poison consumption is issued!\n",
+ req.seqno, ras_core_get_ras_block_name(req.block));
+ else
+ RAS_DEV_INFO(ras_core->dev,
+ "{%llu} %s RAS poison consumption is issued!\n",
+ req.seqno, ras_core_get_ras_block_name(req.block));
+ }
+
+ if (reset_flags) {
+ ret = ras_core_event_notify(ras_core,
+ RAS_EVENT_ID__RESET_GPU, &reset_flags);
+ if (!ret && (reset_flags & GPU_RESET_CAUSE_RMA))
+ return -RAS_CORE_GPU_IN_MODE1_RESET;
+ }
+
+ return ret;
+}
+
+int ras_process_handle_ras_event(struct ras_core_context *ras_core)
+{
+ struct ras_process *ras_proc = &ras_core->ras_proc;
+ uint32_t umc_event_count;
+ int ret;
+
+ ret = ras_core_event_notify(ras_core,
+ RAS_EVENT_ID__RAS_EVENT_PROC_BEGIN, NULL);
+ if (ret)
+ return ret;
+
+ ras_aca_clear_fatal_flag(ras_core);
+ ras_umc_log_pending_bad_bank(ras_core);
+
+ do {
+ umc_event_count = atomic_read(&ras_proc->umc_interrupt_count);
+ ret = ras_process_umc_event(ras_core, umc_event_count);
+ if (ret == -RAS_CORE_GPU_IN_MODE1_RESET)
+ break;
+
+ if (umc_event_count)
+ atomic_sub(umc_event_count, &ras_proc->umc_interrupt_count);
+ } while (atomic_read(&ras_proc->umc_interrupt_count));
+
+ if ((ret != -RAS_CORE_GPU_IN_MODE1_RESET) &&
+ (kfifo_len(&ras_proc->event_fifo)))
+ ret = ras_process_non_umc_event(ras_core);
+
+ if (ret == -RAS_CORE_GPU_IN_MODE1_RESET) {
+ /* Clear poison fifo */
+ ras_process_clear_event_fifo(ras_core);
+ atomic_set(&ras_proc->umc_interrupt_count, 0);
+ }
+
+ ras_core_event_notify(ras_core,
+ RAS_EVENT_ID__RAS_EVENT_PROC_END, NULL);
+ return ret;
+}
+
+static int thread_wait_condition(void *param)
+{
+ struct ras_process *ras_proc = (struct ras_process *)param;
+
+ return (kthread_should_stop() ||
+ atomic_read(&ras_proc->ras_interrupt_req));
+}
+
+static int ras_process_thread(void *context)
+{
+ struct ras_core_context *ras_core = (struct ras_core_context *)context;
+ struct ras_process *ras_proc = &ras_core->ras_proc;
+
+ while (!kthread_should_stop()) {
+ ras_wait_event_interruptible_timeout(&ras_proc->ras_process_wq,
+ thread_wait_condition, ras_proc,
+ msecs_to_jiffies(RAS_POLLING_ECC_TIMEOUT));
+
+ if (kthread_should_stop())
+ break;
+
+ if (!ras_core->is_initialized)
+ continue;
+
+ atomic_set(&ras_proc->ras_interrupt_req, 0);
+
+ if (ras_core_gpu_in_reset(ras_core))
+ continue;
+
+ if (ras_core->sys_fn && ras_core->sys_fn->async_handle_ras_event)
+ ras_core->sys_fn->async_handle_ras_event(ras_core, NULL);
+ else
+ ras_process_handle_ras_event(ras_core);
+ }
+
+ return 0;
+}
+
+int ras_process_init(struct ras_core_context *ras_core)
+{
+ struct ras_process *ras_proc = &ras_core->ras_proc;
+ int ret;
+
+ ret = kfifo_alloc(&ras_proc->event_fifo, RAS_EVENT_FIFO_SIZE, GFP_KERNEL);
+ if (ret)
+ return ret;
+
+ spin_lock_init(&ras_proc->fifo_spinlock);
+
+ init_waitqueue_head(&ras_proc->ras_process_wq);
+
+ ras_proc->ras_process_thread = kthread_run(ras_process_thread,
+ (void *)ras_core, "ras_process_thread");
+ if (!ras_proc->ras_process_thread) {
+ RAS_DEV_ERR(ras_core->dev, "Failed to create ras_process_thread.\n");
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ return 0;
+
+err:
+ ras_process_fini(ras_core);
+ return ret;
+}
+
+int ras_process_fini(struct ras_core_context *ras_core)
+{
+ struct ras_process *ras_proc = &ras_core->ras_proc;
+
+ if (ras_proc->ras_process_thread) {
+ kthread_stop(ras_proc->ras_process_thread);
+ ras_proc->ras_process_thread = NULL;
+ }
+
+ kfifo_free(&ras_proc->event_fifo);
+
+ return 0;
+}
+
+static int ras_process_add_umc_interrupt_req(struct ras_core_context *ras_core,
+ struct ras_event_req *req)
+{
+ struct ras_process *ras_proc = &ras_core->ras_proc;
+
+ atomic_inc(&ras_proc->umc_interrupt_count);
+ atomic_inc(&ras_proc->ras_interrupt_req);
+
+ wake_up(&ras_proc->ras_process_wq);
+ return 0;
+}
+
+static int ras_process_add_non_umc_interrupt_req(struct ras_core_context *ras_core,
+ struct ras_event_req *req)
+{
+ struct ras_process *ras_proc = &ras_core->ras_proc;
+ int ret;
+
+ ret = ras_process_put_event(ras_core, req);
+ if (!ret) {
+ atomic_inc(&ras_proc->ras_interrupt_req);
+ wake_up(&ras_proc->ras_process_wq);
+ }
+
+ return ret;
+}
+
+int ras_process_add_interrupt_req(struct ras_core_context *ras_core,
+ struct ras_event_req *req, bool is_umc)
+{
+ int ret;
+
+ if (!ras_core)
+ return -EINVAL;
+
+ if (!ras_core->is_initialized)
+ return -EPERM;
+
+ if (is_umc)
+ ret = ras_process_add_umc_interrupt_req(ras_core, req);
+ else
+ ret = ras_process_add_non_umc_interrupt_req(ras_core, req);
+
+ return ret;
+}
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_process.h b/drivers/gpu/drm/amd/ras/rascore/ras_process.h
new file mode 100644
index 000000000000..28458b50510e
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_process.h
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#ifndef __RAS_PROCESS_H__
+#define __RAS_PROCESS_H__
+
+struct ras_event_req {
+ uint64_t seqno;
+ uint32_t idx_vf;
+ uint32_t block;
+ uint16_t pasid;
+ uint32_t reset;
+ void *pasid_fn;
+ void *data;
+};
+
+struct ras_process {
+ void *dev;
+ void *ras_process_thread;
+ wait_queue_head_t ras_process_wq;
+ atomic_t ras_interrupt_req;
+ atomic_t umc_interrupt_count;
+ struct kfifo event_fifo;
+ spinlock_t fifo_spinlock;
+};
+
+struct ras_core_context;
+int ras_process_init(struct ras_core_context *ras_core);
+int ras_process_fini(struct ras_core_context *ras_core);
+int ras_process_handle_ras_event(struct ras_core_context *ras_core);
+int ras_process_add_interrupt_req(struct ras_core_context *ras_core,
+ struct ras_event_req *req, bool is_umc);
+#endif
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_psp.c b/drivers/gpu/drm/amd/ras/rascore/ras_psp.c
new file mode 100644
index 000000000000..ccdb42d2dd60
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_psp.c
@@ -0,0 +1,750 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#include "ras.h"
+#include "ras_ta_if.h"
+#include "ras_psp.h"
+#include "ras_psp_v13_0.h"
+
+/* position of instance value in sub_block_index of
+ * ta_ras_trigger_error_input, the sub block uses lower 12 bits
+ */
+#define RAS_TA_INST_MASK 0xfffff000
+#define RAS_TA_INST_SHIFT 0xc
+
+static const struct ras_psp_ip_func *ras_psp_get_ip_funcs(
+ struct ras_core_context *ras_core, uint32_t ip_version)
+{
+ switch (ip_version) {
+ case IP_VERSION(13, 0, 6):
+ case IP_VERSION(13, 0, 14):
+ case IP_VERSION(13, 0, 12):
+ return &ras_psp_v13_0;
+ default:
+ RAS_DEV_ERR(ras_core->dev,
+ "psp ip version(0x%x) is not supported!\n", ip_version);
+ break;
+ }
+
+ return NULL;
+}
+
+static int ras_psp_sync_system_ras_psp_status(struct ras_core_context *ras_core)
+{
+ struct ras_psp *psp = &ras_core->ras_psp;
+ struct ras_ta_ctx *ta_ctx = &ras_core->ras_psp.ta_ctx;
+ struct ras_psp_ctx *psp_ctx = &ras_core->ras_psp.psp_ctx;
+ struct ras_psp_sys_status status = {0};
+ int ret;
+
+ if (psp->sys_func && psp->sys_func->get_ras_psp_system_status) {
+ ret = psp->sys_func->get_ras_psp_system_status(ras_core, &status);
+ if (ret)
+ return ret;
+
+ if (status.initialized) {
+ ta_ctx->preload_ras_ta_enabled = true;
+ ta_ctx->ras_ta_initialized = status.initialized;
+ ta_ctx->session_id = status.session_id;
+ }
+
+ psp_ctx->external_mutex = status.psp_cmd_mutex;
+ }
+
+ return 0;
+}
+
+static int ras_psp_get_ras_ta_init_param(struct ras_core_context *ras_core,
+ struct ras_ta_init_param *ras_ta_param)
+{
+ struct ras_psp *psp = &ras_core->ras_psp;
+
+ if (psp->sys_func && psp->sys_func->get_ras_ta_init_param)
+ return psp->sys_func->get_ras_ta_init_param(ras_core, ras_ta_param);
+
+ RAS_DEV_ERR(ras_core->dev, "Not config get_ras_ta_init_param API!!\n");
+ return -EACCES;
+}
+
+static struct gpu_mem_block *ras_psp_get_gpu_mem(struct ras_core_context *ras_core,
+ enum gpu_mem_type mem_type)
+{
+ struct ras_psp *psp = &ras_core->ras_psp;
+ struct gpu_mem_block *gpu_mem = NULL;
+ int ret;
+
+ switch (mem_type) {
+ case GPU_MEM_TYPE_RAS_PSP_RING:
+ gpu_mem = &psp->psp_ring.ras_ring_gpu_mem;
+ break;
+ case GPU_MEM_TYPE_RAS_PSP_CMD:
+ gpu_mem = &psp->psp_ctx.psp_cmd_gpu_mem;
+ break;
+ case GPU_MEM_TYPE_RAS_PSP_FENCE:
+ gpu_mem = &psp->psp_ctx.out_fence_gpu_mem;
+ break;
+ case GPU_MEM_TYPE_RAS_TA_FW:
+ gpu_mem = &psp->ta_ctx.fw_gpu_mem;
+ break;
+ case GPU_MEM_TYPE_RAS_TA_CMD:
+ gpu_mem = &psp->ta_ctx.cmd_gpu_mem;
+ break;
+ default:
+ return NULL;
+ }
+
+ if (!gpu_mem->ref_count) {
+ ret = ras_core_get_gpu_mem(ras_core, mem_type, gpu_mem);
+ if (ret)
+ return NULL;
+ gpu_mem->mem_type = mem_type;
+ }
+
+ gpu_mem->ref_count++;
+
+ return gpu_mem;
+}
+
+static int ras_psp_put_gpu_mem(struct ras_core_context *ras_core,
+ struct gpu_mem_block *gpu_mem)
+{
+ if (!gpu_mem)
+ return 0;
+
+ gpu_mem->ref_count--;
+
+ if (gpu_mem->ref_count > 0) {
+ return 0;
+ } else if (gpu_mem->ref_count < 0) {
+ RAS_DEV_WARN(ras_core->dev,
+ "Duplicate free gpu memory %u\n", gpu_mem->mem_type);
+ } else {
+ ras_core_put_gpu_mem(ras_core, gpu_mem->mem_type, gpu_mem);
+ memset(gpu_mem, 0, sizeof(*gpu_mem));
+ }
+
+ return 0;
+}
+
+static void __acquire_psp_cmd_lock(struct ras_core_context *ras_core)
+{
+ struct ras_psp_ctx *psp_ctx = &ras_core->ras_psp.psp_ctx;
+
+ if (psp_ctx->external_mutex)
+ mutex_lock(psp_ctx->external_mutex);
+ else
+ mutex_lock(&psp_ctx->internal_mutex);
+}
+
+static void __release_psp_cmd_lock(struct ras_core_context *ras_core)
+{
+ struct ras_psp_ctx *psp_ctx = &ras_core->ras_psp.psp_ctx;
+
+ if (psp_ctx->external_mutex)
+ mutex_unlock(psp_ctx->external_mutex);
+ else
+ mutex_unlock(&psp_ctx->internal_mutex);
+}
+
+static uint32_t __get_ring_frame_slot(struct ras_core_context *ras_core)
+{
+ struct ras_psp *psp = &ras_core->ras_psp;
+ uint32_t ras_ring_wptr_dw;
+
+ ras_ring_wptr_dw = psp->ip_func->psp_ras_ring_wptr_get(ras_core);
+
+ return div64_u64((ras_ring_wptr_dw << 2), sizeof(struct psp_gfx_rb_frame));
+}
+
+static int __set_ring_frame_slot(struct ras_core_context *ras_core,
+ uint32_t slot)
+{
+ struct ras_psp *psp = &ras_core->ras_psp;
+
+ return psp->ip_func->psp_ras_ring_wptr_set(ras_core,
+ (slot * sizeof(struct psp_gfx_rb_frame)) >> 2);
+}
+
+static int write_frame_to_ras_psp_ring(struct ras_core_context *ras_core,
+ struct psp_gfx_rb_frame *frame)
+{
+ struct gpu_mem_block *ring_mem;
+ struct psp_gfx_rb_frame *rb_frame;
+ uint32_t max_frame_slot;
+ uint32_t slot_idx;
+ uint32_t write_flush_read_back = 0;
+ int ret = 0;
+
+ ring_mem = ras_psp_get_gpu_mem(ras_core, GPU_MEM_TYPE_RAS_PSP_RING);
+ if (!ring_mem)
+ return -ENOMEM;
+
+ max_frame_slot =
+ div64_u64(ring_mem->mem_size, sizeof(struct psp_gfx_rb_frame));
+
+ rb_frame =
+ (struct psp_gfx_rb_frame *)ring_mem->mem_cpu_addr;
+
+ slot_idx = __get_ring_frame_slot(ras_core);
+ if (slot_idx >= max_frame_slot)
+ slot_idx = 0;
+
+ memcpy(&rb_frame[slot_idx], frame, sizeof(*frame));
+
+ /* Do a read to force the write of the frame before writing
+ * write pointer.
+ */
+ write_flush_read_back = rb_frame[slot_idx].fence_value;
+ if (write_flush_read_back != frame->fence_value) {
+ RAS_DEV_ERR(ras_core->dev,
+ "Failed to submit ring cmd! cmd:0x%x:0x%x, fence:0x%x:0x%x value:%u, expected:%u\n",
+ rb_frame[slot_idx].cmd_buf_addr_hi,
+ rb_frame[slot_idx].cmd_buf_addr_lo,
+ rb_frame[slot_idx].fence_addr_hi,
+ rb_frame[slot_idx].fence_addr_lo,
+ write_flush_read_back, frame->fence_value);
+ ret = -EACCES;
+ goto err;
+ }
+
+ slot_idx++;
+
+ if (slot_idx >= max_frame_slot)
+ slot_idx = 0;
+
+ __set_ring_frame_slot(ras_core, slot_idx);
+
+err:
+ ras_psp_put_gpu_mem(ras_core, ring_mem);
+ return ret;
+}
+
+static int send_psp_cmd(struct ras_core_context *ras_core,
+ enum psp_gfx_cmd_id gfx_cmd_id, void *cmd_data,
+ uint32_t cmd_size, struct psp_cmd_resp *resp)
+{
+ struct ras_psp_ctx *psp_ctx = &ras_core->ras_psp.psp_ctx;
+ struct gpu_mem_block *psp_cmd_buf = NULL;
+ struct gpu_mem_block *psp_fence_buf = NULL;
+ struct psp_gfx_cmd_resp *gfx_cmd;
+ struct psp_gfx_rb_frame rb_frame;
+ int ret = 0;
+ int timeout = 1000;
+
+ if (!cmd_data || (cmd_size > sizeof(union psp_gfx_commands)) || !resp) {
+ RAS_DEV_ERR(ras_core->dev, "Invalid RAS PSP command, id: %u\n", gfx_cmd_id);
+ return -EINVAL;
+ }
+
+ __acquire_psp_cmd_lock(ras_core);
+
+ psp_cmd_buf = ras_psp_get_gpu_mem(ras_core, GPU_MEM_TYPE_RAS_PSP_CMD);
+ if (!psp_cmd_buf) {
+ ret = -ENOMEM;
+ goto exit;
+ }
+
+ psp_fence_buf = ras_psp_get_gpu_mem(ras_core, GPU_MEM_TYPE_RAS_PSP_FENCE);
+ if (!psp_fence_buf) {
+ ret = -ENOMEM;
+ goto exit;
+ }
+
+ gfx_cmd = (struct psp_gfx_cmd_resp *)psp_cmd_buf->mem_cpu_addr;
+ memset(gfx_cmd, 0, sizeof(*gfx_cmd));
+ gfx_cmd->cmd_id = gfx_cmd_id;
+ memcpy(&gfx_cmd->cmd, cmd_data, cmd_size);
+
+ psp_ctx->in_fence_value++;
+
+ memset(&rb_frame, 0, sizeof(rb_frame));
+ rb_frame.cmd_buf_addr_hi = upper_32_bits(psp_cmd_buf->mem_mc_addr);
+ rb_frame.cmd_buf_addr_lo = lower_32_bits(psp_cmd_buf->mem_mc_addr);
+ rb_frame.fence_addr_hi = upper_32_bits(psp_fence_buf->mem_mc_addr);
+ rb_frame.fence_addr_lo = lower_32_bits(psp_fence_buf->mem_mc_addr);
+ rb_frame.fence_value = psp_ctx->in_fence_value;
+
+ ret = write_frame_to_ras_psp_ring(ras_core, &rb_frame);
+ if (ret) {
+ psp_ctx->in_fence_value--;
+ goto exit;
+ }
+
+ while (*((uint64_t *)psp_fence_buf->mem_cpu_addr) !=
+ psp_ctx->in_fence_value) {
+ if (--timeout == 0)
+ break;
+ /*
+ * Shouldn't wait for timeout when err_event_athub occurs,
+ * because gpu reset thread triggered and lock resource should
+ * be released for psp resume sequence.
+ */
+ if (ras_core_ras_interrupt_detected(ras_core))
+ break;
+
+ msleep(2);
+ }
+
+ resp->status = gfx_cmd->resp.status;
+ resp->session_id = gfx_cmd->resp.session_id;
+
+exit:
+ ras_psp_put_gpu_mem(ras_core, psp_cmd_buf);
+ ras_psp_put_gpu_mem(ras_core, psp_fence_buf);
+
+ __release_psp_cmd_lock(ras_core);
+
+ return ret;
+}
+
+static void __check_ras_ta_cmd_resp(struct ras_core_context *ras_core,
+ struct ras_ta_cmd *ras_cmd)
+{
+
+ if (ras_cmd->ras_out_message.flags.err_inject_switch_disable_flag) {
+ RAS_DEV_WARN(ras_core->dev, "ECC switch disabled\n");
+ ras_cmd->ras_status = RAS_TA_STATUS__ERROR_RAS_NOT_AVAILABLE;
+ } else if (ras_cmd->ras_out_message.flags.reg_access_failure_flag)
+ RAS_DEV_WARN(ras_core->dev, "RAS internal register access blocked\n");
+
+ switch (ras_cmd->ras_status) {
+ case RAS_TA_STATUS__ERROR_UNSUPPORTED_IP:
+ RAS_DEV_WARN(ras_core->dev,
+ "RAS WARNING: cmd failed due to unsupported ip\n");
+ break;
+ case RAS_TA_STATUS__ERROR_UNSUPPORTED_ERROR_INJ:
+ RAS_DEV_WARN(ras_core->dev,
+ "RAS WARNING: cmd failed due to unsupported error injection\n");
+ break;
+ case RAS_TA_STATUS__SUCCESS:
+ break;
+ case RAS_TA_STATUS__TEE_ERROR_ACCESS_DENIED:
+ if (ras_cmd->cmd_id == RAS_TA_CMD_ID__TRIGGER_ERROR)
+ RAS_DEV_WARN(ras_core->dev,
+ "RAS WARNING: Inject error to critical region is not allowed\n");
+ break;
+ default:
+ RAS_DEV_WARN(ras_core->dev,
+ "RAS WARNING: ras status = 0x%X\n", ras_cmd->ras_status);
+ break;
+ }
+}
+
+static int send_ras_ta_runtime_cmd(struct ras_core_context *ras_core,
+ enum ras_ta_cmd_id cmd_id, void *in, uint32_t in_size,
+ void *out, uint32_t out_size)
+{
+ struct ras_ta_ctx *ta_ctx = &ras_core->ras_psp.ta_ctx;
+ struct gpu_mem_block *cmd_mem;
+ struct ras_ta_cmd *ras_cmd;
+ struct psp_gfx_cmd_invoke_cmd invoke_cmd = {0};
+ struct psp_cmd_resp resp = {0};
+ int ret = 0;
+
+ if (!in || (in_size > sizeof(union ras_ta_cmd_input)) ||
+ (cmd_id >= MAX_RAS_TA_CMD_ID)) {
+ RAS_DEV_ERR(ras_core->dev, "Invalid RAS TA command, id: %u\n", cmd_id);
+ return -EINVAL;
+ }
+
+ ras_psp_sync_system_ras_psp_status(ras_core);
+
+ cmd_mem = ras_psp_get_gpu_mem(ras_core, GPU_MEM_TYPE_RAS_TA_CMD);
+ if (!cmd_mem)
+ return -ENOMEM;
+
+ if (!ras_core_down_trylock_gpu_reset_lock(ras_core)) {
+ ret = -EACCES;
+ goto out;
+ }
+
+ ras_cmd = (struct ras_ta_cmd *)cmd_mem->mem_cpu_addr;
+
+ mutex_lock(&ta_ctx->ta_mutex);
+
+ memset(ras_cmd, 0, sizeof(*ras_cmd));
+ ras_cmd->cmd_id = cmd_id;
+ memcpy(&ras_cmd->ras_in_message, in, in_size);
+
+ invoke_cmd.ta_cmd_id = cmd_id;
+ invoke_cmd.session_id = ta_ctx->session_id;
+
+ ret = send_psp_cmd(ras_core, GFX_CMD_ID_INVOKE_CMD,
+ &invoke_cmd, sizeof(invoke_cmd), &resp);
+
+ /* If err_event_athub occurs error inject was successful, however
+ * return status from TA is no long reliable
+ */
+ if (ras_core_ras_interrupt_detected(ras_core)) {
+ ret = 0;
+ goto unlock;
+ }
+
+ if (ret || resp.status) {
+ RAS_DEV_ERR(ras_core->dev,
+ "RAS: Failed to send psp cmd! ret:%d, status:%u\n",
+ ret, resp.status);
+ ret = -ESTRPIPE;
+ goto unlock;
+ }
+
+ if (ras_cmd->if_version > RAS_TA_HOST_IF_VER) {
+ RAS_DEV_WARN(ras_core->dev, "RAS: Unsupported Interface\n");
+ ret = -EINVAL;
+ goto unlock;
+ }
+
+ if (!ras_cmd->ras_status && out && out_size)
+ memcpy(out, &ras_cmd->ras_out_message, out_size);
+
+ __check_ras_ta_cmd_resp(ras_core, ras_cmd);
+
+unlock:
+ mutex_unlock(&ta_ctx->ta_mutex);
+ ras_core_up_gpu_reset_lock(ras_core);
+out:
+ ras_psp_put_gpu_mem(ras_core, cmd_mem);
+ return ret;
+}
+
+static int trigger_ras_ta_error(struct ras_core_context *ras_core,
+ struct ras_ta_trigger_error_input *info, uint32_t instance_mask)
+{
+ uint32_t dev_mask = 0;
+
+ switch (info->block_id) {
+ case RAS_TA_BLOCK__GFX:
+ if (ras_gfx_get_ta_subblock(ras_core, info->inject_error_type,
+ info->sub_block_index, &info->sub_block_index))
+ return -EINVAL;
+
+ dev_mask = RAS_GET_MASK(ras_core->dev, GC, instance_mask);
+ break;
+ case RAS_TA_BLOCK__SDMA:
+ dev_mask = RAS_GET_MASK(ras_core->dev, SDMA0, instance_mask);
+ break;
+ case RAS_TA_BLOCK__VCN:
+ case RAS_TA_BLOCK__JPEG:
+ dev_mask = RAS_GET_MASK(ras_core->dev, VCN, instance_mask);
+ break;
+ default:
+ dev_mask = instance_mask;
+ break;
+ }
+
+ /* reuse sub_block_index for backward compatibility */
+ dev_mask <<= RAS_TA_INST_SHIFT;
+ dev_mask &= RAS_TA_INST_MASK;
+ info->sub_block_index |= dev_mask;
+
+ return send_ras_ta_runtime_cmd(ras_core, RAS_TA_CMD_ID__TRIGGER_ERROR,
+ info, sizeof(*info), NULL, 0);
+}
+
+static int send_load_ta_fw_cmd(struct ras_core_context *ras_core,
+ struct ras_ta_ctx *ta_ctx)
+{
+ struct ras_ta_fw_bin *fw_bin = &ta_ctx->fw_bin;
+ struct gpu_mem_block *fw_mem;
+ struct gpu_mem_block *cmd_mem;
+ struct ras_ta_cmd *ta_cmd;
+ struct ras_ta_init_flags *ta_init_flags;
+ struct psp_gfx_cmd_load_ta psp_load_ta_cmd;
+ struct psp_cmd_resp resp = {0};
+ struct ras_ta_image_header *fw_hdr = NULL;
+ int ret;
+
+ fw_mem = ras_psp_get_gpu_mem(ras_core, GPU_MEM_TYPE_RAS_TA_FW);
+ if (!fw_mem)
+ return -ENOMEM;
+
+ cmd_mem = ras_psp_get_gpu_mem(ras_core, GPU_MEM_TYPE_RAS_TA_CMD);
+ if (!cmd_mem) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ ret = ras_psp_get_ras_ta_init_param(ras_core, &ta_ctx->init_param);
+ if (ret)
+ goto err;
+
+ if (!ras_core_down_trylock_gpu_reset_lock(ras_core)) {
+ ret = -EACCES;
+ goto err;
+ }
+
+ /* copy ras ta binary to shared gpu memory */
+ memcpy(fw_mem->mem_cpu_addr, fw_bin->bin_addr, fw_bin->bin_size);
+ fw_mem->mem_size = fw_bin->bin_size;
+
+ /* Initialize ras ta startup parameter */
+ ta_cmd = (struct ras_ta_cmd *)cmd_mem->mem_cpu_addr;
+ ta_init_flags = &ta_cmd->ras_in_message.init_flags;
+
+ ta_init_flags->poison_mode_en = ta_ctx->init_param.poison_mode_en;
+ ta_init_flags->dgpu_mode = ta_ctx->init_param.dgpu_mode;
+ ta_init_flags->xcc_mask = ta_ctx->init_param.xcc_mask;
+ ta_init_flags->channel_dis_num = ta_ctx->init_param.channel_dis_num;
+ ta_init_flags->nps_mode = ta_ctx->init_param.nps_mode;
+ ta_init_flags->active_umc_mask = ta_ctx->init_param.active_umc_mask;
+
+ /* Setup load ras ta command */
+ memset(&psp_load_ta_cmd, 0, sizeof(psp_load_ta_cmd));
+ psp_load_ta_cmd.app_phy_addr_lo = lower_32_bits(fw_mem->mem_mc_addr);
+ psp_load_ta_cmd.app_phy_addr_hi = upper_32_bits(fw_mem->mem_mc_addr);
+ psp_load_ta_cmd.app_len = fw_mem->mem_size;
+ psp_load_ta_cmd.cmd_buf_phy_addr_lo = lower_32_bits(cmd_mem->mem_mc_addr);
+ psp_load_ta_cmd.cmd_buf_phy_addr_hi = upper_32_bits(cmd_mem->mem_mc_addr);
+ psp_load_ta_cmd.cmd_buf_len = cmd_mem->mem_size;
+
+ ret = send_psp_cmd(ras_core, GFX_CMD_ID_LOAD_TA,
+ &psp_load_ta_cmd, sizeof(psp_load_ta_cmd), &resp);
+ if (!ret && !resp.status) {
+ /* Read TA version at FW offset 0x60 if TA version not found*/
+ fw_hdr = (struct ras_ta_image_header *)fw_bin->bin_addr;
+ RAS_DEV_INFO(ras_core->dev, "PSP: RAS TA(version:%X.%X.%X.%X) is loaded.\n",
+ (fw_hdr->image_version >> 24) & 0xFF, (fw_hdr->image_version >> 16) & 0xFF,
+ (fw_hdr->image_version >> 8) & 0xFF, fw_hdr->image_version & 0xFF);
+ ta_ctx->ta_version = fw_hdr->image_version;
+ ta_ctx->session_id = resp.session_id;
+ ta_ctx->ras_ta_initialized = true;
+ } else {
+ RAS_DEV_ERR(ras_core->dev,
+ "Failed to load RAS TA! ret:%d, status:%d\n", ret, resp.status);
+ }
+
+ ras_core_up_gpu_reset_lock(ras_core);
+
+err:
+ ras_psp_put_gpu_mem(ras_core, fw_mem);
+ ras_psp_put_gpu_mem(ras_core, cmd_mem);
+ return ret;
+}
+
+static int load_ras_ta_firmware(struct ras_core_context *ras_core,
+ struct ras_psp_ta_load *ras_ta_load)
+{
+ struct ras_ta_ctx *ta_ctx = &ras_core->ras_psp.ta_ctx;
+ struct ras_ta_fw_bin *fw_bin = &ta_ctx->fw_bin;
+ int ret;
+
+ fw_bin->bin_addr = ras_ta_load->bin_addr;
+ fw_bin->bin_size = ras_ta_load->bin_size;
+ fw_bin->fw_version = ras_ta_load->fw_version;
+ fw_bin->feature_version = ras_ta_load->feature_version;
+
+ ret = send_load_ta_fw_cmd(ras_core, ta_ctx);
+ if (!ret) {
+ ras_ta_load->out_session_id = ta_ctx->session_id;
+ ras_ta_load->out_loaded_ta_version = ta_ctx->ta_version;
+ }
+
+ return ret;
+}
+
+static int unload_ras_ta_firmware(struct ras_core_context *ras_core,
+ struct ras_psp_ta_unload *ras_ta_unload)
+{
+ struct ras_ta_ctx *ta_ctx = &ras_core->ras_psp.ta_ctx;
+ struct psp_gfx_cmd_unload_ta cmd_unload_ta = {0};
+ struct psp_cmd_resp resp = {0};
+ int ret;
+
+ if (!ras_core_down_trylock_gpu_reset_lock(ras_core))
+ return -EACCES;
+
+ cmd_unload_ta.session_id = ta_ctx->session_id;
+ ret = send_psp_cmd(ras_core, GFX_CMD_ID_UNLOAD_TA,
+ &cmd_unload_ta, sizeof(cmd_unload_ta), &resp);
+ if (ret || resp.status) {
+ RAS_DEV_ERR(ras_core->dev,
+ "Failed to unload RAS TA! ret:%d, status:%u\n",
+ ret, resp.status);
+ goto unlock;
+ }
+
+ kfree(ta_ctx->fw_bin.bin_addr);
+ memset(&ta_ctx->fw_bin, 0, sizeof(ta_ctx->fw_bin));
+ ta_ctx->ta_version = 0;
+ ta_ctx->ras_ta_initialized = false;
+ ta_ctx->session_id = 0;
+
+unlock:
+ ras_core_up_gpu_reset_lock(ras_core);
+
+ return ret;
+}
+
+int ras_psp_load_firmware(struct ras_core_context *ras_core,
+ struct ras_psp_ta_load *ras_ta_load)
+{
+ struct ras_ta_ctx *ta_ctx = &ras_core->ras_psp.ta_ctx;
+ struct ras_psp_ta_unload ras_ta_unload = {0};
+ int ret;
+
+ if (ta_ctx->preload_ras_ta_enabled)
+ return 0;
+
+ if (!ras_ta_load)
+ return -EINVAL;
+
+ if (ta_ctx->ras_ta_initialized) {
+ ras_ta_unload.ras_session_id = ta_ctx->session_id;
+ ret = unload_ras_ta_firmware(ras_core, &ras_ta_unload);
+ if (ret)
+ return ret;
+ }
+
+ return load_ras_ta_firmware(ras_core, ras_ta_load);
+}
+
+int ras_psp_unload_firmware(struct ras_core_context *ras_core,
+ struct ras_psp_ta_unload *ras_ta_unload)
+{
+ struct ras_ta_ctx *ta_ctx = &ras_core->ras_psp.ta_ctx;
+
+ if (ta_ctx->preload_ras_ta_enabled)
+ return 0;
+
+ if ((!ras_ta_unload) ||
+ (ras_ta_unload->ras_session_id != ta_ctx->session_id))
+ return -EINVAL;
+
+ return unload_ras_ta_firmware(ras_core, ras_ta_unload);
+}
+
+int ras_psp_trigger_error(struct ras_core_context *ras_core,
+ struct ras_ta_trigger_error_input *info, uint32_t instance_mask)
+{
+ struct ras_ta_ctx *ta_ctx = &ras_core->ras_psp.ta_ctx;
+
+ if (!ta_ctx->preload_ras_ta_enabled && !ta_ctx->ras_ta_initialized) {
+ RAS_DEV_ERR(ras_core->dev, "RAS: ras firmware not initialized!");
+ return -ENOEXEC;
+ }
+
+ if (!info)
+ return -EINVAL;
+
+ return trigger_ras_ta_error(ras_core, info, instance_mask);
+}
+
+int ras_psp_query_address(struct ras_core_context *ras_core,
+ struct ras_ta_query_address_input *addr_in,
+ struct ras_ta_query_address_output *addr_out)
+{
+ struct ras_ta_ctx *ta_ctx = &ras_core->ras_psp.ta_ctx;
+
+ if (!ta_ctx->preload_ras_ta_enabled &&
+ !ta_ctx->ras_ta_initialized) {
+ RAS_DEV_ERR(ras_core->dev, "RAS: ras firmware not initialized!");
+ return -ENOEXEC;
+ }
+
+ if (!addr_in || !addr_out)
+ return -EINVAL;
+
+ return send_ras_ta_runtime_cmd(ras_core, RAS_TA_CMD_ID__QUERY_ADDRESS,
+ addr_in, sizeof(*addr_in), addr_out, sizeof(*addr_out));
+}
+
+int ras_psp_sw_init(struct ras_core_context *ras_core)
+{
+ struct ras_psp *psp = &ras_core->ras_psp;
+
+ memset(psp, 0, sizeof(*psp));
+
+ psp->sys_func = ras_core->config->psp_cfg.psp_sys_fn;
+ if (!psp->sys_func) {
+ RAS_DEV_ERR(ras_core->dev, "RAS psp sys function not configured!\n");
+ return -EINVAL;
+ }
+
+ mutex_init(&psp->psp_ctx.internal_mutex);
+ mutex_init(&psp->ta_ctx.ta_mutex);
+
+ return 0;
+}
+
+int ras_psp_sw_fini(struct ras_core_context *ras_core)
+{
+ struct ras_psp *psp = &ras_core->ras_psp;
+
+ mutex_destroy(&psp->psp_ctx.internal_mutex);
+ mutex_destroy(&psp->ta_ctx.ta_mutex);
+
+ memset(psp, 0, sizeof(*psp));
+
+ return 0;
+}
+
+int ras_psp_hw_init(struct ras_core_context *ras_core)
+{
+ struct ras_psp *psp = &ras_core->ras_psp;
+
+ psp->psp_ip_version = ras_core->config->psp_ip_version;
+
+ psp->ip_func = ras_psp_get_ip_funcs(ras_core, psp->psp_ip_version);
+ if (!psp->ip_func)
+ return -EINVAL;
+
+ /* After GPU reset, the system RAS PSP status may change.
+ * therefore, it is necessary to synchronize the system status again.
+ */
+ ras_psp_sync_system_ras_psp_status(ras_core);
+
+ return 0;
+}
+
+int ras_psp_hw_fini(struct ras_core_context *ras_core)
+{
+ return 0;
+}
+
+bool ras_psp_check_supported_cmd(struct ras_core_context *ras_core,
+ enum ras_ta_cmd_id cmd_id)
+{
+ struct ras_ta_ctx *ta_ctx = &ras_core->ras_psp.ta_ctx;
+ bool ret = false;
+
+ if (!ta_ctx->preload_ras_ta_enabled && !ta_ctx->ras_ta_initialized)
+ return false;
+
+ switch (cmd_id) {
+ case RAS_TA_CMD_ID__QUERY_ADDRESS:
+ /* Currently, querying the address from RAS TA is only supported
+ * when the RAS TA firmware is loaded during driver installation.
+ */
+ if (ta_ctx->preload_ras_ta_enabled)
+ ret = true;
+ break;
+ case RAS_TA_CMD_ID__TRIGGER_ERROR:
+ ret = true;
+ break;
+ default:
+ ret = false;
+ break;
+ }
+
+ return ret;
+}
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_psp.h b/drivers/gpu/drm/amd/ras/rascore/ras_psp.h
new file mode 100644
index 000000000000..71776fecfd66
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_psp.h
@@ -0,0 +1,145 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#ifndef __RAS_PSP_H__
+#define __RAS_PSP_H__
+#include "ras.h"
+#include "ras_ta_if.h"
+
+struct ras_core_context;
+struct ras_ta_trigger_error_input;
+struct ras_ta_query_address_input;
+struct ras_ta_query_address_output;
+enum ras_ta_cmd_id;
+
+struct ras_ta_image_header {
+ uint32_t reserved1[24];
+ uint32_t image_version; /* [0x60] Off Chip Firmware Version */
+ uint32_t reserved2[39];
+};
+
+struct ras_psp_sys_status {
+ bool initialized;
+ uint32_t session_id;
+ void *psp_cmd_mutex;
+};
+
+struct ras_ta_init_param {
+ uint8_t poison_mode_en;
+ uint8_t dgpu_mode;
+ uint16_t xcc_mask;
+ uint8_t channel_dis_num;
+ uint8_t nps_mode;
+ uint32_t active_umc_mask;
+};
+
+struct gpu_mem_block {
+ uint32_t mem_type;
+ void *mem_bo;
+ uint64_t mem_mc_addr;
+ void *mem_cpu_addr;
+ uint32_t mem_size;
+ int ref_count;
+ void *private;
+};
+
+struct ras_psp_ip_func {
+ uint32_t (*psp_ras_ring_wptr_get)(struct ras_core_context *ras_core);
+ int (*psp_ras_ring_wptr_set)(struct ras_core_context *ras_core, uint32_t wptr);
+};
+
+struct ras_psp_ring {
+ struct gpu_mem_block ras_ring_gpu_mem;
+};
+
+struct psp_cmd_resp {
+ uint32_t status;
+ uint32_t session_id;
+};
+
+struct ras_psp_ctx {
+ void *external_mutex;
+ struct mutex internal_mutex;
+ uint64_t in_fence_value;
+ struct gpu_mem_block psp_cmd_gpu_mem;
+ struct gpu_mem_block out_fence_gpu_mem;
+};
+
+struct ras_ta_fw_bin {
+ uint32_t fw_version;
+ uint32_t feature_version;
+ uint32_t bin_size;
+ uint8_t *bin_addr;
+};
+
+struct ras_ta_ctx {
+ bool preload_ras_ta_enabled;
+ bool ras_ta_initialized;
+ uint32_t session_id;
+ uint32_t resp_status;
+ uint32_t ta_version;
+ struct mutex ta_mutex;
+ struct ras_ta_fw_bin fw_bin;
+ struct ras_ta_init_param init_param;
+ struct gpu_mem_block fw_gpu_mem;
+ struct gpu_mem_block cmd_gpu_mem;
+};
+
+struct ras_psp {
+ uint32_t psp_ip_version;
+ struct ras_psp_ring psp_ring;
+ struct ras_psp_ctx psp_ctx;
+ struct ras_ta_ctx ta_ctx;
+ const struct ras_psp_ip_func *ip_func;
+ const struct ras_psp_sys_func *sys_func;
+};
+
+struct ras_psp_ta_load {
+ uint32_t fw_version;
+ uint32_t feature_version;
+ uint32_t bin_size;
+ uint8_t *bin_addr;
+ uint64_t out_session_id;
+ uint32_t out_loaded_ta_version;
+};
+
+struct ras_psp_ta_unload {
+ uint64_t ras_session_id;
+};
+
+int ras_psp_sw_init(struct ras_core_context *ras_core);
+int ras_psp_sw_fini(struct ras_core_context *ras_core);
+int ras_psp_hw_init(struct ras_core_context *ras_core);
+int ras_psp_hw_fini(struct ras_core_context *ras_core);
+int ras_psp_load_firmware(struct ras_core_context *ras_core,
+ struct ras_psp_ta_load *ras_ta_load);
+int ras_psp_unload_firmware(struct ras_core_context *ras_core,
+ struct ras_psp_ta_unload *ras_ta_unload);
+int ras_psp_trigger_error(struct ras_core_context *ras_core,
+ struct ras_ta_trigger_error_input *info, uint32_t instance_mask);
+int ras_psp_query_address(struct ras_core_context *ras_core,
+ struct ras_ta_query_address_input *addr_in,
+ struct ras_ta_query_address_output *addr_out);
+bool ras_psp_check_supported_cmd(struct ras_core_context *ras_core,
+ enum ras_ta_cmd_id cmd_id);
+#endif
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_psp_v13_0.c b/drivers/gpu/drm/amd/ras/rascore/ras_psp_v13_0.c
new file mode 100644
index 000000000000..626cf39b75ac
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_psp_v13_0.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "ras.h"
+#include "ras_psp_v13_0.h"
+
+#define regMP0_SMN_C2PMSG_67 0x0083
+#define regMP0_SMN_C2PMSG_67_BASE_IDX 0
+
+static uint32_t ras_psp_v13_0_ring_wptr_get(struct ras_core_context *ras_core)
+{
+ return RAS_DEV_RREG32_SOC15(ras_core->dev, MP0, 0, regMP0_SMN_C2PMSG_67);
+}
+
+static int ras_psp_v13_0_ring_wptr_set(struct ras_core_context *ras_core, uint32_t value)
+{
+ RAS_DEV_WREG32_SOC15(ras_core->dev, MP0, 0, regMP0_SMN_C2PMSG_67, value);
+
+ return 0;
+}
+
+const struct ras_psp_ip_func ras_psp_v13_0 = {
+ .psp_ras_ring_wptr_get = ras_psp_v13_0_ring_wptr_get,
+ .psp_ras_ring_wptr_set = ras_psp_v13_0_ring_wptr_set,
+};
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_psp_v13_0.h b/drivers/gpu/drm/amd/ras/rascore/ras_psp_v13_0.h
new file mode 100644
index 000000000000..b705ffe38a12
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_psp_v13_0.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __RAS_PSP_V13_0_H__
+#define __RAS_PSP_V13_0_H__
+#include "ras_psp.h"
+
+extern const struct ras_psp_ip_func ras_psp_v13_0;
+
+#endif
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_ta_if.h b/drivers/gpu/drm/amd/ras/rascore/ras_ta_if.h
new file mode 100644
index 000000000000..0921e36d3274
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_ta_if.h
@@ -0,0 +1,231 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef _RAS_TA_IF_H
+#define _RAS_TA_IF_H
+#include "ras.h"
+
+#define RAS_TA_HOST_IF_VER 0
+
+/* Responses have bit 31 set */
+#define RSP_ID_MASK (1U << 31)
+#define RSP_ID(cmdId) (((uint32_t)(cmdId)) | RSP_ID_MASK)
+
+/* invalid node instance value */
+#define RAS_TA_INV_NODE 0xffff
+
+/* RAS related enumerations */
+/**********************************************************/
+enum ras_ta_cmd_id {
+ RAS_TA_CMD_ID__ENABLE_FEATURES = 0,
+ RAS_TA_CMD_ID__DISABLE_FEATURES,
+ RAS_TA_CMD_ID__TRIGGER_ERROR,
+ RAS_TA_CMD_ID__QUERY_BLOCK_INFO,
+ RAS_TA_CMD_ID__QUERY_SUB_BLOCK_INFO,
+ RAS_TA_CMD_ID__QUERY_ADDRESS,
+ MAX_RAS_TA_CMD_ID
+};
+
+enum ras_ta_status {
+ RAS_TA_STATUS__SUCCESS = 0x0000,
+ RAS_TA_STATUS__RESET_NEEDED = 0xA001,
+ RAS_TA_STATUS__ERROR_INVALID_PARAMETER = 0xA002,
+ RAS_TA_STATUS__ERROR_RAS_NOT_AVAILABLE = 0xA003,
+ RAS_TA_STATUS__ERROR_RAS_DUPLICATE_CMD = 0xA004,
+ RAS_TA_STATUS__ERROR_INJECTION_FAILED = 0xA005,
+ RAS_TA_STATUS__ERROR_ASD_READ_WRITE = 0xA006,
+ RAS_TA_STATUS__ERROR_TOGGLE_DF_CSTATE = 0xA007,
+ RAS_TA_STATUS__ERROR_TIMEOUT = 0xA008,
+ RAS_TA_STATUS__ERROR_BLOCK_DISABLED = 0XA009,
+ RAS_TA_STATUS__ERROR_GENERIC = 0xA00A,
+ RAS_TA_STATUS__ERROR_RAS_MMHUB_INIT = 0xA00B,
+ RAS_TA_STATUS__ERROR_GET_DEV_INFO = 0xA00C,
+ RAS_TA_STATUS__ERROR_UNSUPPORTED_DEV = 0xA00D,
+ RAS_TA_STATUS__ERROR_NOT_INITIALIZED = 0xA00E,
+ RAS_TA_STATUS__ERROR_TEE_INTERNAL = 0xA00F,
+ RAS_TA_STATUS__ERROR_UNSUPPORTED_FUNCTION = 0xA010,
+ RAS_TA_STATUS__ERROR_SYS_DRV_REG_ACCESS = 0xA011,
+ RAS_TA_STATUS__ERROR_RAS_READ_WRITE = 0xA012,
+ RAS_TA_STATUS__ERROR_NULL_PTR = 0xA013,
+ RAS_TA_STATUS__ERROR_UNSUPPORTED_IP = 0xA014,
+ RAS_TA_STATUS__ERROR_PCS_STATE_QUIET = 0xA015,
+ RAS_TA_STATUS__ERROR_PCS_STATE_ERROR = 0xA016,
+ RAS_TA_STATUS__ERROR_PCS_STATE_HANG = 0xA017,
+ RAS_TA_STATUS__ERROR_PCS_STATE_UNKNOWN = 0xA018,
+ RAS_TA_STATUS__ERROR_UNSUPPORTED_ERROR_INJ = 0xA019,
+ RAS_TA_STATUS__TEE_ERROR_ACCESS_DENIED = 0xA01A
+};
+
+enum ras_ta_block {
+ RAS_TA_BLOCK__UMC = 0,
+ RAS_TA_BLOCK__SDMA,
+ RAS_TA_BLOCK__GFX,
+ RAS_TA_BLOCK__MMHUB,
+ RAS_TA_BLOCK__ATHUB,
+ RAS_TA_BLOCK__PCIE_BIF,
+ RAS_TA_BLOCK__HDP,
+ RAS_TA_BLOCK__XGMI_WAFL,
+ RAS_TA_BLOCK__DF,
+ RAS_TA_BLOCK__SMN,
+ RAS_TA_BLOCK__SEM,
+ RAS_TA_BLOCK__MP0,
+ RAS_TA_BLOCK__MP1,
+ RAS_TA_BLOCK__FUSE,
+ RAS_TA_BLOCK__MCA,
+ RAS_TA_BLOCK__VCN,
+ RAS_TA_BLOCK__JPEG,
+ RAS_TA_BLOCK__IH,
+ RAS_TA_BLOCK__MPIO,
+ RAS_TA_BLOCK__MMSCH,
+ RAS_TA_NUM_BLOCK_MAX
+};
+
+enum ras_ta_mca_block {
+ RAS_TA_MCA_BLOCK__MP0 = 0,
+ RAS_TA_MCA_BLOCK__MP1 = 1,
+ RAS_TA_MCA_BLOCK__MPIO = 2,
+ RAS_TA_MCA_BLOCK__IOHC = 3,
+ RAS_TA_MCA_NUM_BLOCK_MAX
+};
+
+enum ras_ta_error_type {
+ RAS_TA_ERROR__NONE = 0,
+ RAS_TA_ERROR__PARITY = 1,
+ RAS_TA_ERROR__SINGLE_CORRECTABLE = 2,
+ RAS_TA_ERROR__MULTI_UNCORRECTABLE = 4,
+ RAS_TA_ERROR__POISON = 8,
+};
+
+enum ras_ta_address_type {
+ RAS_TA_MCA_TO_PA,
+ RAS_TA_PA_TO_MCA,
+};
+
+enum ras_ta_nps_mode {
+ RAS_TA_UNKNOWN_MODE = 0,
+ RAS_TA_NPS1_MODE = 1,
+ RAS_TA_NPS2_MODE = 2,
+ RAS_TA_NPS4_MODE = 4,
+ RAS_TA_NPS8_MODE = 8,
+};
+
+/* Input/output structures for RAS commands */
+/**********************************************************/
+
+struct ras_ta_enable_features_input {
+ enum ras_ta_block block_id;
+ enum ras_ta_error_type error_type;
+};
+
+struct ras_ta_disable_features_input {
+ enum ras_ta_block block_id;
+ enum ras_ta_error_type error_type;
+};
+
+struct ras_ta_trigger_error_input {
+ /* ras-block. i.e. umc, gfx */
+ enum ras_ta_block block_id;
+
+ /* type of error. i.e. single_correctable */
+ enum ras_ta_error_type inject_error_type;
+
+ /* mem block. i.e. hbm, sram etc. */
+ uint32_t sub_block_index;
+
+ /* explicit address of error */
+ uint64_t address;
+
+ /* method if error injection. i.e persistent, coherent etc. */
+ uint64_t value;
+};
+
+struct ras_ta_init_flags {
+ uint8_t poison_mode_en;
+ uint8_t dgpu_mode;
+ uint16_t xcc_mask;
+ uint8_t channel_dis_num;
+ uint8_t nps_mode;
+ uint32_t active_umc_mask;
+};
+
+struct ras_ta_mca_addr {
+ uint64_t err_addr;
+ uint32_t ch_inst;
+ uint32_t umc_inst;
+ uint32_t node_inst;
+ uint32_t socket_id;
+};
+
+struct ras_ta_phy_addr {
+ uint64_t pa;
+ uint32_t bank;
+ uint32_t channel_idx;
+};
+
+struct ras_ta_query_address_input {
+ enum ras_ta_address_type addr_type;
+ struct ras_ta_mca_addr ma;
+ struct ras_ta_phy_addr pa;
+};
+
+struct ras_ta_output_flags {
+ uint8_t ras_init_success_flag;
+ uint8_t err_inject_switch_disable_flag;
+ uint8_t reg_access_failure_flag;
+};
+
+struct ras_ta_query_address_output {
+ /* don't use the flags here */
+ struct ras_ta_output_flags flags;
+ struct ras_ta_mca_addr ma;
+ struct ras_ta_phy_addr pa;
+};
+
+/* Common input structure for RAS callbacks */
+/**********************************************************/
+union ras_ta_cmd_input {
+ struct ras_ta_init_flags init_flags;
+ struct ras_ta_enable_features_input enable_features;
+ struct ras_ta_disable_features_input disable_features;
+ struct ras_ta_trigger_error_input trigger_error;
+ struct ras_ta_query_address_input address;
+ uint32_t reserve_pad[256];
+};
+
+union ras_ta_cmd_output {
+ struct ras_ta_output_flags flags;
+ struct ras_ta_query_address_output address;
+ uint32_t reserve_pad[256];
+};
+
+struct ras_ta_cmd {
+ uint32_t cmd_id;
+ uint32_t resp_id;
+ uint32_t ras_status;
+ uint32_t if_version;
+ union ras_ta_cmd_input ras_in_message;
+ union ras_ta_cmd_output ras_out_message;
+};
+
+#endif
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_umc.c b/drivers/gpu/drm/amd/ras/rascore/ras_umc.c
new file mode 100644
index 000000000000..4dae64c424a2
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_umc.c
@@ -0,0 +1,707 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#include "ras.h"
+#include "ras_umc.h"
+#include "ras_umc_v12_0.h"
+
+#define MAX_ECC_NUM_PER_RETIREMENT 16
+
+/* bad page timestamp format
+ * yy[31:27] mm[26:23] day[22:17] hh[16:12] mm[11:6] ss[5:0]
+ */
+#define EEPROM_TIMESTAMP_MINUTE 6
+#define EEPROM_TIMESTAMP_HOUR 12
+#define EEPROM_TIMESTAMP_DAY 17
+#define EEPROM_TIMESTAMP_MONTH 23
+#define EEPROM_TIMESTAMP_YEAR 27
+
+static uint64_t ras_umc_get_eeprom_timestamp(struct ras_core_context *ras_core)
+{
+ struct ras_time tm = {0};
+ uint64_t utc_timestamp = 0;
+ uint64_t eeprom_timestamp = 0;
+
+ utc_timestamp = ras_core_get_utc_second_timestamp(ras_core);
+ if (!utc_timestamp)
+ return utc_timestamp;
+
+ ras_core_convert_timestamp_to_time(ras_core, utc_timestamp, &tm);
+
+ /* the year range is 2000 ~ 2031, set the year if not in the range */
+ if (tm.tm_year < 2000)
+ tm.tm_year = 2000;
+ if (tm.tm_year > 2031)
+ tm.tm_year = 2031;
+
+ tm.tm_year -= 2000;
+
+ eeprom_timestamp = tm.tm_sec + (tm.tm_min << EEPROM_TIMESTAMP_MINUTE)
+ + (tm.tm_hour << EEPROM_TIMESTAMP_HOUR)
+ + (tm.tm_mday << EEPROM_TIMESTAMP_DAY)
+ + (tm.tm_mon << EEPROM_TIMESTAMP_MONTH)
+ + (tm.tm_year << EEPROM_TIMESTAMP_YEAR);
+ eeprom_timestamp &= 0xffffffff;
+
+ return eeprom_timestamp;
+}
+
+static const struct ras_umc_ip_func *ras_umc_get_ip_func(
+ struct ras_core_context *ras_core, uint32_t ip_version)
+{
+ switch (ip_version) {
+ case IP_VERSION(12, 0, 0):
+ case IP_VERSION(12, 5, 0):
+ return &ras_umc_func_v12_0;
+ default:
+ RAS_DEV_ERR(ras_core->dev,
+ "UMC ip version(0x%x) is not supported!\n", ip_version);
+ break;
+ }
+
+ return NULL;
+}
+
+int ras_umc_psp_convert_ma_to_pa(struct ras_core_context *ras_core,
+ struct umc_mca_addr *in, struct umc_phy_addr *out,
+ uint32_t nps)
+{
+ struct ras_ta_query_address_input addr_in;
+ struct ras_ta_query_address_output addr_out;
+ int ret;
+
+ if (!in)
+ return -EINVAL;
+
+ memset(&addr_in, 0, sizeof(addr_in));
+ memset(&addr_out, 0, sizeof(addr_out));
+
+ addr_in.ma.err_addr = in->err_addr;
+ addr_in.ma.ch_inst = in->ch_inst;
+ addr_in.ma.umc_inst = in->umc_inst;
+ addr_in.ma.node_inst = in->node_inst;
+ addr_in.ma.socket_id = in->socket_id;
+
+ addr_in.addr_type = RAS_TA_MCA_TO_PA;
+
+ ret = ras_psp_query_address(ras_core, &addr_in, &addr_out);
+ if (ret) {
+ RAS_DEV_WARN(ras_core->dev,
+ "Failed to query RAS physical address for 0x%llx, ret:%d",
+ in->err_addr, ret);
+ return -EREMOTEIO;
+ }
+
+ if (out) {
+ out->pa = addr_out.pa.pa;
+ out->bank = addr_out.pa.bank;
+ out->channel_idx = addr_out.pa.channel_idx;
+ }
+
+ return 0;
+}
+
+static int ras_umc_log_ecc(struct ras_core_context *ras_core,
+ unsigned long idx, void *data)
+{
+ struct ras_umc *ras_umc = &ras_core->ras_umc;
+ int ret;
+
+ mutex_lock(&ras_umc->tree_lock);
+ ret = radix_tree_insert(&ras_umc->root, idx, data);
+ if (!ret)
+ radix_tree_tag_set(&ras_umc->root, idx, UMC_ECC_NEW_DETECTED_TAG);
+ mutex_unlock(&ras_umc->tree_lock);
+
+ return ret;
+}
+
+int ras_umc_clear_logged_ecc(struct ras_core_context *ras_core)
+{
+ struct ras_umc *ras_umc = &ras_core->ras_umc;
+ uint64_t buf[8] = {0};
+ void **slot;
+ void *data;
+ void *iter = buf;
+
+ mutex_lock(&ras_umc->tree_lock);
+ radix_tree_for_each_slot(slot, &ras_umc->root, iter, 0) {
+ data = ras_radix_tree_delete_iter(&ras_umc->root, iter);
+ kfree(data);
+ }
+ mutex_unlock(&ras_umc->tree_lock);
+
+ return 0;
+}
+
+static void ras_umc_reserve_eeprom_record(struct ras_core_context *ras_core,
+ struct eeprom_umc_record *record)
+{
+ struct ras_umc *ras_umc = &ras_core->ras_umc;
+ uint64_t page_pfn[16];
+ int count = 0, i;
+
+ memset(page_pfn, 0, sizeof(page_pfn));
+ if (ras_umc->ip_func && ras_umc->ip_func->eeprom_record_to_nps_pages) {
+ count = ras_umc->ip_func->eeprom_record_to_nps_pages(ras_core,
+ record, record->cur_nps, page_pfn, ARRAY_SIZE(page_pfn));
+ if (count <= 0) {
+ RAS_DEV_ERR(ras_core->dev,
+ "Fail to convert error address! count:%d\n", count);
+ return;
+ }
+ }
+
+ /* Reserve memory */
+ for (i = 0; i < count; i++)
+ ras_core_event_notify(ras_core,
+ RAS_EVENT_ID__RESERVE_BAD_PAGE, &page_pfn[i]);
+}
+
+/* When gpu reset is ongoing, ecc logging operations will be pended.
+ */
+int ras_umc_log_bad_bank_pending(struct ras_core_context *ras_core, struct ras_bank_ecc *bank)
+{
+ struct ras_umc *ras_umc = &ras_core->ras_umc;
+ struct ras_bank_ecc_node *ecc_node;
+
+ ecc_node = kzalloc(sizeof(*ecc_node), GFP_KERNEL);
+ if (!ecc_node)
+ return -ENOMEM;
+
+ memcpy(&ecc_node->ecc, bank, sizeof(ecc_node->ecc));
+
+ mutex_lock(&ras_umc->pending_ecc_lock);
+ list_add_tail(&ecc_node->node, &ras_umc->pending_ecc_list);
+ mutex_unlock(&ras_umc->pending_ecc_lock);
+
+ return 0;
+}
+
+/* After gpu reset is complete, re-log the pending error banks.
+ */
+int ras_umc_log_pending_bad_bank(struct ras_core_context *ras_core)
+{
+ struct ras_umc *ras_umc = &ras_core->ras_umc;
+ struct ras_bank_ecc_node *ecc_node, *tmp;
+
+ mutex_lock(&ras_umc->pending_ecc_lock);
+ list_for_each_entry_safe(ecc_node,
+ tmp, &ras_umc->pending_ecc_list, node){
+ if (ecc_node && !ras_umc_log_bad_bank(ras_core, &ecc_node->ecc)) {
+ list_del(&ecc_node->node);
+ kfree(ecc_node);
+ }
+ }
+ mutex_unlock(&ras_umc->pending_ecc_lock);
+
+ return 0;
+}
+
+int ras_umc_log_bad_bank(struct ras_core_context *ras_core, struct ras_bank_ecc *bank)
+{
+ struct ras_umc *ras_umc = &ras_core->ras_umc;
+ struct eeprom_umc_record umc_rec;
+ struct eeprom_umc_record *err_rec;
+ int ret;
+
+ memset(&umc_rec, 0, sizeof(umc_rec));
+
+ mutex_lock(&ras_umc->bank_log_lock);
+ ret = ras_umc->ip_func->bank_to_eeprom_record(ras_core, bank, &umc_rec);
+ if (ret)
+ goto out;
+
+ err_rec = kzalloc(sizeof(*err_rec), GFP_KERNEL);
+ if (!err_rec) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ memcpy(err_rec, &umc_rec, sizeof(umc_rec));
+ ret = ras_umc_log_ecc(ras_core, err_rec->cur_nps_retired_row_pfn, err_rec);
+ if (ret) {
+ if (ret == -EEXIST) {
+ RAS_DEV_INFO(ras_core->dev, "The bad pages have been logged before.\n");
+ ret = 0;
+ }
+
+ kfree(err_rec);
+ goto out;
+ }
+
+ ras_umc_reserve_eeprom_record(ras_core, err_rec);
+
+ ret = ras_core_event_notify(ras_core,
+ RAS_EVENT_ID__BAD_PAGE_DETECTED, NULL);
+
+out:
+ mutex_unlock(&ras_umc->bank_log_lock);
+ return ret;
+}
+
+static int ras_umc_get_new_records(struct ras_core_context *ras_core,
+ struct eeprom_umc_record *records, u32 num)
+{
+ struct ras_umc *ras_umc = &ras_core->ras_umc;
+ struct eeprom_umc_record *entries[MAX_ECC_NUM_PER_RETIREMENT];
+ u32 entry_num = num < MAX_ECC_NUM_PER_RETIREMENT ? num : MAX_ECC_NUM_PER_RETIREMENT;
+ int count = 0;
+ int new_detected, i;
+
+ mutex_lock(&ras_umc->tree_lock);
+ new_detected = radix_tree_gang_lookup_tag(&ras_umc->root, (void **)entries,
+ 0, entry_num, UMC_ECC_NEW_DETECTED_TAG);
+ for (i = 0; i < new_detected; i++) {
+ if (!entries[i])
+ continue;
+
+ memcpy(&records[i], entries[i], sizeof(struct eeprom_umc_record));
+ count++;
+ radix_tree_tag_clear(&ras_umc->root,
+ entries[i]->cur_nps_retired_row_pfn, UMC_ECC_NEW_DETECTED_TAG);
+ }
+ mutex_unlock(&ras_umc->tree_lock);
+
+ return count;
+}
+
+static bool ras_umc_check_retired_record(struct ras_core_context *ras_core,
+ struct eeprom_umc_record *record, bool from_eeprom)
+{
+ struct ras_umc *ras_umc = &ras_core->ras_umc;
+ struct eeprom_store_record *data = &ras_umc->umc_err_data.rom_data;
+ uint32_t nps = 0;
+ int i, ret;
+
+ if (from_eeprom) {
+ nps = ras_umc->umc_err_data.umc_nps_mode;
+ if (ras_umc->ip_func && ras_umc->ip_func->eeprom_record_to_nps_record) {
+ ret = ras_umc->ip_func->eeprom_record_to_nps_record(ras_core, record, nps);
+ if (ret)
+ RAS_DEV_WARN(ras_core->dev,
+ "Failed to adjust eeprom record, ret:%d", ret);
+ }
+ return false;
+ }
+
+ for (i = 0; i < data->count; i++) {
+ if ((data->bps[i].retired_row_pfn == record->retired_row_pfn) &&
+ (data->bps[i].cur_nps_retired_row_pfn == record->cur_nps_retired_row_pfn))
+ return true;
+ }
+
+ return false;
+}
+
+/* alloc/realloc bps array */
+static int ras_umc_realloc_err_data_space(struct ras_core_context *ras_core,
+ struct eeprom_store_record *data, int pages)
+{
+ unsigned int old_space = data->count + data->space_left;
+ unsigned int new_space = old_space + pages;
+ unsigned int align_space = ALIGN(new_space, 512);
+ void *bps = kzalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
+
+ if (!bps)
+ return -ENOMEM;
+
+ if (data->bps) {
+ memcpy(bps, data->bps,
+ data->count * sizeof(*data->bps));
+ kfree(data->bps);
+ }
+
+ data->bps = bps;
+ data->space_left += align_space - old_space;
+ return 0;
+}
+
+static int ras_umc_update_eeprom_rom_data(struct ras_core_context *ras_core,
+ struct eeprom_umc_record *bps)
+{
+ struct eeprom_store_record *data = &ras_core->ras_umc.umc_err_data.rom_data;
+
+ if (!data->space_left &&
+ ras_umc_realloc_err_data_space(ras_core, data, 256)) {
+ return -ENOMEM;
+ }
+
+ memcpy(&data->bps[data->count], bps, sizeof(*data->bps));
+ data->count++;
+ data->space_left--;
+ return 0;
+}
+
+static int ras_umc_update_eeprom_ram_data(struct ras_core_context *ras_core,
+ struct eeprom_umc_record *bps)
+{
+ struct ras_umc *ras_umc = &ras_core->ras_umc;
+ struct eeprom_store_record *data = &ras_umc->umc_err_data.ram_data;
+ uint64_t page_pfn[16];
+ int count = 0, j;
+
+ if (!data->space_left &&
+ ras_umc_realloc_err_data_space(ras_core, data, 256)) {
+ return -ENOMEM;
+ }
+
+ memset(page_pfn, 0, sizeof(page_pfn));
+ if (ras_umc->ip_func && ras_umc->ip_func->eeprom_record_to_nps_pages)
+ count = ras_umc->ip_func->eeprom_record_to_nps_pages(ras_core,
+ bps, bps->cur_nps, page_pfn, ARRAY_SIZE(page_pfn));
+
+ if (count > 0) {
+ for (j = 0; j < count; j++) {
+ bps->cur_nps_retired_row_pfn = page_pfn[j];
+ memcpy(&data->bps[data->count], bps, sizeof(*data->bps));
+ data->count++;
+ data->space_left--;
+ }
+ } else {
+ memcpy(&data->bps[data->count], bps, sizeof(*data->bps));
+ data->count++;
+ data->space_left--;
+ }
+
+ return 0;
+}
+
+/* it deal with vram only. */
+static int ras_umc_add_bad_pages(struct ras_core_context *ras_core,
+ struct eeprom_umc_record *bps,
+ int pages, bool from_eeprom)
+{
+ struct ras_umc *ras_umc = &ras_core->ras_umc;
+ struct ras_umc_err_data *data = &ras_umc->umc_err_data;
+ int i, ret = 0;
+
+ if (!bps || pages <= 0)
+ return 0;
+
+ mutex_lock(&ras_umc->umc_lock);
+ for (i = 0; i < pages; i++) {
+ if (ras_umc_check_retired_record(ras_core, &bps[i], from_eeprom))
+ continue;
+
+ ret = ras_umc_update_eeprom_rom_data(ras_core, &bps[i]);
+ if (ret)
+ goto out;
+
+ if (data->last_retired_pfn == bps[i].cur_nps_retired_row_pfn)
+ continue;
+
+ data->last_retired_pfn = bps[i].cur_nps_retired_row_pfn;
+
+ if (from_eeprom)
+ ras_umc_reserve_eeprom_record(ras_core, &bps[i]);
+
+ ret = ras_umc_update_eeprom_ram_data(ras_core, &bps[i]);
+ if (ret)
+ goto out;
+ }
+out:
+ mutex_unlock(&ras_umc->umc_lock);
+
+ return ret;
+}
+
+/*
+ * read error record array in eeprom and reserve enough space for
+ * storing new bad pages
+ */
+int ras_umc_load_bad_pages(struct ras_core_context *ras_core)
+{
+ struct eeprom_umc_record *bps;
+ uint32_t ras_num_recs;
+ int ret;
+
+ ras_num_recs = ras_eeprom_get_record_count(ras_core);
+ /* no bad page record, skip eeprom access */
+ if (!ras_num_recs ||
+ ras_core->ras_eeprom.record_threshold_config == DISABLE_RETIRE_PAGE)
+ return 0;
+
+ bps = kcalloc(ras_num_recs, sizeof(*bps), GFP_KERNEL);
+ if (!bps)
+ return -ENOMEM;
+
+ ret = ras_eeprom_read(ras_core, bps, ras_num_recs);
+ if (ret) {
+ RAS_DEV_ERR(ras_core->dev, "Failed to load EEPROM table records!");
+ } else {
+ ras_core->ras_umc.umc_err_data.last_retired_pfn = UMC_INV_MEM_PFN;
+ ret = ras_umc_add_bad_pages(ras_core, bps, ras_num_recs, true);
+ }
+
+ kfree(bps);
+ return ret;
+}
+
+/*
+ * write error record array to eeprom, the function should be
+ * protected by recovery_lock
+ * new_cnt: new added UE count, excluding reserved bad pages, can be NULL
+ */
+static int ras_umc_save_bad_pages(struct ras_core_context *ras_core)
+{
+ struct ras_umc *ras_umc = &ras_core->ras_umc;
+ struct eeprom_store_record *data = &ras_umc->umc_err_data.rom_data;
+ uint32_t eeprom_record_num;
+ int save_count;
+ int ret = 0;
+
+ if (!data->bps)
+ return 0;
+
+ eeprom_record_num = ras_eeprom_get_record_count(ras_core);
+ mutex_lock(&ras_umc->umc_lock);
+ save_count = data->count - eeprom_record_num;
+ /* only new entries are saved */
+ if (save_count > 0) {
+ if (ras_eeprom_append(ras_core,
+ &data->bps[eeprom_record_num],
+ save_count)) {
+ RAS_DEV_ERR(ras_core->dev, "Failed to save EEPROM table data!");
+ ret = -EIO;
+ goto exit;
+ }
+
+ RAS_DEV_INFO(ras_core->dev, "Saved %d pages to EEPROM table.\n", save_count);
+ }
+
+exit:
+ mutex_unlock(&ras_umc->umc_lock);
+ return ret;
+}
+
+int ras_umc_handle_bad_pages(struct ras_core_context *ras_core, void *data)
+{
+ struct eeprom_umc_record records[MAX_ECC_NUM_PER_RETIREMENT];
+ int count, ret;
+
+ memset(records, 0, sizeof(records));
+ count = ras_umc_get_new_records(ras_core, records, ARRAY_SIZE(records));
+ if (count <= 0)
+ return -ENODATA;
+
+ ret = ras_umc_add_bad_pages(ras_core, records, count, false);
+ if (ret) {
+ RAS_DEV_ERR(ras_core->dev, "Failed to add ras bad page!\n");
+ return -EINVAL;
+ }
+
+ ret = ras_umc_save_bad_pages(ras_core);
+ if (ret) {
+ RAS_DEV_ERR(ras_core->dev, "Failed to save ras bad page\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int ras_umc_sw_init(struct ras_core_context *ras_core)
+{
+ struct ras_umc *ras_umc = &ras_core->ras_umc;
+
+ memset(ras_umc, 0, sizeof(*ras_umc));
+
+ INIT_LIST_HEAD(&ras_umc->pending_ecc_list);
+
+ INIT_RADIX_TREE(&ras_umc->root, GFP_KERNEL);
+
+ mutex_init(&ras_umc->tree_lock);
+ mutex_init(&ras_umc->pending_ecc_lock);
+ mutex_init(&ras_umc->umc_lock);
+ mutex_init(&ras_umc->bank_log_lock);
+
+ return 0;
+}
+
+int ras_umc_sw_fini(struct ras_core_context *ras_core)
+{
+ struct ras_umc *ras_umc = &ras_core->ras_umc;
+ struct ras_umc_err_data *umc_err_data = &ras_umc->umc_err_data;
+ struct ras_bank_ecc_node *ecc_node, *tmp;
+
+ mutex_destroy(&ras_umc->umc_lock);
+ mutex_destroy(&ras_umc->bank_log_lock);
+
+ if (umc_err_data->rom_data.bps) {
+ umc_err_data->rom_data.count = 0;
+ kfree(umc_err_data->rom_data.bps);
+ umc_err_data->rom_data.bps = NULL;
+ umc_err_data->rom_data.space_left = 0;
+ }
+
+ if (umc_err_data->ram_data.bps) {
+ umc_err_data->ram_data.count = 0;
+ kfree(umc_err_data->ram_data.bps);
+ umc_err_data->ram_data.bps = NULL;
+ umc_err_data->ram_data.space_left = 0;
+ }
+
+ ras_umc_clear_logged_ecc(ras_core);
+
+ mutex_lock(&ras_umc->pending_ecc_lock);
+ list_for_each_entry_safe(ecc_node,
+ tmp, &ras_umc->pending_ecc_list, node){
+ list_del(&ecc_node->node);
+ kfree(ecc_node);
+ }
+ mutex_unlock(&ras_umc->pending_ecc_lock);
+
+ mutex_destroy(&ras_umc->tree_lock);
+ mutex_destroy(&ras_umc->pending_ecc_lock);
+
+ return 0;
+}
+
+int ras_umc_hw_init(struct ras_core_context *ras_core)
+{
+ struct ras_umc *ras_umc = &ras_core->ras_umc;
+ uint32_t nps;
+
+ nps = ras_core_get_curr_nps_mode(ras_core);
+
+ if (!nps || (nps >= UMC_MEMORY_PARTITION_MODE_UNKNOWN)) {
+ RAS_DEV_ERR(ras_core->dev, "Invalid memory NPS mode: %u!\n", nps);
+ return -ENODATA;
+ }
+
+ ras_umc->umc_err_data.umc_nps_mode = nps;
+
+ ras_umc->umc_vram_type = ras_core->config->umc_cfg.umc_vram_type;
+ if (!ras_umc->umc_vram_type) {
+ RAS_DEV_ERR(ras_core->dev, "Invalid UMC VRAM Type: %u!\n",
+ ras_umc->umc_vram_type);
+ return -ENODATA;
+ }
+
+ ras_umc->umc_ip_version = ras_core->config->umc_ip_version;
+ ras_umc->ip_func = ras_umc_get_ip_func(ras_core, ras_umc->umc_ip_version);
+ if (!ras_umc->ip_func)
+ return -EINVAL;
+
+ return 0;
+}
+
+int ras_umc_hw_fini(struct ras_core_context *ras_core)
+{
+ return 0;
+}
+
+int ras_umc_clean_badpage_data(struct ras_core_context *ras_core)
+{
+ struct ras_umc_err_data *data = &ras_core->ras_umc.umc_err_data;
+
+ mutex_lock(&ras_core->ras_umc.umc_lock);
+
+ kfree(data->rom_data.bps);
+ kfree(data->ram_data.bps);
+
+ memset(data, 0, sizeof(*data));
+ mutex_unlock(&ras_core->ras_umc.umc_lock);
+
+ return 0;
+}
+
+int ras_umc_fill_eeprom_record(struct ras_core_context *ras_core,
+ uint64_t err_addr, uint32_t umc_inst, struct umc_phy_addr *cur_nps_addr,
+ enum umc_memory_partition_mode cur_nps, struct eeprom_umc_record *record)
+{
+ struct eeprom_umc_record *err_rec = record;
+
+ /* Set bad page pfn and nps mode */
+ EEPROM_RECORD_SETUP_UMC_ADDR_AND_NPS(err_rec,
+ RAS_ADDR_TO_PFN(cur_nps_addr->pa), cur_nps);
+
+ err_rec->address = err_addr;
+ err_rec->ts = ras_umc_get_eeprom_timestamp(ras_core);
+ err_rec->err_type = RAS_EEPROM_ERR_NON_RECOVERABLE;
+ err_rec->cu = 0;
+ err_rec->mem_channel = cur_nps_addr->channel_idx;
+ err_rec->mcumc_id = umc_inst;
+ err_rec->cur_nps_retired_row_pfn = RAS_ADDR_TO_PFN(cur_nps_addr->pa);
+ err_rec->cur_nps_bank = cur_nps_addr->bank;
+ err_rec->cur_nps = cur_nps;
+ return 0;
+}
+
+int ras_umc_get_saved_eeprom_count(struct ras_core_context *ras_core)
+{
+ struct ras_umc_err_data *err_data = &ras_core->ras_umc.umc_err_data;
+
+ return err_data->rom_data.count;
+}
+
+int ras_umc_get_badpage_count(struct ras_core_context *ras_core)
+{
+ struct eeprom_store_record *data = &ras_core->ras_umc.umc_err_data.ram_data;
+
+ return data->count;
+}
+
+int ras_umc_get_badpage_record(struct ras_core_context *ras_core, uint32_t index, void *record)
+{
+ struct eeprom_store_record *data = &ras_core->ras_umc.umc_err_data.ram_data;
+
+ if (index >= data->count)
+ return -EINVAL;
+
+ memcpy(record, &data->bps[index], sizeof(struct eeprom_umc_record));
+ return 0;
+}
+
+bool ras_umc_check_retired_addr(struct ras_core_context *ras_core, uint64_t addr)
+{
+ struct ras_umc *ras_umc = &ras_core->ras_umc;
+ struct eeprom_store_record *data = &ras_umc->umc_err_data.ram_data;
+ uint64_t page_pfn = RAS_ADDR_TO_PFN(addr);
+ int i, ret = false;
+
+ mutex_lock(&ras_umc->umc_lock);
+ for (i = 0; i < data->count; i++) {
+ if (data->bps[i].cur_nps_retired_row_pfn == page_pfn) {
+ ret = true;
+ break;
+ }
+ }
+ mutex_unlock(&ras_umc->umc_lock);
+
+ return ret;
+}
+
+int ras_umc_translate_soc_pa_and_bank(struct ras_core_context *ras_core,
+ uint64_t *soc_pa, struct umc_bank_addr *bank_addr, bool bank_to_pa)
+{
+ struct ras_umc *ras_umc = &ras_core->ras_umc;
+ int ret = 0;
+
+ if (bank_to_pa)
+ ret = ras_umc->ip_func->bank_to_soc_pa(ras_core, *bank_addr, soc_pa);
+ else
+ ret = ras_umc->ip_func->soc_pa_to_bank(ras_core, *soc_pa, bank_addr);
+
+ return ret;
+}
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_umc.h b/drivers/gpu/drm/amd/ras/rascore/ras_umc.h
new file mode 100644
index 000000000000..7d9e779d8c4c
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_umc.h
@@ -0,0 +1,166 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __RAS_UMC_H__
+#define __RAS_UMC_H__
+#include "ras.h"
+#include "ras_eeprom.h"
+#include "ras_cmd.h"
+
+#define UMC_VRAM_TYPE_UNKNOWN 0
+#define UMC_VRAM_TYPE_GDDR1 1
+#define UMC_VRAM_TYPE_DDR2 2
+#define UMC_VRAM_TYPE_GDDR3 3
+#define UMC_VRAM_TYPE_GDDR4 4
+#define UMC_VRAM_TYPE_GDDR5 5
+#define UMC_VRAM_TYPE_HBM 6
+#define UMC_VRAM_TYPE_DDR3 7
+#define UMC_VRAM_TYPE_DDR4 8
+#define UMC_VRAM_TYPE_GDDR6 9
+#define UMC_VRAM_TYPE_DDR5 10
+#define UMC_VRAM_TYPE_LPDDR4 11
+#define UMC_VRAM_TYPE_LPDDR5 12
+#define UMC_VRAM_TYPE_HBM3E 13
+
+#define UMC_ECC_NEW_DETECTED_TAG 0x1
+#define UMC_INV_MEM_PFN (0xFFFFFFFFFFFFFFFF)
+
+/* three column bits and one row bit in MCA address flip
+ * in bad page retirement
+ */
+#define UMC_PA_FLIP_BITS_NUM 4
+
+enum umc_memory_partition_mode {
+ UMC_MEMORY_PARTITION_MODE_NONE = 0,
+ UMC_MEMORY_PARTITION_MODE_NPS1 = 1,
+ UMC_MEMORY_PARTITION_MODE_NPS2 = 2,
+ UMC_MEMORY_PARTITION_MODE_NPS3 = 3,
+ UMC_MEMORY_PARTITION_MODE_NPS4 = 4,
+ UMC_MEMORY_PARTITION_MODE_NPS6 = 6,
+ UMC_MEMORY_PARTITION_MODE_NPS8 = 8,
+ UMC_MEMORY_PARTITION_MODE_UNKNOWN
+};
+
+struct ras_core_context;
+struct ras_bank_ecc;
+
+struct umc_flip_bits {
+ uint32_t flip_bits_in_pa[UMC_PA_FLIP_BITS_NUM];
+ uint32_t flip_row_bit;
+ uint32_t r13_in_pa;
+ uint32_t bit_num;
+};
+
+struct umc_mca_addr {
+ uint64_t err_addr;
+ uint32_t ch_inst;
+ uint32_t umc_inst;
+ uint32_t node_inst;
+ uint32_t socket_id;
+};
+
+struct umc_phy_addr {
+ uint64_t pa;
+ uint32_t bank;
+ uint32_t channel_idx;
+};
+
+struct umc_bank_addr {
+ uint32_t stack_id; /* SID */
+ uint32_t bank_group;
+ uint32_t bank;
+ uint32_t row;
+ uint32_t column;
+ uint32_t channel;
+ uint32_t subchannel; /* Also called Pseudochannel (PC) */
+};
+
+struct ras_umc_ip_func {
+ int (*bank_to_eeprom_record)(struct ras_core_context *ras_core,
+ struct ras_bank_ecc *bank, struct eeprom_umc_record *record);
+ int (*eeprom_record_to_nps_record)(struct ras_core_context *ras_core,
+ struct eeprom_umc_record *record, uint32_t nps);
+ int (*eeprom_record_to_nps_pages)(struct ras_core_context *ras_core,
+ struct eeprom_umc_record *record, uint32_t nps,
+ uint64_t *pfns, uint32_t num);
+ int (*bank_to_soc_pa)(struct ras_core_context *ras_core,
+ struct umc_bank_addr bank_addr, uint64_t *soc_pa);
+ int (*soc_pa_to_bank)(struct ras_core_context *ras_core,
+ uint64_t soc_pa, struct umc_bank_addr *bank_addr);
+};
+
+struct eeprom_store_record {
+ /* point to data records array */
+ struct eeprom_umc_record *bps;
+ /* the count of entries */
+ int count;
+ /* the space can place new entries */
+ int space_left;
+};
+
+struct ras_umc_err_data {
+ struct eeprom_store_record rom_data;
+ struct eeprom_store_record ram_data;
+ enum umc_memory_partition_mode umc_nps_mode;
+ uint64_t last_retired_pfn;
+};
+
+struct ras_umc {
+ u32 umc_ip_version;
+ u32 umc_vram_type;
+ const struct ras_umc_ip_func *ip_func;
+ struct radix_tree_root root;
+ struct mutex tree_lock;
+ struct mutex umc_lock;
+ struct mutex bank_log_lock;
+ struct mutex pending_ecc_lock;
+ struct ras_umc_err_data umc_err_data;
+ struct list_head pending_ecc_list;
+};
+
+int ras_umc_sw_init(struct ras_core_context *ras);
+int ras_umc_sw_fini(struct ras_core_context *ras);
+int ras_umc_hw_init(struct ras_core_context *ras);
+int ras_umc_hw_fini(struct ras_core_context *ras);
+int ras_umc_psp_convert_ma_to_pa(struct ras_core_context *ras_core,
+ struct umc_mca_addr *in, struct umc_phy_addr *out,
+ uint32_t nps);
+int ras_umc_handle_bad_pages(struct ras_core_context *ras_core, void *data);
+int ras_umc_log_bad_bank(struct ras_core_context *ras, struct ras_bank_ecc *bank);
+int ras_umc_log_bad_bank_pending(struct ras_core_context *ras_core, struct ras_bank_ecc *bank);
+int ras_umc_log_pending_bad_bank(struct ras_core_context *ras_core);
+int ras_umc_clear_logged_ecc(struct ras_core_context *ras_core);
+int ras_umc_load_bad_pages(struct ras_core_context *ras_core);
+int ras_umc_get_saved_eeprom_count(struct ras_core_context *ras_core);
+int ras_umc_clean_badpage_data(struct ras_core_context *ras_core);
+int ras_umc_fill_eeprom_record(struct ras_core_context *ras_core,
+ uint64_t err_addr, uint32_t umc_inst, struct umc_phy_addr *cur_nps_addr,
+ enum umc_memory_partition_mode cur_nps, struct eeprom_umc_record *record);
+
+int ras_umc_get_badpage_count(struct ras_core_context *ras_core);
+int ras_umc_get_badpage_record(struct ras_core_context *ras_core, uint32_t index, void *record);
+bool ras_umc_check_retired_addr(struct ras_core_context *ras_core, uint64_t addr);
+int ras_umc_translate_soc_pa_and_bank(struct ras_core_context *ras_core,
+ uint64_t *soc_pa, struct umc_bank_addr *bank_addr, bool bank_to_pa);
+#endif
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.c b/drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.c
new file mode 100644
index 000000000000..5d9a11c17a86
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.c
@@ -0,0 +1,511 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#include "ras.h"
+#include "ras_umc.h"
+#include "ras_core_status.h"
+#include "ras_umc_v12_0.h"
+
+#define NumDieInterleaved 4
+
+static const uint32_t umc_v12_0_channel_idx_tbl[]
+ [UMC_V12_0_UMC_INSTANCE_NUM][UMC_V12_0_CHANNEL_INSTANCE_NUM] = {
+ {{3, 7, 11, 15, 2, 6, 10, 14}, {1, 5, 9, 13, 0, 4, 8, 12},
+ {19, 23, 27, 31, 18, 22, 26, 30}, {17, 21, 25, 29, 16, 20, 24, 28}},
+ {{47, 43, 39, 35, 46, 42, 38, 34}, {45, 41, 37, 33, 44, 40, 36, 32},
+ {63, 59, 55, 51, 62, 58, 54, 50}, {61, 57, 53, 49, 60, 56, 52, 48}},
+ {{79, 75, 71, 67, 78, 74, 70, 66}, {77, 73, 69, 65, 76, 72, 68, 64},
+ {95, 91, 87, 83, 94, 90, 86, 82}, {93, 89, 85, 81, 92, 88, 84, 80}},
+ {{99, 103, 107, 111, 98, 102, 106, 110}, {97, 101, 105, 109, 96, 100, 104, 108},
+ {115, 119, 123, 127, 114, 118, 122, 126}, {113, 117, 121, 125, 112, 116, 120, 124}}
+};
+
+/* mapping of MCA error address to normalized address */
+static const uint32_t umc_v12_0_ma2na_mapping[] = {
+ 0, 5, 6, 8, 9, 14, 12, 13,
+ 10, 11, 15, 16, 17, 18, 19, 20,
+ 21, 22, 23, 24, 25, 26, 27, 28,
+ 24, 7, 29, 30,
+};
+
+static bool umc_v12_0_bit_wise_xor(uint32_t val)
+{
+ bool result = 0;
+ int i;
+
+ for (i = 0; i < 32; i++)
+ result = result ^ ((val >> i) & 0x1);
+
+ return result;
+}
+
+static void __get_nps_pa_flip_bits(struct ras_core_context *ras_core,
+ enum umc_memory_partition_mode nps,
+ struct umc_flip_bits *flip_bits)
+{
+ uint32_t vram_type = ras_core->ras_umc.umc_vram_type;
+
+ /* default setting */
+ flip_bits->flip_bits_in_pa[0] = UMC_V12_0_PA_C2_BIT;
+ flip_bits->flip_bits_in_pa[1] = UMC_V12_0_PA_C3_BIT;
+ flip_bits->flip_bits_in_pa[2] = UMC_V12_0_PA_C4_BIT;
+ flip_bits->flip_bits_in_pa[3] = UMC_V12_0_PA_R13_BIT;
+ flip_bits->flip_row_bit = 13;
+ flip_bits->bit_num = 4;
+ flip_bits->r13_in_pa = UMC_V12_0_PA_R13_BIT;
+
+ if (nps == UMC_MEMORY_PARTITION_MODE_NPS2) {
+ flip_bits->flip_bits_in_pa[0] = UMC_V12_0_PA_CH5_BIT;
+ flip_bits->flip_bits_in_pa[1] = UMC_V12_0_PA_C2_BIT;
+ flip_bits->flip_bits_in_pa[2] = UMC_V12_0_PA_B1_BIT;
+ flip_bits->r13_in_pa = UMC_V12_0_PA_R12_BIT;
+ } else if (nps == UMC_MEMORY_PARTITION_MODE_NPS4) {
+ flip_bits->flip_bits_in_pa[0] = UMC_V12_0_PA_CH4_BIT;
+ flip_bits->flip_bits_in_pa[1] = UMC_V12_0_PA_CH5_BIT;
+ flip_bits->flip_bits_in_pa[2] = UMC_V12_0_PA_B0_BIT;
+ flip_bits->r13_in_pa = UMC_V12_0_PA_R11_BIT;
+ }
+
+ switch (vram_type) {
+ case UMC_VRAM_TYPE_HBM:
+ /* other nps modes are taken as nps1 */
+ if (nps == UMC_MEMORY_PARTITION_MODE_NPS2)
+ flip_bits->flip_bits_in_pa[3] = UMC_V12_0_PA_R12_BIT;
+ else if (nps == UMC_MEMORY_PARTITION_MODE_NPS4)
+ flip_bits->flip_bits_in_pa[3] = UMC_V12_0_PA_R11_BIT;
+
+ break;
+ case UMC_VRAM_TYPE_HBM3E:
+ flip_bits->flip_bits_in_pa[3] = UMC_V12_0_PA_R12_BIT;
+ flip_bits->flip_row_bit = 12;
+
+ if (nps == UMC_MEMORY_PARTITION_MODE_NPS2)
+ flip_bits->flip_bits_in_pa[3] = UMC_V12_0_PA_R11_BIT;
+ else if (nps == UMC_MEMORY_PARTITION_MODE_NPS4)
+ flip_bits->flip_bits_in_pa[3] = UMC_V12_0_PA_R10_BIT;
+
+ break;
+ default:
+ RAS_DEV_WARN(ras_core->dev,
+ "Unknown HBM type, set RAS retire flip bits to the value in NPS1 mode.\n");
+ break;
+ }
+}
+
+static uint64_t convert_nps_pa_to_row_pa(struct ras_core_context *ras_core,
+ uint64_t pa, enum umc_memory_partition_mode nps, bool zero_pfn_ok)
+{
+ struct umc_flip_bits flip_bits = {0};
+ uint64_t row_pa;
+ int i;
+
+ __get_nps_pa_flip_bits(ras_core, nps, &flip_bits);
+
+ row_pa = pa;
+ /* clear loop bits in soc physical address */
+ for (i = 0; i < flip_bits.bit_num; i++)
+ row_pa &= ~BIT_ULL(flip_bits.flip_bits_in_pa[i]);
+
+ if (!zero_pfn_ok && !RAS_ADDR_TO_PFN(row_pa))
+ row_pa |= BIT_ULL(flip_bits.flip_bits_in_pa[2]);
+
+ return row_pa;
+}
+
+static int lookup_bad_pages_in_a_row(struct ras_core_context *ras_core,
+ struct eeprom_umc_record *record, uint32_t nps,
+ uint64_t *pfns, uint32_t num,
+ uint64_t seq_no, bool dump)
+{
+ uint32_t col, col_lower, row, row_lower, idx, row_high;
+ uint64_t soc_pa, row_pa, column, err_addr;
+ uint64_t retired_addr = RAS_PFN_TO_ADDR(record->cur_nps_retired_row_pfn);
+ struct umc_flip_bits flip_bits = {0};
+ uint32_t retire_unit;
+ uint32_t i;
+
+ __get_nps_pa_flip_bits(ras_core, nps, &flip_bits);
+
+ row_pa = convert_nps_pa_to_row_pa(ras_core, retired_addr, nps, true);
+
+ err_addr = record->address;
+ /* get column bit 0 and 1 in mca address */
+ col_lower = (err_addr >> 1) & 0x3ULL;
+ /* MA_R13_BIT will be handled later */
+ row_lower = (err_addr >> UMC_V12_0_MCA_R0_BIT) & 0x1fffULL;
+ row_lower &= ~BIT_ULL(flip_bits.flip_row_bit);
+
+ if (ras_core->ras_gfx.gfx_ip_version >= IP_VERSION(9, 5, 0)) {
+ row_high = (row_pa >> flip_bits.r13_in_pa) & 0x3ULL;
+ /* it's 2.25GB in each channel, from MCA address to PA
+ * [R14 R13] is converted if the two bits value are 0x3,
+ * get them from PA instead of MCA address.
+ */
+ row_lower |= (row_high << 13);
+ }
+
+ idx = 0;
+ row = 0;
+ retire_unit = 0x1 << flip_bits.bit_num;
+ /* loop for all possibilities of retire bits */
+ for (column = 0; column < retire_unit; column++) {
+ soc_pa = row_pa;
+ for (i = 0; i < flip_bits.bit_num; i++)
+ soc_pa |= (((column >> i) & 0x1ULL) << flip_bits.flip_bits_in_pa[i]);
+
+ col = ((column & 0x7) << 2) | col_lower;
+
+ /* add row bit 13 */
+ if (flip_bits.bit_num == UMC_PA_FLIP_BITS_NUM)
+ row = ((column >> 3) << flip_bits.flip_row_bit) | row_lower;
+
+ if (dump)
+ RAS_DEV_INFO(ras_core->dev,
+ "{%llu} Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x Bank:0x%x Channel:0x%x\n",
+ seq_no, soc_pa, row, col,
+ record->cur_nps_bank, record->mem_channel);
+
+
+ if (pfns && (idx < num))
+ pfns[idx++] = RAS_ADDR_TO_PFN(soc_pa);
+ }
+
+ return idx;
+}
+
+static int umc_v12_convert_ma_to_pa(struct ras_core_context *ras_core,
+ struct umc_mca_addr *addr_in, struct umc_phy_addr *addr_out,
+ uint32_t nps)
+{
+ uint32_t i, na_shift;
+ uint64_t soc_pa, na, na_nps;
+ uint32_t bank_hash0, bank_hash1, bank_hash2, bank_hash3, col, row;
+ uint32_t bank0, bank1, bank2, bank3, bank;
+ uint32_t ch_inst = addr_in->ch_inst;
+ uint32_t umc_inst = addr_in->umc_inst;
+ uint32_t node_inst = addr_in->node_inst;
+ uint32_t socket_id = addr_in->socket_id;
+ uint32_t channel_index;
+ uint64_t err_addr = addr_in->err_addr;
+
+ if (node_inst != UMC_INV_AID_NODE) {
+ if (ch_inst >= UMC_V12_0_CHANNEL_INSTANCE_NUM ||
+ umc_inst >= UMC_V12_0_UMC_INSTANCE_NUM ||
+ node_inst >= UMC_V12_0_AID_NUM_MAX ||
+ socket_id >= UMC_V12_0_SOCKET_NUM_MAX)
+ return -EINVAL;
+ } else {
+ if (socket_id >= UMC_V12_0_SOCKET_NUM_MAX ||
+ ch_inst >= UMC_V12_0_TOTAL_CHANNEL_NUM)
+ return -EINVAL;
+ }
+
+ bank_hash0 = (err_addr >> UMC_V12_0_MCA_B0_BIT) & 0x1ULL;
+ bank_hash1 = (err_addr >> UMC_V12_0_MCA_B1_BIT) & 0x1ULL;
+ bank_hash2 = (err_addr >> UMC_V12_0_MCA_B2_BIT) & 0x1ULL;
+ bank_hash3 = (err_addr >> UMC_V12_0_MCA_B3_BIT) & 0x1ULL;
+ col = (err_addr >> 1) & 0x1fULL;
+ row = (err_addr >> 10) & 0x3fffULL;
+
+ /* apply bank hash algorithm */
+ bank0 =
+ bank_hash0 ^ (UMC_V12_0_XOR_EN0 &
+ (umc_v12_0_bit_wise_xor(col & UMC_V12_0_COL_XOR0) ^
+ (umc_v12_0_bit_wise_xor(row & UMC_V12_0_ROW_XOR0))));
+ bank1 =
+ bank_hash1 ^ (UMC_V12_0_XOR_EN1 &
+ (umc_v12_0_bit_wise_xor(col & UMC_V12_0_COL_XOR1) ^
+ (umc_v12_0_bit_wise_xor(row & UMC_V12_0_ROW_XOR1))));
+ bank2 =
+ bank_hash2 ^ (UMC_V12_0_XOR_EN2 &
+ (umc_v12_0_bit_wise_xor(col & UMC_V12_0_COL_XOR2) ^
+ (umc_v12_0_bit_wise_xor(row & UMC_V12_0_ROW_XOR2))));
+ bank3 =
+ bank_hash3 ^ (UMC_V12_0_XOR_EN3 &
+ (umc_v12_0_bit_wise_xor(col & UMC_V12_0_COL_XOR3) ^
+ (umc_v12_0_bit_wise_xor(row & UMC_V12_0_ROW_XOR3))));
+
+ bank = bank0 | (bank1 << 1) | (bank2 << 2) | (bank3 << 3);
+ err_addr &= ~0x3c0ULL;
+ err_addr |= (bank << UMC_V12_0_MCA_B0_BIT);
+
+ na_nps = 0x0;
+ /* convert mca error address to normalized address */
+ for (i = 1; i < ARRAY_SIZE(umc_v12_0_ma2na_mapping); i++)
+ na_nps |= ((err_addr >> i) & 0x1ULL) << umc_v12_0_ma2na_mapping[i];
+
+ if (nps == UMC_MEMORY_PARTITION_MODE_NPS1)
+ na_shift = 8;
+ else if (nps == UMC_MEMORY_PARTITION_MODE_NPS2)
+ na_shift = 9;
+ else if (nps == UMC_MEMORY_PARTITION_MODE_NPS4)
+ na_shift = 10;
+ else if (nps == UMC_MEMORY_PARTITION_MODE_NPS8)
+ na_shift = 11;
+ else
+ return -EINVAL;
+
+ na = ((na_nps >> na_shift) << 8) | (na_nps & 0xff);
+
+ if (node_inst != UMC_INV_AID_NODE)
+ channel_index =
+ umc_v12_0_channel_idx_tbl[node_inst][umc_inst][ch_inst];
+ else {
+ channel_index = ch_inst;
+ node_inst = channel_index /
+ (UMC_V12_0_UMC_INSTANCE_NUM * UMC_V12_0_CHANNEL_INSTANCE_NUM);
+ }
+
+ /* translate umc channel address to soc pa, 3 parts are included */
+ soc_pa = ADDR_OF_32KB_BLOCK(na) |
+ ADDR_OF_256B_BLOCK(channel_index) |
+ OFFSET_IN_256B_BLOCK(na);
+
+ /* calc channel hash based on absolute address */
+ soc_pa += socket_id * SOCKET_LFB_SIZE;
+ /* the umc channel bits are not original values, they are hashed */
+ UMC_V12_0_SET_CHANNEL_HASH(channel_index, soc_pa);
+ /* restore pa */
+ soc_pa -= socket_id * SOCKET_LFB_SIZE;
+
+ /* get some channel bits from na_nps directly and
+ * add nps section offset
+ */
+ if (nps == UMC_MEMORY_PARTITION_MODE_NPS2) {
+ soc_pa &= ~(0x1ULL << UMC_V12_0_PA_CH5_BIT);
+ soc_pa |= ((na_nps & 0x100) << 5);
+ soc_pa += (node_inst >> 1) * (SOCKET_LFB_SIZE >> 1);
+ } else if (nps == UMC_MEMORY_PARTITION_MODE_NPS4) {
+ soc_pa &= ~(0x3ULL << UMC_V12_0_PA_CH4_BIT);
+ soc_pa |= ((na_nps & 0x300) << 4);
+ soc_pa += node_inst * (SOCKET_LFB_SIZE >> 2);
+ } else if (nps == UMC_MEMORY_PARTITION_MODE_NPS8) {
+ soc_pa &= ~(0x7ULL << UMC_V12_0_PA_CH4_BIT);
+ soc_pa |= ((na_nps & 0x700) << 4);
+ soc_pa += node_inst * (SOCKET_LFB_SIZE >> 2) +
+ (channel_index >> 4) * (SOCKET_LFB_SIZE >> 3);
+ }
+
+ addr_out->pa = soc_pa;
+ addr_out->bank = bank;
+ addr_out->channel_idx = channel_index;
+
+ return 0;
+}
+
+static int convert_ma_to_pa(struct ras_core_context *ras_core,
+ struct umc_mca_addr *addr_in, struct umc_phy_addr *addr_out,
+ uint32_t nps)
+{
+ int ret;
+
+ if (ras_psp_check_supported_cmd(ras_core, RAS_TA_CMD_ID__QUERY_ADDRESS))
+ ret = ras_umc_psp_convert_ma_to_pa(ras_core,
+ addr_in, addr_out, nps);
+ else
+ ret = umc_v12_convert_ma_to_pa(ras_core,
+ addr_in, addr_out, nps);
+
+ return ret;
+}
+
+static int convert_bank_to_nps_addr(struct ras_core_context *ras_core,
+ struct ras_bank_ecc *bank, struct umc_phy_addr *pa_addr, uint32_t nps)
+{
+ struct umc_mca_addr addr_in;
+ struct umc_phy_addr addr_out;
+ int ret;
+
+ memset(&addr_in, 0, sizeof(addr_in));
+ memset(&addr_out, 0, sizeof(addr_out));
+
+ addr_in.err_addr = ACA_ADDR_2_ERR_ADDR(bank->addr);
+ addr_in.ch_inst = ACA_IPID_2_UMC_CH(bank->ipid);
+ addr_in.umc_inst = ACA_IPID_2_UMC_INST(bank->ipid);
+ addr_in.node_inst = ACA_IPID_2_DIE_ID(bank->ipid);
+ addr_in.socket_id = ACA_IPID_2_SOCKET_ID(bank->ipid);
+
+ ret = convert_ma_to_pa(ras_core, &addr_in, &addr_out, nps);
+ if (!ret) {
+ pa_addr->pa =
+ convert_nps_pa_to_row_pa(ras_core, addr_out.pa, nps, false);
+ pa_addr->channel_idx = addr_out.channel_idx;
+ pa_addr->bank = addr_out.bank;
+ }
+
+ return ret;
+}
+
+static int umc_v12_0_bank_to_eeprom_record(struct ras_core_context *ras_core,
+ struct ras_bank_ecc *bank, struct eeprom_umc_record *record)
+{
+ struct umc_phy_addr nps_addr;
+ int ret;
+
+ memset(&nps_addr, 0, sizeof(nps_addr));
+
+ ret = convert_bank_to_nps_addr(ras_core, bank,
+ &nps_addr, bank->nps);
+ if (ret)
+ return ret;
+
+ ras_umc_fill_eeprom_record(ras_core,
+ ACA_ADDR_2_ERR_ADDR(bank->addr), ACA_IPID_2_UMC_INST(bank->ipid),
+ &nps_addr, bank->nps, record);
+
+ lookup_bad_pages_in_a_row(ras_core, record,
+ bank->nps, NULL, 0, bank->seq_no, true);
+
+ return 0;
+}
+
+static int convert_eeprom_record_to_nps_addr(struct ras_core_context *ras_core,
+ struct eeprom_umc_record *record, uint64_t *pa, uint32_t nps)
+{
+ struct device_system_info dev_info = {0};
+ struct umc_mca_addr addr_in;
+ struct umc_phy_addr addr_out;
+ int ret;
+
+ memset(&addr_in, 0, sizeof(addr_in));
+ memset(&addr_out, 0, sizeof(addr_out));
+
+ ras_core_get_device_system_info(ras_core, &dev_info);
+
+ addr_in.err_addr = record->address;
+ addr_in.ch_inst = record->mem_channel;
+ addr_in.umc_inst = record->mcumc_id;
+ addr_in.node_inst = UMC_INV_AID_NODE;
+ addr_in.socket_id = dev_info.socket_id;
+
+ ret = convert_ma_to_pa(ras_core, &addr_in, &addr_out, nps);
+ if (ret)
+ return ret;
+
+ *pa = convert_nps_pa_to_row_pa(ras_core, addr_out.pa, nps, false);
+
+ return 0;
+}
+
+static int umc_v12_0_eeprom_record_to_nps_record(struct ras_core_context *ras_core,
+ struct eeprom_umc_record *record, uint32_t nps)
+{
+ uint64_t pa = 0;
+ int ret = 0;
+
+ if (nps == EEPROM_RECORD_UMC_NPS_MODE(record)) {
+ record->cur_nps_retired_row_pfn = EEPROM_RECORD_UMC_ADDR_PFN(record);
+ } else {
+ ret = convert_eeprom_record_to_nps_addr(ras_core,
+ record, &pa, nps);
+ if (!ret)
+ record->cur_nps_retired_row_pfn = RAS_ADDR_TO_PFN(pa);
+ }
+
+ record->cur_nps = nps;
+
+ return ret;
+}
+
+static int umc_v12_0_eeprom_record_to_nps_pages(struct ras_core_context *ras_core,
+ struct eeprom_umc_record *record, uint32_t nps,
+ uint64_t *pfns, uint32_t num)
+{
+ return lookup_bad_pages_in_a_row(ras_core,
+ record, nps, pfns, num, 0, false);
+}
+
+static int umc_12_0_soc_pa_to_bank(struct ras_core_context *ras_core,
+ uint64_t soc_pa,
+ struct umc_bank_addr *bank_addr)
+{
+
+ int channel_hashed = 0;
+ int channel_real = 0;
+ int channel_reversed = 0;
+ int i = 0;
+
+ bank_addr->stack_id = UMC_V12_0_SOC_PA_TO_SID(soc_pa);
+ bank_addr->bank_group = 0; /* This is a combination of SID & Bank. Needed?? */
+ bank_addr->bank = UMC_V12_0_SOC_PA_TO_BANK(soc_pa);
+ bank_addr->row = UMC_V12_0_SOC_PA_TO_ROW(soc_pa);
+ bank_addr->column = UMC_V12_0_SOC_PA_TO_COL(soc_pa);
+
+ /* Channel bits 4-6 are hashed. Bruteforce reverse the hash */
+ channel_hashed = (soc_pa >> UMC_V12_0_PA_CH4_BIT) & 0x7;
+
+ for (i = 0; i < 8; i++) {
+ channel_reversed = 0;
+ channel_reversed |= UMC_V12_0_CHANNEL_HASH_CH4((i << 4), soc_pa);
+ channel_reversed |= (UMC_V12_0_CHANNEL_HASH_CH5((i << 4), soc_pa) << 1);
+ channel_reversed |= (UMC_V12_0_CHANNEL_HASH_CH6((i << 4), soc_pa) << 2);
+ if (channel_reversed == channel_hashed)
+ channel_real = ((i << 4)) | ((soc_pa >> UMC_V12_0_PA_CH0_BIT) & 0xf);
+ }
+
+ bank_addr->channel = channel_real;
+ bank_addr->subchannel = UMC_V12_0_SOC_PA_TO_PC(soc_pa);
+
+ return 0;
+}
+
+static int umc_12_0_bank_to_soc_pa(struct ras_core_context *ras_core,
+ struct umc_bank_addr bank_addr,
+ uint64_t *soc_pa)
+{
+ uint64_t na = 0;
+ uint64_t tmp_pa = 0;
+ *soc_pa = 0;
+
+ tmp_pa |= UMC_V12_0_SOC_SID_TO_PA(bank_addr.stack_id);
+ tmp_pa |= UMC_V12_0_SOC_BANK_TO_PA(bank_addr.bank);
+ tmp_pa |= UMC_V12_0_SOC_ROW_TO_PA(bank_addr.row);
+ tmp_pa |= UMC_V12_0_SOC_COL_TO_PA(bank_addr.column);
+ tmp_pa |= UMC_V12_0_SOC_CH_TO_PA(bank_addr.channel);
+ tmp_pa |= UMC_V12_0_SOC_PC_TO_PA(bank_addr.subchannel);
+
+ /* Get the NA */
+ na = ((tmp_pa >> UMC_V12_0_PA_C2_BIT) << UMC_V12_0_NA_C2_BIT);
+ na |= tmp_pa & 0xff;
+
+ /* translate umc channel address to soc pa, 3 parts are included */
+ tmp_pa = ADDR_OF_32KB_BLOCK(na) |
+ ADDR_OF_256B_BLOCK(bank_addr.channel) |
+ OFFSET_IN_256B_BLOCK(na);
+
+ /* the umc channel bits are not original values, they are hashed */
+ UMC_V12_0_SET_CHANNEL_HASH(bank_addr.channel, tmp_pa);
+
+ *soc_pa = tmp_pa;
+
+ return 0;
+}
+
+const struct ras_umc_ip_func ras_umc_func_v12_0 = {
+ .bank_to_eeprom_record = umc_v12_0_bank_to_eeprom_record,
+ .eeprom_record_to_nps_record = umc_v12_0_eeprom_record_to_nps_record,
+ .eeprom_record_to_nps_pages = umc_v12_0_eeprom_record_to_nps_pages,
+ .bank_to_soc_pa = umc_12_0_bank_to_soc_pa,
+ .soc_pa_to_bank = umc_12_0_soc_pa_to_bank,
+};
+
diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.h b/drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.h
new file mode 100644
index 000000000000..8a35ad856165
--- /dev/null
+++ b/drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.h
@@ -0,0 +1,314 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#ifndef __RAS_UMC_V12_0_H__
+#define __RAS_UMC_V12_0_H__
+#include "ras.h"
+
+/* MCA_UMC_UMC0_MCUMC_ADDRT0 */
+#define MCA_UMC_UMC0_MCUMC_ADDRT0__ErrorAddr__SHIFT 0x0
+#define MCA_UMC_UMC0_MCUMC_ADDRT0__Reserved__SHIFT 0x38
+#define MCA_UMC_UMC0_MCUMC_ADDRT0__ErrorAddr_MASK 0x00FFFFFFFFFFFFFFL
+#define MCA_UMC_UMC0_MCUMC_ADDRT0__Reserved_MASK 0xFF00000000000000L
+
+/* MCMP1_IPIDT0 */
+#define MCMP1_IPIDT0__InstanceIdLo__SHIFT 0x0
+#define MCMP1_IPIDT0__HardwareID__SHIFT 0x20
+#define MCMP1_IPIDT0__InstanceIdHi__SHIFT 0x2c
+#define MCMP1_IPIDT0__McaType__SHIFT 0x30
+
+#define MCMP1_IPIDT0__InstanceIdLo_MASK 0x00000000FFFFFFFFL
+#define MCMP1_IPIDT0__HardwareID_MASK 0x00000FFF00000000L
+#define MCMP1_IPIDT0__InstanceIdHi_MASK 0x0000F00000000000L
+#define MCMP1_IPIDT0__McaType_MASK 0xFFFF000000000000L
+
+/* number of umc channel instance with memory map register access */
+#define UMC_V12_0_CHANNEL_INSTANCE_NUM 8
+/* number of umc instance with memory map register access */
+#define UMC_V12_0_UMC_INSTANCE_NUM 4
+
+/* one piece of normalized address is mapped to 8 pieces of physical address */
+#define UMC_V12_0_NA_MAP_PA_NUM 8
+
+/* bank bits in MCA error address */
+#define UMC_V12_0_MCA_B0_BIT 6
+#define UMC_V12_0_MCA_B1_BIT 7
+#define UMC_V12_0_MCA_B2_BIT 8
+#define UMC_V12_0_MCA_B3_BIT 9
+
+/* row bits in MCA address */
+#define UMC_V12_0_MCA_R0_BIT 10
+
+/* Stack ID bits in SOC physical address */
+#define UMC_V12_0_PA_SID1_BIT 37
+#define UMC_V12_0_PA_SID0_BIT 36
+
+/* bank bits in SOC physical address */
+#define UMC_V12_0_PA_B3_BIT 18
+#define UMC_V12_0_PA_B2_BIT 17
+#define UMC_V12_0_PA_B1_BIT 20
+#define UMC_V12_0_PA_B0_BIT 19
+
+/* row bits in SOC physical address */
+#define UMC_V12_0_PA_R13_BIT 35
+#define UMC_V12_0_PA_R12_BIT 34
+#define UMC_V12_0_PA_R11_BIT 33
+#define UMC_V12_0_PA_R10_BIT 32
+#define UMC_V12_0_PA_R9_BIT 31
+#define UMC_V12_0_PA_R8_BIT 30
+#define UMC_V12_0_PA_R7_BIT 29
+#define UMC_V12_0_PA_R6_BIT 28
+#define UMC_V12_0_PA_R5_BIT 27
+#define UMC_V12_0_PA_R4_BIT 26
+#define UMC_V12_0_PA_R3_BIT 25
+#define UMC_V12_0_PA_R2_BIT 24
+#define UMC_V12_0_PA_R1_BIT 23
+#define UMC_V12_0_PA_R0_BIT 22
+
+/* column bits in SOC physical address */
+#define UMC_V12_0_PA_C4_BIT 21
+#define UMC_V12_0_PA_C3_BIT 16
+#define UMC_V12_0_PA_C2_BIT 15
+#define UMC_V12_0_PA_C1_BIT 6
+#define UMC_V12_0_PA_C0_BIT 5
+
+/* channel index bits in SOC physical address */
+#define UMC_V12_0_PA_CH6_BIT 14
+#define UMC_V12_0_PA_CH5_BIT 13
+#define UMC_V12_0_PA_CH4_BIT 12
+#define UMC_V12_0_PA_CH3_BIT 11
+#define UMC_V12_0_PA_CH2_BIT 10
+#define UMC_V12_0_PA_CH1_BIT 9
+#define UMC_V12_0_PA_CH0_BIT 8
+
+/* Pseudochannel index bits in SOC physical address */
+#define UMC_V12_0_PA_PC0_BIT 7
+
+#define UMC_V12_0_NA_C2_BIT 8
+
+#define UMC_V12_0_SOC_PA_TO_SID(pa) \
+ ((((pa >> UMC_V12_0_PA_SID0_BIT) & 0x1ULL) << 0ULL) | \
+ (((pa >> UMC_V12_0_PA_SID1_BIT) & 0x1ULL) << 1ULL))
+
+#define UMC_V12_0_SOC_PA_TO_BANK(pa) \
+ ((((pa >> UMC_V12_0_PA_B0_BIT) & 0x1ULL) << 0ULL) | \
+ (((pa >> UMC_V12_0_PA_B1_BIT) & 0x1ULL) << 1ULL) | \
+ (((pa >> UMC_V12_0_PA_B2_BIT) & 0x1ULL) << 2ULL) | \
+ (((pa >> UMC_V12_0_PA_B3_BIT) & 0x1ULL) << 3ULL))
+
+#define UMC_V12_0_SOC_PA_TO_ROW(pa) \
+ ((((pa >> UMC_V12_0_PA_R0_BIT) & 0x1ULL) << 0ULL) | \
+ (((pa >> UMC_V12_0_PA_R1_BIT) & 0x1ULL) << 1ULL) | \
+ (((pa >> UMC_V12_0_PA_R2_BIT) & 0x1ULL) << 2ULL) | \
+ (((pa >> UMC_V12_0_PA_R3_BIT) & 0x1ULL) << 3ULL) | \
+ (((pa >> UMC_V12_0_PA_R4_BIT) & 0x1ULL) << 4ULL) | \
+ (((pa >> UMC_V12_0_PA_R5_BIT) & 0x1ULL) << 5ULL) | \
+ (((pa >> UMC_V12_0_PA_R6_BIT) & 0x1ULL) << 6ULL) | \
+ (((pa >> UMC_V12_0_PA_R7_BIT) & 0x1ULL) << 7ULL) | \
+ (((pa >> UMC_V12_0_PA_R8_BIT) & 0x1ULL) << 8ULL) | \
+ (((pa >> UMC_V12_0_PA_R9_BIT) & 0x1ULL) << 9ULL) | \
+ (((pa >> UMC_V12_0_PA_R10_BIT) & 0x1ULL) << 10ULL) | \
+ (((pa >> UMC_V12_0_PA_R11_BIT) & 0x1ULL) << 11ULL) | \
+ (((pa >> UMC_V12_0_PA_R12_BIT) & 0x1ULL) << 12ULL) | \
+ (((pa >> UMC_V12_0_PA_R13_BIT) & 0x1ULL) << 13ULL))
+
+#define UMC_V12_0_SOC_PA_TO_COL(pa) \
+ ((((pa >> UMC_V12_0_PA_C0_BIT) & 0x1ULL) << 0ULL) | \
+ (((pa >> UMC_V12_0_PA_C1_BIT) & 0x1ULL) << 1ULL) | \
+ (((pa >> UMC_V12_0_PA_C2_BIT) & 0x1ULL) << 2ULL) | \
+ (((pa >> UMC_V12_0_PA_C3_BIT) & 0x1ULL) << 3ULL) | \
+ (((pa >> UMC_V12_0_PA_C4_BIT) & 0x1ULL) << 4ULL))
+
+#define UMC_V12_0_SOC_PA_TO_CH(pa) \
+ ((((pa >> UMC_V12_0_PA_CH0_BIT) & 0x1ULL) << 0ULL) | \
+ (((pa >> UMC_V12_0_PA_CH1_BIT) & 0x1ULL) << 1ULL) | \
+ (((pa >> UMC_V12_0_PA_CH2_BIT) & 0x1ULL) << 2ULL) | \
+ (((pa >> UMC_V12_0_PA_CH3_BIT) & 0x1ULL) << 3ULL) | \
+ (((pa >> UMC_V12_0_PA_CH4_BIT) & 0x1ULL) << 4ULL) | \
+ (((pa >> UMC_V12_0_PA_CH5_BIT) & 0x1ULL) << 5ULL) | \
+ (((pa >> UMC_V12_0_PA_CH6_BIT) & 0x1ULL) << 6ULL))
+
+#define UMC_V12_0_SOC_PA_TO_PC(pa) (((pa >> UMC_V12_0_PA_PC0_BIT) & 0x1ULL) << 0ULL)
+
+#define UMC_V12_0_SOC_SID_TO_PA(sid) \
+ ((((sid >> 0ULL) & 0x1ULL) << UMC_V12_0_PA_SID0_BIT) | \
+ (((sid >> 1ULL) & 0x1ULL) << UMC_V12_0_PA_SID1_BIT))
+
+#define UMC_V12_0_SOC_BANK_TO_PA(bank) \
+ ((((bank >> 0ULL) & 0x1ULL) << UMC_V12_0_PA_B0_BIT) | \
+ (((bank >> 1ULL) & 0x1ULL) << UMC_V12_0_PA_B1_BIT) | \
+ (((bank >> 2ULL) & 0x1ULL) << UMC_V12_0_PA_B2_BIT) | \
+ (((bank >> 3ULL) & 0x1ULL) << UMC_V12_0_PA_B3_BIT))
+
+#define UMC_V12_0_SOC_ROW_TO_PA(row) \
+ ((((row >> 0ULL) & 0x1ULL) << UMC_V12_0_PA_R0_BIT) | \
+ (((row >> 1ULL) & 0x1ULL) << UMC_V12_0_PA_R1_BIT) | \
+ (((row >> 2ULL) & 0x1ULL) << UMC_V12_0_PA_R2_BIT) | \
+ (((row >> 3ULL) & 0x1ULL) << UMC_V12_0_PA_R3_BIT) | \
+ (((row >> 4ULL) & 0x1ULL) << UMC_V12_0_PA_R4_BIT) | \
+ (((row >> 5ULL) & 0x1ULL) << UMC_V12_0_PA_R5_BIT) | \
+ (((row >> 6ULL) & 0x1ULL) << UMC_V12_0_PA_R6_BIT) | \
+ (((row >> 7ULL) & 0x1ULL) << UMC_V12_0_PA_R7_BIT) | \
+ (((row >> 8ULL) & 0x1ULL) << UMC_V12_0_PA_R8_BIT) | \
+ (((row >> 9ULL) & 0x1ULL) << UMC_V12_0_PA_R9_BIT) | \
+ (((row >> 10ULL) & 0x1ULL) << UMC_V12_0_PA_R10_BIT) | \
+ (((row >> 11ULL) & 0x1ULL) << UMC_V12_0_PA_R11_BIT) | \
+ (((row >> 12ULL) & 0x1ULL) << UMC_V12_0_PA_R12_BIT) | \
+ (((row >> 13ULL) & 0x1ULL) << UMC_V12_0_PA_R13_BIT))
+
+#define UMC_V12_0_SOC_COL_TO_PA(col) \
+ ((((col >> 0ULL) & 0x1ULL) << UMC_V12_0_PA_C0_BIT) | \
+ (((col >> 1ULL) & 0x1ULL) << UMC_V12_0_PA_C1_BIT) | \
+ (((col >> 2ULL) & 0x1ULL) << UMC_V12_0_PA_C2_BIT) | \
+ (((col >> 3ULL) & 0x1ULL) << UMC_V12_0_PA_C3_BIT) | \
+ (((col >> 4ULL) & 0x1ULL) << UMC_V12_0_PA_C4_BIT))
+
+#define UMC_V12_0_SOC_CH_TO_PA(ch) \
+ ((((ch >> 0ULL) & 0x1ULL) << UMC_V12_0_PA_CH0_BIT) | \
+ (((ch >> 1ULL) & 0x1ULL) << UMC_V12_0_PA_CH1_BIT) | \
+ (((ch >> 2ULL) & 0x1ULL) << UMC_V12_0_PA_CH2_BIT) | \
+ (((ch >> 3ULL) & 0x1ULL) << UMC_V12_0_PA_CH3_BIT) | \
+ (((ch >> 4ULL) & 0x1ULL) << UMC_V12_0_PA_CH4_BIT) | \
+ (((ch >> 5ULL) & 0x1ULL) << UMC_V12_0_PA_CH5_BIT) | \
+ (((ch >> 6ULL) & 0x1ULL) << UMC_V12_0_PA_CH6_BIT))
+
+#define UMC_V12_0_SOC_PC_TO_PA(pc) (((pc >> 0ULL) & 0x1ULL) << UMC_V12_0_PA_PC0_BIT)
+
+/* bank hash settings */
+#define UMC_V12_0_XOR_EN0 1
+#define UMC_V12_0_XOR_EN1 1
+#define UMC_V12_0_XOR_EN2 1
+#define UMC_V12_0_XOR_EN3 1
+#define UMC_V12_0_COL_XOR0 0x0
+#define UMC_V12_0_COL_XOR1 0x0
+#define UMC_V12_0_COL_XOR2 0x800
+#define UMC_V12_0_COL_XOR3 0x1000
+#define UMC_V12_0_ROW_XOR0 0x11111
+#define UMC_V12_0_ROW_XOR1 0x22222
+#define UMC_V12_0_ROW_XOR2 0x4444
+#define UMC_V12_0_ROW_XOR3 0x8888
+
+/* channel hash settings */
+#define UMC_V12_0_HASH_4K 0
+#define UMC_V12_0_HASH_64K 1
+#define UMC_V12_0_HASH_2M 1
+#define UMC_V12_0_HASH_1G 1
+#define UMC_V12_0_HASH_1T 1
+
+/* XOR some bits of PA into CH4~CH6 bits (bits 12~14 of PA),
+ * hash bit is only effective when related setting is enabled
+ */
+#define UMC_V12_0_CHANNEL_HASH_CH4(channel_idx, pa) ((((channel_idx) >> 5) & 0x1) ^ \
+ (((pa) >> 20) & 0x1ULL & UMC_V12_0_HASH_64K) ^ \
+ (((pa) >> 27) & 0x1ULL & UMC_V12_0_HASH_2M) ^ \
+ (((pa) >> 34) & 0x1ULL & UMC_V12_0_HASH_1G) ^ \
+ (((pa) >> 41) & 0x1ULL & UMC_V12_0_HASH_1T))
+#define UMC_V12_0_CHANNEL_HASH_CH5(channel_idx, pa) ((((channel_idx) >> 6) & 0x1) ^ \
+ (((pa) >> 21) & 0x1ULL & UMC_V12_0_HASH_64K) ^ \
+ (((pa) >> 28) & 0x1ULL & UMC_V12_0_HASH_2M) ^ \
+ (((pa) >> 35) & 0x1ULL & UMC_V12_0_HASH_1G) ^ \
+ (((pa) >> 42) & 0x1ULL & UMC_V12_0_HASH_1T))
+#define UMC_V12_0_CHANNEL_HASH_CH6(channel_idx, pa) ((((channel_idx) >> 4) & 0x1) ^ \
+ (((pa) >> 19) & 0x1ULL & UMC_V12_0_HASH_64K) ^ \
+ (((pa) >> 26) & 0x1ULL & UMC_V12_0_HASH_2M) ^ \
+ (((pa) >> 33) & 0x1ULL & UMC_V12_0_HASH_1G) ^ \
+ (((pa) >> 40) & 0x1ULL & UMC_V12_0_HASH_1T) ^ \
+ (((pa) >> 47) & 0x1ULL & UMC_V12_0_HASH_1T))
+#define UMC_V12_0_SET_CHANNEL_HASH(channel_idx, pa) do { \
+ (pa) &= ~(0x7ULL << UMC_V12_0_PA_CH4_BIT); \
+ (pa) |= (UMC_V12_0_CHANNEL_HASH_CH4(channel_idx, pa) << UMC_V12_0_PA_CH4_BIT); \
+ (pa) |= (UMC_V12_0_CHANNEL_HASH_CH5(channel_idx, pa) << UMC_V12_0_PA_CH5_BIT); \
+ (pa) |= (UMC_V12_0_CHANNEL_HASH_CH6(channel_idx, pa) << UMC_V12_0_PA_CH6_BIT); \
+ } while (0)
+
+
+/*
+ * (addr / 256) * 4096, the higher 26 bits in ErrorAddr
+ * is the index of 4KB block
+ */
+#define ADDR_OF_4KB_BLOCK(addr) (((addr) & ~0xffULL) << 4)
+/*
+ * (addr / 256) * 8192, the higher 26 bits in ErrorAddr
+ * is the index of 8KB block
+ */
+#define ADDR_OF_8KB_BLOCK(addr) (((addr) & ~0xffULL) << 5)
+/*
+ * (addr / 256) * 32768, the higher 26 bits in ErrorAddr
+ * is the index of 8KB block
+ */
+#define ADDR_OF_32KB_BLOCK(addr) (((addr) & ~0xffULL) << 7)
+/* channel index is the index of 256B block */
+#define ADDR_OF_256B_BLOCK(channel_index) ((channel_index) << 8)
+/* offset in 256B block */
+#define OFFSET_IN_256B_BLOCK(addr) ((addr) & 0xffULL)
+
+
+#define UMC_V12_ADDR_MASK_BAD_COLS(addr) \
+ ((addr) & ~((0x3ULL << UMC_V12_0_PA_C2_BIT) | \
+ (0x1ULL << UMC_V12_0_PA_C4_BIT) | \
+ (0x1ULL << UMC_V12_0_PA_R13_BIT)))
+
+#define ACA_IPID_HI_2_UMC_AID(_ipid_hi) (((_ipid_hi) >> 2) & 0x3)
+#define ACA_IPID_LO_2_UMC_CH(_ipid_lo) \
+ (((((_ipid_lo) >> 20) & 0x1) * 4) + (((_ipid_lo) >> 12) & 0xF))
+#define ACA_IPID_LO_2_UMC_INST(_ipid_lo) (((_ipid_lo) >> 21) & 0x7)
+
+#define ACA_IPID_2_DIE_ID(ipid) ((REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdHi) >> 2) & 0x03)
+#define ACA_IPID_2_UMC_CH(ipid) \
+ (ACA_IPID_LO_2_UMC_CH(REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdLo)))
+
+#define ACA_IPID_2_UMC_INST(ipid) \
+ (ACA_IPID_LO_2_UMC_INST(REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdLo)))
+
+#define ACA_IPID_2_SOCKET_ID(ipid) \
+ (((REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdLo) & 0x1) << 2) | \
+ (REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdHi) & 0x03))
+
+#define ACA_ADDR_2_ERR_ADDR(addr) \
+ REG_GET_FIELD(addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr)
+
+/* R13 bit shift should be considered, double the number */
+#define UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL (UMC_V12_0_NA_MAP_PA_NUM * 2)
+
+
+/* C2, C3, C4, R13, four MCA bits are looped in page retirement */
+#define UMC_V12_0_RETIRE_LOOP_BITS 4
+
+/* invalid node instance value */
+#define UMC_INV_AID_NODE 0xffff
+
+#define UMC_V12_0_AID_NUM_MAX 4
+#define UMC_V12_0_SOCKET_NUM_MAX 8
+
+#define UMC_V12_0_TOTAL_CHANNEL_NUM \
+ (UMC_V12_0_AID_NUM_MAX * UMC_V12_0_UMC_INSTANCE_NUM * UMC_V12_0_CHANNEL_INSTANCE_NUM)
+
+/* one device has 192GB HBM */
+#define SOCKET_LFB_SIZE 0x3000000000ULL
+
+extern const struct ras_umc_ip_func ras_umc_func_v12_0;
+
+int ras_umc_get_badpage_count(struct ras_core_context *ras_core);
+int ras_umc_get_badpage_record(struct ras_core_context *ras_core, uint32_t index, void *record);
+#endif
+