diff options
Diffstat (limited to 'drivers/gpu/drm/amd/ras')
54 files changed, 12697 insertions, 0 deletions
diff --git a/drivers/gpu/drm/amd/ras/Makefile b/drivers/gpu/drm/amd/ras/Makefile new file mode 100644 index 000000000000..bbdaba811d34 --- /dev/null +++ b/drivers/gpu/drm/amd/ras/Makefile @@ -0,0 +1,34 @@ +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +ifeq ($(AMD_GPU_RAS_MGR),) + AMD_GPU_RAS_MGR := ras_mgr +endif + +subdir-ccflags-y += -I$(AMD_GPU_RAS_FULL_PATH)/rascore +subdir-ccflags-y += -I$(AMD_GPU_RAS_FULL_PATH)/$(AMD_GPU_RAS_MGR) + +RAS_LIBS = $(AMD_GPU_RAS_MGR) rascore + +AMD_RAS = $(addsuffix /Makefile, $(addprefix $(AMD_GPU_RAS_FULL_PATH)/,$(RAS_LIBS))) + +include $(AMD_RAS) + diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/Makefile b/drivers/gpu/drm/amd/ras/ras_mgr/Makefile new file mode 100644 index 000000000000..5e5a2cfa4068 --- /dev/null +++ b/drivers/gpu/drm/amd/ras/ras_mgr/Makefile @@ -0,0 +1,33 @@ +# Copyright 2025 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +RAS_MGR_FILES = amdgpu_ras_sys.o \ + amdgpu_ras_mgr.o \ + amdgpu_ras_eeprom_i2c.o \ + amdgpu_ras_mp1_v13_0.o \ + amdgpu_ras_cmd.o \ + amdgpu_ras_process.o \ + amdgpu_ras_nbio_v7_9.o + + +RAS_MGR = $(addprefix $(AMD_GPU_RAS_PATH)/ras_mgr/, $(RAS_MGR_FILES)) + +AMD_GPU_RAS_FILES += $(RAS_MGR) + diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.c b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.c new file mode 100644 index 000000000000..78419b7f7729 --- /dev/null +++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.c @@ -0,0 +1,285 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include <linux/pci.h> +#include "amdgpu.h" +#include "amdgpu_ras.h" +#include "ras_sys.h" +#include "amdgpu_ras_cmd.h" +#include "amdgpu_ras_mgr.h" + +/* inject address is 52 bits */ +#define RAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52) + +#define AMDGPU_RAS_TYPE_RASCORE 0x1 +#define AMDGPU_RAS_TYPE_AMDGPU 0x2 +#define AMDGPU_RAS_TYPE_VF 0x3 + +static int amdgpu_ras_trigger_error_prepare(struct ras_core_context *ras_core, + struct ras_cmd_inject_error_req *block_info) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev; + int ret; + + if (block_info->block_id == TA_RAS_BLOCK__XGMI_WAFL) { + if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW)) + RAS_DEV_WARN(adev, "Failed to disallow df cstate"); + + ret = amdgpu_dpm_set_pm_policy(adev, PP_PM_POLICY_XGMI_PLPD, XGMI_PLPD_DISALLOW); + if (ret && (ret != -EOPNOTSUPP)) + RAS_DEV_WARN(adev, "Failed to disallow XGMI power down"); + } + + return 0; +} + +static int amdgpu_ras_trigger_error_end(struct ras_core_context *ras_core, + struct ras_cmd_inject_error_req *block_info) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev; + int ret; + + if (block_info->block_id == TA_RAS_BLOCK__XGMI_WAFL) { + if (amdgpu_ras_intr_triggered()) + return 0; + + ret = amdgpu_dpm_set_pm_policy(adev, PP_PM_POLICY_XGMI_PLPD, XGMI_PLPD_DEFAULT); + if (ret && (ret != -EOPNOTSUPP)) + RAS_DEV_WARN(adev, "Failed to allow XGMI power down"); + + if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW)) + RAS_DEV_WARN(adev, "Failed to allow df cstate"); + } + + return 0; +} + +static uint64_t local_addr_to_xgmi_global_addr(struct ras_core_context *ras_core, + uint64_t addr) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev; + struct amdgpu_xgmi *xgmi = &adev->gmc.xgmi; + + return (addr + xgmi->physical_node_id * xgmi->node_segment_size); +} + +static int amdgpu_ras_inject_error(struct ras_core_context *ras_core, + struct ras_cmd_ctx *cmd, void *data) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev; + struct ras_cmd_inject_error_req *req = + (struct ras_cmd_inject_error_req *)cmd->input_buff_raw; + int ret = RAS_CMD__ERROR_GENERIC; + + if (req->block_id == RAS_BLOCK_ID__UMC) { + if (amdgpu_ras_mgr_check_retired_addr(adev, req->address)) { + RAS_DEV_WARN(ras_core->dev, + "RAS WARN: inject: 0x%llx has already been marked as bad!\n", + req->address); + return RAS_CMD__ERROR_ACCESS_DENIED; + } + + if ((req->address >= adev->gmc.mc_vram_size && + adev->gmc.mc_vram_size) || + (req->address >= RAS_UMC_INJECT_ADDR_LIMIT)) { + RAS_DEV_WARN(adev, "RAS WARN: input address 0x%llx is invalid.", + req->address); + return RAS_CMD__ERROR_INVALID_INPUT_DATA; + } + + /* Calculate XGMI relative offset */ + if (adev->gmc.xgmi.num_physical_nodes > 1 && + req->block_id != RAS_BLOCK_ID__GFX) { + req->address = local_addr_to_xgmi_global_addr(ras_core, req->address); + } + } + + amdgpu_ras_trigger_error_prepare(ras_core, req); + ret = rascore_handle_cmd(ras_core, cmd, data); + amdgpu_ras_trigger_error_end(ras_core, req); + if (ret) { + RAS_DEV_ERR(adev, "ras inject block %u failed %d\n", req->block_id, ret); + ret = RAS_CMD__ERROR_ACCESS_DENIED; + } + + + return ret; +} + +static int amdgpu_ras_get_ras_safe_fb_addr_ranges(struct ras_core_context *ras_core, + struct ras_cmd_ctx *cmd, void *data) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev; + struct ras_cmd_dev_handle *input_data = + (struct ras_cmd_dev_handle *)cmd->input_buff_raw; + struct ras_cmd_ras_safe_fb_address_ranges_rsp *ranges = + (struct ras_cmd_ras_safe_fb_address_ranges_rsp *)cmd->output_buff_raw; + struct amdgpu_mem_partition_info *mem_ranges; + uint32_t i = 0; + + if (cmd->input_size != sizeof(*input_data)) + return RAS_CMD__ERROR_INVALID_INPUT_DATA; + + mem_ranges = adev->gmc.mem_partitions; + for (i = 0; i < adev->gmc.num_mem_partitions; i++) { + ranges->range[i].start = mem_ranges[i].range.fpfn << AMDGPU_GPU_PAGE_SHIFT; + ranges->range[i].size = mem_ranges[i].size; + ranges->range[i].idx = i; + } + + ranges->num_ranges = adev->gmc.num_mem_partitions; + + ranges->version = 0; + cmd->output_size = sizeof(struct ras_cmd_ras_safe_fb_address_ranges_rsp); + + return RAS_CMD__SUCCESS; +} + +static int ras_translate_fb_address(struct ras_core_context *ras_core, + enum ras_fb_addr_type src_type, + enum ras_fb_addr_type dest_type, + union ras_translate_fb_address *src_addr, + union ras_translate_fb_address *dest_addr) +{ + uint64_t soc_phy_addr; + int ret = RAS_CMD__SUCCESS; + + /* Does not need to be queued as event as this is a SW translation */ + switch (src_type) { + case RAS_FB_ADDR_SOC_PHY: + soc_phy_addr = src_addr->soc_phy_addr; + break; + case RAS_FB_ADDR_BANK: + ret = ras_cmd_translate_bank_to_soc_pa(ras_core, + src_addr->bank_addr, &soc_phy_addr); + if (ret) + return RAS_CMD__ERROR_GENERIC; + break; + default: + return RAS_CMD__ERROR_INVALID_CMD; + } + + switch (dest_type) { + case RAS_FB_ADDR_SOC_PHY: + dest_addr->soc_phy_addr = soc_phy_addr; + break; + case RAS_FB_ADDR_BANK: + ret = ras_cmd_translate_soc_pa_to_bank(ras_core, + soc_phy_addr, &dest_addr->bank_addr); + if (ret) + return RAS_CMD__ERROR_GENERIC; + break; + default: + return RAS_CMD__ERROR_INVALID_CMD; + } + + return ret; +} + +static int amdgpu_ras_translate_fb_address(struct ras_core_context *ras_core, + struct ras_cmd_ctx *cmd, void *data) +{ + struct ras_cmd_translate_fb_address_req *req_buff = + (struct ras_cmd_translate_fb_address_req *)cmd->input_buff_raw; + struct ras_cmd_translate_fb_address_rsp *rsp_buff = + (struct ras_cmd_translate_fb_address_rsp *)cmd->output_buff_raw; + int ret = RAS_CMD__ERROR_GENERIC; + + if (cmd->input_size != sizeof(struct ras_cmd_translate_fb_address_req)) + return RAS_CMD__ERROR_INVALID_INPUT_SIZE; + + if ((req_buff->src_addr_type >= RAS_FB_ADDR_UNKNOWN) || + (req_buff->dest_addr_type >= RAS_FB_ADDR_UNKNOWN)) + return RAS_CMD__ERROR_INVALID_INPUT_DATA; + + ret = ras_translate_fb_address(ras_core, req_buff->src_addr_type, + req_buff->dest_addr_type, &req_buff->trans_addr, &rsp_buff->trans_addr); + if (ret) + return RAS_CMD__ERROR_GENERIC; + + rsp_buff->version = 0; + cmd->output_size = sizeof(struct ras_cmd_translate_fb_address_rsp); + + return RAS_CMD__SUCCESS; +} + +static struct ras_cmd_func_map amdgpu_ras_cmd_maps[] = { + {RAS_CMD__INJECT_ERROR, amdgpu_ras_inject_error}, + {RAS_CMD__GET_SAFE_FB_ADDRESS_RANGES, amdgpu_ras_get_ras_safe_fb_addr_ranges}, + {RAS_CMD__TRANSLATE_FB_ADDRESS, amdgpu_ras_translate_fb_address}, +}; + +int amdgpu_ras_handle_cmd(struct ras_core_context *ras_core, struct ras_cmd_ctx *cmd, void *data) +{ + struct ras_cmd_func_map *ras_cmd = NULL; + int i, res; + + for (i = 0; i < ARRAY_SIZE(amdgpu_ras_cmd_maps); i++) { + if (cmd->cmd_id == amdgpu_ras_cmd_maps[i].cmd_id) { + ras_cmd = &amdgpu_ras_cmd_maps[i]; + break; + } + } + + if (ras_cmd) + res = ras_cmd->func(ras_core, cmd, NULL); + else + res = RAS_CMD__ERROR_UKNOWN_CMD; + + return res; +} + +int amdgpu_ras_submit_cmd(struct ras_core_context *ras_core, struct ras_cmd_ctx *cmd) +{ + struct ras_core_context *cmd_core = ras_core; + int timeout = 60; + int res; + + cmd->cmd_res = RAS_CMD__ERROR_INVALID_CMD; + cmd->output_size = 0; + + if (!ras_core_is_enabled(cmd_core)) + return RAS_CMD__ERROR_ACCESS_DENIED; + + while (ras_core_gpu_in_reset(cmd_core)) { + msleep(1000); + if (!timeout--) + return RAS_CMD__ERROR_TIMEOUT; + } + + res = amdgpu_ras_handle_cmd(cmd_core, cmd, NULL); + if (res == RAS_CMD__ERROR_UKNOWN_CMD) + res = rascore_handle_cmd(cmd_core, cmd, NULL); + + cmd->cmd_res = res; + + if (cmd->output_size > cmd->output_buf_size) { + RAS_DEV_ERR(cmd_core->dev, + "Output size 0x%x exceeds output buffer size 0x%x!\n", + cmd->output_size, cmd->output_buf_size); + return RAS_CMD__SUCCESS_EXEED_BUFFER; + } + + return RAS_CMD__SUCCESS; +} diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.h b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.h new file mode 100644 index 000000000000..5973b156cc85 --- /dev/null +++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_cmd.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef __AMDGPU_RAS_CMD_H__ +#define __AMDGPU_RAS_CMD_H__ +#include "ras.h" + +enum amdgpu_ras_cmd_id { + RAS_CMD__AMDGPU_BEGIN = RAS_CMD_ID_AMDGPU_START, + RAS_CMD__TRANSLATE_MEMORY_FD, + RAS_CMD__AMDGPU_SUPPORTED_MAX = RAS_CMD_ID_AMDGPU_END, +}; + +struct ras_cmd_translate_memory_fd_req { + struct ras_cmd_dev_handle dev; + uint32_t type; + uint32_t fd; + uint64_t address; + uint32_t reserved[4]; +}; + +struct ras_cmd_translate_memory_fd_rsp { + uint32_t version; + uint32_t padding; + uint64_t start; + uint64_t size; + uint32_t reserved[2]; +}; + +int amdgpu_ras_handle_cmd(struct ras_core_context *ras_core, + struct ras_cmd_ctx *cmd, void *data); +int amdgpu_ras_submit_cmd(struct ras_core_context *ras_core, struct ras_cmd_ctx *cmd); + +#endif diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_eeprom_i2c.c b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_eeprom_i2c.c new file mode 100644 index 000000000000..3ed3ff42b7e1 --- /dev/null +++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_eeprom_i2c.c @@ -0,0 +1,182 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "amdgpu.h" +#include "amdgpu_atomfirmware.h" +#include "amdgpu_ras_eeprom.h" +#include "amdgpu_ras_mgr.h" +#include "amdgpu_ras_eeprom_i2c.h" +#include "ras_eeprom.h" + +/* These are memory addresses as would be seen by one or more EEPROM + * chips strung on the I2C bus, usually by manipulating pins 1-3 of a + * set of EEPROM devices. They form a continuous memory space. + * + * The I2C device address includes the device type identifier, 1010b, + * which is a reserved value and indicates that this is an I2C EEPROM + * device. It also includes the top 3 bits of the 19 bit EEPROM memory + * address, namely bits 18, 17, and 16. This makes up the 7 bit + * address sent on the I2C bus with bit 0 being the direction bit, + * which is not represented here, and sent by the hardware directly. + * + * For instance, + * 50h = 1010000b => device type identifier 1010b, bits 18:16 = 000b, address 0. + * 54h = 1010100b => --"--, bits 18:16 = 100b, address 40000h. + * 56h = 1010110b => --"--, bits 18:16 = 110b, address 60000h. + * Depending on the size of the I2C EEPROM device(s), bits 18:16 may + * address memory in a device or a device on the I2C bus, depending on + * the status of pins 1-3. See top of amdgpu_eeprom.c. + * + * The RAS table lives either at address 0 or address 40000h of EEPROM. + */ +#define EEPROM_I2C_MADDR_0 0x0 +#define EEPROM_I2C_MADDR_4 0x40000 + +#define MAKE_I2C_ADDR(_aa) ((0xA << 3) | (((_aa) >> 16) & 0xF)) +#define to_amdgpu_ras(x) (container_of(x, struct amdgpu_ras, eeprom_control)) + +#define EEPROM_PAGE_BITS 8 +#define EEPROM_PAGE_SIZE (1U << EEPROM_PAGE_BITS) +#define EEPROM_PAGE_MASK (EEPROM_PAGE_SIZE - 1) + +#define EEPROM_OFFSET_SIZE 2 + +static int ras_eeprom_i2c_config(struct ras_core_context *ras_core) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev; + struct ras_eeprom_control *control = &ras_core->ras_eeprom; + u8 i2c_addr; + + if (amdgpu_atomfirmware_ras_rom_addr(adev, &i2c_addr)) { + /* The address given by VBIOS is an 8-bit, wire-format + * address, i.e. the most significant byte. + * + * Normalize it to a 19-bit EEPROM address. Remove the + * device type identifier and make it a 7-bit address; + * then make it a 19-bit EEPROM address. See top of + * amdgpu_eeprom.c. + */ + i2c_addr = (i2c_addr & 0x0F) >> 1; + control->i2c_address = ((u32) i2c_addr) << 16; + return 0; + } + + switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) { + case IP_VERSION(13, 0, 5): + case IP_VERSION(13, 0, 6): + case IP_VERSION(13, 0, 10): + case IP_VERSION(13, 0, 12): + case IP_VERSION(13, 0, 14): + control->i2c_address = EEPROM_I2C_MADDR_4; + return 0; + default: + return -ENODATA; + } + return -ENODATA; +} + +static int ras_eeprom_i2c_xfer(struct ras_core_context *ras_core, u32 eeprom_addr, + u8 *eeprom_buf, u32 buf_size, bool read) +{ + struct i2c_adapter *i2c_adap = ras_core->ras_eeprom.i2c_adapter; + u8 eeprom_offset_buf[EEPROM_OFFSET_SIZE]; + struct i2c_msg msgs[] = { + { + .flags = 0, + .len = EEPROM_OFFSET_SIZE, + .buf = eeprom_offset_buf, + }, + { + .flags = read ? I2C_M_RD : 0, + }, + }; + const u8 *p = eeprom_buf; + int r; + u16 len; + + for (r = 0; buf_size > 0; + buf_size -= len, eeprom_addr += len, eeprom_buf += len) { + /* Set the EEPROM address we want to write to/read from. + */ + msgs[0].addr = MAKE_I2C_ADDR(eeprom_addr); + msgs[1].addr = msgs[0].addr; + msgs[0].buf[0] = (eeprom_addr >> 8) & 0xff; + msgs[0].buf[1] = eeprom_addr & 0xff; + + if (!read) { + /* Write the maximum amount of data, without + * crossing the device's page boundary, as per + * its spec. Partial page writes are allowed, + * starting at any location within the page, + * so long as the page boundary isn't crossed + * over (actually the page pointer rolls + * over). + * + * As per the AT24CM02 EEPROM spec, after + * writing into a page, the I2C driver should + * terminate the transfer, i.e. in + * "i2c_transfer()" below, with a STOP + * condition, so that the self-timed write + * cycle begins. This is implied for the + * "i2c_transfer()" abstraction. + */ + len = min(EEPROM_PAGE_SIZE - (eeprom_addr & EEPROM_PAGE_MASK), + buf_size); + } else { + /* Reading from the EEPROM has no limitation + * on the number of bytes read from the EEPROM + * device--they are simply sequenced out. + * Keep in mind that i2c_msg.len is u16 type. + */ + len = min(U16_MAX, buf_size); + } + msgs[1].len = len; + msgs[1].buf = eeprom_buf; + + + /* This constitutes a START-STOP transaction. + */ + r = i2c_transfer(i2c_adap, msgs, ARRAY_SIZE(msgs)); + if (r != ARRAY_SIZE(msgs)) + break; + + if (!read) { + /* According to EEPROM specs the length of the + * self-writing cycle, tWR (tW), is 10 ms. + * + * TODO: Use polling on ACK, aka Acknowledge + * Polling, to minimize waiting for the + * internal write cycle to complete, as it is + * usually smaller than tWR (tW). + */ + msleep(10); + } + } + + return r < 0 ? r : eeprom_buf - p; +} + +const struct ras_eeprom_sys_func amdgpu_ras_eeprom_i2c_sys_func = { + .eeprom_i2c_xfer = ras_eeprom_i2c_xfer, + .update_eeprom_i2c_config = ras_eeprom_i2c_config, +}; diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_eeprom_i2c.h b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_eeprom_i2c.h new file mode 100644 index 000000000000..3b5878605411 --- /dev/null +++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_eeprom_i2c.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright (C) 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN + * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ +#ifndef __AMDGPU_RAS_EEPROM_I2C_H__ +#define __AMDGPU_RAS_EEPROM_I2C_H__ +#include "ras.h" + +extern const struct ras_eeprom_sys_func amdgpu_ras_eeprom_i2c_sys_func; +#endif diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c new file mode 100644 index 000000000000..afe8135b6258 --- /dev/null +++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.c @@ -0,0 +1,648 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ +#include "amdgpu.h" +#include "amdgpu_reset.h" +#include "amdgpu_xgmi.h" +#include "ras_sys.h" +#include "amdgpu_ras_mgr.h" +#include "amdgpu_ras_cmd.h" +#include "amdgpu_ras_process.h" +#include "amdgpu_ras_eeprom_i2c.h" +#include "amdgpu_ras_mp1_v13_0.h" +#include "amdgpu_ras_nbio_v7_9.h" + +#define MAX_SOCKET_NUM_PER_HIVE 8 +#define MAX_AID_NUM_PER_SOCKET 4 +#define MAX_XCD_NUM_PER_AID 2 + +/* typical ECC bad page rate is 1 bad page per 100MB VRAM */ +#define TYPICAL_ECC_BAD_PAGE_RATE (100ULL * SZ_1M) + +#define COUNT_BAD_PAGE_THRESHOLD(size) (((size) >> 21) << 4) + +/* Reserve 8 physical dram row for possible retirement. + * In worst cases, it will lose 8 * 2MB memory in vram domain + */ +#define RAS_RESERVED_VRAM_SIZE_DEFAULT (16ULL << 20) + + +static void ras_mgr_init_event_mgr(struct ras_event_manager *mgr) +{ + struct ras_event_state *event_state; + int i; + + memset(mgr, 0, sizeof(*mgr)); + atomic64_set(&mgr->seqno, 0); + + for (i = 0; i < ARRAY_SIZE(mgr->event_state); i++) { + event_state = &mgr->event_state[i]; + event_state->last_seqno = RAS_EVENT_INVALID_ID; + atomic64_set(&event_state->count, 0); + } +} + +static void amdgpu_ras_mgr_init_event_mgr(struct ras_core_context *ras_core) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev; + struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); + struct ras_event_manager *event_mgr; + struct amdgpu_hive_info *hive; + + hive = amdgpu_get_xgmi_hive(adev); + event_mgr = hive ? &hive->event_mgr : &ras_mgr->ras_event_mgr; + + /* init event manager with node 0 on xgmi system */ + if (!amdgpu_reset_in_recovery(adev)) { + if (!hive || adev->gmc.xgmi.node_id == 0) + ras_mgr_init_event_mgr(event_mgr); + } + + if (hive) + amdgpu_put_xgmi_hive(hive); +} + +static int amdgpu_ras_mgr_init_aca_config(struct amdgpu_device *adev, + struct ras_core_config *config) +{ + struct ras_aca_config *aca_cfg = &config->aca_cfg; + + aca_cfg->socket_num_per_hive = MAX_SOCKET_NUM_PER_HIVE; + aca_cfg->aid_num_per_socket = MAX_AID_NUM_PER_SOCKET; + aca_cfg->xcd_num_per_aid = MAX_XCD_NUM_PER_AID; + + return 0; +} + +static int amdgpu_ras_mgr_init_eeprom_config(struct amdgpu_device *adev, + struct ras_core_config *config) +{ + struct ras_eeprom_config *eeprom_cfg = &config->eeprom_cfg; + + eeprom_cfg->eeprom_sys_fn = &amdgpu_ras_eeprom_i2c_sys_func; + eeprom_cfg->eeprom_i2c_adapter = adev->pm.ras_eeprom_i2c_bus; + if (eeprom_cfg->eeprom_i2c_adapter) { + const struct i2c_adapter_quirks *quirks = + ((struct i2c_adapter *)eeprom_cfg->eeprom_i2c_adapter)->quirks; + + if (quirks) { + eeprom_cfg->max_i2c_read_len = quirks->max_read_len; + eeprom_cfg->max_i2c_write_len = quirks->max_write_len; + } + } + + /* + * amdgpu_bad_page_threshold is used to config + * the threshold for the number of bad pages. + * -1: Threshold is set to default value + * Driver will issue a warning message when threshold is reached + * and continue runtime services. + * 0: Disable bad page retirement + * Driver will not retire bad pages + * which is intended for debugging purpose. + * -2: Threshold is determined by a formula + * that assumes 1 bad page per 100M of local memory. + * Driver will continue runtime services when threhold is reached. + * 0 < threshold < max number of bad page records in EEPROM, + * A user-defined threshold is set + * Driver will halt runtime services when this custom threshold is reached. + */ + if (amdgpu_bad_page_threshold == NONSTOP_OVER_THRESHOLD) + eeprom_cfg->eeprom_record_threshold_count = + div64_u64(adev->gmc.mc_vram_size, TYPICAL_ECC_BAD_PAGE_RATE); + else if (amdgpu_bad_page_threshold == WARN_NONSTOP_OVER_THRESHOLD) + eeprom_cfg->eeprom_record_threshold_count = + COUNT_BAD_PAGE_THRESHOLD(RAS_RESERVED_VRAM_SIZE_DEFAULT); + else + eeprom_cfg->eeprom_record_threshold_count = amdgpu_bad_page_threshold; + + eeprom_cfg->eeprom_record_threshold_config = amdgpu_bad_page_threshold; + + return 0; +} + +static int amdgpu_ras_mgr_init_mp1_config(struct amdgpu_device *adev, + struct ras_core_config *config) +{ + struct ras_mp1_config *mp1_cfg = &config->mp1_cfg; + int ret = 0; + + switch (config->mp1_ip_version) { + case IP_VERSION(13, 0, 6): + case IP_VERSION(13, 0, 14): + case IP_VERSION(13, 0, 12): + mp1_cfg->mp1_sys_fn = &amdgpu_ras_mp1_sys_func_v13_0; + break; + default: + RAS_DEV_ERR(adev, + "The mp1(0x%x) ras config is not right!\n", + config->mp1_ip_version); + ret = -EINVAL; + break; + } + + return ret; +} + +static int amdgpu_ras_mgr_init_nbio_config(struct amdgpu_device *adev, + struct ras_core_config *config) +{ + struct ras_nbio_config *nbio_cfg = &config->nbio_cfg; + int ret = 0; + + switch (config->nbio_ip_version) { + case IP_VERSION(7, 9, 0): + case IP_VERSION(7, 9, 1): + nbio_cfg->nbio_sys_fn = &amdgpu_ras_nbio_sys_func_v7_9; + break; + default: + RAS_DEV_ERR(adev, + "The nbio(0x%x) ras config is not right!\n", + config->nbio_ip_version); + ret = -EINVAL; + break; + } + + return ret; +} + +static int amdgpu_ras_mgr_get_ras_psp_system_status(struct ras_core_context *ras_core, + struct ras_psp_sys_status *status) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev; + struct ta_context *context = &adev->psp.ras_context.context; + + status->initialized = context->initialized; + status->session_id = context->session_id; + status->psp_cmd_mutex = &adev->psp.mutex; + + return 0; +} + +static int amdgpu_ras_mgr_get_ras_ta_init_param(struct ras_core_context *ras_core, + struct ras_ta_init_param *ras_ta_param) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev; + uint32_t nps_mode; + + if (amdgpu_ras_is_poison_mode_supported(adev)) + ras_ta_param->poison_mode_en = 1; + + if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) + ras_ta_param->dgpu_mode = 1; + + ras_ta_param->xcc_mask = adev->gfx.xcc_mask; + ras_ta_param->channel_dis_num = hweight32(adev->gmc.m_half_use) * 2; + + ras_ta_param->active_umc_mask = adev->umc.active_mask; + + if (!amdgpu_ras_mgr_get_curr_nps_mode(adev, &nps_mode)) + ras_ta_param->nps_mode = nps_mode; + + return 0; +} + +const struct ras_psp_sys_func amdgpu_ras_psp_sys_func = { + .get_ras_psp_system_status = amdgpu_ras_mgr_get_ras_psp_system_status, + .get_ras_ta_init_param = amdgpu_ras_mgr_get_ras_ta_init_param, +}; + +static int amdgpu_ras_mgr_init_psp_config(struct amdgpu_device *adev, + struct ras_core_config *config) +{ + struct ras_psp_config *psp_cfg = &config->psp_cfg; + + psp_cfg->psp_sys_fn = &amdgpu_ras_psp_sys_func; + + return 0; +} + +static int amdgpu_ras_mgr_init_umc_config(struct amdgpu_device *adev, + struct ras_core_config *config) +{ + struct ras_umc_config *umc_cfg = &config->umc_cfg; + + umc_cfg->umc_vram_type = adev->gmc.vram_type; + + return 0; +} + +static struct ras_core_context *amdgpu_ras_mgr_create_ras_core(struct amdgpu_device *adev) +{ + struct ras_core_config init_config; + + memset(&init_config, 0, sizeof(init_config)); + + init_config.umc_ip_version = amdgpu_ip_version(adev, UMC_HWIP, 0); + init_config.mp1_ip_version = amdgpu_ip_version(adev, MP1_HWIP, 0); + init_config.gfx_ip_version = amdgpu_ip_version(adev, GC_HWIP, 0); + init_config.nbio_ip_version = amdgpu_ip_version(adev, NBIO_HWIP, 0); + init_config.psp_ip_version = amdgpu_ip_version(adev, MP1_HWIP, 0); + + if (init_config.umc_ip_version == IP_VERSION(12, 0, 0) || + init_config.umc_ip_version == IP_VERSION(12, 5, 0)) + init_config.aca_ip_version = IP_VERSION(1, 0, 0); + + init_config.sys_fn = &amdgpu_ras_sys_fn; + init_config.ras_eeprom_supported = true; + init_config.poison_supported = + amdgpu_ras_is_poison_mode_supported(adev); + + amdgpu_ras_mgr_init_aca_config(adev, &init_config); + amdgpu_ras_mgr_init_eeprom_config(adev, &init_config); + amdgpu_ras_mgr_init_mp1_config(adev, &init_config); + amdgpu_ras_mgr_init_nbio_config(adev, &init_config); + amdgpu_ras_mgr_init_psp_config(adev, &init_config); + amdgpu_ras_mgr_init_umc_config(adev, &init_config); + + return ras_core_create(&init_config); +} + +static int amdgpu_ras_mgr_sw_init(struct amdgpu_ip_block *ip_block) +{ + struct amdgpu_device *adev = ip_block->adev; + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct amdgpu_ras_mgr *ras_mgr; + int ret = 0; + + /* Disabled by default */ + con->uniras_enabled = false; + + /* Enabled only in debug mode */ + if (adev->debug_enable_ras_aca) { + con->uniras_enabled = true; + RAS_DEV_INFO(adev, "Debug amdgpu uniras!"); + } + + if (!con->uniras_enabled) + return 0; + + ras_mgr = kzalloc(sizeof(*ras_mgr), GFP_KERNEL); + if (!ras_mgr) + return -EINVAL; + + con->ras_mgr = ras_mgr; + ras_mgr->adev = adev; + + ras_mgr->ras_core = amdgpu_ras_mgr_create_ras_core(adev); + if (!ras_mgr->ras_core) { + RAS_DEV_ERR(adev, "Failed to create ras core!\n"); + ret = -EINVAL; + goto err; + } + + ras_mgr->ras_core->dev = adev; + + amdgpu_ras_process_init(adev); + ras_core_sw_init(ras_mgr->ras_core); + amdgpu_ras_mgr_init_event_mgr(ras_mgr->ras_core); + return 0; + +err: + kfree(ras_mgr); + return ret; +} + +static int amdgpu_ras_mgr_sw_fini(struct amdgpu_ip_block *ip_block) +{ + struct amdgpu_device *adev = ip_block->adev; + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct amdgpu_ras_mgr *ras_mgr = (struct amdgpu_ras_mgr *)con->ras_mgr; + + if (!con->uniras_enabled) + return 0; + + if (!ras_mgr) + return 0; + + amdgpu_ras_process_fini(adev); + ras_core_sw_fini(ras_mgr->ras_core); + ras_core_destroy(ras_mgr->ras_core); + ras_mgr->ras_core = NULL; + + kfree(con->ras_mgr); + con->ras_mgr = NULL; + + return 0; +} + +static int amdgpu_ras_mgr_hw_init(struct amdgpu_ip_block *ip_block) +{ + struct amdgpu_device *adev = ip_block->adev; + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); + int ret; + + if (!con->uniras_enabled) + return 0; + + if (!ras_mgr || !ras_mgr->ras_core) + return -EINVAL; + + ret = ras_core_hw_init(ras_mgr->ras_core); + if (ret) { + RAS_DEV_ERR(adev, "Failed to initialize ras core!\n"); + return ret; + } + + ras_mgr->ras_is_ready = true; + + amdgpu_enable_uniras(adev, true); + + RAS_DEV_INFO(adev, "AMDGPU RAS Is Ready.\n"); + return 0; +} + +static int amdgpu_ras_mgr_hw_fini(struct amdgpu_ip_block *ip_block) +{ + struct amdgpu_device *adev = ip_block->adev; + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); + + if (!con->uniras_enabled) + return 0; + + if (!ras_mgr || !ras_mgr->ras_core) + return -EINVAL; + + ras_core_hw_fini(ras_mgr->ras_core); + + ras_mgr->ras_is_ready = false; + + return 0; +} + +struct amdgpu_ras_mgr *amdgpu_ras_mgr_get_context(struct amdgpu_device *adev) +{ + if (!adev || !adev->psp.ras_context.ras) + return NULL; + + return (struct amdgpu_ras_mgr *)adev->psp.ras_context.ras->ras_mgr; +} + +static const struct amd_ip_funcs __maybe_unused ras_v1_0_ip_funcs = { + .name = "ras_v1_0", + .sw_init = amdgpu_ras_mgr_sw_init, + .sw_fini = amdgpu_ras_mgr_sw_fini, + .hw_init = amdgpu_ras_mgr_hw_init, + .hw_fini = amdgpu_ras_mgr_hw_fini, +}; + +const struct amdgpu_ip_block_version ras_v1_0_ip_block = { + .type = AMD_IP_BLOCK_TYPE_RAS, + .major = 1, + .minor = 0, + .rev = 0, + .funcs = &ras_v1_0_ip_funcs, +}; + +int amdgpu_enable_uniras(struct amdgpu_device *adev, bool enable) +{ + struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); + + if (!ras_mgr || !ras_mgr->ras_core) + return -EPERM; + + if (amdgpu_sriov_vf(adev)) + return -EPERM; + + RAS_DEV_INFO(adev, "Enable amdgpu unified ras!"); + return ras_core_set_status(ras_mgr->ras_core, enable); +} + +bool amdgpu_uniras_enabled(struct amdgpu_device *adev) +{ + struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); + + if (!ras_mgr || !ras_mgr->ras_core) + return false; + + if (amdgpu_sriov_vf(adev)) + return false; + + return ras_core_is_enabled(ras_mgr->ras_core); +} + +static bool amdgpu_ras_mgr_is_ready(struct amdgpu_device *adev) +{ + struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); + + if (ras_mgr && ras_mgr->ras_core && ras_mgr->ras_is_ready && + ras_core_is_ready(ras_mgr->ras_core)) + return true; + + return false; +} + +int amdgpu_ras_mgr_handle_fatal_interrupt(struct amdgpu_device *adev, void *data) +{ + struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); + + if (!amdgpu_ras_mgr_is_ready(adev)) + return -EPERM; + + return ras_core_handle_nbio_irq(ras_mgr->ras_core, data); +} + +uint64_t amdgpu_ras_mgr_gen_ras_event_seqno(struct amdgpu_device *adev, + enum ras_seqno_type seqno_type) +{ + struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); + int ret; + uint64_t seq_no; + + if (!amdgpu_ras_mgr_is_ready(adev) || + (seqno_type >= RAS_SEQNO_TYPE_COUNT_MAX)) + return 0; + + seq_no = ras_core_gen_seqno(ras_mgr->ras_core, seqno_type); + + if ((seqno_type == RAS_SEQNO_TYPE_DE) || + (seqno_type == RAS_SEQNO_TYPE_POISON_CONSUMPTION)) { + ret = ras_core_put_seqno(ras_mgr->ras_core, seqno_type, seq_no); + if (ret) + RAS_DEV_WARN(adev, "There are too many ras interrupts!"); + } + + return seq_no; +} + +int amdgpu_ras_mgr_handle_controller_interrupt(struct amdgpu_device *adev, void *data) +{ + struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); + struct ras_ih_info *ih_info = (struct ras_ih_info *)data; + uint64_t seq_no = 0; + int ret = 0; + + if (!amdgpu_ras_mgr_is_ready(adev)) + return -EPERM; + + if (ih_info && (ih_info->block == AMDGPU_RAS_BLOCK__UMC)) { + if (ras_mgr->ras_core->poison_supported) { + seq_no = amdgpu_ras_mgr_gen_ras_event_seqno(adev, RAS_SEQNO_TYPE_DE); + RAS_DEV_INFO(adev, + "{%llu} RAS poison is created, no user action is needed.\n", + seq_no); + } + + ret = amdgpu_ras_process_handle_umc_interrupt(adev, ih_info); + } else if (ras_mgr->ras_core->poison_supported) { + ret = amdgpu_ras_process_handle_unexpected_interrupt(adev, ih_info); + } else { + RAS_DEV_WARN(adev, + "No RAS interrupt handler for non-UMC block with poison disabled.\n"); + } + + return ret; +} + +int amdgpu_ras_mgr_handle_consumer_interrupt(struct amdgpu_device *adev, void *data) +{ + if (!amdgpu_ras_mgr_is_ready(adev)) + return -EPERM; + + return amdgpu_ras_process_handle_consumption_interrupt(adev, data); +} + +int amdgpu_ras_mgr_update_ras_ecc(struct amdgpu_device *adev) +{ + struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); + + if (!amdgpu_ras_mgr_is_ready(adev)) + return -EPERM; + + return ras_core_update_ecc_info(ras_mgr->ras_core); +} + +int amdgpu_ras_mgr_reset_gpu(struct amdgpu_device *adev, uint32_t flags) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + + if (!amdgpu_ras_mgr_is_ready(adev)) + return -EPERM; + + con->gpu_reset_flags |= flags; + return amdgpu_ras_reset_gpu(adev); +} + +bool amdgpu_ras_mgr_check_eeprom_safety_watermark(struct amdgpu_device *adev) +{ + struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); + + if (!amdgpu_ras_mgr_is_ready(adev)) + return false; + + return ras_eeprom_check_safety_watermark(ras_mgr->ras_core); +} + +int amdgpu_ras_mgr_get_curr_nps_mode(struct amdgpu_device *adev, + uint32_t *nps_mode) +{ + struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); + uint32_t mode; + + if (!amdgpu_ras_mgr_is_ready(adev)) + return -EINVAL; + + mode = ras_core_get_curr_nps_mode(ras_mgr->ras_core); + if (!mode || mode > AMDGPU_NPS8_PARTITION_MODE) + return -EINVAL; + + *nps_mode = mode; + + return 0; +} + +bool amdgpu_ras_mgr_check_retired_addr(struct amdgpu_device *adev, + uint64_t addr) +{ + struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); + + if (!amdgpu_ras_mgr_is_ready(adev)) + return false; + + return ras_umc_check_retired_addr(ras_mgr->ras_core, addr); +} + +bool amdgpu_ras_mgr_is_rma(struct amdgpu_device *adev) +{ + struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); + + if (!ras_mgr || !ras_mgr->ras_core || !ras_mgr->ras_is_ready) + return false; + + return ras_core_gpu_is_rma(ras_mgr->ras_core); +} + +int amdgpu_ras_mgr_handle_ras_cmd(struct amdgpu_device *adev, + uint32_t cmd_id, void *input, uint32_t input_size, + void *output, uint32_t out_size) +{ + struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); + struct ras_cmd_ctx *cmd_ctx; + uint32_t ctx_buf_size = PAGE_SIZE; + int ret; + + if (!amdgpu_ras_mgr_is_ready(adev)) + return -EPERM; + + cmd_ctx = kzalloc(ctx_buf_size, GFP_KERNEL); + if (!cmd_ctx) + return -ENOMEM; + + cmd_ctx->cmd_id = cmd_id; + + memcpy(cmd_ctx->input_buff_raw, input, input_size); + cmd_ctx->input_size = input_size; + cmd_ctx->output_buf_size = ctx_buf_size - sizeof(*cmd_ctx); + + ret = amdgpu_ras_submit_cmd(ras_mgr->ras_core, cmd_ctx); + if (!ret && !cmd_ctx->cmd_res && output && (out_size == cmd_ctx->output_size)) + memcpy(output, cmd_ctx->output_buff_raw, cmd_ctx->output_size); + + kfree(cmd_ctx); + + return ret; +} + +int amdgpu_ras_mgr_pre_reset(struct amdgpu_device *adev) +{ + if (!amdgpu_ras_mgr_is_ready(adev)) { + RAS_DEV_ERR(adev, "Invalid ras suspend!\n"); + return -EPERM; + } + + amdgpu_ras_process_pre_reset(adev); + return 0; +} + +int amdgpu_ras_mgr_post_reset(struct amdgpu_device *adev) +{ + if (!amdgpu_ras_mgr_is_ready(adev)) { + RAS_DEV_ERR(adev, "Invalid ras resume!\n"); + return -EPERM; + } + + amdgpu_ras_process_post_reset(adev); + return 0; +} diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.h b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.h new file mode 100644 index 000000000000..8fb7eb4b8f13 --- /dev/null +++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mgr.h @@ -0,0 +1,83 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright (c) 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef __AMDGPU_RAS_MGR_H__ +#define __AMDGPU_RAS_MGR_H__ +#include "ras.h" +#include "amdgpu_ras_process.h" + +enum ras_ih_type { + RAS_IH_NONE, + RAS_IH_FROM_BLOCK_CONTROLLER, + RAS_IH_FROM_CONSUMER_CLIENT, + RAS_IH_FROM_FATAL_ERROR, +}; + +struct ras_ih_info { + uint32_t block; + union { + struct amdgpu_iv_entry iv_entry; + struct { + uint16_t pasid; + uint32_t reset; + pasid_notify pasid_fn; + void *data; + }; + }; +}; + +struct amdgpu_ras_mgr { + struct amdgpu_device *adev; + struct ras_core_context *ras_core; + struct delayed_work retire_page_dwork; + struct ras_event_manager ras_event_mgr; + uint64_t last_poison_consumption_seqno; + bool ras_is_ready; + + bool is_paused; + struct completion ras_event_done; +}; + +extern const struct amdgpu_ip_block_version ras_v1_0_ip_block; + +struct amdgpu_ras_mgr *amdgpu_ras_mgr_get_context( + struct amdgpu_device *adev); +int amdgpu_enable_uniras(struct amdgpu_device *adev, bool enable); +bool amdgpu_uniras_enabled(struct amdgpu_device *adev); +int amdgpu_ras_mgr_handle_fatal_interrupt(struct amdgpu_device *adev, void *data); +int amdgpu_ras_mgr_handle_controller_interrupt(struct amdgpu_device *adev, void *data); +int amdgpu_ras_mgr_handle_consumer_interrupt(struct amdgpu_device *adev, void *data); +int amdgpu_ras_mgr_update_ras_ecc(struct amdgpu_device *adev); +int amdgpu_ras_mgr_reset_gpu(struct amdgpu_device *adev, uint32_t flags); +uint64_t amdgpu_ras_mgr_gen_ras_event_seqno(struct amdgpu_device *adev, + enum ras_seqno_type seqno_type); +bool amdgpu_ras_mgr_check_eeprom_safety_watermark(struct amdgpu_device *adev); +int amdgpu_ras_mgr_get_curr_nps_mode(struct amdgpu_device *adev, uint32_t *nps_mode); +bool amdgpu_ras_mgr_check_retired_addr(struct amdgpu_device *adev, + uint64_t addr); +bool amdgpu_ras_mgr_is_rma(struct amdgpu_device *adev); +int amdgpu_ras_mgr_handle_ras_cmd(struct amdgpu_device *adev, + uint32_t cmd_id, void *input, uint32_t input_size, + void *output, uint32_t out_size); +int amdgpu_ras_mgr_pre_reset(struct amdgpu_device *adev); +int amdgpu_ras_mgr_post_reset(struct amdgpu_device *adev); +#endif diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mp1_v13_0.c b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mp1_v13_0.c new file mode 100644 index 000000000000..79a51b1603ac --- /dev/null +++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mp1_v13_0.c @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ +#include "amdgpu_smu.h" +#include "amdgpu_reset.h" +#include "amdgpu_ras_mp1_v13_0.h" + +#define RAS_MP1_MSG_QueryValidMcaCeCount 0x3A +#define RAS_MP1_MSG_McaBankCeDumpDW 0x3B + +static int mp1_v13_0_get_valid_bank_count(struct ras_core_context *ras_core, + u32 msg, u32 *count) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev; + u32 smu_msg; + int ret = 0; + + if (!count) + return -EINVAL; + + smu_msg = (msg == RAS_MP1_MSG_QueryValidMcaCeCount) ? + SMU_MSG_QueryValidMcaCeCount : SMU_MSG_QueryValidMcaCount; + + if (down_read_trylock(&adev->reset_domain->sem)) { + ret = amdgpu_smu_ras_send_msg(adev, smu_msg, 0, count); + up_read(&adev->reset_domain->sem); + } else { + ret = -RAS_CORE_GPU_IN_MODE1_RESET; + } + + if (ret) + *count = 0; + + return ret; +} + +static int mp1_v13_0_dump_valid_bank(struct ras_core_context *ras_core, + u32 msg, u32 idx, u32 reg_idx, u64 *val) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev; + uint32_t data[2] = {0, 0}; + uint32_t param; + int ret = 0; + int i, offset; + u32 smu_msg = (msg == RAS_MP1_MSG_McaBankCeDumpDW) ? + SMU_MSG_McaBankCeDumpDW : SMU_MSG_McaBankDumpDW; + + if (down_read_trylock(&adev->reset_domain->sem)) { + offset = reg_idx * 8; + for (i = 0; i < ARRAY_SIZE(data); i++) { + param = ((idx & 0xffff) << 16) | ((offset + (i << 2)) & 0xfffc); + ret = amdgpu_smu_ras_send_msg(adev, smu_msg, param, &data[i]); + if (ret) { + RAS_DEV_ERR(adev, "ACA failed to read register[%d], offset:0x%x\n", + reg_idx, offset); + break; + } + } + up_read(&adev->reset_domain->sem); + + if (!ret) + *val = (uint64_t)data[1] << 32 | data[0]; + } else { + ret = -RAS_CORE_GPU_IN_MODE1_RESET; + } + + return ret; +} + +const struct ras_mp1_sys_func amdgpu_ras_mp1_sys_func_v13_0 = { + .mp1_get_valid_bank_count = mp1_v13_0_get_valid_bank_count, + .mp1_dump_valid_bank = mp1_v13_0_dump_valid_bank, +}; + diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mp1_v13_0.h b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mp1_v13_0.h new file mode 100644 index 000000000000..71c614ae1ae4 --- /dev/null +++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_mp1_v13_0.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef __AMDGPU_RAS_MP1_V13_0_H__ +#define __AMDGPU_RAS_MP1_V13_0_H__ +#include "ras.h" + +extern const struct ras_mp1_sys_func amdgpu_ras_mp1_sys_func_v13_0; + +#endif diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_nbio_v7_9.c b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_nbio_v7_9.c new file mode 100644 index 000000000000..2783f5875c7c --- /dev/null +++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_nbio_v7_9.c @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include "amdgpu_ras_mgr.h" +#include "amdgpu_ras_nbio_v7_9.h" +#include "nbio/nbio_7_9_0_offset.h" +#include "nbio/nbio_7_9_0_sh_mask.h" +#include "ivsrcid/nbio/irqsrcs_nbif_7_4.h" + +static int nbio_v7_9_set_ras_controller_irq_state(struct amdgpu_device *adev, + struct amdgpu_irq_src *src, + unsigned int type, + enum amdgpu_interrupt_state state) +{ + /* Dummy function, there is no initialization operation in driver */ + + return 0; +} + +static int nbio_v7_9_process_ras_controller_irq(struct amdgpu_device *adev, + struct amdgpu_irq_src *source, + struct amdgpu_iv_entry *entry) +{ + /* By design, the ih cookie for ras_controller_irq should be written + * to BIFring instead of general iv ring. However, due to known bif ring + * hw bug, it has to be disabled. There is no chance the process function + * will be involked. Just left it as a dummy one. + */ + return 0; +} + +static int nbio_v7_9_set_ras_err_event_athub_irq_state(struct amdgpu_device *adev, + struct amdgpu_irq_src *src, + unsigned int type, + enum amdgpu_interrupt_state state) +{ + /* Dummy function, there is no initialization operation in driver */ + + return 0; +} + +static int nbio_v7_9_process_err_event_athub_irq(struct amdgpu_device *adev, + struct amdgpu_irq_src *source, + struct amdgpu_iv_entry *entry) +{ + /* By design, the ih cookie for err_event_athub_irq should be written + * to BIFring instead of general iv ring. However, due to known bif ring + * hw bug, it has to be disabled. There is no chance the process function + * will be involked. Just left it as a dummy one. + */ + return 0; +} + +static const struct amdgpu_irq_src_funcs nbio_v7_9_ras_controller_irq_funcs = { + .set = nbio_v7_9_set_ras_controller_irq_state, + .process = nbio_v7_9_process_ras_controller_irq, +}; + +static const struct amdgpu_irq_src_funcs nbio_v7_9_ras_err_event_athub_irq_funcs = { + .set = nbio_v7_9_set_ras_err_event_athub_irq_state, + .process = nbio_v7_9_process_err_event_athub_irq, +}; + +static int nbio_v7_9_init_ras_controller_interrupt(struct ras_core_context *ras_core, bool state) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev; + int r; + + /* init the irq funcs */ + adev->nbio.ras_controller_irq.funcs = + &nbio_v7_9_ras_controller_irq_funcs; + adev->nbio.ras_controller_irq.num_types = 1; + + /* register ras controller interrupt */ + r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_BIF, + NBIF_7_4__SRCID__RAS_CONTROLLER_INTERRUPT, + &adev->nbio.ras_controller_irq); + + return r; +} + +static int nbio_v7_9_init_ras_err_event_athub_interrupt(struct ras_core_context *ras_core, + bool state) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev; + int r; + + /* init the irq funcs */ + adev->nbio.ras_err_event_athub_irq.funcs = + &nbio_v7_9_ras_err_event_athub_irq_funcs; + adev->nbio.ras_err_event_athub_irq.num_types = 1; + + /* register ras err event athub interrupt */ + r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_BIF, + NBIF_7_4__SRCID__ERREVENT_ATHUB_INTERRUPT, + &adev->nbio.ras_err_event_athub_irq); + + return r; +} + +const struct ras_nbio_sys_func amdgpu_ras_nbio_sys_func_v7_9 = { + .set_ras_controller_irq_state = nbio_v7_9_init_ras_controller_interrupt, + .set_ras_err_event_athub_irq_state = nbio_v7_9_init_ras_err_event_athub_interrupt, +}; diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_nbio_v7_9.h b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_nbio_v7_9.h new file mode 100644 index 000000000000..272259e9a0e7 --- /dev/null +++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_nbio_v7_9.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef __AMDGPU_RAS_NBIO_V7_9_H__ +#define __AMDGPU_RAS_NBIO_V7_9_H__ + +extern const struct ras_nbio_sys_func amdgpu_ras_nbio_sys_func_v7_9; + +#endif diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.c b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.c new file mode 100644 index 000000000000..5782c007de71 --- /dev/null +++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.c @@ -0,0 +1,190 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "amdgpu.h" +#include "amdgpu_reset.h" +#include "amdgpu_xgmi.h" +#include "ras_sys.h" +#include "amdgpu_ras_mgr.h" +#include "amdgpu_ras_process.h" + +#define RAS_MGR_RETIRE_PAGE_INTERVAL 100 +#define RAS_EVENT_PROCESS_TIMEOUT 1200 + +static void ras_process_retire_page_dwork(struct work_struct *work) +{ + struct amdgpu_ras_mgr *ras_mgr = + container_of(work, struct amdgpu_ras_mgr, retire_page_dwork.work); + struct amdgpu_device *adev = ras_mgr->adev; + int ret; + + if (amdgpu_ras_is_rma(adev)) + return; + + /* If gpu reset is ongoing, delay retiring the bad pages */ + if (amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev)) { + schedule_delayed_work(&ras_mgr->retire_page_dwork, + msecs_to_jiffies(RAS_MGR_RETIRE_PAGE_INTERVAL * 3)); + return; + } + + ret = ras_umc_handle_bad_pages(ras_mgr->ras_core, NULL); + if (!ret) + schedule_delayed_work(&ras_mgr->retire_page_dwork, + msecs_to_jiffies(RAS_MGR_RETIRE_PAGE_INTERVAL)); +} + +int amdgpu_ras_process_init(struct amdgpu_device *adev) +{ + struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); + + ras_mgr->is_paused = false; + init_completion(&ras_mgr->ras_event_done); + + INIT_DELAYED_WORK(&ras_mgr->retire_page_dwork, ras_process_retire_page_dwork); + + return 0; +} + +int amdgpu_ras_process_fini(struct amdgpu_device *adev) +{ + struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); + + ras_mgr->is_paused = false; + /* Save all cached bad pages to eeprom */ + flush_delayed_work(&ras_mgr->retire_page_dwork); + cancel_delayed_work_sync(&ras_mgr->retire_page_dwork); + return 0; +} + +int amdgpu_ras_process_handle_umc_interrupt(struct amdgpu_device *adev, void *data) +{ + struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); + + if (!ras_mgr->ras_core) + return -EINVAL; + + return ras_process_add_interrupt_req(ras_mgr->ras_core, NULL, true); +} + +int amdgpu_ras_process_handle_unexpected_interrupt(struct amdgpu_device *adev, void *data) +{ + amdgpu_ras_set_fed(adev, true); + return amdgpu_ras_mgr_reset_gpu(adev, AMDGPU_RAS_GPU_RESET_MODE1_RESET); +} + +int amdgpu_ras_process_handle_consumption_interrupt(struct amdgpu_device *adev, void *data) +{ + struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); + struct ras_ih_info *ih_info = (struct ras_ih_info *)data; + struct ras_event_req req; + uint64_t seqno; + + if (!ih_info) + return -EINVAL; + + memset(&req, 0, sizeof(req)); + req.block = ih_info->block; + req.data = ih_info->data; + req.pasid = ih_info->pasid; + req.pasid_fn = ih_info->pasid_fn; + req.reset = ih_info->reset; + + seqno = ras_core_get_seqno(ras_mgr->ras_core, + RAS_SEQNO_TYPE_POISON_CONSUMPTION, false); + + /* When the ACA register cannot be read from FW, the poison + * consumption seqno in the fifo will not pop up, so it is + * necessary to check whether the seqno is the previous seqno. + */ + if (seqno == ras_mgr->last_poison_consumption_seqno) { + /* Pop and discard the previous seqno */ + ras_core_get_seqno(ras_mgr->ras_core, + RAS_SEQNO_TYPE_POISON_CONSUMPTION, true); + seqno = ras_core_get_seqno(ras_mgr->ras_core, + RAS_SEQNO_TYPE_POISON_CONSUMPTION, false); + } + ras_mgr->last_poison_consumption_seqno = seqno; + req.seqno = seqno; + + return ras_process_add_interrupt_req(ras_mgr->ras_core, &req, false); +} + +int amdgpu_ras_process_begin(struct amdgpu_device *adev) +{ + struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); + + if (ras_mgr->is_paused) + return -EAGAIN; + + reinit_completion(&ras_mgr->ras_event_done); + return 0; +} + +int amdgpu_ras_process_end(struct amdgpu_device *adev) +{ + struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); + + complete(&ras_mgr->ras_event_done); + return 0; +} + +int amdgpu_ras_process_pre_reset(struct amdgpu_device *adev) +{ + struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); + long rc; + + if (!ras_mgr || !ras_mgr->ras_core) + return -EINVAL; + + if (!ras_mgr->ras_core->is_initialized) + return -EPERM; + + ras_mgr->is_paused = true; + + /* Wait for RAS event processing to complete */ + rc = wait_for_completion_interruptible_timeout(&ras_mgr->ras_event_done, + msecs_to_jiffies(RAS_EVENT_PROCESS_TIMEOUT)); + if (rc <= 0) + RAS_DEV_WARN(adev, "Waiting for ras process to complete %s\n", + rc ? "interrupted" : "timeout"); + + flush_delayed_work(&ras_mgr->retire_page_dwork); + return 0; +} + +int amdgpu_ras_process_post_reset(struct amdgpu_device *adev) +{ + struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); + + if (!ras_mgr || !ras_mgr->ras_core) + return -EINVAL; + + if (!ras_mgr->ras_core->is_initialized) + return -EPERM; + + ras_mgr->is_paused = false; + + schedule_delayed_work(&ras_mgr->retire_page_dwork, 0); + return 0; +} diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.h b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.h new file mode 100644 index 000000000000..d55cdaeac441 --- /dev/null +++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_process.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright (c) 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef __AMDGPU_RAS_PROCESS_H__ +#define __AMDGPU_RAS_PROCESS_H__ +#include "ras_process.h" +#include "amdgpu_ras_mgr.h" + +enum ras_ih_type; +int amdgpu_ras_process_init(struct amdgpu_device *adev); +int amdgpu_ras_process_fini(struct amdgpu_device *adev); +int amdgpu_ras_process_handle_umc_interrupt(struct amdgpu_device *adev, + void *data); +int amdgpu_ras_process_handle_unexpected_interrupt(struct amdgpu_device *adev, + void *data); +int amdgpu_ras_process_handle_consumption_interrupt(struct amdgpu_device *adev, + void *data); +int amdgpu_ras_process_begin(struct amdgpu_device *adev); +int amdgpu_ras_process_end(struct amdgpu_device *adev); +int amdgpu_ras_process_pre_reset(struct amdgpu_device *adev); +int amdgpu_ras_process_post_reset(struct amdgpu_device *adev); +#endif diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c new file mode 100644 index 000000000000..45ed8c3b5563 --- /dev/null +++ b/drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c @@ -0,0 +1,279 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ +#include "ras_sys.h" +#include "amdgpu_ras_mgr.h" +#include "amdgpu_ras.h" +#include "amdgpu_reset.h" + +static int amdgpu_ras_sys_detect_fatal_event(struct ras_core_context *ras_core, void *data) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev; + int ret; + uint64_t seq_no; + + ret = amdgpu_ras_global_ras_isr(adev); + if (ret) + return ret; + + seq_no = amdgpu_ras_mgr_gen_ras_event_seqno(adev, RAS_SEQNO_TYPE_UE); + RAS_DEV_INFO(adev, + "{%llu} Uncorrectable hardware error(ERREVENT_ATHUB_INTERRUPT) detected!\n", + seq_no); + + return amdgpu_ras_process_handle_unexpected_interrupt(adev, data); +} + +static int amdgpu_ras_sys_poison_consumption_event(struct ras_core_context *ras_core, + void *data) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev; + struct ras_event_req *req = (struct ras_event_req *)data; + pasid_notify pasid_fn; + + if (!req) + return -EINVAL; + + if (req->pasid_fn) { + pasid_fn = (pasid_notify)req->pasid_fn; + pasid_fn(adev, req->pasid, req->data); + } + + return 0; +} + +static int amdgpu_ras_sys_gen_seqno(struct ras_core_context *ras_core, + enum ras_seqno_type seqno_type, uint64_t *seqno) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev; + struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(adev); + struct ras_event_manager *event_mgr; + struct ras_event_state *event_state; + struct amdgpu_hive_info *hive; + enum ras_event_type event_type; + uint64_t seq_no; + + if (!ras_mgr || !seqno || + (seqno_type >= RAS_SEQNO_TYPE_COUNT_MAX)) + return -EINVAL; + + switch (seqno_type) { + case RAS_SEQNO_TYPE_UE: + event_type = RAS_EVENT_TYPE_FATAL; + break; + case RAS_SEQNO_TYPE_CE: + case RAS_SEQNO_TYPE_DE: + event_type = RAS_EVENT_TYPE_POISON_CREATION; + break; + case RAS_SEQNO_TYPE_POISON_CONSUMPTION: + event_type = RAS_EVENT_TYPE_POISON_CONSUMPTION; + break; + default: + event_type = RAS_EVENT_TYPE_INVALID; + break; + } + + hive = amdgpu_get_xgmi_hive(adev); + event_mgr = hive ? &hive->event_mgr : &ras_mgr->ras_event_mgr; + event_state = &event_mgr->event_state[event_type]; + if ((event_type == RAS_EVENT_TYPE_FATAL) && amdgpu_ras_in_recovery(adev)) { + seq_no = event_state->last_seqno; + } else { + seq_no = atomic64_inc_return(&event_mgr->seqno); + event_state->last_seqno = seq_no; + atomic64_inc(&event_state->count); + } + amdgpu_put_xgmi_hive(hive); + + *seqno = seq_no; + return 0; + +} + +static int amdgpu_ras_sys_event_notifier(struct ras_core_context *ras_core, + enum ras_notify_event event_id, void *data) +{ + struct amdgpu_ras_mgr *ras_mgr = amdgpu_ras_mgr_get_context(ras_core->dev); + int ret = 0; + + switch (event_id) { + case RAS_EVENT_ID__BAD_PAGE_DETECTED: + schedule_delayed_work(&ras_mgr->retire_page_dwork, 0); + break; + case RAS_EVENT_ID__POISON_CONSUMPTION: + amdgpu_ras_sys_poison_consumption_event(ras_core, data); + break; + case RAS_EVENT_ID__RESERVE_BAD_PAGE: + ret = amdgpu_ras_reserve_page(ras_core->dev, *(uint64_t *)data); + break; + case RAS_EVENT_ID__FATAL_ERROR_DETECTED: + ret = amdgpu_ras_sys_detect_fatal_event(ras_core, data); + break; + case RAS_EVENT_ID__UPDATE_BAD_PAGE_NUM: + ret = amdgpu_dpm_send_hbm_bad_pages_num(ras_core->dev, *(uint32_t *)data); + break; + case RAS_EVENT_ID__UPDATE_BAD_CHANNEL_BITMAP: + ret = amdgpu_dpm_send_hbm_bad_channel_flag(ras_core->dev, *(uint32_t *)data); + break; + case RAS_EVENT_ID__DEVICE_RMA: + ras_log_ring_add_log_event(ras_core, RAS_LOG_EVENT_RMA, NULL, NULL); + ret = amdgpu_dpm_send_rma_reason(ras_core->dev); + break; + case RAS_EVENT_ID__RESET_GPU: + ret = amdgpu_ras_mgr_reset_gpu(ras_core->dev, *(uint32_t *)data); + break; + case RAS_EVENT_ID__RAS_EVENT_PROC_BEGIN: + ret = amdgpu_ras_process_begin(ras_core->dev); + break; + case RAS_EVENT_ID__RAS_EVENT_PROC_END: + ret = amdgpu_ras_process_end(ras_core->dev); + break; + default: + RAS_DEV_WARN(ras_core->dev, "Invalid ras notify event:%d\n", event_id); + break; + } + + return ret; +} + +static u64 amdgpu_ras_sys_get_utc_second_timestamp(struct ras_core_context *ras_core) +{ + return ktime_get_real_seconds(); +} + +static int amdgpu_ras_sys_check_gpu_status(struct ras_core_context *ras_core, + uint32_t *status) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev; + uint32_t gpu_status = 0; + + if (amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev)) + gpu_status |= RAS_GPU_STATUS__IN_RESET; + + if (amdgpu_sriov_vf(adev)) + gpu_status |= RAS_GPU_STATUS__IS_VF; + + *status = gpu_status; + + return 0; +} + +static int amdgpu_ras_sys_get_device_system_info(struct ras_core_context *ras_core, + struct device_system_info *dev_info) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev; + + dev_info->device_id = adev->pdev->device; + dev_info->vendor_id = adev->pdev->vendor; + dev_info->socket_id = adev->smuio.funcs->get_socket_id(adev); + + return 0; +} + +static int amdgpu_ras_sys_gpu_reset_lock(struct ras_core_context *ras_core, + bool down, bool try) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev; + int ret = 0; + + if (down && try) + ret = down_read_trylock(&adev->reset_domain->sem); + else if (down) + down_read(&adev->reset_domain->sem); + else + up_read(&adev->reset_domain->sem); + + return ret; +} + +static bool amdgpu_ras_sys_detect_ras_interrupt(struct ras_core_context *ras_core) +{ + return !!atomic_read(&amdgpu_ras_in_intr); +} + +static int amdgpu_ras_sys_get_gpu_mem(struct ras_core_context *ras_core, + enum gpu_mem_type mem_type, struct gpu_mem_block *gpu_mem) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)ras_core->dev; + struct psp_context *psp = &adev->psp; + struct psp_ring *psp_ring; + struct ta_mem_context *mem_ctx; + + if (mem_type == GPU_MEM_TYPE_RAS_PSP_RING) { + psp_ring = &psp->km_ring; + gpu_mem->mem_bo = adev->firmware.rbuf; + gpu_mem->mem_size = psp_ring->ring_size; + gpu_mem->mem_mc_addr = psp_ring->ring_mem_mc_addr; + gpu_mem->mem_cpu_addr = psp_ring->ring_mem; + } else if (mem_type == GPU_MEM_TYPE_RAS_PSP_CMD) { + gpu_mem->mem_bo = psp->cmd_buf_bo; + gpu_mem->mem_size = PSP_CMD_BUFFER_SIZE; + gpu_mem->mem_mc_addr = psp->cmd_buf_mc_addr; + gpu_mem->mem_cpu_addr = psp->cmd_buf_mem; + } else if (mem_type == GPU_MEM_TYPE_RAS_PSP_FENCE) { + gpu_mem->mem_bo = psp->fence_buf_bo; + gpu_mem->mem_size = PSP_FENCE_BUFFER_SIZE; + gpu_mem->mem_mc_addr = psp->fence_buf_mc_addr; + gpu_mem->mem_cpu_addr = psp->fence_buf; + } else if (mem_type == GPU_MEM_TYPE_RAS_TA_FW) { + gpu_mem->mem_bo = psp->fw_pri_bo; + gpu_mem->mem_size = PSP_1_MEG; + gpu_mem->mem_mc_addr = psp->fw_pri_mc_addr; + gpu_mem->mem_cpu_addr = psp->fw_pri_buf; + } else if (mem_type == GPU_MEM_TYPE_RAS_TA_CMD) { + mem_ctx = &psp->ras_context.context.mem_context; + gpu_mem->mem_bo = mem_ctx->shared_bo; + gpu_mem->mem_size = mem_ctx->shared_mem_size; + gpu_mem->mem_mc_addr = mem_ctx->shared_mc_addr; + gpu_mem->mem_cpu_addr = mem_ctx->shared_buf; + } else { + return -EINVAL; + } + + if (!gpu_mem->mem_bo || !gpu_mem->mem_size || + !gpu_mem->mem_mc_addr || !gpu_mem->mem_cpu_addr) { + RAS_DEV_ERR(ras_core->dev, "The ras psp gpu memory is invalid!\n"); + return -ENOMEM; + } + + return 0; +} + +static int amdgpu_ras_sys_put_gpu_mem(struct ras_core_context *ras_core, + enum gpu_mem_type mem_type, struct gpu_mem_block *gpu_mem) +{ + + return 0; +} + +const struct ras_sys_func amdgpu_ras_sys_fn = { + .ras_notifier = amdgpu_ras_sys_event_notifier, + .get_utc_second_timestamp = amdgpu_ras_sys_get_utc_second_timestamp, + .gen_seqno = amdgpu_ras_sys_gen_seqno, + .check_gpu_status = amdgpu_ras_sys_check_gpu_status, + .get_device_system_info = amdgpu_ras_sys_get_device_system_info, + .gpu_reset_lock = amdgpu_ras_sys_gpu_reset_lock, + .detect_ras_interrupt = amdgpu_ras_sys_detect_ras_interrupt, + .get_gpu_mem = amdgpu_ras_sys_get_gpu_mem, + .put_gpu_mem = amdgpu_ras_sys_put_gpu_mem, +}; diff --git a/drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h b/drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h new file mode 100644 index 000000000000..8156531a7b63 --- /dev/null +++ b/drivers/gpu/drm/amd/ras/ras_mgr/ras_sys.h @@ -0,0 +1,110 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef __RAS_SYS_H__ +#define __RAS_SYS_H__ +#include <linux/stdarg.h> +#include <linux/printk.h> +#include <linux/dev_printk.h> +#include <linux/mempool.h> +#include "amdgpu.h" + +#define RAS_DEV_ERR(device, fmt, ...) \ + do { \ + if (device) \ + dev_err(((struct amdgpu_device *)device)->dev, fmt, ##__VA_ARGS__); \ + else \ + printk(KERN_ERR fmt, ##__VA_ARGS__); \ + } while (0) + +#define RAS_DEV_WARN(device, fmt, ...) \ + do { \ + if (device) \ + dev_warn(((struct amdgpu_device *)device)->dev, fmt, ##__VA_ARGS__); \ + else \ + printk(KERN_WARNING fmt, ##__VA_ARGS__); \ + } while (0) + +#define RAS_DEV_INFO(device, fmt, ...) \ + do { \ + if (device) \ + dev_info(((struct amdgpu_device *)device)->dev, fmt, ##__VA_ARGS__); \ + else \ + printk(KERN_INFO fmt, ##__VA_ARGS__); \ + } while (0) + +#define RAS_DEV_DBG(device, fmt, ...) \ + do { \ + if (device) \ + dev_dbg(((struct amdgpu_device *)device)->dev, fmt, ##__VA_ARGS__); \ + else \ + printk(KERN_DEBUG fmt, ##__VA_ARGS__); \ + } while (0) + +#define RAS_INFO(fmt, ...) printk(KERN_INFO fmt, ##__VA_ARGS__) + +#define RAS_DEV_RREG32_SOC15(dev, ip, inst, reg) \ +({ \ + struct amdgpu_device *adev = (struct amdgpu_device *)dev; \ + __RREG32_SOC15_RLC__(adev->reg_offset[ip##_HWIP][inst][reg##_BASE_IDX] + reg, \ + 0, ip##_HWIP, inst); \ +}) + +#define RAS_DEV_WREG32_SOC15(dev, ip, inst, reg, value) \ +({ \ + struct amdgpu_device *adev = (struct amdgpu_device *)dev; \ + __WREG32_SOC15_RLC__((adev->reg_offset[ip##_HWIP][inst][reg##_BASE_IDX] + reg), \ + value, 0, ip##_HWIP, inst); \ +}) + +/* GET_INST returns the physical instance corresponding to a logical instance */ +#define RAS_GET_INST(dev, ip, inst) \ +({ \ + struct amdgpu_device *adev = (struct amdgpu_device *)dev; \ + adev->ip_map.logical_to_dev_inst ? \ + adev->ip_map.logical_to_dev_inst(adev, ip##_HWIP, inst) : inst; \ +}) + +#define RAS_GET_MASK(dev, ip, mask) \ +({ \ + struct amdgpu_device *adev = (struct amdgpu_device *)dev; \ + (adev->ip_map.logical_to_dev_mask ? \ + adev->ip_map.logical_to_dev_mask(adev, ip##_HWIP, mask) : mask); \ +}) + +static inline void *ras_radix_tree_delete_iter(struct radix_tree_root *root, void *iter) +{ + return radix_tree_delete(root, ((struct radix_tree_iter *)iter)->index); +} + +static inline long ras_wait_event_interruptible_timeout(void *wq_head, + int (*condition)(void *param), void *param, unsigned int timeout) +{ + return wait_event_interruptible_timeout(*(wait_queue_head_t *)wq_head, + condition(param), timeout); +} + +extern const struct ras_sys_func amdgpu_ras_sys_fn; + +#endif diff --git a/drivers/gpu/drm/amd/ras/rascore/Makefile b/drivers/gpu/drm/amd/ras/rascore/Makefile index e69de29bb2d1..e826a1f86424 100644 --- a/drivers/gpu/drm/amd/ras/rascore/Makefile +++ b/drivers/gpu/drm/amd/ras/rascore/Makefile @@ -0,0 +1,44 @@ +# +# Copyright 2025 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# +RAS_CORE_FILES = ras_core.o \ + ras_mp1.o \ + ras_mp1_v13_0.o \ + ras_aca.o \ + ras_aca_v1_0.o \ + ras_eeprom.o \ + ras_umc.o \ + ras_umc_v12_0.o \ + ras_cmd.o \ + ras_gfx.o \ + ras_gfx_v9_0.o \ + ras_process.o \ + ras_nbio.o \ + ras_nbio_v7_9.o \ + ras_log_ring.o \ + ras_cper.o \ + ras_psp.o \ + ras_psp_v13_0.o + + +RAS_CORE = $(addprefix $(AMD_GPU_RAS_PATH)/rascore/,$(RAS_CORE_FILES)) + +AMD_GPU_RAS_FILES += $(RAS_CORE) diff --git a/drivers/gpu/drm/amd/ras/rascore/ras.h b/drivers/gpu/drm/amd/ras/rascore/ras.h new file mode 100644 index 000000000000..3396b2e0949d --- /dev/null +++ b/drivers/gpu/drm/amd/ras/rascore/ras.h @@ -0,0 +1,370 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef __RAS_H__ +#define __RAS_H__ +#include "ras_sys.h" +#include "ras_umc.h" +#include "ras_aca.h" +#include "ras_eeprom.h" +#include "ras_core_status.h" +#include "ras_process.h" +#include "ras_gfx.h" +#include "ras_cmd.h" +#include "ras_nbio.h" +#include "ras_mp1.h" +#include "ras_psp.h" +#include "ras_log_ring.h" + +#define RAS_HW_ERR "[Hardware Error]: " + +#define RAS_GPU_PAGE_SHIFT 12 +#define RAS_ADDR_TO_PFN(addr) ((addr) >> RAS_GPU_PAGE_SHIFT) +#define RAS_PFN_TO_ADDR(pfn) ((pfn) << RAS_GPU_PAGE_SHIFT) + +#define RAS_CORE_RESET_GPU 0x10000 + +#define GPU_RESET_CAUSE_POISON (RAS_CORE_RESET_GPU | 0x0001) +#define GPU_RESET_CAUSE_FATAL (RAS_CORE_RESET_GPU | 0x0002) +#define GPU_RESET_CAUSE_RMA (RAS_CORE_RESET_GPU | 0x0004) + +enum ras_block_id { + RAS_BLOCK_ID__UMC = 0, + RAS_BLOCK_ID__SDMA, + RAS_BLOCK_ID__GFX, + RAS_BLOCK_ID__MMHUB, + RAS_BLOCK_ID__ATHUB, + RAS_BLOCK_ID__PCIE_BIF, + RAS_BLOCK_ID__HDP, + RAS_BLOCK_ID__XGMI_WAFL, + RAS_BLOCK_ID__DF, + RAS_BLOCK_ID__SMN, + RAS_BLOCK_ID__SEM, + RAS_BLOCK_ID__MP0, + RAS_BLOCK_ID__MP1, + RAS_BLOCK_ID__FUSE, + RAS_BLOCK_ID__MCA, + RAS_BLOCK_ID__VCN, + RAS_BLOCK_ID__JPEG, + RAS_BLOCK_ID__IH, + RAS_BLOCK_ID__MPIO, + + RAS_BLOCK_ID__LAST +}; + +enum ras_ecc_err_type { + RAS_ECC_ERR__NONE = 0, + RAS_ECC_ERR__PARITY = 1, + RAS_ECC_ERR__SINGLE_CORRECTABLE = 2, + RAS_ECC_ERR__MULTI_UNCORRECTABLE = 4, + RAS_ECC_ERR__POISON = 8, +}; + +enum ras_err_type { + RAS_ERR_TYPE__UE = 0, + RAS_ERR_TYPE__CE, + RAS_ERR_TYPE__DE, + RAS_ERR_TYPE__LAST +}; + +enum ras_seqno_type { + RAS_SEQNO_TYPE_INVALID = 0, + RAS_SEQNO_TYPE_UE, + RAS_SEQNO_TYPE_CE, + RAS_SEQNO_TYPE_DE, + RAS_SEQNO_TYPE_POISON_CONSUMPTION, + RAS_SEQNO_TYPE_COUNT_MAX, +}; + +enum ras_seqno_fifo { + SEQNO_FIFO_INVALID = 0, + SEQNO_FIFO_POISON_CREATION, + SEQNO_FIFO_POISON_CONSUMPTION, + SEQNO_FIFO_COUNT_MAX +}; + +enum ras_notify_event { + RAS_EVENT_ID__NONE, + RAS_EVENT_ID__BAD_PAGE_DETECTED, + RAS_EVENT_ID__POISON_CONSUMPTION, + RAS_EVENT_ID__RESERVE_BAD_PAGE, + RAS_EVENT_ID__DEVICE_RMA, + RAS_EVENT_ID__UPDATE_BAD_PAGE_NUM, + RAS_EVENT_ID__UPDATE_BAD_CHANNEL_BITMAP, + RAS_EVENT_ID__FATAL_ERROR_DETECTED, + RAS_EVENT_ID__RESET_GPU, + RAS_EVENT_ID__RESET_VF, + RAS_EVENT_ID__RAS_EVENT_PROC_BEGIN, + RAS_EVENT_ID__RAS_EVENT_PROC_END, +}; + +enum ras_gpu_status { + RAS_GPU_STATUS__NOT_READY = 0, + RAS_GPU_STATUS__READY = 0x1, + RAS_GPU_STATUS__IN_RESET = 0x2, + RAS_GPU_STATUS__IS_RMA = 0x4, + RAS_GPU_STATUS__IS_VF = 0x8, +}; + +struct ras_core_context; +struct ras_bank_ecc; +struct ras_umc; +struct ras_aca; +struct ras_process; +struct ras_nbio; +struct ras_log_ring; +struct ras_psp; + +struct ras_mp1_sys_func { + int (*mp1_get_valid_bank_count)(struct ras_core_context *ras_core, + u32 msg, u32 *count); + int (*mp1_dump_valid_bank)(struct ras_core_context *ras_core, + u32 msg, u32 idx, u32 reg_idx, u64 *val); +}; + +struct ras_eeprom_sys_func { + int (*eeprom_i2c_xfer)(struct ras_core_context *ras_core, + u32 eeprom_addr, u8 *eeprom_buf, u32 buf_size, bool read); + int (*update_eeprom_i2c_config)(struct ras_core_context *ras_core); +}; + +struct ras_nbio_sys_func { + int (*set_ras_controller_irq_state)(struct ras_core_context *ras_core, + bool state); + int (*set_ras_err_event_athub_irq_state)(struct ras_core_context *ras_core, + bool state); +}; + +struct ras_time { + int tm_sec; + int tm_min; + int tm_hour; + int tm_mday; + int tm_mon; + long tm_year; +}; + +struct device_system_info { + uint32_t device_id; + uint32_t vendor_id; + uint32_t socket_id; +}; + +enum gpu_mem_type { + GPU_MEM_TYPE_DEFAULT, + GPU_MEM_TYPE_RAS_PSP_RING, + GPU_MEM_TYPE_RAS_PSP_CMD, + GPU_MEM_TYPE_RAS_PSP_FENCE, + GPU_MEM_TYPE_RAS_TA_FW, + GPU_MEM_TYPE_RAS_TA_CMD, +}; + +struct ras_psp_sys_func { + int (*get_ras_psp_system_status)(struct ras_core_context *ras_core, + struct ras_psp_sys_status *status); + int (*get_ras_ta_init_param)(struct ras_core_context *ras_core, + struct ras_ta_init_param *ras_ta_param); +}; + +struct ras_sys_func { + int (*gpu_reset_lock)(struct ras_core_context *ras_core, + bool down, bool try); + int (*check_gpu_status)(struct ras_core_context *ras_core, + uint32_t *status); + int (*gen_seqno)(struct ras_core_context *ras_core, + enum ras_seqno_type seqno_type, uint64_t *seqno); + int (*async_handle_ras_event)(struct ras_core_context *ras_core, void *data); + int (*ras_notifier)(struct ras_core_context *ras_core, + enum ras_notify_event event_id, void *data); + u64 (*get_utc_second_timestamp)(struct ras_core_context *ras_core); + int (*get_device_system_info)(struct ras_core_context *ras_core, + struct device_system_info *dev_info); + bool (*detect_ras_interrupt)(struct ras_core_context *ras_core); + int (*get_gpu_mem)(struct ras_core_context *ras_core, + enum gpu_mem_type mem_type, struct gpu_mem_block *gpu_mem); + int (*put_gpu_mem)(struct ras_core_context *ras_core, + enum gpu_mem_type mem_type, struct gpu_mem_block *gpu_mem); +}; + +struct ras_ecc_count { + uint64_t new_ce_count; + uint64_t total_ce_count; + uint64_t new_ue_count; + uint64_t total_ue_count; + uint64_t new_de_count; + uint64_t total_de_count; +}; + +struct ras_bank_ecc { + uint32_t nps; + uint64_t seq_no; + uint64_t status; + uint64_t ipid; + uint64_t addr; +}; + +struct ras_bank_ecc_node { + struct list_head node; + struct ras_bank_ecc ecc; +}; + +struct ras_aca_config { + u32 socket_num_per_hive; + u32 aid_num_per_socket; + u32 xcd_num_per_aid; +}; + +struct ras_mp1_config { + const struct ras_mp1_sys_func *mp1_sys_fn; +}; + +struct ras_nbio_config { + const struct ras_nbio_sys_func *nbio_sys_fn; +}; + +struct ras_psp_config { + const struct ras_psp_sys_func *psp_sys_fn; +}; + +struct ras_umc_config { + uint32_t umc_vram_type; +}; + +struct ras_eeprom_config { + const struct ras_eeprom_sys_func *eeprom_sys_fn; + int eeprom_record_threshold_config; + uint32_t eeprom_record_threshold_count; + void *eeprom_i2c_adapter; + u32 eeprom_i2c_addr; + u32 eeprom_i2c_port; + u16 max_i2c_read_len; + u16 max_i2c_write_len; +}; + +struct ras_core_config { + u32 aca_ip_version; + u32 umc_ip_version; + u32 mp1_ip_version; + u32 gfx_ip_version; + u32 nbio_ip_version; + u32 psp_ip_version; + + bool poison_supported; + bool ras_eeprom_supported; + const struct ras_sys_func *sys_fn; + + struct ras_aca_config aca_cfg; + struct ras_mp1_config mp1_cfg; + struct ras_nbio_config nbio_cfg; + struct ras_psp_config psp_cfg; + struct ras_eeprom_config eeprom_cfg; + struct ras_umc_config umc_cfg; +}; + +struct ras_core_context { + void *dev; + struct ras_core_config *config; + u32 socket_num_per_hive; + u32 aid_num_per_socket; + u32 xcd_num_per_aid; + int max_ue_banks_per_query; + int max_ce_banks_per_query; + struct ras_aca ras_aca; + + bool ras_eeprom_supported; + struct ras_eeprom_control ras_eeprom; + + struct ras_psp ras_psp; + struct ras_umc ras_umc; + struct ras_nbio ras_nbio; + struct ras_gfx ras_gfx; + struct ras_mp1 ras_mp1; + struct ras_process ras_proc; + struct ras_cmd_mgr ras_cmd; + struct ras_log_ring ras_log_ring; + + const struct ras_sys_func *sys_fn; + + /* is poison mode supported */ + bool poison_supported; + + bool is_rma; + bool is_initialized; + + struct kfifo de_seqno_fifo; + struct kfifo consumption_seqno_fifo; + spinlock_t seqno_lock; + + bool ras_core_enabled; +}; + +struct ras_core_context *ras_core_create(struct ras_core_config *init_config); +void ras_core_destroy(struct ras_core_context *ras_core); +int ras_core_sw_init(struct ras_core_context *ras_core); +int ras_core_sw_fini(struct ras_core_context *ras_core); +int ras_core_hw_init(struct ras_core_context *ras_core); +int ras_core_hw_fini(struct ras_core_context *ras_core); +bool ras_core_is_ready(struct ras_core_context *ras_core); +uint64_t ras_core_gen_seqno(struct ras_core_context *ras_core, + enum ras_seqno_type seqno_type); +uint64_t ras_core_get_seqno(struct ras_core_context *ras_core, + enum ras_seqno_type seqno_type, bool pop); + +int ras_core_put_seqno(struct ras_core_context *ras_core, + enum ras_seqno_type seqno_type, uint64_t seqno); + +int ras_core_update_ecc_info(struct ras_core_context *ras_core); +int ras_core_query_block_ecc_data(struct ras_core_context *ras_core, + enum ras_block_id block, struct ras_ecc_count *ecc_count); + +bool ras_core_gpu_in_reset(struct ras_core_context *ras_core); +bool ras_core_gpu_is_rma(struct ras_core_context *ras_core); +bool ras_core_gpu_is_vf(struct ras_core_context *ras_core); +bool ras_core_handle_nbio_irq(struct ras_core_context *ras_core, void *data); +int ras_core_handle_fatal_error(struct ras_core_context *ras_core); + +uint32_t ras_core_get_curr_nps_mode(struct ras_core_context *ras_core); +const char *ras_core_get_ras_block_name(enum ras_block_id block_id); +int ras_core_convert_timestamp_to_time(struct ras_core_context *ras_core, + uint64_t timestamp, struct ras_time *tm); + +int ras_core_set_status(struct ras_core_context *ras_core, bool enable); +bool ras_core_is_enabled(struct ras_core_context *ras_core); +uint64_t ras_core_get_utc_second_timestamp(struct ras_core_context *ras_core); +int ras_core_translate_soc_pa_and_bank(struct ras_core_context *ras_core, + uint64_t *soc_pa, struct umc_bank_addr *bank_addr, bool bank_to_pa); +bool ras_core_ras_interrupt_detected(struct ras_core_context *ras_core); +int ras_core_get_gpu_mem(struct ras_core_context *ras_core, + enum gpu_mem_type mem_type, struct gpu_mem_block *gpu_mem); +int ras_core_put_gpu_mem(struct ras_core_context *ras_core, + enum gpu_mem_type mem_type, struct gpu_mem_block *gpu_mem); +bool ras_core_check_safety_watermark(struct ras_core_context *ras_core); +int ras_core_down_trylock_gpu_reset_lock(struct ras_core_context *ras_core); +void ras_core_down_gpu_reset_lock(struct ras_core_context *ras_core); +void ras_core_up_gpu_reset_lock(struct ras_core_context *ras_core); +int ras_core_event_notify(struct ras_core_context *ras_core, + enum ras_notify_event event_id, void *data); +int ras_core_get_device_system_info(struct ras_core_context *ras_core, + struct device_system_info *dev_info); +#endif diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_aca.c b/drivers/gpu/drm/amd/ras/rascore/ras_aca.c new file mode 100644 index 000000000000..e433c70d2989 --- /dev/null +++ b/drivers/gpu/drm/amd/ras/rascore/ras_aca.c @@ -0,0 +1,672 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ +#include "ras.h" +#include "ras_aca.h" +#include "ras_aca_v1_0.h" +#include "ras_mp1_v13_0.h" + +#define ACA_MARK_FATAL_FLAG 0x100 +#define ACA_MARK_UE_READ_FLAG 0x1 + +#define blk_name(block_id) ras_core_get_ras_block_name(block_id) + +static struct aca_regs_dump { + const char *name; + int reg_idx; +} aca_regs[] = { + {"CONTROL", ACA_REG_IDX__CTL}, + {"STATUS", ACA_REG_IDX__STATUS}, + {"ADDR", ACA_REG_IDX__ADDR}, + {"MISC", ACA_REG_IDX__MISC0}, + {"CONFIG", ACA_REG_IDX__CONFG}, + {"IPID", ACA_REG_IDX__IPID}, + {"SYND", ACA_REG_IDX__SYND}, + {"DESTAT", ACA_REG_IDX__DESTAT}, + {"DEADDR", ACA_REG_IDX__DEADDR}, + {"CONTROL_MASK", ACA_REG_IDX__CTL_MASK}, +}; + + +static void aca_report_ecc_info(struct ras_core_context *ras_core, + u64 seq_no, u32 blk, u32 skt, u32 aid, + struct aca_aid_ecc *aid_ecc, + struct aca_bank_ecc *new_ecc) +{ + struct aca_ecc_count ecc_count = {0}; + + ecc_count.new_ue_count = new_ecc->ue_count; + ecc_count.new_de_count = new_ecc->de_count; + ecc_count.new_ce_count = new_ecc->ce_count; + if (blk == RAS_BLOCK_ID__GFX) { + struct aca_ecc_count *xcd_ecc; + int xcd_id; + + for (xcd_id = 0; xcd_id < aid_ecc->xcd.xcd_num; xcd_id++) { + xcd_ecc = &aid_ecc->xcd.xcd[xcd_id].ecc_err; + ecc_count.total_ue_count += xcd_ecc->total_ue_count; + ecc_count.total_de_count += xcd_ecc->total_de_count; + ecc_count.total_ce_count += xcd_ecc->total_ce_count; + } + } else { + ecc_count.total_ue_count = aid_ecc->ecc_err.total_ue_count; + ecc_count.total_de_count = aid_ecc->ecc_err.total_de_count; + ecc_count.total_ce_count = aid_ecc->ecc_err.total_ce_count; + } + + if (ecc_count.new_ue_count) { + RAS_DEV_INFO(ras_core->dev, + "{%llu} socket: %d, die: %d, %u new uncorrectable hardware errors detected in %s block\n", + seq_no, skt, aid, ecc_count.new_ue_count, blk_name(blk)); + RAS_DEV_INFO(ras_core->dev, + "{%llu} socket: %d, die: %d, %u uncorrectable hardware errors detected in total in %s block\n", + seq_no, skt, aid, ecc_count.total_ue_count, blk_name(blk)); + } + + if (ecc_count.new_de_count) { + RAS_DEV_INFO(ras_core->dev, + "{%llu} socket: %d, die: %d, %u new %s detected in %s block\n", + seq_no, skt, aid, ecc_count.new_de_count, + (blk == RAS_BLOCK_ID__UMC) ? + "deferred hardware errors" : "poison consumption", + blk_name(blk)); + RAS_DEV_INFO(ras_core->dev, + "{%llu} socket: %d, die: %d, %u %s detected in total in %s block\n", + seq_no, skt, aid, ecc_count.total_de_count, + (blk == RAS_BLOCK_ID__UMC) ? + "deferred hardware errors" : "poison consumption", + blk_name(blk)); + } + + if (ecc_count.new_ce_count) { + RAS_DEV_INFO(ras_core->dev, + "{%llu} socket: %d, die: %d, %u new correctable hardware errors detected in %s block\n", + seq_no, skt, aid, ecc_count.new_ce_count, blk_name(blk)); + RAS_DEV_INFO(ras_core->dev, + "{%llu} socket: %d, die: %d, %u correctable hardware errors detected in total in %s block\n", + seq_no, skt, aid, ecc_count.total_ce_count, blk_name(blk)); + } +} + +static void aca_bank_log(struct ras_core_context *ras_core, + int idx, int total, struct aca_bank_reg *bank, + struct aca_bank_ecc *bank_ecc) +{ + int i; + + RAS_DEV_INFO(ras_core->dev, + "{%llu}" RAS_HW_ERR "Accelerator Check Architecture events logged\n", + bank->seq_no); + /* plus 1 for output format, e.g: ACA[08/08]: xxxx */ + for (i = 0; i < ARRAY_SIZE(aca_regs); i++) + RAS_DEV_INFO(ras_core->dev, + "{%llu}" RAS_HW_ERR "ACA[%02d/%02d].%s=0x%016llx\n", + bank->seq_no, idx + 1, total, + aca_regs[i].name, bank->regs[aca_regs[i].reg_idx]); +} + +static void aca_log_bank_data(struct ras_core_context *ras_core, + struct aca_bank_reg *bank, struct aca_bank_ecc *bank_ecc, + struct ras_log_batch_tag *batch) +{ + if (bank_ecc->ue_count) + ras_log_ring_add_log_event(ras_core, RAS_LOG_EVENT_UE, bank->regs, batch); + else if (bank_ecc->de_count) + ras_log_ring_add_log_event(ras_core, RAS_LOG_EVENT_DE, bank->regs, batch); + else + ras_log_ring_add_log_event(ras_core, RAS_LOG_EVENT_CE, bank->regs, batch); +} + +static int aca_get_bank_count(struct ras_core_context *ras_core, + enum ras_err_type type, u32 *count) +{ + return ras_mp1_get_bank_count(ras_core, type, count); +} + +static bool aca_match_bank(struct aca_block *aca_blk, struct aca_bank_reg *bank) +{ + const struct aca_bank_hw_ops *bank_ops; + + if (!aca_blk->blk_info) + return false; + + bank_ops = &aca_blk->blk_info->bank_ops; + if (!bank_ops->bank_match) + return false; + + return bank_ops->bank_match(aca_blk, bank); +} + +static int aca_parse_bank(struct ras_core_context *ras_core, + struct aca_block *aca_blk, + struct aca_bank_reg *bank, + struct aca_bank_ecc *ecc) +{ + const struct aca_bank_hw_ops *bank_ops = &aca_blk->blk_info->bank_ops; + + if (!bank_ops || !bank_ops->bank_parse) + return -RAS_CORE_NOT_SUPPORTED; + + return bank_ops->bank_parse(ras_core, aca_blk, bank, ecc); +} + +static int aca_check_block_ecc_info(struct ras_core_context *ras_core, + struct aca_block *aca_blk, struct aca_ecc_info *info) +{ + if (info->socket_id >= aca_blk->ecc.socket_num_per_hive) { + RAS_DEV_ERR(ras_core->dev, + "Socket id (%d) is out of config! max:%u\n", + info->socket_id, aca_blk->ecc.socket_num_per_hive); + return -ENODATA; + } + + if (info->die_id >= aca_blk->ecc.socket[info->socket_id].aid_num) { + RAS_DEV_ERR(ras_core->dev, + "Die id (%d) is out of config! max:%u\n", + info->die_id, aca_blk->ecc.socket[info->socket_id].aid_num); + return -ENODATA; + } + + if ((aca_blk->blk_info->ras_block_id == RAS_BLOCK_ID__GFX) && + (info->xcd_id >= + aca_blk->ecc.socket[info->socket_id].aid[info->die_id].xcd.xcd_num)) { + RAS_DEV_ERR(ras_core->dev, + "Xcd id (%d) is out of config! max:%u\n", + info->xcd_id, + aca_blk->ecc.socket[info->socket_id].aid[info->die_id].xcd.xcd_num); + return -ENODATA; + } + + return 0; +} + +static int aca_log_bad_bank(struct ras_core_context *ras_core, + struct aca_block *aca_blk, struct aca_bank_reg *bank, + struct aca_bank_ecc *bank_ecc) +{ + struct aca_ecc_info *info; + struct aca_ecc_count *ecc_err; + struct aca_aid_ecc *aid_ecc; + int ret; + + info = &bank_ecc->bank_info; + + ret = aca_check_block_ecc_info(ras_core, aca_blk, info); + if (ret) + return ret; + + mutex_lock(&ras_core->ras_aca.aca_lock); + aid_ecc = &aca_blk->ecc.socket[info->socket_id].aid[info->die_id]; + if (aca_blk->blk_info->ras_block_id == RAS_BLOCK_ID__GFX) + ecc_err = &aid_ecc->xcd.xcd[info->xcd_id].ecc_err; + else + ecc_err = &aid_ecc->ecc_err; + + ecc_err->new_ce_count += bank_ecc->ce_count; + ecc_err->total_ce_count += bank_ecc->ce_count; + ecc_err->new_ue_count += bank_ecc->ue_count; + ecc_err->total_ue_count += bank_ecc->ue_count; + ecc_err->new_de_count += bank_ecc->de_count; + ecc_err->total_de_count += bank_ecc->de_count; + mutex_unlock(&ras_core->ras_aca.aca_lock); + + if ((aca_blk->blk_info->ras_block_id == RAS_BLOCK_ID__UMC) && + bank_ecc->de_count) { + struct ras_bank_ecc ras_ecc = {0}; + + ras_ecc.nps = ras_core_get_curr_nps_mode(ras_core); + ras_ecc.addr = bank_ecc->bank_info.addr; + ras_ecc.ipid = bank_ecc->bank_info.ipid; + ras_ecc.status = bank_ecc->bank_info.status; + ras_ecc.seq_no = bank->seq_no; + + if (ras_core_gpu_in_reset(ras_core)) + ras_umc_log_bad_bank_pending(ras_core, &ras_ecc); + else + ras_umc_log_bad_bank(ras_core, &ras_ecc); + } + + aca_report_ecc_info(ras_core, + bank->seq_no, aca_blk->blk_info->ras_block_id, info->socket_id, info->die_id, + &aca_blk->ecc.socket[info->socket_id].aid[info->die_id], bank_ecc); + + return 0; +} + +static struct aca_block *aca_get_bank_aca_block(struct ras_core_context *ras_core, + struct aca_bank_reg *bank) +{ + int i = 0; + + for (i = 0; i < RAS_BLOCK_ID__LAST; i++) + if (aca_match_bank(&ras_core->ras_aca.aca_blk[i], bank)) + return &ras_core->ras_aca.aca_blk[i]; + + return NULL; +} + +static int aca_dump_bank(struct ras_core_context *ras_core, u32 ecc_type, + int idx, void *data) +{ + struct aca_bank_reg *bank = (struct aca_bank_reg *)data; + int i, ret, reg_cnt; + + reg_cnt = min_t(int, 16, ARRAY_SIZE(bank->regs)); + for (i = 0; i < reg_cnt; i++) { + ret = ras_mp1_dump_bank(ras_core, ecc_type, idx, i, &bank->regs[i]); + if (ret) + return ret; + } + + return 0; +} + +static uint64_t aca_get_bank_seqno(struct ras_core_context *ras_core, + enum ras_err_type err_type, struct aca_block *aca_blk, + struct aca_bank_ecc *bank_ecc) +{ + uint64_t seq_no = 0; + + if (bank_ecc->de_count) { + if (aca_blk->blk_info->ras_block_id == RAS_BLOCK_ID__UMC) + seq_no = ras_core_get_seqno(ras_core, RAS_SEQNO_TYPE_DE, true); + else + seq_no = ras_core_get_seqno(ras_core, + RAS_SEQNO_TYPE_POISON_CONSUMPTION, true); + } else if (bank_ecc->ue_count) { + seq_no = ras_core_get_seqno(ras_core, RAS_SEQNO_TYPE_UE, true); + } else { + seq_no = ras_core_get_seqno(ras_core, RAS_SEQNO_TYPE_CE, true); + } + + return seq_no; +} + +static bool aca_dup_update_ue_in_fatal(struct ras_core_context *ras_core, + u32 ecc_type) +{ + struct ras_aca *aca = &ras_core->ras_aca; + + if (ecc_type != RAS_ERR_TYPE__UE) + return false; + + if (aca->ue_updated_mark & ACA_MARK_FATAL_FLAG) { + if (aca->ue_updated_mark & ACA_MARK_UE_READ_FLAG) + return true; + + aca->ue_updated_mark |= ACA_MARK_UE_READ_FLAG; + } + + return false; +} + +void ras_aca_mark_fatal_flag(struct ras_core_context *ras_core) +{ + struct ras_aca *aca = &ras_core->ras_aca; + + if (!aca) + return; + + aca->ue_updated_mark |= ACA_MARK_FATAL_FLAG; +} + +void ras_aca_clear_fatal_flag(struct ras_core_context *ras_core) +{ + struct ras_aca *aca = &ras_core->ras_aca; + + if (!aca) + return; + + if ((aca->ue_updated_mark & ACA_MARK_FATAL_FLAG) && + (aca->ue_updated_mark & ACA_MARK_UE_READ_FLAG)) + aca->ue_updated_mark = 0; +} + +static int aca_banks_update(struct ras_core_context *ras_core, + u32 ecc_type, void *data) +{ + struct aca_bank_reg bank; + struct aca_block *aca_blk; + struct aca_bank_ecc bank_ecc; + struct ras_log_batch_tag *batch_tag = NULL; + u32 count = 0; + int ret = 0; + int i; + + mutex_lock(&ras_core->ras_aca.bank_op_lock); + + if (aca_dup_update_ue_in_fatal(ras_core, ecc_type)) + goto out; + + ret = aca_get_bank_count(ras_core, ecc_type, &count); + if (ret) + goto out; + + if (!count) + goto out; + + batch_tag = ras_log_ring_create_batch_tag(ras_core); + for (i = 0; i < count; i++) { + memset(&bank, 0, sizeof(bank)); + ret = aca_dump_bank(ras_core, ecc_type, i, &bank); + if (ret) + break; + + bank.ecc_type = ecc_type; + + memset(&bank_ecc, 0, sizeof(bank_ecc)); + aca_blk = aca_get_bank_aca_block(ras_core, &bank); + if (aca_blk) + ret = aca_parse_bank(ras_core, aca_blk, &bank, &bank_ecc); + + bank.seq_no = aca_get_bank_seqno(ras_core, ecc_type, aca_blk, &bank_ecc); + + aca_log_bank_data(ras_core, &bank, &bank_ecc, batch_tag); + aca_bank_log(ras_core, i, count, &bank, &bank_ecc); + + if (!ret && aca_blk) + ret = aca_log_bad_bank(ras_core, aca_blk, &bank, &bank_ecc); + + if (ret) + break; + } + ras_log_ring_destroy_batch_tag(ras_core, batch_tag); + +out: + mutex_unlock(&ras_core->ras_aca.bank_op_lock); + return ret; +} + +int ras_aca_update_ecc(struct ras_core_context *ras_core, u32 type, void *data) +{ + /* Update aca bank to aca source error_cache first */ + return aca_banks_update(ras_core, type, data); +} + +static struct aca_block *ras_aca_get_block_handle(struct ras_core_context *ras_core, uint32_t blk) +{ + return &ras_core->ras_aca.aca_blk[blk]; +} + +static int ras_aca_clear_block_ecc_count(struct ras_core_context *ras_core, u32 blk) +{ + struct aca_block *aca_blk; + struct aca_aid_ecc *aid_ecc; + int skt, aid, xcd; + + mutex_lock(&ras_core->ras_aca.aca_lock); + aca_blk = ras_aca_get_block_handle(ras_core, blk); + for (skt = 0; skt < aca_blk->ecc.socket_num_per_hive; skt++) { + for (aid = 0; aid < aca_blk->ecc.socket[skt].aid_num; aid++) { + aid_ecc = &aca_blk->ecc.socket[skt].aid[aid]; + if (blk == RAS_BLOCK_ID__GFX) { + for (xcd = 0; xcd < aid_ecc->xcd.xcd_num; xcd++) + memset(&aid_ecc->xcd.xcd[xcd], + 0, sizeof(struct aca_xcd_ecc)); + } else { + memset(&aid_ecc->ecc_err, 0, sizeof(aid_ecc->ecc_err)); + } + } + } + mutex_unlock(&ras_core->ras_aca.aca_lock); + + return 0; +} + +int ras_aca_clear_all_blocks_ecc_count(struct ras_core_context *ras_core) +{ + enum ras_block_id blk; + int ret; + + for (blk = RAS_BLOCK_ID__UMC; blk < RAS_BLOCK_ID__LAST; blk++) { + ret = ras_aca_clear_block_ecc_count(ras_core, blk); + if (ret) + break; + } + + return ret; +} + +int ras_aca_clear_block_new_ecc_count(struct ras_core_context *ras_core, u32 blk) +{ + struct aca_block *aca_blk; + int skt, aid, xcd; + struct aca_ecc_count *ecc_err; + struct aca_aid_ecc *aid_ecc; + + mutex_lock(&ras_core->ras_aca.aca_lock); + aca_blk = ras_aca_get_block_handle(ras_core, blk); + for (skt = 0; skt < aca_blk->ecc.socket_num_per_hive; skt++) { + for (aid = 0; aid < aca_blk->ecc.socket[skt].aid_num; aid++) { + aid_ecc = &aca_blk->ecc.socket[skt].aid[aid]; + if (blk == RAS_BLOCK_ID__GFX) { + for (xcd = 0; xcd < aid_ecc->xcd.xcd_num; xcd++) { + ecc_err = &aid_ecc->xcd.xcd[xcd].ecc_err; + ecc_err->new_ce_count = 0; + ecc_err->new_ue_count = 0; + ecc_err->new_de_count = 0; + } + } else { + ecc_err = &aid_ecc->ecc_err; + ecc_err->new_ce_count = 0; + ecc_err->new_ue_count = 0; + ecc_err->new_de_count = 0; + } + } + } + mutex_unlock(&ras_core->ras_aca.aca_lock); + + return 0; +} + +static int ras_aca_get_block_each_aid_ecc_count(struct ras_core_context *ras_core, + u32 blk, u32 skt, u32 aid, u32 xcd, + struct aca_ecc_count *ecc_count) +{ + struct aca_block *aca_blk; + struct aca_ecc_count *ecc_err; + + aca_blk = ras_aca_get_block_handle(ras_core, blk); + if (blk == RAS_BLOCK_ID__GFX) + ecc_err = &aca_blk->ecc.socket[skt].aid[aid].xcd.xcd[xcd].ecc_err; + else + ecc_err = &aca_blk->ecc.socket[skt].aid[aid].ecc_err; + + ecc_count->new_ce_count = ecc_err->new_ce_count; + ecc_count->total_ce_count = ecc_err->total_ce_count; + ecc_count->new_ue_count = ecc_err->new_ue_count; + ecc_count->total_ue_count = ecc_err->total_ue_count; + ecc_count->new_de_count = ecc_err->new_de_count; + ecc_count->total_de_count = ecc_err->total_de_count; + + return 0; +} + +static inline void _add_ecc_count(struct aca_ecc_count *des, struct aca_ecc_count *src) +{ + des->new_ce_count += src->new_ce_count; + des->total_ce_count += src->total_ce_count; + des->new_ue_count += src->new_ue_count; + des->total_ue_count += src->total_ue_count; + des->new_de_count += src->new_de_count; + des->total_de_count += src->total_de_count; +} + +static const struct ras_aca_ip_func *aca_get_ip_func( + struct ras_core_context *ras_core, uint32_t ip_version) +{ + switch (ip_version) { + case IP_VERSION(1, 0, 0): + return &ras_aca_func_v1_0; + default: + RAS_DEV_ERR(ras_core->dev, + "ACA ip version(0x%x) is not supported!\n", ip_version); + break; + } + + return NULL; +} + +int ras_aca_get_block_ecc_count(struct ras_core_context *ras_core, + u32 blk, void *data) +{ + struct ras_ecc_count *err_data = (struct ras_ecc_count *)data; + struct aca_block *aca_blk; + int skt, aid, xcd; + struct aca_ecc_count ecc_xcd; + struct aca_ecc_count ecc_aid; + struct aca_ecc_count ecc; + + if (blk >= RAS_BLOCK_ID__LAST) + return -EINVAL; + + if (!err_data) + return -EINVAL; + + aca_blk = ras_aca_get_block_handle(ras_core, blk); + memset(&ecc, 0, sizeof(ecc)); + + mutex_lock(&ras_core->ras_aca.aca_lock); + if (blk == RAS_BLOCK_ID__GFX) { + for (skt = 0; skt < aca_blk->ecc.socket_num_per_hive; skt++) { + for (aid = 0; aid < aca_blk->ecc.socket[skt].aid_num; aid++) { + memset(&ecc_aid, 0, sizeof(ecc_aid)); + for (xcd = 0; + xcd < aca_blk->ecc.socket[skt].aid[aid].xcd.xcd_num; + xcd++) { + memset(&ecc_xcd, 0, sizeof(ecc_xcd)); + if (ras_aca_get_block_each_aid_ecc_count(ras_core, + blk, skt, aid, xcd, &ecc_xcd)) + continue; + _add_ecc_count(&ecc_aid, &ecc_xcd); + } + _add_ecc_count(&ecc, &ecc_aid); + } + } + } else { + for (skt = 0; skt < aca_blk->ecc.socket_num_per_hive; skt++) { + for (aid = 0; aid < aca_blk->ecc.socket[skt].aid_num; aid++) { + memset(&ecc_aid, 0, sizeof(ecc_aid)); + if (ras_aca_get_block_each_aid_ecc_count(ras_core, + blk, skt, aid, 0, &ecc_aid)) + continue; + _add_ecc_count(&ecc, &ecc_aid); + } + } + } + + err_data->new_ce_count = ecc.new_ce_count; + err_data->total_ce_count = ecc.total_ce_count; + err_data->new_ue_count = ecc.new_ue_count; + err_data->total_ue_count = ecc.total_ue_count; + err_data->new_de_count = ecc.new_de_count; + err_data->total_de_count = ecc.total_de_count; + mutex_unlock(&ras_core->ras_aca.aca_lock); + + return 0; +} + +int ras_aca_sw_init(struct ras_core_context *ras_core) +{ + struct ras_aca *ras_aca = &ras_core->ras_aca; + struct ras_aca_config *aca_cfg = &ras_core->config->aca_cfg; + struct aca_block *aca_blk; + uint32_t socket_num_per_hive; + uint32_t aid_num_per_socket; + uint32_t xcd_num_per_aid; + int blk, skt, aid; + + socket_num_per_hive = aca_cfg->socket_num_per_hive; + aid_num_per_socket = aca_cfg->aid_num_per_socket; + xcd_num_per_aid = aca_cfg->xcd_num_per_aid; + + if (!xcd_num_per_aid || !aid_num_per_socket || + (socket_num_per_hive > MAX_SOCKET_NUM_PER_HIVE) || + (aid_num_per_socket > MAX_AID_NUM_PER_SOCKET) || + (xcd_num_per_aid > MAX_XCD_NUM_PER_AID)) { + RAS_DEV_ERR(ras_core->dev, "Invalid ACA system configuration: %d, %d, %d\n", + socket_num_per_hive, aid_num_per_socket, xcd_num_per_aid); + return -EINVAL; + } + + memset(ras_aca, 0, sizeof(*ras_aca)); + + for (blk = 0; blk < RAS_BLOCK_ID__LAST; blk++) { + aca_blk = &ras_aca->aca_blk[blk]; + aca_blk->ecc.socket_num_per_hive = socket_num_per_hive; + for (skt = 0; skt < aca_blk->ecc.socket_num_per_hive; skt++) { + aca_blk->ecc.socket[skt].aid_num = aid_num_per_socket; + if (blk == RAS_BLOCK_ID__GFX) { + for (aid = 0; aid < aca_blk->ecc.socket[skt].aid_num; aid++) + aca_blk->ecc.socket[skt].aid[aid].xcd.xcd_num = + xcd_num_per_aid; + } + } + } + + mutex_init(&ras_aca->aca_lock); + mutex_init(&ras_aca->bank_op_lock); + + return 0; +} + +int ras_aca_sw_fini(struct ras_core_context *ras_core) +{ + struct ras_aca *ras_aca = &ras_core->ras_aca; + + mutex_destroy(&ras_aca->aca_lock); + mutex_destroy(&ras_aca->bank_op_lock); + + return 0; +} + +int ras_aca_hw_init(struct ras_core_context *ras_core) +{ + struct ras_aca *ras_aca = &ras_core->ras_aca; + struct aca_block *aca_blk; + const struct ras_aca_ip_func *ip_func; + int i; + + ras_aca->aca_ip_version = ras_core->config->aca_ip_version; + ip_func = aca_get_ip_func(ras_core, ras_aca->aca_ip_version); + if (!ip_func) + return -EINVAL; + + for (i = 0; i < ip_func->block_num; i++) { + aca_blk = &ras_aca->aca_blk[ip_func->block_info[i]->ras_block_id]; + aca_blk->blk_info = ip_func->block_info[i]; + } + + ras_aca->ue_updated_mark = 0; + + return 0; +} + +int ras_aca_hw_fini(struct ras_core_context *ras_core) +{ + struct ras_aca *ras_aca = &ras_core->ras_aca; + + ras_aca->ue_updated_mark = 0; + + return 0; +} diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_aca.h b/drivers/gpu/drm/amd/ras/rascore/ras_aca.h new file mode 100644 index 000000000000..f61b02a5f0fc --- /dev/null +++ b/drivers/gpu/drm/amd/ras/rascore/ras_aca.h @@ -0,0 +1,164 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef __RAS_ACA_H__ +#define __RAS_ACA_H__ +#include "ras.h" + +#define MAX_SOCKET_NUM_PER_HIVE 8 +#define MAX_AID_NUM_PER_SOCKET 4 +#define MAX_XCD_NUM_PER_AID 2 +#define MAX_ACA_RAS_BLOCK 20 + +#define ACA_ERROR__UE_MASK (0x1 << RAS_ERR_TYPE__UE) +#define ACA_ERROR__CE_MASK (0x1 << RAS_ERR_TYPE__CE) +#define ACA_ERROR__DE_MASK (0x1 << RAS_ERR_TYPE__DE) + +enum ras_aca_reg_idx { + ACA_REG_IDX__CTL = 0, + ACA_REG_IDX__STATUS = 1, + ACA_REG_IDX__ADDR = 2, + ACA_REG_IDX__MISC0 = 3, + ACA_REG_IDX__CONFG = 4, + ACA_REG_IDX__IPID = 5, + ACA_REG_IDX__SYND = 6, + ACA_REG_IDX__DESTAT = 8, + ACA_REG_IDX__DEADDR = 9, + ACA_REG_IDX__CTL_MASK = 10, + ACA_REG_MAX_COUNT = 16, +}; + +struct ras_core_context; +struct aca_block; + +struct aca_bank_reg { + u32 ecc_type; + u64 seq_no; + u64 regs[ACA_REG_MAX_COUNT]; +}; + +enum aca_ecc_hwip { + ACA_ECC_HWIP__UNKNOWN = -1, + ACA_ECC_HWIP__PSP = 0, + ACA_ECC_HWIP__UMC, + ACA_ECC_HWIP__SMU, + ACA_ECC_HWIP__PCS_XGMI, + ACA_ECC_HWIP_COUNT, +}; + +struct aca_ecc_info { + int die_id; + int socket_id; + int xcd_id; + int hwid; + int mcatype; + uint64_t status; + uint64_t ipid; + uint64_t addr; +}; + +struct aca_bank_ecc { + struct aca_ecc_info bank_info; + u32 ce_count; + u32 ue_count; + u32 de_count; +}; + +struct aca_ecc_count { + u32 new_ce_count; + u32 total_ce_count; + u32 new_ue_count; + u32 total_ue_count; + u32 new_de_count; + u32 total_de_count; +}; + +struct aca_xcd_ecc { + struct aca_ecc_count ecc_err; +}; + +struct aca_aid_ecc { + union { + struct aca_xcd { + struct aca_xcd_ecc xcd[MAX_XCD_NUM_PER_AID]; + u32 xcd_num; + } xcd; + struct aca_ecc_count ecc_err; + }; +}; + +struct aca_socket_ecc { + struct aca_aid_ecc aid[MAX_AID_NUM_PER_SOCKET]; + u32 aid_num; +}; + +struct aca_block_ecc { + struct aca_socket_ecc socket[MAX_SOCKET_NUM_PER_HIVE]; + u32 socket_num_per_hive; +}; + +struct aca_bank_hw_ops { + bool (*bank_match)(struct aca_block *ras_blk, void *data); + int (*bank_parse)(struct ras_core_context *ras_core, + struct aca_block *aca_blk, void *data, void *buf); +}; + +struct aca_block_info { + char name[32]; + u32 ras_block_id; + enum aca_ecc_hwip hwip; + struct aca_bank_hw_ops bank_ops; + u32 mask; +}; + +struct aca_block { + const struct aca_block_info *blk_info; + struct aca_block_ecc ecc; +}; + +struct ras_aca_ip_func { + uint32_t block_num; + const struct aca_block_info **block_info; +}; + +struct ras_aca { + uint32_t aca_ip_version; + const struct ras_aca_ip_func *ip_func; + struct mutex aca_lock; + struct mutex bank_op_lock; + struct aca_block aca_blk[MAX_ACA_RAS_BLOCK]; + uint32_t ue_updated_mark; +}; + +int ras_aca_sw_init(struct ras_core_context *ras_core); +int ras_aca_sw_fini(struct ras_core_context *ras_core); +int ras_aca_hw_init(struct ras_core_context *ras_core); +int ras_aca_hw_fini(struct ras_core_context *ras_core); +int ras_aca_get_block_ecc_count(struct ras_core_context *ras_core, u32 blk, void *data); +int ras_aca_clear_block_new_ecc_count(struct ras_core_context *ras_core, u32 blk); +int ras_aca_clear_all_blocks_ecc_count(struct ras_core_context *ras_core); +int ras_aca_update_ecc(struct ras_core_context *ras_core, u32 ecc_type, void *data); +void ras_aca_mark_fatal_flag(struct ras_core_context *ras_core); +void ras_aca_clear_fatal_flag(struct ras_core_context *ras_core); +#endif diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_aca_v1_0.c b/drivers/gpu/drm/amd/ras/rascore/ras_aca_v1_0.c new file mode 100644 index 000000000000..29df98948703 --- /dev/null +++ b/drivers/gpu/drm/amd/ras/rascore/ras_aca_v1_0.c @@ -0,0 +1,379 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ +#include "ras.h" +#include "ras_aca.h" +#include "ras_core_status.h" +#include "ras_aca_v1_0.h" + +struct ras_aca_hwip { + int hwid; + int mcatype; +}; + +static struct ras_aca_hwip aca_hwid_mcatypes[ACA_ECC_HWIP_COUNT] = { + [ACA_ECC_HWIP__SMU] = {0x01, 0x01}, + [ACA_ECC_HWIP__PCS_XGMI] = {0x50, 0x00}, + [ACA_ECC_HWIP__UMC] = {0x96, 0x00}, +}; + +static int aca_decode_bank_info(struct aca_block *aca_blk, + struct aca_bank_reg *bank, struct aca_ecc_info *info) +{ + u64 ipid; + u32 instidhi, instidlo; + + ipid = bank->regs[ACA_REG_IDX__IPID]; + info->hwid = ACA_REG_IPID_HARDWAREID(ipid); + info->mcatype = ACA_REG_IPID_MCATYPE(ipid); + /* + * Unified DieID Format: SAASS. A:AID, S:Socket. + * Unified DieID[4:4] = InstanceId[0:0] + * Unified DieID[0:3] = InstanceIdHi[0:3] + */ + instidhi = ACA_REG_IPID_INSTANCEIDHI(ipid); + instidlo = ACA_REG_IPID_INSTANCEIDLO(ipid); + info->die_id = ((instidhi >> 2) & 0x03); + info->socket_id = ((instidlo & 0x1) << 2) | (instidhi & 0x03); + + if ((aca_blk->blk_info->hwip == ACA_ECC_HWIP__SMU) && + (aca_blk->blk_info->ras_block_id == RAS_BLOCK_ID__GFX)) + info->xcd_id = + ((instidlo & GENMASK_ULL(31, 1)) == mmSMNAID_XCD0_MCA_SMU) ? 0 : 1; + + return 0; +} + +static bool aca_check_bank_hwip(struct aca_bank_reg *bank, enum aca_ecc_hwip type) +{ + struct ras_aca_hwip *hwip; + int hwid, mcatype; + u64 ipid; + + if (!bank || (type == ACA_ECC_HWIP__UNKNOWN)) + return false; + + hwip = &aca_hwid_mcatypes[type]; + if (!hwip->hwid) + return false; + + ipid = bank->regs[ACA_REG_IDX__IPID]; + hwid = ACA_REG_IPID_HARDWAREID(ipid); + mcatype = ACA_REG_IPID_MCATYPE(ipid); + + return hwip->hwid == hwid && hwip->mcatype == mcatype; +} + +static bool aca_match_bank_default(struct aca_block *aca_blk, void *data) +{ + return aca_check_bank_hwip((struct aca_bank_reg *)data, aca_blk->blk_info->hwip); +} + +static bool aca_match_gfx_bank(struct aca_block *aca_blk, void *data) +{ + struct aca_bank_reg *bank = (struct aca_bank_reg *)data; + u32 instlo; + + if (!aca_check_bank_hwip(bank, aca_blk->blk_info->hwip)) + return false; + + instlo = ACA_REG_IPID_INSTANCEIDLO(bank->regs[ACA_REG_IDX__IPID]); + instlo &= GENMASK_ULL(31, 1); + switch (instlo) { + case mmSMNAID_XCD0_MCA_SMU: + case mmSMNAID_XCD1_MCA_SMU: + case mmSMNXCD_XCD0_MCA_SMU: + return true; + default: + break; + } + + return false; +} + +static bool aca_match_sdma_bank(struct aca_block *aca_blk, void *data) +{ + struct aca_bank_reg *bank = (struct aca_bank_reg *)data; + /* CODE_SDMA0 - CODE_SDMA4, reference to smu driver if header file */ + static int sdma_err_codes[] = { 33, 34, 35, 36 }; + u32 instlo; + int errcode, i; + + if (!aca_check_bank_hwip(bank, aca_blk->blk_info->hwip)) + return false; + + instlo = ACA_REG_IPID_INSTANCEIDLO(bank->regs[ACA_REG_IDX__IPID]); + instlo &= GENMASK_ULL(31, 1); + if (instlo != mmSMNAID_AID0_MCA_SMU) + return false; + + errcode = ACA_REG_SYND_ERRORINFORMATION(bank->regs[ACA_REG_IDX__SYND]); + errcode &= 0xff; + + /* Check SDMA error codes */ + for (i = 0; i < ARRAY_SIZE(sdma_err_codes); i++) { + if (errcode == sdma_err_codes[i]) + return true; + } + + return false; +} + +static bool aca_match_mmhub_bank(struct aca_block *aca_blk, void *data) +{ + struct aca_bank_reg *bank = (struct aca_bank_reg *)data; + /* reference to smu driver if header file */ + const int mmhub_err_codes[] = { + 0, 1, 2, 3, 4, /* CODE_DAGB0 - 4 */ + 5, 6, 7, 8, 9, /* CODE_EA0 - 4 */ + 10, /* CODE_UTCL2_ROUTER */ + 11, /* CODE_VML2 */ + 12, /* CODE_VML2_WALKER */ + 13, /* CODE_MMCANE */ + }; + u32 instlo; + int errcode, i; + + if (!aca_check_bank_hwip(bank, aca_blk->blk_info->hwip)) + return false; + + instlo = ACA_REG_IPID_INSTANCEIDLO(bank->regs[ACA_REG_IDX__IPID]); + instlo &= GENMASK_ULL(31, 1); + if (instlo != mmSMNAID_AID0_MCA_SMU) + return false; + + errcode = ACA_REG_SYND_ERRORINFORMATION(bank->regs[ACA_REG_IDX__SYND]); + errcode &= 0xff; + + /* Check MMHUB error codes */ + for (i = 0; i < ARRAY_SIZE(mmhub_err_codes); i++) { + if (errcode == mmhub_err_codes[i]) + return true; + } + + return false; +} + +static bool aca_check_umc_de(struct ras_core_context *ras_core, uint64_t mc_umc_status) +{ + return (ras_core->poison_supported && + ACA_REG_STATUS_VAL(mc_umc_status) && + ACA_REG_STATUS_DEFERRED(mc_umc_status)); +} + +static bool aca_check_umc_ue(struct ras_core_context *ras_core, uint64_t mc_umc_status) +{ + if (aca_check_umc_de(ras_core, mc_umc_status)) + return false; + + return (ACA_REG_STATUS_VAL(mc_umc_status) && + (ACA_REG_STATUS_PCC(mc_umc_status) || + ACA_REG_STATUS_UC(mc_umc_status) || + ACA_REG_STATUS_TCC(mc_umc_status))); +} + +static bool aca_check_umc_ce(struct ras_core_context *ras_core, uint64_t mc_umc_status) +{ + if (aca_check_umc_de(ras_core, mc_umc_status)) + return false; + + return (ACA_REG_STATUS_VAL(mc_umc_status) && + (ACA_REG_STATUS_CECC(mc_umc_status) || + (ACA_REG_STATUS_UECC(mc_umc_status) && + ACA_REG_STATUS_UC(mc_umc_status) == 0) || + /* Identify data parity error in replay mode */ + ((ACA_REG_STATUS_ERRORCODEEXT(mc_umc_status) == 0x5 || + ACA_REG_STATUS_ERRORCODEEXT(mc_umc_status) == 0xb) && + !(aca_check_umc_ue(ras_core, mc_umc_status))))); +} + +static int aca_parse_umc_bank(struct ras_core_context *ras_core, + struct aca_block *ras_blk, void *data, void *buf) +{ + struct aca_bank_reg *bank = (struct aca_bank_reg *)data; + struct aca_bank_ecc *ecc = (struct aca_bank_ecc *)buf; + struct aca_ecc_info bank_info; + uint32_t ext_error_code; + uint64_t status0; + + status0 = bank->regs[ACA_REG_IDX__STATUS]; + if (!ACA_REG_STATUS_VAL(status0)) + return 0; + + memset(&bank_info, 0, sizeof(bank_info)); + aca_decode_bank_info(ras_blk, bank, &bank_info); + memcpy(&ecc->bank_info, &bank_info, sizeof(bank_info)); + ecc->bank_info.status = bank->regs[ACA_REG_IDX__STATUS]; + ecc->bank_info.ipid = bank->regs[ACA_REG_IDX__IPID]; + ecc->bank_info.addr = bank->regs[ACA_REG_IDX__ADDR]; + + ext_error_code = ACA_REG_STATUS_ERRORCODEEXT(status0); + + if (aca_check_umc_de(ras_core, status0)) + ecc->de_count = 1; + else if (aca_check_umc_ue(ras_core, status0)) + ecc->ue_count = ext_error_code ? + 1 : ACA_REG_MISC0_ERRCNT(bank->regs[ACA_REG_IDX__MISC0]); + else if (aca_check_umc_ce(ras_core, status0)) + ecc->ce_count = ext_error_code ? + 1 : ACA_REG_MISC0_ERRCNT(bank->regs[ACA_REG_IDX__MISC0]); + + return 0; +} + +static bool aca_check_bank_is_de(struct ras_core_context *ras_core, + uint64_t status) +{ + return (ACA_REG_STATUS_POISON(status) || + ACA_REG_STATUS_DEFERRED(status)); +} + +static int aca_parse_bank_default(struct ras_core_context *ras_core, + struct aca_block *ras_blk, + void *data, void *buf) +{ + struct aca_bank_reg *bank = (struct aca_bank_reg *)data; + struct aca_bank_ecc *ecc = (struct aca_bank_ecc *)buf; + struct aca_ecc_info bank_info; + u64 misc0 = bank->regs[ACA_REG_IDX__MISC0]; + u64 status = bank->regs[ACA_REG_IDX__STATUS]; + + memset(&bank_info, 0, sizeof(bank_info)); + aca_decode_bank_info(ras_blk, bank, &bank_info); + memcpy(&ecc->bank_info, &bank_info, sizeof(bank_info)); + ecc->bank_info.status = status; + ecc->bank_info.ipid = bank->regs[ACA_REG_IDX__IPID]; + ecc->bank_info.addr = bank->regs[ACA_REG_IDX__ADDR]; + + if (aca_check_bank_is_de(ras_core, status)) { + ecc->de_count = 1; + } else { + if (bank->ecc_type == RAS_ERR_TYPE__UE) + ecc->ue_count = 1; + else if (bank->ecc_type == RAS_ERR_TYPE__CE) + ecc->ce_count = ACA_REG_MISC0_ERRCNT(misc0); + } + + return 0; +} + +static int aca_parse_xgmi_bank(struct ras_core_context *ras_core, + struct aca_block *ras_blk, + void *data, void *buf) +{ + struct aca_bank_reg *bank = (struct aca_bank_reg *)data; + struct aca_bank_ecc *ecc = (struct aca_bank_ecc *)buf; + struct aca_ecc_info bank_info; + u64 status, count; + int ext_error_code; + + memset(&bank_info, 0, sizeof(bank_info)); + aca_decode_bank_info(ras_blk, bank, &bank_info); + memcpy(&ecc->bank_info, &bank_info, sizeof(bank_info)); + ecc->bank_info.status = bank->regs[ACA_REG_IDX__STATUS]; + ecc->bank_info.ipid = bank->regs[ACA_REG_IDX__IPID]; + ecc->bank_info.addr = bank->regs[ACA_REG_IDX__ADDR]; + + status = bank->regs[ACA_REG_IDX__STATUS]; + ext_error_code = ACA_REG_STATUS_ERRORCODEEXT(status); + + count = ACA_REG_MISC0_ERRCNT(bank->regs[ACA_REG_IDX__MISC0]); + if (bank->ecc_type == RAS_ERR_TYPE__UE) { + if (ext_error_code != 0 && ext_error_code != 9) + count = 0ULL; + ecc->ue_count = count; + } else if (bank->ecc_type == RAS_ERR_TYPE__CE) { + count = ext_error_code == 6 ? count : 0ULL; + ecc->ce_count = count; + } + + return 0; +} + +static const struct aca_block_info aca_v1_0_umc = { + .name = "umc", + .ras_block_id = RAS_BLOCK_ID__UMC, + .hwip = ACA_ECC_HWIP__UMC, + .mask = ACA_ERROR__UE_MASK | ACA_ERROR__CE_MASK | ACA_ERROR__DE_MASK, + .bank_ops = { + .bank_match = aca_match_bank_default, + .bank_parse = aca_parse_umc_bank, + }, +}; + +static const struct aca_block_info aca_v1_0_gfx = { + .name = "gfx", + .ras_block_id = RAS_BLOCK_ID__GFX, + .hwip = ACA_ECC_HWIP__SMU, + .mask = ACA_ERROR__UE_MASK | ACA_ERROR__CE_MASK, + .bank_ops = { + .bank_match = aca_match_gfx_bank, + .bank_parse = aca_parse_bank_default, + }, +}; + +static const struct aca_block_info aca_v1_0_sdma = { + .name = "sdma", + .ras_block_id = RAS_BLOCK_ID__SDMA, + .hwip = ACA_ECC_HWIP__SMU, + .mask = ACA_ERROR__UE_MASK, + .bank_ops = { + .bank_match = aca_match_sdma_bank, + .bank_parse = aca_parse_bank_default, + }, +}; + +static const struct aca_block_info aca_v1_0_mmhub = { + .name = "mmhub", + .ras_block_id = RAS_BLOCK_ID__MMHUB, + .hwip = ACA_ECC_HWIP__SMU, + .mask = ACA_ERROR__UE_MASK, + .bank_ops = { + .bank_match = aca_match_mmhub_bank, + .bank_parse = aca_parse_bank_default, + }, +}; + +static const struct aca_block_info aca_v1_0_xgmi = { + .name = "xgmi", + .ras_block_id = RAS_BLOCK_ID__XGMI_WAFL, + .hwip = ACA_ECC_HWIP__PCS_XGMI, + .mask = ACA_ERROR__UE_MASK | ACA_ERROR__CE_MASK, + .bank_ops = { + .bank_match = aca_match_bank_default, + .bank_parse = aca_parse_xgmi_bank, + }, +}; + +static const struct aca_block_info *aca_block_info_v1_0[] = { + &aca_v1_0_umc, + &aca_v1_0_gfx, + &aca_v1_0_sdma, + &aca_v1_0_mmhub, + &aca_v1_0_xgmi, +}; + +const struct ras_aca_ip_func ras_aca_func_v1_0 = { + .block_num = ARRAY_SIZE(aca_block_info_v1_0), + .block_info = aca_block_info_v1_0, +}; diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_aca_v1_0.h b/drivers/gpu/drm/amd/ras/rascore/ras_aca_v1_0.h new file mode 100644 index 000000000000..40e5d94b037f --- /dev/null +++ b/drivers/gpu/drm/amd/ras/rascore/ras_aca_v1_0.h @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef __RAS_ACA_V1_0_H__ +#define __RAS_ACA_V1_0_H__ +#include "ras.h" + +#define ACA__REG__FIELD(x, h, l) (((x) & GENMASK_ULL(h, l)) >> l) +#define ACA_REG_STATUS_VAL(x) ACA__REG__FIELD(x, 63, 63) +#define ACA_REG_STATUS_OVERFLOW(x) ACA__REG__FIELD(x, 62, 62) +#define ACA_REG_STATUS_UC(x) ACA__REG__FIELD(x, 61, 61) +#define ACA_REG_STATUS_EN(x) ACA__REG__FIELD(x, 60, 60) +#define ACA_REG_STATUS_MISCV(x) ACA__REG__FIELD(x, 59, 59) +#define ACA_REG_STATUS_ADDRV(x) ACA__REG__FIELD(x, 58, 58) +#define ACA_REG_STATUS_PCC(x) ACA__REG__FIELD(x, 57, 57) +#define ACA_REG_STATUS_ERRCOREIDVAL(x) ACA__REG__FIELD(x, 56, 56) +#define ACA_REG_STATUS_TCC(x) ACA__REG__FIELD(x, 55, 55) +#define ACA_REG_STATUS_SYNDV(x) ACA__REG__FIELD(x, 53, 53) +#define ACA_REG_STATUS_CECC(x) ACA__REG__FIELD(x, 46, 46) +#define ACA_REG_STATUS_UECC(x) ACA__REG__FIELD(x, 45, 45) +#define ACA_REG_STATUS_DEFERRED(x) ACA__REG__FIELD(x, 44, 44) +#define ACA_REG_STATUS_POISON(x) ACA__REG__FIELD(x, 43, 43) +#define ACA_REG_STATUS_SCRUB(x) ACA__REG__FIELD(x, 40, 40) +#define ACA_REG_STATUS_ERRCOREID(x) ACA__REG__FIELD(x, 37, 32) +#define ACA_REG_STATUS_ADDRLSB(x) ACA__REG__FIELD(x, 29, 24) +#define ACA_REG_STATUS_ERRORCODEEXT(x) ACA__REG__FIELD(x, 21, 16) +#define ACA_REG_STATUS_ERRORCODE(x) ACA__REG__FIELD(x, 15, 0) + +#define ACA_REG_IPID_MCATYPE(x) ACA__REG__FIELD(x, 63, 48) +#define ACA_REG_IPID_INSTANCEIDHI(x) ACA__REG__FIELD(x, 47, 44) +#define ACA_REG_IPID_HARDWAREID(x) ACA__REG__FIELD(x, 43, 32) +#define ACA_REG_IPID_INSTANCEIDLO(x) ACA__REG__FIELD(x, 31, 0) + +#define ACA_REG_MISC0_VALID(x) ACA__REG__FIELD(x, 63, 63) +#define ACA_REG_MISC0_OVRFLW(x) ACA__REG__FIELD(x, 48, 48) +#define ACA_REG_MISC0_ERRCNT(x) ACA__REG__FIELD(x, 43, 32) + +#define ACA_REG_SYND_ERRORINFORMATION(x) ACA__REG__FIELD(x, 17, 0) + +/* NOTE: The following codes refers to the smu header file */ +#define ACA_EXTERROR_CODE_CE 0x3a +#define ACA_EXTERROR_CODE_FAULT 0x3b + +#define mmSMNAID_XCD0_MCA_SMU 0x36430400 /* SMN AID XCD0 */ +#define mmSMNAID_XCD1_MCA_SMU 0x38430400 /* SMN AID XCD1 */ +#define mmSMNXCD_XCD0_MCA_SMU 0x40430400 /* SMN XCD XCD0 */ +#define mmSMNAID_AID0_MCA_SMU 0x03b30400 /* SMN AID AID0 */ + +extern const struct ras_aca_ip_func ras_aca_func_v1_0; +#endif diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_cmd.c b/drivers/gpu/drm/amd/ras/rascore/ras_cmd.c new file mode 100644 index 000000000000..94e6d7420d94 --- /dev/null +++ b/drivers/gpu/drm/amd/ras/rascore/ras_cmd.c @@ -0,0 +1,522 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ +#include "ras.h" +#include "ras_cmd.h" + +#define RAS_CMD_MAJOR_VERSION 6 +#define RAS_CMD_MINOR_VERSION 0 +#define RAS_CMD_VERSION (((RAS_CMD_MAJOR_VERSION) << 10) | (RAS_CMD_MINOR_VERSION)) + +static int ras_cmd_add_device(struct ras_core_context *ras_core) +{ + INIT_LIST_HEAD(&ras_core->ras_cmd.head); + ras_core->ras_cmd.ras_core = ras_core; + ras_core->ras_cmd.dev_handle = (uintptr_t)ras_core ^ RAS_CMD_DEV_HANDLE_MAGIC; + return 0; +} + +static int ras_cmd_remove_device(struct ras_core_context *ras_core) +{ + memset(&ras_core->ras_cmd, 0, sizeof(ras_core->ras_cmd)); + return 0; +} + +static int ras_get_block_ecc_info(struct ras_core_context *ras_core, + struct ras_cmd_ctx *cmd, void *data) +{ + struct ras_cmd_block_ecc_info_req *input_data = + (struct ras_cmd_block_ecc_info_req *)cmd->input_buff_raw; + struct ras_cmd_block_ecc_info_rsp *output_data = + (struct ras_cmd_block_ecc_info_rsp *)cmd->output_buff_raw; + struct ras_ecc_count err_data; + int ret; + + if (cmd->input_size != sizeof(struct ras_cmd_block_ecc_info_req)) + return RAS_CMD__ERROR_INVALID_INPUT_SIZE; + + memset(&err_data, 0, sizeof(err_data)); + ret = ras_aca_get_block_ecc_count(ras_core, input_data->block_id, &err_data); + if (ret) + return RAS_CMD__ERROR_GENERIC; + + output_data->ce_count = err_data.total_ce_count; + output_data->ue_count = err_data.total_ue_count; + output_data->de_count = err_data.total_de_count; + + cmd->output_size = sizeof(struct ras_cmd_block_ecc_info_rsp); + return RAS_CMD__SUCCESS; +} + +static void ras_cmd_update_bad_page_info(struct ras_cmd_bad_page_record *ras_cmd_record, + struct eeprom_umc_record *record) +{ + ras_cmd_record->retired_page = record->cur_nps_retired_row_pfn; + ras_cmd_record->ts = record->ts; + ras_cmd_record->err_type = record->err_type; + ras_cmd_record->mem_channel = record->mem_channel; + ras_cmd_record->mcumc_id = record->mcumc_id; + ras_cmd_record->address = record->address; + ras_cmd_record->bank = record->bank; + ras_cmd_record->valid = 1; +} + +static int ras_cmd_get_group_bad_pages(struct ras_core_context *ras_core, + uint32_t group_index, struct ras_cmd_bad_pages_info_rsp *output_data) +{ + struct eeprom_umc_record record; + struct ras_cmd_bad_page_record *ras_cmd_record; + uint32_t i = 0, bp_cnt = 0, group_cnt = 0; + + output_data->bp_in_group = 0; + output_data->group_index = 0; + + bp_cnt = ras_umc_get_badpage_count(ras_core); + if (bp_cnt) { + output_data->group_index = group_index; + group_cnt = bp_cnt / RAS_CMD_MAX_BAD_PAGES_PER_GROUP + + ((bp_cnt % RAS_CMD_MAX_BAD_PAGES_PER_GROUP) ? 1 : 0); + + if (group_index >= group_cnt) + return RAS_CMD__ERROR_INVALID_INPUT_DATA; + + i = group_index * RAS_CMD_MAX_BAD_PAGES_PER_GROUP; + for (; + i < bp_cnt && output_data->bp_in_group < RAS_CMD_MAX_BAD_PAGES_PER_GROUP; + i++) { + if (ras_umc_get_badpage_record(ras_core, i, &record)) + return RAS_CMD__ERROR_GENERIC; + + ras_cmd_record = &output_data->records[i % RAS_CMD_MAX_BAD_PAGES_PER_GROUP]; + + memset(ras_cmd_record, 0, sizeof(*ras_cmd_record)); + ras_cmd_update_bad_page_info(ras_cmd_record, &record); + output_data->bp_in_group++; + } + } + output_data->bp_total_cnt = bp_cnt; + return RAS_CMD__SUCCESS; +} + +static int ras_cmd_get_bad_pages(struct ras_core_context *ras_core, + struct ras_cmd_ctx *cmd, void *data) +{ + struct ras_cmd_bad_pages_info_req *input_data = + (struct ras_cmd_bad_pages_info_req *)cmd->input_buff_raw; + struct ras_cmd_bad_pages_info_rsp *output_data = + (struct ras_cmd_bad_pages_info_rsp *)cmd->output_buff_raw; + int ret; + + if (cmd->input_size != sizeof(struct ras_cmd_bad_pages_info_req)) + return RAS_CMD__ERROR_INVALID_INPUT_SIZE; + + ret = ras_cmd_get_group_bad_pages(ras_core, input_data->group_index, output_data); + if (ret) + return RAS_CMD__ERROR_GENERIC; + + output_data->version = 0; + + cmd->output_size = sizeof(struct ras_cmd_bad_pages_info_rsp); + return RAS_CMD__SUCCESS; +} + +static int ras_cmd_clear_bad_page_info(struct ras_core_context *ras_core, + struct ras_cmd_ctx *cmd, void *data) +{ + if (cmd->input_size != sizeof(struct ras_cmd_dev_handle)) + return RAS_CMD__ERROR_INVALID_INPUT_SIZE; + + if (ras_eeprom_reset_table(ras_core)) + return RAS_CMD__ERROR_GENERIC; + + if (ras_umc_clean_badpage_data(ras_core)) + return RAS_CMD__ERROR_GENERIC; + + return RAS_CMD__SUCCESS; +} + +static int ras_cmd_reset_all_error_counts(struct ras_core_context *ras_core, + struct ras_cmd_ctx *cmd, void *data) +{ + if (cmd->input_size != sizeof(struct ras_cmd_dev_handle)) + return RAS_CMD__ERROR_INVALID_INPUT_SIZE; + + if (ras_aca_clear_all_blocks_ecc_count(ras_core)) + return RAS_CMD__ERROR_GENERIC; + + if (ras_umc_clear_logged_ecc(ras_core)) + return RAS_CMD__ERROR_GENERIC; + + return RAS_CMD__SUCCESS; +} + +static int ras_cmd_get_cper_snapshot(struct ras_core_context *ras_core, + struct ras_cmd_ctx *cmd, void *data) +{ + struct ras_cmd_cper_snapshot_rsp *output_data = + (struct ras_cmd_cper_snapshot_rsp *)cmd->output_buff_raw; + struct ras_log_batch_overview overview; + + if (cmd->input_size != sizeof(struct ras_cmd_cper_snapshot_req)) + return RAS_CMD__ERROR_INVALID_INPUT_SIZE; + + ras_log_ring_get_batch_overview(ras_core, &overview); + + output_data->total_cper_num = overview.logged_batch_count; + output_data->start_cper_id = overview.first_batch_id; + output_data->latest_cper_id = overview.last_batch_id; + + output_data->version = 0; + + cmd->output_size = sizeof(struct ras_cmd_cper_snapshot_rsp); + return RAS_CMD__SUCCESS; +} + +static int ras_cmd_get_cper_records(struct ras_core_context *ras_core, + struct ras_cmd_ctx *cmd, void *data) +{ + struct ras_cmd_cper_record_req *req = + (struct ras_cmd_cper_record_req *)cmd->input_buff_raw; + struct ras_cmd_cper_record_rsp *rsp = + (struct ras_cmd_cper_record_rsp *)cmd->output_buff_raw; + struct ras_log_info *trace[MAX_RECORD_PER_BATCH] = {0}; + struct ras_log_batch_overview overview; + uint32_t offset = 0, real_data_len = 0; + uint64_t batch_id; + uint8_t *buffer; + int ret = 0, i, count; + + if (cmd->input_size != sizeof(struct ras_cmd_cper_record_req)) + return RAS_CMD__ERROR_INVALID_INPUT_SIZE; + + if (!req->buf_size || !req->buf_ptr || !req->cper_num) + return RAS_CMD__ERROR_INVALID_INPUT_DATA; + + buffer = kzalloc(req->buf_size, GFP_KERNEL); + if (!buffer) + return RAS_CMD__ERROR_GENERIC; + + ras_log_ring_get_batch_overview(ras_core, &overview); + for (i = 0; i < req->cper_num; i++) { + batch_id = req->cper_start_id + i; + if (batch_id >= overview.last_batch_id) + break; + + count = ras_log_ring_get_batch_records(ras_core, batch_id, trace, + ARRAY_SIZE(trace)); + if (count > 0) { + ret = ras_cper_generate_cper(ras_core, trace, count, + &buffer[offset], req->buf_size - offset, &real_data_len); + if (ret) + break; + + offset += real_data_len; + } + } + + if ((ret && (ret != -ENOMEM)) || + copy_to_user(u64_to_user_ptr(req->buf_ptr), buffer, offset)) { + kfree(buffer); + return RAS_CMD__ERROR_GENERIC; + } + + rsp->real_data_size = offset; + rsp->real_cper_num = i; + rsp->remain_num = (ret == -ENOMEM) ? (req->cper_num - i) : 0; + rsp->version = 0; + + cmd->output_size = sizeof(struct ras_cmd_cper_record_rsp); + + kfree(buffer); + + return RAS_CMD__SUCCESS; +} + +static int ras_cmd_get_batch_trace_snapshot(struct ras_core_context *ras_core, + struct ras_cmd_ctx *cmd, void *data) +{ + struct ras_cmd_batch_trace_snapshot_rsp *rsp = + (struct ras_cmd_batch_trace_snapshot_rsp *)cmd->output_buff_raw; + struct ras_log_batch_overview overview; + + + if (cmd->input_size != sizeof(struct ras_cmd_batch_trace_snapshot_req)) + return RAS_CMD__ERROR_INVALID_INPUT_SIZE; + + ras_log_ring_get_batch_overview(ras_core, &overview); + + rsp->total_batch_num = overview.logged_batch_count; + rsp->start_batch_id = overview.first_batch_id; + rsp->latest_batch_id = overview.last_batch_id; + rsp->version = 0; + + cmd->output_size = sizeof(struct ras_cmd_batch_trace_snapshot_rsp); + return RAS_CMD__SUCCESS; +} + +static int ras_cmd_get_batch_trace_records(struct ras_core_context *ras_core, + struct ras_cmd_ctx *cmd, void *data) +{ + struct ras_cmd_batch_trace_record_req *input_data = + (struct ras_cmd_batch_trace_record_req *)cmd->input_buff_raw; + struct ras_cmd_batch_trace_record_rsp *output_data = + (struct ras_cmd_batch_trace_record_rsp *)cmd->output_buff_raw; + struct ras_log_batch_overview overview; + struct ras_log_info *trace_arry[MAX_RECORD_PER_BATCH] = {0}; + struct ras_log_info *record; + int i, j, count = 0, offset = 0; + uint64_t id; + bool completed = false; + + if (cmd->input_size != sizeof(struct ras_cmd_batch_trace_record_req)) + return RAS_CMD__ERROR_INVALID_INPUT_SIZE; + + if ((!input_data->batch_num) || (input_data->batch_num > RAS_CMD_MAX_BATCH_NUM)) + return RAS_CMD__ERROR_INVALID_INPUT_DATA; + + ras_log_ring_get_batch_overview(ras_core, &overview); + if ((input_data->start_batch_id < overview.first_batch_id) || + (input_data->start_batch_id >= overview.last_batch_id)) + return RAS_CMD__ERROR_INVALID_INPUT_SIZE; + + for (i = 0; i < input_data->batch_num; i++) { + id = input_data->start_batch_id + i; + if (id >= overview.last_batch_id) { + completed = true; + break; + } + + count = ras_log_ring_get_batch_records(ras_core, + id, trace_arry, ARRAY_SIZE(trace_arry)); + if (count > 0) { + if ((offset + count) > RAS_CMD_MAX_TRACE_NUM) + break; + for (j = 0; j < count; j++) { + record = &output_data->records[offset + j]; + record->seqno = trace_arry[j]->seqno; + record->timestamp = trace_arry[j]->timestamp; + record->event = trace_arry[j]->event; + memcpy(&record->aca_reg, + &trace_arry[j]->aca_reg, sizeof(trace_arry[j]->aca_reg)); + } + } else { + count = 0; + } + + output_data->batchs[i].batch_id = id; + output_data->batchs[i].offset = offset; + output_data->batchs[i].trace_num = count; + offset += count; + } + + output_data->start_batch_id = input_data->start_batch_id; + output_data->real_batch_num = i; + output_data->remain_num = completed ? 0 : (input_data->batch_num - i); + output_data->version = 0; + + cmd->output_size = sizeof(struct ras_cmd_batch_trace_record_rsp); + + return RAS_CMD__SUCCESS; +} + +static enum ras_ta_block __get_ras_ta_block(enum ras_block_id block) +{ + switch (block) { + case RAS_BLOCK_ID__UMC: + return RAS_TA_BLOCK__UMC; + case RAS_BLOCK_ID__SDMA: + return RAS_TA_BLOCK__SDMA; + case RAS_BLOCK_ID__GFX: + return RAS_TA_BLOCK__GFX; + case RAS_BLOCK_ID__MMHUB: + return RAS_TA_BLOCK__MMHUB; + case RAS_BLOCK_ID__ATHUB: + return RAS_TA_BLOCK__ATHUB; + case RAS_BLOCK_ID__PCIE_BIF: + return RAS_TA_BLOCK__PCIE_BIF; + case RAS_BLOCK_ID__HDP: + return RAS_TA_BLOCK__HDP; + case RAS_BLOCK_ID__XGMI_WAFL: + return RAS_TA_BLOCK__XGMI_WAFL; + case RAS_BLOCK_ID__DF: + return RAS_TA_BLOCK__DF; + case RAS_BLOCK_ID__SMN: + return RAS_TA_BLOCK__SMN; + case RAS_BLOCK_ID__SEM: + return RAS_TA_BLOCK__SEM; + case RAS_BLOCK_ID__MP0: + return RAS_TA_BLOCK__MP0; + case RAS_BLOCK_ID__MP1: + return RAS_TA_BLOCK__MP1; + case RAS_BLOCK_ID__FUSE: + return RAS_TA_BLOCK__FUSE; + case RAS_BLOCK_ID__MCA: + return RAS_TA_BLOCK__MCA; + case RAS_BLOCK_ID__VCN: + return RAS_TA_BLOCK__VCN; + case RAS_BLOCK_ID__JPEG: + return RAS_TA_BLOCK__JPEG; + default: + return RAS_TA_BLOCK__UMC; + } +} + +static enum ras_ta_error_type __get_ras_ta_err_type(enum ras_ecc_err_type error) +{ + switch (error) { + case RAS_ECC_ERR__NONE: + return RAS_TA_ERROR__NONE; + case RAS_ECC_ERR__PARITY: + return RAS_TA_ERROR__PARITY; + case RAS_ECC_ERR__SINGLE_CORRECTABLE: + return RAS_TA_ERROR__SINGLE_CORRECTABLE; + case RAS_ECC_ERR__MULTI_UNCORRECTABLE: + return RAS_TA_ERROR__MULTI_UNCORRECTABLE; + case RAS_ECC_ERR__POISON: + return RAS_TA_ERROR__POISON; + default: + return RAS_TA_ERROR__NONE; + } +} + +static int ras_cmd_inject_error(struct ras_core_context *ras_core, + struct ras_cmd_ctx *cmd, void *data) +{ + struct ras_cmd_inject_error_req *req = + (struct ras_cmd_inject_error_req *)cmd->input_buff_raw; + struct ras_cmd_inject_error_rsp *output_data = + (struct ras_cmd_inject_error_rsp *)cmd->output_buff_raw; + int ret = 0; + struct ras_ta_trigger_error_input block_info = { + .block_id = __get_ras_ta_block(req->block_id), + .sub_block_index = req->subblock_id, + .inject_error_type = __get_ras_ta_err_type(req->error_type), + .address = req->address, + .value = req->method, + }; + + ret = ras_psp_trigger_error(ras_core, &block_info, req->instance_mask); + if (!ret) { + output_data->version = 0; + output_data->address = block_info.address; + cmd->output_size = sizeof(struct ras_cmd_inject_error_rsp); + } else { + RAS_DEV_ERR(ras_core->dev, "ras inject block %u failed %d\n", req->block_id, ret); + ret = RAS_CMD__ERROR_ACCESS_DENIED; + } + + return ret; +} + +static struct ras_cmd_func_map ras_cmd_maps[] = { + {RAS_CMD__INJECT_ERROR, ras_cmd_inject_error}, + {RAS_CMD__GET_BLOCK_ECC_STATUS, ras_get_block_ecc_info}, + {RAS_CMD__GET_BAD_PAGES, ras_cmd_get_bad_pages}, + {RAS_CMD__CLEAR_BAD_PAGE_INFO, ras_cmd_clear_bad_page_info}, + {RAS_CMD__RESET_ALL_ERROR_COUNTS, ras_cmd_reset_all_error_counts}, + {RAS_CMD__GET_CPER_SNAPSHOT, ras_cmd_get_cper_snapshot}, + {RAS_CMD__GET_CPER_RECORD, ras_cmd_get_cper_records}, + {RAS_CMD__GET_BATCH_TRACE_SNAPSHOT, ras_cmd_get_batch_trace_snapshot}, + {RAS_CMD__GET_BATCH_TRACE_RECORD, ras_cmd_get_batch_trace_records}, +}; + +int rascore_handle_cmd(struct ras_core_context *ras_core, + struct ras_cmd_ctx *cmd, void *data) +{ + struct ras_cmd_func_map *ras_cmd = NULL; + int i; + + for (i = 0; i < ARRAY_SIZE(ras_cmd_maps); i++) { + if (cmd->cmd_id == ras_cmd_maps[i].cmd_id) { + ras_cmd = &ras_cmd_maps[i]; + break; + } + } + + if (!ras_cmd) + return RAS_CMD__ERROR_UKNOWN_CMD; + + return ras_cmd->func(ras_core, cmd, data); +} + +int ras_cmd_init(struct ras_core_context *ras_core) +{ + return ras_cmd_add_device(ras_core); +} + +int ras_cmd_fini(struct ras_core_context *ras_core) +{ + ras_cmd_remove_device(ras_core); + return 0; +} + +int ras_cmd_query_interface_info(struct ras_core_context *ras_core, + struct ras_query_interface_info_rsp *rsp) +{ + rsp->ras_cmd_major_ver = RAS_CMD_MAJOR_VERSION; + rsp->ras_cmd_minor_ver = RAS_CMD_MINOR_VERSION; + + return 0; +} + +int ras_cmd_translate_soc_pa_to_bank(struct ras_core_context *ras_core, + uint64_t soc_pa, struct ras_fb_bank_addr *bank_addr) +{ + struct umc_bank_addr umc_bank = {0}; + int ret; + + ret = ras_umc_translate_soc_pa_and_bank(ras_core, &soc_pa, &umc_bank, false); + if (ret) + return RAS_CMD__ERROR_GENERIC; + + bank_addr->stack_id = umc_bank.stack_id; + bank_addr->bank_group = umc_bank.bank_group; + bank_addr->bank = umc_bank.bank; + bank_addr->row = umc_bank.row; + bank_addr->column = umc_bank.column; + bank_addr->channel = umc_bank.channel; + bank_addr->subchannel = umc_bank.subchannel; + + return 0; +} + +int ras_cmd_translate_bank_to_soc_pa(struct ras_core_context *ras_core, + struct ras_fb_bank_addr bank_addr, uint64_t *soc_pa) +{ + struct umc_bank_addr umc_bank = {0}; + + umc_bank.stack_id = bank_addr.stack_id; + umc_bank.bank_group = bank_addr.bank_group; + umc_bank.bank = bank_addr.bank; + umc_bank.row = bank_addr.row; + umc_bank.column = bank_addr.column; + umc_bank.channel = bank_addr.channel; + umc_bank.subchannel = bank_addr.subchannel; + + return ras_umc_translate_soc_pa_and_bank(ras_core, soc_pa, &umc_bank, true); +} + +uint64_t ras_cmd_get_dev_handle(struct ras_core_context *ras_core) +{ + return ras_core->ras_cmd.dev_handle; +} diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_cmd.h b/drivers/gpu/drm/amd/ras/rascore/ras_cmd.h new file mode 100644 index 000000000000..48a0715eb821 --- /dev/null +++ b/drivers/gpu/drm/amd/ras/rascore/ras_cmd.h @@ -0,0 +1,426 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef __RAS_CMD_H__ +#define __RAS_CMD_H__ +#include "ras.h" +#include "ras_eeprom.h" +#include "ras_log_ring.h" +#include "ras_cper.h" + +#define RAS_CMD_DEV_HANDLE_MAGIC 0xFEEDAD00UL + +#define RAS_CMD_MAX_IN_SIZE 256 +#define RAS_CMD_MAX_GPU_NUM 32 +#define RAS_CMD_MAX_BAD_PAGES_PER_GROUP 32 + +/* position of instance value in sub_block_index of + * ta_ras_trigger_error_input, the sub block uses lower 12 bits + */ +#define RAS_TA_INST_MASK 0xfffff000 +#define RAS_TA_INST_SHIFT 0xc + +enum ras_cmd_interface_type { + RAS_CMD_INTERFACE_TYPE_NONE, + RAS_CMD_INTERFACE_TYPE_AMDGPU, + RAS_CMD_INTERFACE_TYPE_VF, + RAS_CMD_INTERFACE_TYPE_PF, +}; + +enum ras_cmd_id_range { + RAS_CMD_ID_COMMON_START = 0, + RAS_CMD_ID_COMMON_END = 0x10000, + RAS_CMD_ID_AMDGPU_START = RAS_CMD_ID_COMMON_END, + RAS_CMD_ID_AMDGPU_END = 0x20000, + RAS_CMD_ID_MXGPU_START = RAS_CMD_ID_AMDGPU_END, + RAS_CMD_ID_MXGPU_END = 0x30000, + RAS_CMD_ID_MXGPU_VF_START = RAS_CMD_ID_MXGPU_END, + RAS_CMD_ID_MXGPU_VF_END = 0x40000, +}; + +enum ras_cmd_id { + RAS_CMD__BEGIN = RAS_CMD_ID_COMMON_START, + RAS_CMD__QUERY_INTERFACE_INFO, + RAS_CMD__GET_DEVICES_INFO, + RAS_CMD__GET_BLOCK_ECC_STATUS, + RAS_CMD__INJECT_ERROR, + RAS_CMD__GET_BAD_PAGES, + RAS_CMD__CLEAR_BAD_PAGE_INFO, + RAS_CMD__RESET_ALL_ERROR_COUNTS, + RAS_CMD__GET_SAFE_FB_ADDRESS_RANGES, + RAS_CMD__TRANSLATE_FB_ADDRESS, + RAS_CMD__GET_LINK_TOPOLOGY, + RAS_CMD__GET_CPER_SNAPSHOT, + RAS_CMD__GET_CPER_RECORD, + RAS_CMD__GET_BATCH_TRACE_SNAPSHOT, + RAS_CMD__GET_BATCH_TRACE_RECORD, + RAS_CMD__SUPPORTED_MAX = RAS_CMD_ID_COMMON_END, +}; + +enum ras_cmd_response { + RAS_CMD__SUCCESS = 0, + RAS_CMD__SUCCESS_EXEED_BUFFER, + RAS_CMD__ERROR_UKNOWN_CMD, + RAS_CMD__ERROR_INVALID_CMD, + RAS_CMD__ERROR_VERSION, + RAS_CMD__ERROR_INVALID_INPUT_SIZE, + RAS_CMD__ERROR_INVALID_INPUT_DATA, + RAS_CMD__ERROR_DRV_INIT_FAIL, + RAS_CMD__ERROR_ACCESS_DENIED, + RAS_CMD__ERROR_GENERIC, + RAS_CMD__ERROR_TIMEOUT, +}; + +enum ras_error_type { + RAS_TYPE_ERROR__NONE = 0, + RAS_TYPE_ERROR__PARITY = 1, + RAS_TYPE_ERROR__SINGLE_CORRECTABLE = 2, + RAS_TYPE_ERROR__MULTI_UNCORRECTABLE = 4, + RAS_TYPE_ERROR__POISON = 8, +}; + +struct ras_core_context; +struct ras_cmd_ctx; + +struct ras_cmd_mgr { + struct list_head head; + struct ras_core_context *ras_core; + uint64_t dev_handle; +}; + +struct ras_cmd_func_map { + uint32_t cmd_id; + int (*func)(struct ras_core_context *ras_core, + struct ras_cmd_ctx *cmd, void *data); +}; + +struct ras_device_bdf { + union { + struct { + uint32_t function : 3; + uint32_t device : 5; + uint32_t bus : 8; + uint32_t domain : 16; + }; + uint32_t u32_all; + }; +}; + +struct ras_cmd_param { + uint32_t idx_vf; + void *data; +}; + +#pragma pack(push, 8) +struct ras_cmd_ctx { + uint32_t magic; + union { + struct { + uint16_t ras_cmd_minor_ver : 10; + uint16_t ras_cmd_major_ver : 6; + }; + uint16_t ras_cmd_ver; + }; + union { + struct { + uint16_t plat_major_ver : 10; + uint16_t plat_minor_ver : 6; + }; + uint16_t plat_ver; + }; + uint32_t cmd_id; + uint32_t cmd_res; + uint32_t input_size; + uint32_t output_size; + uint32_t output_buf_size; + uint32_t reserved[5]; + uint8_t input_buff_raw[RAS_CMD_MAX_IN_SIZE]; + uint8_t output_buff_raw[]; +}; + +struct ras_cmd_dev_handle { + uint64_t dev_handle; +}; + +struct ras_cmd_block_ecc_info_req { + struct ras_cmd_dev_handle dev; + uint32_t block_id; + uint32_t subblock_id; + uint32_t reserved[4]; +}; + +struct ras_cmd_block_ecc_info_rsp { + uint32_t version; + uint32_t ce_count; + uint32_t ue_count; + uint32_t de_count; + uint32_t reserved[6]; +}; + +struct ras_cmd_inject_error_req { + struct ras_cmd_dev_handle dev; + uint32_t block_id; + uint32_t subblock_id; + uint64_t address; + uint32_t error_type; + uint32_t instance_mask; + union { + struct { + /* vf index */ + uint64_t vf_idx : 6; + /* method of error injection. i.e persistent, coherent etc */ + uint64_t method : 10; + uint64_t rsv : 48; + }; + uint64_t value; + }; + uint32_t reserved[8]; +}; + +struct ras_cmd_inject_error_rsp { + uint32_t version; + uint32_t reserved[5]; + uint64_t address; +}; + +struct ras_cmd_dev_info { + uint64_t dev_handle; + uint32_t location_id; + uint32_t ecc_enabled; + uint32_t ecc_supported; + uint32_t vf_num; + uint32_t asic_type; + uint32_t oam_id; + uint32_t reserved[8]; +}; + +struct ras_cmd_devices_info_rsp { + uint32_t version; + uint32_t dev_num; + uint32_t reserved[6]; + struct ras_cmd_dev_info devs[RAS_CMD_MAX_GPU_NUM]; +}; + +struct ras_cmd_bad_page_record { + union { + uint64_t address; + uint64_t offset; + }; + uint64_t retired_page; + uint64_t ts; + + uint32_t err_type; + + union { + unsigned char bank; + unsigned char cu; + }; + + unsigned char mem_channel; + unsigned char mcumc_id; + + unsigned char valid; + unsigned char reserved[8]; +}; + +struct ras_cmd_bad_pages_info_req { + struct ras_cmd_dev_handle device; + uint32_t group_index; + uint32_t reserved[5]; +}; + +struct ras_cmd_bad_pages_info_rsp { + uint32_t version; + uint32_t group_index; + uint32_t bp_in_group; + uint32_t bp_total_cnt; + uint32_t reserved[4]; + struct ras_cmd_bad_page_record records[RAS_CMD_MAX_BAD_PAGES_PER_GROUP]; +}; + +struct ras_query_interface_info_req { + uint32_t reserved[8]; +}; + +struct ras_query_interface_info_rsp { + uint32_t version; + uint32_t ras_cmd_major_ver; + uint32_t ras_cmd_minor_ver; + uint32_t plat_major_ver; + uint32_t plat_minor_ver; + uint8_t interface_type; + uint8_t rsv[3]; + uint32_t reserved[8]; +}; + +#define RAS_MAX_NUM_SAFE_RANGES 64 +struct ras_cmd_ras_safe_fb_address_ranges_rsp { + uint32_t version; + uint32_t num_ranges; + uint32_t reserved[4]; + struct { + uint64_t start; + uint64_t size; + uint32_t idx; + uint32_t reserved[3]; + } range[RAS_MAX_NUM_SAFE_RANGES]; +}; + +enum ras_fb_addr_type { + RAS_FB_ADDR_SOC_PHY, /* SPA */ + RAS_FB_ADDR_BANK, + RAS_FB_ADDR_VF_PHY, /* GPA */ + RAS_FB_ADDR_UNKNOWN +}; + +struct ras_fb_bank_addr { + uint32_t stack_id; /* SID */ + uint32_t bank_group; + uint32_t bank; + uint32_t row; + uint32_t column; + uint32_t channel; + uint32_t subchannel; /* Also called Pseudochannel (PC) */ + uint32_t reserved[3]; +}; + +struct ras_fb_vf_phy_addr { + uint32_t vf_idx; + uint32_t reserved; + uint64_t addr; +}; + +union ras_translate_fb_address { + struct ras_fb_bank_addr bank_addr; + uint64_t soc_phy_addr; + struct ras_fb_vf_phy_addr vf_phy_addr; +}; + +struct ras_cmd_translate_fb_address_req { + struct ras_cmd_dev_handle dev; + enum ras_fb_addr_type src_addr_type; + enum ras_fb_addr_type dest_addr_type; + union ras_translate_fb_address trans_addr; +}; + +struct ras_cmd_translate_fb_address_rsp { + uint32_t version; + uint32_t reserved[5]; + union ras_translate_fb_address trans_addr; +}; + +struct ras_dev_link_topology_req { + struct ras_cmd_dev_handle src; + struct ras_cmd_dev_handle dst; +}; + +struct ras_dev_link_topology_rsp { + uint32_t version; + uint32_t link_status; /* HW status of the link */ + uint32_t link_type; /* type of the link */ + uint32_t num_hops; /* number of hops */ + uint32_t reserved[8]; +}; + +struct ras_cmd_cper_snapshot_req { + struct ras_cmd_dev_handle dev; +}; + +struct ras_cmd_cper_snapshot_rsp { + uint32_t version; + uint32_t reserved[4]; + uint32_t total_cper_num; + uint64_t start_cper_id; + uint64_t latest_cper_id; +}; + +struct ras_cmd_cper_record_req { + struct ras_cmd_dev_handle dev; + uint64_t cper_start_id; + uint32_t cper_num; + uint32_t buf_size; + uint64_t buf_ptr; + uint32_t reserved[4]; +}; + +struct ras_cmd_cper_record_rsp { + uint32_t version; + uint32_t real_data_size; + uint32_t real_cper_num; + uint32_t remain_num; + uint32_t reserved[4]; +}; + +struct ras_cmd_batch_trace_snapshot_req { + struct ras_cmd_dev_handle dev; +}; + +struct ras_cmd_batch_trace_snapshot_rsp { + uint32_t version; + uint32_t reserved[4]; + uint32_t total_batch_num; + uint64_t start_batch_id; + uint64_t latest_batch_id; +}; + +struct ras_cmd_batch_trace_record_req { + struct ras_cmd_dev_handle dev; + uint64_t start_batch_id; + uint32_t batch_num; + uint32_t reserved[5]; +}; + +struct batch_ras_trace_info { + uint64_t batch_id; + uint16_t offset; + uint8_t trace_num; + uint8_t rsv; + uint32_t reserved; +}; + +#define RAS_CMD_MAX_BATCH_NUM 300 +#define RAS_CMD_MAX_TRACE_NUM 300 +struct ras_cmd_batch_trace_record_rsp { + uint32_t version; + uint16_t real_batch_num; + uint16_t remain_num; + uint64_t start_batch_id; + uint32_t reserved[2]; + struct batch_ras_trace_info batchs[RAS_CMD_MAX_BATCH_NUM]; + struct ras_log_info records[RAS_CMD_MAX_TRACE_NUM]; +}; + +#pragma pack(pop) + +int ras_cmd_init(struct ras_core_context *ras_core); +int ras_cmd_fini(struct ras_core_context *ras_core); +int rascore_handle_cmd(struct ras_core_context *ras_core, struct ras_cmd_ctx *cmd, void *data); +uint64_t ras_cmd_get_dev_handle(struct ras_core_context *ras_core); +int ras_cmd_query_interface_info(struct ras_core_context *ras_core, + struct ras_query_interface_info_rsp *rsp); +int ras_cmd_translate_soc_pa_to_bank(struct ras_core_context *ras_core, + uint64_t soc_pa, struct ras_fb_bank_addr *bank_addr); +int ras_cmd_translate_bank_to_soc_pa(struct ras_core_context *ras_core, + struct ras_fb_bank_addr bank_addr, uint64_t *soc_pa); +#endif diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_core.c b/drivers/gpu/drm/amd/ras/rascore/ras_core.c new file mode 100644 index 000000000000..01122b55c98a --- /dev/null +++ b/drivers/gpu/drm/amd/ras/rascore/ras_core.c @@ -0,0 +1,603 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ +#include "ras.h" +#include "ras_core_status.h" + +#define RAS_SEQNO_FIFO_SIZE (128 * sizeof(uint64_t)) + +#define IS_LEAP_YEAR(x) ((x % 4 == 0 && x % 100 != 0) || x % 400 == 0) + +static const char * const ras_block_name[] = { + "umc", + "sdma", + "gfx", + "mmhub", + "athub", + "pcie_bif", + "hdp", + "xgmi_wafl", + "df", + "smn", + "sem", + "mp0", + "mp1", + "fuse", + "mca", + "vcn", + "jpeg", + "ih", + "mpio", +}; + +const char *ras_core_get_ras_block_name(enum ras_block_id block_id) +{ + if (block_id >= ARRAY_SIZE(ras_block_name)) + return ""; + + return ras_block_name[block_id]; +} + +int ras_core_convert_timestamp_to_time(struct ras_core_context *ras_core, + uint64_t timestamp, struct ras_time *tm) +{ + int days_in_month[] = {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}; + uint64_t month = 0, day = 0, hour = 0, minute = 0, second = 0; + uint32_t year = 0; + int seconds_per_day = 24 * 60 * 60; + int seconds_per_hour = 60 * 60; + int seconds_per_minute = 60; + int days, remaining_seconds; + + days = div64_u64_rem(timestamp, seconds_per_day, (uint64_t *)&remaining_seconds); + + /* utc_timestamp follows the Unix epoch */ + year = 1970; + while (days >= 365) { + if (IS_LEAP_YEAR(year)) { + if (days < 366) + break; + days -= 366; + } else { + days -= 365; + } + year++; + } + + days_in_month[1] += IS_LEAP_YEAR(year); + + month = 0; + while (days >= days_in_month[month]) { + days -= days_in_month[month]; + month++; + } + month++; + day = days + 1; + + if (remaining_seconds) { + hour = remaining_seconds / seconds_per_hour; + minute = (remaining_seconds % seconds_per_hour) / seconds_per_minute; + second = remaining_seconds % seconds_per_minute; + } + + tm->tm_year = year; + tm->tm_mon = month; + tm->tm_mday = day; + tm->tm_hour = hour; + tm->tm_min = minute; + tm->tm_sec = second; + + return 0; +} + +bool ras_core_gpu_in_reset(struct ras_core_context *ras_core) +{ + uint32_t status = 0; + + if (ras_core->sys_fn && + ras_core->sys_fn->check_gpu_status) + ras_core->sys_fn->check_gpu_status(ras_core, &status); + + return (status & RAS_GPU_STATUS__IN_RESET) ? true : false; +} + +bool ras_core_gpu_is_vf(struct ras_core_context *ras_core) +{ + uint32_t status = 0; + + if (ras_core->sys_fn && + ras_core->sys_fn->check_gpu_status) + ras_core->sys_fn->check_gpu_status(ras_core, &status); + + return (status & RAS_GPU_STATUS__IS_VF) ? true : false; +} + +bool ras_core_gpu_is_rma(struct ras_core_context *ras_core) +{ + if (!ras_core) + return false; + + return ras_core->is_rma; +} + +static int ras_core_seqno_fifo_write(struct ras_core_context *ras_core, + enum ras_seqno_fifo fifo_type, uint64_t seqno) +{ + int ret = 0; + struct kfifo *seqno_fifo = NULL; + + if (fifo_type == SEQNO_FIFO_POISON_CREATION) + seqno_fifo = &ras_core->de_seqno_fifo; + else if (fifo_type == SEQNO_FIFO_POISON_CONSUMPTION) + seqno_fifo = &ras_core->consumption_seqno_fifo; + + if (seqno_fifo) + ret = kfifo_in_spinlocked(seqno_fifo, + &seqno, sizeof(seqno), &ras_core->seqno_lock); + + return ret ? 0 : -EINVAL; +} + +static int ras_core_seqno_fifo_read(struct ras_core_context *ras_core, + enum ras_seqno_fifo fifo_type, uint64_t *seqno, bool pop) +{ + int ret = 0; + struct kfifo *seqno_fifo = NULL; + + if (fifo_type == SEQNO_FIFO_POISON_CREATION) + seqno_fifo = &ras_core->de_seqno_fifo; + else if (fifo_type == SEQNO_FIFO_POISON_CONSUMPTION) + seqno_fifo = &ras_core->consumption_seqno_fifo; + + if (seqno_fifo) { + if (pop) + ret = kfifo_out_spinlocked(seqno_fifo, + seqno, sizeof(*seqno), &ras_core->seqno_lock); + else + ret = kfifo_out_peek(seqno_fifo, seqno, sizeof(*seqno)); + } + + return ret ? 0 : -EINVAL; +} + +uint64_t ras_core_gen_seqno(struct ras_core_context *ras_core, + enum ras_seqno_type type) +{ + uint64_t seqno = 0; + + if (ras_core->sys_fn && + ras_core->sys_fn->gen_seqno) + ras_core->sys_fn->gen_seqno(ras_core, type, &seqno); + + return seqno; +} + +int ras_core_put_seqno(struct ras_core_context *ras_core, + enum ras_seqno_type seqno_type, uint64_t seqno) +{ + int ret = 0; + + if (seqno_type >= RAS_SEQNO_TYPE_COUNT_MAX) + return -EINVAL; + + if (seqno_type == RAS_SEQNO_TYPE_DE) + ret = ras_core_seqno_fifo_write(ras_core, + SEQNO_FIFO_POISON_CREATION, seqno); + else if (seqno_type == RAS_SEQNO_TYPE_POISON_CONSUMPTION) + ret = ras_core_seqno_fifo_write(ras_core, + SEQNO_FIFO_POISON_CONSUMPTION, seqno); + else + ret = -EINVAL; + + return ret; +} + +uint64_t ras_core_get_seqno(struct ras_core_context *ras_core, + enum ras_seqno_type seqno_type, bool pop) +{ + uint64_t seq_no; + int ret = -ENODATA; + + if (seqno_type >= RAS_SEQNO_TYPE_COUNT_MAX) + return 0; + + if (seqno_type == RAS_SEQNO_TYPE_DE) + ret = ras_core_seqno_fifo_read(ras_core, + SEQNO_FIFO_POISON_CREATION, &seq_no, pop); + else if (seqno_type == RAS_SEQNO_TYPE_POISON_CONSUMPTION) + ret = ras_core_seqno_fifo_read(ras_core, + SEQNO_FIFO_POISON_CONSUMPTION, &seq_no, pop); + + if (ret) + seq_no = ras_core_gen_seqno(ras_core, seqno_type); + + return seq_no; +} + +static int ras_core_eeprom_recovery(struct ras_core_context *ras_core) +{ + int count; + int ret; + + count = ras_eeprom_get_record_count(ras_core); + if (!count) + return 0; + + /* Avoid bad page to be loaded again after gpu reset */ + if (ras_umc_get_saved_eeprom_count(ras_core) >= count) + return 0; + + ret = ras_umc_load_bad_pages(ras_core); + if (ret) { + RAS_DEV_ERR(ras_core->dev, "ras_umc_load_bad_pages failed: %d\n", ret); + return ret; + } + + ras_eeprom_sync_info(ras_core); + + return ret; +} + +struct ras_core_context *ras_core_create(struct ras_core_config *init_config) +{ + struct ras_core_context *ras_core; + struct ras_core_config *config; + + ras_core = kzalloc(sizeof(*ras_core), GFP_KERNEL); + if (!ras_core) + return NULL; + + config = kzalloc(sizeof(*config), GFP_KERNEL); + if (!config) { + kfree(ras_core); + return NULL; + } + + memcpy(config, init_config, sizeof(*config)); + ras_core->config = config; + + return ras_core; +} + +void ras_core_destroy(struct ras_core_context *ras_core) +{ + if (ras_core) + kfree(ras_core->config); + + kfree(ras_core); +} + +int ras_core_sw_init(struct ras_core_context *ras_core) +{ + int ret; + + if (!ras_core->config) { + RAS_DEV_ERR(ras_core->dev, "No ras core config!\n"); + return -EINVAL; + } + + ras_core->sys_fn = ras_core->config->sys_fn; + if (!ras_core->sys_fn) + return -EINVAL; + + ret = kfifo_alloc(&ras_core->de_seqno_fifo, + RAS_SEQNO_FIFO_SIZE, GFP_KERNEL); + if (ret) + return ret; + + ret = kfifo_alloc(&ras_core->consumption_seqno_fifo, + RAS_SEQNO_FIFO_SIZE, GFP_KERNEL); + if (ret) + return ret; + + spin_lock_init(&ras_core->seqno_lock); + + ret = ras_aca_sw_init(ras_core); + if (ret) + return ret; + + ret = ras_umc_sw_init(ras_core); + if (ret) + return ret; + + ret = ras_cmd_init(ras_core); + if (ret) + return ret; + + ret = ras_log_ring_sw_init(ras_core); + if (ret) + return ret; + + ret = ras_psp_sw_init(ras_core); + if (ret) + return ret; + + return 0; +} + +int ras_core_sw_fini(struct ras_core_context *ras_core) +{ + kfifo_free(&ras_core->de_seqno_fifo); + kfifo_free(&ras_core->consumption_seqno_fifo); + + ras_psp_sw_fini(ras_core); + ras_log_ring_sw_fini(ras_core); + ras_cmd_fini(ras_core); + ras_umc_sw_fini(ras_core); + ras_aca_sw_fini(ras_core); + + return 0; +} + +int ras_core_hw_init(struct ras_core_context *ras_core) +{ + int ret; + + ras_core->ras_eeprom_supported = + ras_core->config->ras_eeprom_supported; + + ras_core->poison_supported = ras_core->config->poison_supported; + + ret = ras_psp_hw_init(ras_core); + if (ret) + return ret; + + ret = ras_aca_hw_init(ras_core); + if (ret) + goto init_err1; + + ret = ras_mp1_hw_init(ras_core); + if (ret) + goto init_err2; + + ret = ras_nbio_hw_init(ras_core); + if (ret) + goto init_err3; + + ret = ras_umc_hw_init(ras_core); + if (ret) + goto init_err4; + + ret = ras_gfx_hw_init(ras_core); + if (ret) + goto init_err5; + + ret = ras_eeprom_hw_init(ras_core); + if (ret) + goto init_err6; + + ret = ras_core_eeprom_recovery(ras_core); + if (ret) { + RAS_DEV_ERR(ras_core->dev, + "Failed to recovery ras core, ret:%d\n", ret); + goto init_err6; + } + + ret = ras_eeprom_check_storage_status(ras_core); + if (ret) + goto init_err6; + + ret = ras_process_init(ras_core); + if (ret) + goto init_err7; + + ras_core->is_initialized = true; + + return 0; + +init_err7: + ras_eeprom_hw_fini(ras_core); +init_err6: + ras_gfx_hw_fini(ras_core); +init_err5: + ras_umc_hw_fini(ras_core); +init_err4: + ras_nbio_hw_fini(ras_core); +init_err3: + ras_mp1_hw_fini(ras_core); +init_err2: + ras_aca_hw_fini(ras_core); +init_err1: + ras_psp_hw_fini(ras_core); + return ret; +} + +int ras_core_hw_fini(struct ras_core_context *ras_core) +{ + ras_core->is_initialized = false; + + ras_process_fini(ras_core); + ras_eeprom_hw_fini(ras_core); + ras_gfx_hw_fini(ras_core); + ras_nbio_hw_fini(ras_core); + ras_umc_hw_fini(ras_core); + ras_mp1_hw_fini(ras_core); + ras_aca_hw_fini(ras_core); + ras_psp_hw_fini(ras_core); + + return 0; +} + +bool ras_core_handle_nbio_irq(struct ras_core_context *ras_core, void *data) +{ + return ras_nbio_handle_irq_error(ras_core, data); +} + +int ras_core_handle_fatal_error(struct ras_core_context *ras_core) +{ + int ret = 0; + + ras_aca_mark_fatal_flag(ras_core); + + ret = ras_core_event_notify(ras_core, + RAS_EVENT_ID__FATAL_ERROR_DETECTED, NULL); + + return ret; +} + +uint32_t ras_core_get_curr_nps_mode(struct ras_core_context *ras_core) +{ + if (ras_core->ras_nbio.ip_func && + ras_core->ras_nbio.ip_func->get_memory_partition_mode) + return ras_core->ras_nbio.ip_func->get_memory_partition_mode(ras_core); + + RAS_DEV_ERR(ras_core->dev, "Failed to get gpu memory nps mode!\n"); + return 0; +} + +int ras_core_update_ecc_info(struct ras_core_context *ras_core) +{ + int ret; + + ret = ras_aca_update_ecc(ras_core, RAS_ERR_TYPE__CE, NULL); + if (!ret) + ret = ras_aca_update_ecc(ras_core, RAS_ERR_TYPE__UE, NULL); + + return ret; +} + +int ras_core_query_block_ecc_data(struct ras_core_context *ras_core, + enum ras_block_id block, struct ras_ecc_count *ecc_count) +{ + int ret; + + if (!ecc_count || (block >= RAS_BLOCK_ID__LAST) || !ras_core) + return -EINVAL; + + ret = ras_aca_get_block_ecc_count(ras_core, block, ecc_count); + if (!ret) + ras_aca_clear_block_new_ecc_count(ras_core, block); + + return ret; +} + +int ras_core_set_status(struct ras_core_context *ras_core, bool enable) +{ + ras_core->ras_core_enabled = enable; + + return 0; +} + +bool ras_core_is_enabled(struct ras_core_context *ras_core) +{ + return ras_core->ras_core_enabled; +} + +uint64_t ras_core_get_utc_second_timestamp(struct ras_core_context *ras_core) +{ + if (ras_core && ras_core->sys_fn && + ras_core->sys_fn->get_utc_second_timestamp) + return ras_core->sys_fn->get_utc_second_timestamp(ras_core); + + RAS_DEV_ERR(ras_core->dev, "Failed to get system timestamp!\n"); + return 0; +} + +int ras_core_translate_soc_pa_and_bank(struct ras_core_context *ras_core, + uint64_t *soc_pa, struct umc_bank_addr *bank_addr, bool bank_to_pa) +{ + if (!ras_core || !soc_pa || !bank_addr) + return -EINVAL; + + return ras_umc_translate_soc_pa_and_bank(ras_core, soc_pa, bank_addr, bank_to_pa); +} + +bool ras_core_ras_interrupt_detected(struct ras_core_context *ras_core) +{ + if (ras_core && ras_core->sys_fn && + ras_core->sys_fn->detect_ras_interrupt) + return ras_core->sys_fn->detect_ras_interrupt(ras_core); + + RAS_DEV_ERR(ras_core->dev, "Failed to detect ras interrupt!\n"); + return false; +} + +int ras_core_get_gpu_mem(struct ras_core_context *ras_core, + enum gpu_mem_type mem_type, struct gpu_mem_block *gpu_mem) +{ + if (ras_core->sys_fn && ras_core->sys_fn->get_gpu_mem) + return ras_core->sys_fn->get_gpu_mem(ras_core, mem_type, gpu_mem); + + RAS_DEV_ERR(ras_core->dev, "Not config get gpu memory API!\n"); + return -EACCES; +} + +int ras_core_put_gpu_mem(struct ras_core_context *ras_core, + enum gpu_mem_type mem_type, struct gpu_mem_block *gpu_mem) +{ + if (ras_core->sys_fn && ras_core->sys_fn->put_gpu_mem) + return ras_core->sys_fn->put_gpu_mem(ras_core, mem_type, gpu_mem); + + RAS_DEV_ERR(ras_core->dev, "Not config put gpu memory API!!\n"); + return -EACCES; +} + +bool ras_core_is_ready(struct ras_core_context *ras_core) +{ + return ras_core ? ras_core->is_initialized : false; +} + +bool ras_core_check_safety_watermark(struct ras_core_context *ras_core) +{ + return ras_eeprom_check_safety_watermark(ras_core); +} + +int ras_core_down_trylock_gpu_reset_lock(struct ras_core_context *ras_core) +{ + if (ras_core->sys_fn && ras_core->sys_fn->gpu_reset_lock) + return ras_core->sys_fn->gpu_reset_lock(ras_core, true, true); + + return 1; +} + +void ras_core_down_gpu_reset_lock(struct ras_core_context *ras_core) +{ + if (ras_core->sys_fn && ras_core->sys_fn->gpu_reset_lock) + ras_core->sys_fn->gpu_reset_lock(ras_core, true, false); +} + +void ras_core_up_gpu_reset_lock(struct ras_core_context *ras_core) +{ + if (ras_core->sys_fn && ras_core->sys_fn->gpu_reset_lock) + ras_core->sys_fn->gpu_reset_lock(ras_core, false, false); +} + +int ras_core_event_notify(struct ras_core_context *ras_core, + enum ras_notify_event event_id, void *data) +{ + if (ras_core && ras_core->sys_fn && + ras_core->sys_fn->ras_notifier) + return ras_core->sys_fn->ras_notifier(ras_core, event_id, data); + + return -RAS_CORE_NOT_SUPPORTED; +} + +int ras_core_get_device_system_info(struct ras_core_context *ras_core, + struct device_system_info *dev_info) +{ + if (ras_core && ras_core->sys_fn && + ras_core->sys_fn->get_device_system_info) + return ras_core->sys_fn->get_device_system_info(ras_core, dev_info); + + return -RAS_CORE_NOT_SUPPORTED; +} diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_cper.c b/drivers/gpu/drm/amd/ras/rascore/ras_cper.c new file mode 100644 index 000000000000..0fc7522b7ab6 --- /dev/null +++ b/drivers/gpu/drm/amd/ras/rascore/ras_cper.c @@ -0,0 +1,315 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ +#include "ras.h" +#include "ras_core_status.h" +#include "ras_log_ring.h" +#include "ras_cper.h" + +static const struct ras_cper_guid MCE = CPER_NOTIFY__MCE; +static const struct ras_cper_guid CMC = CPER_NOTIFY__CMC; +static const struct ras_cper_guid BOOT = BOOT__TYPE; + +static const struct ras_cper_guid CRASHDUMP = GPU__CRASHDUMP; +static const struct ras_cper_guid RUNTIME = GPU__NONSTANDARD_ERROR; + +static void cper_get_timestamp(struct ras_core_context *ras_core, + struct ras_cper_timestamp *timestamp, uint64_t utc_second_timestamp) +{ + struct ras_time tm = {0}; + + ras_core_convert_timestamp_to_time(ras_core, utc_second_timestamp, &tm); + timestamp->seconds = tm.tm_sec; + timestamp->minutes = tm.tm_min; + timestamp->hours = tm.tm_hour; + timestamp->flag = 0; + timestamp->day = tm.tm_mday; + timestamp->month = tm.tm_mon; + timestamp->year = tm.tm_year % 100; + timestamp->century = tm.tm_year / 100; +} + +static void fill_section_hdr(struct ras_core_context *ras_core, + struct cper_section_hdr *hdr, enum ras_cper_type type, + enum ras_cper_severity sev, struct ras_log_info *trace) +{ + struct device_system_info dev_info = {0}; + char record_id[32]; + + hdr->signature[0] = 'C'; + hdr->signature[1] = 'P'; + hdr->signature[2] = 'E'; + hdr->signature[3] = 'R'; + hdr->revision = CPER_HDR__REV_1; + hdr->signature_end = 0xFFFFFFFF; + hdr->error_severity = (sev == RAS_CPER_SEV_RMA ? RAS_CPER_SEV_FATAL_UE : sev); + + hdr->valid_bits.platform_id = 1; + hdr->valid_bits.timestamp = 1; + + ras_core_get_device_system_info(ras_core, &dev_info); + + cper_get_timestamp(ras_core, &hdr->timestamp, trace->timestamp); + + snprintf(record_id, sizeof(record_id), "%d:%llX", dev_info.socket_id, + RAS_LOG_SEQNO_TO_BATCH_IDX(trace->seqno)); + memcpy(hdr->record_id, record_id, 8); + + snprintf(hdr->platform_id, 16, "0x%04X:0x%04X", + dev_info.vendor_id, dev_info.device_id); + /* pmfw version should be part of creator_id according to CPER spec */ + snprintf(hdr->creator_id, 16, "%s", CPER_CREATOR_ID__AMDGPU); + + switch (type) { + case RAS_CPER_TYPE_BOOT: + hdr->notify_type = BOOT; + break; + case RAS_CPER_TYPE_FATAL: + case RAS_CPER_TYPE_RMA: + hdr->notify_type = MCE; + break; + case RAS_CPER_TYPE_RUNTIME: + if (sev == RAS_CPER_SEV_NON_FATAL_CE) + hdr->notify_type = CMC; + else + hdr->notify_type = MCE; + break; + default: + RAS_DEV_ERR(ras_core->dev, "Unknown CPER Type\n"); + break; + } +} + +static int fill_section_descriptor(struct ras_core_context *ras_core, + struct cper_section_descriptor *descriptor, + enum ras_cper_severity sev, + struct ras_cper_guid sec_type, + uint32_t section_offset, + uint32_t section_length) +{ + struct device_system_info dev_info = {0}; + + descriptor->revision_minor = CPER_SEC__MINOR_REV_1; + descriptor->revision_major = CPER_SEC__MAJOR_REV_22; + descriptor->sec_offset = section_offset; + descriptor->sec_length = section_length; + descriptor->valid_bits.fru_text = 1; + descriptor->flag_bits.primary = 1; + descriptor->severity = (sev == RAS_CPER_SEV_RMA ? RAS_CPER_SEV_FATAL_UE : sev); + descriptor->sec_type = sec_type; + + ras_core_get_device_system_info(ras_core, &dev_info); + + snprintf(descriptor->fru_text, 20, "OAM%d", dev_info.socket_id); + + if (sev == RAS_CPER_SEV_RMA) + descriptor->flag_bits.exceed_err_threshold = 1; + + if (sev == RAS_CPER_SEV_NON_FATAL_UE) + descriptor->flag_bits.latent_err = 1; + + return 0; +} + +static int fill_section_fatal(struct ras_core_context *ras_core, + struct cper_section_fatal *fatal, struct ras_log_info *trace) +{ + fatal->data.reg_ctx_type = CPER_CTX_TYPE__CRASH; + fatal->data.reg_arr_size = sizeof(fatal->data.reg); + + fatal->data.reg.status = trace->aca_reg.regs[RAS_CPER_ACA_REG_STATUS]; + fatal->data.reg.addr = trace->aca_reg.regs[RAS_CPER_ACA_REG_ADDR]; + fatal->data.reg.ipid = trace->aca_reg.regs[RAS_CPER_ACA_REG_IPID]; + fatal->data.reg.synd = trace->aca_reg.regs[RAS_CPER_ACA_REG_SYND]; + + return 0; +} + +static int fill_section_runtime(struct ras_core_context *ras_core, + struct cper_section_runtime *runtime, struct ras_log_info *trace, + enum ras_cper_severity sev) +{ + runtime->hdr.valid_bits.err_info_cnt = 1; + runtime->hdr.valid_bits.err_context_cnt = 1; + + runtime->descriptor.error_type = RUNTIME; + runtime->descriptor.ms_chk_bits.err_type_valid = 1; + if (sev == RAS_CPER_SEV_RMA) { + runtime->descriptor.valid_bits.ms_chk = 1; + runtime->descriptor.ms_chk_bits.err_type = 1; + runtime->descriptor.ms_chk_bits.pcc = 1; + } + + runtime->reg.reg_ctx_type = CPER_CTX_TYPE__CRASH; + runtime->reg.reg_arr_size = sizeof(runtime->reg.reg_dump); + + runtime->reg.reg_dump[RAS_CPER_ACA_REG_CTL] = trace->aca_reg.regs[ACA_REG_IDX__CTL]; + runtime->reg.reg_dump[RAS_CPER_ACA_REG_STATUS] = trace->aca_reg.regs[ACA_REG_IDX__STATUS]; + runtime->reg.reg_dump[RAS_CPER_ACA_REG_ADDR] = trace->aca_reg.regs[ACA_REG_IDX__ADDR]; + runtime->reg.reg_dump[RAS_CPER_ACA_REG_MISC0] = trace->aca_reg.regs[ACA_REG_IDX__MISC0]; + runtime->reg.reg_dump[RAS_CPER_ACA_REG_CONFIG] = trace->aca_reg.regs[ACA_REG_IDX__CONFG]; + runtime->reg.reg_dump[RAS_CPER_ACA_REG_IPID] = trace->aca_reg.regs[ACA_REG_IDX__IPID]; + runtime->reg.reg_dump[RAS_CPER_ACA_REG_SYND] = trace->aca_reg.regs[ACA_REG_IDX__SYND]; + + return 0; +} + +static int cper_generate_runtime_record(struct ras_core_context *ras_core, + struct cper_section_hdr *hdr, struct ras_log_info **trace_arr, uint32_t arr_num, + enum ras_cper_severity sev) +{ + struct cper_section_descriptor *descriptor; + struct cper_section_runtime *runtime; + int i; + + fill_section_hdr(ras_core, hdr, RAS_CPER_TYPE_RUNTIME, sev, trace_arr[0]); + hdr->record_length = RAS_HDR_LEN + ((RAS_SEC_DESC_LEN + RAS_NONSTD_SEC_LEN) * arr_num); + hdr->sec_cnt = arr_num; + for (i = 0; i < arr_num; i++) { + descriptor = (struct cper_section_descriptor *)((uint8_t *)hdr + + RAS_SEC_DESC_OFFSET(i)); + runtime = (struct cper_section_runtime *)((uint8_t *)hdr + + RAS_NONSTD_SEC_OFFSET(hdr->sec_cnt, i)); + + fill_section_descriptor(ras_core, descriptor, sev, RUNTIME, + RAS_NONSTD_SEC_OFFSET(hdr->sec_cnt, i), + sizeof(struct cper_section_runtime)); + fill_section_runtime(ras_core, runtime, trace_arr[i], sev); + } + + return 0; +} + +static int cper_generate_fatal_record(struct ras_core_context *ras_core, + uint8_t *buffer, struct ras_log_info **trace_arr, uint32_t arr_num) +{ + struct ras_cper_fatal_record record = {0}; + int i = 0; + + for (i = 0; i < arr_num; i++) { + fill_section_hdr(ras_core, &record.hdr, RAS_CPER_TYPE_FATAL, + RAS_CPER_SEV_FATAL_UE, trace_arr[i]); + record.hdr.record_length = RAS_HDR_LEN + RAS_SEC_DESC_LEN + RAS_FATAL_SEC_LEN; + record.hdr.sec_cnt = 1; + + fill_section_descriptor(ras_core, &record.descriptor, RAS_CPER_SEV_FATAL_UE, + CRASHDUMP, offsetof(struct ras_cper_fatal_record, fatal), + sizeof(struct cper_section_fatal)); + + fill_section_fatal(ras_core, &record.fatal, trace_arr[i]); + + memcpy(buffer + (i * record.hdr.record_length), + &record, record.hdr.record_length); + } + + return 0; +} + +static int cper_get_record_size(enum ras_cper_type type, uint16_t section_count) +{ + int size = 0; + + size += RAS_HDR_LEN; + size += (RAS_SEC_DESC_LEN * section_count); + + switch (type) { + case RAS_CPER_TYPE_RUNTIME: + case RAS_CPER_TYPE_RMA: + size += (RAS_NONSTD_SEC_LEN * section_count); + break; + case RAS_CPER_TYPE_FATAL: + size += (RAS_FATAL_SEC_LEN * section_count); + size += (RAS_HDR_LEN * (section_count - 1)); + break; + case RAS_CPER_TYPE_BOOT: + size += (RAS_BOOT_SEC_LEN * section_count); + break; + default: + /* should never reach here */ + break; + } + + return size; +} + +static enum ras_cper_type cper_ras_log_event_to_cper_type(enum ras_log_event event) +{ + switch (event) { + case RAS_LOG_EVENT_UE: + return RAS_CPER_TYPE_FATAL; + case RAS_LOG_EVENT_DE: + case RAS_LOG_EVENT_CE: + case RAS_LOG_EVENT_POISON_CREATION: + case RAS_LOG_EVENT_POISON_CONSUMPTION: + return RAS_CPER_TYPE_RUNTIME; + case RAS_LOG_EVENT_RMA: + return RAS_CPER_TYPE_RMA; + default: + /* should never reach here */ + return RAS_CPER_TYPE_RUNTIME; + } +} + +int ras_cper_generate_cper(struct ras_core_context *ras_core, + struct ras_log_info **trace_list, uint32_t count, + uint8_t *buf, uint32_t buf_len, uint32_t *real_data_len) +{ + uint8_t *buffer = buf; + uint64_t buf_size = buf_len; + int record_size, saved_size = 0; + struct cper_section_hdr *hdr; + + /* All the batch traces share the same event */ + record_size = cper_get_record_size( + cper_ras_log_event_to_cper_type(trace_list[0]->event), count); + + if ((record_size + saved_size) > buf_size) + return -ENOMEM; + + hdr = (struct cper_section_hdr *)(buffer + saved_size); + + switch (trace_list[0]->event) { + case RAS_LOG_EVENT_RMA: + cper_generate_runtime_record(ras_core, hdr, trace_list, count, RAS_CPER_SEV_RMA); + break; + case RAS_LOG_EVENT_DE: + cper_generate_runtime_record(ras_core, + hdr, trace_list, count, RAS_CPER_SEV_NON_FATAL_UE); + break; + case RAS_LOG_EVENT_CE: + cper_generate_runtime_record(ras_core, + hdr, trace_list, count, RAS_CPER_SEV_NON_FATAL_CE); + break; + case RAS_LOG_EVENT_UE: + cper_generate_fatal_record(ras_core, buffer + saved_size, trace_list, count); + break; + default: + RAS_DEV_WARN(ras_core->dev, "Unprocessed trace event: %d\n", trace_list[0]->event); + break; + } + + saved_size += record_size; + + *real_data_len = saved_size; + return 0; +} diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_cper.h b/drivers/gpu/drm/amd/ras/rascore/ras_cper.h new file mode 100644 index 000000000000..076c1883c1ce --- /dev/null +++ b/drivers/gpu/drm/amd/ras/rascore/ras_cper.h @@ -0,0 +1,304 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ +#ifndef __RAS_CPER_H__ +#define __RAS_CPER_H__ + +#define CPER_UUID_MAX_SIZE 16 +struct ras_cper_guid { + uint8_t b[CPER_UUID_MAX_SIZE]; +}; + +#define CPER_GUID__INIT(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) \ + ((struct ras_cper_guid) \ + {{ (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff, \ + (b) & 0xff, ((b) >> 8) & 0xff, \ + (c) & 0xff, ((c) >> 8) & 0xff, \ + (d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7) }}) + +#define CPER_HDR__REV_1 (0x100) +#define CPER_SEC__MINOR_REV_1 (0x01) +#define CPER_SEC__MAJOR_REV_22 (0x22) +#define CPER_OAM_MAX_COUNT (8) + +#define CPER_CTX_TYPE__CRASH (1) +#define CPER_CTX_TYPE__BOOT (9) + +#define CPER_CREATOR_ID__AMDGPU "amdgpu" + +#define CPER_NOTIFY__MCE \ + CPER_GUID__INIT(0xE8F56FFE, 0x919C, 0x4cc5, 0xBA, 0x88, 0x65, 0xAB, \ + 0xE1, 0x49, 0x13, 0xBB) +#define CPER_NOTIFY__CMC \ + CPER_GUID__INIT(0x2DCE8BB1, 0xBDD7, 0x450e, 0xB9, 0xAD, 0x9C, 0xF4, \ + 0xEB, 0xD4, 0xF8, 0x90) +#define BOOT__TYPE \ + CPER_GUID__INIT(0x3D61A466, 0xAB40, 0x409a, 0xA6, 0x98, 0xF3, 0x62, \ + 0xD4, 0x64, 0xB3, 0x8F) + +#define GPU__CRASHDUMP \ + CPER_GUID__INIT(0x32AC0C78, 0x2623, 0x48F6, 0xB0, 0xD0, 0x73, 0x65, \ + 0x72, 0x5F, 0xD6, 0xAE) +#define GPU__NONSTANDARD_ERROR \ + CPER_GUID__INIT(0x32AC0C78, 0x2623, 0x48F6, 0x81, 0xA2, 0xAC, 0x69, \ + 0x17, 0x80, 0x55, 0x1D) +#define PROC_ERR__SECTION_TYPE \ + CPER_GUID__INIT(0xDC3EA0B0, 0xA144, 0x4797, 0xB9, 0x5B, 0x53, 0xFA, \ + 0x24, 0x2B, 0x6E, 0x1D) + +enum ras_cper_type { + RAS_CPER_TYPE_RUNTIME, + RAS_CPER_TYPE_FATAL, + RAS_CPER_TYPE_BOOT, + RAS_CPER_TYPE_RMA, +}; + +enum ras_cper_severity { + RAS_CPER_SEV_NON_FATAL_UE = 0, + RAS_CPER_SEV_FATAL_UE = 1, + RAS_CPER_SEV_NON_FATAL_CE = 2, + RAS_CPER_SEV_RMA = 3, + + RAS_CPER_SEV_UNUSED = 10, +}; + +enum ras_cper_aca_reg { + RAS_CPER_ACA_REG_CTL = 0, + RAS_CPER_ACA_REG_STATUS = 1, + RAS_CPER_ACA_REG_ADDR = 2, + RAS_CPER_ACA_REG_MISC0 = 3, + RAS_CPER_ACA_REG_CONFIG = 4, + RAS_CPER_ACA_REG_IPID = 5, + RAS_CPER_ACA_REG_SYND = 6, + RAS_CPER_ACA_REG_DESTAT = 8, + RAS_CPER_ACA_REG_DEADDR = 9, + RAS_CPER_ACA_REG_MASK = 10, + + RAS_CPER_ACA_REG_COUNT = 16, +}; + +#pragma pack(push, 1) + +struct ras_cper_timestamp { + uint8_t seconds; + uint8_t minutes; + uint8_t hours; + uint8_t flag; + uint8_t day; + uint8_t month; + uint8_t year; + uint8_t century; +}; + +struct cper_section_hdr { + char signature[4]; /* "CPER" */ + uint16_t revision; + uint32_t signature_end; /* 0xFFFFFFFF */ + uint16_t sec_cnt; + enum ras_cper_severity error_severity; + union { + struct { + uint32_t platform_id : 1; + uint32_t timestamp : 1; + uint32_t partition_id : 1; + uint32_t reserved : 29; + } valid_bits; + uint32_t valid_mask; + }; + uint32_t record_length; /* Total size of CPER Entry */ + struct ras_cper_timestamp timestamp; + char platform_id[16]; + struct ras_cper_guid partition_id; /* Reserved */ + char creator_id[16]; + struct ras_cper_guid notify_type; /* CMC, MCE */ + char record_id[8]; /* Unique CPER Entry ID */ + uint32_t flags; /* Reserved */ + uint64_t persistence_info; /* Reserved */ + uint8_t reserved[12]; /* Reserved */ +}; + +struct cper_section_descriptor { + uint32_t sec_offset; /* Offset from the start of CPER entry */ + uint32_t sec_length; + uint8_t revision_minor; /* CPER_SEC_MINOR_REV_1 */ + uint8_t revision_major; /* CPER_SEC_MAJOR_REV_22 */ + union { + struct { + uint8_t fru_id : 1; + uint8_t fru_text : 1; + uint8_t reserved : 6; + } valid_bits; + uint8_t valid_mask; + }; + uint8_t reserved; + union { + struct { + uint32_t primary : 1; + uint32_t reserved1 : 2; + uint32_t exceed_err_threshold : 1; + uint32_t latent_err : 1; + uint32_t reserved2 : 27; + } flag_bits; + uint32_t flag_mask; + }; + struct ras_cper_guid sec_type; + char fru_id[16]; + enum ras_cper_severity severity; + char fru_text[20]; +}; + +struct runtime_hdr { + union { + struct { + uint64_t apic_id : 1; + uint64_t fw_id : 1; + uint64_t err_info_cnt : 6; + uint64_t err_context_cnt : 6; + } valid_bits; + uint64_t valid_mask; + }; + uint64_t apic_id; + char fw_id[48]; +}; + +struct runtime_descriptor { + struct ras_cper_guid error_type; + union { + struct { + uint64_t ms_chk : 1; + uint64_t target_addr_id : 1; + uint64_t req_id : 1; + uint64_t resp_id : 1; + uint64_t instr_ptr : 1; + uint64_t reserved : 59; + } valid_bits; + uint64_t valid_mask; + }; + union { + struct { + uint64_t err_type_valid : 1; + uint64_t pcc_valid : 1; + uint64_t uncorr_valid : 1; + uint64_t precise_ip_valid : 1; + uint64_t restartable_ip_valid : 1; + uint64_t overflow_valid : 1; + uint64_t reserved1 : 10; + uint64_t err_type : 2; + uint64_t pcc : 1; + uint64_t uncorr : 1; + uint64_t precised_ip : 1; + uint64_t restartable_ip : 1; + uint64_t overflow : 1; + uint64_t reserved2 : 41; + } ms_chk_bits; + uint64_t ms_chk_mask; + }; + uint64_t target_addr_id; + uint64_t req_id; + uint64_t resp_id; + uint64_t instr_ptr; +}; + +struct runtime_error_reg { + uint16_t reg_ctx_type; + uint16_t reg_arr_size; + uint32_t msr_addr; + uint64_t mm_reg_addr; + uint64_t reg_dump[RAS_CPER_ACA_REG_COUNT]; +}; + +struct cper_section_runtime { + struct runtime_hdr hdr; + struct runtime_descriptor descriptor; + struct runtime_error_reg reg; +}; + +struct crashdump_hdr { + uint64_t reserved1; + uint64_t reserved2; + char fw_id[48]; + uint64_t reserved3[8]; +}; + +struct fatal_reg_info { + uint64_t status; + uint64_t addr; + uint64_t ipid; + uint64_t synd; +}; + +struct crashdump_fatal { + uint16_t reg_ctx_type; + uint16_t reg_arr_size; + uint32_t reserved1; + uint64_t reserved2; + struct fatal_reg_info reg; +}; + +struct crashdump_boot { + uint16_t reg_ctx_type; + uint16_t reg_arr_size; + uint32_t reserved1; + uint64_t reserved2; + uint64_t msg[CPER_OAM_MAX_COUNT]; +}; + +struct cper_section_fatal { + struct crashdump_hdr hdr; + struct crashdump_fatal data; +}; + +struct cper_section_boot { + struct crashdump_hdr hdr; + struct crashdump_boot data; +}; + +struct ras_cper_fatal_record { + struct cper_section_hdr hdr; + struct cper_section_descriptor descriptor; + struct cper_section_fatal fatal; +}; +#pragma pack(pop) + +#define RAS_HDR_LEN (sizeof(struct cper_section_hdr)) +#define RAS_SEC_DESC_LEN (sizeof(struct cper_sec_desc)) + +#define RAS_BOOT_SEC_LEN (sizeof(struct cper_sec_crashdump_boot)) +#define RAS_FATAL_SEC_LEN (sizeof(struct cper_sec_crashdump_fatal)) +#define RAS_NONSTD_SEC_LEN (sizeof(struct cper_sec_nonstd_err)) + +#define RAS_SEC_DESC_OFFSET(idx) (RAS_HDR_LEN + (RAS_SEC_DESC_LEN * idx)) + +#define RAS_BOOT_SEC_OFFSET(count, idx) \ + (RAS_HDR_LEN + (RAS_SEC_DESC_LEN * count) + (RAS_BOOT_SEC_LEN * idx)) +#define RAS_FATAL_SEC_OFFSET(count, idx) \ + (RAS_HDR_LEN + (RAS_SEC_DESC_LEN * count) + (RAS_FATAL_SEC_LEN * idx)) +#define RAS_NONSTD_SEC_OFFSET(count, idx) \ + (RAS_HDR_LEN + (RAS_SEC_DESC_LEN * count) + (RAS_NONSTD_SEC_LEN * idx)) + +struct ras_core_context; +struct ras_log_info; +int ras_cper_generate_cper(struct ras_core_context *ras_core, + struct ras_log_info **trace_list, uint32_t count, + uint8_t *buf, uint32_t buf_len, uint32_t *real_data_len); +#endif diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_eeprom.c b/drivers/gpu/drm/amd/ras/rascore/ras_eeprom.c new file mode 100644 index 000000000000..cd6b057bdaf3 --- /dev/null +++ b/drivers/gpu/drm/amd/ras/rascore/ras_eeprom.c @@ -0,0 +1,1339 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include "ras_eeprom.h" +#include "ras.h" + +/* These are memory addresses as would be seen by one or more EEPROM + * chips strung on the I2C bus, usually by manipulating pins 1-3 of a + * set of EEPROM devices. They form a continuous memory space. + * + * The I2C device address includes the device type identifier, 1010b, + * which is a reserved value and indicates that this is an I2C EEPROM + * device. It also includes the top 3 bits of the 19 bit EEPROM memory + * address, namely bits 18, 17, and 16. This makes up the 7 bit + * address sent on the I2C bus with bit 0 being the direction bit, + * which is not represented here, and sent by the hardware directly. + * + * For instance, + * 50h = 1010000b => device type identifier 1010b, bits 18:16 = 000b, address 0. + * 54h = 1010100b => --"--, bits 18:16 = 100b, address 40000h. + * 56h = 1010110b => --"--, bits 18:16 = 110b, address 60000h. + * Depending on the size of the I2C EEPROM device(s), bits 18:16 may + * address memory in a device or a device on the I2C bus, depending on + * the status of pins 1-3. + * + * The RAS table lives either at address 0 or address 40000h of EEPROM. + */ +#define EEPROM_I2C_MADDR_0 0x0 +#define EEPROM_I2C_MADDR_4 0x40000 + +#define EEPROM_PAGE_BITS 8 +#define EEPROM_PAGE_SIZE (1U << EEPROM_PAGE_BITS) +#define EEPROM_PAGE_MASK (EEPROM_PAGE_SIZE - 1) + +#define EEPROM_OFFSET_SIZE 2 +#define MAKE_I2C_ADDR(_aa) ((0xA << 3) | (((_aa) >> 16) & 0xF)) + +/* + * The 2 macros bellow represent the actual size in bytes that + * those entities occupy in the EEPROM memory. + * RAS_TABLE_RECORD_SIZE is different than sizeof(eeprom_umc_record) which + * uses uint64 to store 6b fields such as retired_page. + */ +#define RAS_TABLE_HEADER_SIZE 20 +#define RAS_TABLE_RECORD_SIZE 24 + +/* Table hdr is 'AMDR' */ +#define RAS_TABLE_HDR_VAL 0x414d4452 + +/* Bad GPU tag ‘BADG’ */ +#define RAS_TABLE_HDR_BAD 0x42414447 + +/* + * EEPROM Table structure v1 + * --------------------------------- + * | | + * | EEPROM TABLE HEADER | + * | ( size 20 Bytes ) | + * | | + * --------------------------------- + * | | + * | BAD PAGE RECORD AREA | + * | | + * --------------------------------- + */ + +/* Assume 2-Mbit size EEPROM and take up the whole space. */ +#define RAS_TBL_SIZE_BYTES (256 * 1024) +#define RAS_TABLE_START 0 +#define RAS_HDR_START RAS_TABLE_START +#define RAS_RECORD_START (RAS_HDR_START + RAS_TABLE_HEADER_SIZE) +#define RAS_MAX_RECORD_COUNT ((RAS_TBL_SIZE_BYTES - RAS_TABLE_HEADER_SIZE) \ + / RAS_TABLE_RECORD_SIZE) + +/* + * EEPROM Table structrue v2.1 + * --------------------------------- + * | | + * | EEPROM TABLE HEADER | + * | ( size 20 Bytes ) | + * | | + * --------------------------------- + * | | + * | EEPROM TABLE RAS INFO | + * | (available info size 4 Bytes) | + * | ( reserved size 252 Bytes ) | + * | | + * --------------------------------- + * | | + * | BAD PAGE RECORD AREA | + * | | + * --------------------------------- + */ + +/* EEPROM Table V2_1 */ +#define RAS_TABLE_V2_1_INFO_SIZE 256 +#define RAS_TABLE_V2_1_INFO_START RAS_TABLE_HEADER_SIZE +#define RAS_RECORD_START_V2_1 (RAS_HDR_START + RAS_TABLE_HEADER_SIZE + \ + RAS_TABLE_V2_1_INFO_SIZE) +#define RAS_MAX_RECORD_COUNT_V2_1 ((RAS_TBL_SIZE_BYTES - RAS_TABLE_HEADER_SIZE - \ + RAS_TABLE_V2_1_INFO_SIZE) \ + / RAS_TABLE_RECORD_SIZE) + +/* Given a zero-based index of an EEPROM RAS record, yields the EEPROM + * offset off of RAS_TABLE_START. That is, this is something you can + * add to control->i2c_address, and then tell I2C layer to read + * from/write to there. _N is the so called absolute index, + * because it starts right after the table header. + */ +#define RAS_INDEX_TO_OFFSET(_C, _N) ((_C)->ras_record_offset + \ + (_N) * RAS_TABLE_RECORD_SIZE) + +#define RAS_OFFSET_TO_INDEX(_C, _O) (((_O) - \ + (_C)->ras_record_offset) / RAS_TABLE_RECORD_SIZE) + +/* Given a 0-based relative record index, 0, 1, 2, ..., etc., off + * of "fri", return the absolute record index off of the end of + * the table header. + */ +#define RAS_RI_TO_AI(_C, _I) (((_I) + (_C)->ras_fri) % \ + (_C)->ras_max_record_count) + +#define RAS_NUM_RECS(_tbl_hdr) (((_tbl_hdr)->tbl_size - \ + RAS_TABLE_HEADER_SIZE) / RAS_TABLE_RECORD_SIZE) + +#define RAS_NUM_RECS_V2_1(_tbl_hdr) (((_tbl_hdr)->tbl_size - \ + RAS_TABLE_HEADER_SIZE - \ + RAS_TABLE_V2_1_INFO_SIZE) / RAS_TABLE_RECORD_SIZE) + +#define to_ras_core_context(x) (container_of(x, struct ras_core_context, ras_eeprom)) + +static bool __is_ras_eeprom_supported(struct ras_core_context *ras_core) +{ + return ras_core->ras_eeprom_supported; +} + +static bool __get_eeprom_i2c_addr(struct ras_core_context *ras_core, + struct ras_eeprom_control *control) +{ + int ret = -EINVAL; + + if (control->sys_func && + control->sys_func->update_eeprom_i2c_config) + ret = control->sys_func->update_eeprom_i2c_config(ras_core); + else + RAS_DEV_WARN(ras_core->dev, + "No eeprom i2c system config!\n"); + + return !ret ? true : false; +} + +static int __ras_eeprom_xfer(struct ras_core_context *ras_core, u32 eeprom_addr, + u8 *eeprom_buf, u32 buf_size, bool read) +{ + struct ras_eeprom_control *control = &ras_core->ras_eeprom; + int ret; + + if (control->sys_func && control->sys_func->eeprom_i2c_xfer) { + ret = control->sys_func->eeprom_i2c_xfer(ras_core, + eeprom_addr, eeprom_buf, buf_size, read); + + if ((ret > 0) && !read) { + /* According to EEPROM specs the length of the + * self-writing cycle, tWR (tW), is 10 ms. + * + * TODO: Use polling on ACK, aka Acknowledge + * Polling, to minimize waiting for the + * internal write cycle to complete, as it is + * usually smaller than tWR (tW). + */ + msleep(10); + } + + return ret; + } + + RAS_DEV_ERR(ras_core->dev, "Error: No eeprom i2c system xfer function!\n"); + return -EINVAL; +} + +static int __eeprom_xfer(struct ras_core_context *ras_core, u32 eeprom_addr, + u8 *eeprom_buf, u32 buf_size, bool read) +{ + u16 limit; + u16 ps; /* Partial size */ + int res = 0, r; + + if (read) + limit = ras_core->ras_eeprom.max_read_len; + else + limit = ras_core->ras_eeprom.max_write_len; + + if (limit && (limit <= EEPROM_OFFSET_SIZE)) { + RAS_DEV_ERR(ras_core->dev, + "maddr:0x%04X size:0x%02X:quirk max_%s_len must be > %d", + eeprom_addr, buf_size, + read ? "read" : "write", EEPROM_OFFSET_SIZE); + return -EINVAL; + } + + ras_core_down_gpu_reset_lock(ras_core); + + if (limit == 0) { + res = __ras_eeprom_xfer(ras_core, eeprom_addr, + eeprom_buf, buf_size, read); + } else { + /* The "limit" includes all data bytes sent/received, + * which would include the EEPROM_OFFSET_SIZE bytes. + * Account for them here. + */ + limit -= EEPROM_OFFSET_SIZE; + for ( ; buf_size > 0; + buf_size -= ps, eeprom_addr += ps, eeprom_buf += ps) { + ps = (buf_size < limit) ? buf_size : limit; + + r = __ras_eeprom_xfer(ras_core, eeprom_addr, + eeprom_buf, ps, read); + if (r < 0) + break; + + res += r; + } + } + + ras_core_up_gpu_reset_lock(ras_core); + + return res; +} + +static int __eeprom_read(struct ras_core_context *ras_core, + u32 eeprom_addr, u8 *eeprom_buf, u32 bytes) +{ + return __eeprom_xfer(ras_core, eeprom_addr, + eeprom_buf, bytes, true); +} + +static int __eeprom_write(struct ras_core_context *ras_core, + u32 eeprom_addr, u8 *eeprom_buf, u32 bytes) +{ + return __eeprom_xfer(ras_core, eeprom_addr, + eeprom_buf, bytes, false); +} + +static void +__encode_table_header_to_buf(struct ras_eeprom_table_header *hdr, + unsigned char *buf) +{ + u32 *pp = (uint32_t *)buf; + + pp[0] = cpu_to_le32(hdr->header); + pp[1] = cpu_to_le32(hdr->version); + pp[2] = cpu_to_le32(hdr->first_rec_offset); + pp[3] = cpu_to_le32(hdr->tbl_size); + pp[4] = cpu_to_le32(hdr->checksum); +} + +static void +__decode_table_header_from_buf(struct ras_eeprom_table_header *hdr, + unsigned char *buf) +{ + u32 *pp = (uint32_t *)buf; + + hdr->header = le32_to_cpu(pp[0]); + hdr->version = le32_to_cpu(pp[1]); + hdr->first_rec_offset = le32_to_cpu(pp[2]); + hdr->tbl_size = le32_to_cpu(pp[3]); + hdr->checksum = le32_to_cpu(pp[4]); +} + +static int __write_table_header(struct ras_eeprom_control *control) +{ + u8 buf[RAS_TABLE_HEADER_SIZE]; + struct ras_core_context *ras_core = to_ras_core_context(control); + int res; + + memset(buf, 0, sizeof(buf)); + __encode_table_header_to_buf(&control->tbl_hdr, buf); + + /* i2c may be unstable in gpu reset */ + res = __eeprom_write(ras_core, + control->i2c_address + + control->ras_header_offset, + buf, RAS_TABLE_HEADER_SIZE); + + if (res < 0) { + RAS_DEV_ERR(ras_core->dev, + "Failed to write EEPROM table header:%d\n", res); + } else if (res < RAS_TABLE_HEADER_SIZE) { + RAS_DEV_ERR(ras_core->dev, + "Short write:%d out of %d\n", res, RAS_TABLE_HEADER_SIZE); + res = -EIO; + } else { + res = 0; + } + + return res; +} + +static void +__encode_table_ras_info_to_buf(struct ras_eeprom_table_ras_info *rai, + unsigned char *buf) +{ + u32 *pp = (uint32_t *)buf; + u32 tmp; + + tmp = ((uint32_t)(rai->rma_status) & 0xFF) | + (((uint32_t)(rai->health_percent) << 8) & 0xFF00) | + (((uint32_t)(rai->ecc_page_threshold) << 16) & 0xFFFF0000); + pp[0] = cpu_to_le32(tmp); +} + +static void +__decode_table_ras_info_from_buf(struct ras_eeprom_table_ras_info *rai, + unsigned char *buf) +{ + u32 *pp = (uint32_t *)buf; + u32 tmp; + + tmp = le32_to_cpu(pp[0]); + rai->rma_status = tmp & 0xFF; + rai->health_percent = (tmp >> 8) & 0xFF; + rai->ecc_page_threshold = (tmp >> 16) & 0xFFFF; +} + +static int __write_table_ras_info(struct ras_eeprom_control *control) +{ + struct ras_core_context *ras_core = to_ras_core_context(control); + u8 *buf; + int res; + + buf = kzalloc(RAS_TABLE_V2_1_INFO_SIZE, GFP_KERNEL); + if (!buf) { + RAS_DEV_ERR(ras_core->dev, + "Failed to alloc buf to write table ras info\n"); + return -ENOMEM; + } + + __encode_table_ras_info_to_buf(&control->tbl_rai, buf); + + /* i2c may be unstable in gpu reset */ + res = __eeprom_write(ras_core, + control->i2c_address + + control->ras_info_offset, + buf, RAS_TABLE_V2_1_INFO_SIZE); + + if (res < 0) { + RAS_DEV_ERR(ras_core->dev, + "Failed to write EEPROM table ras info:%d\n", res); + } else if (res < RAS_TABLE_V2_1_INFO_SIZE) { + RAS_DEV_ERR(ras_core->dev, + "Short write:%d out of %d\n", res, RAS_TABLE_V2_1_INFO_SIZE); + res = -EIO; + } else { + res = 0; + } + + kfree(buf); + + return res; +} + +static u8 __calc_hdr_byte_sum(const struct ras_eeprom_control *control) +{ + int ii; + u8 *pp, csum; + u32 sz; + + /* Header checksum, skip checksum field in the calculation */ + sz = sizeof(control->tbl_hdr) - sizeof(control->tbl_hdr.checksum); + pp = (u8 *) &control->tbl_hdr; + csum = 0; + for (ii = 0; ii < sz; ii++, pp++) + csum += *pp; + + return csum; +} + +static u8 __calc_ras_info_byte_sum(const struct ras_eeprom_control *control) +{ + int ii; + u8 *pp, csum; + u32 sz; + + sz = sizeof(control->tbl_rai); + pp = (u8 *) &control->tbl_rai; + csum = 0; + for (ii = 0; ii < sz; ii++, pp++) + csum += *pp; + + return csum; +} + +static int ras_eeprom_correct_header_tag( + struct ras_eeprom_control *control, + uint32_t header) +{ + struct ras_eeprom_table_header *hdr = &control->tbl_hdr; + u8 *hh; + int res; + u8 csum; + + csum = -hdr->checksum; + + hh = (void *) &hdr->header; + csum -= (hh[0] + hh[1] + hh[2] + hh[3]); + hh = (void *) &header; + csum += hh[0] + hh[1] + hh[2] + hh[3]; + csum = -csum; + mutex_lock(&control->ras_tbl_mutex); + hdr->header = header; + hdr->checksum = csum; + res = __write_table_header(control); + mutex_unlock(&control->ras_tbl_mutex); + + return res; +} + +static void ras_set_eeprom_table_version(struct ras_eeprom_control *control) +{ + struct ras_eeprom_table_header *hdr = &control->tbl_hdr; + + hdr->version = RAS_TABLE_VER_V3; +} + +int ras_eeprom_reset_table(struct ras_core_context *ras_core) +{ + struct ras_eeprom_control *control = &ras_core->ras_eeprom; + struct ras_eeprom_table_header *hdr = &control->tbl_hdr; + struct ras_eeprom_table_ras_info *rai = &control->tbl_rai; + u8 csum; + int res; + + mutex_lock(&control->ras_tbl_mutex); + + hdr->header = RAS_TABLE_HDR_VAL; + ras_set_eeprom_table_version(control); + + if (hdr->version >= RAS_TABLE_VER_V2_1) { + hdr->first_rec_offset = RAS_RECORD_START_V2_1; + hdr->tbl_size = RAS_TABLE_HEADER_SIZE + + RAS_TABLE_V2_1_INFO_SIZE; + rai->rma_status = RAS_GPU_HEALTH_USABLE; + /** + * GPU health represented as a percentage. + * 0 means worst health, 100 means fully health. + */ + rai->health_percent = 100; + /* ecc_page_threshold = 0 means disable bad page retirement */ + rai->ecc_page_threshold = control->record_threshold_count; + } else { + hdr->first_rec_offset = RAS_RECORD_START; + hdr->tbl_size = RAS_TABLE_HEADER_SIZE; + } + + csum = __calc_hdr_byte_sum(control); + if (hdr->version >= RAS_TABLE_VER_V2_1) + csum += __calc_ras_info_byte_sum(control); + csum = -csum; + hdr->checksum = csum; + res = __write_table_header(control); + if (!res && hdr->version > RAS_TABLE_VER_V1) + res = __write_table_ras_info(control); + + control->ras_num_recs = 0; + control->ras_fri = 0; + + control->bad_channel_bitmap = 0; + ras_core_event_notify(ras_core, RAS_EVENT_ID__UPDATE_BAD_PAGE_NUM, + &control->ras_num_recs); + ras_core_event_notify(ras_core, RAS_EVENT_ID__UPDATE_BAD_CHANNEL_BITMAP, + &control->bad_channel_bitmap); + control->update_channel_flag = false; + + mutex_unlock(&control->ras_tbl_mutex); + + return res; +} + +static void +__encode_table_record_to_buf(struct ras_eeprom_control *control, + struct eeprom_umc_record *record, + unsigned char *buf) +{ + __le64 tmp = 0; + int i = 0; + + /* Next are all record fields according to EEPROM page spec in LE foramt */ + buf[i++] = record->err_type; + + buf[i++] = record->bank; + + tmp = cpu_to_le64(record->ts); + memcpy(buf + i, &tmp, 8); + i += 8; + + tmp = cpu_to_le64((record->offset & 0xffffffffffff)); + memcpy(buf + i, &tmp, 6); + i += 6; + + buf[i++] = record->mem_channel; + buf[i++] = record->mcumc_id; + + tmp = cpu_to_le64((record->retired_row_pfn & 0xffffffffffff)); + memcpy(buf + i, &tmp, 6); +} + +static void +__decode_table_record_from_buf(struct ras_eeprom_control *control, + struct eeprom_umc_record *record, + unsigned char *buf) +{ + __le64 tmp = 0; + int i = 0; + + /* Next are all record fields according to EEPROM page spec in LE foramt */ + record->err_type = buf[i++]; + + record->bank = buf[i++]; + + memcpy(&tmp, buf + i, 8); + record->ts = le64_to_cpu(tmp); + i += 8; + + memcpy(&tmp, buf + i, 6); + record->offset = (le64_to_cpu(tmp) & 0xffffffffffff); + i += 6; + + record->mem_channel = buf[i++]; + record->mcumc_id = buf[i++]; + + memcpy(&tmp, buf + i, 6); + record->retired_row_pfn = (le64_to_cpu(tmp) & 0xffffffffffff); +} + +bool ras_eeprom_check_safety_watermark(struct ras_core_context *ras_core) +{ + struct ras_eeprom_control *control = &ras_core->ras_eeprom; + bool ret = false; + int bad_page_count; + + if (!__is_ras_eeprom_supported(ras_core) || + !control->record_threshold_config) + return false; + + bad_page_count = ras_umc_get_badpage_count(ras_core); + if (control->tbl_hdr.header == RAS_TABLE_HDR_BAD) { + if (bad_page_count > control->record_threshold_count) + RAS_DEV_WARN(ras_core->dev, "RAS records:%d exceed threshold:%d", + bad_page_count, control->record_threshold_count); + + if ((control->record_threshold_config == WARN_NONSTOP_OVER_THRESHOLD) || + (control->record_threshold_config == NONSTOP_OVER_THRESHOLD)) { + RAS_DEV_WARN(ras_core->dev, + "Please consult AMD Service Action Guide (SAG) for appropriate service procedures.\n"); + ret = false; + } else { + ras_core->is_rma = true; + RAS_DEV_WARN(ras_core->dev, + "Please consider adjusting the customized threshold.\n"); + ret = true; + } + } + + return ret; +} + +/** + * __ras_eeprom_write -- write indexed from buffer to EEPROM + * @control: pointer to control structure + * @buf: pointer to buffer containing data to write + * @fri: start writing at this index + * @num: number of records to write + * + * The caller must hold the table mutex in @control. + * Return 0 on success, -errno otherwise. + */ +static int __ras_eeprom_write(struct ras_eeprom_control *control, + u8 *buf, const u32 fri, const u32 num) +{ + struct ras_core_context *ras_core = to_ras_core_context(control); + u32 buf_size; + int res; + + /* i2c may be unstable in gpu reset */ + buf_size = num * RAS_TABLE_RECORD_SIZE; + res = __eeprom_write(ras_core, + control->i2c_address + RAS_INDEX_TO_OFFSET(control, fri), + buf, buf_size); + if (res < 0) { + RAS_DEV_ERR(ras_core->dev, + "Writing %d EEPROM table records error:%d\n", num, res); + } else if (res < buf_size) { + /* Short write, return error.*/ + RAS_DEV_ERR(ras_core->dev, + "Wrote %d records out of %d\n", + (res/RAS_TABLE_RECORD_SIZE), num); + res = -EIO; + } else { + res = 0; + } + + return res; +} + +static int ras_eeprom_append_table(struct ras_eeprom_control *control, + struct eeprom_umc_record *record, + const u32 num) +{ + u32 a, b, i; + u8 *buf, *pp; + int res; + + buf = kcalloc(num, RAS_TABLE_RECORD_SIZE, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + /* Encode all of them in one go. + */ + pp = buf; + for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) { + __encode_table_record_to_buf(control, &record[i], pp); + + /* update bad channel bitmap */ + if ((record[i].mem_channel < BITS_PER_TYPE(control->bad_channel_bitmap)) && + !(control->bad_channel_bitmap & (1 << record[i].mem_channel))) { + control->bad_channel_bitmap |= 1 << record[i].mem_channel; + control->update_channel_flag = true; + } + } + + /* a, first record index to write into. + * b, last record index to write into. + * a = first index to read (fri) + number of records in the table, + * b = a + @num - 1. + * Let N = control->ras_max_num_record_count, then we have, + * case 0: 0 <= a <= b < N, + * just append @num records starting at a; + * case 1: 0 <= a < N <= b, + * append (N - a) records starting at a, and + * append the remainder, b % N + 1, starting at 0. + * case 2: 0 <= fri < N <= a <= b, then modulo N we get two subcases, + * case 2a: 0 <= a <= b < N + * append num records starting at a; and fix fri if b overwrote it, + * and since a <= b, if b overwrote it then a must've also, + * and if b didn't overwrite it, then a didn't also. + * case 2b: 0 <= b < a < N + * write num records starting at a, which wraps around 0=N + * and overwrite fri unconditionally. Now from case 2a, + * this means that b eclipsed fri to overwrite it and wrap + * around 0 again, i.e. b = 2N+r pre modulo N, so we unconditionally + * set fri = b + 1 (mod N). + * Now, since fri is updated in every case, except the trivial case 0, + * the number of records present in the table after writing, is, + * num_recs - 1 = b - fri (mod N), and we take the positive value, + * by adding an arbitrary multiple of N before taking the modulo N + * as shown below. + */ + a = control->ras_fri + control->ras_num_recs; + b = a + num - 1; + if (b < control->ras_max_record_count) { + res = __ras_eeprom_write(control, buf, a, num); + } else if (a < control->ras_max_record_count) { + u32 g0, g1; + + g0 = control->ras_max_record_count - a; + g1 = b % control->ras_max_record_count + 1; + res = __ras_eeprom_write(control, buf, a, g0); + if (res) + goto Out; + res = __ras_eeprom_write(control, + buf + g0 * RAS_TABLE_RECORD_SIZE, + 0, g1); + if (res) + goto Out; + if (g1 > control->ras_fri) + control->ras_fri = g1 % control->ras_max_record_count; + } else { + a %= control->ras_max_record_count; + b %= control->ras_max_record_count; + + if (a <= b) { + /* Note that, b - a + 1 = num. */ + res = __ras_eeprom_write(control, buf, a, num); + if (res) + goto Out; + if (b >= control->ras_fri) + control->ras_fri = (b + 1) % control->ras_max_record_count; + } else { + u32 g0, g1; + + /* b < a, which means, we write from + * a to the end of the table, and from + * the start of the table to b. + */ + g0 = control->ras_max_record_count - a; + g1 = b + 1; + res = __ras_eeprom_write(control, buf, a, g0); + if (res) + goto Out; + res = __ras_eeprom_write(control, + buf + g0 * RAS_TABLE_RECORD_SIZE, 0, g1); + if (res) + goto Out; + control->ras_fri = g1 % control->ras_max_record_count; + } + } + control->ras_num_recs = 1 + + (control->ras_max_record_count + b - control->ras_fri) + % control->ras_max_record_count; +Out: + kfree(buf); + return res; +} + +static int ras_eeprom_update_header(struct ras_eeprom_control *control) +{ + struct ras_core_context *ras_core = to_ras_core_context(control); + int threshold_config = control->record_threshold_config; + u8 *buf, *pp, csum; + u32 buf_size; + int bad_page_count; + int res; + + bad_page_count = ras_umc_get_badpage_count(ras_core); + /* Modify the header if it exceeds. + */ + if (threshold_config != 0 && + bad_page_count > control->record_threshold_count) { + RAS_DEV_WARN(ras_core->dev, + "Saved bad pages %d reaches threshold value %d\n", + bad_page_count, control->record_threshold_count); + control->tbl_hdr.header = RAS_TABLE_HDR_BAD; + if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) { + control->tbl_rai.rma_status = RAS_GPU_RETIRED__ECC_REACH_THRESHOLD; + control->tbl_rai.health_percent = 0; + } + + if ((threshold_config != WARN_NONSTOP_OVER_THRESHOLD) && + (threshold_config != NONSTOP_OVER_THRESHOLD)) + ras_core->is_rma = true; + + /* ignore the -ENOTSUPP return value */ + ras_core_event_notify(ras_core, RAS_EVENT_ID__DEVICE_RMA, NULL); + } + + if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) + control->tbl_hdr.tbl_size = RAS_TABLE_HEADER_SIZE + + RAS_TABLE_V2_1_INFO_SIZE + + control->ras_num_recs * RAS_TABLE_RECORD_SIZE; + else + control->tbl_hdr.tbl_size = RAS_TABLE_HEADER_SIZE + + control->ras_num_recs * RAS_TABLE_RECORD_SIZE; + control->tbl_hdr.checksum = 0; + + buf_size = control->ras_num_recs * RAS_TABLE_RECORD_SIZE; + buf = kcalloc(control->ras_num_recs, RAS_TABLE_RECORD_SIZE, GFP_KERNEL); + if (!buf) { + RAS_DEV_ERR(ras_core->dev, + "allocating memory for table of size %d bytes failed\n", + control->tbl_hdr.tbl_size); + res = -ENOMEM; + goto Out; + } + + res = __eeprom_read(ras_core, + control->i2c_address + + control->ras_record_offset, + buf, buf_size); + if (res < 0) { + RAS_DEV_ERR(ras_core->dev, + "EEPROM failed reading records:%d\n", res); + goto Out; + } else if (res < buf_size) { + RAS_DEV_ERR(ras_core->dev, + "EEPROM read %d out of %d bytes\n", res, buf_size); + res = -EIO; + goto Out; + } + + /** + * bad page records have been stored in eeprom, + * now calculate gpu health percent + */ + if (threshold_config != 0 && + control->tbl_hdr.version >= RAS_TABLE_VER_V2_1 && + bad_page_count <= control->record_threshold_count) + control->tbl_rai.health_percent = ((control->record_threshold_count - + bad_page_count) * 100) / control->record_threshold_count; + + /* Recalc the checksum. + */ + csum = 0; + for (pp = buf; pp < buf + buf_size; pp++) + csum += *pp; + + csum += __calc_hdr_byte_sum(control); + if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) + csum += __calc_ras_info_byte_sum(control); + /* avoid sign extension when assigning to "checksum" */ + csum = -csum; + control->tbl_hdr.checksum = csum; + res = __write_table_header(control); + if (!res && control->tbl_hdr.version > RAS_TABLE_VER_V1) + res = __write_table_ras_info(control); +Out: + kfree(buf); + return res; +} + +/** + * ras_core_eeprom_append -- append records to the EEPROM RAS table + * @control: pointer to control structure + * @record: array of records to append + * @num: number of records in @record array + * + * Append @num records to the table, calculate the checksum and write + * the table back to EEPROM. The maximum number of records that + * can be appended is between 1 and control->ras_max_record_count, + * regardless of how many records are already stored in the table. + * + * Return 0 on success or if EEPROM is not supported, -errno on error. + */ +int ras_eeprom_append(struct ras_core_context *ras_core, + struct eeprom_umc_record *record, const u32 num) +{ + struct ras_eeprom_control *control = &ras_core->ras_eeprom; + int res; + + if (!__is_ras_eeprom_supported(ras_core)) + return 0; + + if (num == 0) { + RAS_DEV_ERR(ras_core->dev, "will not append 0 records\n"); + return -EINVAL; + } else if ((num + control->ras_num_recs) > control->ras_max_record_count) { + RAS_DEV_ERR(ras_core->dev, + "cannot append %d records than the size of table %d\n", + num, control->ras_max_record_count); + return -EINVAL; + } + + mutex_lock(&control->ras_tbl_mutex); + res = ras_eeprom_append_table(control, record, num); + if (!res) + res = ras_eeprom_update_header(control); + + mutex_unlock(&control->ras_tbl_mutex); + + return res; +} + +/** + * __ras_eeprom_read -- read indexed from EEPROM into buffer + * @control: pointer to control structure + * @buf: pointer to buffer to read into + * @fri: first record index, start reading at this index, absolute index + * @num: number of records to read + * + * The caller must hold the table mutex in @control. + * Return 0 on success, -errno otherwise. + */ +static int __ras_eeprom_read(struct ras_eeprom_control *control, + u8 *buf, const u32 fri, const u32 num) +{ + struct ras_core_context *ras_core = to_ras_core_context(control); + u32 buf_size; + int res; + + /* i2c may be unstable in gpu reset */ + buf_size = num * RAS_TABLE_RECORD_SIZE; + res = __eeprom_read(ras_core, + control->i2c_address + + RAS_INDEX_TO_OFFSET(control, fri), + buf, buf_size); + if (res < 0) { + RAS_DEV_ERR(ras_core->dev, + "Reading %d EEPROM table records error:%d\n", num, res); + } else if (res < buf_size) { + /* Short read, return error. + */ + RAS_DEV_ERR(ras_core->dev, + "Read %d records out of %d\n", + (res/RAS_TABLE_RECORD_SIZE), num); + res = -EIO; + } else { + res = 0; + } + + return res; +} + +int ras_eeprom_read(struct ras_core_context *ras_core, + struct eeprom_umc_record *record, const u32 num) +{ + struct ras_eeprom_control *control = &ras_core->ras_eeprom; + int i, res; + u8 *buf, *pp; + u32 g0, g1; + + if (!__is_ras_eeprom_supported(ras_core)) + return 0; + + if (num == 0) { + RAS_DEV_ERR(ras_core->dev, "will not read 0 records\n"); + return -EINVAL; + } else if (num > control->ras_num_recs) { + RAS_DEV_ERR(ras_core->dev, + "too many records to read:%d available:%d\n", + num, control->ras_num_recs); + return -EINVAL; + } + + buf = kcalloc(num, RAS_TABLE_RECORD_SIZE, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + /* Determine how many records to read, from the first record + * index, fri, to the end of the table, and from the beginning + * of the table, such that the total number of records is + * @num, and we handle wrap around when fri > 0 and + * fri + num > RAS_MAX_RECORD_COUNT. + * + * First we compute the index of the last element + * which would be fetched from each region, + * g0 is in [fri, fri + num - 1], and + * g1 is in [0, RAS_MAX_RECORD_COUNT - 1]. + * Then, if g0 < RAS_MAX_RECORD_COUNT, the index of + * the last element to fetch, we set g0 to _the number_ + * of elements to fetch, @num, since we know that the last + * indexed to be fetched does not exceed the table. + * + * If, however, g0 >= RAS_MAX_RECORD_COUNT, then + * we set g0 to the number of elements to read + * until the end of the table, and g1 to the number of + * elements to read from the beginning of the table. + */ + g0 = control->ras_fri + num - 1; + g1 = g0 % control->ras_max_record_count; + if (g0 < control->ras_max_record_count) { + g0 = num; + g1 = 0; + } else { + g0 = control->ras_max_record_count - control->ras_fri; + g1 += 1; + } + + mutex_lock(&control->ras_tbl_mutex); + res = __ras_eeprom_read(control, buf, control->ras_fri, g0); + if (res) + goto Out; + if (g1) { + res = __ras_eeprom_read(control, + buf + g0 * RAS_TABLE_RECORD_SIZE, 0, g1); + if (res) + goto Out; + } + + res = 0; + + /* Read up everything? Then transform. + */ + pp = buf; + for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) { + __decode_table_record_from_buf(control, &record[i], pp); + + /* update bad channel bitmap */ + if ((record[i].mem_channel < BITS_PER_TYPE(control->bad_channel_bitmap)) && + !(control->bad_channel_bitmap & (1 << record[i].mem_channel))) { + control->bad_channel_bitmap |= 1 << record[i].mem_channel; + control->update_channel_flag = true; + } + } +Out: + kfree(buf); + mutex_unlock(&control->ras_tbl_mutex); + + return res; +} + +uint32_t ras_eeprom_max_record_count(struct ras_core_context *ras_core) +{ + struct ras_eeprom_control *control = &ras_core->ras_eeprom; + + /* get available eeprom table version first before eeprom table init */ + ras_set_eeprom_table_version(control); + + if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) + return RAS_MAX_RECORD_COUNT_V2_1; + else + return RAS_MAX_RECORD_COUNT; +} + +/** + * __verify_ras_table_checksum -- verify the RAS EEPROM table checksum + * @control: pointer to control structure + * + * Check the checksum of the stored in EEPROM RAS table. + * + * Return 0 if the checksum is correct, + * positive if it is not correct, and + * -errno on I/O error. + */ +static int __verify_ras_table_checksum(struct ras_eeprom_control *control) +{ + struct ras_core_context *ras_core = to_ras_core_context(control); + int buf_size, res; + u8 csum, *buf, *pp; + + if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) + buf_size = RAS_TABLE_HEADER_SIZE + + RAS_TABLE_V2_1_INFO_SIZE + + control->ras_num_recs * RAS_TABLE_RECORD_SIZE; + else + buf_size = RAS_TABLE_HEADER_SIZE + + control->ras_num_recs * RAS_TABLE_RECORD_SIZE; + + buf = kzalloc(buf_size, GFP_KERNEL); + if (!buf) { + RAS_DEV_ERR(ras_core->dev, + "Out of memory checking RAS table checksum.\n"); + return -ENOMEM; + } + + res = __eeprom_read(ras_core, + control->i2c_address + + control->ras_header_offset, + buf, buf_size); + if (res < buf_size) { + RAS_DEV_ERR(ras_core->dev, + "Partial read for checksum, res:%d\n", res); + /* On partial reads, return -EIO. + */ + if (res >= 0) + res = -EIO; + goto Out; + } + + csum = 0; + for (pp = buf; pp < buf + buf_size; pp++) + csum += *pp; +Out: + kfree(buf); + return res < 0 ? res : csum; +} + +static int __read_table_ras_info(struct ras_eeprom_control *control) +{ + struct ras_eeprom_table_ras_info *rai = &control->tbl_rai; + struct ras_core_context *ras_core = to_ras_core_context(control); + unsigned char *buf; + int res; + + buf = kzalloc(RAS_TABLE_V2_1_INFO_SIZE, GFP_KERNEL); + if (!buf) { + RAS_DEV_ERR(ras_core->dev, + "Failed to alloc buf to read EEPROM table ras info\n"); + return -ENOMEM; + } + + /** + * EEPROM table V2_1 supports ras info, + * read EEPROM table ras info + */ + res = __eeprom_read(ras_core, + control->i2c_address + control->ras_info_offset, + buf, RAS_TABLE_V2_1_INFO_SIZE); + if (res < RAS_TABLE_V2_1_INFO_SIZE) { + RAS_DEV_ERR(ras_core->dev, + "Failed to read EEPROM table ras info, res:%d\n", res); + res = res >= 0 ? -EIO : res; + goto Out; + } + + __decode_table_ras_info_from_buf(rai, buf); + +Out: + kfree(buf); + return res == RAS_TABLE_V2_1_INFO_SIZE ? 0 : res; +} + +static int __check_ras_table_status(struct ras_core_context *ras_core) +{ + struct ras_eeprom_control *control = &ras_core->ras_eeprom; + unsigned char buf[RAS_TABLE_HEADER_SIZE] = { 0 }; + struct ras_eeprom_table_header *hdr; + int res; + + hdr = &control->tbl_hdr; + + if (!__is_ras_eeprom_supported(ras_core)) + return 0; + + if (!__get_eeprom_i2c_addr(ras_core, control)) + return -EINVAL; + + control->ras_header_offset = RAS_HDR_START; + control->ras_info_offset = RAS_TABLE_V2_1_INFO_START; + mutex_init(&control->ras_tbl_mutex); + + /* Read the table header from EEPROM address */ + res = __eeprom_read(ras_core, + control->i2c_address + control->ras_header_offset, + buf, RAS_TABLE_HEADER_SIZE); + if (res < RAS_TABLE_HEADER_SIZE) { + RAS_DEV_ERR(ras_core->dev, + "Failed to read EEPROM table header, res:%d\n", res); + return res >= 0 ? -EIO : res; + } + + __decode_table_header_from_buf(hdr, buf); + + if (hdr->header != RAS_TABLE_HDR_VAL && + hdr->header != RAS_TABLE_HDR_BAD) { + RAS_DEV_INFO(ras_core->dev, "Creating a new EEPROM table"); + return ras_eeprom_reset_table(ras_core); + } + + switch (hdr->version) { + case RAS_TABLE_VER_V2_1: + case RAS_TABLE_VER_V3: + control->ras_num_recs = RAS_NUM_RECS_V2_1(hdr); + control->ras_record_offset = RAS_RECORD_START_V2_1; + control->ras_max_record_count = RAS_MAX_RECORD_COUNT_V2_1; + break; + case RAS_TABLE_VER_V1: + control->ras_num_recs = RAS_NUM_RECS(hdr); + control->ras_record_offset = RAS_RECORD_START; + control->ras_max_record_count = RAS_MAX_RECORD_COUNT; + break; + default: + RAS_DEV_ERR(ras_core->dev, + "RAS header invalid, unsupported version: %u", + hdr->version); + return -EINVAL; + } + + if (control->ras_num_recs > control->ras_max_record_count) { + RAS_DEV_ERR(ras_core->dev, + "RAS header invalid, records in header: %u max allowed :%u", + control->ras_num_recs, control->ras_max_record_count); + return -EINVAL; + } + + control->ras_fri = RAS_OFFSET_TO_INDEX(control, hdr->first_rec_offset); + + return 0; +} + +int ras_eeprom_check_storage_status(struct ras_core_context *ras_core) +{ + struct ras_eeprom_control *control = &ras_core->ras_eeprom; + struct ras_eeprom_table_header *hdr; + int bad_page_count; + int res = 0; + + if (!__is_ras_eeprom_supported(ras_core)) + return 0; + + if (!__get_eeprom_i2c_addr(ras_core, control)) + return -EINVAL; + + hdr = &control->tbl_hdr; + + bad_page_count = ras_umc_get_badpage_count(ras_core); + if (hdr->header == RAS_TABLE_HDR_VAL) { + RAS_DEV_INFO(ras_core->dev, + "Found existing EEPROM table with %d records\n", + bad_page_count); + + if (hdr->version >= RAS_TABLE_VER_V2_1) { + res = __read_table_ras_info(control); + if (res) + return res; + } + + res = __verify_ras_table_checksum(control); + if (res) + RAS_DEV_ERR(ras_core->dev, + "RAS table incorrect checksum or error:%d\n", res); + + /* Warn if we are at 90% of the threshold or above + */ + if (10 * bad_page_count >= 9 * control->record_threshold_count) + RAS_DEV_WARN(ras_core->dev, + "RAS records:%u exceeds 90%% of threshold:%d\n", + bad_page_count, + control->record_threshold_count); + + } else if (hdr->header == RAS_TABLE_HDR_BAD && + control->record_threshold_config != 0) { + if (hdr->version >= RAS_TABLE_VER_V2_1) { + res = __read_table_ras_info(control); + if (res) + return res; + } + + res = __verify_ras_table_checksum(control); + if (res) + RAS_DEV_ERR(ras_core->dev, + "RAS Table incorrect checksum or error:%d\n", res); + + if (control->record_threshold_count >= bad_page_count) { + /* This means that, the threshold was increased since + * the last time the system was booted, and now, + * ras->record_threshold_count - control->num_recs > 0, + * so that at least one more record can be saved, + * before the page count threshold is reached. + */ + RAS_DEV_INFO(ras_core->dev, + "records:%d threshold:%d, resetting RAS table header signature", + bad_page_count, + control->record_threshold_count); + res = ras_eeprom_correct_header_tag(control, RAS_TABLE_HDR_VAL); + } else { + RAS_DEV_ERR(ras_core->dev, "RAS records:%d exceed threshold:%d", + bad_page_count, control->record_threshold_count); + if ((control->record_threshold_config == WARN_NONSTOP_OVER_THRESHOLD) || + (control->record_threshold_config == NONSTOP_OVER_THRESHOLD)) { + RAS_DEV_WARN(ras_core->dev, + "Please consult AMD Service Action Guide (SAG) for appropriate service procedures\n"); + res = 0; + } else { + ras_core->is_rma = true; + RAS_DEV_ERR(ras_core->dev, + "User defined threshold is set, runtime service will be halt when threshold is reached\n"); + } + } + } + + return res < 0 ? res : 0; +} + +int ras_eeprom_hw_init(struct ras_core_context *ras_core) +{ + struct ras_eeprom_control *control; + struct ras_eeprom_config *eeprom_cfg; + + if (!ras_core) + return -EINVAL; + + ras_core->is_rma = false; + + control = &ras_core->ras_eeprom; + + memset(control, 0, sizeof(*control)); + + eeprom_cfg = &ras_core->config->eeprom_cfg; + control->record_threshold_config = + eeprom_cfg->eeprom_record_threshold_config; + + control->record_threshold_count = ras_eeprom_max_record_count(ras_core); + if (eeprom_cfg->eeprom_record_threshold_count < + control->record_threshold_count) + control->record_threshold_count = + eeprom_cfg->eeprom_record_threshold_count; + + control->sys_func = eeprom_cfg->eeprom_sys_fn; + control->max_read_len = eeprom_cfg->max_i2c_read_len; + control->max_write_len = eeprom_cfg->max_i2c_write_len; + control->i2c_adapter = eeprom_cfg->eeprom_i2c_adapter; + control->i2c_port = eeprom_cfg->eeprom_i2c_port; + control->i2c_address = eeprom_cfg->eeprom_i2c_addr; + + control->update_channel_flag = false; + + return __check_ras_table_status(ras_core); +} + +int ras_eeprom_hw_fini(struct ras_core_context *ras_core) +{ + struct ras_eeprom_control *control; + + if (!ras_core) + return -EINVAL; + + control = &ras_core->ras_eeprom; + mutex_destroy(&control->ras_tbl_mutex); + + return 0; +} + +uint32_t ras_eeprom_get_record_count(struct ras_core_context *ras_core) +{ + if (!ras_core) + return 0; + + return ras_core->ras_eeprom.ras_num_recs; +} + +void ras_eeprom_sync_info(struct ras_core_context *ras_core) +{ + struct ras_eeprom_control *control; + + if (!ras_core) + return; + + control = &ras_core->ras_eeprom; + ras_core_event_notify(ras_core, RAS_EVENT_ID__UPDATE_BAD_PAGE_NUM, + &control->ras_num_recs); + ras_core_event_notify(ras_core, RAS_EVENT_ID__UPDATE_BAD_CHANNEL_BITMAP, + &control->bad_channel_bitmap); +} + +enum ras_gpu_health_status + ras_eeprom_check_gpu_status(struct ras_core_context *ras_core) +{ + struct ras_eeprom_control *control = &ras_core->ras_eeprom; + struct ras_eeprom_table_ras_info *rai = &control->tbl_rai; + + if (!__is_ras_eeprom_supported(ras_core) || + !control->record_threshold_config) + return RAS_GPU_HEALTH_NONE; + + if (control->tbl_hdr.header == RAS_TABLE_HDR_BAD) + return RAS_GPU_IN_BAD_STATUS; + + return rai->rma_status; +} diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_eeprom.h b/drivers/gpu/drm/amd/ras/rascore/ras_eeprom.h new file mode 100644 index 000000000000..2abe566c18b6 --- /dev/null +++ b/drivers/gpu/drm/amd/ras/rascore/ras_eeprom.h @@ -0,0 +1,197 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef __RAS_EEPROM_H__ +#define __RAS_EEPROM_H__ +#include "ras_sys.h" + +#define RAS_TABLE_VER_V1 0x00010000 +#define RAS_TABLE_VER_V2_1 0x00021000 +#define RAS_TABLE_VER_V3 0x00030000 + +#define NONSTOP_OVER_THRESHOLD -2 +#define WARN_NONSTOP_OVER_THRESHOLD -1 +#define DISABLE_RETIRE_PAGE 0 + +/* + * Bad address pfn : eeprom_umc_record.retired_row_pfn[39:0], + * nps mode: eeprom_umc_record.retired_row_pfn[47:40] + */ +#define EEPROM_RECORD_UMC_ADDR_MASK 0xFFFFFFFFFFULL +#define EEPROM_RECORD_UMC_NPS_MASK 0xFF0000000000ULL +#define EEPROM_RECORD_UMC_NPS_SHIFT 40 + +#define EEPROM_RECORD_UMC_NPS_MODE(RECORD) \ + (((RECORD)->retired_row_pfn & EEPROM_RECORD_UMC_NPS_MASK) >> \ + EEPROM_RECORD_UMC_NPS_SHIFT) + +#define EEPROM_RECORD_UMC_ADDR_PFN(RECORD) \ + ((RECORD)->retired_row_pfn & EEPROM_RECORD_UMC_ADDR_MASK) + +#define EEPROM_RECORD_SETUP_UMC_ADDR_AND_NPS(RECORD, ADDR, NPS) \ +do { \ + uint64_t tmp = (NPS); \ + tmp = ((tmp << EEPROM_RECORD_UMC_NPS_SHIFT) & EEPROM_RECORD_UMC_NPS_MASK); \ + tmp |= (ADDR) & EEPROM_RECORD_UMC_ADDR_MASK; \ + (RECORD)->retired_row_pfn = tmp; \ +} while (0) + +enum ras_gpu_health_status { + RAS_GPU_HEALTH_NONE = 0, + RAS_GPU_HEALTH_USABLE = 1, + RAS_GPU_RETIRED__ECC_REACH_THRESHOLD = 2, + RAS_GPU_IN_BAD_STATUS = 3, +}; + +enum ras_eeprom_err_type { + RAS_EEPROM_ERR_NA, + RAS_EEPROM_ERR_RECOVERABLE, + RAS_EEPROM_ERR_NON_RECOVERABLE, + RAS_EEPROM_ERR_COUNT, +}; + +struct ras_eeprom_table_header { + uint32_t header; + uint32_t version; + uint32_t first_rec_offset; + uint32_t tbl_size; + uint32_t checksum; +} __packed; + +struct ras_eeprom_table_ras_info { + u8 rma_status; + u8 health_percent; + u16 ecc_page_threshold; + u32 padding[64 - 1]; +} __packed; + +struct ras_eeprom_control { + struct ras_eeprom_table_header tbl_hdr; + struct ras_eeprom_table_ras_info tbl_rai; + + /* record threshold */ + int record_threshold_config; + uint32_t record_threshold_count; + bool update_channel_flag; + + const struct ras_eeprom_sys_func *sys_func; + void *i2c_adapter; + u32 i2c_port; + u16 max_read_len; + u16 max_write_len; + + /* Base I2C EEPPROM 19-bit memory address, + * where the table is located. For more information, + * see top of amdgpu_eeprom.c. + */ + u32 i2c_address; + + /* The byte offset off of @i2c_address + * where the table header is found, + * and where the records start--always + * right after the header. + */ + u32 ras_header_offset; + u32 ras_info_offset; + u32 ras_record_offset; + + /* Number of records in the table. + */ + u32 ras_num_recs; + + /* First record index to read, 0-based. + * Range is [0, num_recs-1]. This is + * an absolute index, starting right after + * the table header. + */ + u32 ras_fri; + + /* Maximum possible number of records + * we could store, i.e. the maximum capacity + * of the table. + */ + u32 ras_max_record_count; + + /* Protect table access via this mutex. + */ + struct mutex ras_tbl_mutex; + + /* Record channel info which occurred bad pages + */ + u32 bad_channel_bitmap; +}; + +/* + * Represents single table record. Packed to be easily serialized into byte + * stream. + */ +struct eeprom_umc_record { + + union { + uint64_t address; + uint64_t offset; + }; + + uint64_t retired_row_pfn; + uint64_t ts; + + enum ras_eeprom_err_type err_type; + + union { + unsigned char bank; + unsigned char cu; + }; + + unsigned char mem_channel; + unsigned char mcumc_id; + + /* The following variables will not be saved to eeprom. + */ + uint64_t cur_nps_retired_row_pfn; + uint32_t cur_nps_bank; + uint32_t cur_nps; +}; + +struct ras_core_context; +int ras_eeprom_hw_init(struct ras_core_context *ras_core); +int ras_eeprom_hw_fini(struct ras_core_context *ras_core); + +int ras_eeprom_reset_table(struct ras_core_context *ras_core); + +bool ras_eeprom_check_safety_watermark(struct ras_core_context *ras_core); + +int ras_eeprom_read(struct ras_core_context *ras_core, + struct eeprom_umc_record *records, const u32 num); + +int ras_eeprom_append(struct ras_core_context *ras_core, + struct eeprom_umc_record *records, const u32 num); + +uint32_t ras_eeprom_max_record_count(struct ras_core_context *ras_core); +uint32_t ras_eeprom_get_record_count(struct ras_core_context *ras_core); +void ras_eeprom_sync_info(struct ras_core_context *ras_core); + +int ras_eeprom_check_storage_status(struct ras_core_context *ras_core); +enum ras_gpu_health_status + ras_eeprom_check_gpu_status(struct ras_core_context *ras_core); +#endif diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_gfx.c b/drivers/gpu/drm/amd/ras/rascore/ras_gfx.c new file mode 100644 index 000000000000..f5ce28777705 --- /dev/null +++ b/drivers/gpu/drm/amd/ras/rascore/ras_gfx.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include "ras.h" +#include "ras_gfx_v9_0.h" +#include "ras_gfx.h" +#include "ras_core_status.h" + +static const struct ras_gfx_ip_func *ras_gfx_get_ip_funcs( + struct ras_core_context *ras_core, uint32_t ip_version) +{ + switch (ip_version) { + case IP_VERSION(9, 4, 3): + case IP_VERSION(9, 4, 4): + case IP_VERSION(9, 5, 0): + return &gfx_ras_func_v9_0; + default: + RAS_DEV_ERR(ras_core->dev, + "GFX ip version(0x%x) is not supported!\n", ip_version); + break; + } + + return NULL; +} + +int ras_gfx_get_ta_subblock(struct ras_core_context *ras_core, + uint32_t error_type, uint32_t subblock, uint32_t *ta_subblock) +{ + struct ras_gfx *gfx = &ras_core->ras_gfx; + + return gfx->ip_func->get_ta_subblock(ras_core, + error_type, subblock, ta_subblock); +} + +int ras_gfx_hw_init(struct ras_core_context *ras_core) +{ + struct ras_gfx *gfx = &ras_core->ras_gfx; + + gfx->gfx_ip_version = ras_core->config->gfx_ip_version; + + gfx->ip_func = ras_gfx_get_ip_funcs(ras_core, gfx->gfx_ip_version); + + return gfx->ip_func ? RAS_CORE_OK : -EINVAL; +} + +int ras_gfx_hw_fini(struct ras_core_context *ras_core) +{ + return 0; +} diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_gfx.h b/drivers/gpu/drm/amd/ras/rascore/ras_gfx.h new file mode 100644 index 000000000000..8a42d69fb0ad --- /dev/null +++ b/drivers/gpu/drm/amd/ras/rascore/ras_gfx.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ +#ifndef __RAS_GFX_H__ +#define __RAS_GFX_H__ + +struct ras_gfx_ip_func { + int (*get_ta_subblock)(struct ras_core_context *ras_core, + uint32_t error_type, uint32_t subblock, uint32_t *ta_subblock); +}; + +struct ras_gfx { + uint32_t gfx_ip_version; + const struct ras_gfx_ip_func *ip_func; +}; + +int ras_gfx_hw_init(struct ras_core_context *ras_core); +int ras_gfx_hw_fini(struct ras_core_context *ras_core); + +int ras_gfx_get_ta_subblock(struct ras_core_context *ras_core, + uint32_t error_type, uint32_t subblock, uint32_t *ta_subblock); + +#endif diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_gfx_v9_0.c b/drivers/gpu/drm/amd/ras/rascore/ras_gfx_v9_0.c new file mode 100644 index 000000000000..6213d3f125be --- /dev/null +++ b/drivers/gpu/drm/amd/ras/rascore/ras_gfx_v9_0.c @@ -0,0 +1,426 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ +#include "ras.h" +#include "ras_gfx_v9_0.h" +#include "ras_core_status.h" + +enum ta_gfx_v9_subblock { + /*CPC*/ + TA_GFX_V9__GFX_CPC_INDEX_START = 0, + TA_GFX_V9__GFX_CPC_SCRATCH = TA_GFX_V9__GFX_CPC_INDEX_START, + TA_GFX_V9__GFX_CPC_UCODE, + TA_GFX_V9__GFX_DC_STATE_ME1, + TA_GFX_V9__GFX_DC_CSINVOC_ME1, + TA_GFX_V9__GFX_DC_RESTORE_ME1, + TA_GFX_V9__GFX_DC_STATE_ME2, + TA_GFX_V9__GFX_DC_CSINVOC_ME2, + TA_GFX_V9__GFX_DC_RESTORE_ME2, + TA_GFX_V9__GFX_CPC_INDEX_END = TA_GFX_V9__GFX_DC_RESTORE_ME2, + /* CPF*/ + TA_GFX_V9__GFX_CPF_INDEX_START, + TA_GFX_V9__GFX_CPF_ROQ_ME2 = TA_GFX_V9__GFX_CPF_INDEX_START, + TA_GFX_V9__GFX_CPF_ROQ_ME1, + TA_GFX_V9__GFX_CPF_TAG, + TA_GFX_V9__GFX_CPF_INDEX_END = TA_GFX_V9__GFX_CPF_TAG, + /* CPG*/ + TA_GFX_V9__GFX_CPG_INDEX_START, + TA_GFX_V9__GFX_CPG_DMA_ROQ = TA_GFX_V9__GFX_CPG_INDEX_START, + TA_GFX_V9__GFX_CPG_DMA_TAG, + TA_GFX_V9__GFX_CPG_TAG, + TA_GFX_V9__GFX_CPG_INDEX_END = TA_GFX_V9__GFX_CPG_TAG, + /* GDS*/ + TA_GFX_V9__GFX_GDS_INDEX_START, + TA_GFX_V9__GFX_GDS_MEM = TA_GFX_V9__GFX_GDS_INDEX_START, + TA_GFX_V9__GFX_GDS_INPUT_QUEUE, + TA_GFX_V9__GFX_GDS_OA_PHY_CMD_RAM_MEM, + TA_GFX_V9__GFX_GDS_OA_PHY_DATA_RAM_MEM, + TA_GFX_V9__GFX_GDS_OA_PIPE_MEM, + TA_GFX_V9__GFX_GDS_INDEX_END = TA_GFX_V9__GFX_GDS_OA_PIPE_MEM, + /* SPI*/ + TA_GFX_V9__GFX_SPI_SR_MEM, + /* SQ*/ + TA_GFX_V9__GFX_SQ_INDEX_START, + TA_GFX_V9__GFX_SQ_SGPR = TA_GFX_V9__GFX_SQ_INDEX_START, + TA_GFX_V9__GFX_SQ_LDS_D, + TA_GFX_V9__GFX_SQ_LDS_I, + TA_GFX_V9__GFX_SQ_VGPR, /* VGPR = SP*/ + TA_GFX_V9__GFX_SQ_INDEX_END = TA_GFX_V9__GFX_SQ_VGPR, + /* SQC (3 ranges)*/ + TA_GFX_V9__GFX_SQC_INDEX_START, + /* SQC range 0*/ + TA_GFX_V9__GFX_SQC_INDEX0_START = TA_GFX_V9__GFX_SQC_INDEX_START, + TA_GFX_V9__GFX_SQC_INST_UTCL1_LFIFO = + TA_GFX_V9__GFX_SQC_INDEX0_START, + TA_GFX_V9__GFX_SQC_DATA_CU0_WRITE_DATA_BUF, + TA_GFX_V9__GFX_SQC_DATA_CU0_UTCL1_LFIFO, + TA_GFX_V9__GFX_SQC_DATA_CU1_WRITE_DATA_BUF, + TA_GFX_V9__GFX_SQC_DATA_CU1_UTCL1_LFIFO, + TA_GFX_V9__GFX_SQC_DATA_CU2_WRITE_DATA_BUF, + TA_GFX_V9__GFX_SQC_DATA_CU2_UTCL1_LFIFO, + TA_GFX_V9__GFX_SQC_INDEX0_END = + TA_GFX_V9__GFX_SQC_DATA_CU2_UTCL1_LFIFO, + /* SQC range 1*/ + TA_GFX_V9__GFX_SQC_INDEX1_START, + TA_GFX_V9__GFX_SQC_INST_BANKA_TAG_RAM = + TA_GFX_V9__GFX_SQC_INDEX1_START, + TA_GFX_V9__GFX_SQC_INST_BANKA_UTCL1_MISS_FIFO, + TA_GFX_V9__GFX_SQC_INST_BANKA_MISS_FIFO, + TA_GFX_V9__GFX_SQC_INST_BANKA_BANK_RAM, + TA_GFX_V9__GFX_SQC_DATA_BANKA_TAG_RAM, + TA_GFX_V9__GFX_SQC_DATA_BANKA_HIT_FIFO, + TA_GFX_V9__GFX_SQC_DATA_BANKA_MISS_FIFO, + TA_GFX_V9__GFX_SQC_DATA_BANKA_DIRTY_BIT_RAM, + TA_GFX_V9__GFX_SQC_DATA_BANKA_BANK_RAM, + TA_GFX_V9__GFX_SQC_INDEX1_END = + TA_GFX_V9__GFX_SQC_DATA_BANKA_BANK_RAM, + /* SQC range 2*/ + TA_GFX_V9__GFX_SQC_INDEX2_START, + TA_GFX_V9__GFX_SQC_INST_BANKB_TAG_RAM = + TA_GFX_V9__GFX_SQC_INDEX2_START, + TA_GFX_V9__GFX_SQC_INST_BANKB_UTCL1_MISS_FIFO, + TA_GFX_V9__GFX_SQC_INST_BANKB_MISS_FIFO, + TA_GFX_V9__GFX_SQC_INST_BANKB_BANK_RAM, + TA_GFX_V9__GFX_SQC_DATA_BANKB_TAG_RAM, + TA_GFX_V9__GFX_SQC_DATA_BANKB_HIT_FIFO, + TA_GFX_V9__GFX_SQC_DATA_BANKB_MISS_FIFO, + TA_GFX_V9__GFX_SQC_DATA_BANKB_DIRTY_BIT_RAM, + TA_GFX_V9__GFX_SQC_DATA_BANKB_BANK_RAM, + TA_GFX_V9__GFX_SQC_INDEX2_END = + TA_GFX_V9__GFX_SQC_DATA_BANKB_BANK_RAM, + TA_GFX_V9__GFX_SQC_INDEX_END = TA_GFX_V9__GFX_SQC_INDEX2_END, + /* TA*/ + TA_GFX_V9__GFX_TA_INDEX_START, + TA_GFX_V9__GFX_TA_FS_DFIFO = TA_GFX_V9__GFX_TA_INDEX_START, + TA_GFX_V9__GFX_TA_FS_AFIFO, + TA_GFX_V9__GFX_TA_FL_LFIFO, + TA_GFX_V9__GFX_TA_FX_LFIFO, + TA_GFX_V9__GFX_TA_FS_CFIFO, + TA_GFX_V9__GFX_TA_INDEX_END = TA_GFX_V9__GFX_TA_FS_CFIFO, + /* TCA*/ + TA_GFX_V9__GFX_TCA_INDEX_START, + TA_GFX_V9__GFX_TCA_HOLE_FIFO = TA_GFX_V9__GFX_TCA_INDEX_START, + TA_GFX_V9__GFX_TCA_REQ_FIFO, + TA_GFX_V9__GFX_TCA_INDEX_END = TA_GFX_V9__GFX_TCA_REQ_FIFO, + /* TCC (5 sub-ranges)*/ + TA_GFX_V9__GFX_TCC_INDEX_START, + /* TCC range 0*/ + TA_GFX_V9__GFX_TCC_INDEX0_START = TA_GFX_V9__GFX_TCC_INDEX_START, + TA_GFX_V9__GFX_TCC_CACHE_DATA = TA_GFX_V9__GFX_TCC_INDEX0_START, + TA_GFX_V9__GFX_TCC_CACHE_DATA_BANK_0_1, + TA_GFX_V9__GFX_TCC_CACHE_DATA_BANK_1_0, + TA_GFX_V9__GFX_TCC_CACHE_DATA_BANK_1_1, + TA_GFX_V9__GFX_TCC_CACHE_DIRTY_BANK_0, + TA_GFX_V9__GFX_TCC_CACHE_DIRTY_BANK_1, + TA_GFX_V9__GFX_TCC_HIGH_RATE_TAG, + TA_GFX_V9__GFX_TCC_LOW_RATE_TAG, + TA_GFX_V9__GFX_TCC_INDEX0_END = TA_GFX_V9__GFX_TCC_LOW_RATE_TAG, + /* TCC range 1*/ + TA_GFX_V9__GFX_TCC_INDEX1_START, + TA_GFX_V9__GFX_TCC_IN_USE_DEC = TA_GFX_V9__GFX_TCC_INDEX1_START, + TA_GFX_V9__GFX_TCC_IN_USE_TRANSFER, + TA_GFX_V9__GFX_TCC_INDEX1_END = + TA_GFX_V9__GFX_TCC_IN_USE_TRANSFER, + /* TCC range 2*/ + TA_GFX_V9__GFX_TCC_INDEX2_START, + TA_GFX_V9__GFX_TCC_RETURN_DATA = TA_GFX_V9__GFX_TCC_INDEX2_START, + TA_GFX_V9__GFX_TCC_RETURN_CONTROL, + TA_GFX_V9__GFX_TCC_UC_ATOMIC_FIFO, + TA_GFX_V9__GFX_TCC_WRITE_RETURN, + TA_GFX_V9__GFX_TCC_WRITE_CACHE_READ, + TA_GFX_V9__GFX_TCC_SRC_FIFO, + TA_GFX_V9__GFX_TCC_SRC_FIFO_NEXT_RAM, + TA_GFX_V9__GFX_TCC_CACHE_TAG_PROBE_FIFO, + TA_GFX_V9__GFX_TCC_INDEX2_END = + TA_GFX_V9__GFX_TCC_CACHE_TAG_PROBE_FIFO, + /* TCC range 3*/ + TA_GFX_V9__GFX_TCC_INDEX3_START, + TA_GFX_V9__GFX_TCC_LATENCY_FIFO = TA_GFX_V9__GFX_TCC_INDEX3_START, + TA_GFX_V9__GFX_TCC_LATENCY_FIFO_NEXT_RAM, + TA_GFX_V9__GFX_TCC_INDEX3_END = + TA_GFX_V9__GFX_TCC_LATENCY_FIFO_NEXT_RAM, + /* TCC range 4*/ + TA_GFX_V9__GFX_TCC_INDEX4_START, + TA_GFX_V9__GFX_TCC_WRRET_TAG_WRITE_RETURN = + TA_GFX_V9__GFX_TCC_INDEX4_START, + TA_GFX_V9__GFX_TCC_ATOMIC_RETURN_BUFFER, + TA_GFX_V9__GFX_TCC_INDEX4_END = + TA_GFX_V9__GFX_TCC_ATOMIC_RETURN_BUFFER, + TA_GFX_V9__GFX_TCC_INDEX_END = TA_GFX_V9__GFX_TCC_INDEX4_END, + /* TCI*/ + TA_GFX_V9__GFX_TCI_WRITE_RAM, + /* TCP*/ + TA_GFX_V9__GFX_TCP_INDEX_START, + TA_GFX_V9__GFX_TCP_CACHE_RAM = TA_GFX_V9__GFX_TCP_INDEX_START, + TA_GFX_V9__GFX_TCP_LFIFO_RAM, + TA_GFX_V9__GFX_TCP_CMD_FIFO, + TA_GFX_V9__GFX_TCP_VM_FIFO, + TA_GFX_V9__GFX_TCP_DB_RAM, + TA_GFX_V9__GFX_TCP_UTCL1_LFIFO0, + TA_GFX_V9__GFX_TCP_UTCL1_LFIFO1, + TA_GFX_V9__GFX_TCP_INDEX_END = TA_GFX_V9__GFX_TCP_UTCL1_LFIFO1, + /* TD*/ + TA_GFX_V9__GFX_TD_INDEX_START, + TA_GFX_V9__GFX_TD_SS_FIFO_LO = TA_GFX_V9__GFX_TD_INDEX_START, + TA_GFX_V9__GFX_TD_SS_FIFO_HI, + TA_GFX_V9__GFX_TD_CS_FIFO, + TA_GFX_V9__GFX_TD_INDEX_END = TA_GFX_V9__GFX_TD_CS_FIFO, + /* EA (3 sub-ranges)*/ + TA_GFX_V9__GFX_EA_INDEX_START, + /* EA range 0*/ + TA_GFX_V9__GFX_EA_INDEX0_START = TA_GFX_V9__GFX_EA_INDEX_START, + TA_GFX_V9__GFX_EA_DRAMRD_CMDMEM = TA_GFX_V9__GFX_EA_INDEX0_START, + TA_GFX_V9__GFX_EA_DRAMWR_CMDMEM, + TA_GFX_V9__GFX_EA_DRAMWR_DATAMEM, + TA_GFX_V9__GFX_EA_RRET_TAGMEM, + TA_GFX_V9__GFX_EA_WRET_TAGMEM, + TA_GFX_V9__GFX_EA_GMIRD_CMDMEM, + TA_GFX_V9__GFX_EA_GMIWR_CMDMEM, + TA_GFX_V9__GFX_EA_GMIWR_DATAMEM, + TA_GFX_V9__GFX_EA_INDEX0_END = TA_GFX_V9__GFX_EA_GMIWR_DATAMEM, + /* EA range 1*/ + TA_GFX_V9__GFX_EA_INDEX1_START, + TA_GFX_V9__GFX_EA_DRAMRD_PAGEMEM = TA_GFX_V9__GFX_EA_INDEX1_START, + TA_GFX_V9__GFX_EA_DRAMWR_PAGEMEM, + TA_GFX_V9__GFX_EA_IORD_CMDMEM, + TA_GFX_V9__GFX_EA_IOWR_CMDMEM, + TA_GFX_V9__GFX_EA_IOWR_DATAMEM, + TA_GFX_V9__GFX_EA_GMIRD_PAGEMEM, + TA_GFX_V9__GFX_EA_GMIWR_PAGEMEM, + TA_GFX_V9__GFX_EA_INDEX1_END = TA_GFX_V9__GFX_EA_GMIWR_PAGEMEM, + /* EA range 2*/ + TA_GFX_V9__GFX_EA_INDEX2_START, + TA_GFX_V9__GFX_EA_MAM_D0MEM = TA_GFX_V9__GFX_EA_INDEX2_START, + TA_GFX_V9__GFX_EA_MAM_D1MEM, + TA_GFX_V9__GFX_EA_MAM_D2MEM, + TA_GFX_V9__GFX_EA_MAM_D3MEM, + TA_GFX_V9__GFX_EA_INDEX2_END = TA_GFX_V9__GFX_EA_MAM_D3MEM, + TA_GFX_V9__GFX_EA_INDEX_END = TA_GFX_V9__GFX_EA_INDEX2_END, + /* UTC VM L2 bank*/ + TA_GFX_V9__UTC_VML2_BANK_CACHE, + /* UTC VM walker*/ + TA_GFX_V9__UTC_VML2_WALKER, + /* UTC ATC L2 2MB cache*/ + TA_GFX_V9__UTC_ATCL2_CACHE_2M_BANK, + /* UTC ATC L2 4KB cache*/ + TA_GFX_V9__UTC_ATCL2_CACHE_4K_BANK, + TA_GFX_V9__GFX_MAX +}; + +struct ras_gfx_subblock_t { + unsigned char *name; + int ta_subblock; + int hw_supported_error_type; + int sw_supported_error_type; +}; + +#define RAS_GFX_SUB_BLOCK(subblock, a, b, c, d, e, f, g, h) \ + [RAS_GFX_V9__##subblock] = { \ + #subblock, \ + TA_GFX_V9__##subblock, \ + ((a) | ((b) << 1) | ((c) << 2) | ((d) << 3)), \ + (((e) << 1) | ((f) << 3) | (g) | ((h) << 2)), \ + } + +const struct ras_gfx_subblock_t ras_gfx_v9_0_subblocks[] = { + RAS_GFX_SUB_BLOCK(GFX_CPC_SCRATCH, 0, 1, 1, 1, 1, 0, 0, 1), + RAS_GFX_SUB_BLOCK(GFX_CPC_UCODE, 0, 1, 1, 1, 1, 0, 0, 1), + RAS_GFX_SUB_BLOCK(GFX_DC_STATE_ME1, 1, 0, 0, 1, 0, 0, 1, 0), + RAS_GFX_SUB_BLOCK(GFX_DC_CSINVOC_ME1, 1, 0, 0, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_DC_RESTORE_ME1, 1, 0, 0, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_DC_STATE_ME2, 1, 0, 0, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_DC_CSINVOC_ME2, 1, 0, 0, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_DC_RESTORE_ME2, 1, 0, 0, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_CPF_ROQ_ME2, 1, 0, 0, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_CPF_ROQ_ME1, 1, 0, 0, 1, 0, 0, 1, 0), + RAS_GFX_SUB_BLOCK(GFX_CPF_TAG, 0, 1, 1, 1, 1, 0, 0, 1), + RAS_GFX_SUB_BLOCK(GFX_CPG_DMA_ROQ, 1, 0, 0, 1, 0, 0, 1, 0), + RAS_GFX_SUB_BLOCK(GFX_CPG_DMA_TAG, 0, 1, 1, 1, 0, 1, 0, 1), + RAS_GFX_SUB_BLOCK(GFX_CPG_TAG, 0, 1, 1, 1, 1, 1, 0, 1), + RAS_GFX_SUB_BLOCK(GFX_GDS_MEM, 0, 1, 1, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_GDS_INPUT_QUEUE, 1, 0, 0, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_GDS_OA_PHY_CMD_RAM_MEM, 0, 1, 1, 1, 0, 0, 0, + 0), + RAS_GFX_SUB_BLOCK(GFX_GDS_OA_PHY_DATA_RAM_MEM, 1, 0, 0, 1, 0, 0, 0, + 0), + RAS_GFX_SUB_BLOCK(GFX_GDS_OA_PIPE_MEM, 0, 1, 1, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_SPI_SR_MEM, 1, 0, 0, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_SQ_SGPR, 0, 1, 1, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_SQ_LDS_D, 0, 1, 1, 1, 1, 0, 0, 1), + RAS_GFX_SUB_BLOCK(GFX_SQ_LDS_I, 0, 1, 1, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_SQ_VGPR, 0, 1, 1, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_SQC_INST_UTCL1_LFIFO, 0, 1, 1, 1, 0, 0, 0, 1), + RAS_GFX_SUB_BLOCK(GFX_SQC_DATA_CU0_WRITE_DATA_BUF, 0, 1, 1, 1, 0, 0, + 0, 0), + RAS_GFX_SUB_BLOCK(GFX_SQC_DATA_CU0_UTCL1_LFIFO, 0, 1, 1, 1, 0, 0, 0, + 0), + RAS_GFX_SUB_BLOCK(GFX_SQC_DATA_CU1_WRITE_DATA_BUF, 0, 1, 1, 1, 0, 0, + 0, 0), + RAS_GFX_SUB_BLOCK(GFX_SQC_DATA_CU1_UTCL1_LFIFO, 0, 1, 1, 1, 1, 0, 0, + 0), + RAS_GFX_SUB_BLOCK(GFX_SQC_DATA_CU2_WRITE_DATA_BUF, 0, 1, 1, 1, 0, 0, + 0, 0), + RAS_GFX_SUB_BLOCK(GFX_SQC_DATA_CU2_UTCL1_LFIFO, 0, 1, 1, 1, 0, 0, 0, + 0), + RAS_GFX_SUB_BLOCK(GFX_SQC_INST_BANKA_TAG_RAM, 0, 1, 1, 1, 1, 0, 0, + 1), + RAS_GFX_SUB_BLOCK(GFX_SQC_INST_BANKA_UTCL1_MISS_FIFO, 1, 0, 0, 1, 0, + 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_SQC_INST_BANKA_MISS_FIFO, 1, 0, 0, 1, 0, 0, 0, + 0), + RAS_GFX_SUB_BLOCK(GFX_SQC_INST_BANKA_BANK_RAM, 0, 1, 1, 1, 0, 0, 0, + 0), + RAS_GFX_SUB_BLOCK(GFX_SQC_DATA_BANKA_TAG_RAM, 0, 1, 1, 1, 0, 0, 0, + 0), + RAS_GFX_SUB_BLOCK(GFX_SQC_DATA_BANKA_HIT_FIFO, 1, 0, 0, 1, 0, 0, 0, + 0), + RAS_GFX_SUB_BLOCK(GFX_SQC_DATA_BANKA_MISS_FIFO, 1, 0, 0, 1, 0, 0, 0, + 0), + RAS_GFX_SUB_BLOCK(GFX_SQC_DATA_BANKA_DIRTY_BIT_RAM, 1, 0, 0, 1, 0, 0, + 0, 0), + RAS_GFX_SUB_BLOCK(GFX_SQC_DATA_BANKA_BANK_RAM, 0, 1, 1, 1, 0, 0, 0, + 0), + RAS_GFX_SUB_BLOCK(GFX_SQC_INST_BANKB_TAG_RAM, 0, 1, 1, 1, 1, 0, 0, + 0), + RAS_GFX_SUB_BLOCK(GFX_SQC_INST_BANKB_UTCL1_MISS_FIFO, 1, 0, 0, 1, 0, + 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_SQC_INST_BANKB_MISS_FIFO, 1, 0, 0, 1, 0, 0, 0, + 0), + RAS_GFX_SUB_BLOCK(GFX_SQC_INST_BANKB_BANK_RAM, 0, 1, 1, 1, 0, 0, 0, + 0), + RAS_GFX_SUB_BLOCK(GFX_SQC_DATA_BANKB_TAG_RAM, 0, 1, 1, 1, 0, 0, 0, + 0), + RAS_GFX_SUB_BLOCK(GFX_SQC_DATA_BANKB_HIT_FIFO, 1, 0, 0, 1, 0, 0, 0, + 0), + RAS_GFX_SUB_BLOCK(GFX_SQC_DATA_BANKB_MISS_FIFO, 1, 0, 0, 1, 0, 0, 0, + 0), + RAS_GFX_SUB_BLOCK(GFX_SQC_DATA_BANKB_DIRTY_BIT_RAM, 1, 0, 0, 1, 0, 0, + 0, 0), + RAS_GFX_SUB_BLOCK(GFX_SQC_DATA_BANKB_BANK_RAM, 0, 1, 1, 1, 0, 0, 0, + 0), + RAS_GFX_SUB_BLOCK(GFX_TA_FS_DFIFO, 0, 1, 1, 1, 1, 0, 0, 1), + RAS_GFX_SUB_BLOCK(GFX_TA_FS_AFIFO, 1, 0, 0, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_TA_FL_LFIFO, 1, 0, 0, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_TA_FX_LFIFO, 1, 0, 0, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_TA_FS_CFIFO, 1, 0, 0, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_TCA_HOLE_FIFO, 1, 0, 0, 1, 0, 1, 1, 0), + RAS_GFX_SUB_BLOCK(GFX_TCA_REQ_FIFO, 1, 0, 0, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_TCC_CACHE_DATA, 0, 1, 1, 1, 1, 0, 0, 1), + RAS_GFX_SUB_BLOCK(GFX_TCC_CACHE_DATA_BANK_0_1, 0, 1, 1, 1, 1, 0, 0, + 1), + RAS_GFX_SUB_BLOCK(GFX_TCC_CACHE_DATA_BANK_1_0, 0, 1, 1, 1, 1, 0, 0, + 1), + RAS_GFX_SUB_BLOCK(GFX_TCC_CACHE_DATA_BANK_1_1, 0, 1, 1, 1, 1, 0, 0, + 1), + RAS_GFX_SUB_BLOCK(GFX_TCC_CACHE_DIRTY_BANK_0, 0, 1, 1, 1, 0, 0, 0, + 0), + RAS_GFX_SUB_BLOCK(GFX_TCC_CACHE_DIRTY_BANK_1, 0, 1, 1, 1, 0, 0, 0, + 0), + RAS_GFX_SUB_BLOCK(GFX_TCC_HIGH_RATE_TAG, 0, 1, 1, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_TCC_LOW_RATE_TAG, 0, 1, 1, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_TCC_IN_USE_DEC, 1, 0, 0, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_TCC_IN_USE_TRANSFER, 1, 0, 0, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_TCC_RETURN_DATA, 1, 0, 0, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_TCC_RETURN_CONTROL, 1, 0, 0, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_TCC_UC_ATOMIC_FIFO, 1, 0, 0, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_TCC_WRITE_RETURN, 1, 0, 0, 1, 0, 1, 1, 0), + RAS_GFX_SUB_BLOCK(GFX_TCC_WRITE_CACHE_READ, 1, 0, 0, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_TCC_SRC_FIFO, 0, 1, 1, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_TCC_SRC_FIFO_NEXT_RAM, 1, 0, 0, 1, 0, 0, 1, 0), + RAS_GFX_SUB_BLOCK(GFX_TCC_CACHE_TAG_PROBE_FIFO, 1, 0, 0, 1, 0, 0, 0, + 0), + RAS_GFX_SUB_BLOCK(GFX_TCC_LATENCY_FIFO, 1, 0, 0, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_TCC_LATENCY_FIFO_NEXT_RAM, 1, 0, 0, 1, 0, 0, 0, + 0), + RAS_GFX_SUB_BLOCK(GFX_TCC_WRRET_TAG_WRITE_RETURN, 1, 0, 0, 1, 0, 0, + 0, 0), + RAS_GFX_SUB_BLOCK(GFX_TCC_ATOMIC_RETURN_BUFFER, 1, 0, 0, 1, 0, 0, 0, + 0), + RAS_GFX_SUB_BLOCK(GFX_TCI_WRITE_RAM, 1, 0, 0, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_TCP_CACHE_RAM, 0, 1, 1, 1, 1, 0, 0, 1), + RAS_GFX_SUB_BLOCK(GFX_TCP_LFIFO_RAM, 0, 1, 1, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_TCP_CMD_FIFO, 1, 0, 0, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_TCP_VM_FIFO, 0, 1, 1, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_TCP_DB_RAM, 1, 0, 0, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_TCP_UTCL1_LFIFO0, 0, 1, 1, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_TCP_UTCL1_LFIFO1, 0, 1, 1, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_TD_SS_FIFO_LO, 0, 1, 1, 1, 1, 0, 0, 1), + RAS_GFX_SUB_BLOCK(GFX_TD_SS_FIFO_HI, 0, 1, 1, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_TD_CS_FIFO, 1, 0, 0, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_EA_DRAMRD_CMDMEM, 0, 1, 1, 1, 1, 0, 0, 1), + RAS_GFX_SUB_BLOCK(GFX_EA_DRAMWR_CMDMEM, 0, 1, 1, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_EA_DRAMWR_DATAMEM, 0, 1, 1, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_EA_RRET_TAGMEM, 0, 1, 1, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_EA_WRET_TAGMEM, 0, 1, 1, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_EA_GMIRD_CMDMEM, 0, 1, 1, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_EA_GMIWR_CMDMEM, 0, 1, 1, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_EA_GMIWR_DATAMEM, 0, 1, 1, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_EA_DRAMRD_PAGEMEM, 1, 0, 0, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_EA_DRAMWR_PAGEMEM, 1, 0, 0, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_EA_IORD_CMDMEM, 1, 0, 0, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_EA_IOWR_CMDMEM, 1, 0, 0, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_EA_IOWR_DATAMEM, 1, 0, 0, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_EA_GMIRD_PAGEMEM, 1, 0, 0, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_EA_GMIWR_PAGEMEM, 1, 0, 0, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_EA_MAM_D0MEM, 1, 0, 0, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_EA_MAM_D1MEM, 1, 0, 0, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_EA_MAM_D2MEM, 1, 0, 0, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(GFX_EA_MAM_D3MEM, 1, 0, 0, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(UTC_VML2_BANK_CACHE, 0, 1, 1, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(UTC_VML2_WALKER, 0, 1, 1, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(UTC_ATCL2_CACHE_2M_BANK, 1, 0, 0, 1, 0, 0, 0, 0), + RAS_GFX_SUB_BLOCK(UTC_ATCL2_CACHE_4K_BANK, 0, 1, 1, 1, 0, 0, 0, 0), +}; + +static int gfx_v9_0_get_ta_subblock(struct ras_core_context *ras_core, + uint32_t error_type, uint32_t subblock, uint32_t *ta_subblock) +{ + const struct ras_gfx_subblock_t *gfx_subblock; + + if (subblock >= ARRAY_SIZE(ras_gfx_v9_0_subblocks)) + return -EINVAL; + + gfx_subblock = &ras_gfx_v9_0_subblocks[subblock]; + if (!gfx_subblock->name) + return -EPERM; + + if (!(gfx_subblock->hw_supported_error_type & error_type)) { + RAS_DEV_ERR(ras_core->dev, "GFX Subblock %s, hardware do not support type 0x%x\n", + gfx_subblock->name, error_type); + return -EPERM; + } + + if (!(gfx_subblock->sw_supported_error_type & error_type)) { + RAS_DEV_ERR(ras_core->dev, "GFX Subblock %s, driver do not support type 0x%x\n", + gfx_subblock->name, error_type); + return -EPERM; + } + + *ta_subblock = gfx_subblock->ta_subblock; + + return 0; +} + +const struct ras_gfx_ip_func gfx_ras_func_v9_0 = { + .get_ta_subblock = gfx_v9_0_get_ta_subblock, +}; diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_gfx_v9_0.h b/drivers/gpu/drm/amd/ras/rascore/ras_gfx_v9_0.h new file mode 100644 index 000000000000..659b56619747 --- /dev/null +++ b/drivers/gpu/drm/amd/ras/rascore/ras_gfx_v9_0.h @@ -0,0 +1,259 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ +#ifndef __RAS_GFX_V9_0_H__ +#define __RAS_GFX_V9_0_H__ + +enum ras_gfx_v9_subblock { + /* CPC */ + RAS_GFX_V9__GFX_CPC_INDEX_START = 0, + RAS_GFX_V9__GFX_CPC_SCRATCH = + RAS_GFX_V9__GFX_CPC_INDEX_START, + RAS_GFX_V9__GFX_CPC_UCODE, + RAS_GFX_V9__GFX_DC_STATE_ME1, + RAS_GFX_V9__GFX_DC_CSINVOC_ME1, + RAS_GFX_V9__GFX_DC_RESTORE_ME1, + RAS_GFX_V9__GFX_DC_STATE_ME2, + RAS_GFX_V9__GFX_DC_CSINVOC_ME2, + RAS_GFX_V9__GFX_DC_RESTORE_ME2, + RAS_GFX_V9__GFX_CPC_INDEX_END = + RAS_GFX_V9__GFX_DC_RESTORE_ME2, + /* CPF */ + RAS_GFX_V9__GFX_CPF_INDEX_START, + RAS_GFX_V9__GFX_CPF_ROQ_ME2 = + RAS_GFX_V9__GFX_CPF_INDEX_START, + RAS_GFX_V9__GFX_CPF_ROQ_ME1, + RAS_GFX_V9__GFX_CPF_TAG, + RAS_GFX_V9__GFX_CPF_INDEX_END = RAS_GFX_V9__GFX_CPF_TAG, + /* CPG */ + RAS_GFX_V9__GFX_CPG_INDEX_START, + RAS_GFX_V9__GFX_CPG_DMA_ROQ = + RAS_GFX_V9__GFX_CPG_INDEX_START, + RAS_GFX_V9__GFX_CPG_DMA_TAG, + RAS_GFX_V9__GFX_CPG_TAG, + RAS_GFX_V9__GFX_CPG_INDEX_END = RAS_GFX_V9__GFX_CPG_TAG, + /* GDS */ + RAS_GFX_V9__GFX_GDS_INDEX_START, + RAS_GFX_V9__GFX_GDS_MEM = RAS_GFX_V9__GFX_GDS_INDEX_START, + RAS_GFX_V9__GFX_GDS_INPUT_QUEUE, + RAS_GFX_V9__GFX_GDS_OA_PHY_CMD_RAM_MEM, + RAS_GFX_V9__GFX_GDS_OA_PHY_DATA_RAM_MEM, + RAS_GFX_V9__GFX_GDS_OA_PIPE_MEM, + RAS_GFX_V9__GFX_GDS_INDEX_END = + RAS_GFX_V9__GFX_GDS_OA_PIPE_MEM, + /* SPI */ + RAS_GFX_V9__GFX_SPI_SR_MEM, + /* SQ */ + RAS_GFX_V9__GFX_SQ_INDEX_START, + RAS_GFX_V9__GFX_SQ_SGPR = RAS_GFX_V9__GFX_SQ_INDEX_START, + RAS_GFX_V9__GFX_SQ_LDS_D, + RAS_GFX_V9__GFX_SQ_LDS_I, + RAS_GFX_V9__GFX_SQ_VGPR, + RAS_GFX_V9__GFX_SQ_INDEX_END = RAS_GFX_V9__GFX_SQ_VGPR, + /* SQC (3 ranges) */ + RAS_GFX_V9__GFX_SQC_INDEX_START, + /* SQC range 0 */ + RAS_GFX_V9__GFX_SQC_INDEX0_START = + RAS_GFX_V9__GFX_SQC_INDEX_START, + RAS_GFX_V9__GFX_SQC_INST_UTCL1_LFIFO = + RAS_GFX_V9__GFX_SQC_INDEX0_START, + RAS_GFX_V9__GFX_SQC_DATA_CU0_WRITE_DATA_BUF, + RAS_GFX_V9__GFX_SQC_DATA_CU0_UTCL1_LFIFO, + RAS_GFX_V9__GFX_SQC_DATA_CU1_WRITE_DATA_BUF, + RAS_GFX_V9__GFX_SQC_DATA_CU1_UTCL1_LFIFO, + RAS_GFX_V9__GFX_SQC_DATA_CU2_WRITE_DATA_BUF, + RAS_GFX_V9__GFX_SQC_DATA_CU2_UTCL1_LFIFO, + RAS_GFX_V9__GFX_SQC_INDEX0_END = + RAS_GFX_V9__GFX_SQC_DATA_CU2_UTCL1_LFIFO, + /* SQC range 1 */ + RAS_GFX_V9__GFX_SQC_INDEX1_START, + RAS_GFX_V9__GFX_SQC_INST_BANKA_TAG_RAM = + RAS_GFX_V9__GFX_SQC_INDEX1_START, + RAS_GFX_V9__GFX_SQC_INST_BANKA_UTCL1_MISS_FIFO, + RAS_GFX_V9__GFX_SQC_INST_BANKA_MISS_FIFO, + RAS_GFX_V9__GFX_SQC_INST_BANKA_BANK_RAM, + RAS_GFX_V9__GFX_SQC_DATA_BANKA_TAG_RAM, + RAS_GFX_V9__GFX_SQC_DATA_BANKA_HIT_FIFO, + RAS_GFX_V9__GFX_SQC_DATA_BANKA_MISS_FIFO, + RAS_GFX_V9__GFX_SQC_DATA_BANKA_DIRTY_BIT_RAM, + RAS_GFX_V9__GFX_SQC_DATA_BANKA_BANK_RAM, + RAS_GFX_V9__GFX_SQC_INDEX1_END = + RAS_GFX_V9__GFX_SQC_DATA_BANKA_BANK_RAM, + /* SQC range 2 */ + RAS_GFX_V9__GFX_SQC_INDEX2_START, + RAS_GFX_V9__GFX_SQC_INST_BANKB_TAG_RAM = + RAS_GFX_V9__GFX_SQC_INDEX2_START, + RAS_GFX_V9__GFX_SQC_INST_BANKB_UTCL1_MISS_FIFO, + RAS_GFX_V9__GFX_SQC_INST_BANKB_MISS_FIFO, + RAS_GFX_V9__GFX_SQC_INST_BANKB_BANK_RAM, + RAS_GFX_V9__GFX_SQC_DATA_BANKB_TAG_RAM, + RAS_GFX_V9__GFX_SQC_DATA_BANKB_HIT_FIFO, + RAS_GFX_V9__GFX_SQC_DATA_BANKB_MISS_FIFO, + RAS_GFX_V9__GFX_SQC_DATA_BANKB_DIRTY_BIT_RAM, + RAS_GFX_V9__GFX_SQC_DATA_BANKB_BANK_RAM, + RAS_GFX_V9__GFX_SQC_INDEX2_END = + RAS_GFX_V9__GFX_SQC_DATA_BANKB_BANK_RAM, + RAS_GFX_V9__GFX_SQC_INDEX_END = + RAS_GFX_V9__GFX_SQC_INDEX2_END, + /* TA */ + RAS_GFX_V9__GFX_TA_INDEX_START, + RAS_GFX_V9__GFX_TA_FS_DFIFO = + RAS_GFX_V9__GFX_TA_INDEX_START, + RAS_GFX_V9__GFX_TA_FS_AFIFO, + RAS_GFX_V9__GFX_TA_FL_LFIFO, + RAS_GFX_V9__GFX_TA_FX_LFIFO, + RAS_GFX_V9__GFX_TA_FS_CFIFO, + RAS_GFX_V9__GFX_TA_INDEX_END = RAS_GFX_V9__GFX_TA_FS_CFIFO, + /* TCA */ + RAS_GFX_V9__GFX_TCA_INDEX_START, + RAS_GFX_V9__GFX_TCA_HOLE_FIFO = + RAS_GFX_V9__GFX_TCA_INDEX_START, + RAS_GFX_V9__GFX_TCA_REQ_FIFO, + RAS_GFX_V9__GFX_TCA_INDEX_END = + RAS_GFX_V9__GFX_TCA_REQ_FIFO, + /* TCC (5 sub-ranges) */ + RAS_GFX_V9__GFX_TCC_INDEX_START, + /* TCC range 0 */ + RAS_GFX_V9__GFX_TCC_INDEX0_START = + RAS_GFX_V9__GFX_TCC_INDEX_START, + RAS_GFX_V9__GFX_TCC_CACHE_DATA = + RAS_GFX_V9__GFX_TCC_INDEX0_START, + RAS_GFX_V9__GFX_TCC_CACHE_DATA_BANK_0_1, + RAS_GFX_V9__GFX_TCC_CACHE_DATA_BANK_1_0, + RAS_GFX_V9__GFX_TCC_CACHE_DATA_BANK_1_1, + RAS_GFX_V9__GFX_TCC_CACHE_DIRTY_BANK_0, + RAS_GFX_V9__GFX_TCC_CACHE_DIRTY_BANK_1, + RAS_GFX_V9__GFX_TCC_HIGH_RATE_TAG, + RAS_GFX_V9__GFX_TCC_LOW_RATE_TAG, + RAS_GFX_V9__GFX_TCC_INDEX0_END = + RAS_GFX_V9__GFX_TCC_LOW_RATE_TAG, + /* TCC range 1 */ + RAS_GFX_V9__GFX_TCC_INDEX1_START, + RAS_GFX_V9__GFX_TCC_IN_USE_DEC = + RAS_GFX_V9__GFX_TCC_INDEX1_START, + RAS_GFX_V9__GFX_TCC_IN_USE_TRANSFER, + RAS_GFX_V9__GFX_TCC_INDEX1_END = + RAS_GFX_V9__GFX_TCC_IN_USE_TRANSFER, + /* TCC range 2 */ + RAS_GFX_V9__GFX_TCC_INDEX2_START, + RAS_GFX_V9__GFX_TCC_RETURN_DATA = + RAS_GFX_V9__GFX_TCC_INDEX2_START, + RAS_GFX_V9__GFX_TCC_RETURN_CONTROL, + RAS_GFX_V9__GFX_TCC_UC_ATOMIC_FIFO, + RAS_GFX_V9__GFX_TCC_WRITE_RETURN, + RAS_GFX_V9__GFX_TCC_WRITE_CACHE_READ, + RAS_GFX_V9__GFX_TCC_SRC_FIFO, + RAS_GFX_V9__GFX_TCC_SRC_FIFO_NEXT_RAM, + RAS_GFX_V9__GFX_TCC_CACHE_TAG_PROBE_FIFO, + RAS_GFX_V9__GFX_TCC_INDEX2_END = + RAS_GFX_V9__GFX_TCC_CACHE_TAG_PROBE_FIFO, + /* TCC range 3 */ + RAS_GFX_V9__GFX_TCC_INDEX3_START, + RAS_GFX_V9__GFX_TCC_LATENCY_FIFO = + RAS_GFX_V9__GFX_TCC_INDEX3_START, + RAS_GFX_V9__GFX_TCC_LATENCY_FIFO_NEXT_RAM, + RAS_GFX_V9__GFX_TCC_INDEX3_END = + RAS_GFX_V9__GFX_TCC_LATENCY_FIFO_NEXT_RAM, + /* TCC range 4 */ + RAS_GFX_V9__GFX_TCC_INDEX4_START, + RAS_GFX_V9__GFX_TCC_WRRET_TAG_WRITE_RETURN = + RAS_GFX_V9__GFX_TCC_INDEX4_START, + RAS_GFX_V9__GFX_TCC_ATOMIC_RETURN_BUFFER, + RAS_GFX_V9__GFX_TCC_INDEX4_END = + RAS_GFX_V9__GFX_TCC_ATOMIC_RETURN_BUFFER, + RAS_GFX_V9__GFX_TCC_INDEX_END = + RAS_GFX_V9__GFX_TCC_INDEX4_END, + /* TCI */ + RAS_GFX_V9__GFX_TCI_WRITE_RAM, + /* TCP */ + RAS_GFX_V9__GFX_TCP_INDEX_START, + RAS_GFX_V9__GFX_TCP_CACHE_RAM = + RAS_GFX_V9__GFX_TCP_INDEX_START, + RAS_GFX_V9__GFX_TCP_LFIFO_RAM, + RAS_GFX_V9__GFX_TCP_CMD_FIFO, + RAS_GFX_V9__GFX_TCP_VM_FIFO, + RAS_GFX_V9__GFX_TCP_DB_RAM, + RAS_GFX_V9__GFX_TCP_UTCL1_LFIFO0, + RAS_GFX_V9__GFX_TCP_UTCL1_LFIFO1, + RAS_GFX_V9__GFX_TCP_INDEX_END = + RAS_GFX_V9__GFX_TCP_UTCL1_LFIFO1, + /* TD */ + RAS_GFX_V9__GFX_TD_INDEX_START, + RAS_GFX_V9__GFX_TD_SS_FIFO_LO = + RAS_GFX_V9__GFX_TD_INDEX_START, + RAS_GFX_V9__GFX_TD_SS_FIFO_HI, + RAS_GFX_V9__GFX_TD_CS_FIFO, + RAS_GFX_V9__GFX_TD_INDEX_END = RAS_GFX_V9__GFX_TD_CS_FIFO, + /* EA (3 sub-ranges) */ + RAS_GFX_V9__GFX_EA_INDEX_START, + /* EA range 0 */ + RAS_GFX_V9__GFX_EA_INDEX0_START = + RAS_GFX_V9__GFX_EA_INDEX_START, + RAS_GFX_V9__GFX_EA_DRAMRD_CMDMEM = + RAS_GFX_V9__GFX_EA_INDEX0_START, + RAS_GFX_V9__GFX_EA_DRAMWR_CMDMEM, + RAS_GFX_V9__GFX_EA_DRAMWR_DATAMEM, + RAS_GFX_V9__GFX_EA_RRET_TAGMEM, + RAS_GFX_V9__GFX_EA_WRET_TAGMEM, + RAS_GFX_V9__GFX_EA_GMIRD_CMDMEM, + RAS_GFX_V9__GFX_EA_GMIWR_CMDMEM, + RAS_GFX_V9__GFX_EA_GMIWR_DATAMEM, + RAS_GFX_V9__GFX_EA_INDEX0_END = + RAS_GFX_V9__GFX_EA_GMIWR_DATAMEM, + /* EA range 1 */ + RAS_GFX_V9__GFX_EA_INDEX1_START, + RAS_GFX_V9__GFX_EA_DRAMRD_PAGEMEM = + RAS_GFX_V9__GFX_EA_INDEX1_START, + RAS_GFX_V9__GFX_EA_DRAMWR_PAGEMEM, + RAS_GFX_V9__GFX_EA_IORD_CMDMEM, + RAS_GFX_V9__GFX_EA_IOWR_CMDMEM, + RAS_GFX_V9__GFX_EA_IOWR_DATAMEM, + RAS_GFX_V9__GFX_EA_GMIRD_PAGEMEM, + RAS_GFX_V9__GFX_EA_GMIWR_PAGEMEM, + RAS_GFX_V9__GFX_EA_INDEX1_END = + RAS_GFX_V9__GFX_EA_GMIWR_PAGEMEM, + /* EA range 2 */ + RAS_GFX_V9__GFX_EA_INDEX2_START, + RAS_GFX_V9__GFX_EA_MAM_D0MEM = + RAS_GFX_V9__GFX_EA_INDEX2_START, + RAS_GFX_V9__GFX_EA_MAM_D1MEM, + RAS_GFX_V9__GFX_EA_MAM_D2MEM, + RAS_GFX_V9__GFX_EA_MAM_D3MEM, + RAS_GFX_V9__GFX_EA_INDEX2_END = + RAS_GFX_V9__GFX_EA_MAM_D3MEM, + RAS_GFX_V9__GFX_EA_INDEX_END = + RAS_GFX_V9__GFX_EA_INDEX2_END, + /* UTC VM L2 bank */ + RAS_GFX_V9__UTC_VML2_BANK_CACHE, + /* UTC VM walker */ + RAS_GFX_V9__UTC_VML2_WALKER, + /* UTC ATC L2 2MB cache */ + RAS_GFX_V9__UTC_ATCL2_CACHE_2M_BANK, + /* UTC ATC L2 4KB cache */ + RAS_GFX_V9__UTC_ATCL2_CACHE_4K_BANK, + RAS_GFX_V9__GFX_MAX +}; + +extern const struct ras_gfx_ip_func gfx_ras_func_v9_0; + +#endif diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_log_ring.c b/drivers/gpu/drm/amd/ras/rascore/ras_log_ring.c new file mode 100644 index 000000000000..0a838fdcb2f6 --- /dev/null +++ b/drivers/gpu/drm/amd/ras/rascore/ras_log_ring.c @@ -0,0 +1,317 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ +#include "ras.h" +#include "ras_core_status.h" +#include "ras_log_ring.h" + +#define RAS_LOG_MAX_QUERY_SIZE 0xC000 +#define RAS_LOG_MEM_TEMP_SIZE 0x200 +#define RAS_LOG_MEMPOOL_SIZE \ + (RAS_LOG_MAX_QUERY_SIZE + RAS_LOG_MEM_TEMP_SIZE) + +#define BATCH_IDX_TO_TREE_IDX(batch_idx, sn) (((batch_idx) << 8) | (sn)) + +static const uint64_t ras_rma_aca_reg[ACA_REG_MAX_COUNT] = { + [ACA_REG_IDX__CTL] = 0x1, + [ACA_REG_IDX__STATUS] = 0xB000000000000137, + [ACA_REG_IDX__ADDR] = 0x0, + [ACA_REG_IDX__MISC0] = 0x0, + [ACA_REG_IDX__CONFG] = 0x1ff00000002, + [ACA_REG_IDX__IPID] = 0x9600000000, + [ACA_REG_IDX__SYND] = 0x0, +}; + +static uint64_t ras_log_ring_get_logged_ecc_count(struct ras_core_context *ras_core) +{ + struct ras_log_ring *log_ring = &ras_core->ras_log_ring; + uint64_t count = 0; + + if (log_ring->logged_ecc_count < 0) { + RAS_DEV_WARN(ras_core->dev, + "Error: the logged ras count should not less than 0!\n"); + count = 0; + } else { + count = log_ring->logged_ecc_count; + } + + if (count > RAS_LOG_MEMPOOL_SIZE) + RAS_DEV_WARN(ras_core->dev, + "Error: the logged ras count is out of range!\n"); + + return count; +} + +static int ras_log_ring_add_data(struct ras_core_context *ras_core, + struct ras_log_info *log, struct ras_log_batch_tag *batch_tag) +{ + struct ras_log_ring *log_ring = &ras_core->ras_log_ring; + unsigned long flags = 0; + int ret = 0; + + if (batch_tag && (batch_tag->sub_seqno >= MAX_RECORD_PER_BATCH)) { + RAS_DEV_ERR(ras_core->dev, + "Invalid batch sub seqno:%d, batch:0x%llx\n", + batch_tag->sub_seqno, batch_tag->batch_id); + return -EINVAL; + } + + spin_lock_irqsave(&log_ring->spin_lock, flags); + if (batch_tag) { + log->seqno = + BATCH_IDX_TO_TREE_IDX(batch_tag->batch_id, batch_tag->sub_seqno); + batch_tag->sub_seqno++; + } else { + log->seqno = BATCH_IDX_TO_TREE_IDX(log_ring->mono_upward_batch_id, 0); + log_ring->mono_upward_batch_id++; + } + ret = radix_tree_insert(&log_ring->ras_log_root, log->seqno, log); + if (!ret) + log_ring->logged_ecc_count++; + spin_unlock_irqrestore(&log_ring->spin_lock, flags); + + if (ret) { + RAS_DEV_ERR(ras_core->dev, + "Failed to add ras log! seqno:0x%llx, ret:%d\n", + log->seqno, ret); + mempool_free(log, log_ring->ras_log_mempool); + } + + return ret; +} + +static int ras_log_ring_delete_data(struct ras_core_context *ras_core, uint32_t count) +{ + struct ras_log_ring *log_ring = &ras_core->ras_log_ring; + unsigned long flags = 0; + uint32_t i = 0, j = 0; + uint64_t batch_id, idx; + void *data; + int ret = -ENODATA; + + if (count > ras_log_ring_get_logged_ecc_count(ras_core)) + return -EINVAL; + + spin_lock_irqsave(&log_ring->spin_lock, flags); + batch_id = log_ring->last_del_batch_id; + while (batch_id < log_ring->mono_upward_batch_id) { + for (j = 0; j < MAX_RECORD_PER_BATCH; j++) { + idx = BATCH_IDX_TO_TREE_IDX(batch_id, j); + data = radix_tree_delete(&log_ring->ras_log_root, idx); + if (data) { + mempool_free(data, log_ring->ras_log_mempool); + log_ring->logged_ecc_count--; + i++; + } + } + batch_id = ++log_ring->last_del_batch_id; + if (i >= count) { + ret = 0; + break; + } + } + spin_unlock_irqrestore(&log_ring->spin_lock, flags); + + return ret; +} + +static void ras_log_ring_clear_log_tree(struct ras_core_context *ras_core) +{ + struct ras_log_ring *log_ring = &ras_core->ras_log_ring; + uint64_t batch_id, idx; + unsigned long flags = 0; + void *data; + int j; + + if ((log_ring->mono_upward_batch_id <= log_ring->last_del_batch_id) && + !log_ring->logged_ecc_count) + return; + + spin_lock_irqsave(&log_ring->spin_lock, flags); + batch_id = log_ring->last_del_batch_id; + while (batch_id < log_ring->mono_upward_batch_id) { + for (j = 0; j < MAX_RECORD_PER_BATCH; j++) { + idx = BATCH_IDX_TO_TREE_IDX(batch_id, j); + data = radix_tree_delete(&log_ring->ras_log_root, idx); + if (data) { + mempool_free(data, log_ring->ras_log_mempool); + log_ring->logged_ecc_count--; + } + } + batch_id++; + } + spin_unlock_irqrestore(&log_ring->spin_lock, flags); + +} + +int ras_log_ring_sw_init(struct ras_core_context *ras_core) +{ + struct ras_log_ring *log_ring = &ras_core->ras_log_ring; + + memset(log_ring, 0, sizeof(*log_ring)); + + log_ring->ras_log_mempool = mempool_create_kmalloc_pool( + RAS_LOG_MEMPOOL_SIZE, sizeof(struct ras_log_info)); + if (!log_ring->ras_log_mempool) + return -ENOMEM; + + INIT_RADIX_TREE(&log_ring->ras_log_root, GFP_KERNEL); + + spin_lock_init(&log_ring->spin_lock); + + return 0; +} + +int ras_log_ring_sw_fini(struct ras_core_context *ras_core) +{ + struct ras_log_ring *log_ring = &ras_core->ras_log_ring; + + ras_log_ring_clear_log_tree(ras_core); + log_ring->logged_ecc_count = 0; + log_ring->last_del_batch_id = 0; + log_ring->mono_upward_batch_id = 0; + + mempool_destroy(log_ring->ras_log_mempool); + + return 0; +} + +struct ras_log_batch_tag *ras_log_ring_create_batch_tag(struct ras_core_context *ras_core) +{ + struct ras_log_ring *log_ring = &ras_core->ras_log_ring; + struct ras_log_batch_tag *batch_tag; + unsigned long flags = 0; + + batch_tag = kzalloc(sizeof(*batch_tag), GFP_KERNEL); + if (!batch_tag) + return NULL; + + spin_lock_irqsave(&log_ring->spin_lock, flags); + batch_tag->batch_id = log_ring->mono_upward_batch_id; + log_ring->mono_upward_batch_id++; + spin_unlock_irqrestore(&log_ring->spin_lock, flags); + + batch_tag->sub_seqno = 0; + batch_tag->timestamp = ras_core_get_utc_second_timestamp(ras_core); + return batch_tag; +} + +void ras_log_ring_destroy_batch_tag(struct ras_core_context *ras_core, + struct ras_log_batch_tag *batch_tag) +{ + kfree(batch_tag); +} + +void ras_log_ring_add_log_event(struct ras_core_context *ras_core, + enum ras_log_event event, void *data, struct ras_log_batch_tag *batch_tag) +{ + struct ras_log_ring *log_ring = &ras_core->ras_log_ring; + struct device_system_info dev_info = {0}; + struct ras_log_info *log; + uint64_t socket_id; + void *obj; + + obj = mempool_alloc_preallocated(log_ring->ras_log_mempool); + if (!obj || + (ras_log_ring_get_logged_ecc_count(ras_core) >= RAS_LOG_MEMPOOL_SIZE)) { + ras_log_ring_delete_data(ras_core, RAS_LOG_MEM_TEMP_SIZE); + if (!obj) + obj = mempool_alloc_preallocated(log_ring->ras_log_mempool); + } + + if (!obj) { + RAS_DEV_ERR(ras_core->dev, "ERROR: Failed to alloc ras log buffer!\n"); + return; + } + + log = (struct ras_log_info *)obj; + + memset(log, 0, sizeof(*log)); + log->timestamp = + batch_tag ? batch_tag->timestamp : ras_core_get_utc_second_timestamp(ras_core); + log->event = event; + + if (data) + memcpy(&log->aca_reg, data, sizeof(log->aca_reg)); + + if (event == RAS_LOG_EVENT_RMA) { + memcpy(&log->aca_reg, ras_rma_aca_reg, sizeof(log->aca_reg)); + ras_core_get_device_system_info(ras_core, &dev_info); + socket_id = dev_info.socket_id; + log->aca_reg.regs[ACA_REG_IDX__IPID] |= ((socket_id / 4) & 0x01); + log->aca_reg.regs[ACA_REG_IDX__IPID] |= (((socket_id % 4) & 0x3) << 44); + } + + ras_log_ring_add_data(ras_core, log, batch_tag); +} + +static struct ras_log_info *ras_log_ring_lookup_data(struct ras_core_context *ras_core, + uint64_t idx) +{ + struct ras_log_ring *log_ring = &ras_core->ras_log_ring; + unsigned long flags = 0; + void *data; + + spin_lock_irqsave(&log_ring->spin_lock, flags); + data = radix_tree_lookup(&log_ring->ras_log_root, idx); + spin_unlock_irqrestore(&log_ring->spin_lock, flags); + + return (struct ras_log_info *)data; +} + +int ras_log_ring_get_batch_records(struct ras_core_context *ras_core, uint64_t batch_id, + struct ras_log_info **log_arr, uint32_t arr_num) +{ + struct ras_log_ring *log_ring = &ras_core->ras_log_ring; + uint32_t i, idx, count = 0; + void *data; + + if ((batch_id >= log_ring->mono_upward_batch_id) || + (batch_id < log_ring->last_del_batch_id)) + return -EINVAL; + + for (i = 0; i < MAX_RECORD_PER_BATCH; i++) { + idx = BATCH_IDX_TO_TREE_IDX(batch_id, i); + data = ras_log_ring_lookup_data(ras_core, idx); + if (data) { + log_arr[count++] = data; + if (count >= arr_num) + break; + } + } + + return count; +} + +int ras_log_ring_get_batch_overview(struct ras_core_context *ras_core, + struct ras_log_batch_overview *overview) +{ + struct ras_log_ring *log_ring = &ras_core->ras_log_ring; + + overview->logged_batch_count = + log_ring->mono_upward_batch_id - log_ring->last_del_batch_id; + overview->last_batch_id = log_ring->mono_upward_batch_id; + overview->first_batch_id = log_ring->last_del_batch_id; + + return 0; +} diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_log_ring.h b/drivers/gpu/drm/amd/ras/rascore/ras_log_ring.h new file mode 100644 index 000000000000..0ff6cc35678d --- /dev/null +++ b/drivers/gpu/drm/amd/ras/rascore/ras_log_ring.h @@ -0,0 +1,93 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ +#ifndef __RAS_LOG_RING_H__ +#define __RAS_LOG_RING_H__ +#include "ras_aca.h" + +#define MAX_RECORD_PER_BATCH 32 + +#define RAS_LOG_SEQNO_TO_BATCH_IDX(seqno) ((seqno) >> 8) + +enum ras_log_event { + RAS_LOG_EVENT_NONE, + RAS_LOG_EVENT_UE, + RAS_LOG_EVENT_DE, + RAS_LOG_EVENT_CE, + RAS_LOG_EVENT_POISON_CREATION, + RAS_LOG_EVENT_POISON_CONSUMPTION, + RAS_LOG_EVENT_RMA, + RAS_LOG_EVENT_COUNT_MAX, +}; + +struct ras_aca_reg { + uint64_t regs[ACA_REG_MAX_COUNT]; +}; + +struct ras_log_info { + uint64_t seqno; + uint64_t timestamp; + enum ras_log_event event; + union { + struct ras_aca_reg aca_reg; + }; +}; + +struct ras_log_batch_tag { + uint64_t batch_id; + uint64_t timestamp; + uint32_t sub_seqno; +}; + +struct ras_log_ring { + void *ras_log_mempool; + struct radix_tree_root ras_log_root; + spinlock_t spin_lock; + uint64_t mono_upward_batch_id; + uint64_t last_del_batch_id; + int logged_ecc_count; +}; + +struct ras_log_batch_overview { + uint64_t first_batch_id; + uint64_t last_batch_id; + uint32_t logged_batch_count; +}; + +struct ras_core_context; + +int ras_log_ring_sw_init(struct ras_core_context *ras_core); +int ras_log_ring_sw_fini(struct ras_core_context *ras_core); + +struct ras_log_batch_tag *ras_log_ring_create_batch_tag(struct ras_core_context *ras_core); +void ras_log_ring_destroy_batch_tag(struct ras_core_context *ras_core, + struct ras_log_batch_tag *tag); +void ras_log_ring_add_log_event(struct ras_core_context *ras_core, + enum ras_log_event event, void *data, struct ras_log_batch_tag *tag); + +int ras_log_ring_get_batch_records(struct ras_core_context *ras_core, uint64_t batch_idx, + struct ras_log_info **log_arr, uint32_t arr_num); + +int ras_log_ring_get_batch_overview(struct ras_core_context *ras_core, + struct ras_log_batch_overview *overview); +#endif diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_mp1.c b/drivers/gpu/drm/amd/ras/rascore/ras_mp1.c new file mode 100644 index 000000000000..f3321df85021 --- /dev/null +++ b/drivers/gpu/drm/amd/ras/rascore/ras_mp1.c @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include "ras.h" +#include "ras_mp1.h" +#include "ras_mp1_v13_0.h" + +static const struct ras_mp1_ip_func *ras_mp1_get_ip_funcs( + struct ras_core_context *ras_core, uint32_t ip_version) +{ + switch (ip_version) { + case IP_VERSION(13, 0, 6): + case IP_VERSION(13, 0, 14): + case IP_VERSION(13, 0, 12): + return &mp1_ras_func_v13_0; + default: + RAS_DEV_ERR(ras_core->dev, + "MP1 ip version(0x%x) is not supported!\n", ip_version); + break; + } + + return NULL; +} + +int ras_mp1_get_bank_count(struct ras_core_context *ras_core, + enum ras_err_type type, u32 *count) +{ + struct ras_mp1 *mp1 = &ras_core->ras_mp1; + + return mp1->ip_func->get_valid_bank_count(ras_core, type, count); +} + +int ras_mp1_dump_bank(struct ras_core_context *ras_core, + u32 type, u32 idx, u32 reg_idx, u64 *val) +{ + struct ras_mp1 *mp1 = &ras_core->ras_mp1; + + return mp1->ip_func->dump_valid_bank(ras_core, type, idx, reg_idx, val); +} + +int ras_mp1_hw_init(struct ras_core_context *ras_core) +{ + struct ras_mp1 *mp1 = &ras_core->ras_mp1; + + mp1->mp1_ip_version = ras_core->config->mp1_ip_version; + mp1->sys_func = ras_core->config->mp1_cfg.mp1_sys_fn; + if (!mp1->sys_func) { + RAS_DEV_ERR(ras_core->dev, "RAS mp1 sys function not configured!\n"); + return -EINVAL; + } + + mp1->ip_func = ras_mp1_get_ip_funcs(ras_core, mp1->mp1_ip_version); + + return mp1->ip_func ? RAS_CORE_OK : -EINVAL; +} + +int ras_mp1_hw_fini(struct ras_core_context *ras_core) +{ + return 0; +} diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_mp1.h b/drivers/gpu/drm/amd/ras/rascore/ras_mp1.h new file mode 100644 index 000000000000..de1d08286f41 --- /dev/null +++ b/drivers/gpu/drm/amd/ras/rascore/ras_mp1.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ +#ifndef __RAS_MP1_H__ +#define __RAS_MP1_H__ +#include "ras.h" + +enum ras_err_type; +struct ras_mp1_ip_func { + int (*get_valid_bank_count)(struct ras_core_context *ras_core, + enum ras_err_type type, u32 *count); + int (*dump_valid_bank)(struct ras_core_context *ras_core, + enum ras_err_type type, u32 idx, u32 reg_idx, u64 *val); +}; + +struct ras_mp1 { + uint32_t mp1_ip_version; + const struct ras_mp1_ip_func *ip_func; + const struct ras_mp1_sys_func *sys_func; +}; + +int ras_mp1_hw_init(struct ras_core_context *ras_core); +int ras_mp1_hw_fini(struct ras_core_context *ras_core); + +int ras_mp1_get_bank_count(struct ras_core_context *ras_core, + enum ras_err_type type, u32 *count); + +int ras_mp1_dump_bank(struct ras_core_context *ras_core, + u32 ecc_type, u32 idx, u32 reg_idx, u64 *val); +#endif diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_mp1_v13_0.c b/drivers/gpu/drm/amd/ras/rascore/ras_mp1_v13_0.c new file mode 100644 index 000000000000..310d39fc816b --- /dev/null +++ b/drivers/gpu/drm/amd/ras/rascore/ras_mp1_v13_0.c @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ +#include "ras.h" +#include "ras_mp1.h" +#include "ras_core_status.h" +#include "ras_mp1_v13_0.h" + +#define RAS_MP1_MSG_QueryValidMcaCount 0x36 +#define RAS_MP1_MSG_McaBankDumpDW 0x37 +#define RAS_MP1_MSG_ClearMcaOnRead 0x39 +#define RAS_MP1_MSG_QueryValidMcaCeCount 0x3A +#define RAS_MP1_MSG_McaBankCeDumpDW 0x3B + +#define MAX_UE_BANKS_PER_QUERY 12 +#define MAX_CE_BANKS_PER_QUERY 12 + +static int mp1_v13_0_get_bank_count(struct ras_core_context *ras_core, + enum ras_err_type type, u32 *count) +{ + struct ras_mp1 *mp1 = &ras_core->ras_mp1; + const struct ras_mp1_sys_func *sys_func = mp1->sys_func; + uint32_t bank_count = 0; + u32 msg; + int ret; + + if (!count) + return -EINVAL; + + if (!sys_func || !sys_func->mp1_get_valid_bank_count) + return -RAS_CORE_NOT_SUPPORTED; + + switch (type) { + case RAS_ERR_TYPE__UE: + msg = RAS_MP1_MSG_QueryValidMcaCount; + break; + case RAS_ERR_TYPE__CE: + case RAS_ERR_TYPE__DE: + msg = RAS_MP1_MSG_QueryValidMcaCeCount; + break; + default: + return -EINVAL; + } + + ret = sys_func->mp1_get_valid_bank_count(ras_core, msg, &bank_count); + if (!ret) { + if (((type == RAS_ERR_TYPE__UE) && (bank_count >= MAX_UE_BANKS_PER_QUERY)) || + ((type == RAS_ERR_TYPE__CE) && (bank_count >= MAX_CE_BANKS_PER_QUERY))) + return -EINVAL; + + *count = bank_count; + } + + return ret; +} + +static int mp1_v13_0_dump_bank(struct ras_core_context *ras_core, + enum ras_err_type type, u32 idx, u32 reg_idx, u64 *val) +{ + struct ras_mp1 *mp1 = &ras_core->ras_mp1; + const struct ras_mp1_sys_func *sys_func = mp1->sys_func; + u32 msg; + + if (!sys_func || !sys_func->mp1_dump_valid_bank) + return -RAS_CORE_NOT_SUPPORTED; + + switch (type) { + case RAS_ERR_TYPE__UE: + msg = RAS_MP1_MSG_McaBankDumpDW; + break; + case RAS_ERR_TYPE__CE: + case RAS_ERR_TYPE__DE: + msg = RAS_MP1_MSG_McaBankCeDumpDW; + break; + default: + return -EINVAL; + } + + return sys_func->mp1_dump_valid_bank(ras_core, msg, idx, reg_idx, val); +} + +const struct ras_mp1_ip_func mp1_ras_func_v13_0 = { + .get_valid_bank_count = mp1_v13_0_get_bank_count, + .dump_valid_bank = mp1_v13_0_dump_bank, +}; diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_mp1_v13_0.h b/drivers/gpu/drm/amd/ras/rascore/ras_mp1_v13_0.h new file mode 100644 index 000000000000..2edfdb5f6a75 --- /dev/null +++ b/drivers/gpu/drm/amd/ras/rascore/ras_mp1_v13_0.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ +#ifndef __RAS_MP1_V13_0_H__ +#define __RAS_MP1_V13_0_H__ +#include "ras_mp1.h" + +extern const struct ras_mp1_ip_func mp1_ras_func_v13_0; + +#endif diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_nbio.c b/drivers/gpu/drm/amd/ras/rascore/ras_nbio.c new file mode 100644 index 000000000000..bfddd104d548 --- /dev/null +++ b/drivers/gpu/drm/amd/ras/rascore/ras_nbio.c @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include "ras.h" +#include "ras_nbio.h" +#include "ras_nbio_v7_9.h" + +static const struct ras_nbio_ip_func *ras_nbio_get_ip_funcs( + struct ras_core_context *ras_core, uint32_t ip_version) +{ + switch (ip_version) { + case IP_VERSION(7, 9, 0): + case IP_VERSION(7, 9, 1): + return &ras_nbio_v7_9; + default: + RAS_DEV_ERR(ras_core->dev, + "NBIO ip version(0x%x) is not supported!\n", ip_version); + break; + } + + return NULL; +} + +int ras_nbio_hw_init(struct ras_core_context *ras_core) +{ + struct ras_nbio *nbio = &ras_core->ras_nbio; + + nbio->nbio_ip_version = ras_core->config->nbio_ip_version; + nbio->sys_func = ras_core->config->nbio_cfg.nbio_sys_fn; + if (!nbio->sys_func) { + RAS_DEV_ERR(ras_core->dev, "RAS nbio sys function not configured!\n"); + return -EINVAL; + } + + nbio->ip_func = ras_nbio_get_ip_funcs(ras_core, nbio->nbio_ip_version); + if (!nbio->ip_func) + return -EINVAL; + + if (nbio->sys_func) { + if (nbio->sys_func->set_ras_controller_irq_state) + nbio->sys_func->set_ras_controller_irq_state(ras_core, true); + if (nbio->sys_func->set_ras_err_event_athub_irq_state) + nbio->sys_func->set_ras_err_event_athub_irq_state(ras_core, true); + } + + return 0; +} + +int ras_nbio_hw_fini(struct ras_core_context *ras_core) +{ + struct ras_nbio *nbio = &ras_core->ras_nbio; + + if (nbio->sys_func) { + if (nbio->sys_func->set_ras_controller_irq_state) + nbio->sys_func->set_ras_controller_irq_state(ras_core, false); + if (nbio->sys_func->set_ras_err_event_athub_irq_state) + nbio->sys_func->set_ras_err_event_athub_irq_state(ras_core, false); + } + + return 0; +} + +bool ras_nbio_handle_irq_error(struct ras_core_context *ras_core, void *data) +{ + struct ras_nbio *nbio = &ras_core->ras_nbio; + + if (nbio->ip_func) { + if (nbio->ip_func->handle_ras_controller_intr_no_bifring) + nbio->ip_func->handle_ras_controller_intr_no_bifring(ras_core); + if (nbio->ip_func->handle_ras_err_event_athub_intr_no_bifring) + nbio->ip_func->handle_ras_err_event_athub_intr_no_bifring(ras_core); + } + + return true; +} diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_nbio.h b/drivers/gpu/drm/amd/ras/rascore/ras_nbio.h new file mode 100644 index 000000000000..0a1313e59a02 --- /dev/null +++ b/drivers/gpu/drm/amd/ras/rascore/ras_nbio.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef __RAS_NBIO_H__ +#define __RAS_NBIO_H__ +#include "ras.h" + +struct ras_core_context; + +struct ras_nbio_ip_func { + int (*handle_ras_controller_intr_no_bifring)(struct ras_core_context *ras_core); + int (*handle_ras_err_event_athub_intr_no_bifring)(struct ras_core_context *ras_core); + uint32_t (*get_memory_partition_mode)(struct ras_core_context *ras_core); +}; + +struct ras_nbio { + uint32_t nbio_ip_version; + const struct ras_nbio_ip_func *ip_func; + const struct ras_nbio_sys_func *sys_func; +}; + +int ras_nbio_hw_init(struct ras_core_context *ras_core); +int ras_nbio_hw_fini(struct ras_core_context *ras_core); +bool ras_nbio_handle_irq_error(struct ras_core_context *ras_core, void *data); +#endif diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_nbio_v7_9.c b/drivers/gpu/drm/amd/ras/rascore/ras_nbio_v7_9.c new file mode 100644 index 000000000000..f17d708ec668 --- /dev/null +++ b/drivers/gpu/drm/amd/ras/rascore/ras_nbio_v7_9.c @@ -0,0 +1,123 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include "ras.h" +#include "ras_nbio_v7_9.h" + +#define BIF_BX0_BIF_DOORBELL_INT_CNTL__RAS_ATHUB_ERR_EVENT_INTERRUPT_CLEAR__SHIFT 0x12 +#define BIF_BX0_BIF_DOORBELL_INT_CNTL__RAS_ATHUB_ERR_EVENT_INTERRUPT_CLEAR_MASK 0x00040000L +#define BIF_BX0_BIF_DOORBELL_INT_CNTL__RAS_ATHUB_ERR_EVENT_INTERRUPT_STATUS__SHIFT 0x2 +#define BIF_BX0_BIF_DOORBELL_INT_CNTL__RAS_ATHUB_ERR_EVENT_INTERRUPT_STATUS_MASK 0x00000004L +#define BIF_BX0_BIF_DOORBELL_INT_CNTL__RAS_CNTLR_INTERRUPT_CLEAR__SHIFT 0x11 +#define BIF_BX0_BIF_DOORBELL_INT_CNTL__RAS_CNTLR_INTERRUPT_CLEAR_MASK 0x00020000L +#define BIF_BX0_BIF_DOORBELL_INT_CNTL__RAS_CNTLR_INTERRUPT_STATUS__SHIFT 0x1 +#define BIF_BX0_BIF_DOORBELL_INT_CNTL__RAS_CNTLR_INTERRUPT_STATUS_MASK 0x00000002L + +#define regBIF_BX0_BIF_DOORBELL_INT_CNTL_BASE_IDX 2 +#define regBIF_BX0_BIF_DOORBELL_INT_CNTL 0x00fe + +#define regBIF_BX0_BIF_INTR_CNTL 0x0101 +#define regBIF_BX0_BIF_INTR_CNTL_BASE_IDX 2 + +/* BIF_BX0_BIF_INTR_CNTL */ +#define BIF_BX0_BIF_INTR_CNTL__RAS_INTR_VEC_SEL__SHIFT 0x0 +#define BIF_BX0_BIF_INTR_CNTL__RAS_INTR_VEC_SEL_MASK 0x00000001L + +#define regBIF_BX_PF0_PARTITION_MEM_STATUS 0x0164 +#define regBIF_BX_PF0_PARTITION_MEM_STATUS_BASE_IDX 2 +/* BIF_BX_PF0_PARTITION_MEM_STATUS */ +#define BIF_BX_PF0_PARTITION_MEM_STATUS__CHANGE_STATUE__SHIFT 0x0 +#define BIF_BX_PF0_PARTITION_MEM_STATUS__NPS_MODE__SHIFT 0x4 +#define BIF_BX_PF0_PARTITION_MEM_STATUS__CHANGE_STATUE_MASK 0x0000000FL +#define BIF_BX_PF0_PARTITION_MEM_STATUS__NPS_MODE_MASK 0x00000FF0L + + +static int nbio_v7_9_handle_ras_controller_intr_no_bifring(struct ras_core_context *ras_core) +{ + uint32_t bif_doorbell_intr_cntl = 0; + + bif_doorbell_intr_cntl = + RAS_DEV_RREG32_SOC15(ras_core->dev, NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL); + + if (REG_GET_FIELD(bif_doorbell_intr_cntl, + BIF_BX0_BIF_DOORBELL_INT_CNTL, RAS_CNTLR_INTERRUPT_STATUS)) { + /* driver has to clear the interrupt status when bif ring is disabled */ + bif_doorbell_intr_cntl = REG_SET_FIELD(bif_doorbell_intr_cntl, + BIF_BX0_BIF_DOORBELL_INT_CNTL, + RAS_CNTLR_INTERRUPT_CLEAR, 1); + + RAS_DEV_WREG32_SOC15(ras_core->dev, + NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl); + + /* TODO: handle ras controller interrupt */ + } + + return 0; +} + +static int nbio_v7_9_handle_ras_err_event_athub_intr_no_bifring(struct ras_core_context *ras_core) +{ + uint32_t bif_doorbell_intr_cntl = 0; + int ret = 0; + + bif_doorbell_intr_cntl = + RAS_DEV_RREG32_SOC15(ras_core->dev, NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL); + + if (REG_GET_FIELD(bif_doorbell_intr_cntl, + BIF_BX0_BIF_DOORBELL_INT_CNTL, RAS_ATHUB_ERR_EVENT_INTERRUPT_STATUS)) { + /* driver has to clear the interrupt status when bif ring is disabled */ + bif_doorbell_intr_cntl = REG_SET_FIELD(bif_doorbell_intr_cntl, + BIF_BX0_BIF_DOORBELL_INT_CNTL, + RAS_ATHUB_ERR_EVENT_INTERRUPT_CLEAR, 1); + + RAS_DEV_WREG32_SOC15(ras_core->dev, + NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl); + + ret = ras_core_handle_fatal_error(ras_core); + } + + return ret; +} + +static uint32_t nbio_v7_9_get_memory_partition_mode(struct ras_core_context *ras_core) +{ + uint32_t mem_status; + uint32_t mem_mode; + + mem_status = + RAS_DEV_RREG32_SOC15(ras_core->dev, NBIO, 0, regBIF_BX_PF0_PARTITION_MEM_STATUS); + + /* Each bit represents a mode 1-8*/ + mem_mode = REG_GET_FIELD(mem_status, BIF_BX_PF0_PARTITION_MEM_STATUS, NPS_MODE); + + return ffs(mem_mode); +} + +const struct ras_nbio_ip_func ras_nbio_v7_9 = { + .handle_ras_controller_intr_no_bifring = + nbio_v7_9_handle_ras_controller_intr_no_bifring, + .handle_ras_err_event_athub_intr_no_bifring = + nbio_v7_9_handle_ras_err_event_athub_intr_no_bifring, + .get_memory_partition_mode = nbio_v7_9_get_memory_partition_mode, +}; diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_nbio_v7_9.h b/drivers/gpu/drm/amd/ras/rascore/ras_nbio_v7_9.h new file mode 100644 index 000000000000..8711c82a927f --- /dev/null +++ b/drivers/gpu/drm/amd/ras/rascore/ras_nbio_v7_9.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef __RAS_NBIO_V7_9_H__ +#define __RAS_NBIO_V7_9_H__ +#include "ras_nbio.h" + +extern const struct ras_nbio_ip_func ras_nbio_v7_9; + +#endif diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_process.c b/drivers/gpu/drm/amd/ras/rascore/ras_process.c new file mode 100644 index 000000000000..3267dcdb169c --- /dev/null +++ b/drivers/gpu/drm/amd/ras/rascore/ras_process.c @@ -0,0 +1,322 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ +#include "ras.h" +#include "ras_process.h" + +#define RAS_EVENT_FIFO_SIZE (128 * sizeof(struct ras_event_req)) + +#define RAS_POLLING_ECC_TIMEOUT 300 + +static int ras_process_put_event(struct ras_core_context *ras_core, + struct ras_event_req *req) +{ + struct ras_process *ras_proc = &ras_core->ras_proc; + int ret; + + ret = kfifo_in_spinlocked(&ras_proc->event_fifo, + req, sizeof(*req), &ras_proc->fifo_spinlock); + if (!ret) { + RAS_DEV_ERR(ras_core->dev, "Poison message fifo is full!\n"); + return -ENOSPC; + } + + return 0; +} + +static int ras_process_add_reset_gpu_event(struct ras_core_context *ras_core, + uint32_t reset_cause) +{ + struct ras_event_req req = {0}; + + req.reset = reset_cause; + + return ras_process_put_event(ras_core, &req); +} + +static int ras_process_get_event(struct ras_core_context *ras_core, + struct ras_event_req *req) +{ + struct ras_process *ras_proc = &ras_core->ras_proc; + + return kfifo_out_spinlocked(&ras_proc->event_fifo, + req, sizeof(*req), &ras_proc->fifo_spinlock); +} + +static void ras_process_clear_event_fifo(struct ras_core_context *ras_core) +{ + struct ras_event_req req; + int ret; + + do { + ret = ras_process_get_event(ras_core, &req); + } while (ret); +} + +#define AMDGPU_RAS_WAITING_DATA_READY 200 +static int ras_process_umc_event(struct ras_core_context *ras_core, + uint32_t event_count) +{ + struct ras_ecc_count ecc_data; + int ret = 0; + uint32_t timeout = 0; + uint32_t detected_de_count = 0; + + do { + memset(&ecc_data, 0, sizeof(ecc_data)); + ret = ras_core_update_ecc_info(ras_core); + if (ret) + return ret; + + ret = ras_core_query_block_ecc_data(ras_core, RAS_BLOCK_ID__UMC, &ecc_data); + if (ret) + return ret; + + if (ecc_data.new_de_count) { + detected_de_count += ecc_data.new_de_count; + timeout = 0; + } else { + if (!timeout && event_count) + timeout = AMDGPU_RAS_WAITING_DATA_READY; + + if (timeout) { + if (!--timeout) + break; + + msleep(1); + } + } + } while (detected_de_count < event_count); + + if (detected_de_count && ras_core_gpu_is_rma(ras_core)) + ras_process_add_reset_gpu_event(ras_core, GPU_RESET_CAUSE_RMA); + + return 0; +} + +static int ras_process_non_umc_event(struct ras_core_context *ras_core) +{ + struct ras_process *ras_proc = &ras_core->ras_proc; + struct ras_event_req req; + uint32_t event_count = kfifo_len(&ras_proc->event_fifo); + uint32_t reset_flags = 0; + int ret = 0, i; + + for (i = 0; i < event_count; i++) { + memset(&req, 0, sizeof(req)); + ret = ras_process_get_event(ras_core, &req); + if (!ret) + continue; + + ras_core_event_notify(ras_core, + RAS_EVENT_ID__POISON_CONSUMPTION, &req); + + reset_flags |= req.reset; + + if (req.reset == GPU_RESET_CAUSE_RMA) + continue; + + if (req.reset) + RAS_DEV_INFO(ras_core->dev, + "{%llu} GPU reset for %s RAS poison consumption is issued!\n", + req.seqno, ras_core_get_ras_block_name(req.block)); + else + RAS_DEV_INFO(ras_core->dev, + "{%llu} %s RAS poison consumption is issued!\n", + req.seqno, ras_core_get_ras_block_name(req.block)); + } + + if (reset_flags) { + ret = ras_core_event_notify(ras_core, + RAS_EVENT_ID__RESET_GPU, &reset_flags); + if (!ret && (reset_flags & GPU_RESET_CAUSE_RMA)) + return -RAS_CORE_GPU_IN_MODE1_RESET; + } + + return ret; +} + +int ras_process_handle_ras_event(struct ras_core_context *ras_core) +{ + struct ras_process *ras_proc = &ras_core->ras_proc; + uint32_t umc_event_count; + int ret; + + ret = ras_core_event_notify(ras_core, + RAS_EVENT_ID__RAS_EVENT_PROC_BEGIN, NULL); + if (ret) + return ret; + + ras_aca_clear_fatal_flag(ras_core); + ras_umc_log_pending_bad_bank(ras_core); + + do { + umc_event_count = atomic_read(&ras_proc->umc_interrupt_count); + ret = ras_process_umc_event(ras_core, umc_event_count); + if (ret == -RAS_CORE_GPU_IN_MODE1_RESET) + break; + + if (umc_event_count) + atomic_sub(umc_event_count, &ras_proc->umc_interrupt_count); + } while (atomic_read(&ras_proc->umc_interrupt_count)); + + if ((ret != -RAS_CORE_GPU_IN_MODE1_RESET) && + (kfifo_len(&ras_proc->event_fifo))) + ret = ras_process_non_umc_event(ras_core); + + if (ret == -RAS_CORE_GPU_IN_MODE1_RESET) { + /* Clear poison fifo */ + ras_process_clear_event_fifo(ras_core); + atomic_set(&ras_proc->umc_interrupt_count, 0); + } + + ras_core_event_notify(ras_core, + RAS_EVENT_ID__RAS_EVENT_PROC_END, NULL); + return ret; +} + +static int thread_wait_condition(void *param) +{ + struct ras_process *ras_proc = (struct ras_process *)param; + + return (kthread_should_stop() || + atomic_read(&ras_proc->ras_interrupt_req)); +} + +static int ras_process_thread(void *context) +{ + struct ras_core_context *ras_core = (struct ras_core_context *)context; + struct ras_process *ras_proc = &ras_core->ras_proc; + + while (!kthread_should_stop()) { + ras_wait_event_interruptible_timeout(&ras_proc->ras_process_wq, + thread_wait_condition, ras_proc, + msecs_to_jiffies(RAS_POLLING_ECC_TIMEOUT)); + + if (kthread_should_stop()) + break; + + if (!ras_core->is_initialized) + continue; + + atomic_set(&ras_proc->ras_interrupt_req, 0); + + if (ras_core_gpu_in_reset(ras_core)) + continue; + + if (ras_core->sys_fn && ras_core->sys_fn->async_handle_ras_event) + ras_core->sys_fn->async_handle_ras_event(ras_core, NULL); + else + ras_process_handle_ras_event(ras_core); + } + + return 0; +} + +int ras_process_init(struct ras_core_context *ras_core) +{ + struct ras_process *ras_proc = &ras_core->ras_proc; + int ret; + + ret = kfifo_alloc(&ras_proc->event_fifo, RAS_EVENT_FIFO_SIZE, GFP_KERNEL); + if (ret) + return ret; + + spin_lock_init(&ras_proc->fifo_spinlock); + + init_waitqueue_head(&ras_proc->ras_process_wq); + + ras_proc->ras_process_thread = kthread_run(ras_process_thread, + (void *)ras_core, "ras_process_thread"); + if (!ras_proc->ras_process_thread) { + RAS_DEV_ERR(ras_core->dev, "Failed to create ras_process_thread.\n"); + ret = -ENOMEM; + goto err; + } + + return 0; + +err: + ras_process_fini(ras_core); + return ret; +} + +int ras_process_fini(struct ras_core_context *ras_core) +{ + struct ras_process *ras_proc = &ras_core->ras_proc; + + if (ras_proc->ras_process_thread) { + kthread_stop(ras_proc->ras_process_thread); + ras_proc->ras_process_thread = NULL; + } + + kfifo_free(&ras_proc->event_fifo); + + return 0; +} + +static int ras_process_add_umc_interrupt_req(struct ras_core_context *ras_core, + struct ras_event_req *req) +{ + struct ras_process *ras_proc = &ras_core->ras_proc; + + atomic_inc(&ras_proc->umc_interrupt_count); + atomic_inc(&ras_proc->ras_interrupt_req); + + wake_up(&ras_proc->ras_process_wq); + return 0; +} + +static int ras_process_add_non_umc_interrupt_req(struct ras_core_context *ras_core, + struct ras_event_req *req) +{ + struct ras_process *ras_proc = &ras_core->ras_proc; + int ret; + + ret = ras_process_put_event(ras_core, req); + if (!ret) { + atomic_inc(&ras_proc->ras_interrupt_req); + wake_up(&ras_proc->ras_process_wq); + } + + return ret; +} + +int ras_process_add_interrupt_req(struct ras_core_context *ras_core, + struct ras_event_req *req, bool is_umc) +{ + int ret; + + if (!ras_core) + return -EINVAL; + + if (!ras_core->is_initialized) + return -EPERM; + + if (is_umc) + ret = ras_process_add_umc_interrupt_req(ras_core, req); + else + ret = ras_process_add_non_umc_interrupt_req(ras_core, req); + + return ret; +} diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_process.h b/drivers/gpu/drm/amd/ras/rascore/ras_process.h new file mode 100644 index 000000000000..28458b50510e --- /dev/null +++ b/drivers/gpu/drm/amd/ras/rascore/ras_process.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ +#ifndef __RAS_PROCESS_H__ +#define __RAS_PROCESS_H__ + +struct ras_event_req { + uint64_t seqno; + uint32_t idx_vf; + uint32_t block; + uint16_t pasid; + uint32_t reset; + void *pasid_fn; + void *data; +}; + +struct ras_process { + void *dev; + void *ras_process_thread; + wait_queue_head_t ras_process_wq; + atomic_t ras_interrupt_req; + atomic_t umc_interrupt_count; + struct kfifo event_fifo; + spinlock_t fifo_spinlock; +}; + +struct ras_core_context; +int ras_process_init(struct ras_core_context *ras_core); +int ras_process_fini(struct ras_core_context *ras_core); +int ras_process_handle_ras_event(struct ras_core_context *ras_core); +int ras_process_add_interrupt_req(struct ras_core_context *ras_core, + struct ras_event_req *req, bool is_umc); +#endif diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_psp.c b/drivers/gpu/drm/amd/ras/rascore/ras_psp.c new file mode 100644 index 000000000000..ccdb42d2dd60 --- /dev/null +++ b/drivers/gpu/drm/amd/ras/rascore/ras_psp.c @@ -0,0 +1,750 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ +#include "ras.h" +#include "ras_ta_if.h" +#include "ras_psp.h" +#include "ras_psp_v13_0.h" + +/* position of instance value in sub_block_index of + * ta_ras_trigger_error_input, the sub block uses lower 12 bits + */ +#define RAS_TA_INST_MASK 0xfffff000 +#define RAS_TA_INST_SHIFT 0xc + +static const struct ras_psp_ip_func *ras_psp_get_ip_funcs( + struct ras_core_context *ras_core, uint32_t ip_version) +{ + switch (ip_version) { + case IP_VERSION(13, 0, 6): + case IP_VERSION(13, 0, 14): + case IP_VERSION(13, 0, 12): + return &ras_psp_v13_0; + default: + RAS_DEV_ERR(ras_core->dev, + "psp ip version(0x%x) is not supported!\n", ip_version); + break; + } + + return NULL; +} + +static int ras_psp_sync_system_ras_psp_status(struct ras_core_context *ras_core) +{ + struct ras_psp *psp = &ras_core->ras_psp; + struct ras_ta_ctx *ta_ctx = &ras_core->ras_psp.ta_ctx; + struct ras_psp_ctx *psp_ctx = &ras_core->ras_psp.psp_ctx; + struct ras_psp_sys_status status = {0}; + int ret; + + if (psp->sys_func && psp->sys_func->get_ras_psp_system_status) { + ret = psp->sys_func->get_ras_psp_system_status(ras_core, &status); + if (ret) + return ret; + + if (status.initialized) { + ta_ctx->preload_ras_ta_enabled = true; + ta_ctx->ras_ta_initialized = status.initialized; + ta_ctx->session_id = status.session_id; + } + + psp_ctx->external_mutex = status.psp_cmd_mutex; + } + + return 0; +} + +static int ras_psp_get_ras_ta_init_param(struct ras_core_context *ras_core, + struct ras_ta_init_param *ras_ta_param) +{ + struct ras_psp *psp = &ras_core->ras_psp; + + if (psp->sys_func && psp->sys_func->get_ras_ta_init_param) + return psp->sys_func->get_ras_ta_init_param(ras_core, ras_ta_param); + + RAS_DEV_ERR(ras_core->dev, "Not config get_ras_ta_init_param API!!\n"); + return -EACCES; +} + +static struct gpu_mem_block *ras_psp_get_gpu_mem(struct ras_core_context *ras_core, + enum gpu_mem_type mem_type) +{ + struct ras_psp *psp = &ras_core->ras_psp; + struct gpu_mem_block *gpu_mem = NULL; + int ret; + + switch (mem_type) { + case GPU_MEM_TYPE_RAS_PSP_RING: + gpu_mem = &psp->psp_ring.ras_ring_gpu_mem; + break; + case GPU_MEM_TYPE_RAS_PSP_CMD: + gpu_mem = &psp->psp_ctx.psp_cmd_gpu_mem; + break; + case GPU_MEM_TYPE_RAS_PSP_FENCE: + gpu_mem = &psp->psp_ctx.out_fence_gpu_mem; + break; + case GPU_MEM_TYPE_RAS_TA_FW: + gpu_mem = &psp->ta_ctx.fw_gpu_mem; + break; + case GPU_MEM_TYPE_RAS_TA_CMD: + gpu_mem = &psp->ta_ctx.cmd_gpu_mem; + break; + default: + return NULL; + } + + if (!gpu_mem->ref_count) { + ret = ras_core_get_gpu_mem(ras_core, mem_type, gpu_mem); + if (ret) + return NULL; + gpu_mem->mem_type = mem_type; + } + + gpu_mem->ref_count++; + + return gpu_mem; +} + +static int ras_psp_put_gpu_mem(struct ras_core_context *ras_core, + struct gpu_mem_block *gpu_mem) +{ + if (!gpu_mem) + return 0; + + gpu_mem->ref_count--; + + if (gpu_mem->ref_count > 0) { + return 0; + } else if (gpu_mem->ref_count < 0) { + RAS_DEV_WARN(ras_core->dev, + "Duplicate free gpu memory %u\n", gpu_mem->mem_type); + } else { + ras_core_put_gpu_mem(ras_core, gpu_mem->mem_type, gpu_mem); + memset(gpu_mem, 0, sizeof(*gpu_mem)); + } + + return 0; +} + +static void __acquire_psp_cmd_lock(struct ras_core_context *ras_core) +{ + struct ras_psp_ctx *psp_ctx = &ras_core->ras_psp.psp_ctx; + + if (psp_ctx->external_mutex) + mutex_lock(psp_ctx->external_mutex); + else + mutex_lock(&psp_ctx->internal_mutex); +} + +static void __release_psp_cmd_lock(struct ras_core_context *ras_core) +{ + struct ras_psp_ctx *psp_ctx = &ras_core->ras_psp.psp_ctx; + + if (psp_ctx->external_mutex) + mutex_unlock(psp_ctx->external_mutex); + else + mutex_unlock(&psp_ctx->internal_mutex); +} + +static uint32_t __get_ring_frame_slot(struct ras_core_context *ras_core) +{ + struct ras_psp *psp = &ras_core->ras_psp; + uint32_t ras_ring_wptr_dw; + + ras_ring_wptr_dw = psp->ip_func->psp_ras_ring_wptr_get(ras_core); + + return div64_u64((ras_ring_wptr_dw << 2), sizeof(struct psp_gfx_rb_frame)); +} + +static int __set_ring_frame_slot(struct ras_core_context *ras_core, + uint32_t slot) +{ + struct ras_psp *psp = &ras_core->ras_psp; + + return psp->ip_func->psp_ras_ring_wptr_set(ras_core, + (slot * sizeof(struct psp_gfx_rb_frame)) >> 2); +} + +static int write_frame_to_ras_psp_ring(struct ras_core_context *ras_core, + struct psp_gfx_rb_frame *frame) +{ + struct gpu_mem_block *ring_mem; + struct psp_gfx_rb_frame *rb_frame; + uint32_t max_frame_slot; + uint32_t slot_idx; + uint32_t write_flush_read_back = 0; + int ret = 0; + + ring_mem = ras_psp_get_gpu_mem(ras_core, GPU_MEM_TYPE_RAS_PSP_RING); + if (!ring_mem) + return -ENOMEM; + + max_frame_slot = + div64_u64(ring_mem->mem_size, sizeof(struct psp_gfx_rb_frame)); + + rb_frame = + (struct psp_gfx_rb_frame *)ring_mem->mem_cpu_addr; + + slot_idx = __get_ring_frame_slot(ras_core); + if (slot_idx >= max_frame_slot) + slot_idx = 0; + + memcpy(&rb_frame[slot_idx], frame, sizeof(*frame)); + + /* Do a read to force the write of the frame before writing + * write pointer. + */ + write_flush_read_back = rb_frame[slot_idx].fence_value; + if (write_flush_read_back != frame->fence_value) { + RAS_DEV_ERR(ras_core->dev, + "Failed to submit ring cmd! cmd:0x%x:0x%x, fence:0x%x:0x%x value:%u, expected:%u\n", + rb_frame[slot_idx].cmd_buf_addr_hi, + rb_frame[slot_idx].cmd_buf_addr_lo, + rb_frame[slot_idx].fence_addr_hi, + rb_frame[slot_idx].fence_addr_lo, + write_flush_read_back, frame->fence_value); + ret = -EACCES; + goto err; + } + + slot_idx++; + + if (slot_idx >= max_frame_slot) + slot_idx = 0; + + __set_ring_frame_slot(ras_core, slot_idx); + +err: + ras_psp_put_gpu_mem(ras_core, ring_mem); + return ret; +} + +static int send_psp_cmd(struct ras_core_context *ras_core, + enum psp_gfx_cmd_id gfx_cmd_id, void *cmd_data, + uint32_t cmd_size, struct psp_cmd_resp *resp) +{ + struct ras_psp_ctx *psp_ctx = &ras_core->ras_psp.psp_ctx; + struct gpu_mem_block *psp_cmd_buf = NULL; + struct gpu_mem_block *psp_fence_buf = NULL; + struct psp_gfx_cmd_resp *gfx_cmd; + struct psp_gfx_rb_frame rb_frame; + int ret = 0; + int timeout = 1000; + + if (!cmd_data || (cmd_size > sizeof(union psp_gfx_commands)) || !resp) { + RAS_DEV_ERR(ras_core->dev, "Invalid RAS PSP command, id: %u\n", gfx_cmd_id); + return -EINVAL; + } + + __acquire_psp_cmd_lock(ras_core); + + psp_cmd_buf = ras_psp_get_gpu_mem(ras_core, GPU_MEM_TYPE_RAS_PSP_CMD); + if (!psp_cmd_buf) { + ret = -ENOMEM; + goto exit; + } + + psp_fence_buf = ras_psp_get_gpu_mem(ras_core, GPU_MEM_TYPE_RAS_PSP_FENCE); + if (!psp_fence_buf) { + ret = -ENOMEM; + goto exit; + } + + gfx_cmd = (struct psp_gfx_cmd_resp *)psp_cmd_buf->mem_cpu_addr; + memset(gfx_cmd, 0, sizeof(*gfx_cmd)); + gfx_cmd->cmd_id = gfx_cmd_id; + memcpy(&gfx_cmd->cmd, cmd_data, cmd_size); + + psp_ctx->in_fence_value++; + + memset(&rb_frame, 0, sizeof(rb_frame)); + rb_frame.cmd_buf_addr_hi = upper_32_bits(psp_cmd_buf->mem_mc_addr); + rb_frame.cmd_buf_addr_lo = lower_32_bits(psp_cmd_buf->mem_mc_addr); + rb_frame.fence_addr_hi = upper_32_bits(psp_fence_buf->mem_mc_addr); + rb_frame.fence_addr_lo = lower_32_bits(psp_fence_buf->mem_mc_addr); + rb_frame.fence_value = psp_ctx->in_fence_value; + + ret = write_frame_to_ras_psp_ring(ras_core, &rb_frame); + if (ret) { + psp_ctx->in_fence_value--; + goto exit; + } + + while (*((uint64_t *)psp_fence_buf->mem_cpu_addr) != + psp_ctx->in_fence_value) { + if (--timeout == 0) + break; + /* + * Shouldn't wait for timeout when err_event_athub occurs, + * because gpu reset thread triggered and lock resource should + * be released for psp resume sequence. + */ + if (ras_core_ras_interrupt_detected(ras_core)) + break; + + msleep(2); + } + + resp->status = gfx_cmd->resp.status; + resp->session_id = gfx_cmd->resp.session_id; + +exit: + ras_psp_put_gpu_mem(ras_core, psp_cmd_buf); + ras_psp_put_gpu_mem(ras_core, psp_fence_buf); + + __release_psp_cmd_lock(ras_core); + + return ret; +} + +static void __check_ras_ta_cmd_resp(struct ras_core_context *ras_core, + struct ras_ta_cmd *ras_cmd) +{ + + if (ras_cmd->ras_out_message.flags.err_inject_switch_disable_flag) { + RAS_DEV_WARN(ras_core->dev, "ECC switch disabled\n"); + ras_cmd->ras_status = RAS_TA_STATUS__ERROR_RAS_NOT_AVAILABLE; + } else if (ras_cmd->ras_out_message.flags.reg_access_failure_flag) + RAS_DEV_WARN(ras_core->dev, "RAS internal register access blocked\n"); + + switch (ras_cmd->ras_status) { + case RAS_TA_STATUS__ERROR_UNSUPPORTED_IP: + RAS_DEV_WARN(ras_core->dev, + "RAS WARNING: cmd failed due to unsupported ip\n"); + break; + case RAS_TA_STATUS__ERROR_UNSUPPORTED_ERROR_INJ: + RAS_DEV_WARN(ras_core->dev, + "RAS WARNING: cmd failed due to unsupported error injection\n"); + break; + case RAS_TA_STATUS__SUCCESS: + break; + case RAS_TA_STATUS__TEE_ERROR_ACCESS_DENIED: + if (ras_cmd->cmd_id == RAS_TA_CMD_ID__TRIGGER_ERROR) + RAS_DEV_WARN(ras_core->dev, + "RAS WARNING: Inject error to critical region is not allowed\n"); + break; + default: + RAS_DEV_WARN(ras_core->dev, + "RAS WARNING: ras status = 0x%X\n", ras_cmd->ras_status); + break; + } +} + +static int send_ras_ta_runtime_cmd(struct ras_core_context *ras_core, + enum ras_ta_cmd_id cmd_id, void *in, uint32_t in_size, + void *out, uint32_t out_size) +{ + struct ras_ta_ctx *ta_ctx = &ras_core->ras_psp.ta_ctx; + struct gpu_mem_block *cmd_mem; + struct ras_ta_cmd *ras_cmd; + struct psp_gfx_cmd_invoke_cmd invoke_cmd = {0}; + struct psp_cmd_resp resp = {0}; + int ret = 0; + + if (!in || (in_size > sizeof(union ras_ta_cmd_input)) || + (cmd_id >= MAX_RAS_TA_CMD_ID)) { + RAS_DEV_ERR(ras_core->dev, "Invalid RAS TA command, id: %u\n", cmd_id); + return -EINVAL; + } + + ras_psp_sync_system_ras_psp_status(ras_core); + + cmd_mem = ras_psp_get_gpu_mem(ras_core, GPU_MEM_TYPE_RAS_TA_CMD); + if (!cmd_mem) + return -ENOMEM; + + if (!ras_core_down_trylock_gpu_reset_lock(ras_core)) { + ret = -EACCES; + goto out; + } + + ras_cmd = (struct ras_ta_cmd *)cmd_mem->mem_cpu_addr; + + mutex_lock(&ta_ctx->ta_mutex); + + memset(ras_cmd, 0, sizeof(*ras_cmd)); + ras_cmd->cmd_id = cmd_id; + memcpy(&ras_cmd->ras_in_message, in, in_size); + + invoke_cmd.ta_cmd_id = cmd_id; + invoke_cmd.session_id = ta_ctx->session_id; + + ret = send_psp_cmd(ras_core, GFX_CMD_ID_INVOKE_CMD, + &invoke_cmd, sizeof(invoke_cmd), &resp); + + /* If err_event_athub occurs error inject was successful, however + * return status from TA is no long reliable + */ + if (ras_core_ras_interrupt_detected(ras_core)) { + ret = 0; + goto unlock; + } + + if (ret || resp.status) { + RAS_DEV_ERR(ras_core->dev, + "RAS: Failed to send psp cmd! ret:%d, status:%u\n", + ret, resp.status); + ret = -ESTRPIPE; + goto unlock; + } + + if (ras_cmd->if_version > RAS_TA_HOST_IF_VER) { + RAS_DEV_WARN(ras_core->dev, "RAS: Unsupported Interface\n"); + ret = -EINVAL; + goto unlock; + } + + if (!ras_cmd->ras_status && out && out_size) + memcpy(out, &ras_cmd->ras_out_message, out_size); + + __check_ras_ta_cmd_resp(ras_core, ras_cmd); + +unlock: + mutex_unlock(&ta_ctx->ta_mutex); + ras_core_up_gpu_reset_lock(ras_core); +out: + ras_psp_put_gpu_mem(ras_core, cmd_mem); + return ret; +} + +static int trigger_ras_ta_error(struct ras_core_context *ras_core, + struct ras_ta_trigger_error_input *info, uint32_t instance_mask) +{ + uint32_t dev_mask = 0; + + switch (info->block_id) { + case RAS_TA_BLOCK__GFX: + if (ras_gfx_get_ta_subblock(ras_core, info->inject_error_type, + info->sub_block_index, &info->sub_block_index)) + return -EINVAL; + + dev_mask = RAS_GET_MASK(ras_core->dev, GC, instance_mask); + break; + case RAS_TA_BLOCK__SDMA: + dev_mask = RAS_GET_MASK(ras_core->dev, SDMA0, instance_mask); + break; + case RAS_TA_BLOCK__VCN: + case RAS_TA_BLOCK__JPEG: + dev_mask = RAS_GET_MASK(ras_core->dev, VCN, instance_mask); + break; + default: + dev_mask = instance_mask; + break; + } + + /* reuse sub_block_index for backward compatibility */ + dev_mask <<= RAS_TA_INST_SHIFT; + dev_mask &= RAS_TA_INST_MASK; + info->sub_block_index |= dev_mask; + + return send_ras_ta_runtime_cmd(ras_core, RAS_TA_CMD_ID__TRIGGER_ERROR, + info, sizeof(*info), NULL, 0); +} + +static int send_load_ta_fw_cmd(struct ras_core_context *ras_core, + struct ras_ta_ctx *ta_ctx) +{ + struct ras_ta_fw_bin *fw_bin = &ta_ctx->fw_bin; + struct gpu_mem_block *fw_mem; + struct gpu_mem_block *cmd_mem; + struct ras_ta_cmd *ta_cmd; + struct ras_ta_init_flags *ta_init_flags; + struct psp_gfx_cmd_load_ta psp_load_ta_cmd; + struct psp_cmd_resp resp = {0}; + struct ras_ta_image_header *fw_hdr = NULL; + int ret; + + fw_mem = ras_psp_get_gpu_mem(ras_core, GPU_MEM_TYPE_RAS_TA_FW); + if (!fw_mem) + return -ENOMEM; + + cmd_mem = ras_psp_get_gpu_mem(ras_core, GPU_MEM_TYPE_RAS_TA_CMD); + if (!cmd_mem) { + ret = -ENOMEM; + goto err; + } + + ret = ras_psp_get_ras_ta_init_param(ras_core, &ta_ctx->init_param); + if (ret) + goto err; + + if (!ras_core_down_trylock_gpu_reset_lock(ras_core)) { + ret = -EACCES; + goto err; + } + + /* copy ras ta binary to shared gpu memory */ + memcpy(fw_mem->mem_cpu_addr, fw_bin->bin_addr, fw_bin->bin_size); + fw_mem->mem_size = fw_bin->bin_size; + + /* Initialize ras ta startup parameter */ + ta_cmd = (struct ras_ta_cmd *)cmd_mem->mem_cpu_addr; + ta_init_flags = &ta_cmd->ras_in_message.init_flags; + + ta_init_flags->poison_mode_en = ta_ctx->init_param.poison_mode_en; + ta_init_flags->dgpu_mode = ta_ctx->init_param.dgpu_mode; + ta_init_flags->xcc_mask = ta_ctx->init_param.xcc_mask; + ta_init_flags->channel_dis_num = ta_ctx->init_param.channel_dis_num; + ta_init_flags->nps_mode = ta_ctx->init_param.nps_mode; + ta_init_flags->active_umc_mask = ta_ctx->init_param.active_umc_mask; + + /* Setup load ras ta command */ + memset(&psp_load_ta_cmd, 0, sizeof(psp_load_ta_cmd)); + psp_load_ta_cmd.app_phy_addr_lo = lower_32_bits(fw_mem->mem_mc_addr); + psp_load_ta_cmd.app_phy_addr_hi = upper_32_bits(fw_mem->mem_mc_addr); + psp_load_ta_cmd.app_len = fw_mem->mem_size; + psp_load_ta_cmd.cmd_buf_phy_addr_lo = lower_32_bits(cmd_mem->mem_mc_addr); + psp_load_ta_cmd.cmd_buf_phy_addr_hi = upper_32_bits(cmd_mem->mem_mc_addr); + psp_load_ta_cmd.cmd_buf_len = cmd_mem->mem_size; + + ret = send_psp_cmd(ras_core, GFX_CMD_ID_LOAD_TA, + &psp_load_ta_cmd, sizeof(psp_load_ta_cmd), &resp); + if (!ret && !resp.status) { + /* Read TA version at FW offset 0x60 if TA version not found*/ + fw_hdr = (struct ras_ta_image_header *)fw_bin->bin_addr; + RAS_DEV_INFO(ras_core->dev, "PSP: RAS TA(version:%X.%X.%X.%X) is loaded.\n", + (fw_hdr->image_version >> 24) & 0xFF, (fw_hdr->image_version >> 16) & 0xFF, + (fw_hdr->image_version >> 8) & 0xFF, fw_hdr->image_version & 0xFF); + ta_ctx->ta_version = fw_hdr->image_version; + ta_ctx->session_id = resp.session_id; + ta_ctx->ras_ta_initialized = true; + } else { + RAS_DEV_ERR(ras_core->dev, + "Failed to load RAS TA! ret:%d, status:%d\n", ret, resp.status); + } + + ras_core_up_gpu_reset_lock(ras_core); + +err: + ras_psp_put_gpu_mem(ras_core, fw_mem); + ras_psp_put_gpu_mem(ras_core, cmd_mem); + return ret; +} + +static int load_ras_ta_firmware(struct ras_core_context *ras_core, + struct ras_psp_ta_load *ras_ta_load) +{ + struct ras_ta_ctx *ta_ctx = &ras_core->ras_psp.ta_ctx; + struct ras_ta_fw_bin *fw_bin = &ta_ctx->fw_bin; + int ret; + + fw_bin->bin_addr = ras_ta_load->bin_addr; + fw_bin->bin_size = ras_ta_load->bin_size; + fw_bin->fw_version = ras_ta_load->fw_version; + fw_bin->feature_version = ras_ta_load->feature_version; + + ret = send_load_ta_fw_cmd(ras_core, ta_ctx); + if (!ret) { + ras_ta_load->out_session_id = ta_ctx->session_id; + ras_ta_load->out_loaded_ta_version = ta_ctx->ta_version; + } + + return ret; +} + +static int unload_ras_ta_firmware(struct ras_core_context *ras_core, + struct ras_psp_ta_unload *ras_ta_unload) +{ + struct ras_ta_ctx *ta_ctx = &ras_core->ras_psp.ta_ctx; + struct psp_gfx_cmd_unload_ta cmd_unload_ta = {0}; + struct psp_cmd_resp resp = {0}; + int ret; + + if (!ras_core_down_trylock_gpu_reset_lock(ras_core)) + return -EACCES; + + cmd_unload_ta.session_id = ta_ctx->session_id; + ret = send_psp_cmd(ras_core, GFX_CMD_ID_UNLOAD_TA, + &cmd_unload_ta, sizeof(cmd_unload_ta), &resp); + if (ret || resp.status) { + RAS_DEV_ERR(ras_core->dev, + "Failed to unload RAS TA! ret:%d, status:%u\n", + ret, resp.status); + goto unlock; + } + + kfree(ta_ctx->fw_bin.bin_addr); + memset(&ta_ctx->fw_bin, 0, sizeof(ta_ctx->fw_bin)); + ta_ctx->ta_version = 0; + ta_ctx->ras_ta_initialized = false; + ta_ctx->session_id = 0; + +unlock: + ras_core_up_gpu_reset_lock(ras_core); + + return ret; +} + +int ras_psp_load_firmware(struct ras_core_context *ras_core, + struct ras_psp_ta_load *ras_ta_load) +{ + struct ras_ta_ctx *ta_ctx = &ras_core->ras_psp.ta_ctx; + struct ras_psp_ta_unload ras_ta_unload = {0}; + int ret; + + if (ta_ctx->preload_ras_ta_enabled) + return 0; + + if (!ras_ta_load) + return -EINVAL; + + if (ta_ctx->ras_ta_initialized) { + ras_ta_unload.ras_session_id = ta_ctx->session_id; + ret = unload_ras_ta_firmware(ras_core, &ras_ta_unload); + if (ret) + return ret; + } + + return load_ras_ta_firmware(ras_core, ras_ta_load); +} + +int ras_psp_unload_firmware(struct ras_core_context *ras_core, + struct ras_psp_ta_unload *ras_ta_unload) +{ + struct ras_ta_ctx *ta_ctx = &ras_core->ras_psp.ta_ctx; + + if (ta_ctx->preload_ras_ta_enabled) + return 0; + + if ((!ras_ta_unload) || + (ras_ta_unload->ras_session_id != ta_ctx->session_id)) + return -EINVAL; + + return unload_ras_ta_firmware(ras_core, ras_ta_unload); +} + +int ras_psp_trigger_error(struct ras_core_context *ras_core, + struct ras_ta_trigger_error_input *info, uint32_t instance_mask) +{ + struct ras_ta_ctx *ta_ctx = &ras_core->ras_psp.ta_ctx; + + if (!ta_ctx->preload_ras_ta_enabled && !ta_ctx->ras_ta_initialized) { + RAS_DEV_ERR(ras_core->dev, "RAS: ras firmware not initialized!"); + return -ENOEXEC; + } + + if (!info) + return -EINVAL; + + return trigger_ras_ta_error(ras_core, info, instance_mask); +} + +int ras_psp_query_address(struct ras_core_context *ras_core, + struct ras_ta_query_address_input *addr_in, + struct ras_ta_query_address_output *addr_out) +{ + struct ras_ta_ctx *ta_ctx = &ras_core->ras_psp.ta_ctx; + + if (!ta_ctx->preload_ras_ta_enabled && + !ta_ctx->ras_ta_initialized) { + RAS_DEV_ERR(ras_core->dev, "RAS: ras firmware not initialized!"); + return -ENOEXEC; + } + + if (!addr_in || !addr_out) + return -EINVAL; + + return send_ras_ta_runtime_cmd(ras_core, RAS_TA_CMD_ID__QUERY_ADDRESS, + addr_in, sizeof(*addr_in), addr_out, sizeof(*addr_out)); +} + +int ras_psp_sw_init(struct ras_core_context *ras_core) +{ + struct ras_psp *psp = &ras_core->ras_psp; + + memset(psp, 0, sizeof(*psp)); + + psp->sys_func = ras_core->config->psp_cfg.psp_sys_fn; + if (!psp->sys_func) { + RAS_DEV_ERR(ras_core->dev, "RAS psp sys function not configured!\n"); + return -EINVAL; + } + + mutex_init(&psp->psp_ctx.internal_mutex); + mutex_init(&psp->ta_ctx.ta_mutex); + + return 0; +} + +int ras_psp_sw_fini(struct ras_core_context *ras_core) +{ + struct ras_psp *psp = &ras_core->ras_psp; + + mutex_destroy(&psp->psp_ctx.internal_mutex); + mutex_destroy(&psp->ta_ctx.ta_mutex); + + memset(psp, 0, sizeof(*psp)); + + return 0; +} + +int ras_psp_hw_init(struct ras_core_context *ras_core) +{ + struct ras_psp *psp = &ras_core->ras_psp; + + psp->psp_ip_version = ras_core->config->psp_ip_version; + + psp->ip_func = ras_psp_get_ip_funcs(ras_core, psp->psp_ip_version); + if (!psp->ip_func) + return -EINVAL; + + /* After GPU reset, the system RAS PSP status may change. + * therefore, it is necessary to synchronize the system status again. + */ + ras_psp_sync_system_ras_psp_status(ras_core); + + return 0; +} + +int ras_psp_hw_fini(struct ras_core_context *ras_core) +{ + return 0; +} + +bool ras_psp_check_supported_cmd(struct ras_core_context *ras_core, + enum ras_ta_cmd_id cmd_id) +{ + struct ras_ta_ctx *ta_ctx = &ras_core->ras_psp.ta_ctx; + bool ret = false; + + if (!ta_ctx->preload_ras_ta_enabled && !ta_ctx->ras_ta_initialized) + return false; + + switch (cmd_id) { + case RAS_TA_CMD_ID__QUERY_ADDRESS: + /* Currently, querying the address from RAS TA is only supported + * when the RAS TA firmware is loaded during driver installation. + */ + if (ta_ctx->preload_ras_ta_enabled) + ret = true; + break; + case RAS_TA_CMD_ID__TRIGGER_ERROR: + ret = true; + break; + default: + ret = false; + break; + } + + return ret; +} diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_psp.h b/drivers/gpu/drm/amd/ras/rascore/ras_psp.h new file mode 100644 index 000000000000..71776fecfd66 --- /dev/null +++ b/drivers/gpu/drm/amd/ras/rascore/ras_psp.h @@ -0,0 +1,145 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ +#ifndef __RAS_PSP_H__ +#define __RAS_PSP_H__ +#include "ras.h" +#include "ras_ta_if.h" + +struct ras_core_context; +struct ras_ta_trigger_error_input; +struct ras_ta_query_address_input; +struct ras_ta_query_address_output; +enum ras_ta_cmd_id; + +struct ras_ta_image_header { + uint32_t reserved1[24]; + uint32_t image_version; /* [0x60] Off Chip Firmware Version */ + uint32_t reserved2[39]; +}; + +struct ras_psp_sys_status { + bool initialized; + uint32_t session_id; + void *psp_cmd_mutex; +}; + +struct ras_ta_init_param { + uint8_t poison_mode_en; + uint8_t dgpu_mode; + uint16_t xcc_mask; + uint8_t channel_dis_num; + uint8_t nps_mode; + uint32_t active_umc_mask; +}; + +struct gpu_mem_block { + uint32_t mem_type; + void *mem_bo; + uint64_t mem_mc_addr; + void *mem_cpu_addr; + uint32_t mem_size; + int ref_count; + void *private; +}; + +struct ras_psp_ip_func { + uint32_t (*psp_ras_ring_wptr_get)(struct ras_core_context *ras_core); + int (*psp_ras_ring_wptr_set)(struct ras_core_context *ras_core, uint32_t wptr); +}; + +struct ras_psp_ring { + struct gpu_mem_block ras_ring_gpu_mem; +}; + +struct psp_cmd_resp { + uint32_t status; + uint32_t session_id; +}; + +struct ras_psp_ctx { + void *external_mutex; + struct mutex internal_mutex; + uint64_t in_fence_value; + struct gpu_mem_block psp_cmd_gpu_mem; + struct gpu_mem_block out_fence_gpu_mem; +}; + +struct ras_ta_fw_bin { + uint32_t fw_version; + uint32_t feature_version; + uint32_t bin_size; + uint8_t *bin_addr; +}; + +struct ras_ta_ctx { + bool preload_ras_ta_enabled; + bool ras_ta_initialized; + uint32_t session_id; + uint32_t resp_status; + uint32_t ta_version; + struct mutex ta_mutex; + struct ras_ta_fw_bin fw_bin; + struct ras_ta_init_param init_param; + struct gpu_mem_block fw_gpu_mem; + struct gpu_mem_block cmd_gpu_mem; +}; + +struct ras_psp { + uint32_t psp_ip_version; + struct ras_psp_ring psp_ring; + struct ras_psp_ctx psp_ctx; + struct ras_ta_ctx ta_ctx; + const struct ras_psp_ip_func *ip_func; + const struct ras_psp_sys_func *sys_func; +}; + +struct ras_psp_ta_load { + uint32_t fw_version; + uint32_t feature_version; + uint32_t bin_size; + uint8_t *bin_addr; + uint64_t out_session_id; + uint32_t out_loaded_ta_version; +}; + +struct ras_psp_ta_unload { + uint64_t ras_session_id; +}; + +int ras_psp_sw_init(struct ras_core_context *ras_core); +int ras_psp_sw_fini(struct ras_core_context *ras_core); +int ras_psp_hw_init(struct ras_core_context *ras_core); +int ras_psp_hw_fini(struct ras_core_context *ras_core); +int ras_psp_load_firmware(struct ras_core_context *ras_core, + struct ras_psp_ta_load *ras_ta_load); +int ras_psp_unload_firmware(struct ras_core_context *ras_core, + struct ras_psp_ta_unload *ras_ta_unload); +int ras_psp_trigger_error(struct ras_core_context *ras_core, + struct ras_ta_trigger_error_input *info, uint32_t instance_mask); +int ras_psp_query_address(struct ras_core_context *ras_core, + struct ras_ta_query_address_input *addr_in, + struct ras_ta_query_address_output *addr_out); +bool ras_psp_check_supported_cmd(struct ras_core_context *ras_core, + enum ras_ta_cmd_id cmd_id); +#endif diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_psp_v13_0.c b/drivers/gpu/drm/amd/ras/rascore/ras_psp_v13_0.c new file mode 100644 index 000000000000..626cf39b75ac --- /dev/null +++ b/drivers/gpu/drm/amd/ras/rascore/ras_psp_v13_0.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#include "ras.h" +#include "ras_psp_v13_0.h" + +#define regMP0_SMN_C2PMSG_67 0x0083 +#define regMP0_SMN_C2PMSG_67_BASE_IDX 0 + +static uint32_t ras_psp_v13_0_ring_wptr_get(struct ras_core_context *ras_core) +{ + return RAS_DEV_RREG32_SOC15(ras_core->dev, MP0, 0, regMP0_SMN_C2PMSG_67); +} + +static int ras_psp_v13_0_ring_wptr_set(struct ras_core_context *ras_core, uint32_t value) +{ + RAS_DEV_WREG32_SOC15(ras_core->dev, MP0, 0, regMP0_SMN_C2PMSG_67, value); + + return 0; +} + +const struct ras_psp_ip_func ras_psp_v13_0 = { + .psp_ras_ring_wptr_get = ras_psp_v13_0_ring_wptr_get, + .psp_ras_ring_wptr_set = ras_psp_v13_0_ring_wptr_set, +}; diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_psp_v13_0.h b/drivers/gpu/drm/amd/ras/rascore/ras_psp_v13_0.h new file mode 100644 index 000000000000..b705ffe38a12 --- /dev/null +++ b/drivers/gpu/drm/amd/ras/rascore/ras_psp_v13_0.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef __RAS_PSP_V13_0_H__ +#define __RAS_PSP_V13_0_H__ +#include "ras_psp.h" + +extern const struct ras_psp_ip_func ras_psp_v13_0; + +#endif diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_ta_if.h b/drivers/gpu/drm/amd/ras/rascore/ras_ta_if.h new file mode 100644 index 000000000000..0921e36d3274 --- /dev/null +++ b/drivers/gpu/drm/amd/ras/rascore/ras_ta_if.h @@ -0,0 +1,231 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef _RAS_TA_IF_H +#define _RAS_TA_IF_H +#include "ras.h" + +#define RAS_TA_HOST_IF_VER 0 + +/* Responses have bit 31 set */ +#define RSP_ID_MASK (1U << 31) +#define RSP_ID(cmdId) (((uint32_t)(cmdId)) | RSP_ID_MASK) + +/* invalid node instance value */ +#define RAS_TA_INV_NODE 0xffff + +/* RAS related enumerations */ +/**********************************************************/ +enum ras_ta_cmd_id { + RAS_TA_CMD_ID__ENABLE_FEATURES = 0, + RAS_TA_CMD_ID__DISABLE_FEATURES, + RAS_TA_CMD_ID__TRIGGER_ERROR, + RAS_TA_CMD_ID__QUERY_BLOCK_INFO, + RAS_TA_CMD_ID__QUERY_SUB_BLOCK_INFO, + RAS_TA_CMD_ID__QUERY_ADDRESS, + MAX_RAS_TA_CMD_ID +}; + +enum ras_ta_status { + RAS_TA_STATUS__SUCCESS = 0x0000, + RAS_TA_STATUS__RESET_NEEDED = 0xA001, + RAS_TA_STATUS__ERROR_INVALID_PARAMETER = 0xA002, + RAS_TA_STATUS__ERROR_RAS_NOT_AVAILABLE = 0xA003, + RAS_TA_STATUS__ERROR_RAS_DUPLICATE_CMD = 0xA004, + RAS_TA_STATUS__ERROR_INJECTION_FAILED = 0xA005, + RAS_TA_STATUS__ERROR_ASD_READ_WRITE = 0xA006, + RAS_TA_STATUS__ERROR_TOGGLE_DF_CSTATE = 0xA007, + RAS_TA_STATUS__ERROR_TIMEOUT = 0xA008, + RAS_TA_STATUS__ERROR_BLOCK_DISABLED = 0XA009, + RAS_TA_STATUS__ERROR_GENERIC = 0xA00A, + RAS_TA_STATUS__ERROR_RAS_MMHUB_INIT = 0xA00B, + RAS_TA_STATUS__ERROR_GET_DEV_INFO = 0xA00C, + RAS_TA_STATUS__ERROR_UNSUPPORTED_DEV = 0xA00D, + RAS_TA_STATUS__ERROR_NOT_INITIALIZED = 0xA00E, + RAS_TA_STATUS__ERROR_TEE_INTERNAL = 0xA00F, + RAS_TA_STATUS__ERROR_UNSUPPORTED_FUNCTION = 0xA010, + RAS_TA_STATUS__ERROR_SYS_DRV_REG_ACCESS = 0xA011, + RAS_TA_STATUS__ERROR_RAS_READ_WRITE = 0xA012, + RAS_TA_STATUS__ERROR_NULL_PTR = 0xA013, + RAS_TA_STATUS__ERROR_UNSUPPORTED_IP = 0xA014, + RAS_TA_STATUS__ERROR_PCS_STATE_QUIET = 0xA015, + RAS_TA_STATUS__ERROR_PCS_STATE_ERROR = 0xA016, + RAS_TA_STATUS__ERROR_PCS_STATE_HANG = 0xA017, + RAS_TA_STATUS__ERROR_PCS_STATE_UNKNOWN = 0xA018, + RAS_TA_STATUS__ERROR_UNSUPPORTED_ERROR_INJ = 0xA019, + RAS_TA_STATUS__TEE_ERROR_ACCESS_DENIED = 0xA01A +}; + +enum ras_ta_block { + RAS_TA_BLOCK__UMC = 0, + RAS_TA_BLOCK__SDMA, + RAS_TA_BLOCK__GFX, + RAS_TA_BLOCK__MMHUB, + RAS_TA_BLOCK__ATHUB, + RAS_TA_BLOCK__PCIE_BIF, + RAS_TA_BLOCK__HDP, + RAS_TA_BLOCK__XGMI_WAFL, + RAS_TA_BLOCK__DF, + RAS_TA_BLOCK__SMN, + RAS_TA_BLOCK__SEM, + RAS_TA_BLOCK__MP0, + RAS_TA_BLOCK__MP1, + RAS_TA_BLOCK__FUSE, + RAS_TA_BLOCK__MCA, + RAS_TA_BLOCK__VCN, + RAS_TA_BLOCK__JPEG, + RAS_TA_BLOCK__IH, + RAS_TA_BLOCK__MPIO, + RAS_TA_BLOCK__MMSCH, + RAS_TA_NUM_BLOCK_MAX +}; + +enum ras_ta_mca_block { + RAS_TA_MCA_BLOCK__MP0 = 0, + RAS_TA_MCA_BLOCK__MP1 = 1, + RAS_TA_MCA_BLOCK__MPIO = 2, + RAS_TA_MCA_BLOCK__IOHC = 3, + RAS_TA_MCA_NUM_BLOCK_MAX +}; + +enum ras_ta_error_type { + RAS_TA_ERROR__NONE = 0, + RAS_TA_ERROR__PARITY = 1, + RAS_TA_ERROR__SINGLE_CORRECTABLE = 2, + RAS_TA_ERROR__MULTI_UNCORRECTABLE = 4, + RAS_TA_ERROR__POISON = 8, +}; + +enum ras_ta_address_type { + RAS_TA_MCA_TO_PA, + RAS_TA_PA_TO_MCA, +}; + +enum ras_ta_nps_mode { + RAS_TA_UNKNOWN_MODE = 0, + RAS_TA_NPS1_MODE = 1, + RAS_TA_NPS2_MODE = 2, + RAS_TA_NPS4_MODE = 4, + RAS_TA_NPS8_MODE = 8, +}; + +/* Input/output structures for RAS commands */ +/**********************************************************/ + +struct ras_ta_enable_features_input { + enum ras_ta_block block_id; + enum ras_ta_error_type error_type; +}; + +struct ras_ta_disable_features_input { + enum ras_ta_block block_id; + enum ras_ta_error_type error_type; +}; + +struct ras_ta_trigger_error_input { + /* ras-block. i.e. umc, gfx */ + enum ras_ta_block block_id; + + /* type of error. i.e. single_correctable */ + enum ras_ta_error_type inject_error_type; + + /* mem block. i.e. hbm, sram etc. */ + uint32_t sub_block_index; + + /* explicit address of error */ + uint64_t address; + + /* method if error injection. i.e persistent, coherent etc. */ + uint64_t value; +}; + +struct ras_ta_init_flags { + uint8_t poison_mode_en; + uint8_t dgpu_mode; + uint16_t xcc_mask; + uint8_t channel_dis_num; + uint8_t nps_mode; + uint32_t active_umc_mask; +}; + +struct ras_ta_mca_addr { + uint64_t err_addr; + uint32_t ch_inst; + uint32_t umc_inst; + uint32_t node_inst; + uint32_t socket_id; +}; + +struct ras_ta_phy_addr { + uint64_t pa; + uint32_t bank; + uint32_t channel_idx; +}; + +struct ras_ta_query_address_input { + enum ras_ta_address_type addr_type; + struct ras_ta_mca_addr ma; + struct ras_ta_phy_addr pa; +}; + +struct ras_ta_output_flags { + uint8_t ras_init_success_flag; + uint8_t err_inject_switch_disable_flag; + uint8_t reg_access_failure_flag; +}; + +struct ras_ta_query_address_output { + /* don't use the flags here */ + struct ras_ta_output_flags flags; + struct ras_ta_mca_addr ma; + struct ras_ta_phy_addr pa; +}; + +/* Common input structure for RAS callbacks */ +/**********************************************************/ +union ras_ta_cmd_input { + struct ras_ta_init_flags init_flags; + struct ras_ta_enable_features_input enable_features; + struct ras_ta_disable_features_input disable_features; + struct ras_ta_trigger_error_input trigger_error; + struct ras_ta_query_address_input address; + uint32_t reserve_pad[256]; +}; + +union ras_ta_cmd_output { + struct ras_ta_output_flags flags; + struct ras_ta_query_address_output address; + uint32_t reserve_pad[256]; +}; + +struct ras_ta_cmd { + uint32_t cmd_id; + uint32_t resp_id; + uint32_t ras_status; + uint32_t if_version; + union ras_ta_cmd_input ras_in_message; + union ras_ta_cmd_output ras_out_message; +}; + +#endif diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_umc.c b/drivers/gpu/drm/amd/ras/rascore/ras_umc.c new file mode 100644 index 000000000000..4dae64c424a2 --- /dev/null +++ b/drivers/gpu/drm/amd/ras/rascore/ras_umc.c @@ -0,0 +1,707 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ +#include "ras.h" +#include "ras_umc.h" +#include "ras_umc_v12_0.h" + +#define MAX_ECC_NUM_PER_RETIREMENT 16 + +/* bad page timestamp format + * yy[31:27] mm[26:23] day[22:17] hh[16:12] mm[11:6] ss[5:0] + */ +#define EEPROM_TIMESTAMP_MINUTE 6 +#define EEPROM_TIMESTAMP_HOUR 12 +#define EEPROM_TIMESTAMP_DAY 17 +#define EEPROM_TIMESTAMP_MONTH 23 +#define EEPROM_TIMESTAMP_YEAR 27 + +static uint64_t ras_umc_get_eeprom_timestamp(struct ras_core_context *ras_core) +{ + struct ras_time tm = {0}; + uint64_t utc_timestamp = 0; + uint64_t eeprom_timestamp = 0; + + utc_timestamp = ras_core_get_utc_second_timestamp(ras_core); + if (!utc_timestamp) + return utc_timestamp; + + ras_core_convert_timestamp_to_time(ras_core, utc_timestamp, &tm); + + /* the year range is 2000 ~ 2031, set the year if not in the range */ + if (tm.tm_year < 2000) + tm.tm_year = 2000; + if (tm.tm_year > 2031) + tm.tm_year = 2031; + + tm.tm_year -= 2000; + + eeprom_timestamp = tm.tm_sec + (tm.tm_min << EEPROM_TIMESTAMP_MINUTE) + + (tm.tm_hour << EEPROM_TIMESTAMP_HOUR) + + (tm.tm_mday << EEPROM_TIMESTAMP_DAY) + + (tm.tm_mon << EEPROM_TIMESTAMP_MONTH) + + (tm.tm_year << EEPROM_TIMESTAMP_YEAR); + eeprom_timestamp &= 0xffffffff; + + return eeprom_timestamp; +} + +static const struct ras_umc_ip_func *ras_umc_get_ip_func( + struct ras_core_context *ras_core, uint32_t ip_version) +{ + switch (ip_version) { + case IP_VERSION(12, 0, 0): + case IP_VERSION(12, 5, 0): + return &ras_umc_func_v12_0; + default: + RAS_DEV_ERR(ras_core->dev, + "UMC ip version(0x%x) is not supported!\n", ip_version); + break; + } + + return NULL; +} + +int ras_umc_psp_convert_ma_to_pa(struct ras_core_context *ras_core, + struct umc_mca_addr *in, struct umc_phy_addr *out, + uint32_t nps) +{ + struct ras_ta_query_address_input addr_in; + struct ras_ta_query_address_output addr_out; + int ret; + + if (!in) + return -EINVAL; + + memset(&addr_in, 0, sizeof(addr_in)); + memset(&addr_out, 0, sizeof(addr_out)); + + addr_in.ma.err_addr = in->err_addr; + addr_in.ma.ch_inst = in->ch_inst; + addr_in.ma.umc_inst = in->umc_inst; + addr_in.ma.node_inst = in->node_inst; + addr_in.ma.socket_id = in->socket_id; + + addr_in.addr_type = RAS_TA_MCA_TO_PA; + + ret = ras_psp_query_address(ras_core, &addr_in, &addr_out); + if (ret) { + RAS_DEV_WARN(ras_core->dev, + "Failed to query RAS physical address for 0x%llx, ret:%d", + in->err_addr, ret); + return -EREMOTEIO; + } + + if (out) { + out->pa = addr_out.pa.pa; + out->bank = addr_out.pa.bank; + out->channel_idx = addr_out.pa.channel_idx; + } + + return 0; +} + +static int ras_umc_log_ecc(struct ras_core_context *ras_core, + unsigned long idx, void *data) +{ + struct ras_umc *ras_umc = &ras_core->ras_umc; + int ret; + + mutex_lock(&ras_umc->tree_lock); + ret = radix_tree_insert(&ras_umc->root, idx, data); + if (!ret) + radix_tree_tag_set(&ras_umc->root, idx, UMC_ECC_NEW_DETECTED_TAG); + mutex_unlock(&ras_umc->tree_lock); + + return ret; +} + +int ras_umc_clear_logged_ecc(struct ras_core_context *ras_core) +{ + struct ras_umc *ras_umc = &ras_core->ras_umc; + uint64_t buf[8] = {0}; + void **slot; + void *data; + void *iter = buf; + + mutex_lock(&ras_umc->tree_lock); + radix_tree_for_each_slot(slot, &ras_umc->root, iter, 0) { + data = ras_radix_tree_delete_iter(&ras_umc->root, iter); + kfree(data); + } + mutex_unlock(&ras_umc->tree_lock); + + return 0; +} + +static void ras_umc_reserve_eeprom_record(struct ras_core_context *ras_core, + struct eeprom_umc_record *record) +{ + struct ras_umc *ras_umc = &ras_core->ras_umc; + uint64_t page_pfn[16]; + int count = 0, i; + + memset(page_pfn, 0, sizeof(page_pfn)); + if (ras_umc->ip_func && ras_umc->ip_func->eeprom_record_to_nps_pages) { + count = ras_umc->ip_func->eeprom_record_to_nps_pages(ras_core, + record, record->cur_nps, page_pfn, ARRAY_SIZE(page_pfn)); + if (count <= 0) { + RAS_DEV_ERR(ras_core->dev, + "Fail to convert error address! count:%d\n", count); + return; + } + } + + /* Reserve memory */ + for (i = 0; i < count; i++) + ras_core_event_notify(ras_core, + RAS_EVENT_ID__RESERVE_BAD_PAGE, &page_pfn[i]); +} + +/* When gpu reset is ongoing, ecc logging operations will be pended. + */ +int ras_umc_log_bad_bank_pending(struct ras_core_context *ras_core, struct ras_bank_ecc *bank) +{ + struct ras_umc *ras_umc = &ras_core->ras_umc; + struct ras_bank_ecc_node *ecc_node; + + ecc_node = kzalloc(sizeof(*ecc_node), GFP_KERNEL); + if (!ecc_node) + return -ENOMEM; + + memcpy(&ecc_node->ecc, bank, sizeof(ecc_node->ecc)); + + mutex_lock(&ras_umc->pending_ecc_lock); + list_add_tail(&ecc_node->node, &ras_umc->pending_ecc_list); + mutex_unlock(&ras_umc->pending_ecc_lock); + + return 0; +} + +/* After gpu reset is complete, re-log the pending error banks. + */ +int ras_umc_log_pending_bad_bank(struct ras_core_context *ras_core) +{ + struct ras_umc *ras_umc = &ras_core->ras_umc; + struct ras_bank_ecc_node *ecc_node, *tmp; + + mutex_lock(&ras_umc->pending_ecc_lock); + list_for_each_entry_safe(ecc_node, + tmp, &ras_umc->pending_ecc_list, node){ + if (ecc_node && !ras_umc_log_bad_bank(ras_core, &ecc_node->ecc)) { + list_del(&ecc_node->node); + kfree(ecc_node); + } + } + mutex_unlock(&ras_umc->pending_ecc_lock); + + return 0; +} + +int ras_umc_log_bad_bank(struct ras_core_context *ras_core, struct ras_bank_ecc *bank) +{ + struct ras_umc *ras_umc = &ras_core->ras_umc; + struct eeprom_umc_record umc_rec; + struct eeprom_umc_record *err_rec; + int ret; + + memset(&umc_rec, 0, sizeof(umc_rec)); + + mutex_lock(&ras_umc->bank_log_lock); + ret = ras_umc->ip_func->bank_to_eeprom_record(ras_core, bank, &umc_rec); + if (ret) + goto out; + + err_rec = kzalloc(sizeof(*err_rec), GFP_KERNEL); + if (!err_rec) { + ret = -ENOMEM; + goto out; + } + + memcpy(err_rec, &umc_rec, sizeof(umc_rec)); + ret = ras_umc_log_ecc(ras_core, err_rec->cur_nps_retired_row_pfn, err_rec); + if (ret) { + if (ret == -EEXIST) { + RAS_DEV_INFO(ras_core->dev, "The bad pages have been logged before.\n"); + ret = 0; + } + + kfree(err_rec); + goto out; + } + + ras_umc_reserve_eeprom_record(ras_core, err_rec); + + ret = ras_core_event_notify(ras_core, + RAS_EVENT_ID__BAD_PAGE_DETECTED, NULL); + +out: + mutex_unlock(&ras_umc->bank_log_lock); + return ret; +} + +static int ras_umc_get_new_records(struct ras_core_context *ras_core, + struct eeprom_umc_record *records, u32 num) +{ + struct ras_umc *ras_umc = &ras_core->ras_umc; + struct eeprom_umc_record *entries[MAX_ECC_NUM_PER_RETIREMENT]; + u32 entry_num = num < MAX_ECC_NUM_PER_RETIREMENT ? num : MAX_ECC_NUM_PER_RETIREMENT; + int count = 0; + int new_detected, i; + + mutex_lock(&ras_umc->tree_lock); + new_detected = radix_tree_gang_lookup_tag(&ras_umc->root, (void **)entries, + 0, entry_num, UMC_ECC_NEW_DETECTED_TAG); + for (i = 0; i < new_detected; i++) { + if (!entries[i]) + continue; + + memcpy(&records[i], entries[i], sizeof(struct eeprom_umc_record)); + count++; + radix_tree_tag_clear(&ras_umc->root, + entries[i]->cur_nps_retired_row_pfn, UMC_ECC_NEW_DETECTED_TAG); + } + mutex_unlock(&ras_umc->tree_lock); + + return count; +} + +static bool ras_umc_check_retired_record(struct ras_core_context *ras_core, + struct eeprom_umc_record *record, bool from_eeprom) +{ + struct ras_umc *ras_umc = &ras_core->ras_umc; + struct eeprom_store_record *data = &ras_umc->umc_err_data.rom_data; + uint32_t nps = 0; + int i, ret; + + if (from_eeprom) { + nps = ras_umc->umc_err_data.umc_nps_mode; + if (ras_umc->ip_func && ras_umc->ip_func->eeprom_record_to_nps_record) { + ret = ras_umc->ip_func->eeprom_record_to_nps_record(ras_core, record, nps); + if (ret) + RAS_DEV_WARN(ras_core->dev, + "Failed to adjust eeprom record, ret:%d", ret); + } + return false; + } + + for (i = 0; i < data->count; i++) { + if ((data->bps[i].retired_row_pfn == record->retired_row_pfn) && + (data->bps[i].cur_nps_retired_row_pfn == record->cur_nps_retired_row_pfn)) + return true; + } + + return false; +} + +/* alloc/realloc bps array */ +static int ras_umc_realloc_err_data_space(struct ras_core_context *ras_core, + struct eeprom_store_record *data, int pages) +{ + unsigned int old_space = data->count + data->space_left; + unsigned int new_space = old_space + pages; + unsigned int align_space = ALIGN(new_space, 512); + void *bps = kzalloc(align_space * sizeof(*data->bps), GFP_KERNEL); + + if (!bps) + return -ENOMEM; + + if (data->bps) { + memcpy(bps, data->bps, + data->count * sizeof(*data->bps)); + kfree(data->bps); + } + + data->bps = bps; + data->space_left += align_space - old_space; + return 0; +} + +static int ras_umc_update_eeprom_rom_data(struct ras_core_context *ras_core, + struct eeprom_umc_record *bps) +{ + struct eeprom_store_record *data = &ras_core->ras_umc.umc_err_data.rom_data; + + if (!data->space_left && + ras_umc_realloc_err_data_space(ras_core, data, 256)) { + return -ENOMEM; + } + + memcpy(&data->bps[data->count], bps, sizeof(*data->bps)); + data->count++; + data->space_left--; + return 0; +} + +static int ras_umc_update_eeprom_ram_data(struct ras_core_context *ras_core, + struct eeprom_umc_record *bps) +{ + struct ras_umc *ras_umc = &ras_core->ras_umc; + struct eeprom_store_record *data = &ras_umc->umc_err_data.ram_data; + uint64_t page_pfn[16]; + int count = 0, j; + + if (!data->space_left && + ras_umc_realloc_err_data_space(ras_core, data, 256)) { + return -ENOMEM; + } + + memset(page_pfn, 0, sizeof(page_pfn)); + if (ras_umc->ip_func && ras_umc->ip_func->eeprom_record_to_nps_pages) + count = ras_umc->ip_func->eeprom_record_to_nps_pages(ras_core, + bps, bps->cur_nps, page_pfn, ARRAY_SIZE(page_pfn)); + + if (count > 0) { + for (j = 0; j < count; j++) { + bps->cur_nps_retired_row_pfn = page_pfn[j]; + memcpy(&data->bps[data->count], bps, sizeof(*data->bps)); + data->count++; + data->space_left--; + } + } else { + memcpy(&data->bps[data->count], bps, sizeof(*data->bps)); + data->count++; + data->space_left--; + } + + return 0; +} + +/* it deal with vram only. */ +static int ras_umc_add_bad_pages(struct ras_core_context *ras_core, + struct eeprom_umc_record *bps, + int pages, bool from_eeprom) +{ + struct ras_umc *ras_umc = &ras_core->ras_umc; + struct ras_umc_err_data *data = &ras_umc->umc_err_data; + int i, ret = 0; + + if (!bps || pages <= 0) + return 0; + + mutex_lock(&ras_umc->umc_lock); + for (i = 0; i < pages; i++) { + if (ras_umc_check_retired_record(ras_core, &bps[i], from_eeprom)) + continue; + + ret = ras_umc_update_eeprom_rom_data(ras_core, &bps[i]); + if (ret) + goto out; + + if (data->last_retired_pfn == bps[i].cur_nps_retired_row_pfn) + continue; + + data->last_retired_pfn = bps[i].cur_nps_retired_row_pfn; + + if (from_eeprom) + ras_umc_reserve_eeprom_record(ras_core, &bps[i]); + + ret = ras_umc_update_eeprom_ram_data(ras_core, &bps[i]); + if (ret) + goto out; + } +out: + mutex_unlock(&ras_umc->umc_lock); + + return ret; +} + +/* + * read error record array in eeprom and reserve enough space for + * storing new bad pages + */ +int ras_umc_load_bad_pages(struct ras_core_context *ras_core) +{ + struct eeprom_umc_record *bps; + uint32_t ras_num_recs; + int ret; + + ras_num_recs = ras_eeprom_get_record_count(ras_core); + /* no bad page record, skip eeprom access */ + if (!ras_num_recs || + ras_core->ras_eeprom.record_threshold_config == DISABLE_RETIRE_PAGE) + return 0; + + bps = kcalloc(ras_num_recs, sizeof(*bps), GFP_KERNEL); + if (!bps) + return -ENOMEM; + + ret = ras_eeprom_read(ras_core, bps, ras_num_recs); + if (ret) { + RAS_DEV_ERR(ras_core->dev, "Failed to load EEPROM table records!"); + } else { + ras_core->ras_umc.umc_err_data.last_retired_pfn = UMC_INV_MEM_PFN; + ret = ras_umc_add_bad_pages(ras_core, bps, ras_num_recs, true); + } + + kfree(bps); + return ret; +} + +/* + * write error record array to eeprom, the function should be + * protected by recovery_lock + * new_cnt: new added UE count, excluding reserved bad pages, can be NULL + */ +static int ras_umc_save_bad_pages(struct ras_core_context *ras_core) +{ + struct ras_umc *ras_umc = &ras_core->ras_umc; + struct eeprom_store_record *data = &ras_umc->umc_err_data.rom_data; + uint32_t eeprom_record_num; + int save_count; + int ret = 0; + + if (!data->bps) + return 0; + + eeprom_record_num = ras_eeprom_get_record_count(ras_core); + mutex_lock(&ras_umc->umc_lock); + save_count = data->count - eeprom_record_num; + /* only new entries are saved */ + if (save_count > 0) { + if (ras_eeprom_append(ras_core, + &data->bps[eeprom_record_num], + save_count)) { + RAS_DEV_ERR(ras_core->dev, "Failed to save EEPROM table data!"); + ret = -EIO; + goto exit; + } + + RAS_DEV_INFO(ras_core->dev, "Saved %d pages to EEPROM table.\n", save_count); + } + +exit: + mutex_unlock(&ras_umc->umc_lock); + return ret; +} + +int ras_umc_handle_bad_pages(struct ras_core_context *ras_core, void *data) +{ + struct eeprom_umc_record records[MAX_ECC_NUM_PER_RETIREMENT]; + int count, ret; + + memset(records, 0, sizeof(records)); + count = ras_umc_get_new_records(ras_core, records, ARRAY_SIZE(records)); + if (count <= 0) + return -ENODATA; + + ret = ras_umc_add_bad_pages(ras_core, records, count, false); + if (ret) { + RAS_DEV_ERR(ras_core->dev, "Failed to add ras bad page!\n"); + return -EINVAL; + } + + ret = ras_umc_save_bad_pages(ras_core); + if (ret) { + RAS_DEV_ERR(ras_core->dev, "Failed to save ras bad page\n"); + return -EINVAL; + } + + return 0; +} + +int ras_umc_sw_init(struct ras_core_context *ras_core) +{ + struct ras_umc *ras_umc = &ras_core->ras_umc; + + memset(ras_umc, 0, sizeof(*ras_umc)); + + INIT_LIST_HEAD(&ras_umc->pending_ecc_list); + + INIT_RADIX_TREE(&ras_umc->root, GFP_KERNEL); + + mutex_init(&ras_umc->tree_lock); + mutex_init(&ras_umc->pending_ecc_lock); + mutex_init(&ras_umc->umc_lock); + mutex_init(&ras_umc->bank_log_lock); + + return 0; +} + +int ras_umc_sw_fini(struct ras_core_context *ras_core) +{ + struct ras_umc *ras_umc = &ras_core->ras_umc; + struct ras_umc_err_data *umc_err_data = &ras_umc->umc_err_data; + struct ras_bank_ecc_node *ecc_node, *tmp; + + mutex_destroy(&ras_umc->umc_lock); + mutex_destroy(&ras_umc->bank_log_lock); + + if (umc_err_data->rom_data.bps) { + umc_err_data->rom_data.count = 0; + kfree(umc_err_data->rom_data.bps); + umc_err_data->rom_data.bps = NULL; + umc_err_data->rom_data.space_left = 0; + } + + if (umc_err_data->ram_data.bps) { + umc_err_data->ram_data.count = 0; + kfree(umc_err_data->ram_data.bps); + umc_err_data->ram_data.bps = NULL; + umc_err_data->ram_data.space_left = 0; + } + + ras_umc_clear_logged_ecc(ras_core); + + mutex_lock(&ras_umc->pending_ecc_lock); + list_for_each_entry_safe(ecc_node, + tmp, &ras_umc->pending_ecc_list, node){ + list_del(&ecc_node->node); + kfree(ecc_node); + } + mutex_unlock(&ras_umc->pending_ecc_lock); + + mutex_destroy(&ras_umc->tree_lock); + mutex_destroy(&ras_umc->pending_ecc_lock); + + return 0; +} + +int ras_umc_hw_init(struct ras_core_context *ras_core) +{ + struct ras_umc *ras_umc = &ras_core->ras_umc; + uint32_t nps; + + nps = ras_core_get_curr_nps_mode(ras_core); + + if (!nps || (nps >= UMC_MEMORY_PARTITION_MODE_UNKNOWN)) { + RAS_DEV_ERR(ras_core->dev, "Invalid memory NPS mode: %u!\n", nps); + return -ENODATA; + } + + ras_umc->umc_err_data.umc_nps_mode = nps; + + ras_umc->umc_vram_type = ras_core->config->umc_cfg.umc_vram_type; + if (!ras_umc->umc_vram_type) { + RAS_DEV_ERR(ras_core->dev, "Invalid UMC VRAM Type: %u!\n", + ras_umc->umc_vram_type); + return -ENODATA; + } + + ras_umc->umc_ip_version = ras_core->config->umc_ip_version; + ras_umc->ip_func = ras_umc_get_ip_func(ras_core, ras_umc->umc_ip_version); + if (!ras_umc->ip_func) + return -EINVAL; + + return 0; +} + +int ras_umc_hw_fini(struct ras_core_context *ras_core) +{ + return 0; +} + +int ras_umc_clean_badpage_data(struct ras_core_context *ras_core) +{ + struct ras_umc_err_data *data = &ras_core->ras_umc.umc_err_data; + + mutex_lock(&ras_core->ras_umc.umc_lock); + + kfree(data->rom_data.bps); + kfree(data->ram_data.bps); + + memset(data, 0, sizeof(*data)); + mutex_unlock(&ras_core->ras_umc.umc_lock); + + return 0; +} + +int ras_umc_fill_eeprom_record(struct ras_core_context *ras_core, + uint64_t err_addr, uint32_t umc_inst, struct umc_phy_addr *cur_nps_addr, + enum umc_memory_partition_mode cur_nps, struct eeprom_umc_record *record) +{ + struct eeprom_umc_record *err_rec = record; + + /* Set bad page pfn and nps mode */ + EEPROM_RECORD_SETUP_UMC_ADDR_AND_NPS(err_rec, + RAS_ADDR_TO_PFN(cur_nps_addr->pa), cur_nps); + + err_rec->address = err_addr; + err_rec->ts = ras_umc_get_eeprom_timestamp(ras_core); + err_rec->err_type = RAS_EEPROM_ERR_NON_RECOVERABLE; + err_rec->cu = 0; + err_rec->mem_channel = cur_nps_addr->channel_idx; + err_rec->mcumc_id = umc_inst; + err_rec->cur_nps_retired_row_pfn = RAS_ADDR_TO_PFN(cur_nps_addr->pa); + err_rec->cur_nps_bank = cur_nps_addr->bank; + err_rec->cur_nps = cur_nps; + return 0; +} + +int ras_umc_get_saved_eeprom_count(struct ras_core_context *ras_core) +{ + struct ras_umc_err_data *err_data = &ras_core->ras_umc.umc_err_data; + + return err_data->rom_data.count; +} + +int ras_umc_get_badpage_count(struct ras_core_context *ras_core) +{ + struct eeprom_store_record *data = &ras_core->ras_umc.umc_err_data.ram_data; + + return data->count; +} + +int ras_umc_get_badpage_record(struct ras_core_context *ras_core, uint32_t index, void *record) +{ + struct eeprom_store_record *data = &ras_core->ras_umc.umc_err_data.ram_data; + + if (index >= data->count) + return -EINVAL; + + memcpy(record, &data->bps[index], sizeof(struct eeprom_umc_record)); + return 0; +} + +bool ras_umc_check_retired_addr(struct ras_core_context *ras_core, uint64_t addr) +{ + struct ras_umc *ras_umc = &ras_core->ras_umc; + struct eeprom_store_record *data = &ras_umc->umc_err_data.ram_data; + uint64_t page_pfn = RAS_ADDR_TO_PFN(addr); + int i, ret = false; + + mutex_lock(&ras_umc->umc_lock); + for (i = 0; i < data->count; i++) { + if (data->bps[i].cur_nps_retired_row_pfn == page_pfn) { + ret = true; + break; + } + } + mutex_unlock(&ras_umc->umc_lock); + + return ret; +} + +int ras_umc_translate_soc_pa_and_bank(struct ras_core_context *ras_core, + uint64_t *soc_pa, struct umc_bank_addr *bank_addr, bool bank_to_pa) +{ + struct ras_umc *ras_umc = &ras_core->ras_umc; + int ret = 0; + + if (bank_to_pa) + ret = ras_umc->ip_func->bank_to_soc_pa(ras_core, *bank_addr, soc_pa); + else + ret = ras_umc->ip_func->soc_pa_to_bank(ras_core, *soc_pa, bank_addr); + + return ret; +} diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_umc.h b/drivers/gpu/drm/amd/ras/rascore/ras_umc.h new file mode 100644 index 000000000000..7d9e779d8c4c --- /dev/null +++ b/drivers/gpu/drm/amd/ras/rascore/ras_umc.h @@ -0,0 +1,166 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ + +#ifndef __RAS_UMC_H__ +#define __RAS_UMC_H__ +#include "ras.h" +#include "ras_eeprom.h" +#include "ras_cmd.h" + +#define UMC_VRAM_TYPE_UNKNOWN 0 +#define UMC_VRAM_TYPE_GDDR1 1 +#define UMC_VRAM_TYPE_DDR2 2 +#define UMC_VRAM_TYPE_GDDR3 3 +#define UMC_VRAM_TYPE_GDDR4 4 +#define UMC_VRAM_TYPE_GDDR5 5 +#define UMC_VRAM_TYPE_HBM 6 +#define UMC_VRAM_TYPE_DDR3 7 +#define UMC_VRAM_TYPE_DDR4 8 +#define UMC_VRAM_TYPE_GDDR6 9 +#define UMC_VRAM_TYPE_DDR5 10 +#define UMC_VRAM_TYPE_LPDDR4 11 +#define UMC_VRAM_TYPE_LPDDR5 12 +#define UMC_VRAM_TYPE_HBM3E 13 + +#define UMC_ECC_NEW_DETECTED_TAG 0x1 +#define UMC_INV_MEM_PFN (0xFFFFFFFFFFFFFFFF) + +/* three column bits and one row bit in MCA address flip + * in bad page retirement + */ +#define UMC_PA_FLIP_BITS_NUM 4 + +enum umc_memory_partition_mode { + UMC_MEMORY_PARTITION_MODE_NONE = 0, + UMC_MEMORY_PARTITION_MODE_NPS1 = 1, + UMC_MEMORY_PARTITION_MODE_NPS2 = 2, + UMC_MEMORY_PARTITION_MODE_NPS3 = 3, + UMC_MEMORY_PARTITION_MODE_NPS4 = 4, + UMC_MEMORY_PARTITION_MODE_NPS6 = 6, + UMC_MEMORY_PARTITION_MODE_NPS8 = 8, + UMC_MEMORY_PARTITION_MODE_UNKNOWN +}; + +struct ras_core_context; +struct ras_bank_ecc; + +struct umc_flip_bits { + uint32_t flip_bits_in_pa[UMC_PA_FLIP_BITS_NUM]; + uint32_t flip_row_bit; + uint32_t r13_in_pa; + uint32_t bit_num; +}; + +struct umc_mca_addr { + uint64_t err_addr; + uint32_t ch_inst; + uint32_t umc_inst; + uint32_t node_inst; + uint32_t socket_id; +}; + +struct umc_phy_addr { + uint64_t pa; + uint32_t bank; + uint32_t channel_idx; +}; + +struct umc_bank_addr { + uint32_t stack_id; /* SID */ + uint32_t bank_group; + uint32_t bank; + uint32_t row; + uint32_t column; + uint32_t channel; + uint32_t subchannel; /* Also called Pseudochannel (PC) */ +}; + +struct ras_umc_ip_func { + int (*bank_to_eeprom_record)(struct ras_core_context *ras_core, + struct ras_bank_ecc *bank, struct eeprom_umc_record *record); + int (*eeprom_record_to_nps_record)(struct ras_core_context *ras_core, + struct eeprom_umc_record *record, uint32_t nps); + int (*eeprom_record_to_nps_pages)(struct ras_core_context *ras_core, + struct eeprom_umc_record *record, uint32_t nps, + uint64_t *pfns, uint32_t num); + int (*bank_to_soc_pa)(struct ras_core_context *ras_core, + struct umc_bank_addr bank_addr, uint64_t *soc_pa); + int (*soc_pa_to_bank)(struct ras_core_context *ras_core, + uint64_t soc_pa, struct umc_bank_addr *bank_addr); +}; + +struct eeprom_store_record { + /* point to data records array */ + struct eeprom_umc_record *bps; + /* the count of entries */ + int count; + /* the space can place new entries */ + int space_left; +}; + +struct ras_umc_err_data { + struct eeprom_store_record rom_data; + struct eeprom_store_record ram_data; + enum umc_memory_partition_mode umc_nps_mode; + uint64_t last_retired_pfn; +}; + +struct ras_umc { + u32 umc_ip_version; + u32 umc_vram_type; + const struct ras_umc_ip_func *ip_func; + struct radix_tree_root root; + struct mutex tree_lock; + struct mutex umc_lock; + struct mutex bank_log_lock; + struct mutex pending_ecc_lock; + struct ras_umc_err_data umc_err_data; + struct list_head pending_ecc_list; +}; + +int ras_umc_sw_init(struct ras_core_context *ras); +int ras_umc_sw_fini(struct ras_core_context *ras); +int ras_umc_hw_init(struct ras_core_context *ras); +int ras_umc_hw_fini(struct ras_core_context *ras); +int ras_umc_psp_convert_ma_to_pa(struct ras_core_context *ras_core, + struct umc_mca_addr *in, struct umc_phy_addr *out, + uint32_t nps); +int ras_umc_handle_bad_pages(struct ras_core_context *ras_core, void *data); +int ras_umc_log_bad_bank(struct ras_core_context *ras, struct ras_bank_ecc *bank); +int ras_umc_log_bad_bank_pending(struct ras_core_context *ras_core, struct ras_bank_ecc *bank); +int ras_umc_log_pending_bad_bank(struct ras_core_context *ras_core); +int ras_umc_clear_logged_ecc(struct ras_core_context *ras_core); +int ras_umc_load_bad_pages(struct ras_core_context *ras_core); +int ras_umc_get_saved_eeprom_count(struct ras_core_context *ras_core); +int ras_umc_clean_badpage_data(struct ras_core_context *ras_core); +int ras_umc_fill_eeprom_record(struct ras_core_context *ras_core, + uint64_t err_addr, uint32_t umc_inst, struct umc_phy_addr *cur_nps_addr, + enum umc_memory_partition_mode cur_nps, struct eeprom_umc_record *record); + +int ras_umc_get_badpage_count(struct ras_core_context *ras_core); +int ras_umc_get_badpage_record(struct ras_core_context *ras_core, uint32_t index, void *record); +bool ras_umc_check_retired_addr(struct ras_core_context *ras_core, uint64_t addr); +int ras_umc_translate_soc_pa_and_bank(struct ras_core_context *ras_core, + uint64_t *soc_pa, struct umc_bank_addr *bank_addr, bool bank_to_pa); +#endif diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.c b/drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.c new file mode 100644 index 000000000000..5d9a11c17a86 --- /dev/null +++ b/drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.c @@ -0,0 +1,511 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ +#include "ras.h" +#include "ras_umc.h" +#include "ras_core_status.h" +#include "ras_umc_v12_0.h" + +#define NumDieInterleaved 4 + +static const uint32_t umc_v12_0_channel_idx_tbl[] + [UMC_V12_0_UMC_INSTANCE_NUM][UMC_V12_0_CHANNEL_INSTANCE_NUM] = { + {{3, 7, 11, 15, 2, 6, 10, 14}, {1, 5, 9, 13, 0, 4, 8, 12}, + {19, 23, 27, 31, 18, 22, 26, 30}, {17, 21, 25, 29, 16, 20, 24, 28}}, + {{47, 43, 39, 35, 46, 42, 38, 34}, {45, 41, 37, 33, 44, 40, 36, 32}, + {63, 59, 55, 51, 62, 58, 54, 50}, {61, 57, 53, 49, 60, 56, 52, 48}}, + {{79, 75, 71, 67, 78, 74, 70, 66}, {77, 73, 69, 65, 76, 72, 68, 64}, + {95, 91, 87, 83, 94, 90, 86, 82}, {93, 89, 85, 81, 92, 88, 84, 80}}, + {{99, 103, 107, 111, 98, 102, 106, 110}, {97, 101, 105, 109, 96, 100, 104, 108}, + {115, 119, 123, 127, 114, 118, 122, 126}, {113, 117, 121, 125, 112, 116, 120, 124}} +}; + +/* mapping of MCA error address to normalized address */ +static const uint32_t umc_v12_0_ma2na_mapping[] = { + 0, 5, 6, 8, 9, 14, 12, 13, + 10, 11, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 27, 28, + 24, 7, 29, 30, +}; + +static bool umc_v12_0_bit_wise_xor(uint32_t val) +{ + bool result = 0; + int i; + + for (i = 0; i < 32; i++) + result = result ^ ((val >> i) & 0x1); + + return result; +} + +static void __get_nps_pa_flip_bits(struct ras_core_context *ras_core, + enum umc_memory_partition_mode nps, + struct umc_flip_bits *flip_bits) +{ + uint32_t vram_type = ras_core->ras_umc.umc_vram_type; + + /* default setting */ + flip_bits->flip_bits_in_pa[0] = UMC_V12_0_PA_C2_BIT; + flip_bits->flip_bits_in_pa[1] = UMC_V12_0_PA_C3_BIT; + flip_bits->flip_bits_in_pa[2] = UMC_V12_0_PA_C4_BIT; + flip_bits->flip_bits_in_pa[3] = UMC_V12_0_PA_R13_BIT; + flip_bits->flip_row_bit = 13; + flip_bits->bit_num = 4; + flip_bits->r13_in_pa = UMC_V12_0_PA_R13_BIT; + + if (nps == UMC_MEMORY_PARTITION_MODE_NPS2) { + flip_bits->flip_bits_in_pa[0] = UMC_V12_0_PA_CH5_BIT; + flip_bits->flip_bits_in_pa[1] = UMC_V12_0_PA_C2_BIT; + flip_bits->flip_bits_in_pa[2] = UMC_V12_0_PA_B1_BIT; + flip_bits->r13_in_pa = UMC_V12_0_PA_R12_BIT; + } else if (nps == UMC_MEMORY_PARTITION_MODE_NPS4) { + flip_bits->flip_bits_in_pa[0] = UMC_V12_0_PA_CH4_BIT; + flip_bits->flip_bits_in_pa[1] = UMC_V12_0_PA_CH5_BIT; + flip_bits->flip_bits_in_pa[2] = UMC_V12_0_PA_B0_BIT; + flip_bits->r13_in_pa = UMC_V12_0_PA_R11_BIT; + } + + switch (vram_type) { + case UMC_VRAM_TYPE_HBM: + /* other nps modes are taken as nps1 */ + if (nps == UMC_MEMORY_PARTITION_MODE_NPS2) + flip_bits->flip_bits_in_pa[3] = UMC_V12_0_PA_R12_BIT; + else if (nps == UMC_MEMORY_PARTITION_MODE_NPS4) + flip_bits->flip_bits_in_pa[3] = UMC_V12_0_PA_R11_BIT; + + break; + case UMC_VRAM_TYPE_HBM3E: + flip_bits->flip_bits_in_pa[3] = UMC_V12_0_PA_R12_BIT; + flip_bits->flip_row_bit = 12; + + if (nps == UMC_MEMORY_PARTITION_MODE_NPS2) + flip_bits->flip_bits_in_pa[3] = UMC_V12_0_PA_R11_BIT; + else if (nps == UMC_MEMORY_PARTITION_MODE_NPS4) + flip_bits->flip_bits_in_pa[3] = UMC_V12_0_PA_R10_BIT; + + break; + default: + RAS_DEV_WARN(ras_core->dev, + "Unknown HBM type, set RAS retire flip bits to the value in NPS1 mode.\n"); + break; + } +} + +static uint64_t convert_nps_pa_to_row_pa(struct ras_core_context *ras_core, + uint64_t pa, enum umc_memory_partition_mode nps, bool zero_pfn_ok) +{ + struct umc_flip_bits flip_bits = {0}; + uint64_t row_pa; + int i; + + __get_nps_pa_flip_bits(ras_core, nps, &flip_bits); + + row_pa = pa; + /* clear loop bits in soc physical address */ + for (i = 0; i < flip_bits.bit_num; i++) + row_pa &= ~BIT_ULL(flip_bits.flip_bits_in_pa[i]); + + if (!zero_pfn_ok && !RAS_ADDR_TO_PFN(row_pa)) + row_pa |= BIT_ULL(flip_bits.flip_bits_in_pa[2]); + + return row_pa; +} + +static int lookup_bad_pages_in_a_row(struct ras_core_context *ras_core, + struct eeprom_umc_record *record, uint32_t nps, + uint64_t *pfns, uint32_t num, + uint64_t seq_no, bool dump) +{ + uint32_t col, col_lower, row, row_lower, idx, row_high; + uint64_t soc_pa, row_pa, column, err_addr; + uint64_t retired_addr = RAS_PFN_TO_ADDR(record->cur_nps_retired_row_pfn); + struct umc_flip_bits flip_bits = {0}; + uint32_t retire_unit; + uint32_t i; + + __get_nps_pa_flip_bits(ras_core, nps, &flip_bits); + + row_pa = convert_nps_pa_to_row_pa(ras_core, retired_addr, nps, true); + + err_addr = record->address; + /* get column bit 0 and 1 in mca address */ + col_lower = (err_addr >> 1) & 0x3ULL; + /* MA_R13_BIT will be handled later */ + row_lower = (err_addr >> UMC_V12_0_MCA_R0_BIT) & 0x1fffULL; + row_lower &= ~BIT_ULL(flip_bits.flip_row_bit); + + if (ras_core->ras_gfx.gfx_ip_version >= IP_VERSION(9, 5, 0)) { + row_high = (row_pa >> flip_bits.r13_in_pa) & 0x3ULL; + /* it's 2.25GB in each channel, from MCA address to PA + * [R14 R13] is converted if the two bits value are 0x3, + * get them from PA instead of MCA address. + */ + row_lower |= (row_high << 13); + } + + idx = 0; + row = 0; + retire_unit = 0x1 << flip_bits.bit_num; + /* loop for all possibilities of retire bits */ + for (column = 0; column < retire_unit; column++) { + soc_pa = row_pa; + for (i = 0; i < flip_bits.bit_num; i++) + soc_pa |= (((column >> i) & 0x1ULL) << flip_bits.flip_bits_in_pa[i]); + + col = ((column & 0x7) << 2) | col_lower; + + /* add row bit 13 */ + if (flip_bits.bit_num == UMC_PA_FLIP_BITS_NUM) + row = ((column >> 3) << flip_bits.flip_row_bit) | row_lower; + + if (dump) + RAS_DEV_INFO(ras_core->dev, + "{%llu} Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x Bank:0x%x Channel:0x%x\n", + seq_no, soc_pa, row, col, + record->cur_nps_bank, record->mem_channel); + + + if (pfns && (idx < num)) + pfns[idx++] = RAS_ADDR_TO_PFN(soc_pa); + } + + return idx; +} + +static int umc_v12_convert_ma_to_pa(struct ras_core_context *ras_core, + struct umc_mca_addr *addr_in, struct umc_phy_addr *addr_out, + uint32_t nps) +{ + uint32_t i, na_shift; + uint64_t soc_pa, na, na_nps; + uint32_t bank_hash0, bank_hash1, bank_hash2, bank_hash3, col, row; + uint32_t bank0, bank1, bank2, bank3, bank; + uint32_t ch_inst = addr_in->ch_inst; + uint32_t umc_inst = addr_in->umc_inst; + uint32_t node_inst = addr_in->node_inst; + uint32_t socket_id = addr_in->socket_id; + uint32_t channel_index; + uint64_t err_addr = addr_in->err_addr; + + if (node_inst != UMC_INV_AID_NODE) { + if (ch_inst >= UMC_V12_0_CHANNEL_INSTANCE_NUM || + umc_inst >= UMC_V12_0_UMC_INSTANCE_NUM || + node_inst >= UMC_V12_0_AID_NUM_MAX || + socket_id >= UMC_V12_0_SOCKET_NUM_MAX) + return -EINVAL; + } else { + if (socket_id >= UMC_V12_0_SOCKET_NUM_MAX || + ch_inst >= UMC_V12_0_TOTAL_CHANNEL_NUM) + return -EINVAL; + } + + bank_hash0 = (err_addr >> UMC_V12_0_MCA_B0_BIT) & 0x1ULL; + bank_hash1 = (err_addr >> UMC_V12_0_MCA_B1_BIT) & 0x1ULL; + bank_hash2 = (err_addr >> UMC_V12_0_MCA_B2_BIT) & 0x1ULL; + bank_hash3 = (err_addr >> UMC_V12_0_MCA_B3_BIT) & 0x1ULL; + col = (err_addr >> 1) & 0x1fULL; + row = (err_addr >> 10) & 0x3fffULL; + + /* apply bank hash algorithm */ + bank0 = + bank_hash0 ^ (UMC_V12_0_XOR_EN0 & + (umc_v12_0_bit_wise_xor(col & UMC_V12_0_COL_XOR0) ^ + (umc_v12_0_bit_wise_xor(row & UMC_V12_0_ROW_XOR0)))); + bank1 = + bank_hash1 ^ (UMC_V12_0_XOR_EN1 & + (umc_v12_0_bit_wise_xor(col & UMC_V12_0_COL_XOR1) ^ + (umc_v12_0_bit_wise_xor(row & UMC_V12_0_ROW_XOR1)))); + bank2 = + bank_hash2 ^ (UMC_V12_0_XOR_EN2 & + (umc_v12_0_bit_wise_xor(col & UMC_V12_0_COL_XOR2) ^ + (umc_v12_0_bit_wise_xor(row & UMC_V12_0_ROW_XOR2)))); + bank3 = + bank_hash3 ^ (UMC_V12_0_XOR_EN3 & + (umc_v12_0_bit_wise_xor(col & UMC_V12_0_COL_XOR3) ^ + (umc_v12_0_bit_wise_xor(row & UMC_V12_0_ROW_XOR3)))); + + bank = bank0 | (bank1 << 1) | (bank2 << 2) | (bank3 << 3); + err_addr &= ~0x3c0ULL; + err_addr |= (bank << UMC_V12_0_MCA_B0_BIT); + + na_nps = 0x0; + /* convert mca error address to normalized address */ + for (i = 1; i < ARRAY_SIZE(umc_v12_0_ma2na_mapping); i++) + na_nps |= ((err_addr >> i) & 0x1ULL) << umc_v12_0_ma2na_mapping[i]; + + if (nps == UMC_MEMORY_PARTITION_MODE_NPS1) + na_shift = 8; + else if (nps == UMC_MEMORY_PARTITION_MODE_NPS2) + na_shift = 9; + else if (nps == UMC_MEMORY_PARTITION_MODE_NPS4) + na_shift = 10; + else if (nps == UMC_MEMORY_PARTITION_MODE_NPS8) + na_shift = 11; + else + return -EINVAL; + + na = ((na_nps >> na_shift) << 8) | (na_nps & 0xff); + + if (node_inst != UMC_INV_AID_NODE) + channel_index = + umc_v12_0_channel_idx_tbl[node_inst][umc_inst][ch_inst]; + else { + channel_index = ch_inst; + node_inst = channel_index / + (UMC_V12_0_UMC_INSTANCE_NUM * UMC_V12_0_CHANNEL_INSTANCE_NUM); + } + + /* translate umc channel address to soc pa, 3 parts are included */ + soc_pa = ADDR_OF_32KB_BLOCK(na) | + ADDR_OF_256B_BLOCK(channel_index) | + OFFSET_IN_256B_BLOCK(na); + + /* calc channel hash based on absolute address */ + soc_pa += socket_id * SOCKET_LFB_SIZE; + /* the umc channel bits are not original values, they are hashed */ + UMC_V12_0_SET_CHANNEL_HASH(channel_index, soc_pa); + /* restore pa */ + soc_pa -= socket_id * SOCKET_LFB_SIZE; + + /* get some channel bits from na_nps directly and + * add nps section offset + */ + if (nps == UMC_MEMORY_PARTITION_MODE_NPS2) { + soc_pa &= ~(0x1ULL << UMC_V12_0_PA_CH5_BIT); + soc_pa |= ((na_nps & 0x100) << 5); + soc_pa += (node_inst >> 1) * (SOCKET_LFB_SIZE >> 1); + } else if (nps == UMC_MEMORY_PARTITION_MODE_NPS4) { + soc_pa &= ~(0x3ULL << UMC_V12_0_PA_CH4_BIT); + soc_pa |= ((na_nps & 0x300) << 4); + soc_pa += node_inst * (SOCKET_LFB_SIZE >> 2); + } else if (nps == UMC_MEMORY_PARTITION_MODE_NPS8) { + soc_pa &= ~(0x7ULL << UMC_V12_0_PA_CH4_BIT); + soc_pa |= ((na_nps & 0x700) << 4); + soc_pa += node_inst * (SOCKET_LFB_SIZE >> 2) + + (channel_index >> 4) * (SOCKET_LFB_SIZE >> 3); + } + + addr_out->pa = soc_pa; + addr_out->bank = bank; + addr_out->channel_idx = channel_index; + + return 0; +} + +static int convert_ma_to_pa(struct ras_core_context *ras_core, + struct umc_mca_addr *addr_in, struct umc_phy_addr *addr_out, + uint32_t nps) +{ + int ret; + + if (ras_psp_check_supported_cmd(ras_core, RAS_TA_CMD_ID__QUERY_ADDRESS)) + ret = ras_umc_psp_convert_ma_to_pa(ras_core, + addr_in, addr_out, nps); + else + ret = umc_v12_convert_ma_to_pa(ras_core, + addr_in, addr_out, nps); + + return ret; +} + +static int convert_bank_to_nps_addr(struct ras_core_context *ras_core, + struct ras_bank_ecc *bank, struct umc_phy_addr *pa_addr, uint32_t nps) +{ + struct umc_mca_addr addr_in; + struct umc_phy_addr addr_out; + int ret; + + memset(&addr_in, 0, sizeof(addr_in)); + memset(&addr_out, 0, sizeof(addr_out)); + + addr_in.err_addr = ACA_ADDR_2_ERR_ADDR(bank->addr); + addr_in.ch_inst = ACA_IPID_2_UMC_CH(bank->ipid); + addr_in.umc_inst = ACA_IPID_2_UMC_INST(bank->ipid); + addr_in.node_inst = ACA_IPID_2_DIE_ID(bank->ipid); + addr_in.socket_id = ACA_IPID_2_SOCKET_ID(bank->ipid); + + ret = convert_ma_to_pa(ras_core, &addr_in, &addr_out, nps); + if (!ret) { + pa_addr->pa = + convert_nps_pa_to_row_pa(ras_core, addr_out.pa, nps, false); + pa_addr->channel_idx = addr_out.channel_idx; + pa_addr->bank = addr_out.bank; + } + + return ret; +} + +static int umc_v12_0_bank_to_eeprom_record(struct ras_core_context *ras_core, + struct ras_bank_ecc *bank, struct eeprom_umc_record *record) +{ + struct umc_phy_addr nps_addr; + int ret; + + memset(&nps_addr, 0, sizeof(nps_addr)); + + ret = convert_bank_to_nps_addr(ras_core, bank, + &nps_addr, bank->nps); + if (ret) + return ret; + + ras_umc_fill_eeprom_record(ras_core, + ACA_ADDR_2_ERR_ADDR(bank->addr), ACA_IPID_2_UMC_INST(bank->ipid), + &nps_addr, bank->nps, record); + + lookup_bad_pages_in_a_row(ras_core, record, + bank->nps, NULL, 0, bank->seq_no, true); + + return 0; +} + +static int convert_eeprom_record_to_nps_addr(struct ras_core_context *ras_core, + struct eeprom_umc_record *record, uint64_t *pa, uint32_t nps) +{ + struct device_system_info dev_info = {0}; + struct umc_mca_addr addr_in; + struct umc_phy_addr addr_out; + int ret; + + memset(&addr_in, 0, sizeof(addr_in)); + memset(&addr_out, 0, sizeof(addr_out)); + + ras_core_get_device_system_info(ras_core, &dev_info); + + addr_in.err_addr = record->address; + addr_in.ch_inst = record->mem_channel; + addr_in.umc_inst = record->mcumc_id; + addr_in.node_inst = UMC_INV_AID_NODE; + addr_in.socket_id = dev_info.socket_id; + + ret = convert_ma_to_pa(ras_core, &addr_in, &addr_out, nps); + if (ret) + return ret; + + *pa = convert_nps_pa_to_row_pa(ras_core, addr_out.pa, nps, false); + + return 0; +} + +static int umc_v12_0_eeprom_record_to_nps_record(struct ras_core_context *ras_core, + struct eeprom_umc_record *record, uint32_t nps) +{ + uint64_t pa = 0; + int ret = 0; + + if (nps == EEPROM_RECORD_UMC_NPS_MODE(record)) { + record->cur_nps_retired_row_pfn = EEPROM_RECORD_UMC_ADDR_PFN(record); + } else { + ret = convert_eeprom_record_to_nps_addr(ras_core, + record, &pa, nps); + if (!ret) + record->cur_nps_retired_row_pfn = RAS_ADDR_TO_PFN(pa); + } + + record->cur_nps = nps; + + return ret; +} + +static int umc_v12_0_eeprom_record_to_nps_pages(struct ras_core_context *ras_core, + struct eeprom_umc_record *record, uint32_t nps, + uint64_t *pfns, uint32_t num) +{ + return lookup_bad_pages_in_a_row(ras_core, + record, nps, pfns, num, 0, false); +} + +static int umc_12_0_soc_pa_to_bank(struct ras_core_context *ras_core, + uint64_t soc_pa, + struct umc_bank_addr *bank_addr) +{ + + int channel_hashed = 0; + int channel_real = 0; + int channel_reversed = 0; + int i = 0; + + bank_addr->stack_id = UMC_V12_0_SOC_PA_TO_SID(soc_pa); + bank_addr->bank_group = 0; /* This is a combination of SID & Bank. Needed?? */ + bank_addr->bank = UMC_V12_0_SOC_PA_TO_BANK(soc_pa); + bank_addr->row = UMC_V12_0_SOC_PA_TO_ROW(soc_pa); + bank_addr->column = UMC_V12_0_SOC_PA_TO_COL(soc_pa); + + /* Channel bits 4-6 are hashed. Bruteforce reverse the hash */ + channel_hashed = (soc_pa >> UMC_V12_0_PA_CH4_BIT) & 0x7; + + for (i = 0; i < 8; i++) { + channel_reversed = 0; + channel_reversed |= UMC_V12_0_CHANNEL_HASH_CH4((i << 4), soc_pa); + channel_reversed |= (UMC_V12_0_CHANNEL_HASH_CH5((i << 4), soc_pa) << 1); + channel_reversed |= (UMC_V12_0_CHANNEL_HASH_CH6((i << 4), soc_pa) << 2); + if (channel_reversed == channel_hashed) + channel_real = ((i << 4)) | ((soc_pa >> UMC_V12_0_PA_CH0_BIT) & 0xf); + } + + bank_addr->channel = channel_real; + bank_addr->subchannel = UMC_V12_0_SOC_PA_TO_PC(soc_pa); + + return 0; +} + +static int umc_12_0_bank_to_soc_pa(struct ras_core_context *ras_core, + struct umc_bank_addr bank_addr, + uint64_t *soc_pa) +{ + uint64_t na = 0; + uint64_t tmp_pa = 0; + *soc_pa = 0; + + tmp_pa |= UMC_V12_0_SOC_SID_TO_PA(bank_addr.stack_id); + tmp_pa |= UMC_V12_0_SOC_BANK_TO_PA(bank_addr.bank); + tmp_pa |= UMC_V12_0_SOC_ROW_TO_PA(bank_addr.row); + tmp_pa |= UMC_V12_0_SOC_COL_TO_PA(bank_addr.column); + tmp_pa |= UMC_V12_0_SOC_CH_TO_PA(bank_addr.channel); + tmp_pa |= UMC_V12_0_SOC_PC_TO_PA(bank_addr.subchannel); + + /* Get the NA */ + na = ((tmp_pa >> UMC_V12_0_PA_C2_BIT) << UMC_V12_0_NA_C2_BIT); + na |= tmp_pa & 0xff; + + /* translate umc channel address to soc pa, 3 parts are included */ + tmp_pa = ADDR_OF_32KB_BLOCK(na) | + ADDR_OF_256B_BLOCK(bank_addr.channel) | + OFFSET_IN_256B_BLOCK(na); + + /* the umc channel bits are not original values, they are hashed */ + UMC_V12_0_SET_CHANNEL_HASH(bank_addr.channel, tmp_pa); + + *soc_pa = tmp_pa; + + return 0; +} + +const struct ras_umc_ip_func ras_umc_func_v12_0 = { + .bank_to_eeprom_record = umc_v12_0_bank_to_eeprom_record, + .eeprom_record_to_nps_record = umc_v12_0_eeprom_record_to_nps_record, + .eeprom_record_to_nps_pages = umc_v12_0_eeprom_record_to_nps_pages, + .bank_to_soc_pa = umc_12_0_bank_to_soc_pa, + .soc_pa_to_bank = umc_12_0_soc_pa_to_bank, +}; + diff --git a/drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.h b/drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.h new file mode 100644 index 000000000000..8a35ad856165 --- /dev/null +++ b/drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.h @@ -0,0 +1,314 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + */ +#ifndef __RAS_UMC_V12_0_H__ +#define __RAS_UMC_V12_0_H__ +#include "ras.h" + +/* MCA_UMC_UMC0_MCUMC_ADDRT0 */ +#define MCA_UMC_UMC0_MCUMC_ADDRT0__ErrorAddr__SHIFT 0x0 +#define MCA_UMC_UMC0_MCUMC_ADDRT0__Reserved__SHIFT 0x38 +#define MCA_UMC_UMC0_MCUMC_ADDRT0__ErrorAddr_MASK 0x00FFFFFFFFFFFFFFL +#define MCA_UMC_UMC0_MCUMC_ADDRT0__Reserved_MASK 0xFF00000000000000L + +/* MCMP1_IPIDT0 */ +#define MCMP1_IPIDT0__InstanceIdLo__SHIFT 0x0 +#define MCMP1_IPIDT0__HardwareID__SHIFT 0x20 +#define MCMP1_IPIDT0__InstanceIdHi__SHIFT 0x2c +#define MCMP1_IPIDT0__McaType__SHIFT 0x30 + +#define MCMP1_IPIDT0__InstanceIdLo_MASK 0x00000000FFFFFFFFL +#define MCMP1_IPIDT0__HardwareID_MASK 0x00000FFF00000000L +#define MCMP1_IPIDT0__InstanceIdHi_MASK 0x0000F00000000000L +#define MCMP1_IPIDT0__McaType_MASK 0xFFFF000000000000L + +/* number of umc channel instance with memory map register access */ +#define UMC_V12_0_CHANNEL_INSTANCE_NUM 8 +/* number of umc instance with memory map register access */ +#define UMC_V12_0_UMC_INSTANCE_NUM 4 + +/* one piece of normalized address is mapped to 8 pieces of physical address */ +#define UMC_V12_0_NA_MAP_PA_NUM 8 + +/* bank bits in MCA error address */ +#define UMC_V12_0_MCA_B0_BIT 6 +#define UMC_V12_0_MCA_B1_BIT 7 +#define UMC_V12_0_MCA_B2_BIT 8 +#define UMC_V12_0_MCA_B3_BIT 9 + +/* row bits in MCA address */ +#define UMC_V12_0_MCA_R0_BIT 10 + +/* Stack ID bits in SOC physical address */ +#define UMC_V12_0_PA_SID1_BIT 37 +#define UMC_V12_0_PA_SID0_BIT 36 + +/* bank bits in SOC physical address */ +#define UMC_V12_0_PA_B3_BIT 18 +#define UMC_V12_0_PA_B2_BIT 17 +#define UMC_V12_0_PA_B1_BIT 20 +#define UMC_V12_0_PA_B0_BIT 19 + +/* row bits in SOC physical address */ +#define UMC_V12_0_PA_R13_BIT 35 +#define UMC_V12_0_PA_R12_BIT 34 +#define UMC_V12_0_PA_R11_BIT 33 +#define UMC_V12_0_PA_R10_BIT 32 +#define UMC_V12_0_PA_R9_BIT 31 +#define UMC_V12_0_PA_R8_BIT 30 +#define UMC_V12_0_PA_R7_BIT 29 +#define UMC_V12_0_PA_R6_BIT 28 +#define UMC_V12_0_PA_R5_BIT 27 +#define UMC_V12_0_PA_R4_BIT 26 +#define UMC_V12_0_PA_R3_BIT 25 +#define UMC_V12_0_PA_R2_BIT 24 +#define UMC_V12_0_PA_R1_BIT 23 +#define UMC_V12_0_PA_R0_BIT 22 + +/* column bits in SOC physical address */ +#define UMC_V12_0_PA_C4_BIT 21 +#define UMC_V12_0_PA_C3_BIT 16 +#define UMC_V12_0_PA_C2_BIT 15 +#define UMC_V12_0_PA_C1_BIT 6 +#define UMC_V12_0_PA_C0_BIT 5 + +/* channel index bits in SOC physical address */ +#define UMC_V12_0_PA_CH6_BIT 14 +#define UMC_V12_0_PA_CH5_BIT 13 +#define UMC_V12_0_PA_CH4_BIT 12 +#define UMC_V12_0_PA_CH3_BIT 11 +#define UMC_V12_0_PA_CH2_BIT 10 +#define UMC_V12_0_PA_CH1_BIT 9 +#define UMC_V12_0_PA_CH0_BIT 8 + +/* Pseudochannel index bits in SOC physical address */ +#define UMC_V12_0_PA_PC0_BIT 7 + +#define UMC_V12_0_NA_C2_BIT 8 + +#define UMC_V12_0_SOC_PA_TO_SID(pa) \ + ((((pa >> UMC_V12_0_PA_SID0_BIT) & 0x1ULL) << 0ULL) | \ + (((pa >> UMC_V12_0_PA_SID1_BIT) & 0x1ULL) << 1ULL)) + +#define UMC_V12_0_SOC_PA_TO_BANK(pa) \ + ((((pa >> UMC_V12_0_PA_B0_BIT) & 0x1ULL) << 0ULL) | \ + (((pa >> UMC_V12_0_PA_B1_BIT) & 0x1ULL) << 1ULL) | \ + (((pa >> UMC_V12_0_PA_B2_BIT) & 0x1ULL) << 2ULL) | \ + (((pa >> UMC_V12_0_PA_B3_BIT) & 0x1ULL) << 3ULL)) + +#define UMC_V12_0_SOC_PA_TO_ROW(pa) \ + ((((pa >> UMC_V12_0_PA_R0_BIT) & 0x1ULL) << 0ULL) | \ + (((pa >> UMC_V12_0_PA_R1_BIT) & 0x1ULL) << 1ULL) | \ + (((pa >> UMC_V12_0_PA_R2_BIT) & 0x1ULL) << 2ULL) | \ + (((pa >> UMC_V12_0_PA_R3_BIT) & 0x1ULL) << 3ULL) | \ + (((pa >> UMC_V12_0_PA_R4_BIT) & 0x1ULL) << 4ULL) | \ + (((pa >> UMC_V12_0_PA_R5_BIT) & 0x1ULL) << 5ULL) | \ + (((pa >> UMC_V12_0_PA_R6_BIT) & 0x1ULL) << 6ULL) | \ + (((pa >> UMC_V12_0_PA_R7_BIT) & 0x1ULL) << 7ULL) | \ + (((pa >> UMC_V12_0_PA_R8_BIT) & 0x1ULL) << 8ULL) | \ + (((pa >> UMC_V12_0_PA_R9_BIT) & 0x1ULL) << 9ULL) | \ + (((pa >> UMC_V12_0_PA_R10_BIT) & 0x1ULL) << 10ULL) | \ + (((pa >> UMC_V12_0_PA_R11_BIT) & 0x1ULL) << 11ULL) | \ + (((pa >> UMC_V12_0_PA_R12_BIT) & 0x1ULL) << 12ULL) | \ + (((pa >> UMC_V12_0_PA_R13_BIT) & 0x1ULL) << 13ULL)) + +#define UMC_V12_0_SOC_PA_TO_COL(pa) \ + ((((pa >> UMC_V12_0_PA_C0_BIT) & 0x1ULL) << 0ULL) | \ + (((pa >> UMC_V12_0_PA_C1_BIT) & 0x1ULL) << 1ULL) | \ + (((pa >> UMC_V12_0_PA_C2_BIT) & 0x1ULL) << 2ULL) | \ + (((pa >> UMC_V12_0_PA_C3_BIT) & 0x1ULL) << 3ULL) | \ + (((pa >> UMC_V12_0_PA_C4_BIT) & 0x1ULL) << 4ULL)) + +#define UMC_V12_0_SOC_PA_TO_CH(pa) \ + ((((pa >> UMC_V12_0_PA_CH0_BIT) & 0x1ULL) << 0ULL) | \ + (((pa >> UMC_V12_0_PA_CH1_BIT) & 0x1ULL) << 1ULL) | \ + (((pa >> UMC_V12_0_PA_CH2_BIT) & 0x1ULL) << 2ULL) | \ + (((pa >> UMC_V12_0_PA_CH3_BIT) & 0x1ULL) << 3ULL) | \ + (((pa >> UMC_V12_0_PA_CH4_BIT) & 0x1ULL) << 4ULL) | \ + (((pa >> UMC_V12_0_PA_CH5_BIT) & 0x1ULL) << 5ULL) | \ + (((pa >> UMC_V12_0_PA_CH6_BIT) & 0x1ULL) << 6ULL)) + +#define UMC_V12_0_SOC_PA_TO_PC(pa) (((pa >> UMC_V12_0_PA_PC0_BIT) & 0x1ULL) << 0ULL) + +#define UMC_V12_0_SOC_SID_TO_PA(sid) \ + ((((sid >> 0ULL) & 0x1ULL) << UMC_V12_0_PA_SID0_BIT) | \ + (((sid >> 1ULL) & 0x1ULL) << UMC_V12_0_PA_SID1_BIT)) + +#define UMC_V12_0_SOC_BANK_TO_PA(bank) \ + ((((bank >> 0ULL) & 0x1ULL) << UMC_V12_0_PA_B0_BIT) | \ + (((bank >> 1ULL) & 0x1ULL) << UMC_V12_0_PA_B1_BIT) | \ + (((bank >> 2ULL) & 0x1ULL) << UMC_V12_0_PA_B2_BIT) | \ + (((bank >> 3ULL) & 0x1ULL) << UMC_V12_0_PA_B3_BIT)) + +#define UMC_V12_0_SOC_ROW_TO_PA(row) \ + ((((row >> 0ULL) & 0x1ULL) << UMC_V12_0_PA_R0_BIT) | \ + (((row >> 1ULL) & 0x1ULL) << UMC_V12_0_PA_R1_BIT) | \ + (((row >> 2ULL) & 0x1ULL) << UMC_V12_0_PA_R2_BIT) | \ + (((row >> 3ULL) & 0x1ULL) << UMC_V12_0_PA_R3_BIT) | \ + (((row >> 4ULL) & 0x1ULL) << UMC_V12_0_PA_R4_BIT) | \ + (((row >> 5ULL) & 0x1ULL) << UMC_V12_0_PA_R5_BIT) | \ + (((row >> 6ULL) & 0x1ULL) << UMC_V12_0_PA_R6_BIT) | \ + (((row >> 7ULL) & 0x1ULL) << UMC_V12_0_PA_R7_BIT) | \ + (((row >> 8ULL) & 0x1ULL) << UMC_V12_0_PA_R8_BIT) | \ + (((row >> 9ULL) & 0x1ULL) << UMC_V12_0_PA_R9_BIT) | \ + (((row >> 10ULL) & 0x1ULL) << UMC_V12_0_PA_R10_BIT) | \ + (((row >> 11ULL) & 0x1ULL) << UMC_V12_0_PA_R11_BIT) | \ + (((row >> 12ULL) & 0x1ULL) << UMC_V12_0_PA_R12_BIT) | \ + (((row >> 13ULL) & 0x1ULL) << UMC_V12_0_PA_R13_BIT)) + +#define UMC_V12_0_SOC_COL_TO_PA(col) \ + ((((col >> 0ULL) & 0x1ULL) << UMC_V12_0_PA_C0_BIT) | \ + (((col >> 1ULL) & 0x1ULL) << UMC_V12_0_PA_C1_BIT) | \ + (((col >> 2ULL) & 0x1ULL) << UMC_V12_0_PA_C2_BIT) | \ + (((col >> 3ULL) & 0x1ULL) << UMC_V12_0_PA_C3_BIT) | \ + (((col >> 4ULL) & 0x1ULL) << UMC_V12_0_PA_C4_BIT)) + +#define UMC_V12_0_SOC_CH_TO_PA(ch) \ + ((((ch >> 0ULL) & 0x1ULL) << UMC_V12_0_PA_CH0_BIT) | \ + (((ch >> 1ULL) & 0x1ULL) << UMC_V12_0_PA_CH1_BIT) | \ + (((ch >> 2ULL) & 0x1ULL) << UMC_V12_0_PA_CH2_BIT) | \ + (((ch >> 3ULL) & 0x1ULL) << UMC_V12_0_PA_CH3_BIT) | \ + (((ch >> 4ULL) & 0x1ULL) << UMC_V12_0_PA_CH4_BIT) | \ + (((ch >> 5ULL) & 0x1ULL) << UMC_V12_0_PA_CH5_BIT) | \ + (((ch >> 6ULL) & 0x1ULL) << UMC_V12_0_PA_CH6_BIT)) + +#define UMC_V12_0_SOC_PC_TO_PA(pc) (((pc >> 0ULL) & 0x1ULL) << UMC_V12_0_PA_PC0_BIT) + +/* bank hash settings */ +#define UMC_V12_0_XOR_EN0 1 +#define UMC_V12_0_XOR_EN1 1 +#define UMC_V12_0_XOR_EN2 1 +#define UMC_V12_0_XOR_EN3 1 +#define UMC_V12_0_COL_XOR0 0x0 +#define UMC_V12_0_COL_XOR1 0x0 +#define UMC_V12_0_COL_XOR2 0x800 +#define UMC_V12_0_COL_XOR3 0x1000 +#define UMC_V12_0_ROW_XOR0 0x11111 +#define UMC_V12_0_ROW_XOR1 0x22222 +#define UMC_V12_0_ROW_XOR2 0x4444 +#define UMC_V12_0_ROW_XOR3 0x8888 + +/* channel hash settings */ +#define UMC_V12_0_HASH_4K 0 +#define UMC_V12_0_HASH_64K 1 +#define UMC_V12_0_HASH_2M 1 +#define UMC_V12_0_HASH_1G 1 +#define UMC_V12_0_HASH_1T 1 + +/* XOR some bits of PA into CH4~CH6 bits (bits 12~14 of PA), + * hash bit is only effective when related setting is enabled + */ +#define UMC_V12_0_CHANNEL_HASH_CH4(channel_idx, pa) ((((channel_idx) >> 5) & 0x1) ^ \ + (((pa) >> 20) & 0x1ULL & UMC_V12_0_HASH_64K) ^ \ + (((pa) >> 27) & 0x1ULL & UMC_V12_0_HASH_2M) ^ \ + (((pa) >> 34) & 0x1ULL & UMC_V12_0_HASH_1G) ^ \ + (((pa) >> 41) & 0x1ULL & UMC_V12_0_HASH_1T)) +#define UMC_V12_0_CHANNEL_HASH_CH5(channel_idx, pa) ((((channel_idx) >> 6) & 0x1) ^ \ + (((pa) >> 21) & 0x1ULL & UMC_V12_0_HASH_64K) ^ \ + (((pa) >> 28) & 0x1ULL & UMC_V12_0_HASH_2M) ^ \ + (((pa) >> 35) & 0x1ULL & UMC_V12_0_HASH_1G) ^ \ + (((pa) >> 42) & 0x1ULL & UMC_V12_0_HASH_1T)) +#define UMC_V12_0_CHANNEL_HASH_CH6(channel_idx, pa) ((((channel_idx) >> 4) & 0x1) ^ \ + (((pa) >> 19) & 0x1ULL & UMC_V12_0_HASH_64K) ^ \ + (((pa) >> 26) & 0x1ULL & UMC_V12_0_HASH_2M) ^ \ + (((pa) >> 33) & 0x1ULL & UMC_V12_0_HASH_1G) ^ \ + (((pa) >> 40) & 0x1ULL & UMC_V12_0_HASH_1T) ^ \ + (((pa) >> 47) & 0x1ULL & UMC_V12_0_HASH_1T)) +#define UMC_V12_0_SET_CHANNEL_HASH(channel_idx, pa) do { \ + (pa) &= ~(0x7ULL << UMC_V12_0_PA_CH4_BIT); \ + (pa) |= (UMC_V12_0_CHANNEL_HASH_CH4(channel_idx, pa) << UMC_V12_0_PA_CH4_BIT); \ + (pa) |= (UMC_V12_0_CHANNEL_HASH_CH5(channel_idx, pa) << UMC_V12_0_PA_CH5_BIT); \ + (pa) |= (UMC_V12_0_CHANNEL_HASH_CH6(channel_idx, pa) << UMC_V12_0_PA_CH6_BIT); \ + } while (0) + + +/* + * (addr / 256) * 4096, the higher 26 bits in ErrorAddr + * is the index of 4KB block + */ +#define ADDR_OF_4KB_BLOCK(addr) (((addr) & ~0xffULL) << 4) +/* + * (addr / 256) * 8192, the higher 26 bits in ErrorAddr + * is the index of 8KB block + */ +#define ADDR_OF_8KB_BLOCK(addr) (((addr) & ~0xffULL) << 5) +/* + * (addr / 256) * 32768, the higher 26 bits in ErrorAddr + * is the index of 8KB block + */ +#define ADDR_OF_32KB_BLOCK(addr) (((addr) & ~0xffULL) << 7) +/* channel index is the index of 256B block */ +#define ADDR_OF_256B_BLOCK(channel_index) ((channel_index) << 8) +/* offset in 256B block */ +#define OFFSET_IN_256B_BLOCK(addr) ((addr) & 0xffULL) + + +#define UMC_V12_ADDR_MASK_BAD_COLS(addr) \ + ((addr) & ~((0x3ULL << UMC_V12_0_PA_C2_BIT) | \ + (0x1ULL << UMC_V12_0_PA_C4_BIT) | \ + (0x1ULL << UMC_V12_0_PA_R13_BIT))) + +#define ACA_IPID_HI_2_UMC_AID(_ipid_hi) (((_ipid_hi) >> 2) & 0x3) +#define ACA_IPID_LO_2_UMC_CH(_ipid_lo) \ + (((((_ipid_lo) >> 20) & 0x1) * 4) + (((_ipid_lo) >> 12) & 0xF)) +#define ACA_IPID_LO_2_UMC_INST(_ipid_lo) (((_ipid_lo) >> 21) & 0x7) + +#define ACA_IPID_2_DIE_ID(ipid) ((REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdHi) >> 2) & 0x03) +#define ACA_IPID_2_UMC_CH(ipid) \ + (ACA_IPID_LO_2_UMC_CH(REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdLo))) + +#define ACA_IPID_2_UMC_INST(ipid) \ + (ACA_IPID_LO_2_UMC_INST(REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdLo))) + +#define ACA_IPID_2_SOCKET_ID(ipid) \ + (((REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdLo) & 0x1) << 2) | \ + (REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdHi) & 0x03)) + +#define ACA_ADDR_2_ERR_ADDR(addr) \ + REG_GET_FIELD(addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr) + +/* R13 bit shift should be considered, double the number */ +#define UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL (UMC_V12_0_NA_MAP_PA_NUM * 2) + + +/* C2, C3, C4, R13, four MCA bits are looped in page retirement */ +#define UMC_V12_0_RETIRE_LOOP_BITS 4 + +/* invalid node instance value */ +#define UMC_INV_AID_NODE 0xffff + +#define UMC_V12_0_AID_NUM_MAX 4 +#define UMC_V12_0_SOCKET_NUM_MAX 8 + +#define UMC_V12_0_TOTAL_CHANNEL_NUM \ + (UMC_V12_0_AID_NUM_MAX * UMC_V12_0_UMC_INSTANCE_NUM * UMC_V12_0_CHANNEL_INSTANCE_NUM) + +/* one device has 192GB HBM */ +#define SOCKET_LFB_SIZE 0x3000000000ULL + +extern const struct ras_umc_ip_func ras_umc_func_v12_0; + +int ras_umc_get_badpage_count(struct ras_core_context *ras_core); +int ras_umc_get_badpage_record(struct ras_core_context *ras_core, uint32_t index, void *record); +#endif + |
