diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 348 |
1 files changed, 264 insertions, 84 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 540817e296da..e0ee21150860 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -122,12 +122,15 @@ const char *get_ras_block_str(struct ras_common_if *ras_block) /* typical ECC bad page rate is 1 bad page per 100MB VRAM */ #define RAS_BAD_PAGE_COVER (100 * 1024 * 1024ULL) -#define MAX_UMC_POISON_POLLING_TIME_ASYNC 300 //ms +#define MAX_UMC_POISON_POLLING_TIME_ASYNC 10 #define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100 //ms #define MAX_FLUSH_RETIRE_DWORK_TIMES 100 +#define BYPASS_ALLOCATED_ADDRESS 0x0 +#define BYPASS_INITIALIZATION_ADDRESS 0x1 + enum amdgpu_ras_retire_page_reservation { AMDGPU_RAS_RETIRE_PAGE_RESERVED, AMDGPU_RAS_RETIRE_PAGE_PENDING, @@ -136,10 +139,14 @@ enum amdgpu_ras_retire_page_reservation { atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0); -static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con, +static int amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con, uint64_t addr); -static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev, +static int amdgpu_ras_check_bad_page(struct amdgpu_device *adev, uint64_t addr); + +static void amdgpu_ras_critical_region_init(struct amdgpu_device *adev); +static void amdgpu_ras_critical_region_fini(struct amdgpu_device *adev); + #ifdef CONFIG_X86_MCE_AMD static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev); struct mce_notifier_adev_list { @@ -169,18 +176,16 @@ static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t addre struct eeprom_table_record err_rec; int ret; - if ((address >= adev->gmc.mc_vram_size) || - (address >= RAS_UMC_INJECT_ADDR_LIMIT)) { + ret = amdgpu_ras_check_bad_page(adev, address); + if (ret == -EINVAL) { dev_warn(adev->dev, - "RAS WARN: input address 0x%llx is invalid.\n", - address); + "RAS WARN: input address 0x%llx is invalid.\n", + address); return -EINVAL; - } - - if (amdgpu_ras_check_bad_page(adev, address)) { + } else if (ret == 1) { dev_warn(adev->dev, - "RAS WARN: 0x%llx has already been marked as bad page!\n", - address); + "RAS WARN: 0x%llx has already been marked as bad page!\n", + address); return 0; } @@ -207,6 +212,56 @@ static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t addre return 0; } +static int amdgpu_check_address_validity(struct amdgpu_device *adev, + uint64_t address, uint64_t flags) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct amdgpu_vram_block_info blk_info; + uint64_t page_pfns[32] = {0}; + int i, ret, count; + bool hit = false; + + if (amdgpu_ip_version(adev, UMC_HWIP, 0) < IP_VERSION(12, 0, 0)) + return 0; + + if (amdgpu_sriov_vf(adev)) { + if (amdgpu_virt_check_vf_critical_region(adev, address, &hit)) + return -EPERM; + return hit ? -EACCES : 0; + } + + if ((address >= adev->gmc.mc_vram_size) || + (address >= RAS_UMC_INJECT_ADDR_LIMIT)) + return -EFAULT; + + count = amdgpu_umc_lookup_bad_pages_in_a_row(adev, + address, page_pfns, ARRAY_SIZE(page_pfns)); + if (count <= 0) + return -EPERM; + + for (i = 0; i < count; i++) { + memset(&blk_info, 0, sizeof(blk_info)); + ret = amdgpu_vram_mgr_query_address_block_info(&adev->mman.vram_mgr, + page_pfns[i] << AMDGPU_GPU_PAGE_SHIFT, &blk_info); + if (!ret) { + /* The input address that needs to be checked is allocated by + * current calling process, so it is necessary to exclude + * the calling process. + */ + if ((flags == BYPASS_ALLOCATED_ADDRESS) && + ((blk_info.task.pid != task_pid_nr(current)) || + strncmp(blk_info.task.comm, current->comm, TASK_COMM_LEN))) + return -EACCES; + else if ((flags == BYPASS_INITIALIZATION_ADDRESS) && + (blk_info.task.pid == con->init_task_pid) && + !strncmp(blk_info.task.comm, con->init_task_comm, TASK_COMM_LEN)) + return -EACCES; + } + } + + return 0; +} + static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf, size_t size, loff_t *pos) { @@ -297,6 +352,8 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, op = 2; else if (strstr(str, "retire_page") != NULL) op = 3; + else if (strstr(str, "check_address") != NULL) + op = 4; else if (str[0] && str[1] && str[2] && str[3]) /* ascii string, but commands are not matched. */ return -EINVAL; @@ -311,6 +368,15 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, data->inject.address = address; return 0; + } else if (op == 4) { + if (sscanf(str, "%*s 0x%llx 0x%llx", &address, &value) != 2 && + sscanf(str, "%*s %llu %llu", &address, &value) != 2) + return -EINVAL; + + data->op = op; + data->inject.address = address; + data->inject.value = value; + return 0; } if (amdgpu_ras_find_block_id_by_name(block_name, &block_id)) @@ -500,6 +566,9 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, return size; else return ret; + } else if (data.op == 4) { + ret = amdgpu_check_address_validity(adev, data.inject.address, data.inject.value); + return ret ? ret : size; } if (!amdgpu_ras_is_supported(adev, data.head.block)) @@ -513,22 +582,16 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, ret = amdgpu_ras_feature_enable(adev, &data.head, 1); break; case 2: - if ((data.inject.address >= adev->gmc.mc_vram_size && - adev->gmc.mc_vram_size) || - (data.inject.address >= RAS_UMC_INJECT_ADDR_LIMIT)) { - dev_warn(adev->dev, "RAS WARN: input address " - "0x%llx is invalid.", + /* umc ce/ue error injection for a bad page is not allowed */ + if (data.head.block == AMDGPU_RAS_BLOCK__UMC) + ret = amdgpu_ras_check_bad_page(adev, data.inject.address); + if (ret == -EINVAL) { + dev_warn(adev->dev, "RAS WARN: input address 0x%llx is invalid.", data.inject.address); - ret = -EINVAL; break; - } - - /* umc ce/ue error injection for a bad page is not allowed */ - if ((data.head.block == AMDGPU_RAS_BLOCK__UMC) && - amdgpu_ras_check_bad_page(adev, data.inject.address)) { - dev_warn(adev->dev, "RAS WARN: inject: 0x%llx has " - "already been marked as bad!\n", - data.inject.address); + } else if (ret == 1) { + dev_warn(adev->dev, "RAS WARN: inject: 0x%llx has already been marked as bad!\n", + data.inject.address); break; } @@ -2566,18 +2629,26 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device *adev, goto out; } - *bps = kmalloc(sizeof(struct ras_badpage) * data->count, GFP_KERNEL); + *bps = kmalloc_array(data->count, sizeof(struct ras_badpage), GFP_KERNEL); if (!*bps) { ret = -ENOMEM; goto out; } for (; i < data->count; i++) { + if (!data->bps[i].ts) + continue; + (*bps)[i] = (struct ras_badpage){ .bp = data->bps[i].retired_page, .size = AMDGPU_GPU_PAGE_SIZE, .flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED, }; + + if (amdgpu_ras_check_critical_address(adev, + data->bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT)) + continue; + status = amdgpu_vram_mgr_query_page_status(&adev->mman.vram_mgr, data->bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT); if (status == -EBUSY) @@ -2586,7 +2657,7 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device *adev, (*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_FAULT; } - *count = data->count; + *count = con->bad_page_num; out: mutex_unlock(&con->recovery_lock); return ret; @@ -2638,6 +2709,7 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) struct amdgpu_device *adev = ras->adev; struct list_head device_list, *device_list_handle = NULL; struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); + unsigned int error_query_mode; enum ras_event_type type; if (hive) { @@ -2666,6 +2738,13 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) device_list_handle = &device_list; } + if (amdgpu_ras_get_error_query_mode(adev, &error_query_mode)) { + if (error_query_mode == AMDGPU_RAS_FIRMWARE_ERROR_QUERY) { + /* wait 500ms to ensure pmfw polling mca bank info done */ + msleep(500); + } + } + type = amdgpu_ras_get_fatal_error_event(adev); list_for_each_entry(remote_adev, device_list_handle, gmc.xgmi.head) { @@ -2722,7 +2801,7 @@ static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev, unsigned int old_space = data->count + data->space_left; unsigned int new_space = old_space + pages; unsigned int align_space = ALIGN(new_space, 512); - void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL); + void *bps = kmalloc_array(align_space, sizeof(*data->bps), GFP_KERNEL); if (!bps) { return -ENOMEM; @@ -2814,8 +2893,11 @@ static int __amdgpu_ras_restore_bad_pages(struct amdgpu_device *adev, for (j = 0; j < count; j++) { if (amdgpu_ras_check_bad_page_unlock(con, - bps[j].retired_page << AMDGPU_GPU_PAGE_SHIFT)) + bps[j].retired_page << AMDGPU_GPU_PAGE_SHIFT)) { + data->count++; + data->space_left--; continue; + } if (!data->space_left && amdgpu_ras_realloc_eh_data_space(adev, data, 256)) { @@ -2828,6 +2910,7 @@ static int __amdgpu_ras_restore_bad_pages(struct amdgpu_device *adev, sizeof(struct eeprom_table_record)); data->count++; data->space_left--; + con->bad_page_num++; } return 0; @@ -2974,7 +3057,7 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, ret = __amdgpu_ras_convert_rec_array_from_rom(adev, &bps[i], &err_data, nps); if (ret) - control->ras_num_bad_pages -= adev->umc.retire_unit; + con->bad_page_num -= adev->umc.retire_unit; i += (adev->umc.retire_unit - 1); } else { break; @@ -2988,8 +3071,10 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, ret = __amdgpu_ras_convert_rec_from_rom(adev, &bps[i], &err_data, nps); if (ret) - control->ras_num_bad_pages -= adev->umc.retire_unit; + con->bad_page_num -= adev->umc.retire_unit; } + + con->eh_data->count_saved = con->eh_data->count; } else { ret = __amdgpu_ras_restore_bad_pages(adev, bps, pages); } @@ -3012,7 +3097,7 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev, struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_err_handler_data *data; struct amdgpu_ras_eeprom_control *control; - int save_count, unit_num, bad_page_num, i; + int save_count, unit_num, i; if (!con || !con->eh_data) { if (new_cnt) @@ -3033,27 +3118,26 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev, mutex_lock(&con->recovery_lock); control = &con->eeprom_control; data = con->eh_data; - bad_page_num = control->ras_num_bad_pages; - save_count = data->count - bad_page_num; + unit_num = data->count / adev->umc.retire_unit - control->ras_num_recs; + save_count = con->bad_page_num - control->ras_num_bad_pages; mutex_unlock(&con->recovery_lock); - unit_num = save_count / adev->umc.retire_unit; if (new_cnt) *new_cnt = unit_num; /* only new entries are saved */ - if (save_count > 0) { + if (unit_num > 0) { /*old asics only save pa to eeprom like before*/ if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) < 12) { if (amdgpu_ras_eeprom_append(control, - &data->bps[bad_page_num], save_count)) { + &data->bps[data->count_saved], unit_num)) { dev_err(adev->dev, "Failed to save EEPROM table data!"); return -EIO; } } else { for (i = 0; i < unit_num; i++) { if (amdgpu_ras_eeprom_append(control, - &data->bps[bad_page_num + + &data->bps[data->count_saved + i * adev->umc.retire_unit], 1)) { dev_err(adev->dev, "Failed to save EEPROM table data!"); return -EIO; @@ -3062,6 +3146,7 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev, } dev_info(adev->dev, "Saved %d pages to EEPROM table.\n", save_count); + data->count_saved = data->count; } return 0; @@ -3116,17 +3201,17 @@ static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) } } + ret = amdgpu_ras_add_bad_pages(adev, bps, control->ras_num_recs, true); + if (ret) + goto out; + ret = amdgpu_ras_eeprom_check(control); if (ret) goto out; /* HW not usable */ - if (amdgpu_ras_is_rma(adev)) { + if (amdgpu_ras_is_rma(adev)) ret = -EHWPOISON; - goto out; - } - - ret = amdgpu_ras_add_bad_pages(adev, bps, control->ras_num_recs, true); } out: @@ -3134,18 +3219,24 @@ out: return ret; } -static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con, +static int amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con, uint64_t addr) { struct ras_err_handler_data *data = con->eh_data; + struct amdgpu_device *adev = con->adev; int i; + if ((addr >= adev->gmc.mc_vram_size && + adev->gmc.mc_vram_size) || + (addr >= RAS_UMC_INJECT_ADDR_LIMIT)) + return -EINVAL; + addr >>= AMDGPU_GPU_PAGE_SHIFT; for (i = 0; i < data->count; i++) if (addr == data->bps[i].retired_page) - return true; + return 1; - return false; + return 0; } /* @@ -3153,11 +3244,11 @@ static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con, * * Note: this check is only for umc block */ -static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev, +static int amdgpu_ras_check_bad_page(struct amdgpu_device *adev, uint64_t addr) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - bool ret = false; + int ret = 0; if (!con || !con->eh_data) return ret; @@ -3241,7 +3332,7 @@ static void amdgpu_ras_ecc_log_init(struct ras_ecc_log_info *ecc_log) INIT_RADIX_TREE(&ecc_log->de_page_tree, GFP_KERNEL); ecc_log->de_queried_count = 0; - ecc_log->prev_de_queried_count = 0; + ecc_log->consumption_q_count = 0; } static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log) @@ -3261,7 +3352,7 @@ static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log) mutex_destroy(&ecc_log->lock); ecc_log->de_queried_count = 0; - ecc_log->prev_de_queried_count = 0; + ecc_log->consumption_q_count = 0; } static bool amdgpu_ras_schedule_retirement_dwork(struct amdgpu_ras *con, @@ -3287,7 +3378,6 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work) page_retirement_dwork.work); struct amdgpu_device *adev = con->adev; struct ras_err_data err_data; - unsigned long err_cnt; /* If gpu reset is ongoing, delay retiring the bad pages */ if (amdgpu_in_reset(adev) || amdgpu_ras_in_recovery(adev)) { @@ -3299,13 +3389,9 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work) amdgpu_ras_error_data_init(&err_data); amdgpu_umc_handle_bad_pages(adev, &err_data); - err_cnt = err_data.err_addr_cnt; amdgpu_ras_error_data_fini(&err_data); - if (err_cnt && amdgpu_ras_is_rma(adev)) - amdgpu_ras_reset_gpu(adev); - amdgpu_ras_schedule_retirement_dwork(con, AMDGPU_RAS_RETIRE_PAGE_INTERVAL); } @@ -3316,49 +3402,39 @@ static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev, int ret = 0; struct ras_ecc_log_info *ecc_log; struct ras_query_if info; - uint32_t timeout = 0; + u32 timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC; struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); - uint64_t de_queried_count; - uint32_t new_detect_count, total_detect_count; - uint32_t need_query_count = poison_creation_count; + u64 de_queried_count; + u64 consumption_q_count; enum ras_event_type type = RAS_EVENT_TYPE_POISON_CREATION; memset(&info, 0, sizeof(info)); info.head.block = AMDGPU_RAS_BLOCK__UMC; ecc_log = &ras->umc_ecc_log; - total_detect_count = 0; + ecc_log->de_queried_count = 0; + ecc_log->consumption_q_count = 0; + do { ret = amdgpu_ras_query_error_status_with_event(adev, &info, type); if (ret) return ret; de_queried_count = ecc_log->de_queried_count; - if (de_queried_count > ecc_log->prev_de_queried_count) { - new_detect_count = de_queried_count - ecc_log->prev_de_queried_count; - ecc_log->prev_de_queried_count = de_queried_count; - timeout = 0; - } else { - new_detect_count = 0; - } + consumption_q_count = ecc_log->consumption_q_count; - if (new_detect_count) { - total_detect_count += new_detect_count; - } else { - if (!timeout && need_query_count) - timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC; + if (de_queried_count && consumption_q_count) + break; - if (timeout) { - if (!--timeout) - break; - msleep(1); - } - } - } while (total_detect_count < need_query_count); + msleep(100); + } while (--timeout); - if (total_detect_count) + if (de_queried_count) schedule_delayed_work(&ras->page_retirement_dwork, 0); + if (amdgpu_ras_is_rma(adev) && atomic_cmpxchg(&ras->rma_in_recovery, 0, 1) == 0) + amdgpu_ras_reset_gpu(adev); + return 0; } @@ -3394,6 +3470,12 @@ static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev, reset_flags |= msg.reset; } + /* + * Try to ensure poison creation handler is completed first + * to set rma if bad page exceed threshold. + */ + flush_delayed_work(&con->page_retirement_dwork); + /* for RMA, amdgpu_ras_poison_creation_handler will trigger gpu reset */ if (reset_flags && !amdgpu_ras_is_rma(adev)) { if (reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET) @@ -3403,8 +3485,6 @@ static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev, else reset = reset_flags; - flush_delayed_work(&con->page_retirement_dwork); - con->gpu_reset_flags |= reset; amdgpu_ras_reset_gpu(adev); @@ -3434,6 +3514,7 @@ static int amdgpu_ras_page_retirement_thread(void *param) if (kthread_should_stop()) break; + mutex_lock(&con->poison_lock); gpu_reset = 0; do { @@ -3446,7 +3527,8 @@ static int amdgpu_ras_page_retirement_thread(void *param) atomic_sub(poison_creation_count, &con->poison_creation_count); atomic_sub(poison_creation_count, &con->page_retirement_req_cnt); } - } while (atomic_read(&con->poison_creation_count)); + } while (atomic_read(&con->poison_creation_count) && + !atomic_read(&con->poison_consumption_count)); if (ret != -EIO) { msg_count = kfifo_len(&con->poison_fifo); @@ -3463,6 +3545,7 @@ static int amdgpu_ras_page_retirement_thread(void *param) /* gpu mode-1 reset is ongoing or just completed ras mode-1 reset */ /* Clear poison creation request */ atomic_set(&con->poison_creation_count, 0); + atomic_set(&con->poison_consumption_count, 0); /* Clear poison fifo */ amdgpu_ras_clear_poison_fifo(adev); @@ -3487,9 +3570,12 @@ static int amdgpu_ras_page_retirement_thread(void *param) atomic_sub(msg_count, &con->page_retirement_req_cnt); } + atomic_set(&con->poison_consumption_count, 0); + /* Wake up work to save bad pages to eeprom */ schedule_delayed_work(&con->page_retirement_dwork, 0); } + mutex_unlock(&con->poison_lock); } return 0; @@ -3570,8 +3656,10 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info) } mutex_init(&con->recovery_lock); + mutex_init(&con->poison_lock); INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery); atomic_set(&con->in_recovery, 0); + atomic_set(&con->rma_in_recovery, 0); con->eeprom_control.bad_channel_bitmap = 0; max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count(&con->eeprom_control); @@ -3589,6 +3677,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info) init_waitqueue_head(&con->page_retirement_wq); atomic_set(&con->page_retirement_req_cnt, 0); atomic_set(&con->poison_creation_count, 0); + atomic_set(&con->poison_consumption_count, 0); con->page_retirement_thread = kthread_run(amdgpu_ras_page_retirement_thread, adev, "umc_page_retirement"); if (IS_ERR(con->page_retirement_thread)) { @@ -3661,6 +3750,8 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev) kfree(data); mutex_unlock(&con->recovery_lock); + amdgpu_ras_critical_region_init(adev); + return 0; } /* recovery end */ @@ -4087,6 +4178,12 @@ int amdgpu_ras_init(struct amdgpu_device *adev) goto release_con; } + con->init_task_pid = task_pid_nr(current); + get_task_comm(con->init_task_comm, current); + + mutex_init(&con->critical_region_lock); + INIT_LIST_HEAD(&con->critical_region_head); + dev_info(adev->dev, "RAS INFO: ras initialized successfully, " "hardware ability[%x] ras_mask[%x]\n", adev->ras_hw_enabled, adev->ras_enabled); @@ -4366,6 +4463,9 @@ int amdgpu_ras_fini(struct amdgpu_device *adev) if (!adev->ras_enabled || !con) return 0; + amdgpu_ras_critical_region_fini(adev); + mutex_destroy(&con->critical_region_lock); + list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) { if (ras_node->ras_obj) { obj = ras_node->ras_obj; @@ -5274,6 +5374,9 @@ int amdgpu_ras_reserve_page(struct amdgpu_device *adev, uint64_t pfn) uint64_t start = pfn << AMDGPU_GPU_PAGE_SHIFT; int ret = 0; + if (amdgpu_ras_check_critical_address(adev, start)) + return 0; + mutex_lock(&con->page_rsv_lock); ret = amdgpu_vram_mgr_query_page_status(mgr, start); if (ret == -ENOENT) @@ -5310,3 +5413,80 @@ bool amdgpu_ras_is_rma(struct amdgpu_device *adev) return con->is_rma; } + +int amdgpu_ras_add_critical_region(struct amdgpu_device *adev, + struct amdgpu_bo *bo) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct amdgpu_vram_mgr_resource *vres; + struct ras_critical_region *region; + struct drm_buddy_block *block; + int ret = 0; + + if (!bo || !bo->tbo.resource) + return -EINVAL; + + vres = to_amdgpu_vram_mgr_resource(bo->tbo.resource); + + mutex_lock(&con->critical_region_lock); + + /* Check if the bo had been recorded */ + list_for_each_entry(region, &con->critical_region_head, node) + if (region->bo == bo) + goto out; + + /* Record new critical amdgpu bo */ + list_for_each_entry(block, &vres->blocks, link) { + region = kzalloc(sizeof(*region), GFP_KERNEL); + if (!region) { + ret = -ENOMEM; + goto out; + } + region->bo = bo; + region->start = amdgpu_vram_mgr_block_start(block); + region->size = amdgpu_vram_mgr_block_size(block); + list_add_tail(®ion->node, &con->critical_region_head); + } + +out: + mutex_unlock(&con->critical_region_lock); + + return ret; +} + +static void amdgpu_ras_critical_region_init(struct amdgpu_device *adev) +{ + amdgpu_ras_add_critical_region(adev, adev->mman.fw_reserved_memory); +} + +static void amdgpu_ras_critical_region_fini(struct amdgpu_device *adev) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct ras_critical_region *region, *tmp; + + mutex_lock(&con->critical_region_lock); + list_for_each_entry_safe(region, tmp, &con->critical_region_head, node) { + list_del(®ion->node); + kfree(region); + } + mutex_unlock(&con->critical_region_lock); +} + +bool amdgpu_ras_check_critical_address(struct amdgpu_device *adev, uint64_t addr) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct ras_critical_region *region; + bool ret = false; + + mutex_lock(&con->critical_region_lock); + list_for_each_entry(region, &con->critical_region_head, node) { + if ((region->start <= addr) && + (addr < (region->start + region->size))) { + ret = true; + break; + } + } + mutex_unlock(&con->critical_region_lock); + + return ret; +} |