summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu.h1
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c5
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c8
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_device.c7
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c18
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c8
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c54
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c2
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c7
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c20
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h1
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c2
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h2
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c4
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c3
-rw-r--r--drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c2
-rw-r--r--drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c2
-rw-r--r--drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c7
-rw-r--r--drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c7
-rw-r--r--drivers/gpu/drm/amd/amdgpu/mes_userqueue.c6
-rw-r--r--drivers/gpu/drm/amd/amdgpu/mes_v11_0.c8
-rw-r--r--drivers/gpu/drm/amd/amdgpu/mes_v12_0.c15
22 files changed, 135 insertions, 54 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 2a0df4cabb99..6f5b4a0e0a34 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1290,6 +1290,7 @@ struct amdgpu_device {
bool debug_disable_gpu_ring_reset;
bool debug_vm_userptr;
bool debug_disable_ce_logs;
+ bool debug_enable_ce_cs;
/* Protection for the following isolation structure */
struct mutex enforce_isolation_mutex;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 83020963dfde..a2ca9acf8c4e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -2329,10 +2329,9 @@ void amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct kgd_mem *mem)
int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct amdgpu_device *adev,
struct kfd_vm_fault_info *mem)
{
- if (atomic_read(&adev->gmc.vm_fault_info_updated) == 1) {
+ if (atomic_read_acquire(&adev->gmc.vm_fault_info_updated) == 1) {
*mem = *adev->gmc.vm_fault_info;
- mb(); /* make sure read happened */
- atomic_set(&adev->gmc.vm_fault_info_updated, 0);
+ atomic_set_release(&adev->gmc.vm_fault_info_updated, 0);
}
return 0;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 9cd7741d2254..2f6a96af7fb1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -364,6 +364,12 @@ static int amdgpu_cs_p2_ib(struct amdgpu_cs_parser *p,
if (p->uf_bo && ring->funcs->no_user_fence)
return -EINVAL;
+ if (!p->adev->debug_enable_ce_cs &&
+ chunk_ib->flags & AMDGPU_IB_FLAG_CE) {
+ dev_err_ratelimited(p->adev->dev, "CE CS is blocked, use debug=0x400 to override\n");
+ return -EINVAL;
+ }
+
if (chunk_ib->ip_type == AMDGPU_HW_IP_GFX &&
chunk_ib->flags & AMDGPU_IB_FLAG_PREEMPT) {
if (chunk_ib->flags & AMDGPU_IB_FLAG_CE)
@@ -702,7 +708,7 @@ static void amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev,
*/
const s64 us_upper_bound = 200000;
- if (!adev->mm_stats.log2_max_MBps) {
+ if ((!adev->mm_stats.log2_max_MBps) || !ttm_resource_manager_used(&adev->mman.vram_mgr.manager)) {
*max_bytes = 0;
*max_vis_bytes = 0;
return;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 7a899fb4de29..3d032c4e2dce 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1882,6 +1882,13 @@ static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device
static bool amdgpu_device_aspm_support_quirk(struct amdgpu_device *adev)
{
+ /* Enabling ASPM causes randoms hangs on Tahiti and Oland on Zen4.
+ * It's unclear if this is a platform-specific or GPU-specific issue.
+ * Disable ASPM on SI for the time being.
+ */
+ if (adev->family == AMDGPU_FAMILY_SI)
+ return true;
+
#if IS_ENABLED(CONFIG_X86)
struct cpuinfo_x86 *c = &cpu_data(0);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
index 73401f0aeb34..dd7b2b796427 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
@@ -1033,7 +1033,9 @@ static uint8_t amdgpu_discovery_get_harvest_info(struct amdgpu_device *adev,
/* Until a uniform way is figured, get mask based on hwid */
switch (hw_id) {
case VCN_HWID:
- harvest = ((1 << inst) & adev->vcn.inst_mask) == 0;
+ /* VCN vs UVD+VCE */
+ if (!amdgpu_ip_version(adev, VCE_HWIP, 0))
+ harvest = ((1 << inst) & adev->vcn.inst_mask) == 0;
break;
case DMU_HWID:
if (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)
@@ -2565,7 +2567,9 @@ int amdgpu_discovery_set_ip_blocks(struct amdgpu_device *adev)
amdgpu_discovery_init(adev);
vega10_reg_base_init(adev);
adev->sdma.num_instances = 2;
+ adev->sdma.sdma_mask = 3;
adev->gmc.num_umc = 4;
+ adev->gfx.xcc_mask = 1;
adev->ip_versions[MMHUB_HWIP][0] = IP_VERSION(9, 0, 0);
adev->ip_versions[ATHUB_HWIP][0] = IP_VERSION(9, 0, 0);
adev->ip_versions[OSSSYS_HWIP][0] = IP_VERSION(4, 0, 0);
@@ -2592,7 +2596,9 @@ int amdgpu_discovery_set_ip_blocks(struct amdgpu_device *adev)
amdgpu_discovery_init(adev);
vega10_reg_base_init(adev);
adev->sdma.num_instances = 2;
+ adev->sdma.sdma_mask = 3;
adev->gmc.num_umc = 4;
+ adev->gfx.xcc_mask = 1;
adev->ip_versions[MMHUB_HWIP][0] = IP_VERSION(9, 3, 0);
adev->ip_versions[ATHUB_HWIP][0] = IP_VERSION(9, 3, 0);
adev->ip_versions[OSSSYS_HWIP][0] = IP_VERSION(4, 0, 1);
@@ -2619,8 +2625,10 @@ int amdgpu_discovery_set_ip_blocks(struct amdgpu_device *adev)
amdgpu_discovery_init(adev);
vega10_reg_base_init(adev);
adev->sdma.num_instances = 1;
+ adev->sdma.sdma_mask = 1;
adev->vcn.num_vcn_inst = 1;
adev->gmc.num_umc = 2;
+ adev->gfx.xcc_mask = 1;
if (adev->apu_flags & AMD_APU_IS_RAVEN2) {
adev->ip_versions[MMHUB_HWIP][0] = IP_VERSION(9, 2, 0);
adev->ip_versions[ATHUB_HWIP][0] = IP_VERSION(9, 2, 0);
@@ -2665,7 +2673,9 @@ int amdgpu_discovery_set_ip_blocks(struct amdgpu_device *adev)
amdgpu_discovery_init(adev);
vega20_reg_base_init(adev);
adev->sdma.num_instances = 2;
+ adev->sdma.sdma_mask = 3;
adev->gmc.num_umc = 8;
+ adev->gfx.xcc_mask = 1;
adev->ip_versions[MMHUB_HWIP][0] = IP_VERSION(9, 4, 0);
adev->ip_versions[ATHUB_HWIP][0] = IP_VERSION(9, 4, 0);
adev->ip_versions[OSSSYS_HWIP][0] = IP_VERSION(4, 2, 0);
@@ -2693,8 +2703,10 @@ int amdgpu_discovery_set_ip_blocks(struct amdgpu_device *adev)
amdgpu_discovery_init(adev);
arct_reg_base_init(adev);
adev->sdma.num_instances = 8;
+ adev->sdma.sdma_mask = 0xff;
adev->vcn.num_vcn_inst = 2;
adev->gmc.num_umc = 8;
+ adev->gfx.xcc_mask = 1;
adev->ip_versions[MMHUB_HWIP][0] = IP_VERSION(9, 4, 1);
adev->ip_versions[ATHUB_HWIP][0] = IP_VERSION(9, 4, 1);
adev->ip_versions[OSSSYS_HWIP][0] = IP_VERSION(4, 2, 1);
@@ -2726,8 +2738,10 @@ int amdgpu_discovery_set_ip_blocks(struct amdgpu_device *adev)
amdgpu_discovery_init(adev);
aldebaran_reg_base_init(adev);
adev->sdma.num_instances = 5;
+ adev->sdma.sdma_mask = 0x1f;
adev->vcn.num_vcn_inst = 2;
adev->gmc.num_umc = 4;
+ adev->gfx.xcc_mask = 1;
adev->ip_versions[MMHUB_HWIP][0] = IP_VERSION(9, 4, 2);
adev->ip_versions[ATHUB_HWIP][0] = IP_VERSION(9, 4, 2);
adev->ip_versions[OSSSYS_HWIP][0] = IP_VERSION(4, 4, 0);
@@ -2762,6 +2776,8 @@ int amdgpu_discovery_set_ip_blocks(struct amdgpu_device *adev)
} else {
cyan_skillfish_reg_base_init(adev);
adev->sdma.num_instances = 2;
+ adev->sdma.sdma_mask = 3;
+ adev->gfx.xcc_mask = 1;
adev->ip_versions[MMHUB_HWIP][0] = IP_VERSION(2, 0, 3);
adev->ip_versions[ATHUB_HWIP][0] = IP_VERSION(2, 0, 3);
adev->ip_versions[OSSSYS_HWIP][0] = IP_VERSION(5, 0, 1);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index bff25ef3e2d0..61268aa82df4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -144,7 +144,8 @@ enum AMDGPU_DEBUG_MASK {
AMDGPU_DEBUG_DISABLE_GPU_RING_RESET = BIT(6),
AMDGPU_DEBUG_SMU_POOL = BIT(7),
AMDGPU_DEBUG_VM_USERPTR = BIT(8),
- AMDGPU_DEBUG_DISABLE_RAS_CE_LOG = BIT(9)
+ AMDGPU_DEBUG_DISABLE_RAS_CE_LOG = BIT(9),
+ AMDGPU_DEBUG_ENABLE_CE_CS = BIT(10)
};
unsigned int amdgpu_vram_limit = UINT_MAX;
@@ -2289,6 +2290,11 @@ static void amdgpu_init_debug_options(struct amdgpu_device *adev)
pr_info("debug: disable kernel logs of correctable errors\n");
adev->debug_disable_ce_logs = true;
}
+
+ if (amdgpu_debug_mask & AMDGPU_DEBUG_ENABLE_CE_CS) {
+ pr_info("debug: allowing command submission to CE engine\n");
+ adev->debug_enable_ce_cs = true;
+ }
}
static unsigned long amdgpu_fix_asic_type(struct pci_dev *pdev, unsigned long flags)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index fd8cca241da6..18a7829122d2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -758,11 +758,42 @@ void amdgpu_fence_driver_force_completion(struct amdgpu_ring *ring)
* @fence: fence of the ring to signal
*
*/
-void amdgpu_fence_driver_guilty_force_completion(struct amdgpu_fence *fence)
+void amdgpu_fence_driver_guilty_force_completion(struct amdgpu_fence *af)
{
- dma_fence_set_error(&fence->base, -ETIME);
- amdgpu_fence_write(fence->ring, fence->seq);
- amdgpu_fence_process(fence->ring);
+ struct dma_fence *unprocessed;
+ struct dma_fence __rcu **ptr;
+ struct amdgpu_fence *fence;
+ struct amdgpu_ring *ring = af->ring;
+ unsigned long flags;
+ u32 seq, last_seq;
+
+ last_seq = amdgpu_fence_read(ring) & ring->fence_drv.num_fences_mask;
+ seq = ring->fence_drv.sync_seq & ring->fence_drv.num_fences_mask;
+
+ /* mark all fences from the guilty context with an error */
+ spin_lock_irqsave(&ring->fence_drv.lock, flags);
+ do {
+ last_seq++;
+ last_seq &= ring->fence_drv.num_fences_mask;
+
+ ptr = &ring->fence_drv.fences[last_seq];
+ rcu_read_lock();
+ unprocessed = rcu_dereference(*ptr);
+
+ if (unprocessed && !dma_fence_is_signaled_locked(unprocessed)) {
+ fence = container_of(unprocessed, struct amdgpu_fence, base);
+
+ if (fence == af)
+ dma_fence_set_error(&fence->base, -ETIME);
+ else if (fence->context == af->context)
+ dma_fence_set_error(&fence->base, -ECANCELED);
+ }
+ rcu_read_unlock();
+ } while (last_seq != seq);
+ spin_unlock_irqrestore(&ring->fence_drv.lock, flags);
+ /* signal the guilty fence */
+ amdgpu_fence_write(ring, af->seq);
+ amdgpu_fence_process(ring);
}
void amdgpu_fence_save_wptr(struct dma_fence *fence)
@@ -790,14 +821,19 @@ void amdgpu_ring_backup_unprocessed_commands(struct amdgpu_ring *ring,
struct dma_fence *unprocessed;
struct dma_fence __rcu **ptr;
struct amdgpu_fence *fence;
- u64 wptr, i, seqno;
+ u64 wptr;
+ u32 seq, last_seq;
- seqno = amdgpu_fence_read(ring);
+ last_seq = amdgpu_fence_read(ring) & ring->fence_drv.num_fences_mask;
+ seq = ring->fence_drv.sync_seq & ring->fence_drv.num_fences_mask;
wptr = ring->fence_drv.signalled_wptr;
ring->ring_backup_entries_to_copy = 0;
- for (i = seqno + 1; i <= ring->fence_drv.sync_seq; ++i) {
- ptr = &ring->fence_drv.fences[i & ring->fence_drv.num_fences_mask];
+ do {
+ last_seq++;
+ last_seq &= ring->fence_drv.num_fences_mask;
+
+ ptr = &ring->fence_drv.fences[last_seq];
rcu_read_lock();
unprocessed = rcu_dereference(*ptr);
@@ -813,7 +849,7 @@ void amdgpu_ring_backup_unprocessed_commands(struct amdgpu_ring *ring,
wptr = fence->wptr;
}
rcu_read_unlock();
- }
+ } while (last_seq != seq);
}
/*
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
index 6b7d66b6d4cc..63ee6ba6a931 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c
@@ -371,7 +371,7 @@ static int amdgpu_debugfs_jpeg_sched_mask_set(void *data, u64 val)
for (i = 0; i < adev->jpeg.num_jpeg_inst; ++i) {
for (j = 0; j < adev->jpeg.num_jpeg_rings; ++j) {
ring = &adev->jpeg.inst[i].ring_dec[j];
- if (val & (BIT_ULL(1) << ((i * adev->jpeg.num_jpeg_rings) + j)))
+ if (val & (BIT_ULL((i * adev->jpeg.num_jpeg_rings) + j)))
ring->sched.ready = true;
else
ring->sched.ready = false;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index a9327472c651..b3e6b3fcdf2c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -758,7 +758,8 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
ui64 = atomic64_read(&adev->num_vram_cpu_page_faults);
return copy_to_user(out, &ui64, min(size, 8u)) ? -EFAULT : 0;
case AMDGPU_INFO_VRAM_USAGE:
- ui64 = ttm_resource_manager_usage(&adev->mman.vram_mgr.manager);
+ ui64 = ttm_resource_manager_used(&adev->mman.vram_mgr.manager) ?
+ ttm_resource_manager_usage(&adev->mman.vram_mgr.manager) : 0;
return copy_to_user(out, &ui64, min(size, 8u)) ? -EFAULT : 0;
case AMDGPU_INFO_VIS_VRAM_USAGE:
ui64 = amdgpu_vram_mgr_vis_usage(&adev->mman.vram_mgr);
@@ -804,8 +805,8 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
mem.vram.usable_heap_size = adev->gmc.real_vram_size -
atomic64_read(&adev->vram_pin_size) -
AMDGPU_VM_RESERVED_VRAM;
- mem.vram.heap_usage =
- ttm_resource_manager_usage(vram_man);
+ mem.vram.heap_usage = ttm_resource_manager_used(&adev->mman.vram_mgr.manager) ?
+ ttm_resource_manager_usage(vram_man) : 0;
mem.vram.max_allocation = mem.vram.usable_heap_size * 3 / 4;
mem.cpu_accessible_vram.total_heap_size =
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index 5bf9be073cdd..4883adcfbb4b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -409,7 +409,7 @@ int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
return -EINVAL;
/* Clear the doorbell array before detection */
- memset(adev->mes.hung_queue_db_array_cpu_addr, 0,
+ memset(adev->mes.hung_queue_db_array_cpu_addr, AMDGPU_MES_INVALID_DB_OFFSET,
adev->mes.hung_queue_db_array_size * sizeof(u32));
input.queue_type = queue_type;
input.detect_only = detect_only;
@@ -420,12 +420,17 @@ int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
dev_err(adev->dev, "failed to detect and reset\n");
} else {
*hung_db_num = 0;
- for (i = 0; i < adev->mes.hung_queue_db_array_size; i++) {
+ for (i = 0; i < adev->mes.hung_queue_hqd_info_offset; i++) {
if (db_array[i] != AMDGPU_MES_INVALID_DB_OFFSET) {
hung_db_array[i] = db_array[i];
*hung_db_num += 1;
}
}
+
+ /*
+ * TODO: return HQD info for MES scheduled user compute queue reset cases
+ * stored in hung_db_array hqd info offset to full array size
+ */
}
return r;
@@ -686,14 +691,11 @@ out:
bool amdgpu_mes_suspend_resume_all_supported(struct amdgpu_device *adev)
{
uint32_t mes_rev = adev->mes.sched_version & AMDGPU_MES_VERSION_MASK;
- bool is_supported = false;
-
- if (amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0) &&
- amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(12, 0, 0) &&
- mes_rev >= 0x63)
- is_supported = true;
- return is_supported;
+ return ((amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0) &&
+ amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(12, 0, 0) &&
+ mes_rev >= 0x63) ||
+ amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(12, 0, 0));
}
/* Fix me -- node_id is used to identify the correct MES instances in the future */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
index 6b506fc72f58..97c137c90f97 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
@@ -149,6 +149,7 @@ struct amdgpu_mes {
void *resource_1_addr[AMDGPU_MAX_MES_PIPES];
int hung_queue_db_array_size;
+ int hung_queue_hqd_info_offset;
struct amdgpu_bo *hung_queue_db_array_gpu_obj;
uint64_t hung_queue_db_array_gpu_addr;
void *hung_queue_db_array_cpu_addr;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index 8f6ce948c684..5ec5c3ff22bb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -811,7 +811,7 @@ int amdgpu_ring_reset_helper_end(struct amdgpu_ring *ring,
if (r)
return r;
- /* signal the fence of the bad job */
+ /* signal the guilty fence and set an error on all fences from the context */
if (guilty_fence)
amdgpu_fence_driver_guilty_force_completion(guilty_fence);
/* Re-emit the non-guilty commands */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index b6b649179776..4b46e3c26ff3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -155,7 +155,7 @@ extern const struct drm_sched_backend_ops amdgpu_sched_ops;
void amdgpu_fence_driver_clear_job_fences(struct amdgpu_ring *ring);
void amdgpu_fence_driver_set_error(struct amdgpu_ring *ring, int error);
void amdgpu_fence_driver_force_completion(struct amdgpu_ring *ring);
-void amdgpu_fence_driver_guilty_force_completion(struct amdgpu_fence *fence);
+void amdgpu_fence_driver_guilty_force_completion(struct amdgpu_fence *af);
void amdgpu_fence_save_wptr(struct dma_fence *fence);
int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
index 3328ab63376b..f96beb96c75c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
@@ -598,8 +598,8 @@ static int amdgpu_virt_write_vf2pf_data(struct amdgpu_device *adev)
vf2pf_info->driver_cert = 0;
vf2pf_info->os_info.all = 0;
- vf2pf_info->fb_usage =
- ttm_resource_manager_usage(&adev->mman.vram_mgr.manager) >> 20;
+ vf2pf_info->fb_usage = ttm_resource_manager_used(&adev->mman.vram_mgr.manager) ?
+ ttm_resource_manager_usage(&adev->mman.vram_mgr.manager) >> 20 : 0;
vf2pf_info->fb_vis_usage =
amdgpu_vram_mgr_vis_usage(&adev->mman.vram_mgr) >> 20;
vf2pf_info->fb_size = adev->gmc.real_vram_size >> 20;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index a5adb2ed9b3c..9d934c07fa6b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -234,6 +234,9 @@ static umode_t amdgpu_vram_attrs_is_visible(struct kobject *kobj,
!adev->gmc.vram_vendor)
return 0;
+ if (!ttm_resource_manager_used(&adev->mman.vram_mgr.manager))
+ return 0;
+
return attr->mode;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index 66c47c466532..d61eb9f187c6 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -5862,8 +5862,6 @@ static void gfx_v11_0_ring_emit_ib_gfx(struct amdgpu_ring *ring,
unsigned vmid = AMDGPU_JOB_GET_VMID(job);
u32 header, control = 0;
- BUG_ON(ib->flags & AMDGPU_IB_FLAG_CE);
-
header = PACKET3(PACKET3_INDIRECT_BUFFER, 2);
control |= ib->length_dw | (vmid << 24);
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
index 710ec9c34e43..93fde0f9af87 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
@@ -4419,8 +4419,6 @@ static void gfx_v12_0_ring_emit_ib_gfx(struct amdgpu_ring *ring,
unsigned vmid = AMDGPU_JOB_GET_VMID(job);
u32 header, control = 0;
- BUG_ON(ib->flags & AMDGPU_IB_FLAG_CE);
-
header = PACKET3(PACKET3_INDIRECT_BUFFER, 2);
control |= ib->length_dw | (vmid << 24);
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
index 93d7ccb7d013..0e5e54d0a9a5 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
@@ -1068,7 +1068,7 @@ static int gmc_v7_0_sw_init(struct amdgpu_ip_block *ip_block)
GFP_KERNEL);
if (!adev->gmc.vm_fault_info)
return -ENOMEM;
- atomic_set(&adev->gmc.vm_fault_info_updated, 0);
+ atomic_set_release(&adev->gmc.vm_fault_info_updated, 0);
return 0;
}
@@ -1290,7 +1290,7 @@ static int gmc_v7_0_process_interrupt(struct amdgpu_device *adev,
vmid = REG_GET_FIELD(status, VM_CONTEXT1_PROTECTION_FAULT_STATUS,
VMID);
if (amdgpu_amdkfd_is_kfd_vmid(adev, vmid)
- && !atomic_read(&adev->gmc.vm_fault_info_updated)) {
+ && !atomic_read_acquire(&adev->gmc.vm_fault_info_updated)) {
struct kfd_vm_fault_info *info = adev->gmc.vm_fault_info;
u32 protections = REG_GET_FIELD(status,
VM_CONTEXT1_PROTECTION_FAULT_STATUS,
@@ -1306,8 +1306,7 @@ static int gmc_v7_0_process_interrupt(struct amdgpu_device *adev,
info->prot_read = protections & 0x8 ? true : false;
info->prot_write = protections & 0x10 ? true : false;
info->prot_exec = protections & 0x20 ? true : false;
- mb();
- atomic_set(&adev->gmc.vm_fault_info_updated, 1);
+ atomic_set_release(&adev->gmc.vm_fault_info_updated, 1);
}
return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
index c5e2a2c41e06..e1509480dfc2 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
@@ -1183,7 +1183,7 @@ static int gmc_v8_0_sw_init(struct amdgpu_ip_block *ip_block)
GFP_KERNEL);
if (!adev->gmc.vm_fault_info)
return -ENOMEM;
- atomic_set(&adev->gmc.vm_fault_info_updated, 0);
+ atomic_set_release(&adev->gmc.vm_fault_info_updated, 0);
return 0;
}
@@ -1478,7 +1478,7 @@ static int gmc_v8_0_process_interrupt(struct amdgpu_device *adev,
vmid = REG_GET_FIELD(status, VM_CONTEXT1_PROTECTION_FAULT_STATUS,
VMID);
if (amdgpu_amdkfd_is_kfd_vmid(adev, vmid)
- && !atomic_read(&adev->gmc.vm_fault_info_updated)) {
+ && !atomic_read_acquire(&adev->gmc.vm_fault_info_updated)) {
struct kfd_vm_fault_info *info = adev->gmc.vm_fault_info;
u32 protections = REG_GET_FIELD(status,
VM_CONTEXT1_PROTECTION_FAULT_STATUS,
@@ -1494,8 +1494,7 @@ static int gmc_v8_0_process_interrupt(struct amdgpu_device *adev,
info->prot_read = protections & 0x8 ? true : false;
info->prot_write = protections & 0x10 ? true : false;
info->prot_exec = protections & 0x20 ? true : false;
- mb();
- atomic_set(&adev->gmc.vm_fault_info_updated, 1);
+ atomic_set_release(&adev->gmc.vm_fault_info_updated, 1);
}
return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
index 2db9b2c63693..1cd9eaeef38f 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
@@ -208,10 +208,10 @@ static int mes_userq_detect_and_reset(struct amdgpu_device *adev,
struct amdgpu_userq_mgr *uqm, *tmp;
unsigned int hung_db_num = 0;
int queue_id, r, i;
- u32 db_array[4];
+ u32 db_array[8];
- if (db_array_size > 4) {
- dev_err(adev->dev, "DB array size (%d vs 4) too small\n",
+ if (db_array_size > 8) {
+ dev_err(adev->dev, "DB array size (%d vs 8) too small\n",
db_array_size);
return -EINVAL;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
index e82188431f79..da575bb1377f 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
@@ -66,7 +66,8 @@ static int mes_v11_0_kiq_hw_fini(struct amdgpu_device *adev);
#define GFX_MES_DRAM_SIZE 0x80000
#define MES11_HW_RESOURCE_1_SIZE (128 * AMDGPU_GPU_PAGE_SIZE)
-#define MES11_HUNG_DB_OFFSET_ARRAY_SIZE 4
+#define MES11_HUNG_DB_OFFSET_ARRAY_SIZE 8 /* [0:3] = db offset, [4:7] = hqd info */
+#define MES11_HUNG_HQD_INFO_OFFSET 4
static void mes_v11_0_ring_set_wptr(struct amdgpu_ring *ring)
{
@@ -1720,8 +1721,9 @@ static int mes_v11_0_early_init(struct amdgpu_ip_block *ip_block)
struct amdgpu_device *adev = ip_block->adev;
int pipe, r;
- adev->mes.hung_queue_db_array_size =
- MES11_HUNG_DB_OFFSET_ARRAY_SIZE;
+ adev->mes.hung_queue_db_array_size = MES11_HUNG_DB_OFFSET_ARRAY_SIZE;
+ adev->mes.hung_queue_hqd_info_offset = MES11_HUNG_HQD_INFO_OFFSET;
+
for (pipe = 0; pipe < AMDGPU_MAX_MES_PIPES; pipe++) {
if (!adev->enable_mes_kiq && pipe == AMDGPU_MES_KIQ_PIPE)
continue;
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
index aff06f06aeee..7f3512d9de07 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v12_0.c
@@ -47,7 +47,8 @@ static int mes_v12_0_kiq_hw_fini(struct amdgpu_device *adev);
#define MES_EOP_SIZE 2048
-#define MES12_HUNG_DB_OFFSET_ARRAY_SIZE 4
+#define MES12_HUNG_DB_OFFSET_ARRAY_SIZE 8 /* [0:3] = db offset [4:7] hqd info */
+#define MES12_HUNG_HQD_INFO_OFFSET 4
static void mes_v12_0_ring_set_wptr(struct amdgpu_ring *ring)
{
@@ -228,7 +229,12 @@ static int mes_v12_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
pipe, x_pkt->header.opcode);
r = amdgpu_fence_wait_polling(ring, seq, timeout);
- if (r < 1 || !*status_ptr) {
+
+ /*
+ * status_ptr[31:0] == 0 (fail) or status_ptr[63:0] == 1 (success).
+ * If status_ptr[31:0] == 0 then status_ptr[63:32] will have debug error information.
+ */
+ if (r < 1 || !(lower_32_bits(*status_ptr))) {
if (misc_op_str)
dev_err(adev->dev, "MES(%d) failed to respond to msg=%s (%s)\n",
@@ -1899,8 +1905,9 @@ static int mes_v12_0_early_init(struct amdgpu_ip_block *ip_block)
struct amdgpu_device *adev = ip_block->adev;
int pipe, r;
- adev->mes.hung_queue_db_array_size =
- MES12_HUNG_DB_OFFSET_ARRAY_SIZE;
+ adev->mes.hung_queue_db_array_size = MES12_HUNG_DB_OFFSET_ARRAY_SIZE;
+ adev->mes.hung_queue_hqd_info_offset = MES12_HUNG_HQD_INFO_OFFSET;
+
for (pipe = 0; pipe < AMDGPU_MAX_MES_PIPES; pipe++) {
r = amdgpu_mes_init_microcode(adev, pipe);
if (r)