From: Felix Kuehling Date: Fri, 20 Dec 2019 07:07:54 +0000 (-0500) Subject: drm/amdkfd: Improve HWS hang detection and handling X-Git-Tag: Ubuntu-5.10.0-12.13~3611^2~12^2~61 X-Git-Url: https://git.proxmox.com/?a=commitdiff_plain;h=09c34e8d7a631cf541c82f46ee73433c28b6ce0e;p=mirror_ubuntu-hirsute-kernel.git drm/amdkfd: Improve HWS hang detection and handling Move HWS hang detection into unmap_queues_cpsch to catch hangs in all cases. If this happens during a reset, don't schedule another reset because the reset already in progress is expected to take care of it. Signed-off-by: Felix Kuehling Tested-by: Emily Deng Reviewed-by: shaoyunl Signed-off-by: Alex Deucher --- diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index c6b6901bbda3..2a9e40131735 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -728,6 +728,9 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd) { if (!kfd->init_complete) return 0; + + kfd->dqm->ops.pre_reset(kfd->dqm); + kgd2kfd_suspend(kfd); kfd_signal_reset_event(kfd); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index 558c0ad81848..a7e9ec1b3ce3 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -952,6 +952,13 @@ static int stop_nocpsch(struct device_queue_manager *dqm) return 0; } +static void pre_reset(struct device_queue_manager *dqm) +{ + dqm_lock(dqm); + dqm->is_resetting = true; + dqm_unlock(dqm); +} + static int allocate_sdma_queue(struct device_queue_manager *dqm, struct queue *q) { @@ -1099,6 +1106,7 @@ static int start_cpsch(struct device_queue_manager *dqm) dqm_lock(dqm); /* clear hang status when driver try to start the hw scheduler */ dqm->is_hws_hang = false; + dqm->is_resetting = false; dqm->sched_running = true; execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0); dqm_unlock(dqm); @@ -1351,8 +1359,17 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm, /* should be timed out */ retval = amdkfd_fence_wait_timeout(dqm->fence_addr, KFD_FENCE_COMPLETED, queue_preemption_timeout_ms); - if (retval) + if (retval) { + pr_err("The cp might be in an unrecoverable state due to an unsuccessful queues preemption\n"); + dqm->is_hws_hang = true; + /* It's possible we're detecting a HWS hang in the + * middle of a GPU reset. No need to schedule another + * reset in this case. + */ + if (!dqm->is_resetting) + schedule_work(&dqm->hw_exception_work); return retval; + } pm_release_ib(&dqm->packets); dqm->active_runlist = false; @@ -1370,12 +1387,8 @@ static int execute_queues_cpsch(struct device_queue_manager *dqm, if (dqm->is_hws_hang) return -EIO; retval = unmap_queues_cpsch(dqm, filter, filter_param); - if (retval) { - pr_err("The cp might be in an unrecoverable state due to an unsuccessful queues preemption\n"); - dqm->is_hws_hang = true; - schedule_work(&dqm->hw_exception_work); + if (retval) return retval; - } return map_queues_cpsch(dqm); } @@ -1769,6 +1782,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) dqm->ops.initialize = initialize_cpsch; dqm->ops.start = start_cpsch; dqm->ops.stop = stop_cpsch; + dqm->ops.pre_reset = pre_reset; dqm->ops.destroy_queue = destroy_queue_cpsch; dqm->ops.update_queue = update_queue; dqm->ops.register_process = register_process; @@ -1787,6 +1801,7 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) /* initialize dqm for no cp scheduling */ dqm->ops.start = start_nocpsch; dqm->ops.stop = stop_nocpsch; + dqm->ops.pre_reset = pre_reset; dqm->ops.create_queue = create_queue_nocpsch; dqm->ops.destroy_queue = destroy_queue_nocpsch; dqm->ops.update_queue = update_queue; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h index 8991120c4fa2..871d3b628d2d 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h @@ -104,6 +104,7 @@ struct device_queue_manager_ops { int (*initialize)(struct device_queue_manager *dqm); int (*start)(struct device_queue_manager *dqm); int (*stop)(struct device_queue_manager *dqm); + void (*pre_reset)(struct device_queue_manager *dqm); void (*uninitialize)(struct device_queue_manager *dqm); int (*create_kernel_queue)(struct device_queue_manager *dqm, struct kernel_queue *kq, @@ -198,6 +199,7 @@ struct device_queue_manager { /* hw exception */ bool is_hws_hang; + bool is_resetting; struct work_struct hw_exception_work; struct kfd_mem_obj hiq_sdma_mqd; bool sched_running;