drm/gpu-sched: fix force APP kill hang(v2)

Submitted by Deng, Emily on April 4, 2018, 9:37 a.m.

Details

Message ID 1522834621-3688-1-git-send-email-Emily.Deng@amd.com
State New
Headers show
Series "drm/gpu-sched: fix force APP kill hang" ( rev: 2 ) in AMD X.Org drivers

Not browsing as part of any series.

Commit Message

Deng, Emily April 4, 2018, 9:37 a.m.
issue:
there is a VMC page fault that occurs if an app is force-killed during
a 3dmark test; the cause is that in entity_fini we manually signal
all those jobs in the entity's queue, which confuses the sync/dep
mechanism:

1) a page fault occurred in sdma's clear job, which operates on the
shadow buffer; the shadow buffer's GART table was cleaned by
ttm_bo_release since the fence in its reservation was fake-signaled
by entity_fini() in the case where a SIGKILL was received.

2) a page fault occurred in a gfx job because, during the lifetime
of the gfx job, we manually fake-signal all jobs from its entity
in entity_fini(); thus the unmapping/clear-PTE jobs that depend on
those result fences are satisfied, and sdma starts clearing the PTEs,
leading to a GFX page fault.

fix:
1) we should at least wait for all already-scheduled jobs to complete
in entity_fini() in the SIGKILL case.

2) if a fence is signaled and tries to clear some entity's dependency,
we should set this entity guilty to prevent its jobs from really
running, since the dependency was fake-signaled.

v2:
splitting drm_sched_entity_fini() into two functions:
1)The first one does the waiting, removes the entity from the
runqueue and returns an error when the process was killed.
2)The second one then goes over the entity, install it as
completion signal for the remaining jobs and signals all jobs
with an error code.

Signed-off-by: Monk Liu <Monk.Liu@amd.com>
Signed-off-by: Emily Deng <Emily.Deng@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c   | 13 +++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c   |  4 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c   |  3 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c   |  3 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c   |  8 +++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c    |  6 ++-
 drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c     |  3 +-
 drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c     |  3 +-
 drivers/gpu/drm/scheduler/gpu_scheduler.c | 67 +++++++++++++++++++++++++------
 include/drm/gpu_scheduler.h               |  7 +++-
 10 files changed, 93 insertions(+), 24 deletions(-)

Patch hide | download patch | download mbox

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
index 09d35051..69dadf9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
@@ -104,8 +104,13 @@  static int amdgpu_ctx_init(struct amdgpu_device *adev,
 
 failed:
 	for (j = 0; j < i; j++)
-		drm_sched_entity_fini(&adev->rings[j]->sched,
+		drm_sched_entity_fini1(&adev->rings[j]->sched,
 				      &ctx->rings[j].entity);
+
+	for (j = 0; j < i; j++)
+		drm_sched_entity_fini2(&adev->rings[j]->sched,
+				      &ctx->rings[j].entity);
+
 	kfree(ctx->fences);
 	ctx->fences = NULL;
 	return r;
@@ -126,7 +131,11 @@  static void amdgpu_ctx_fini(struct amdgpu_ctx *ctx)
 	ctx->fences = NULL;
 
 	for (i = 0; i < adev->num_rings; i++)
-		drm_sched_entity_fini(&adev->rings[i]->sched,
+		drm_sched_entity_fini1(&adev->rings[i]->sched,
+				      &ctx->rings[i].entity);
+
+	for (i = 0; i < adev->num_rings; i++)
+		drm_sched_entity_fini2(&adev->rings[i]->sched,
 				      &ctx->rings[i].entity);
 
 	amdgpu_queue_mgr_fini(adev, &ctx->queue_mgr);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index d2ab404..83d46bb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -131,8 +131,10 @@  static int amdgpu_ttm_global_init(struct amdgpu_device *adev)
 static void amdgpu_ttm_global_fini(struct amdgpu_device *adev)
 {
 	if (adev->mman.mem_global_referenced) {
-		drm_sched_entity_fini(adev->mman.entity.sched,
+		drm_sched_entity_fini1(adev->mman.entity.sched,
 				      &adev->mman.entity);
+		drm_sched_entity_fini2(adev->mman.entity.sched,
+                      &adev->mman.entity);
 		mutex_destroy(&adev->mman.gtt_window_lock);
 		drm_global_item_unref(&adev->mman.bo_global_ref.ref);
 		drm_global_item_unref(&adev->mman.mem_global_ref);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
index 627542b..74a6953 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
@@ -277,7 +277,8 @@  int amdgpu_uvd_sw_fini(struct amdgpu_device *adev)
 	int i;
 	kfree(adev->uvd.saved_bo);
 
-	drm_sched_entity_fini(&adev->uvd.ring.sched, &adev->uvd.entity);
+	drm_sched_entity_fini1(&adev->uvd.ring.sched, &adev->uvd.entity);
+	drm_sched_entity_fini2(&adev->uvd.ring.sched, &adev->uvd.entity);
 
 	amdgpu_bo_free_kernel(&adev->uvd.vcpu_bo,
 			      &adev->uvd.gpu_addr,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
index a33804b..4166d26 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
@@ -212,7 +212,8 @@  int amdgpu_vce_sw_fini(struct amdgpu_device *adev)
 	if (adev->vce.vcpu_bo == NULL)
 		return 0;
 
-	drm_sched_entity_fini(&adev->vce.ring[0].sched, &adev->vce.entity);
+	drm_sched_entity_fini1(&adev->vce.ring[0].sched, &adev->vce.entity);
+	drm_sched_entity_fini2(&adev->vce.ring[0].sched, &adev->vce.entity);
 
 	amdgpu_bo_free_kernel(&adev->vce.vcpu_bo, &adev->vce.gpu_addr,
 		(void **)&adev->vce.cpu_addr);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
index 58e4953..268c0c3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
@@ -129,9 +129,13 @@  int amdgpu_vcn_sw_fini(struct amdgpu_device *adev)
 
 	kfree(adev->vcn.saved_bo);
 
-	drm_sched_entity_fini(&adev->vcn.ring_dec.sched, &adev->vcn.entity_dec);
+	drm_sched_entity_fini1(&adev->vcn.ring_dec.sched, &adev->vcn.entity_dec);
 
-	drm_sched_entity_fini(&adev->vcn.ring_enc[0].sched, &adev->vcn.entity_enc);
+	drm_sched_entity_fini1(&adev->vcn.ring_enc[0].sched, &adev->vcn.entity_enc);
+
+	drm_sched_entity_fini2(&adev->vcn.ring_dec.sched, &adev->vcn.entity_dec);
+
+	drm_sched_entity_fini2(&adev->vcn.ring_enc[0].sched, &adev->vcn.entity_enc);
 
 	amdgpu_bo_free_kernel(&adev->vcn.vcpu_bo,
 			      &adev->vcn.gpu_addr,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 2447429..084a39d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2456,7 +2456,8 @@  int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 	vm->root.base.bo = NULL;
 
 error_free_sched_entity:
-	drm_sched_entity_fini(&ring->sched, &vm->entity);
+	drm_sched_entity_fini1(&ring->sched, &vm->entity);
+	drm_sched_entity_fini2(&ring->sched, &vm->entity);
 
 	return r;
 }
@@ -2520,7 +2521,8 @@  void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
 		spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);
 	}
 
-	drm_sched_entity_fini(vm->entity.sched, &vm->entity);
+	drm_sched_entity_fini1(vm->entity.sched, &vm->entity);
+	drm_sched_entity_fini2(vm->entity.sched, &vm->entity);
 
 	if (!RB_EMPTY_ROOT(&vm->va.rb_root)) {
 		dev_err(adev->dev, "still active bo inside vm\n");
diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c b/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c
index f26f515..3c57a6e9 100644
--- a/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c
@@ -469,7 +469,8 @@  static int uvd_v6_0_sw_fini(void *handle)
 		return r;
 
 	if (uvd_v6_0_enc_support(adev)) {
-		drm_sched_entity_fini(&adev->uvd.ring_enc[0].sched, &adev->uvd.entity_enc);
+		drm_sched_entity_fini1(&adev->uvd.ring_enc[0].sched, &adev->uvd.entity_enc);
+		drm_sched_entity_fini2(&adev->uvd.ring_enc[0].sched, &adev->uvd.entity_enc);
 
 		for (i = 0; i < adev->uvd.num_enc_rings; ++i)
 			amdgpu_ring_fini(&adev->uvd.ring_enc[i]);
diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c b/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c
index 280c082..a422acf 100644
--- a/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c
@@ -472,7 +472,8 @@  static int uvd_v7_0_sw_fini(void *handle)
 	if (r)
 		return r;
 
-	drm_sched_entity_fini(&adev->uvd.ring_enc[0].sched, &adev->uvd.entity_enc);
+	drm_sched_entity_fini1(&adev->uvd.ring_enc[0].sched, &adev->uvd.entity_enc);
+	drm_sched_entity_fini2(&adev->uvd.ring_enc[0].sched, &adev->uvd.entity_enc);
 
 	for (i = 0; i < adev->uvd.num_enc_rings; ++i)
 		amdgpu_ring_fini(&adev->uvd.ring_enc[i]);
diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler.c b/drivers/gpu/drm/scheduler/gpu_scheduler.c
index 310275e..abeb3ff 100644
--- a/drivers/gpu/drm/scheduler/gpu_scheduler.c
+++ b/drivers/gpu/drm/scheduler/gpu_scheduler.c
@@ -136,6 +136,8 @@  int drm_sched_entity_init(struct drm_gpu_scheduler *sched,
 	entity->rq = rq;
 	entity->sched = sched;
 	entity->guilty = guilty;
+	entity->fini_status = 0;
+	entity->finished = NULL;
 
 	spin_lock_init(&entity->rq_lock);
 	spin_lock_init(&entity->queue_lock);
@@ -197,19 +199,29 @@  static bool drm_sched_entity_is_ready(struct drm_sched_entity *entity)
 	return true;
 }
 
+static void drm_sched_entity_fini_job_cb(struct dma_fence *f,
+				    struct dma_fence_cb *cb)
+{
+	struct drm_sched_job *job = container_of(cb, struct drm_sched_job,
+						 finish_cb);
+	drm_sched_fence_finished(job->s_fence);
+	WARN_ON(job->s_fence->parent);
+	dma_fence_put(&job->s_fence->finished);
+	job->sched->ops->free_job(job);
+}
+
 /**
  * Destroy a context entity
  *
  * @sched       Pointer to scheduler instance
  * @entity	The pointer to a valid scheduler entity
  *
- * Cleanup and free the allocated resources.
+ * Splitting drm_sched_entity_fini() into two functions, The first one is does the waiting, 
+ * removes the entity from the runqueue and returns an error when the process was killed.
  */
-void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
+void drm_sched_entity_fini1(struct drm_gpu_scheduler *sched,
 			   struct drm_sched_entity *entity)
 {
-	int r;
-
 	if (!drm_sched_entity_is_initialized(sched, entity))
 		return;
 	/**
@@ -217,13 +229,27 @@  void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
 	 * queued IBs or discard them on SIGKILL
 	*/
 	if ((current->flags & PF_SIGNALED) && current->exit_code == SIGKILL)
-		r = -ERESTARTSYS;
+		entity->fini_status = -ERESTARTSYS;
 	else
-		r = wait_event_killable(sched->job_scheduled,
+		entity->fini_status = wait_event_killable(sched->job_scheduled,
 					drm_sched_entity_is_idle(entity));
 	drm_sched_entity_set_rq(entity, NULL);
-	if (r) {
+}
+
+/**
+ * Destroy a context entity
+ *
+ * @sched       Pointer to scheduler instance
+ * @entity	The pointer to a valid scheduler entity
+ *
+ * The second one then goes over the entity and signals all jobs with an error code.
+ */
+void drm_sched_entity_fini2(struct drm_gpu_scheduler *sched,
+			   struct drm_sched_entity *entity)
+{
+	if (entity->fini_status) {
 		struct drm_sched_job *job;
+		int r;
 
 		/* Park the kernel for a moment to make sure it isn't processing
 		 * our enity.
@@ -241,14 +267,21 @@  void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
 			struct drm_sched_fence *s_fence = job->s_fence;
 			drm_sched_fence_scheduled(s_fence);
 			dma_fence_set_error(&s_fence->finished, -ESRCH);
-			drm_sched_fence_finished(s_fence);
-			WARN_ON(s_fence->parent);
-			dma_fence_put(&s_fence->finished);
-			sched->ops->free_job(job);
+            r = dma_fence_add_callback(entity->finished, &job->finish_cb,
+                               drm_sched_entity_fini_job_cb);
+            if (r == -ENOENT)
+                drm_sched_entity_fini_job_cb(entity->finished, &job->finish_cb);
+            else if (r)
+                DRM_ERROR("fence add callback failed (%d)\n", r);
+        }
+        if(entity->finished){
+            dma_fence_put(entity->finished);
+		    entity->finished = NULL;
 		}
 	}
 }
-EXPORT_SYMBOL(drm_sched_entity_fini);
+EXPORT_SYMBOL(drm_sched_entity_fini1);
+EXPORT_SYMBOL(drm_sched_entity_fini2);
 
 static void drm_sched_entity_wakeup(struct dma_fence *f, struct dma_fence_cb *cb)
 {
@@ -530,6 +563,11 @@  void drm_sched_job_recovery(struct drm_gpu_scheduler *sched)
 		spin_unlock(&sched->job_list_lock);
 		fence = sched->ops->run_job(s_job);
 		atomic_inc(&sched->hw_rq_count);
+
+        if(s_job->entity->finished)
+			dma_fence_put(s_job->entity->finished);
+		s_job->entity->finished = dma_fence_get(&s_fence->finished);
+
 		if (fence) {
 			s_fence->parent = dma_fence_get(fence);
 			r = dma_fence_add_callback(fence, &s_fence->cb,
@@ -556,6 +594,7 @@  int drm_sched_job_init(struct drm_sched_job *job,
 		       void *owner)
 {
 	job->sched = sched;
+	job->entity = entity;
 	job->s_priority = entity->rq - sched->sched_rq;
 	job->s_fence = drm_sched_fence_create(entity, owner);
 	if (!job->s_fence)
@@ -669,6 +708,10 @@  static int drm_sched_main(void *param)
 		fence = sched->ops->run_job(sched_job);
 		drm_sched_fence_scheduled(s_fence);
 
+		if(entity->finished)
+			dma_fence_put(entity->finished);
+		entity->finished = dma_fence_get(&s_fence->finished);
+
 		if (fence) {
 			s_fence->parent = dma_fence_get(fence);
 			r = dma_fence_add_callback(fence, &s_fence->cb,
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index dfd54fb..e0a5a53 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -63,6 +63,8 @@  struct drm_sched_entity {
 	struct dma_fence		*dependency;
 	struct dma_fence_cb		cb;
 	atomic_t			*guilty; /* points to ctx's guilty */
+	uint32_t            fini_status;
+	struct dma_fence    *finished;
 };
 
 /**
@@ -99,6 +101,7 @@  struct drm_sched_job {
 	uint64_t			id;
 	atomic_t			karma;
 	enum drm_sched_priority		s_priority;
+	struct drm_sched_entity  *entity;
 };
 
 static inline bool drm_sched_invalidate_job(struct drm_sched_job *s_job,
@@ -148,7 +151,9 @@  int drm_sched_entity_init(struct drm_gpu_scheduler *sched,
 			  struct drm_sched_entity *entity,
 			  struct drm_sched_rq *rq,
 			  uint32_t jobs, atomic_t *guilty);
-void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
+void drm_sched_entity_fini1(struct drm_gpu_scheduler *sched,
+			   struct drm_sched_entity *entity);
+void drm_sched_entity_fini2(struct drm_gpu_scheduler *sched,
 			   struct drm_sched_entity *entity);
 void drm_sched_entity_push_job(struct drm_sched_job *sched_job,
 			       struct drm_sched_entity *entity);

Comments

Am 04.04.2018 um 11:37 schrieb Emily Deng:
> issue:
> there are VMC page fault occured if force APP kill during
> 3dmark test, the cause is in entity_fini we manually signal
> all those jobs in entity's queue which confuse the sync/dep
> mechanism:
>
> 1)page fault occured in sdma's clear job which operate on
> shadow buffer, and shadow buffer's Gart table is cleaned by
> ttm_bo_release since the fence in its reservation was fake signaled
> by entity_fini() under the case of SIGKILL received.
>
> 2)page fault occured in gfx' job because during the lifetime
> of gfx job we manually fake signal all jobs from its entity
> in entity_fini(), thus the unmapping/clear PTE job depend on those
> result fence is satisfied and sdma start clearing the PTE and lead
> to GFX page fault.
>
> fix:
> 1)should at least wait all jobs already scheduled complete in entity_fini()
> if SIGKILL is the case.
>
> 2)if a fence signaled and try to clear some entity's dependency, should
> set this entity guilty to prevent its job really run since the dependency
> is fake signaled.
>
> v2:
> splitting drm_sched_entity_fini() into two functions:
> 1)The first one is does the waiting, removes the entity from the
> runqueue and returns an error when the process was killed.
> 2)The second one then goes over the entity, install it as
> completion signal for the remaining jobs and signals all jobs
> with an error code.

First of all this patch won't work like this. You need to call the first 
part before the VM teardown in amdgpu_driver_postclose_kms() and the 
second part after the VM teardown.

Then please come up with a better name than fini1 and fini2, something 
like drm_sched_entity_cleanup() or something like this.

Then I think it is harmless to call the _cleanup() function from the 
_fini() function and only change the places where it makes sense to 
explicitly call _cleanup().

E.g. add a new function amdgpu_ctx_mgr_cleanup() which calls the 
cleanup() for each still open ctx and then modify the order in 
amdgpu_driver_postclose_kms().

Regards,
Christian.

>
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c   | 13 +++++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c   |  4 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c   |  3 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c   |  3 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c   |  8 +++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c    |  6 ++-
>   drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c     |  3 +-
>   drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c     |  3 +-
>   drivers/gpu/drm/scheduler/gpu_scheduler.c | 67 +++++++++++++++++++++++++------
>   include/drm/gpu_scheduler.h               |  7 +++-
>   10 files changed, 93 insertions(+), 24 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> index 09d35051..69dadf9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> @@ -104,8 +104,13 @@ static int amdgpu_ctx_init(struct amdgpu_device *adev,
>   
>   failed:
>   	for (j = 0; j < i; j++)
> -		drm_sched_entity_fini(&adev->rings[j]->sched,
> +		drm_sched_entity_fini1(&adev->rings[j]->sched,
>   				      &ctx->rings[j].entity);
> +
> +	for (j = 0; j < i; j++)
> +		drm_sched_entity_fini2(&adev->rings[j]->sched,
> +				      &ctx->rings[j].entity);
> +
>   	kfree(ctx->fences);
>   	ctx->fences = NULL;
>   	return r;
> @@ -126,7 +131,11 @@ static void amdgpu_ctx_fini(struct amdgpu_ctx *ctx)
>   	ctx->fences = NULL;
>   
>   	for (i = 0; i < adev->num_rings; i++)
> -		drm_sched_entity_fini(&adev->rings[i]->sched,
> +		drm_sched_entity_fini1(&adev->rings[i]->sched,
> +				      &ctx->rings[i].entity);
> +
> +	for (i = 0; i < adev->num_rings; i++)
> +		drm_sched_entity_fini2(&adev->rings[i]->sched,
>   				      &ctx->rings[i].entity);
>   
>   	amdgpu_queue_mgr_fini(adev, &ctx->queue_mgr);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> index d2ab404..83d46bb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> @@ -131,8 +131,10 @@ static int amdgpu_ttm_global_init(struct amdgpu_device *adev)
>   static void amdgpu_ttm_global_fini(struct amdgpu_device *adev)
>   {
>   	if (adev->mman.mem_global_referenced) {
> -		drm_sched_entity_fini(adev->mman.entity.sched,
> +		drm_sched_entity_fini1(adev->mman.entity.sched,
>   				      &adev->mman.entity);
> +		drm_sched_entity_fini2(adev->mman.entity.sched,
> +                      &adev->mman.entity);
>   		mutex_destroy(&adev->mman.gtt_window_lock);
>   		drm_global_item_unref(&adev->mman.bo_global_ref.ref);
>   		drm_global_item_unref(&adev->mman.mem_global_ref);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
> index 627542b..74a6953 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c
> @@ -277,7 +277,8 @@ int amdgpu_uvd_sw_fini(struct amdgpu_device *adev)
>   	int i;
>   	kfree(adev->uvd.saved_bo);
>   
> -	drm_sched_entity_fini(&adev->uvd.ring.sched, &adev->uvd.entity);
> +	drm_sched_entity_fini1(&adev->uvd.ring.sched, &adev->uvd.entity);
> +	drm_sched_entity_fini2(&adev->uvd.ring.sched, &adev->uvd.entity);
>   
>   	amdgpu_bo_free_kernel(&adev->uvd.vcpu_bo,
>   			      &adev->uvd.gpu_addr,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
> index a33804b..4166d26 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c
> @@ -212,7 +212,8 @@ int amdgpu_vce_sw_fini(struct amdgpu_device *adev)
>   	if (adev->vce.vcpu_bo == NULL)
>   		return 0;
>   
> -	drm_sched_entity_fini(&adev->vce.ring[0].sched, &adev->vce.entity);
> +	drm_sched_entity_fini1(&adev->vce.ring[0].sched, &adev->vce.entity);
> +	drm_sched_entity_fini2(&adev->vce.ring[0].sched, &adev->vce.entity);
>   
>   	amdgpu_bo_free_kernel(&adev->vce.vcpu_bo, &adev->vce.gpu_addr,
>   		(void **)&adev->vce.cpu_addr);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> index 58e4953..268c0c3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c
> @@ -129,9 +129,13 @@ int amdgpu_vcn_sw_fini(struct amdgpu_device *adev)
>   
>   	kfree(adev->vcn.saved_bo);
>   
> -	drm_sched_entity_fini(&adev->vcn.ring_dec.sched, &adev->vcn.entity_dec);
> +	drm_sched_entity_fini1(&adev->vcn.ring_dec.sched, &adev->vcn.entity_dec);
>   
> -	drm_sched_entity_fini(&adev->vcn.ring_enc[0].sched, &adev->vcn.entity_enc);
> +	drm_sched_entity_fini1(&adev->vcn.ring_enc[0].sched, &adev->vcn.entity_enc);
> +
> +	drm_sched_entity_fini2(&adev->vcn.ring_dec.sched, &adev->vcn.entity_dec);
> +
> +	drm_sched_entity_fini2(&adev->vcn.ring_enc[0].sched, &adev->vcn.entity_enc);
>   
>   	amdgpu_bo_free_kernel(&adev->vcn.vcpu_bo,
>   			      &adev->vcn.gpu_addr,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 2447429..084a39d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -2456,7 +2456,8 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>   	vm->root.base.bo = NULL;
>   
>   error_free_sched_entity:
> -	drm_sched_entity_fini(&ring->sched, &vm->entity);
> +	drm_sched_entity_fini1(&ring->sched, &vm->entity);
> +	drm_sched_entity_fini2(&ring->sched, &vm->entity);
>   
>   	return r;
>   }
> @@ -2520,7 +2521,8 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
>   		spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);
>   	}
>   
> -	drm_sched_entity_fini(vm->entity.sched, &vm->entity);
> +	drm_sched_entity_fini1(vm->entity.sched, &vm->entity);
> +	drm_sched_entity_fini2(vm->entity.sched, &vm->entity);
>   
>   	if (!RB_EMPTY_ROOT(&vm->va.rb_root)) {
>   		dev_err(adev->dev, "still active bo inside vm\n");
> diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c b/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c
> index f26f515..3c57a6e9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c
> @@ -469,7 +469,8 @@ static int uvd_v6_0_sw_fini(void *handle)
>   		return r;
>   
>   	if (uvd_v6_0_enc_support(adev)) {
> -		drm_sched_entity_fini(&adev->uvd.ring_enc[0].sched, &adev->uvd.entity_enc);
> +		drm_sched_entity_fini1(&adev->uvd.ring_enc[0].sched, &adev->uvd.entity_enc);
> +		drm_sched_entity_fini2(&adev->uvd.ring_enc[0].sched, &adev->uvd.entity_enc);
>   
>   		for (i = 0; i < adev->uvd.num_enc_rings; ++i)
>   			amdgpu_ring_fini(&adev->uvd.ring_enc[i]);
> diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c b/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c
> index 280c082..a422acf 100644
> --- a/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c
> @@ -472,7 +472,8 @@ static int uvd_v7_0_sw_fini(void *handle)
>   	if (r)
>   		return r;
>   
> -	drm_sched_entity_fini(&adev->uvd.ring_enc[0].sched, &adev->uvd.entity_enc);
> +	drm_sched_entity_fini1(&adev->uvd.ring_enc[0].sched, &adev->uvd.entity_enc);
> +	drm_sched_entity_fini2(&adev->uvd.ring_enc[0].sched, &adev->uvd.entity_enc);
>   
>   	for (i = 0; i < adev->uvd.num_enc_rings; ++i)
>   		amdgpu_ring_fini(&adev->uvd.ring_enc[i]);
> diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler.c b/drivers/gpu/drm/scheduler/gpu_scheduler.c
> index 310275e..abeb3ff 100644
> --- a/drivers/gpu/drm/scheduler/gpu_scheduler.c
> +++ b/drivers/gpu/drm/scheduler/gpu_scheduler.c
> @@ -136,6 +136,8 @@ int drm_sched_entity_init(struct drm_gpu_scheduler *sched,
>   	entity->rq = rq;
>   	entity->sched = sched;
>   	entity->guilty = guilty;
> +	entity->fini_status = 0;
> +	entity->finished = NULL;
>   
>   	spin_lock_init(&entity->rq_lock);
>   	spin_lock_init(&entity->queue_lock);
> @@ -197,19 +199,29 @@ static bool drm_sched_entity_is_ready(struct drm_sched_entity *entity)
>   	return true;
>   }
>   
> +static void drm_sched_entity_fini_job_cb(struct dma_fence *f,
> +				    struct dma_fence_cb *cb)
> +{
> +	struct drm_sched_job *job = container_of(cb, struct drm_sched_job,
> +						 finish_cb);
> +	drm_sched_fence_finished(job->s_fence);
> +	WARN_ON(job->s_fence->parent);
> +	dma_fence_put(&job->s_fence->finished);
> +	job->sched->ops->free_job(job);
> +}
> +
>   /**
>    * Destroy a context entity
>    *
>    * @sched       Pointer to scheduler instance
>    * @entity	The pointer to a valid scheduler entity
>    *
> - * Cleanup and free the allocated resources.
> + * Splitting drm_sched_entity_fini() into two functions, The first one is does the waiting,
> + * removes the entity from the runqueue and returns an error when the process was killed.
>    */
> -void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
> +void drm_sched_entity_fini1(struct drm_gpu_scheduler *sched,
>   			   struct drm_sched_entity *entity)
>   {
> -	int r;
> -
>   	if (!drm_sched_entity_is_initialized(sched, entity))
>   		return;
>   	/**
> @@ -217,13 +229,27 @@ void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
>   	 * queued IBs or discard them on SIGKILL
>   	*/
>   	if ((current->flags & PF_SIGNALED) && current->exit_code == SIGKILL)
> -		r = -ERESTARTSYS;
> +		entity->fini_status = -ERESTARTSYS;
>   	else
> -		r = wait_event_killable(sched->job_scheduled,
> +		entity->fini_status = wait_event_killable(sched->job_scheduled,
>   					drm_sched_entity_is_idle(entity));
>   	drm_sched_entity_set_rq(entity, NULL);
> -	if (r) {
> +}
> +
> +/**
> + * Destroy a context entity
> + *
> + * @sched       Pointer to scheduler instance
> + * @entity	The pointer to a valid scheduler entity
> + *
> + * The second one then goes over the entity and signals all jobs with an error code.
> + */
> +void drm_sched_entity_fini2(struct drm_gpu_scheduler *sched,
> +			   struct drm_sched_entity *entity)
> +{
> +	if (entity->fini_status) {
>   		struct drm_sched_job *job;
> +		int r;
>   
>   		/* Park the kernel for a moment to make sure it isn't processing
>   		 * our enity.
> @@ -241,14 +267,21 @@ void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
>   			struct drm_sched_fence *s_fence = job->s_fence;
>   			drm_sched_fence_scheduled(s_fence);
>   			dma_fence_set_error(&s_fence->finished, -ESRCH);
> -			drm_sched_fence_finished(s_fence);
> -			WARN_ON(s_fence->parent);
> -			dma_fence_put(&s_fence->finished);
> -			sched->ops->free_job(job);
> +            r = dma_fence_add_callback(entity->finished, &job->finish_cb,
> +                               drm_sched_entity_fini_job_cb);
> +            if (r == -ENOENT)
> +                drm_sched_entity_fini_job_cb(entity->finished, &job->finish_cb);
> +            else if (r)
> +                DRM_ERROR("fence add callback failed (%d)\n", r);
> +        }
> +        if(entity->finished){
> +            dma_fence_put(entity->finished);
> +		    entity->finished = NULL;
>   		}
>   	}
>   }
> -EXPORT_SYMBOL(drm_sched_entity_fini);
> +EXPORT_SYMBOL(drm_sched_entity_fini1);
> +EXPORT_SYMBOL(drm_sched_entity_fini2);
>   
>   static void drm_sched_entity_wakeup(struct dma_fence *f, struct dma_fence_cb *cb)
>   {
> @@ -530,6 +563,11 @@ void drm_sched_job_recovery(struct drm_gpu_scheduler *sched)
>   		spin_unlock(&sched->job_list_lock);
>   		fence = sched->ops->run_job(s_job);
>   		atomic_inc(&sched->hw_rq_count);
> +
> +        if(s_job->entity->finished)
> +			dma_fence_put(s_job->entity->finished);
> +		s_job->entity->finished = dma_fence_get(&s_fence->finished);
> +
>   		if (fence) {
>   			s_fence->parent = dma_fence_get(fence);
>   			r = dma_fence_add_callback(fence, &s_fence->cb,
> @@ -556,6 +594,7 @@ int drm_sched_job_init(struct drm_sched_job *job,
>   		       void *owner)
>   {
>   	job->sched = sched;
> +	job->entity = entity;
>   	job->s_priority = entity->rq - sched->sched_rq;
>   	job->s_fence = drm_sched_fence_create(entity, owner);
>   	if (!job->s_fence)
> @@ -669,6 +708,10 @@ static int drm_sched_main(void *param)
>   		fence = sched->ops->run_job(sched_job);
>   		drm_sched_fence_scheduled(s_fence);
>   
> +		if(entity->finished)
> +			dma_fence_put(entity->finished);
> +		entity->finished = dma_fence_get(&s_fence->finished);
> +
>   		if (fence) {
>   			s_fence->parent = dma_fence_get(fence);
>   			r = dma_fence_add_callback(fence, &s_fence->cb,
> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> index dfd54fb..e0a5a53 100644
> --- a/include/drm/gpu_scheduler.h
> +++ b/include/drm/gpu_scheduler.h
> @@ -63,6 +63,8 @@ struct drm_sched_entity {
>   	struct dma_fence		*dependency;
>   	struct dma_fence_cb		cb;
>   	atomic_t			*guilty; /* points to ctx's guilty */
> +	uint32_t            fini_status;
> +	struct dma_fence    *finished;
>   };
>   
>   /**
> @@ -99,6 +101,7 @@ struct drm_sched_job {
>   	uint64_t			id;
>   	atomic_t			karma;
>   	enum drm_sched_priority		s_priority;
> +	struct drm_sched_entity  *entity;
>   };
>   
>   static inline bool drm_sched_invalidate_job(struct drm_sched_job *s_job,
> @@ -148,7 +151,9 @@ int drm_sched_entity_init(struct drm_gpu_scheduler *sched,
>   			  struct drm_sched_entity *entity,
>   			  struct drm_sched_rq *rq,
>   			  uint32_t jobs, atomic_t *guilty);
> -void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
> +void drm_sched_entity_fini1(struct drm_gpu_scheduler *sched,
> +			   struct drm_sched_entity *entity);
> +void drm_sched_entity_fini2(struct drm_gpu_scheduler *sched,
>   			   struct drm_sched_entity *entity);
>   void drm_sched_entity_push_job(struct drm_sched_job *sched_job,
>   			       struct drm_sched_entity *entity);
Thanks Christian, will refine the code as your suggestion.

Best Wishes,
Emily Deng

> -----Original Message-----

> From: Christian König [mailto:ckoenig.leichtzumerken@gmail.com]

> Sent: Wednesday, April 04, 2018 6:02 PM

> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org

> Cc: Liu, Monk <Monk.Liu@amd.com>

> Subject: Re: [PATCH] drm/gpu-sched: fix force APP kill hang(v2)

> 

> Am 04.04.2018 um 11:37 schrieb Emily Deng:

> > issue:

> > there are VMC page fault occured if force APP kill during 3dmark test,

> > the cause is in entity_fini we manually signal all those jobs in

> > entity's queue which confuse the sync/dep

> > mechanism:

> >

> > 1)page fault occured in sdma's clear job which operate on shadow

> > buffer, and shadow buffer's Gart table is cleaned by ttm_bo_release

> > since the fence in its reservation was fake signaled by entity_fini()

> > under the case of SIGKILL received.

> >

> > 2)page fault occured in gfx' job because during the lifetime of gfx

> > job we manually fake signal all jobs from its entity in entity_fini(),

> > thus the unmapping/clear PTE job depend on those result fence is

> > satisfied and sdma start clearing the PTE and lead to GFX page fault.

> >

> > fix:

> > 1)should at least wait all jobs already scheduled complete in

> > entity_fini() if SIGKILL is the case.

> >

> > 2)if a fence signaled and try to clear some entity's dependency,

> > should set this entity guilty to prevent its job really run since the

> > dependency is fake signaled.

> >

> > v2:

> > splitting drm_sched_entity_fini() into two functions:

> > 1)The first one is does the waiting, removes the entity from the

> > runqueue and returns an error when the process was killed.

> > 2)The second one then goes over the entity, install it as completion

> > signal for the remaining jobs and signals all jobs with an error code.

> 

> First of all this patch won't work like this. You need to call the first part

> before the VM teardown in amdgpu_driver_postclose_kms() and the second

> part after the VM teardown.

> 

> Then please come up with a better name than fini1 and fini2, something like

> drm_sched_entity_cleanup() or something like this.

> 

> Then I think it is harmless to call the _cleanup() function from the

> _fini() function and only change the places where it makes sense to

> explicitly call _cleanup().

> 

> E.g. add a new function amdgpu_ctx_mgr_cleanup() which calls the

> cleanup() for each still open ctx and then modify the order in

> amdgpu_driver_postclose_kms().

> 

> Regards,

> Christian.

> 

> >

> > Signed-off-by: Monk Liu <Monk.Liu@amd.com>

> > Signed-off-by: Emily Deng <Emily.Deng@amd.com>

> > ---

> >   drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c   | 13 +++++-

> >   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c   |  4 +-

> >   drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c   |  3 +-

> >   drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c   |  3 +-

> >   drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c   |  8 +++-

> >   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c    |  6 ++-

> >   drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c     |  3 +-

> >   drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c     |  3 +-

> >   drivers/gpu/drm/scheduler/gpu_scheduler.c | 67

> +++++++++++++++++++++++++------

> >   include/drm/gpu_scheduler.h               |  7 +++-

> >   10 files changed, 93 insertions(+), 24 deletions(-)

> >

> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c

> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c

> > index 09d35051..69dadf9 100644

> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c

> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c

> > @@ -104,8 +104,13 @@ static int amdgpu_ctx_init(struct amdgpu_device

> > *adev,

> >

> >   failed:

> >   	for (j = 0; j < i; j++)

> > -		drm_sched_entity_fini(&adev->rings[j]->sched,

> > +		drm_sched_entity_fini1(&adev->rings[j]->sched,

> >   				      &ctx->rings[j].entity);

> > +

> > +	for (j = 0; j < i; j++)

> > +		drm_sched_entity_fini2(&adev->rings[j]->sched,

> > +				      &ctx->rings[j].entity);

> > +

> >   	kfree(ctx->fences);

> >   	ctx->fences = NULL;

> >   	return r;

> > @@ -126,7 +131,11 @@ static void amdgpu_ctx_fini(struct amdgpu_ctx

> *ctx)

> >   	ctx->fences = NULL;

> >

> >   	for (i = 0; i < adev->num_rings; i++)

> > -		drm_sched_entity_fini(&adev->rings[i]->sched,

> > +		drm_sched_entity_fini1(&adev->rings[i]->sched,

> > +				      &ctx->rings[i].entity);

> > +

> > +	for (i = 0; i < adev->num_rings; i++)

> > +		drm_sched_entity_fini2(&adev->rings[i]->sched,

> >   				      &ctx->rings[i].entity);

> >

> >   	amdgpu_queue_mgr_fini(adev, &ctx->queue_mgr); diff --git

> > a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c

> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c

> > index d2ab404..83d46bb 100644

> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c

> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c

> > @@ -131,8 +131,10 @@ static int amdgpu_ttm_global_init(struct

> amdgpu_device *adev)

> >   static void amdgpu_ttm_global_fini(struct amdgpu_device *adev)

> >   {

> >   	if (adev->mman.mem_global_referenced) {

> > -		drm_sched_entity_fini(adev->mman.entity.sched,

> > +		drm_sched_entity_fini1(adev->mman.entity.sched,

> >   				      &adev->mman.entity);

> > +		drm_sched_entity_fini2(adev->mman.entity.sched,

> > +                      &adev->mman.entity);

> >   		mutex_destroy(&adev->mman.gtt_window_lock);

> >   		drm_global_item_unref(&adev->mman.bo_global_ref.ref);

> >   		drm_global_item_unref(&adev->mman.mem_global_ref);

> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c

> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c

> > index 627542b..74a6953 100644

> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c

> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c

> > @@ -277,7 +277,8 @@ int amdgpu_uvd_sw_fini(struct amdgpu_device

> *adev)

> >   	int i;

> >   	kfree(adev->uvd.saved_bo);

> >

> > -	drm_sched_entity_fini(&adev->uvd.ring.sched, &adev->uvd.entity);

> > +	drm_sched_entity_fini1(&adev->uvd.ring.sched, &adev->uvd.entity);

> > +	drm_sched_entity_fini2(&adev->uvd.ring.sched, &adev->uvd.entity);

> >

> >   	amdgpu_bo_free_kernel(&adev->uvd.vcpu_bo,

> >   			      &adev->uvd.gpu_addr,

> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c

> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c

> > index a33804b..4166d26 100644

> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c

> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c

> > @@ -212,7 +212,8 @@ int amdgpu_vce_sw_fini(struct amdgpu_device

> *adev)

> >   	if (adev->vce.vcpu_bo == NULL)

> >   		return 0;

> >

> > -	drm_sched_entity_fini(&adev->vce.ring[0].sched, &adev->vce.entity);

> > +	drm_sched_entity_fini1(&adev->vce.ring[0].sched, &adev-

> >vce.entity);

> > +	drm_sched_entity_fini2(&adev->vce.ring[0].sched, &adev-

> >vce.entity);

> >

> >   	amdgpu_bo_free_kernel(&adev->vce.vcpu_bo, &adev-

> >vce.gpu_addr,

> >   		(void **)&adev->vce.cpu_addr);

> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c

> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c

> > index 58e4953..268c0c3 100644

> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c

> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c

> > @@ -129,9 +129,13 @@ int amdgpu_vcn_sw_fini(struct amdgpu_device

> > *adev)

> >

> >   	kfree(adev->vcn.saved_bo);

> >

> > -	drm_sched_entity_fini(&adev->vcn.ring_dec.sched, &adev-

> >vcn.entity_dec);

> > +	drm_sched_entity_fini1(&adev->vcn.ring_dec.sched,

> > +&adev->vcn.entity_dec);

> >

> > -	drm_sched_entity_fini(&adev->vcn.ring_enc[0].sched, &adev-

> >vcn.entity_enc);

> > +	drm_sched_entity_fini1(&adev->vcn.ring_enc[0].sched,

> > +&adev->vcn.entity_enc);

> > +

> > +	drm_sched_entity_fini2(&adev->vcn.ring_dec.sched,

> > +&adev->vcn.entity_dec);

> > +

> > +	drm_sched_entity_fini2(&adev->vcn.ring_enc[0].sched,

> > +&adev->vcn.entity_enc);

> >

> >   	amdgpu_bo_free_kernel(&adev->vcn.vcpu_bo,

> >   			      &adev->vcn.gpu_addr,

> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c

> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c

> > index 2447429..084a39d 100644

> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c

> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c

> > @@ -2456,7 +2456,8 @@ int amdgpu_vm_init(struct amdgpu_device

> *adev, struct amdgpu_vm *vm,

> >   	vm->root.base.bo = NULL;

> >

> >   error_free_sched_entity:

> > -	drm_sched_entity_fini(&ring->sched, &vm->entity);

> > +	drm_sched_entity_fini1(&ring->sched, &vm->entity);

> > +	drm_sched_entity_fini2(&ring->sched, &vm->entity);

> >

> >   	return r;

> >   }

> > @@ -2520,7 +2521,8 @@ void amdgpu_vm_fini(struct amdgpu_device

> *adev, struct amdgpu_vm *vm)

> >   		spin_unlock_irqrestore(&adev->vm_manager.pasid_lock,

> flags);

> >   	}

> >

> > -	drm_sched_entity_fini(vm->entity.sched, &vm->entity);

> > +	drm_sched_entity_fini1(vm->entity.sched, &vm->entity);

> > +	drm_sched_entity_fini2(vm->entity.sched, &vm->entity);

> >

> >   	if (!RB_EMPTY_ROOT(&vm->va.rb_root)) {

> >   		dev_err(adev->dev, "still active bo inside vm\n"); diff --git

> > a/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c

> > b/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c

> > index f26f515..3c57a6e9 100644

> > --- a/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c

> > +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c

> > @@ -469,7 +469,8 @@ static int uvd_v6_0_sw_fini(void *handle)

> >   		return r;

> >

> >   	if (uvd_v6_0_enc_support(adev)) {

> > -		drm_sched_entity_fini(&adev->uvd.ring_enc[0].sched,

> &adev->uvd.entity_enc);

> > +		drm_sched_entity_fini1(&adev->uvd.ring_enc[0].sched,

> &adev->uvd.entity_enc);

> > +		drm_sched_entity_fini2(&adev->uvd.ring_enc[0].sched,

> > +&adev->uvd.entity_enc);

> >

> >   		for (i = 0; i < adev->uvd.num_enc_rings; ++i)

> >   			amdgpu_ring_fini(&adev->uvd.ring_enc[i]);

> > diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c

> > b/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c

> > index 280c082..a422acf 100644

> > --- a/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c

> > +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c

> > @@ -472,7 +472,8 @@ static int uvd_v7_0_sw_fini(void *handle)

> >   	if (r)

> >   		return r;

> >

> > -	drm_sched_entity_fini(&adev->uvd.ring_enc[0].sched, &adev-

> >uvd.entity_enc);

> > +	drm_sched_entity_fini1(&adev->uvd.ring_enc[0].sched, &adev-

> >uvd.entity_enc);

> > +	drm_sched_entity_fini2(&adev->uvd.ring_enc[0].sched,

> > +&adev->uvd.entity_enc);

> >

> >   	for (i = 0; i < adev->uvd.num_enc_rings; ++i)

> >   		amdgpu_ring_fini(&adev->uvd.ring_enc[i]);

> > diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler.c

> > b/drivers/gpu/drm/scheduler/gpu_scheduler.c

> > index 310275e..abeb3ff 100644

> > --- a/drivers/gpu/drm/scheduler/gpu_scheduler.c

> > +++ b/drivers/gpu/drm/scheduler/gpu_scheduler.c

> > @@ -136,6 +136,8 @@ int drm_sched_entity_init(struct

> drm_gpu_scheduler *sched,

> >   	entity->rq = rq;

> >   	entity->sched = sched;

> >   	entity->guilty = guilty;

> > +	entity->fini_status = 0;

> > +	entity->finished = NULL;

> >

> >   	spin_lock_init(&entity->rq_lock);

> >   	spin_lock_init(&entity->queue_lock);

> > @@ -197,19 +199,29 @@ static bool drm_sched_entity_is_ready(struct

> drm_sched_entity *entity)

> >   	return true;

> >   }

> >

> > +static void drm_sched_entity_fini_job_cb(struct dma_fence *f,

> > +				    struct dma_fence_cb *cb)

> > +{

> > +	struct drm_sched_job *job = container_of(cb, struct drm_sched_job,

> > +						 finish_cb);

> > +	drm_sched_fence_finished(job->s_fence);

> > +	WARN_ON(job->s_fence->parent);

> > +	dma_fence_put(&job->s_fence->finished);

> > +	job->sched->ops->free_job(job);

> > +}

> > +

> >   /**

> >    * Destroy a context entity

> >    *

> >    * @sched       Pointer to scheduler instance

> >    * @entity	The pointer to a valid scheduler entity

> >    *

> > - * Cleanup and free the allocated resources.

> > + * Splitting drm_sched_entity_fini() into two functions, The first

> > + one is does the waiting,

> > + * removes the entity from the runqueue and returns an error when the

> process was killed.

> >    */

> > -void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,

> > +void drm_sched_entity_fini1(struct drm_gpu_scheduler *sched,

> >   			   struct drm_sched_entity *entity)

> >   {

> > -	int r;

> > -

> >   	if (!drm_sched_entity_is_initialized(sched, entity))

> >   		return;

> >   	/**

> > @@ -217,13 +229,27 @@ void drm_sched_entity_fini(struct

> drm_gpu_scheduler *sched,

> >   	 * queued IBs or discard them on SIGKILL

> >   	*/

> >   	if ((current->flags & PF_SIGNALED) && current->exit_code == SIGKILL)

> > -		r = -ERESTARTSYS;

> > +		entity->fini_status = -ERESTARTSYS;

> >   	else

> > -		r = wait_event_killable(sched->job_scheduled,

> > +		entity->fini_status = wait_event_killable(sched-

> >job_scheduled,

> >   					drm_sched_entity_is_idle(entity));

> >   	drm_sched_entity_set_rq(entity, NULL);

> > -	if (r) {

> > +}

> > +

> > +/**

> > + * Destroy a context entity

> > + *

> > + * @sched       Pointer to scheduler instance

> > + * @entity	The pointer to a valid scheduler entity

> > + *

> > + * The second one then goes over the entity and signals all jobs with an

> error code.

> > + */

> > +void drm_sched_entity_fini2(struct drm_gpu_scheduler *sched,

> > +			   struct drm_sched_entity *entity) {

> > +	if (entity->fini_status) {

> >   		struct drm_sched_job *job;

> > +		int r;

> >

> >   		/* Park the kernel for a moment to make sure it isn't

> processing

> >   		 * our enity.

> > @@ -241,14 +267,21 @@ void drm_sched_entity_fini(struct

> drm_gpu_scheduler *sched,

> >   			struct drm_sched_fence *s_fence = job->s_fence;

> >   			drm_sched_fence_scheduled(s_fence);

> >   			dma_fence_set_error(&s_fence->finished, -ESRCH);

> > -			drm_sched_fence_finished(s_fence);

> > -			WARN_ON(s_fence->parent);

> > -			dma_fence_put(&s_fence->finished);

> > -			sched->ops->free_job(job);

> > +            r = dma_fence_add_callback(entity->finished, &job->finish_cb,

> > +                               drm_sched_entity_fini_job_cb);

> > +            if (r == -ENOENT)

> > +                drm_sched_entity_fini_job_cb(entity->finished, &job->finish_cb);

> > +            else if (r)

> > +                DRM_ERROR("fence add callback failed (%d)\n", r);

> > +        }

> > +        if(entity->finished){

> > +            dma_fence_put(entity->finished);

> > +		    entity->finished = NULL;

> >   		}

> >   	}

> >   }

> > -EXPORT_SYMBOL(drm_sched_entity_fini);

> > +EXPORT_SYMBOL(drm_sched_entity_fini1);

> > +EXPORT_SYMBOL(drm_sched_entity_fini2);

> >

> >   static void drm_sched_entity_wakeup(struct dma_fence *f, struct

> dma_fence_cb *cb)

> >   {

> > @@ -530,6 +563,11 @@ void drm_sched_job_recovery(struct

> drm_gpu_scheduler *sched)

> >   		spin_unlock(&sched->job_list_lock);

> >   		fence = sched->ops->run_job(s_job);

> >   		atomic_inc(&sched->hw_rq_count);

> > +

> > +        if(s_job->entity->finished)

> > +			dma_fence_put(s_job->entity->finished);

> > +		s_job->entity->finished = dma_fence_get(&s_fence-

> >finished);

> > +

> >   		if (fence) {

> >   			s_fence->parent = dma_fence_get(fence);

> >   			r = dma_fence_add_callback(fence, &s_fence->cb,

> @@ -556,6 +594,7

> > @@ int drm_sched_job_init(struct drm_sched_job *job,

> >   		       void *owner)

> >   {

> >   	job->sched = sched;

> > +	job->entity = entity;

> >   	job->s_priority = entity->rq - sched->sched_rq;

> >   	job->s_fence = drm_sched_fence_create(entity, owner);

> >   	if (!job->s_fence)

> > @@ -669,6 +708,10 @@ static int drm_sched_main(void *param)

> >   		fence = sched->ops->run_job(sched_job);

> >   		drm_sched_fence_scheduled(s_fence);

> >

> > +		if(entity->finished)

> > +			dma_fence_put(entity->finished);

> > +		entity->finished = dma_fence_get(&s_fence->finished);

> > +

> >   		if (fence) {

> >   			s_fence->parent = dma_fence_get(fence);

> >   			r = dma_fence_add_callback(fence, &s_fence->cb,

> diff --git

> > a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h index

> > dfd54fb..e0a5a53 100644

> > --- a/include/drm/gpu_scheduler.h

> > +++ b/include/drm/gpu_scheduler.h

> > @@ -63,6 +63,8 @@ struct drm_sched_entity {

> >   	struct dma_fence		*dependency;

> >   	struct dma_fence_cb		cb;

> >   	atomic_t			*guilty; /* points to ctx's guilty */

> > +	uint32_t            fini_status;

> > +	struct dma_fence    *finished;

> >   };

> >

> >   /**

> > @@ -99,6 +101,7 @@ struct drm_sched_job {

> >   	uint64_t			id;

> >   	atomic_t			karma;

> >   	enum drm_sched_priority		s_priority;

> > +	struct drm_sched_entity  *entity;

> >   };

> >

> >   static inline bool drm_sched_invalidate_job(struct drm_sched_job

> > *s_job, @@ -148,7 +151,9 @@ int drm_sched_entity_init(struct

> drm_gpu_scheduler *sched,

> >   			  struct drm_sched_entity *entity,

> >   			  struct drm_sched_rq *rq,

> >   			  uint32_t jobs, atomic_t *guilty); -void

> > drm_sched_entity_fini(struct drm_gpu_scheduler *sched,

> > +void drm_sched_entity_fini1(struct drm_gpu_scheduler *sched,

> > +			   struct drm_sched_entity *entity); void

> > +drm_sched_entity_fini2(struct drm_gpu_scheduler *sched,

> >   			   struct drm_sched_entity *entity);

> >   void drm_sched_entity_push_job(struct drm_sched_job *sched_job,

> >   			       struct drm_sched_entity *entity);