drm/gpu-sched: fix force APP kill hang(v3)

Submitted by Deng, Emily on April 12, 2018, 10:22 a.m.

Details

Message ID 1523528532-12927-1-git-send-email-Emily.Deng@amd.com
State New
Headers show
Series "drm/gpu-sched: fix force APP kill hang" ( rev: 3 ) in AMD X.Org drivers

Not browsing as part of any series.

Commit Message

Deng, Emily April 12, 2018, 10:22 a.m.
issue:
VMC page faults occur if an application is force-killed during
a 3dmark test. The cause is that in entity_fini we manually signal
all the jobs in the entity's queue, which confuses the sync/dep
mechanism:

1)A page fault occurred in sdma's clear job, which operates on the
shadow buffer: the shadow buffer's GART table is cleaned by
ttm_bo_release since the fence in its reservation was fake-signaled
by entity_fini() in the case where SIGKILL was received.

2)A page fault occurred in a gfx job because, during the lifetime
of the gfx job, we manually fake-signal all jobs from its entity
in entity_fini(); thus the unmapping/clear-PTE jobs that depend on
those result fences are satisfied, and sdma starts clearing the PTEs,
leading to a GFX page fault.

fix:
1)We should at least wait in entity_fini() for all already-scheduled
jobs to complete, if SIGKILL is the case.

2)If a fence is signaled and tries to clear some entity's dependency,
we should set this entity guilty to prevent its jobs from really
running, since the dependency was fake-signaled.

v2:
splitting drm_sched_entity_fini() into two functions:
1)The first one does the waiting, removes the entity from the
runqueue and returns an error when the process was killed.
2)The second one then goes over the entity, installs it as
completion signal for the remaining jobs and signals all jobs
with an error code.

v3:
1)Replace fini1 and fini2 with better names
2)Call the first part before the VM teardown in
amdgpu_driver_postclose_kms() and the second part
after the VM teardown
3)Keep the original function drm_sched_entity_fini to
refine the code.

Signed-off-by: Monk Liu <Monk.Liu@amd.com>
Signed-off-by: Emily Deng <Emily.Deng@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h       |  2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c   | 64 ++++++++++++++++++++++----
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c   |  5 ++-
 drivers/gpu/drm/scheduler/gpu_scheduler.c | 74 ++++++++++++++++++++++++++-----
 include/drm/gpu_scheduler.h               |  7 +++
 5 files changed, 131 insertions(+), 21 deletions(-)

Patch hide | download patch | download mbox

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 2babfad..200db73 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -681,6 +681,8 @@  int amdgpu_ctx_ioctl(struct drm_device *dev, void *data,
 int amdgpu_ctx_wait_prev_fence(struct amdgpu_ctx *ctx, unsigned ring_id);
 
 void amdgpu_ctx_mgr_init(struct amdgpu_ctx_mgr *mgr);
+void amdgpu_ctx_mgr_entity_cleanup(struct amdgpu_ctx_mgr *mgr);
+void amdgpu_ctx_mgr_entity_fini(struct amdgpu_ctx_mgr *mgr);
 void amdgpu_ctx_mgr_fini(struct amdgpu_ctx_mgr *mgr);
 
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
index 09d35051..659add4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
@@ -111,8 +111,9 @@  static int amdgpu_ctx_init(struct amdgpu_device *adev,
 	return r;
 }
 
-static void amdgpu_ctx_fini(struct amdgpu_ctx *ctx)
+static void amdgpu_ctx_fini(struct kref *ref)
 {
+	struct amdgpu_ctx *ctx = container_of(ref, struct amdgpu_ctx, refcount);
 	struct amdgpu_device *adev = ctx->adev;
 	unsigned i, j;
 
@@ -125,13 +126,11 @@  static void amdgpu_ctx_fini(struct amdgpu_ctx *ctx)
 	kfree(ctx->fences);
 	ctx->fences = NULL;
 
-	for (i = 0; i < adev->num_rings; i++)
-		drm_sched_entity_fini(&adev->rings[i]->sched,
-				      &ctx->rings[i].entity);
-
 	amdgpu_queue_mgr_fini(adev, &ctx->queue_mgr);
 
 	mutex_destroy(&ctx->lock);
+
+	kfree(ctx);
 }
 
 static int amdgpu_ctx_alloc(struct amdgpu_device *adev,
@@ -170,12 +169,15 @@  static int amdgpu_ctx_alloc(struct amdgpu_device *adev,
 static void amdgpu_ctx_do_release(struct kref *ref)
 {
 	struct amdgpu_ctx *ctx;
+	u32 i;
 
 	ctx = container_of(ref, struct amdgpu_ctx, refcount);
 
-	amdgpu_ctx_fini(ctx);
+	for (i = 0; i < ctx->adev->num_rings; i++)
+		drm_sched_entity_fini(&ctx->adev->rings[i]->sched,
+							&ctx->rings[i].entity);
 
-	kfree(ctx);
+	amdgpu_ctx_fini(ref);
 }
 
 static int amdgpu_ctx_free(struct amdgpu_fpriv *fpriv, uint32_t id)
@@ -435,16 +437,62 @@  void amdgpu_ctx_mgr_init(struct amdgpu_ctx_mgr *mgr)
 	idr_init(&mgr->ctx_handles);
 }
 
+void amdgpu_ctx_mgr_entity_fini(struct amdgpu_ctx_mgr *mgr)
+{
+	struct amdgpu_ctx *ctx;
+	struct idr *idp;
+	uint32_t id, i;
+
+	idp = &mgr->ctx_handles;
+
+	idr_for_each_entry(idp, ctx, id) {
+
+		if (!ctx->adev)
+			return;
+
+		for (i = 0; i < ctx->adev->num_rings; i++)
+			if (kref_read(&ctx->refcount) == 1)
+				drm_sched_entity_do_release(&ctx->adev->rings[i]->sched,
+						  &ctx->rings[i].entity);
+			else
+				DRM_ERROR("ctx %p is still alive\n", ctx);
+	}
+}
+
+void amdgpu_ctx_mgr_entity_cleanup(struct amdgpu_ctx_mgr *mgr)
+{
+	struct amdgpu_ctx *ctx;
+	struct idr *idp;
+	uint32_t id, i;
+
+	idp = &mgr->ctx_handles;
+
+	idr_for_each_entry(idp, ctx, id) {
+
+		if (!ctx->adev)
+			return;
+
+		for (i = 0; i < ctx->adev->num_rings; i++)
+			if (kref_read(&ctx->refcount) == 1)
+				drm_sched_entity_cleanup(&ctx->adev->rings[i]->sched,
+					&ctx->rings[i].entity);
+			else
+				DRM_ERROR("ctx %p is still alive\n", ctx);
+	}
+}
+
 void amdgpu_ctx_mgr_fini(struct amdgpu_ctx_mgr *mgr)
 {
 	struct amdgpu_ctx *ctx;
 	struct idr *idp;
 	uint32_t id;
 
+	amdgpu_ctx_mgr_entity_cleanup(mgr);
+
 	idp = &mgr->ctx_handles;
 
 	idr_for_each_entry(idp, ctx, id) {
-		if (kref_put(&ctx->refcount, amdgpu_ctx_do_release) != 1)
+		if (kref_put(&ctx->refcount, amdgpu_ctx_fini) != 1)
 			DRM_ERROR("ctx %p is still alive\n", ctx);
 	}
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index 487d39e..6cbb427 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -913,8 +913,7 @@  void amdgpu_driver_postclose_kms(struct drm_device *dev,
 		return;
 
 	pm_runtime_get_sync(dev->dev);
-
-	amdgpu_ctx_mgr_fini(&fpriv->ctx_mgr);
+	amdgpu_ctx_mgr_entity_fini(&fpriv->ctx_mgr);
 
 	if (adev->asic_type != CHIP_RAVEN) {
 		amdgpu_uvd_free_handles(adev, file_priv);
@@ -935,6 +934,8 @@  void amdgpu_driver_postclose_kms(struct drm_device *dev,
 	pd = amdgpu_bo_ref(fpriv->vm.root.base.bo);
 
 	amdgpu_vm_fini(adev, &fpriv->vm);
+	amdgpu_ctx_mgr_fini(&fpriv->ctx_mgr);
+
 	if (pasid)
 		amdgpu_pasid_free_delayed(pd->tbo.resv, pasid);
 	amdgpu_bo_unref(&pd);
diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler.c b/drivers/gpu/drm/scheduler/gpu_scheduler.c
index 310275e..9062d44 100644
--- a/drivers/gpu/drm/scheduler/gpu_scheduler.c
+++ b/drivers/gpu/drm/scheduler/gpu_scheduler.c
@@ -136,6 +136,8 @@  int drm_sched_entity_init(struct drm_gpu_scheduler *sched,
 	entity->rq = rq;
 	entity->sched = sched;
 	entity->guilty = guilty;
+	entity->fini_status = 0;
+	entity->finished = NULL;
 
 	spin_lock_init(&entity->rq_lock);
 	spin_lock_init(&entity->queue_lock);
@@ -197,19 +199,30 @@  static bool drm_sched_entity_is_ready(struct drm_sched_entity *entity)
 	return true;
 }
 
+static void drm_sched_entity_fini_job_cb(struct dma_fence *f,
+				    struct dma_fence_cb *cb)
+{
+	struct drm_sched_job *job = container_of(cb, struct drm_sched_job,
+						 finish_cb);
+	drm_sched_fence_finished(job->s_fence);
+	WARN_ON(job->s_fence->parent);
+	dma_fence_put(&job->s_fence->finished);
+	job->sched->ops->free_job(job);
+}
+
+
 /**
  * Destroy a context entity
  *
  * @sched       Pointer to scheduler instance
  * @entity	The pointer to a valid scheduler entity
  *
- * Cleanup and free the allocated resources.
+ * Splitting drm_sched_entity_fini() into two functions, The first one is does the waiting,
+ * removes the entity from the runqueue and returns an error when the process was killed.
  */
-void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
+void drm_sched_entity_do_release(struct drm_gpu_scheduler *sched,
 			   struct drm_sched_entity *entity)
 {
-	int r;
-
 	if (!drm_sched_entity_is_initialized(sched, entity))
 		return;
 	/**
@@ -217,13 +230,28 @@  void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
 	 * queued IBs or discard them on SIGKILL
 	*/
 	if ((current->flags & PF_SIGNALED) && current->exit_code == SIGKILL)
-		r = -ERESTARTSYS;
+		entity->fini_status = -ERESTARTSYS;
 	else
-		r = wait_event_killable(sched->job_scheduled,
+		entity->fini_status = wait_event_killable(sched->job_scheduled,
 					drm_sched_entity_is_idle(entity));
 	drm_sched_entity_set_rq(entity, NULL);
-	if (r) {
+}
+EXPORT_SYMBOL(drm_sched_entity_do_release);
+
+/**
+ * Destroy a context entity
+ *
+ * @sched       Pointer to scheduler instance
+ * @entity	The pointer to a valid scheduler entity
+ *
+ * The second one then goes over the entity and signals all jobs with an error code.
+ */
+void drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched,
+			   struct drm_sched_entity *entity)
+{
+	if (entity->fini_status) {
 		struct drm_sched_job *job;
+		int r;
 
 		/* Park the kernel for a moment to make sure it isn't processing
 		 * our enity.
@@ -241,13 +269,28 @@  void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
 			struct drm_sched_fence *s_fence = job->s_fence;
 			drm_sched_fence_scheduled(s_fence);
 			dma_fence_set_error(&s_fence->finished, -ESRCH);
-			drm_sched_fence_finished(s_fence);
-			WARN_ON(s_fence->parent);
-			dma_fence_put(&s_fence->finished);
-			sched->ops->free_job(job);
+			r = dma_fence_add_callback(entity->finished, &job->finish_cb,
+							drm_sched_entity_fini_job_cb);
+			if (r == -ENOENT)
+				drm_sched_entity_fini_job_cb(entity->finished, &job->finish_cb);
+			else if (r)
+				DRM_ERROR("fence add callback failed (%d)\n", r);
+		}
+
+		if (entity->finished) {
+			dma_fence_put(entity->finished);
+			entity->finished = NULL;
 		}
 	}
 }
+EXPORT_SYMBOL(drm_sched_entity_cleanup);
+
+void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
+				struct drm_sched_entity *entity)
+{
+	drm_sched_entity_do_release(sched, entity);
+	drm_sched_entity_cleanup(sched, entity);
+}
 EXPORT_SYMBOL(drm_sched_entity_fini);
 
 static void drm_sched_entity_wakeup(struct dma_fence *f, struct dma_fence_cb *cb)
@@ -530,6 +573,11 @@  void drm_sched_job_recovery(struct drm_gpu_scheduler *sched)
 		spin_unlock(&sched->job_list_lock);
 		fence = sched->ops->run_job(s_job);
 		atomic_inc(&sched->hw_rq_count);
+
+		if (s_job->entity->finished)
+			dma_fence_put(s_job->entity->finished);
+		s_job->entity->finished = dma_fence_get(&s_fence->finished);
+
 		if (fence) {
 			s_fence->parent = dma_fence_get(fence);
 			r = dma_fence_add_callback(fence, &s_fence->cb,
@@ -556,6 +604,7 @@  int drm_sched_job_init(struct drm_sched_job *job,
 		       void *owner)
 {
 	job->sched = sched;
+	job->entity = entity;
 	job->s_priority = entity->rq - sched->sched_rq;
 	job->s_fence = drm_sched_fence_create(entity, owner);
 	if (!job->s_fence)
@@ -668,6 +717,9 @@  static int drm_sched_main(void *param)
 
 		fence = sched->ops->run_job(sched_job);
 		drm_sched_fence_scheduled(s_fence);
+		if (entity->finished)
+			dma_fence_put(entity->finished);
+		entity->finished = dma_fence_get(&s_fence->finished);
 
 		if (fence) {
 			s_fence->parent = dma_fence_get(fence);
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index c053a32..a0dd947 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -65,6 +65,8 @@  struct drm_sched_entity {
 	struct dma_fence		*dependency;
 	struct dma_fence_cb		cb;
 	atomic_t			*guilty; /* points to ctx's guilty */
+	uint32_t            fini_status;
+	struct dma_fence    *finished;
 };
 
 /**
@@ -119,6 +121,7 @@  struct drm_sched_job {
 	uint64_t			id;
 	atomic_t			karma;
 	enum drm_sched_priority		s_priority;
+	struct drm_sched_entity  *entity;
 };
 
 static inline bool drm_sched_invalidate_job(struct drm_sched_job *s_job,
@@ -186,6 +189,10 @@  int drm_sched_entity_init(struct drm_gpu_scheduler *sched,
 			  struct drm_sched_entity *entity,
 			  struct drm_sched_rq *rq,
 			  uint32_t jobs, atomic_t *guilty);
+void drm_sched_entity_do_release(struct drm_gpu_scheduler *sched,
+			   struct drm_sched_entity *entity);
+void drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched,
+			   struct drm_sched_entity *entity);
 void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
 			   struct drm_sched_entity *entity);
 void drm_sched_entity_push_job(struct drm_sched_job *sched_job,

Comments

Ping....

Best Wishes,
Emily Deng




> -----Original Message-----
> From: Emily Deng [mailto:Emily.Deng@amd.com]
> Sent: Thursday, April 12, 2018 6:22 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Deng, Emily <Emily.Deng@amd.com>; Liu, Monk <Monk.Liu@amd.com>
> Subject: [PATCH] drm/gpu-sched: fix force APP kill hang(v3)
> 
> issue:
> there are VMC page fault occurred if force APP kill during 3dmark test, the
> cause is in entity_fini we manually signal all those jobs in entity's queue
> which confuse the sync/dep
> mechanism:
> 
> 1)page fault occurred in sdma's clear job which operate on shadow buffer,
> and shadow buffer's Gart table is cleaned by ttm_bo_release since the fence
> in its reservation was fake signaled by entity_fini() under the case of SIGKILL
> received.
> 
> 2)page fault occurred in gfx' job because during the lifetime of gfx job we
> manually fake signal all jobs from its entity in entity_fini(), thus the
> unmapping/clear PTE job depend on those result fence is satisfied and sdma
> start clearing the PTE and lead to GFX page fault.
> 
> fix:
> 1)should at least wait all jobs already scheduled complete in entity_fini() if
> SIGKILL is the case.
> 
> 2)if a fence signaled and try to clear some entity's dependency, should set
> this entity guilty to prevent its job really run since the dependency is fake
> signaled.
> 
> v2:
> splitting drm_sched_entity_fini() into two functions:
> 1)The first one is does the waiting, removes the entity from the runqueue
> and returns an error when the process was killed.
> 2)The second one then goes over the entity, install it as completion signal for
> the remaining jobs and signals all jobs with an error code.
> 
> v3:
> 1)Replace the fini1 and fini2 with better name 2)Call the first part before the
> VM teardown in
> amdgpu_driver_postclose_kms() and the second part after the VM teardown
> 3)Keep the original function drm_sched_entity_fini to refine the code.
> 
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h       |  2 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c   | 64
> ++++++++++++++++++++++----
>  drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c   |  5 ++-
>  drivers/gpu/drm/scheduler/gpu_scheduler.c | 74
> ++++++++++++++++++++++++++-----
>  include/drm/gpu_scheduler.h               |  7 +++
>  5 files changed, 131 insertions(+), 21 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 2babfad..200db73 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -681,6 +681,8 @@ int amdgpu_ctx_ioctl(struct drm_device *dev, void
> *data,  int amdgpu_ctx_wait_prev_fence(struct amdgpu_ctx *ctx, unsigned
> ring_id);
> 
>  void amdgpu_ctx_mgr_init(struct amdgpu_ctx_mgr *mgr);
> +void amdgpu_ctx_mgr_entity_cleanup(struct amdgpu_ctx_mgr *mgr); void
> +amdgpu_ctx_mgr_entity_fini(struct amdgpu_ctx_mgr *mgr);
>  void amdgpu_ctx_mgr_fini(struct amdgpu_ctx_mgr *mgr);
> 
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> index 09d35051..659add4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> @@ -111,8 +111,9 @@ static int amdgpu_ctx_init(struct amdgpu_device
> *adev,
>  	return r;
>  }
> 
> -static void amdgpu_ctx_fini(struct amdgpu_ctx *ctx)
> +static void amdgpu_ctx_fini(struct kref *ref)
>  {
> +	struct amdgpu_ctx *ctx = container_of(ref, struct amdgpu_ctx,
> +refcount);
>  	struct amdgpu_device *adev = ctx->adev;
>  	unsigned i, j;
> 
> @@ -125,13 +126,11 @@ static void amdgpu_ctx_fini(struct amdgpu_ctx
> *ctx)
>  	kfree(ctx->fences);
>  	ctx->fences = NULL;
> 
> -	for (i = 0; i < adev->num_rings; i++)
> -		drm_sched_entity_fini(&adev->rings[i]->sched,
> -				      &ctx->rings[i].entity);
> -
>  	amdgpu_queue_mgr_fini(adev, &ctx->queue_mgr);
> 
>  	mutex_destroy(&ctx->lock);
> +
> +	kfree(ctx);
>  }
> 
>  static int amdgpu_ctx_alloc(struct amdgpu_device *adev, @@ -170,12
> +169,15 @@ static int amdgpu_ctx_alloc(struct amdgpu_device *adev,
> static void amdgpu_ctx_do_release(struct kref *ref)  {
>  	struct amdgpu_ctx *ctx;
> +	u32 i;
> 
>  	ctx = container_of(ref, struct amdgpu_ctx, refcount);
> 
> -	amdgpu_ctx_fini(ctx);
> +	for (i = 0; i < ctx->adev->num_rings; i++)
> +		drm_sched_entity_fini(&ctx->adev->rings[i]->sched,
> +							&ctx->rings[i].entity);
> 
> -	kfree(ctx);
> +	amdgpu_ctx_fini(ref);
>  }
> 
>  static int amdgpu_ctx_free(struct amdgpu_fpriv *fpriv, uint32_t id) @@ -
> 435,16 +437,62 @@ void amdgpu_ctx_mgr_init(struct amdgpu_ctx_mgr
> *mgr)
>  	idr_init(&mgr->ctx_handles);
>  }
> 
> +void amdgpu_ctx_mgr_entity_fini(struct amdgpu_ctx_mgr *mgr) {
> +	struct amdgpu_ctx *ctx;
> +	struct idr *idp;
> +	uint32_t id, i;
> +
> +	idp = &mgr->ctx_handles;
> +
> +	idr_for_each_entry(idp, ctx, id) {
> +
> +		if (!ctx->adev)
> +			return;
> +
> +		for (i = 0; i < ctx->adev->num_rings; i++)
> +			if (kref_read(&ctx->refcount) == 1)
> +				drm_sched_entity_do_release(&ctx->adev-
> >rings[i]->sched,
> +						  &ctx->rings[i].entity);
> +			else
> +				DRM_ERROR("ctx %p is still alive\n", ctx);
> +	}
> +}
> +
> +void amdgpu_ctx_mgr_entity_cleanup(struct amdgpu_ctx_mgr *mgr) {
> +	struct amdgpu_ctx *ctx;
> +	struct idr *idp;
> +	uint32_t id, i;
> +
> +	idp = &mgr->ctx_handles;
> +
> +	idr_for_each_entry(idp, ctx, id) {
> +
> +		if (!ctx->adev)
> +			return;
> +
> +		for (i = 0; i < ctx->adev->num_rings; i++)
> +			if (kref_read(&ctx->refcount) == 1)
> +				drm_sched_entity_cleanup(&ctx->adev-
> >rings[i]->sched,
> +					&ctx->rings[i].entity);
> +			else
> +				DRM_ERROR("ctx %p is still alive\n", ctx);
> +	}
> +}
> +
>  void amdgpu_ctx_mgr_fini(struct amdgpu_ctx_mgr *mgr)  {
>  	struct amdgpu_ctx *ctx;
>  	struct idr *idp;
>  	uint32_t id;
> 
> +	amdgpu_ctx_mgr_entity_cleanup(mgr);
> +
>  	idp = &mgr->ctx_handles;
> 
>  	idr_for_each_entry(idp, ctx, id) {
> -		if (kref_put(&ctx->refcount, amdgpu_ctx_do_release) != 1)
> +		if (kref_put(&ctx->refcount, amdgpu_ctx_fini) != 1)
>  			DRM_ERROR("ctx %p is still alive\n", ctx);
>  	}
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> index 487d39e..6cbb427 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> @@ -913,8 +913,7 @@ void amdgpu_driver_postclose_kms(struct
> drm_device *dev,
>  		return;
> 
>  	pm_runtime_get_sync(dev->dev);
> -
> -	amdgpu_ctx_mgr_fini(&fpriv->ctx_mgr);
> +	amdgpu_ctx_mgr_entity_fini(&fpriv->ctx_mgr);
> 
>  	if (adev->asic_type != CHIP_RAVEN) {
>  		amdgpu_uvd_free_handles(adev, file_priv); @@ -935,6
> +934,8 @@ void amdgpu_driver_postclose_kms(struct drm_device *dev,
>  	pd = amdgpu_bo_ref(fpriv->vm.root.base.bo);
> 
>  	amdgpu_vm_fini(adev, &fpriv->vm);
> +	amdgpu_ctx_mgr_fini(&fpriv->ctx_mgr);
> +
>  	if (pasid)
>  		amdgpu_pasid_free_delayed(pd->tbo.resv, pasid);
>  	amdgpu_bo_unref(&pd);
> diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler.c
> b/drivers/gpu/drm/scheduler/gpu_scheduler.c
> index 310275e..9062d44 100644
> --- a/drivers/gpu/drm/scheduler/gpu_scheduler.c
> +++ b/drivers/gpu/drm/scheduler/gpu_scheduler.c
> @@ -136,6 +136,8 @@ int drm_sched_entity_init(struct drm_gpu_scheduler
> *sched,
>  	entity->rq = rq;
>  	entity->sched = sched;
>  	entity->guilty = guilty;
> +	entity->fini_status = 0;
> +	entity->finished = NULL;
> 
>  	spin_lock_init(&entity->rq_lock);
>  	spin_lock_init(&entity->queue_lock);
> @@ -197,19 +199,30 @@ static bool drm_sched_entity_is_ready(struct
> drm_sched_entity *entity)
>  	return true;
>  }
> 
> +static void drm_sched_entity_fini_job_cb(struct dma_fence *f,
> +				    struct dma_fence_cb *cb)
> +{
> +	struct drm_sched_job *job = container_of(cb, struct drm_sched_job,
> +						 finish_cb);
> +	drm_sched_fence_finished(job->s_fence);
> +	WARN_ON(job->s_fence->parent);
> +	dma_fence_put(&job->s_fence->finished);
> +	job->sched->ops->free_job(job);
> +}
> +
> +
>  /**
>   * Destroy a context entity
>   *
>   * @sched       Pointer to scheduler instance
>   * @entity	The pointer to a valid scheduler entity
>   *
> - * Cleanup and free the allocated resources.
> + * Splitting drm_sched_entity_fini() into two functions, The first one
> + is does the waiting,
> + * removes the entity from the runqueue and returns an error when the
> process was killed.
>   */
> -void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
> +void drm_sched_entity_do_release(struct drm_gpu_scheduler *sched,
>  			   struct drm_sched_entity *entity)
>  {
> -	int r;
> -
>  	if (!drm_sched_entity_is_initialized(sched, entity))
>  		return;
>  	/**
> @@ -217,13 +230,28 @@ void drm_sched_entity_fini(struct
> drm_gpu_scheduler *sched,
>  	 * queued IBs or discard them on SIGKILL
>  	*/
>  	if ((current->flags & PF_SIGNALED) && current->exit_code == SIGKILL)
> -		r = -ERESTARTSYS;
> +		entity->fini_status = -ERESTARTSYS;
>  	else
> -		r = wait_event_killable(sched->job_scheduled,
> +		entity->fini_status = wait_event_killable(sched-
> >job_scheduled,
>  					drm_sched_entity_is_idle(entity));
>  	drm_sched_entity_set_rq(entity, NULL);
> -	if (r) {
> +}
> +EXPORT_SYMBOL(drm_sched_entity_do_release);
> +
> +/**
> + * Destroy a context entity
> + *
> + * @sched       Pointer to scheduler instance
> + * @entity	The pointer to a valid scheduler entity
> + *
> + * The second one then goes over the entity and signals all jobs with an
> error code.
> + */
> +void drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched,
> +			   struct drm_sched_entity *entity)
> +{
> +	if (entity->fini_status) {
>  		struct drm_sched_job *job;
> +		int r;
> 
>  		/* Park the kernel for a moment to make sure it isn't
> processing
>  		 * our enity.
> @@ -241,13 +269,28 @@ void drm_sched_entity_fini(struct
> drm_gpu_scheduler *sched,
>  			struct drm_sched_fence *s_fence = job->s_fence;
>  			drm_sched_fence_scheduled(s_fence);
>  			dma_fence_set_error(&s_fence->finished, -ESRCH);
> -			drm_sched_fence_finished(s_fence);
> -			WARN_ON(s_fence->parent);
> -			dma_fence_put(&s_fence->finished);
> -			sched->ops->free_job(job);
> +			r = dma_fence_add_callback(entity->finished, &job-
> >finish_cb,
> +
> 	drm_sched_entity_fini_job_cb);
> +			if (r == -ENOENT)
> +				drm_sched_entity_fini_job_cb(entity-
> >finished, &job->finish_cb);
> +			else if (r)
> +				DRM_ERROR("fence add callback failed
> (%d)\n", r);
> +		}
> +
> +		if (entity->finished) {
> +			dma_fence_put(entity->finished);
> +			entity->finished = NULL;
>  		}
>  	}
>  }
> +EXPORT_SYMBOL(drm_sched_entity_cleanup);
> +
> +void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
> +				struct drm_sched_entity *entity)
> +{
> +	drm_sched_entity_do_release(sched, entity);
> +	drm_sched_entity_cleanup(sched, entity); }
>  EXPORT_SYMBOL(drm_sched_entity_fini);
> 
>  static void drm_sched_entity_wakeup(struct dma_fence *f, struct
> dma_fence_cb *cb) @@ -530,6 +573,11 @@ void
> drm_sched_job_recovery(struct drm_gpu_scheduler *sched)
>  		spin_unlock(&sched->job_list_lock);
>  		fence = sched->ops->run_job(s_job);
>  		atomic_inc(&sched->hw_rq_count);
> +
> +		if (s_job->entity->finished)
> +			dma_fence_put(s_job->entity->finished);
> +		s_job->entity->finished = dma_fence_get(&s_fence-
> >finished);
> +
>  		if (fence) {
>  			s_fence->parent = dma_fence_get(fence);
>  			r = dma_fence_add_callback(fence, &s_fence->cb,
> @@ -556,6 +604,7 @@ int drm_sched_job_init(struct drm_sched_job *job,
>  		       void *owner)
>  {
>  	job->sched = sched;
> +	job->entity = entity;
>  	job->s_priority = entity->rq - sched->sched_rq;
>  	job->s_fence = drm_sched_fence_create(entity, owner);
>  	if (!job->s_fence)
> @@ -668,6 +717,9 @@ static int drm_sched_main(void *param)
> 
>  		fence = sched->ops->run_job(sched_job);
>  		drm_sched_fence_scheduled(s_fence);
> +		if (entity->finished)
> +			dma_fence_put(entity->finished);
> +		entity->finished = dma_fence_get(&s_fence->finished);
> 
>  		if (fence) {
>  			s_fence->parent = dma_fence_get(fence); diff --git
> a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h index
> c053a32..a0dd947 100644
> --- a/include/drm/gpu_scheduler.h
> +++ b/include/drm/gpu_scheduler.h
> @@ -65,6 +65,8 @@ struct drm_sched_entity {
>  	struct dma_fence		*dependency;
>  	struct dma_fence_cb		cb;
>  	atomic_t			*guilty; /* points to ctx's guilty */
> +	uint32_t            fini_status;
> +	struct dma_fence    *finished;
>  };
> 
>  /**
> @@ -119,6 +121,7 @@ struct drm_sched_job {
>  	uint64_t			id;
>  	atomic_t			karma;
>  	enum drm_sched_priority		s_priority;
> +	struct drm_sched_entity  *entity;
>  };
> 
>  static inline bool drm_sched_invalidate_job(struct drm_sched_job *s_job,
> @@ -186,6 +189,10 @@ int drm_sched_entity_init(struct
> drm_gpu_scheduler *sched,
>  			  struct drm_sched_entity *entity,
>  			  struct drm_sched_rq *rq,
>  			  uint32_t jobs, atomic_t *guilty);
> +void drm_sched_entity_do_release(struct drm_gpu_scheduler *sched,
> +			   struct drm_sched_entity *entity); void
> +drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched,
> +			   struct drm_sched_entity *entity);
>  void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
>  			   struct drm_sched_entity *entity);  void
> drm_sched_entity_push_job(struct drm_sched_job *sched_job,
> --
> 2.7.4
Hi Christian & Emily

This v3 version looks pretty good to me, but some parts still need 
improvement:

e.g.

1)entity->finished doesn't convey what it really means; better to 
rename it to entity->last_scheduled.

2)drm_sched_entity_fini_job_cb() would be better renamed to 
drm_sched_entity_kill_jobs_cb()

3) No need to pass "entity->finished" (line 275 of gpu_scheduler.c) to 
drm_sched_entity_fini_job_cb() if -ENOENT is returned after 
dma_fence_add_callback, since this param is not needed at all in this 
callback routine

4)Better to change the type of entity->fini_status to "int" instead of 
"uint32_t" ... it should be aligned with the return type of 
wait_event_killable()

5)

+		if (entity->finished) {
+			dma_fence_put(entity->finished);
+			entity->finished = NULL;
  		}

no need to check entity->finished because dma_fence_put() will do it inside...



and the same here in job_recovery:

+
+		if (s_job->entity->finished)
+			dma_fence_put(s_job->entity->finished);

and the same here in sched_main:

+		if (entity->finished)
+			dma_fence_put(entity->finished);


6) Why put amdgpu_ctx_mgr_fini() beneath amdgpu_vm_fini()? Any reason for that?


thanks

/Monk






On 04/13/2018 10:06 AM, Deng, Emily wrote:
> Ping....
>
> Best Wishes,
> Emily Deng
>
>
>
>
>> -----Original Message-----
>> From: Emily Deng [mailto:Emily.Deng@amd.com]
>> Sent: Thursday, April 12, 2018 6:22 PM
>> To: amd-gfx@lists.freedesktop.org
>> Cc: Deng, Emily <Emily.Deng@amd.com>; Liu, Monk <Monk.Liu@amd.com>
>> Subject: [PATCH] drm/gpu-sched: fix force APP kill hang(v3)
>>
>> issue:
>> there are VMC page fault occurred if force APP kill during 3dmark test, the
>> cause is in entity_fini we manually signal all those jobs in entity's queue
>> which confuse the sync/dep
>> mechanism:
>>
>> 1)page fault occurred in sdma's clear job which operate on shadow buffer,
>> and shadow buffer's Gart table is cleaned by ttm_bo_release since the fence
>> in its reservation was fake signaled by entity_fini() under the case of SIGKILL
>> received.
>>
>> 2)page fault occurred in gfx' job because during the lifetime of gfx job we
>> manually fake signal all jobs from its entity in entity_fini(), thus the
>> unmapping/clear PTE job depend on those result fence is satisfied and sdma
>> start clearing the PTE and lead to GFX page fault.
>>
>> fix:
>> 1)should at least wait all jobs already scheduled complete in entity_fini() if
>> SIGKILL is the case.
>>
>> 2)if a fence signaled and try to clear some entity's dependency, should set
>> this entity guilty to prevent its job really run since the dependency is fake
>> signaled.
>>
>> v2:
>> splitting drm_sched_entity_fini() into two functions:
>> 1)The first one is does the waiting, removes the entity from the runqueue
>> and returns an error when the process was killed.
>> 2)The second one then goes over the entity, install it as completion signal for
>> the remaining jobs and signals all jobs with an error code.
>>
>> v3:
>> 1)Replace the fini1 and fini2 with better name 2)Call the first part before the
>> VM teardown in
>> amdgpu_driver_postclose_kms() and the second part after the VM teardown
>> 3)Keep the original function drm_sched_entity_fini to refine the code.
>>
>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h       |  2 +
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c   | 64
>> ++++++++++++++++++++++----
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c   |  5 ++-
>>   drivers/gpu/drm/scheduler/gpu_scheduler.c | 74
>> ++++++++++++++++++++++++++-----
>>   include/drm/gpu_scheduler.h               |  7 +++
>>   5 files changed, 131 insertions(+), 21 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index 2babfad..200db73 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -681,6 +681,8 @@ int amdgpu_ctx_ioctl(struct drm_device *dev, void
>> *data,  int amdgpu_ctx_wait_prev_fence(struct amdgpu_ctx *ctx, unsigned
>> ring_id);
>>
>>   void amdgpu_ctx_mgr_init(struct amdgpu_ctx_mgr *mgr);
>> +void amdgpu_ctx_mgr_entity_cleanup(struct amdgpu_ctx_mgr *mgr); void
>> +amdgpu_ctx_mgr_entity_fini(struct amdgpu_ctx_mgr *mgr);
>>   void amdgpu_ctx_mgr_fini(struct amdgpu_ctx_mgr *mgr);
>>
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>> index 09d35051..659add4 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>> @@ -111,8 +111,9 @@ static int amdgpu_ctx_init(struct amdgpu_device
>> *adev,
>>   	return r;
>>   }
>>
>> -static void amdgpu_ctx_fini(struct amdgpu_ctx *ctx)
>> +static void amdgpu_ctx_fini(struct kref *ref)
>>   {
>> +	struct amdgpu_ctx *ctx = container_of(ref, struct amdgpu_ctx,
>> +refcount);
>>   	struct amdgpu_device *adev = ctx->adev;
>>   	unsigned i, j;
>>
>> @@ -125,13 +126,11 @@ static void amdgpu_ctx_fini(struct amdgpu_ctx
>> *ctx)
>>   	kfree(ctx->fences);
>>   	ctx->fences = NULL;
>>
>> -	for (i = 0; i < adev->num_rings; i++)
>> -		drm_sched_entity_fini(&adev->rings[i]->sched,
>> -				      &ctx->rings[i].entity);
>> -
>>   	amdgpu_queue_mgr_fini(adev, &ctx->queue_mgr);
>>
>>   	mutex_destroy(&ctx->lock);
>> +
>> +	kfree(ctx);
>>   }
>>
>>   static int amdgpu_ctx_alloc(struct amdgpu_device *adev, @@ -170,12
>> +169,15 @@ static int amdgpu_ctx_alloc(struct amdgpu_device *adev,
>> static void amdgpu_ctx_do_release(struct kref *ref)  {
>>   	struct amdgpu_ctx *ctx;
>> +	u32 i;
>>
>>   	ctx = container_of(ref, struct amdgpu_ctx, refcount);
>>
>> -	amdgpu_ctx_fini(ctx);
>> +	for (i = 0; i < ctx->adev->num_rings; i++)
>> +		drm_sched_entity_fini(&ctx->adev->rings[i]->sched,
>> +							&ctx->rings[i].entity);
>>
>> -	kfree(ctx);
>> +	amdgpu_ctx_fini(ref);
>>   }
>>
>>   static int amdgpu_ctx_free(struct amdgpu_fpriv *fpriv, uint32_t id) @@ -
>> 435,16 +437,62 @@ void amdgpu_ctx_mgr_init(struct amdgpu_ctx_mgr
>> *mgr)
>>   	idr_init(&mgr->ctx_handles);
>>   }
>>
>> +void amdgpu_ctx_mgr_entity_fini(struct amdgpu_ctx_mgr *mgr) {
>> +	struct amdgpu_ctx *ctx;
>> +	struct idr *idp;
>> +	uint32_t id, i;
>> +
>> +	idp = &mgr->ctx_handles;
>> +
>> +	idr_for_each_entry(idp, ctx, id) {
>> +
>> +		if (!ctx->adev)
>> +			return;
>> +
>> +		for (i = 0; i < ctx->adev->num_rings; i++)
>> +			if (kref_read(&ctx->refcount) == 1)
>> +				drm_sched_entity_do_release(&ctx->adev-
>>> rings[i]->sched,
>> +						  &ctx->rings[i].entity);
>> +			else
>> +				DRM_ERROR("ctx %p is still alive\n", ctx);
>> +	}
>> +}
>> +
>> +void amdgpu_ctx_mgr_entity_cleanup(struct amdgpu_ctx_mgr *mgr) {
>> +	struct amdgpu_ctx *ctx;
>> +	struct idr *idp;
>> +	uint32_t id, i;
>> +
>> +	idp = &mgr->ctx_handles;
>> +
>> +	idr_for_each_entry(idp, ctx, id) {
>> +
>> +		if (!ctx->adev)
>> +			return;
>> +
>> +		for (i = 0; i < ctx->adev->num_rings; i++)
>> +			if (kref_read(&ctx->refcount) == 1)
>> +				drm_sched_entity_cleanup(&ctx->adev-
>>> rings[i]->sched,
>> +					&ctx->rings[i].entity);
>> +			else
>> +				DRM_ERROR("ctx %p is still alive\n", ctx);
>> +	}
>> +}
>> +
>>   void amdgpu_ctx_mgr_fini(struct amdgpu_ctx_mgr *mgr)  {
>>   	struct amdgpu_ctx *ctx;
>>   	struct idr *idp;
>>   	uint32_t id;
>>
>> +	amdgpu_ctx_mgr_entity_cleanup(mgr);
>> +
>>   	idp = &mgr->ctx_handles;
>>
>>   	idr_for_each_entry(idp, ctx, id) {
>> -		if (kref_put(&ctx->refcount, amdgpu_ctx_do_release) != 1)
>> +		if (kref_put(&ctx->refcount, amdgpu_ctx_fini) != 1)
>>   			DRM_ERROR("ctx %p is still alive\n", ctx);
>>   	}
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>> index 487d39e..6cbb427 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>> @@ -913,8 +913,7 @@ void amdgpu_driver_postclose_kms(struct
>> drm_device *dev,
>>   		return;
>>
>>   	pm_runtime_get_sync(dev->dev);
>> -
>> -	amdgpu_ctx_mgr_fini(&fpriv->ctx_mgr);
>> +	amdgpu_ctx_mgr_entity_fini(&fpriv->ctx_mgr);
>>
>>   	if (adev->asic_type != CHIP_RAVEN) {
>>   		amdgpu_uvd_free_handles(adev, file_priv); @@ -935,6
>> +934,8 @@ void amdgpu_driver_postclose_kms(struct drm_device *dev,
>>   	pd = amdgpu_bo_ref(fpriv->vm.root.base.bo);
>>
>>   	amdgpu_vm_fini(adev, &fpriv->vm);
>> +	amdgpu_ctx_mgr_fini(&fpriv->ctx_mgr);
>> +
>>   	if (pasid)
>>   		amdgpu_pasid_free_delayed(pd->tbo.resv, pasid);
>>   	amdgpu_bo_unref(&pd);
>> diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler.c
>> b/drivers/gpu/drm/scheduler/gpu_scheduler.c
>> index 310275e..9062d44 100644
>> --- a/drivers/gpu/drm/scheduler/gpu_scheduler.c
>> +++ b/drivers/gpu/drm/scheduler/gpu_scheduler.c
>> @@ -136,6 +136,8 @@ int drm_sched_entity_init(struct drm_gpu_scheduler
>> *sched,
>>   	entity->rq = rq;
>>   	entity->sched = sched;
>>   	entity->guilty = guilty;
>> +	entity->fini_status = 0;
>> +	entity->finished = NULL;
>>
>>   	spin_lock_init(&entity->rq_lock);
>>   	spin_lock_init(&entity->queue_lock);
>> @@ -197,19 +199,30 @@ static bool drm_sched_entity_is_ready(struct
>> drm_sched_entity *entity)
>>   	return true;
>>   }
>>
>> +static void drm_sched_entity_fini_job_cb(struct dma_fence *f,
>> +				    struct dma_fence_cb *cb)
>> +{
>> +	struct drm_sched_job *job = container_of(cb, struct drm_sched_job,
>> +						 finish_cb);
>> +	drm_sched_fence_finished(job->s_fence);
>> +	WARN_ON(job->s_fence->parent);
>> +	dma_fence_put(&job->s_fence->finished);
>> +	job->sched->ops->free_job(job);
>> +}
>> +
>> +
>>   /**
>>    * Destroy a context entity
>>    *
>>    * @sched       Pointer to scheduler instance
>>    * @entity	The pointer to a valid scheduler entity
>>    *
>> - * Cleanup and free the allocated resources.
>> + * Splitting drm_sched_entity_fini() into two functions, The first one
>> + is does the waiting,
>> + * removes the entity from the runqueue and returns an error when the
>> process was killed.
>>    */
>> -void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
>> +void drm_sched_entity_do_release(struct drm_gpu_scheduler *sched,
>>   			   struct drm_sched_entity *entity)
>>   {
>> -	int r;
>> -
>>   	if (!drm_sched_entity_is_initialized(sched, entity))
>>   		return;
>>   	/**
>> @@ -217,13 +230,28 @@ void drm_sched_entity_fini(struct
>> drm_gpu_scheduler *sched,
>>   	 * queued IBs or discard them on SIGKILL
>>   	*/
>>   	if ((current->flags & PF_SIGNALED) && current->exit_code == SIGKILL)
>> -		r = -ERESTARTSYS;
>> +		entity->fini_status = -ERESTARTSYS;
>>   	else
>> -		r = wait_event_killable(sched->job_scheduled,
>> +		entity->fini_status = wait_event_killable(sched-
>>> job_scheduled,
>>   					drm_sched_entity_is_idle(entity));
>>   	drm_sched_entity_set_rq(entity, NULL);
>> -	if (r) {
>> +}
>> +EXPORT_SYMBOL(drm_sched_entity_do_release);
>> +
>> +/**
>> + * Destroy a context entity
>> + *
>> + * @sched       Pointer to scheduler instance
>> + * @entity	The pointer to a valid scheduler entity
>> + *
>> + * The second one then goes over the entity and signals all jobs with an
>> error code.
>> + */
>> +void drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched,
>> +			   struct drm_sched_entity *entity)
>> +{
>> +	if (entity->fini_status) {
>>   		struct drm_sched_job *job;
>> +		int r;
>>
>>   		/* Park the kernel for a moment to make sure it isn't
>> processing
>>   		 * our enity.
>> @@ -241,13 +269,28 @@ void drm_sched_entity_fini(struct
>> drm_gpu_scheduler *sched,
>>   			struct drm_sched_fence *s_fence = job->s_fence;
>>   			drm_sched_fence_scheduled(s_fence);
>>   			dma_fence_set_error(&s_fence->finished, -ESRCH);
>> -			drm_sched_fence_finished(s_fence);
>> -			WARN_ON(s_fence->parent);
>> -			dma_fence_put(&s_fence->finished);
>> -			sched->ops->free_job(job);
>> +			r = dma_fence_add_callback(entity->finished, &job-
>>> finish_cb,
>> +
>> 	drm_sched_entity_fini_job_cb);
>> +			if (r == -ENOENT)
>> +				drm_sched_entity_fini_job_cb(entity-
>>> finished, &job->finish_cb);
>> +			else if (r)
>> +				DRM_ERROR("fence add callback failed
>> (%d)\n", r);
>> +		}
>> +
>> +		if (entity->finished) {
>> +			dma_fence_put(entity->finished);
>> +			entity->finished = NULL;
>>   		}
>>   	}
>>   }
>> +EXPORT_SYMBOL(drm_sched_entity_cleanup);
>> +
>> +void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
>> +				struct drm_sched_entity *entity)
>> +{
>> +	drm_sched_entity_do_release(sched, entity);
>> +	drm_sched_entity_cleanup(sched, entity); }
>>   EXPORT_SYMBOL(drm_sched_entity_fini);
>>
>>   static void drm_sched_entity_wakeup(struct dma_fence *f, struct
>> dma_fence_cb *cb) @@ -530,6 +573,11 @@ void
>> drm_sched_job_recovery(struct drm_gpu_scheduler *sched)
>>   		spin_unlock(&sched->job_list_lock);
>>   		fence = sched->ops->run_job(s_job);
>>   		atomic_inc(&sched->hw_rq_count);
>> +
>> +		if (s_job->entity->finished)
>> +			dma_fence_put(s_job->entity->finished);
>> +		s_job->entity->finished = dma_fence_get(&s_fence-
>>> finished);
>> +
>>   		if (fence) {
>>   			s_fence->parent = dma_fence_get(fence);
>>   			r = dma_fence_add_callback(fence, &s_fence->cb,
>> @@ -556,6 +604,7 @@ int drm_sched_job_init(struct drm_sched_job *job,
>>   		       void *owner)
>>   {
>>   	job->sched = sched;
>> +	job->entity = entity;
>>   	job->s_priority = entity->rq - sched->sched_rq;
>>   	job->s_fence = drm_sched_fence_create(entity, owner);
>>   	if (!job->s_fence)
>> @@ -668,6 +717,9 @@ static int drm_sched_main(void *param)
>>
>>   		fence = sched->ops->run_job(sched_job);
>>   		drm_sched_fence_scheduled(s_fence);
>> +		if (entity->finished)
>> +			dma_fence_put(entity->finished);
>> +		entity->finished = dma_fence_get(&s_fence->finished);
>>
>>   		if (fence) {
>>   			s_fence->parent = dma_fence_get(fence); diff --git
>> a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h index
>> c053a32..a0dd947 100644
>> --- a/include/drm/gpu_scheduler.h
>> +++ b/include/drm/gpu_scheduler.h
>> @@ -65,6 +65,8 @@ struct drm_sched_entity {
>>   	struct dma_fence		*dependency;
>>   	struct dma_fence_cb		cb;
>>   	atomic_t			*guilty; /* points to ctx's guilty */
>> +	uint32_t            fini_status;
>> +	struct dma_fence    *finished;
>>   };
>>
>>   /**
>> @@ -119,6 +121,7 @@ struct drm_sched_job {
>>   	uint64_t			id;
>>   	atomic_t			karma;
>>   	enum drm_sched_priority		s_priority;
>> +	struct drm_sched_entity  *entity;
>>   };
>>
>>   static inline bool drm_sched_invalidate_job(struct drm_sched_job *s_job,
>> @@ -186,6 +189,10 @@ int drm_sched_entity_init(struct
>> drm_gpu_scheduler *sched,
>>   			  struct drm_sched_entity *entity,
>>   			  struct drm_sched_rq *rq,
>>   			  uint32_t jobs, atomic_t *guilty);
>> +void drm_sched_entity_do_release(struct drm_gpu_scheduler *sched,
>> +			   struct drm_sched_entity *entity); void
>> +drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched,
>> +			   struct drm_sched_entity *entity);
>>   void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
>>   			   struct drm_sched_entity *entity);  void
>> drm_sched_entity_push_job(struct drm_sched_job *sched_job,
>> --
>> 2.7.4
Hi Monk,
     Thank you for your review; I will refine the code per your suggestions 1), 2), 3), 4) and 5).
     About 6): I think it makes no difference whether amdgpu_ctx_mgr_fini is called before or after amdgpu_vm_fini.

Hi Christian, 
     Do you have any thoughts about 6)?

Best Wishes,
Emily Deng

> -----Original Message-----

> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf

> Of monk

> Sent: Friday, April 13, 2018 1:01 PM

> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org

> Subject: Re: [PATCH] drm/gpu-sched: fix force APP kill hang(v3)

> 

> Hi Christian & Emily

> 

> This v3 version looks pretty good to me, but some parts still need

> improvement:

> 

> e.g.

> 

> 1)entity->finished doesn't present what it really means; better to rename it

> to entity->last_scheduled.

> 

> 2)drm_sched_entity_fini_job_cb() better renamed to

> drm_sched_entity_kill_jobs_cb()

> 

> 3) no need to pass "entity->finished" (line275 of gpu_schedulers.c) to

> drm_sched_entity_fini_job_cb() if -ENOENT returned after

> dma_fence_add_callback since this parm is not needed at all in this callback

> routine

> 

> 4)better change type of entity->fini_status to "int" instead of "uint32_t" ... it

> should be aligned with the return type of

> wait_event_killable()

> 

> 5)

> 

> +		if (entity->finished) {

> +			dma_fence_put(entity->finished);

> +			entity->finished = NULL;

>   		}

> 

> no need to check entity->finished because dma_fence_put() will do it inside...

> 

> 

> 

> and the same here in job_recovery:

> 

> +

> +		if (s_job->entity->finished)

> +			dma_fence_put(s_job->entity->finished);

> 

> and the same here in sched_main:

> 

> +		if (entity->finished)

> +			dma_fence_put(entity->finished);

> 

> 

> 6) why put amdgpu_ctx_mgr_fini() beneath amdgpu_vm_fini() ? any reason

> for that ?

> 

> 

> thanks

> 

> /Monk

> 

> 

> 

> 

> 

> 

> On 04/13/2018 10:06 AM, Deng, Emily wrote:

> > Ping....

> >

> > Best Wishes,

> > Emily Deng

> >

> >

> >

> >

> >> -----Original Message-----

> >> From: Emily Deng [mailto:Emily.Deng@amd.com]

> >> Sent: Thursday, April 12, 2018 6:22 PM

> >> To: amd-gfx@lists.freedesktop.org

> >> Cc: Deng, Emily <Emily.Deng@amd.com>; Liu, Monk

> <Monk.Liu@amd.com>

> >> Subject: [PATCH] drm/gpu-sched: fix force APP kill hang(v3)

> >>

> >> issue:

> >> there are VMC page fault occurred if force APP kill during 3dmark

> >> test, the cause is in entity_fini we manually signal all those jobs

> >> in entity's queue which confuse the sync/dep

> >> mechanism:

> >>

> >> 1)page fault occurred in sdma's clear job which operate on shadow

> >> buffer, and shadow buffer's Gart table is cleaned by ttm_bo_release

> >> since the fence in its reservation was fake signaled by entity_fini()

> >> under the case of SIGKILL received.

> >>

> >> 2)page fault occurred in gfx' job because during the lifetime of gfx

> >> job we manually fake signal all jobs from its entity in

> >> entity_fini(), thus the unmapping/clear PTE job depend on those

> >> result fence is satisfied and sdma start clearing the PTE and lead to GFX

> page fault.

> >>

> >> fix:

> >> 1)should at least wait all jobs already scheduled complete in

> >> entity_fini() if SIGKILL is the case.

> >>

> >> 2)if a fence signaled and try to clear some entity's dependency,

> >> should set this entity guilty to prevent its job really run since the

> >> dependency is fake signaled.

> >>

> >> v2:

> >> splitting drm_sched_entity_fini() into two functions:

> >> 1)The first one is does the waiting, removes the entity from the

> >> runqueue and returns an error when the process was killed.

> >> 2)The second one then goes over the entity, install it as completion

> >> signal for the remaining jobs and signals all jobs with an error code.

> >>

> >> v3:

> >> 1)Replace the fini1 and fini2 with better name 2)Call the first part

> >> before the VM teardown in

> >> amdgpu_driver_postclose_kms() and the second part after the VM

> >> teardown 3)Keep the original function drm_sched_entity_fini to refine the

> code.

> >>

> >> Signed-off-by: Monk Liu <Monk.Liu@amd.com>

> >> Signed-off-by: Emily Deng <Emily.Deng@amd.com>

> >> ---

> >>   drivers/gpu/drm/amd/amdgpu/amdgpu.h       |  2 +

> >>   drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c   | 64

> >> ++++++++++++++++++++++----

> >>   drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c   |  5 ++-

> >>   drivers/gpu/drm/scheduler/gpu_scheduler.c | 74

> >> ++++++++++++++++++++++++++-----

> >>   include/drm/gpu_scheduler.h               |  7 +++

> >>   5 files changed, 131 insertions(+), 21 deletions(-)

> >>

> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h

> >> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

> >> index 2babfad..200db73 100644

> >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h

> >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

> >> @@ -681,6 +681,8 @@ int amdgpu_ctx_ioctl(struct drm_device *dev,

> void

> >> *data,  int amdgpu_ctx_wait_prev_fence(struct amdgpu_ctx *ctx,

> >> unsigned ring_id);

> >>

> >>   void amdgpu_ctx_mgr_init(struct amdgpu_ctx_mgr *mgr);

> >> +void amdgpu_ctx_mgr_entity_cleanup(struct amdgpu_ctx_mgr *mgr);

> void

> >> +amdgpu_ctx_mgr_entity_fini(struct amdgpu_ctx_mgr *mgr);

> >>   void amdgpu_ctx_mgr_fini(struct amdgpu_ctx_mgr *mgr);

> >>

> >>

> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c

> >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c

> >> index 09d35051..659add4 100644

> >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c

> >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c

> >> @@ -111,8 +111,9 @@ static int amdgpu_ctx_init(struct amdgpu_device

> >> *adev,

> >>   	return r;

> >>   }

> >>

> >> -static void amdgpu_ctx_fini(struct amdgpu_ctx *ctx)

> >> +static void amdgpu_ctx_fini(struct kref *ref)

> >>   {

> >> +	struct amdgpu_ctx *ctx = container_of(ref, struct amdgpu_ctx,

> >> +refcount);

> >>   	struct amdgpu_device *adev = ctx->adev;

> >>   	unsigned i, j;

> >>

> >> @@ -125,13 +126,11 @@ static void amdgpu_ctx_fini(struct amdgpu_ctx

> >> *ctx)

> >>   	kfree(ctx->fences);

> >>   	ctx->fences = NULL;

> >>

> >> -	for (i = 0; i < adev->num_rings; i++)

> >> -		drm_sched_entity_fini(&adev->rings[i]->sched,

> >> -				      &ctx->rings[i].entity);

> >> -

> >>   	amdgpu_queue_mgr_fini(adev, &ctx->queue_mgr);

> >>

> >>   	mutex_destroy(&ctx->lock);

> >> +

> >> +	kfree(ctx);

> >>   }

> >>

> >>   static int amdgpu_ctx_alloc(struct amdgpu_device *adev, @@ -170,12

> >> +169,15 @@ static int amdgpu_ctx_alloc(struct amdgpu_device *adev,

> >> static void amdgpu_ctx_do_release(struct kref *ref)  {

> >>   	struct amdgpu_ctx *ctx;

> >> +	u32 i;

> >>

> >>   	ctx = container_of(ref, struct amdgpu_ctx, refcount);

> >>

> >> -	amdgpu_ctx_fini(ctx);

> >> +	for (i = 0; i < ctx->adev->num_rings; i++)

> >> +		drm_sched_entity_fini(&ctx->adev->rings[i]->sched,

> >> +							&ctx->rings[i].entity);

> >>

> >> -	kfree(ctx);

> >> +	amdgpu_ctx_fini(ref);

> >>   }

> >>

> >>   static int amdgpu_ctx_free(struct amdgpu_fpriv *fpriv, uint32_t id)

> >> @@ -

> >> 435,16 +437,62 @@ void amdgpu_ctx_mgr_init(struct amdgpu_ctx_mgr

> >> *mgr)

> >>   	idr_init(&mgr->ctx_handles);

> >>   }

> >>

> >> +void amdgpu_ctx_mgr_entity_fini(struct amdgpu_ctx_mgr *mgr) {

> >> +	struct amdgpu_ctx *ctx;

> >> +	struct idr *idp;

> >> +	uint32_t id, i;

> >> +

> >> +	idp = &mgr->ctx_handles;

> >> +

> >> +	idr_for_each_entry(idp, ctx, id) {

> >> +

> >> +		if (!ctx->adev)

> >> +			return;

> >> +

> >> +		for (i = 0; i < ctx->adev->num_rings; i++)

> >> +			if (kref_read(&ctx->refcount) == 1)

> >> +				drm_sched_entity_do_release(&ctx->adev-

> >>> rings[i]->sched,

> >> +						  &ctx->rings[i].entity);

> >> +			else

> >> +				DRM_ERROR("ctx %p is still alive\n", ctx);

> >> +	}

> >> +}

> >> +

> >> +void amdgpu_ctx_mgr_entity_cleanup(struct amdgpu_ctx_mgr *mgr) {

> >> +	struct amdgpu_ctx *ctx;

> >> +	struct idr *idp;

> >> +	uint32_t id, i;

> >> +

> >> +	idp = &mgr->ctx_handles;

> >> +

> >> +	idr_for_each_entry(idp, ctx, id) {

> >> +

> >> +		if (!ctx->adev)

> >> +			return;

> >> +

> >> +		for (i = 0; i < ctx->adev->num_rings; i++)

> >> +			if (kref_read(&ctx->refcount) == 1)

> >> +				drm_sched_entity_cleanup(&ctx->adev-

> >>> rings[i]->sched,

> >> +					&ctx->rings[i].entity);

> >> +			else

> >> +				DRM_ERROR("ctx %p is still alive\n", ctx);

> >> +	}

> >> +}

> >> +

> >>   void amdgpu_ctx_mgr_fini(struct amdgpu_ctx_mgr *mgr)  {

> >>   	struct amdgpu_ctx *ctx;

> >>   	struct idr *idp;

> >>   	uint32_t id;

> >>

> >> +	amdgpu_ctx_mgr_entity_cleanup(mgr);

> >> +

> >>   	idp = &mgr->ctx_handles;

> >>

> >>   	idr_for_each_entry(idp, ctx, id) {

> >> -		if (kref_put(&ctx->refcount, amdgpu_ctx_do_release) != 1)

> >> +		if (kref_put(&ctx->refcount, amdgpu_ctx_fini) != 1)

> >>   			DRM_ERROR("ctx %p is still alive\n", ctx);

> >>   	}

> >>

> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c

> >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c

> >> index 487d39e..6cbb427 100644

> >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c

> >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c

> >> @@ -913,8 +913,7 @@ void amdgpu_driver_postclose_kms(struct

> >> drm_device *dev,

> >>   		return;

> >>

> >>   	pm_runtime_get_sync(dev->dev);

> >> -

> >> -	amdgpu_ctx_mgr_fini(&fpriv->ctx_mgr);

> >> +	amdgpu_ctx_mgr_entity_fini(&fpriv->ctx_mgr);

> >>

> >>   	if (adev->asic_type != CHIP_RAVEN) {

> >>   		amdgpu_uvd_free_handles(adev, file_priv); @@ -935,6

> >> +934,8 @@ void amdgpu_driver_postclose_kms(struct drm_device *dev,

> >>   	pd = amdgpu_bo_ref(fpriv->vm.root.base.bo);

> >>

> >>   	amdgpu_vm_fini(adev, &fpriv->vm);

> >> +	amdgpu_ctx_mgr_fini(&fpriv->ctx_mgr);

> >> +

> >>   	if (pasid)

> >>   		amdgpu_pasid_free_delayed(pd->tbo.resv, pasid);

> >>   	amdgpu_bo_unref(&pd);

> >> diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler.c

> >> b/drivers/gpu/drm/scheduler/gpu_scheduler.c

> >> index 310275e..9062d44 100644

> >> --- a/drivers/gpu/drm/scheduler/gpu_scheduler.c

> >> +++ b/drivers/gpu/drm/scheduler/gpu_scheduler.c

> >> @@ -136,6 +136,8 @@ int drm_sched_entity_init(struct

> >> drm_gpu_scheduler *sched,

> >>   	entity->rq = rq;

> >>   	entity->sched = sched;

> >>   	entity->guilty = guilty;

> >> +	entity->fini_status = 0;

> >> +	entity->finished = NULL;

> >>

> >>   	spin_lock_init(&entity->rq_lock);

> >>   	spin_lock_init(&entity->queue_lock);

> >> @@ -197,19 +199,30 @@ static bool drm_sched_entity_is_ready(struct

> >> drm_sched_entity *entity)

> >>   	return true;

> >>   }

> >>

> >> +static void drm_sched_entity_fini_job_cb(struct dma_fence *f,

> >> +				    struct dma_fence_cb *cb)

> >> +{

> >> +	struct drm_sched_job *job = container_of(cb, struct drm_sched_job,

> >> +						 finish_cb);

> >> +	drm_sched_fence_finished(job->s_fence);

> >> +	WARN_ON(job->s_fence->parent);

> >> +	dma_fence_put(&job->s_fence->finished);

> >> +	job->sched->ops->free_job(job);

> >> +}

> >> +

> >> +

> >>   /**

> >>    * Destroy a context entity

> >>    *

> >>    * @sched       Pointer to scheduler instance

> >>    * @entity	The pointer to a valid scheduler entity

> >>    *

> >> - * Cleanup and free the allocated resources.

> >> + * Splitting drm_sched_entity_fini() into two functions, The first

> >> + one is does the waiting,

> >> + * removes the entity from the runqueue and returns an error when

> >> + the

> >> process was killed.

> >>    */

> >> -void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,

> >> +void drm_sched_entity_do_release(struct drm_gpu_scheduler *sched,

> >>   			   struct drm_sched_entity *entity)

> >>   {

> >> -	int r;

> >> -

> >>   	if (!drm_sched_entity_is_initialized(sched, entity))

> >>   		return;

> >>   	/**

> >> @@ -217,13 +230,28 @@ void drm_sched_entity_fini(struct

> >> drm_gpu_scheduler *sched,

> >>   	 * queued IBs or discard them on SIGKILL

> >>   	*/

> >>   	if ((current->flags & PF_SIGNALED) && current->exit_code == SIGKILL)

> >> -		r = -ERESTARTSYS;

> >> +		entity->fini_status = -ERESTARTSYS;

> >>   	else

> >> -		r = wait_event_killable(sched->job_scheduled,

> >> +		entity->fini_status = wait_event_killable(sched-

> >>> job_scheduled,

> >>   					drm_sched_entity_is_idle(entity));

> >>   	drm_sched_entity_set_rq(entity, NULL);

> >> -	if (r) {

> >> +}

> >> +EXPORT_SYMBOL(drm_sched_entity_do_release);

> >> +

> >> +/**

> >> + * Destroy a context entity

> >> + *

> >> + * @sched       Pointer to scheduler instance

> >> + * @entity	The pointer to a valid scheduler entity

> >> + *

> >> + * The second one then goes over the entity and signals all jobs

> >> +with an

> >> error code.

> >> + */

> >> +void drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched,

> >> +			   struct drm_sched_entity *entity) {

> >> +	if (entity->fini_status) {

> >>   		struct drm_sched_job *job;

> >> +		int r;

> >>

> >>   		/* Park the kernel for a moment to make sure it isn't

> processing

> >>   		 * our enity.

> >> @@ -241,13 +269,28 @@ void drm_sched_entity_fini(struct

> >> drm_gpu_scheduler *sched,

> >>   			struct drm_sched_fence *s_fence = job->s_fence;

> >>   			drm_sched_fence_scheduled(s_fence);

> >>   			dma_fence_set_error(&s_fence->finished, -ESRCH);

> >> -			drm_sched_fence_finished(s_fence);

> >> -			WARN_ON(s_fence->parent);

> >> -			dma_fence_put(&s_fence->finished);

> >> -			sched->ops->free_job(job);

> >> +			r = dma_fence_add_callback(entity->finished, &job-

> >>> finish_cb,

> >> +

> >> 	drm_sched_entity_fini_job_cb);

> >> +			if (r == -ENOENT)

> >> +				drm_sched_entity_fini_job_cb(entity-

> >>> finished, &job->finish_cb);

> >> +			else if (r)

> >> +				DRM_ERROR("fence add callback failed

> >> (%d)\n", r);

> >> +		}

> >> +

> >> +		if (entity->finished) {

> >> +			dma_fence_put(entity->finished);

> >> +			entity->finished = NULL;

> >>   		}

> >>   	}

> >>   }

> >> +EXPORT_SYMBOL(drm_sched_entity_cleanup);

> >> +

> >> +void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,

> >> +				struct drm_sched_entity *entity) {

> >> +	drm_sched_entity_do_release(sched, entity);

> >> +	drm_sched_entity_cleanup(sched, entity); }

> >>   EXPORT_SYMBOL(drm_sched_entity_fini);

> >>

> >>   static void drm_sched_entity_wakeup(struct dma_fence *f, struct

> >> dma_fence_cb *cb) @@ -530,6 +573,11 @@ void

> >> drm_sched_job_recovery(struct drm_gpu_scheduler *sched)

> >>   		spin_unlock(&sched->job_list_lock);

> >>   		fence = sched->ops->run_job(s_job);

> >>   		atomic_inc(&sched->hw_rq_count);

> >> +

> >> +		if (s_job->entity->finished)

> >> +			dma_fence_put(s_job->entity->finished);

> >> +		s_job->entity->finished = dma_fence_get(&s_fence-

> >>> finished);

> >> +

> >>   		if (fence) {

> >>   			s_fence->parent = dma_fence_get(fence);

> >>   			r = dma_fence_add_callback(fence, &s_fence->cb,

> @@ -556,6 +604,7

> >> @@ int drm_sched_job_init(struct drm_sched_job *job,

> >>   		       void *owner)

> >>   {

> >>   	job->sched = sched;

> >> +	job->entity = entity;

> >>   	job->s_priority = entity->rq - sched->sched_rq;

> >>   	job->s_fence = drm_sched_fence_create(entity, owner);

> >>   	if (!job->s_fence)

> >> @@ -668,6 +717,9 @@ static int drm_sched_main(void *param)

> >>

> >>   		fence = sched->ops->run_job(sched_job);

> >>   		drm_sched_fence_scheduled(s_fence);

> >> +		if (entity->finished)

> >> +			dma_fence_put(entity->finished);

> >> +		entity->finished = dma_fence_get(&s_fence->finished);

> >>

> >>   		if (fence) {

> >>   			s_fence->parent = dma_fence_get(fence); diff --git

> >> a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h index

> >> c053a32..a0dd947 100644

> >> --- a/include/drm/gpu_scheduler.h

> >> +++ b/include/drm/gpu_scheduler.h

> >> @@ -65,6 +65,8 @@ struct drm_sched_entity {

> >>   	struct dma_fence		*dependency;

> >>   	struct dma_fence_cb		cb;

> >>   	atomic_t			*guilty; /* points to ctx's guilty */

> >> +	uint32_t            fini_status;

> >> +	struct dma_fence    *finished;

> >>   };

> >>

> >>   /**

> >> @@ -119,6 +121,7 @@ struct drm_sched_job {

> >>   	uint64_t			id;

> >>   	atomic_t			karma;

> >>   	enum drm_sched_priority		s_priority;

> >> +	struct drm_sched_entity  *entity;

> >>   };

> >>

> >>   static inline bool drm_sched_invalidate_job(struct drm_sched_job

> >> *s_job, @@ -186,6 +189,10 @@ int drm_sched_entity_init(struct

> >> drm_gpu_scheduler *sched,

> >>   			  struct drm_sched_entity *entity,

> >>   			  struct drm_sched_rq *rq,

> >>   			  uint32_t jobs, atomic_t *guilty);

> >> +void drm_sched_entity_do_release(struct drm_gpu_scheduler *sched,

> >> +			   struct drm_sched_entity *entity); void

> >> +drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched,

> >> +			   struct drm_sched_entity *entity);

> >>   void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,

> >>   			   struct drm_sched_entity *entity);  void

> >> drm_sched_entity_push_job(struct drm_sched_job *sched_job,

> >> --

> >> 2.7.4

> 

> _______________________________________________

> amd-gfx mailing list

> amd-gfx@lists.freedesktop.org

> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Hi Monk,
    Another consideration: it would be better to put amdgpu_ctx_mgr_fini() beneath amdgpu_vm_fini(); in
that case, the rq of every ctx and vm entity is first set to NULL, and only then are the fences of all not-yet-scheduled jobs signaled.

Best Wishes,
Emily Deng

> -----Original Message-----

> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf

> Of Deng, Emily

> Sent: Friday, April 13, 2018 2:08 PM

> To: Liu, Monk <Monk.Liu@amd.com>; Christian König

> <ckoenig.leichtzumerken@gmail.com>

> Cc: amd-gfx@lists.freedesktop.org

> Subject: RE: [PATCH] drm/gpu-sched: fix force APP kill hang(v3)

> 

> Hi Monk,

>      Thank you for your review; I will refine the code per your suggestions 1), 2), 3), 4) and 5).

>      About 6): I think it makes no difference whether amdgpu_ctx_mgr_fini is called before or

> after amdgpu_vm_fini.

> 

> Hi Christian,

>      Do you have any thoughts about 6)?

> 

> Best Wishes,

> Emily Deng

> 

> > -----Original Message-----

> > From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf

> > Of monk

> > Sent: Friday, April 13, 2018 1:01 PM

> > To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org

> > Subject: Re: [PATCH] drm/gpu-sched: fix force APP kill hang(v3)

> >

> > Hi Christian & Emily

> >

> > This v3 version looks pretty good to me, but some parts still need

> > improvement:

> >

> > e.g.

> >

> > 1)entity->finished doesn't present what it really means; better to

> > rename it to entity->last_scheduled.

> >

> > 2)drm_sched_entity_fini_job_cb() better renamed to

> > drm_sched_entity_kill_jobs_cb()

> >

> > 3) no need to pass "entity->finished" (line275 of gpu_schedulers.c) to

> > drm_sched_entity_fini_job_cb() if -ENOENT returned after

> > dma_fence_add_callback since this parm is not needed at all in this

> > callback routine

> >

> > 4)better change type of entity->fini_status to "int" instead of

> > "uint32_t" ... it should be aligned with the return type of

> > wait_event_killable()

> >

> > 5)

> >

> > +		if (entity->finished) {

> > +			dma_fence_put(entity->finished);

> > +			entity->finished = NULL;

> >   		}

> >

> > no need to check entity->finished because dma_fence_put() will do it

> inside...

> >

> >

> >

> > and the same here in job_recovery:

> >

> > +

> > +		if (s_job->entity->finished)

> > +			dma_fence_put(s_job->entity->finished);

> >

> > and the same here in sched_main:

> >

> > +		if (entity->finished)

> > +			dma_fence_put(entity->finished);

> >

> >

> > 6) why put amdgpu_ctx_mgr_fini() beneath amdgpu_vm_fini() ? any reason

> > for that ?

> >

> >

> > thanks

> >

> > /Monk

> >

> >

> >

> >

> >

> >

> > On 04/13/2018 10:06 AM, Deng, Emily wrote:

> > > Ping....

> > >

> > > Best Wishes,

> > > Emily Deng

> > >

> > >

> > >

> > >

> > >> -----Original Message-----

> > >> From: Emily Deng [mailto:Emily.Deng@amd.com]

> > >> Sent: Thursday, April 12, 2018 6:22 PM

> > >> To: amd-gfx@lists.freedesktop.org

> > >> Cc: Deng, Emily <Emily.Deng@amd.com>; Liu, Monk

> > <Monk.Liu@amd.com>

> > >> Subject: [PATCH] drm/gpu-sched: fix force APP kill hang(v3)

> > >>

> > >> issue:

> > >> there are VMC page fault occurred if force APP kill during 3dmark

> > >> test, the cause is in entity_fini we manually signal all those jobs

> > >> in entity's queue which confuse the sync/dep

> > >> mechanism:

> > >>

> > >> 1)page fault occurred in sdma's clear job which operate on shadow

> > >> buffer, and shadow buffer's Gart table is cleaned by ttm_bo_release

> > >> since the fence in its reservation was fake signaled by

> > >> entity_fini() under the case of SIGKILL received.

> > >>

> > >> 2)page fault occurred in gfx' job because during the lifetime of

> > >> gfx job we manually fake signal all jobs from its entity in

> > >> entity_fini(), thus the unmapping/clear PTE job depend on those

> > >> result fence is satisfied and sdma start clearing the PTE and lead

> > >> to GFX

> > page fault.

> > >>

> > >> fix:

> > >> 1)should at least wait all jobs already scheduled complete in

> > >> entity_fini() if SIGKILL is the case.

> > >>

> > >> 2)if a fence signaled and try to clear some entity's dependency,

> > >> should set this entity guilty to prevent its job really run since

> > >> the dependency is fake signaled.

> > >>

> > >> v2:

> > >> splitting drm_sched_entity_fini() into two functions:

> > >> 1)The first one is does the waiting, removes the entity from the

> > >> runqueue and returns an error when the process was killed.

> > >> 2)The second one then goes over the entity, install it as

> > >> completion signal for the remaining jobs and signals all jobs with an

> error code.

> > >>

> > >> v3:

> > >> 1)Replace the fini1 and fini2 with better name 2)Call the first

> > >> part before the VM teardown in

> > >> amdgpu_driver_postclose_kms() and the second part after the VM

> > >> teardown 3)Keep the original function drm_sched_entity_fini to

> > >> refine the

> > code.

> > >>

> > >> Signed-off-by: Monk Liu <Monk.Liu@amd.com>

> > >> Signed-off-by: Emily Deng <Emily.Deng@amd.com>

> > >> ---

> > >>   drivers/gpu/drm/amd/amdgpu/amdgpu.h       |  2 +

> > >>   drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c   | 64

> > >> ++++++++++++++++++++++----

> > >>   drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c   |  5 ++-

> > >>   drivers/gpu/drm/scheduler/gpu_scheduler.c | 74

> > >> ++++++++++++++++++++++++++-----

> > >>   include/drm/gpu_scheduler.h               |  7 +++

> > >>   5 files changed, 131 insertions(+), 21 deletions(-)

> > >>

> > >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h

> > >> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

> > >> index 2babfad..200db73 100644

> > >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h

> > >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

> > >> @@ -681,6 +681,8 @@ int amdgpu_ctx_ioctl(struct drm_device *dev,

> > void

> > >> *data,  int amdgpu_ctx_wait_prev_fence(struct amdgpu_ctx *ctx,

> > >> unsigned ring_id);

> > >>

> > >>   void amdgpu_ctx_mgr_init(struct amdgpu_ctx_mgr *mgr);

> > >> +void amdgpu_ctx_mgr_entity_cleanup(struct amdgpu_ctx_mgr *mgr);

> > void

> > >> +amdgpu_ctx_mgr_entity_fini(struct amdgpu_ctx_mgr *mgr);

> > >>   void amdgpu_ctx_mgr_fini(struct amdgpu_ctx_mgr *mgr);

> > >>

> > >>

> > >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c

> > >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c

> > >> index 09d35051..659add4 100644

> > >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c

> > >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c

> > >> @@ -111,8 +111,9 @@ static int amdgpu_ctx_init(struct

> amdgpu_device

> > >> *adev,

> > >>   	return r;

> > >>   }

> > >>

> > >> -static void amdgpu_ctx_fini(struct amdgpu_ctx *ctx)

> > >> +static void amdgpu_ctx_fini(struct kref *ref)

> > >>   {

> > >> +	struct amdgpu_ctx *ctx = container_of(ref, struct amdgpu_ctx,

> > >> +refcount);

> > >>   	struct amdgpu_device *adev = ctx->adev;

> > >>   	unsigned i, j;

> > >>

> > >> @@ -125,13 +126,11 @@ static void amdgpu_ctx_fini(struct

> amdgpu_ctx

> > >> *ctx)

> > >>   	kfree(ctx->fences);

> > >>   	ctx->fences = NULL;

> > >>

> > >> -	for (i = 0; i < adev->num_rings; i++)

> > >> -		drm_sched_entity_fini(&adev->rings[i]->sched,

> > >> -				      &ctx->rings[i].entity);

> > >> -

> > >>   	amdgpu_queue_mgr_fini(adev, &ctx->queue_mgr);

> > >>

> > >>   	mutex_destroy(&ctx->lock);

> > >> +

> > >> +	kfree(ctx);

> > >>   }

> > >>

> > >>   static int amdgpu_ctx_alloc(struct amdgpu_device *adev, @@

> > >> -170,12

> > >> +169,15 @@ static int amdgpu_ctx_alloc(struct amdgpu_device *adev,

> > >> static void amdgpu_ctx_do_release(struct kref *ref)  {

> > >>   	struct amdgpu_ctx *ctx;

> > >> +	u32 i;

> > >>

> > >>   	ctx = container_of(ref, struct amdgpu_ctx, refcount);

> > >>

> > >> -	amdgpu_ctx_fini(ctx);

> > >> +	for (i = 0; i < ctx->adev->num_rings; i++)

> > >> +		drm_sched_entity_fini(&ctx->adev->rings[i]->sched,

> > >> +							&ctx->rings[i].entity);

> > >>

> > >> -	kfree(ctx);

> > >> +	amdgpu_ctx_fini(ref);

> > >>   }

> > >>

> > >>   static int amdgpu_ctx_free(struct amdgpu_fpriv *fpriv, uint32_t

> > >> id) @@ -

> > >> 435,16 +437,62 @@ void amdgpu_ctx_mgr_init(struct amdgpu_ctx_mgr

> > >> *mgr)

> > >>   	idr_init(&mgr->ctx_handles);

> > >>   }

> > >>

> > >> +void amdgpu_ctx_mgr_entity_fini(struct amdgpu_ctx_mgr *mgr) {

> > >> +	struct amdgpu_ctx *ctx;

> > >> +	struct idr *idp;

> > >> +	uint32_t id, i;

> > >> +

> > >> +	idp = &mgr->ctx_handles;

> > >> +

> > >> +	idr_for_each_entry(idp, ctx, id) {

> > >> +

> > >> +		if (!ctx->adev)

> > >> +			return;

> > >> +

> > >> +		for (i = 0; i < ctx->adev->num_rings; i++)

> > >> +			if (kref_read(&ctx->refcount) == 1)

> > >> +				drm_sched_entity_do_release(&ctx->adev-

> > >>> rings[i]->sched,

> > >> +						  &ctx->rings[i].entity);

> > >> +			else

> > >> +				DRM_ERROR("ctx %p is still alive\n", ctx);

> > >> +	}

> > >> +}

> > >> +

> > >> +void amdgpu_ctx_mgr_entity_cleanup(struct amdgpu_ctx_mgr *mgr) {

> > >> +	struct amdgpu_ctx *ctx;

> > >> +	struct idr *idp;

> > >> +	uint32_t id, i;

> > >> +

> > >> +	idp = &mgr->ctx_handles;

> > >> +

> > >> +	idr_for_each_entry(idp, ctx, id) {

> > >> +

> > >> +		if (!ctx->adev)

> > >> +			return;

> > >> +

> > >> +		for (i = 0; i < ctx->adev->num_rings; i++)

> > >> +			if (kref_read(&ctx->refcount) == 1)

> > >> +				drm_sched_entity_cleanup(&ctx->adev-

> > >>> rings[i]->sched,

> > >> +					&ctx->rings[i].entity);

> > >> +			else

> > >> +				DRM_ERROR("ctx %p is still alive\n", ctx);

> > >> +	}

> > >> +}

> > >> +

> > >>   void amdgpu_ctx_mgr_fini(struct amdgpu_ctx_mgr *mgr)  {

> > >>   	struct amdgpu_ctx *ctx;

> > >>   	struct idr *idp;

> > >>   	uint32_t id;

> > >>

> > >> +	amdgpu_ctx_mgr_entity_cleanup(mgr);

> > >> +

> > >>   	idp = &mgr->ctx_handles;

> > >>

> > >>   	idr_for_each_entry(idp, ctx, id) {

> > >> -		if (kref_put(&ctx->refcount, amdgpu_ctx_do_release) != 1)

> > >> +		if (kref_put(&ctx->refcount, amdgpu_ctx_fini) != 1)

> > >>   			DRM_ERROR("ctx %p is still alive\n", ctx);

> > >>   	}

> > >>

> > >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c

> > >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c

> > >> index 487d39e..6cbb427 100644

> > >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c

> > >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c

> > >> @@ -913,8 +913,7 @@ void amdgpu_driver_postclose_kms(struct

> > >> drm_device *dev,

> > >>   		return;

> > >>

> > >>   	pm_runtime_get_sync(dev->dev);

> > >> -

> > >> -	amdgpu_ctx_mgr_fini(&fpriv->ctx_mgr);

> > >> +	amdgpu_ctx_mgr_entity_fini(&fpriv->ctx_mgr);

> > >>

> > >>   	if (adev->asic_type != CHIP_RAVEN) {

> > >>   		amdgpu_uvd_free_handles(adev, file_priv); @@ -935,6

> > >> +934,8 @@ void amdgpu_driver_postclose_kms(struct drm_device *dev,

> > >>   	pd = amdgpu_bo_ref(fpriv->vm.root.base.bo);

> > >>

> > >>   	amdgpu_vm_fini(adev, &fpriv->vm);

> > >> +	amdgpu_ctx_mgr_fini(&fpriv->ctx_mgr);

> > >> +

> > >>   	if (pasid)

> > >>   		amdgpu_pasid_free_delayed(pd->tbo.resv, pasid);

> > >>   	amdgpu_bo_unref(&pd);

> > >> diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler.c

> > >> b/drivers/gpu/drm/scheduler/gpu_scheduler.c

> > >> index 310275e..9062d44 100644

> > >> --- a/drivers/gpu/drm/scheduler/gpu_scheduler.c

> > >> +++ b/drivers/gpu/drm/scheduler/gpu_scheduler.c

> > >> @@ -136,6 +136,8 @@ int drm_sched_entity_init(struct

> > >> drm_gpu_scheduler *sched,

> > >>   	entity->rq = rq;

> > >>   	entity->sched = sched;

> > >>   	entity->guilty = guilty;

> > >> +	entity->fini_status = 0;

> > >> +	entity->finished = NULL;

> > >>

> > >>   	spin_lock_init(&entity->rq_lock);

> > >>   	spin_lock_init(&entity->queue_lock);

> > >> @@ -197,19 +199,30 @@ static bool drm_sched_entity_is_ready(struct

> > >> drm_sched_entity *entity)

> > >>   	return true;

> > >>   }

> > >>

> > >> +static void drm_sched_entity_fini_job_cb(struct dma_fence *f,

> > >> +				    struct dma_fence_cb *cb)

> > >> +{

> > >> +	struct drm_sched_job *job = container_of(cb, struct drm_sched_job,

> > >> +						 finish_cb);

> > >> +	drm_sched_fence_finished(job->s_fence);

> > >> +	WARN_ON(job->s_fence->parent);

> > >> +	dma_fence_put(&job->s_fence->finished);

> > >> +	job->sched->ops->free_job(job);

> > >> +}

> > >> +

> > >> +

> > >>   /**

> > >>    * Destroy a context entity

> > >>    *

> > >>    * @sched       Pointer to scheduler instance

> > >>    * @entity	The pointer to a valid scheduler entity

> > >>    *

> > >> - * Cleanup and free the allocated resources.

> > >> + * Splitting drm_sched_entity_fini() into two functions, The first

> > >> + one is does the waiting,

> > >> + * removes the entity from the runqueue and returns an error when

> > >> + the

> > >> process was killed.

> > >>    */

> > >> -void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,

> > >> +void drm_sched_entity_do_release(struct drm_gpu_scheduler *sched,

> > >>   			   struct drm_sched_entity *entity)

> > >>   {

> > >> -	int r;

> > >> -

> > >>   	if (!drm_sched_entity_is_initialized(sched, entity))

> > >>   		return;

> > >>   	/**

> > >> @@ -217,13 +230,28 @@ void drm_sched_entity_fini(struct

> > >> drm_gpu_scheduler *sched,

> > >>   	 * queued IBs or discard them on SIGKILL

> > >>   	*/

> > >>   	if ((current->flags & PF_SIGNALED) && current->exit_code == SIGKILL)

> > >> -		r = -ERESTARTSYS;

> > >> +		entity->fini_status = -ERESTARTSYS;

> > >>   	else

> > >> -		r = wait_event_killable(sched->job_scheduled,

> > >> +		entity->fini_status = wait_event_killable(sched-

> > >>> job_scheduled,

> > >>   					drm_sched_entity_is_idle(entity));

> > >>   	drm_sched_entity_set_rq(entity, NULL);

> > >> -	if (r) {

> > >> +}

> > >> +EXPORT_SYMBOL(drm_sched_entity_do_release);

> > >> +

> > >> +/**

> > >> + * Destroy a context entity

> > >> + *

> > >> + * @sched       Pointer to scheduler instance

> > >> + * @entity	The pointer to a valid scheduler entity

> > >> + *

> > >> + * The second one then goes over the entity and signals all jobs

> > >> +with an

> > >> error code.

> > >> + */

> > >> +void drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched,

> > >> +			   struct drm_sched_entity *entity) {

> > >> +	if (entity->fini_status) {

> > >>   		struct drm_sched_job *job;

> > >> +		int r;

> > >>

> > >>   		/* Park the kernel for a moment to make sure it isn't

> > processing

> > >>   		 * our enity.

> > >> @@ -241,13 +269,28 @@ void drm_sched_entity_fini(struct

> > >> drm_gpu_scheduler *sched,

> > >>   			struct drm_sched_fence *s_fence = job->s_fence;

> > >>   			drm_sched_fence_scheduled(s_fence);

> > >>   			dma_fence_set_error(&s_fence->finished, -ESRCH);

> > >> -			drm_sched_fence_finished(s_fence);

> > >> -			WARN_ON(s_fence->parent);

> > >> -			dma_fence_put(&s_fence->finished);

> > >> -			sched->ops->free_job(job);

> > >> +			r = dma_fence_add_callback(entity->finished, &job-

> > >>> finish_cb,

> > >> +

> > >> 	drm_sched_entity_fini_job_cb);

> > >> +			if (r == -ENOENT)

> > >> +				drm_sched_entity_fini_job_cb(entity-

> > >>> finished, &job->finish_cb);

> > >> +			else if (r)

> > >> +				DRM_ERROR("fence add callback failed

> > >> (%d)\n", r);

> > >> +		}

> > >> +

> > >> +		if (entity->finished) {

> > >> +			dma_fence_put(entity->finished);

> > >> +			entity->finished = NULL;

> > >>   		}

> > >>   	}

> > >>   }

> > >> +EXPORT_SYMBOL(drm_sched_entity_cleanup);

> > >> +

> > >> +void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,

> > >> +				struct drm_sched_entity *entity) {

> > >> +	drm_sched_entity_do_release(sched, entity);

> > >> +	drm_sched_entity_cleanup(sched, entity); }

> > >>   EXPORT_SYMBOL(drm_sched_entity_fini);

> > >>

> > >>   static void drm_sched_entity_wakeup(struct dma_fence *f, struct

> > >> dma_fence_cb *cb) @@ -530,6 +573,11 @@ void

> > >> drm_sched_job_recovery(struct drm_gpu_scheduler *sched)

> > >>   		spin_unlock(&sched->job_list_lock);

> > >>   		fence = sched->ops->run_job(s_job);

> > >>   		atomic_inc(&sched->hw_rq_count);

> > >> +

> > >> +		if (s_job->entity->finished)

> > >> +			dma_fence_put(s_job->entity->finished);

> > >> +		s_job->entity->finished = dma_fence_get(&s_fence-

> > >>> finished);

> > >> +

> > >>   		if (fence) {

> > >>   			s_fence->parent = dma_fence_get(fence);

> > >>   			r = dma_fence_add_callback(fence, &s_fence->cb,

> > @@ -556,6 +604,7

> > >> @@ int drm_sched_job_init(struct drm_sched_job *job,

> > >>   		       void *owner)

> > >>   {

> > >>   	job->sched = sched;

> > >> +	job->entity = entity;

> > >>   	job->s_priority = entity->rq - sched->sched_rq;

> > >>   	job->s_fence = drm_sched_fence_create(entity, owner);

> > >>   	if (!job->s_fence)

> > >> @@ -668,6 +717,9 @@ static int drm_sched_main(void *param)

> > >>

> > >>   		fence = sched->ops->run_job(sched_job);

> > >>   		drm_sched_fence_scheduled(s_fence);

> > >> +		if (entity->finished)

> > >> +			dma_fence_put(entity->finished);

> > >> +		entity->finished = dma_fence_get(&s_fence->finished);

> > >>

> > >>   		if (fence) {

> > >>   			s_fence->parent = dma_fence_get(fence); diff --git

> > >> a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h index

> > >> c053a32..a0dd947 100644

> > >> --- a/include/drm/gpu_scheduler.h

> > >> +++ b/include/drm/gpu_scheduler.h

> > >> @@ -65,6 +65,8 @@ struct drm_sched_entity {

> > >>   	struct dma_fence		*dependency;

> > >>   	struct dma_fence_cb		cb;

> > >>   	atomic_t			*guilty; /* points to ctx's guilty */

> > >> +	uint32_t            fini_status;

> > >> +	struct dma_fence    *finished;

> > >>   };

> > >>

> > >>   /**

> > >> @@ -119,6 +121,7 @@ struct drm_sched_job {

> > >>   	uint64_t			id;

> > >>   	atomic_t			karma;

> > >>   	enum drm_sched_priority		s_priority;

> > >> +	struct drm_sched_entity  *entity;

> > >>   };

> > >>

> > >>   static inline bool drm_sched_invalidate_job(struct drm_sched_job

> > >> *s_job, @@ -186,6 +189,10 @@ int drm_sched_entity_init(struct

> > >> drm_gpu_scheduler *sched,

> > >>   			  struct drm_sched_entity *entity,

> > >>   			  struct drm_sched_rq *rq,

> > >>   			  uint32_t jobs, atomic_t *guilty);

> > >> +void drm_sched_entity_do_release(struct drm_gpu_scheduler *sched,

> > >> +			   struct drm_sched_entity *entity); void

> > >> +drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched,

> > >> +			   struct drm_sched_entity *entity);

> > >>   void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,

> > >>   			   struct drm_sched_entity *entity);  void

> > >> drm_sched_entity_push_job(struct drm_sched_job *sched_job,

> > >> --

> > >> 2.7.4

> >

> > _______________________________________________

> > amd-gfx mailing list

> > amd-gfx@lists.freedesktop.org

> > https://lists.freedesktop.org/mailman/listinfo/amd-gfx

> _______________________________________________

> amd-gfx mailing list

> amd-gfx@lists.freedesktop.org

> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Hi Monk/Emily,

Give me the weekend to take a closer look, since I'm very busy this morning.

In general, the order of ctx_fini and vm_fini is very important, because we
otherwise dereference invalid pointers here.

Regards,
Christian.

Am 13.04.2018 um 08:18 schrieb Deng, Emily:
> Hi Monk,
>      Another consideration, it will be better to put amdgpu_ctx_mgr_fini() beneath amdgpu_vm_fini, in
> this case, it will first set all ctx and vm entity rq to null, then set all the not scheduled job's fence to signal.
>
> Best Wishes,
> Emily Deng
>
>> -----Original Message-----
>> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf
>> Of Deng, Emily
>> Sent: Friday, April 13, 2018 2:08 PM
>> To: Liu, Monk <Monk.Liu@amd.com>; Christian König
>> <ckoenig.leichtzumerken@gmail.com>
>> Cc: amd-gfx@lists.freedesktop.org
>> Subject: RE: [PATCH] drm/gpu-sched: fix force APP kill hang(v3)
>>
>> Hi Monk,
>>       Thanks your review, I will refine the code as your suggestion 1),2),3), 4),5).
>>       About 6): I think it is no relation to put the amdgpu_ctx_mgr_fini before or
>> after amdgpu_vm_fini.
>>
>> Hi Christian,
>>       Do you have any thoughts about 6)?
>>
>> Best Wishes,
>> Emily Deng
>>
>>> -----Original Message-----
>>> From: amd-gfx [mailto:amd-gfx-bounces@lists.freedesktop.org] On Behalf
>>> Of monk
>>> Sent: Friday, April 13, 2018 1:01 PM
>>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>>> Subject: Re: [PATCH] drm/gpu-sched: fix force APP kill hang(v3)
>>>
>>> Hi Christian & Emily
>>>
>>> This v3 version looks pretty good to me, but still some parts need to
>>> improve:
>>>
>>> e.g.
>>>
>>> 1)entity->finished doesn't presenting what it really means, better
>>> rename it to entity->last_scheduled.
>>>
>>> 2)drm_sched_entity_fini_job_cb() better renamed to
>>> drm_sched_entity_kill_jobs_cb()
>>>
>>> 3) no need to pass "entity->finished" (line275 of gpu_schedulers.c) to
>>> drm_sched_entity_fini_job_cb() if -ENOENT returned after
>>> dma_fence_add_callback since this parm is not needed at all in this
>>> callback routine
>>>
>>> 4)better change type of entity->fini_status to "int" instead of
>>> "uint32_t" ... it should be aligned with the return type of
>>> wait_event_killable()
>>>
>>> 5)
>>>
>>> +		if (entity->finished) {
>>> +			dma_fence_put(entity->finished);
>>> +			entity->finished = NULL;
>>>    		}
>>>
>>> no need to check entity->finished because dma_fence_put() will do it
>> inside...
>>>
>>>
>>> and the same here in job_recovery:
>>>
>>> +
>>> +		if (s_job->entity->finished)
>>> +			dma_fence_put(s_job->entity->finished);
>>>
>>> and the same here in sched_main:
>>>
>>> +		if (entity->finished)
>>> +			dma_fence_put(entity->finished);
>>>
>>>
>>> 6) why put amdgpu_ctx_mgr_fini() beneath amdgpu_vm_fini() ? any reason
>>> for that ?
>>>
>>>
>>> thanks
>>>
>>> /Monk
>>>
>>>
>>>
>>>
>>>
>>>
>>> On 04/13/2018 10:06 AM, Deng, Emily wrote:
>>>> Ping....
>>>>
>>>> Best Wishes,
>>>> Emily Deng
>>>>
>>>>
>>>>
>>>>
>>>>> -----Original Message-----
>>>>> From: Emily Deng [mailto:Emily.Deng@amd.com]
>>>>> Sent: Thursday, April 12, 2018 6:22 PM
>>>>> To: amd-gfx@lists.freedesktop.org
>>>>> Cc: Deng, Emily <Emily.Deng@amd.com>; Liu, Monk
>>> <Monk.Liu@amd.com>
>>>>> Subject: [PATCH] drm/gpu-sched: fix force APP kill hang(v3)
>>>>>
>>>>> issue:
>>>>> there are VMC page fault occurred if force APP kill during 3dmark
>>>>> test, the cause is in entity_fini we manually signal all those jobs
>>>>> in entity's queue which confuse the sync/dep
>>>>> mechanism:
>>>>>
>>>>> 1)page fault occurred in sdma's clear job which operate on shadow
>>>>> buffer, and shadow buffer's Gart table is cleaned by ttm_bo_release
>>>>> since the fence in its reservation was fake signaled by
>>>>> entity_fini() under the case of SIGKILL received.
>>>>>
>>>>> 2)page fault occurred in gfx' job because during the lifetime of
>>>>> gfx job we manually fake signal all jobs from its entity in
>>>>> entity_fini(), thus the unmapping/clear PTE job depend on those
>>>>> result fence is satisfied and sdma start clearing the PTE and lead
>>>>> to GFX
>>> page fault.
>>>>> fix:
>>>>> 1)should at least wait all jobs already scheduled complete in
>>>>> entity_fini() if SIGKILL is the case.
>>>>>
>>>>> 2)if a fence signaled and try to clear some entity's dependency,
>>>>> should set this entity guilty to prevent its job really run since
>>>>> the dependency is fake signaled.
>>>>>
>>>>> v2:
>>>>> splitting drm_sched_entity_fini() into two functions:
>>>>> 1)The first one is does the waiting, removes the entity from the
>>>>> runqueue and returns an error when the process was killed.
>>>>> 2)The second one then goes over the entity, install it as
>>>>> completion signal for the remaining jobs and signals all jobs with an
>> error code.
>>>>> v3:
>>>>> 1)Replace the fini1 and fini2 with better name 2)Call the first
>>>>> part before the VM teardown in
>>>>> amdgpu_driver_postclose_kms() and the second part after the VM
>>>>> teardown 3)Keep the original function drm_sched_entity_fini to
>>>>> refine the
>>> code.
>>>>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>>> ---
>>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h       |  2 +
>>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c   | 64
>>>>> ++++++++++++++++++++++----
>>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c   |  5 ++-
>>>>>    drivers/gpu/drm/scheduler/gpu_scheduler.c | 74
>>>>> ++++++++++++++++++++++++++-----
>>>>>    include/drm/gpu_scheduler.h               |  7 +++
>>>>>    5 files changed, 131 insertions(+), 21 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>> index 2babfad..200db73 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>>> @@ -681,6 +681,8 @@ int amdgpu_ctx_ioctl(struct drm_device *dev,
>>> void
>>>>> *data,  int amdgpu_ctx_wait_prev_fence(struct amdgpu_ctx *ctx,
>>>>> unsigned ring_id);
>>>>>
>>>>>    void amdgpu_ctx_mgr_init(struct amdgpu_ctx_mgr *mgr);
>>>>> +void amdgpu_ctx_mgr_entity_cleanup(struct amdgpu_ctx_mgr *mgr);
>>> void
>>>>> +amdgpu_ctx_mgr_entity_fini(struct amdgpu_ctx_mgr *mgr);
>>>>>    void amdgpu_ctx_mgr_fini(struct amdgpu_ctx_mgr *mgr);
>>>>>
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>>>>> index 09d35051..659add4 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>>>>> @@ -111,8 +111,9 @@ static int amdgpu_ctx_init(struct
>> amdgpu_device
>>>>> *adev,
>>>>>    	return r;
>>>>>    }
>>>>>
>>>>> -static void amdgpu_ctx_fini(struct amdgpu_ctx *ctx)
>>>>> +static void amdgpu_ctx_fini(struct kref *ref)
>>>>>    {
>>>>> +	struct amdgpu_ctx *ctx = container_of(ref, struct amdgpu_ctx,
>>>>> +refcount);
>>>>>    	struct amdgpu_device *adev = ctx->adev;
>>>>>    	unsigned i, j;
>>>>>
>>>>> @@ -125,13 +126,11 @@ static void amdgpu_ctx_fini(struct
>> amdgpu_ctx
>>>>> *ctx)
>>>>>    	kfree(ctx->fences);
>>>>>    	ctx->fences = NULL;
>>>>>
>>>>> -	for (i = 0; i < adev->num_rings; i++)
>>>>> -		drm_sched_entity_fini(&adev->rings[i]->sched,
>>>>> -				      &ctx->rings[i].entity);
>>>>> -
>>>>>    	amdgpu_queue_mgr_fini(adev, &ctx->queue_mgr);
>>>>>
>>>>>    	mutex_destroy(&ctx->lock);
>>>>> +
>>>>> +	kfree(ctx);
>>>>>    }
>>>>>
>>>>>    static int amdgpu_ctx_alloc(struct amdgpu_device *adev, @@
>>>>> -170,12
>>>>> +169,15 @@ static int amdgpu_ctx_alloc(struct amdgpu_device *adev,
>>>>> static void amdgpu_ctx_do_release(struct kref *ref)  {
>>>>>    	struct amdgpu_ctx *ctx;
>>>>> +	u32 i;
>>>>>
>>>>>    	ctx = container_of(ref, struct amdgpu_ctx, refcount);
>>>>>
>>>>> -	amdgpu_ctx_fini(ctx);
>>>>> +	for (i = 0; i < ctx->adev->num_rings; i++)
>>>>> +		drm_sched_entity_fini(&ctx->adev->rings[i]->sched,
>>>>> +							&ctx->rings[i].entity);
>>>>>
>>>>> -	kfree(ctx);
>>>>> +	amdgpu_ctx_fini(ref);
>>>>>    }
>>>>>
>>>>>    static int amdgpu_ctx_free(struct amdgpu_fpriv *fpriv, uint32_t
>>>>> id) @@ -
>>>>> 435,16 +437,62 @@ void amdgpu_ctx_mgr_init(struct amdgpu_ctx_mgr
>>>>> *mgr)
>>>>>    	idr_init(&mgr->ctx_handles);
>>>>>    }
>>>>>
>>>>> +void amdgpu_ctx_mgr_entity_fini(struct amdgpu_ctx_mgr *mgr) {
>>>>> +	struct amdgpu_ctx *ctx;
>>>>> +	struct idr *idp;
>>>>> +	uint32_t id, i;
>>>>> +
>>>>> +	idp = &mgr->ctx_handles;
>>>>> +
>>>>> +	idr_for_each_entry(idp, ctx, id) {
>>>>> +
>>>>> +		if (!ctx->adev)
>>>>> +			return;
>>>>> +
>>>>> +		for (i = 0; i < ctx->adev->num_rings; i++)
>>>>> +			if (kref_read(&ctx->refcount) == 1)
>>>>> +				drm_sched_entity_do_release(&ctx->adev-
>>>>>> rings[i]->sched,
>>>>> +						  &ctx->rings[i].entity);
>>>>> +			else
>>>>> +				DRM_ERROR("ctx %p is still alive\n", ctx);
>>>>> +	}
>>>>> +}
>>>>> +
>>>>> +void amdgpu_ctx_mgr_entity_cleanup(struct amdgpu_ctx_mgr *mgr) {
>>>>> +	struct amdgpu_ctx *ctx;
>>>>> +	struct idr *idp;
>>>>> +	uint32_t id, i;
>>>>> +
>>>>> +	idp = &mgr->ctx_handles;
>>>>> +
>>>>> +	idr_for_each_entry(idp, ctx, id) {
>>>>> +
>>>>> +		if (!ctx->adev)
>>>>> +			return;
>>>>> +
>>>>> +		for (i = 0; i < ctx->adev->num_rings; i++)
>>>>> +			if (kref_read(&ctx->refcount) == 1)
>>>>> +				drm_sched_entity_cleanup(&ctx->adev-
>>>>>> rings[i]->sched,
>>>>> +					&ctx->rings[i].entity);
>>>>> +			else
>>>>> +				DRM_ERROR("ctx %p is still alive\n", ctx);
>>>>> +	}
>>>>> +}
>>>>> +
>>>>>    void amdgpu_ctx_mgr_fini(struct amdgpu_ctx_mgr *mgr)  {
>>>>>    	struct amdgpu_ctx *ctx;
>>>>>    	struct idr *idp;
>>>>>    	uint32_t id;
>>>>>
>>>>> +	amdgpu_ctx_mgr_entity_cleanup(mgr);
>>>>> +
>>>>>    	idp = &mgr->ctx_handles;
>>>>>
>>>>>    	idr_for_each_entry(idp, ctx, id) {
>>>>> -		if (kref_put(&ctx->refcount, amdgpu_ctx_do_release) != 1)
>>>>> +		if (kref_put(&ctx->refcount, amdgpu_ctx_fini) != 1)
>>>>>    			DRM_ERROR("ctx %p is still alive\n", ctx);
>>>>>    	}
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>>>>> index 487d39e..6cbb427 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>>>>> @@ -913,8 +913,7 @@ void amdgpu_driver_postclose_kms(struct
>>>>> drm_device *dev,
>>>>>    		return;
>>>>>
>>>>>    	pm_runtime_get_sync(dev->dev);
>>>>> -
>>>>> -	amdgpu_ctx_mgr_fini(&fpriv->ctx_mgr);
>>>>> +	amdgpu_ctx_mgr_entity_fini(&fpriv->ctx_mgr);
>>>>>
>>>>>    	if (adev->asic_type != CHIP_RAVEN) {
>>>>>    		amdgpu_uvd_free_handles(adev, file_priv); @@ -935,6
>>>>> +934,8 @@ void amdgpu_driver_postclose_kms(struct drm_device *dev,
>>>>>    	pd = amdgpu_bo_ref(fpriv->vm.root.base.bo);
>>>>>
>>>>>    	amdgpu_vm_fini(adev, &fpriv->vm);
>>>>> +	amdgpu_ctx_mgr_fini(&fpriv->ctx_mgr);
>>>>> +
>>>>>    	if (pasid)
>>>>>    		amdgpu_pasid_free_delayed(pd->tbo.resv, pasid);
>>>>>    	amdgpu_bo_unref(&pd);
>>>>> diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler.c
>>>>> b/drivers/gpu/drm/scheduler/gpu_scheduler.c
>>>>> index 310275e..9062d44 100644
>>>>> --- a/drivers/gpu/drm/scheduler/gpu_scheduler.c
>>>>> +++ b/drivers/gpu/drm/scheduler/gpu_scheduler.c
>>>>> @@ -136,6 +136,8 @@ int drm_sched_entity_init(struct
>>>>> drm_gpu_scheduler *sched,
>>>>>    	entity->rq = rq;
>>>>>    	entity->sched = sched;
>>>>>    	entity->guilty = guilty;
>>>>> +	entity->fini_status = 0;
>>>>> +	entity->finished = NULL;
>>>>>
>>>>>    	spin_lock_init(&entity->rq_lock);
>>>>>    	spin_lock_init(&entity->queue_lock);
>>>>> @@ -197,19 +199,30 @@ static bool drm_sched_entity_is_ready(struct
>>>>> drm_sched_entity *entity)
>>>>>    	return true;
>>>>>    }
>>>>>
>>>>> +static void drm_sched_entity_fini_job_cb(struct dma_fence *f,
>>>>> +				    struct dma_fence_cb *cb)
>>>>> +{
>>>>> +	struct drm_sched_job *job = container_of(cb, struct drm_sched_job,
>>>>> +						 finish_cb);
>>>>> +	drm_sched_fence_finished(job->s_fence);
>>>>> +	WARN_ON(job->s_fence->parent);
>>>>> +	dma_fence_put(&job->s_fence->finished);
>>>>> +	job->sched->ops->free_job(job);
>>>>> +}
>>>>> +
>>>>> +
>>>>>    /**
>>>>>     * Destroy a context entity
>>>>>     *
>>>>>     * @sched       Pointer to scheduler instance
>>>>>     * @entity	The pointer to a valid scheduler entity
>>>>>     *
>>>>> - * Cleanup and free the allocated resources.
>>>>> + * Splitting drm_sched_entity_fini() into two functions, The first
>>>>> + one is does the waiting,
>>>>> + * removes the entity from the runqueue and returns an error when
>>>>> + the
>>>>> process was killed.
>>>>>     */
>>>>> -void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
>>>>> +void drm_sched_entity_do_release(struct drm_gpu_scheduler *sched,
>>>>>    			   struct drm_sched_entity *entity)
>>>>>    {
>>>>> -	int r;
>>>>> -
>>>>>    	if (!drm_sched_entity_is_initialized(sched, entity))
>>>>>    		return;
>>>>>    	/**
>>>>> @@ -217,13 +230,28 @@ void drm_sched_entity_fini(struct
>>>>> drm_gpu_scheduler *sched,
>>>>>    	 * queued IBs or discard them on SIGKILL
>>>>>    	*/
>>>>>    	if ((current->flags & PF_SIGNALED) && current->exit_code == SIGKILL)
>>>>> -		r = -ERESTARTSYS;
>>>>> +		entity->fini_status = -ERESTARTSYS;
>>>>>    	else
>>>>> -		r = wait_event_killable(sched->job_scheduled,
>>>>> +		entity->fini_status = wait_event_killable(sched-
>>>>>> job_scheduled,
>>>>>    					drm_sched_entity_is_idle(entity));
>>>>>    	drm_sched_entity_set_rq(entity, NULL);
>>>>> -	if (r) {
>>>>> +}
>>>>> +EXPORT_SYMBOL(drm_sched_entity_do_release);
>>>>> +
>>>>> +/**
>>>>> + * Destroy a context entity
>>>>> + *
>>>>> + * @sched       Pointer to scheduler instance
>>>>> + * @entity	The pointer to a valid scheduler entity
>>>>> + *
>>>>> + * The second one then goes over the entity and signals all jobs
>>>>> +with an
>>>>> error code.
>>>>> + */
>>>>> +void drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched,
>>>>> +			   struct drm_sched_entity *entity) {
>>>>> +	if (entity->fini_status) {
>>>>>    		struct drm_sched_job *job;
>>>>> +		int r;
>>>>>
>>>>>    		/* Park the kernel for a moment to make sure it isn't
>>> processing
>>>>>    		 * our enity.
>>>>> @@ -241,13 +269,28 @@ void drm_sched_entity_fini(struct
>>>>> drm_gpu_scheduler *sched,
>>>>>    			struct drm_sched_fence *s_fence = job->s_fence;
>>>>>    			drm_sched_fence_scheduled(s_fence);
>>>>>    			dma_fence_set_error(&s_fence->finished, -ESRCH);
>>>>> -			drm_sched_fence_finished(s_fence);
>>>>> -			WARN_ON(s_fence->parent);
>>>>> -			dma_fence_put(&s_fence->finished);
>>>>> -			sched->ops->free_job(job);
>>>>> +			r = dma_fence_add_callback(entity->finished, &job-
>>>>>> finish_cb,
>>>>> +
>>>>> 	drm_sched_entity_fini_job_cb);
>>>>> +			if (r == -ENOENT)
>>>>> +				drm_sched_entity_fini_job_cb(entity-
>>>>>> finished, &job->finish_cb);
>>>>> +			else if (r)
>>>>> +				DRM_ERROR("fence add callback failed
>>>>> (%d)\n", r);
>>>>> +		}
>>>>> +
>>>>> +		if (entity->finished) {
>>>>> +			dma_fence_put(entity->finished);
>>>>> +			entity->finished = NULL;
>>>>>    		}
>>>>>    	}
>>>>>    }
>>>>> +EXPORT_SYMBOL(drm_sched_entity_cleanup);
>>>>> +
>>>>> +void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
>>>>> +				struct drm_sched_entity *entity) {
>>>>> +	drm_sched_entity_do_release(sched, entity);
>>>>> +	drm_sched_entity_cleanup(sched, entity); }
>>>>>    EXPORT_SYMBOL(drm_sched_entity_fini);
>>>>>
>>>>>    static void drm_sched_entity_wakeup(struct dma_fence *f, struct
>>>>> dma_fence_cb *cb) @@ -530,6 +573,11 @@ void
>>>>> drm_sched_job_recovery(struct drm_gpu_scheduler *sched)
>>>>>    		spin_unlock(&sched->job_list_lock);
>>>>>    		fence = sched->ops->run_job(s_job);
>>>>>    		atomic_inc(&sched->hw_rq_count);
>>>>> +
>>>>> +		if (s_job->entity->finished)
>>>>> +			dma_fence_put(s_job->entity->finished);
>>>>> +		s_job->entity->finished = dma_fence_get(&s_fence-
>>>>>> finished);
>>>>> +
>>>>>    		if (fence) {
>>>>>    			s_fence->parent = dma_fence_get(fence);
>>>>>    			r = dma_fence_add_callback(fence, &s_fence->cb,
>>> @@ -556,6 +604,7
>>>>> @@ int drm_sched_job_init(struct drm_sched_job *job,
>>>>>    		       void *owner)
>>>>>    {
>>>>>    	job->sched = sched;
>>>>> +	job->entity = entity;
>>>>>    	job->s_priority = entity->rq - sched->sched_rq;
>>>>>    	job->s_fence = drm_sched_fence_create(entity, owner);
>>>>>    	if (!job->s_fence)
>>>>> @@ -668,6 +717,9 @@ static int drm_sched_main(void *param)
>>>>>
>>>>>    		fence = sched->ops->run_job(sched_job);
>>>>>    		drm_sched_fence_scheduled(s_fence);
>>>>> +		if (entity->finished)
>>>>> +			dma_fence_put(entity->finished);
>>>>> +		entity->finished = dma_fence_get(&s_fence->finished);
>>>>>
>>>>>    		if (fence) {
>>>>>    			s_fence->parent = dma_fence_get(fence); diff --git
>>>>> a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h index
>>>>> c053a32..a0dd947 100644
>>>>> --- a/include/drm/gpu_scheduler.h
>>>>> +++ b/include/drm/gpu_scheduler.h
>>>>> @@ -65,6 +65,8 @@ struct drm_sched_entity {
>>>>>    	struct dma_fence		*dependency;
>>>>>    	struct dma_fence_cb		cb;
>>>>>    	atomic_t			*guilty; /* points to ctx's guilty */
>>>>> +	uint32_t            fini_status;
>>>>> +	struct dma_fence    *finished;
>>>>>    };
>>>>>
>>>>>    /**
>>>>> @@ -119,6 +121,7 @@ struct drm_sched_job {
>>>>>    	uint64_t			id;
>>>>>    	atomic_t			karma;
>>>>>    	enum drm_sched_priority		s_priority;
>>>>> +	struct drm_sched_entity  *entity;
>>>>>    };
>>>>>
>>>>>    static inline bool drm_sched_invalidate_job(struct drm_sched_job
>>>>> *s_job, @@ -186,6 +189,10 @@ int drm_sched_entity_init(struct
>>>>> drm_gpu_scheduler *sched,
>>>>>    			  struct drm_sched_entity *entity,
>>>>>    			  struct drm_sched_rq *rq,
>>>>>    			  uint32_t jobs, atomic_t *guilty);
>>>>> +void drm_sched_entity_do_release(struct drm_gpu_scheduler *sched,
>>>>> +			   struct drm_sched_entity *entity); void
>>>>> +drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched,
>>>>> +			   struct drm_sched_entity *entity);
>>>>>    void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
>>>>>    			   struct drm_sched_entity *entity);  void
>>>>> drm_sched_entity_push_job(struct drm_sched_job *sched_job,
>>>>> --
>>>>> 2.7.4
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx