[v7,05/20] drm/i915: Cancel reset-engine if we couldn't find an active request

Submitted by Michel Thierry on April 27, 2017, 11:12 p.m.

Details

Message ID 20170427231300.32841-6-michel.thierry@intel.com
State New
Headers show
Series "Gen8+ engine-reset" ( rev: 6 5 4 ) in Intel GFX

Browsing this patch as part of:
"Gen8+ engine-reset" rev 4 in Intel GFX
<< prev patch [5/20] next patch >>

Commit Message

Michel Thierry April 27, 2017, 11:12 p.m.
Before reseting an engine, check if there is an active request, and if
the _hung_ request has completed. In these two cases, the seqno has moved
after hang declaration and we can skip the reset.

Also store the active request so that we only search for it once.

Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Michel Thierry <michel.thierry@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.c | 37 +++++++++++++++++++++++++++++--------
 drivers/gpu/drm/i915/i915_drv.h |  6 ++++--
 drivers/gpu/drm/i915/i915_gem.c | 37 ++++++++++++++++++++++++-------------
 3 files changed, 57 insertions(+), 23 deletions(-)

Patch hide | download patch | download mbox

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index ae891529dedd..a64e9b63cdbc 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1811,7 +1811,7 @@  void i915_reset(struct drm_i915_private *dev_priv)
 	pr_notice("drm/i915: Resetting chip after gpu hang\n");
 	disable_irq(dev_priv->drm.irq);
 	ret = i915_gem_reset_prepare(dev_priv, ALL_ENGINES);
-	if (ret) {
+	if (ret == -EIO) {
 		DRM_ERROR("GPU recovery failed\n");
 		intel_gpu_reset(dev_priv, ALL_ENGINES);
 		goto error;
@@ -1883,23 +1883,40 @@  int i915_reset_engine(struct intel_engine_cs *engine)
 	int ret;
 	struct drm_i915_private *dev_priv = engine->i915;
 	struct i915_gpu_error *error = &dev_priv->gpu_error;
+	struct drm_i915_gem_request *active_request;
 
 	GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));
 
 	DRM_DEBUG_DRIVER("resetting %s\n", engine->name);
 
-	ret = i915_gem_reset_prepare_engine(engine);
-	if (ret) {
-		DRM_ERROR("Previous reset failed - promote to full reset\n");
-		goto out;
+	active_request = i915_gem_reset_prepare_engine(engine);
+	if (!active_request) {
+		DRM_DEBUG_DRIVER("seqno moved after hang declaration, pardoned\n");
+		goto canceled;
+	}
+	if (IS_ERR(active_request)) {
+		ret = PTR_ERR(active_request);
+		if (ret == -ECANCELED) {
+			DRM_DEBUG_DRIVER("no active request found, skip reset\n");
+			goto canceled;
+		} else if (ret) {
+			DRM_DEBUG_DRIVER("Previous reset failed, promote to full reset\n");
+			goto out;
+		}
 	}
 
+	if (__i915_gem_request_completed(active_request, engine->hangcheck.seqno)) {
+		DRM_DEBUG_DRIVER("request completed, skip the reset\n");
+		goto canceled;
+	}
+
+
 	/*
-	 * the request that caused the hang is stuck on elsp, identify the
-	 * active request and drop it, adjust head to skip the offending
+	 * the request that caused the hang is stuck on elsp, we know the
+	 * active request and can drop it, adjust head to skip the offending
 	 * request to resume executing remaining requests in the queue.
 	 */
-	i915_gem_reset_engine(engine);
+	i915_gem_reset_engine(engine, active_request);
 
 	/* forcing engine to idle */
 	ret = intel_reset_engine_start(engine);
@@ -1928,6 +1945,10 @@  int i915_reset_engine(struct intel_engine_cs *engine)
 
 out:
 	return ret;
+
+canceled:
+	i915_gem_reset_finish_engine(engine);
+	return 0;
 }
 
 static int i915_pm_suspend(struct device *kdev)
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index efbf34318893..8e93189c2104 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3439,7 +3439,8 @@  static inline u32 i915_reset_count(struct i915_gpu_error *error)
 	return READ_ONCE(error->reset_count);
 }
 
-int i915_gem_reset_prepare_engine(struct intel_engine_cs *engine);
+struct drm_i915_gem_request *
+i915_gem_reset_prepare_engine(struct intel_engine_cs *engine);
 int i915_gem_reset_prepare(struct drm_i915_private *dev_priv,
 			   unsigned int engine_mask);
 void i915_gem_reset(struct drm_i915_private *dev_priv);
@@ -3448,7 +3449,8 @@  void i915_gem_reset_finish(struct drm_i915_private *dev_priv,
 			   unsigned int engine_mask);
 void i915_gem_set_wedged(struct drm_i915_private *dev_priv);
 bool i915_gem_unset_wedged(struct drm_i915_private *dev_priv);
-void i915_gem_reset_engine(struct intel_engine_cs *engine);
+void i915_gem_reset_engine(struct intel_engine_cs *engine,
+			   struct drm_i915_gem_request *request);
 
 void i915_gem_init_mmio(struct drm_i915_private *i915);
 int __must_check i915_gem_init(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index bce38062f94e..4e357d333cc2 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2793,12 +2793,15 @@  static bool engine_stalled(struct intel_engine_cs *engine)
 	return true;
 }
 
-/* Ensure irq handler finishes, and not run again. */
-int i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
+/*
+ * Ensure irq handler finishes, and not run again.
+ * For reset-engine we also store the active request so that we only search
+ * for it once.
+ */
+struct drm_i915_gem_request *
+i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
 {
-	struct drm_i915_gem_request *request;
-	int err = 0;
-
+	struct drm_i915_gem_request *request = NULL;
 
 	/* Prevent the signaler thread from updating the request
 	 * state (by calling dma_fence_signal) as we are processing
@@ -2827,22 +2830,29 @@  int i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
 
 	if (engine_stalled(engine)) {
 		request = i915_gem_find_active_request(engine);
+		if (!request)
+			return ERR_PTR(-ECANCELED); /* Can't find a request, abort! */
+
 		if (request && request->fence.error == -EIO)
-			err = -EIO; /* Previous reset failed! */
+			return ERR_PTR(-EIO); /* Previous reset failed! */
 	}
 
-	return err;
+	return request;
 }
 
 int i915_gem_reset_prepare(struct drm_i915_private *dev_priv,
 			   unsigned int engine_mask)
 {
 	struct intel_engine_cs *engine;
+	struct drm_i915_gem_request *request;
 	unsigned int tmp;
 	int err = 0;
 
-	for_each_engine_masked(engine, dev_priv, engine_mask, tmp)
-		err = i915_gem_reset_prepare_engine(engine);
+	for_each_engine_masked(engine, dev_priv, engine_mask, tmp) {
+		request = i915_gem_reset_prepare_engine(engine);
+		if (request && IS_ERR(request))
+			err = PTR_ERR(request);
+	}
 
 	i915_gem_revoke_fences(dev_priv);
 
@@ -2929,11 +2939,12 @@  static bool i915_gem_reset_request(struct drm_i915_gem_request *request)
 	return guilty;
 }
 
-void i915_gem_reset_engine(struct intel_engine_cs *engine)
+void i915_gem_reset_engine(struct intel_engine_cs *engine,
+			   struct drm_i915_gem_request *request)
 {
-	struct drm_i915_gem_request *request;
+	if (!request)
+		request = i915_gem_find_active_request(engine);
 
-	request = i915_gem_find_active_request(engine);
 	if (request && i915_gem_reset_request(request)) {
 		DRM_DEBUG_DRIVER("resetting %s to restart from tail of request 0x%x\n",
 				 engine->name, request->global_seqno);
@@ -2959,7 +2970,7 @@  void i915_gem_reset(struct drm_i915_private *dev_priv)
 	for_each_engine(engine, dev_priv, id) {
 		struct i915_gem_context *ctx;
 
-		i915_gem_reset_engine(engine);
+		i915_gem_reset_engine(engine, NULL);
 		ctx = fetch_and_zero(&engine->last_retired_context);
 		if (ctx)
 			engine->context_unpin(engine, ctx);

Comments

On Thu, Apr 27, 2017 at 04:12:45PM -0700, Michel Thierry wrote:
> Before reseting an engine, check if there is an active request, and if
> the _hung_ request has completed. In these two cases, the seqno has moved
> after hang declaration and we can skip the reset.
> 
> Also store the active request so that we only search for it once.
> 
> Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
> Signed-off-by: Michel Thierry <michel.thierry@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.c | 37 +++++++++++++++++++++++++++++--------
>  drivers/gpu/drm/i915/i915_drv.h |  6 ++++--
>  drivers/gpu/drm/i915/i915_gem.c | 37 ++++++++++++++++++++++++-------------
>  3 files changed, 57 insertions(+), 23 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index ae891529dedd..a64e9b63cdbc 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -1811,7 +1811,7 @@ void i915_reset(struct drm_i915_private *dev_priv)
>  	pr_notice("drm/i915: Resetting chip after gpu hang\n");
>  	disable_irq(dev_priv->drm.irq);
>  	ret = i915_gem_reset_prepare(dev_priv, ALL_ENGINES);
> -	if (ret) {
> +	if (ret == -EIO) {
>  		DRM_ERROR("GPU recovery failed\n");
>  		intel_gpu_reset(dev_priv, ALL_ENGINES);
>  		goto error;
> @@ -1883,23 +1883,40 @@ int i915_reset_engine(struct intel_engine_cs *engine)
>  	int ret;
>  	struct drm_i915_private *dev_priv = engine->i915;
>  	struct i915_gpu_error *error = &dev_priv->gpu_error;
> +	struct drm_i915_gem_request *active_request;
>  
>  	GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));
>  
>  	DRM_DEBUG_DRIVER("resetting %s\n", engine->name);
>  
> -	ret = i915_gem_reset_prepare_engine(engine);
> -	if (ret) {
> -		DRM_ERROR("Previous reset failed - promote to full reset\n");
> -		goto out;
> +	active_request = i915_gem_reset_prepare_engine(engine);
> +	if (!active_request) {
> +		DRM_DEBUG_DRIVER("seqno moved after hang declaration, pardoned\n");
> +		goto canceled;
> +	}
> +	if (IS_ERR(active_request)) {
> +		ret = PTR_ERR(active_request);
> +		if (ret == -ECANCELED) {
> +			DRM_DEBUG_DRIVER("no active request found, skip reset\n");
> +			goto canceled;

-ECANCELED is just NULL. Make it so.

> +		} else if (ret) {
> +			DRM_DEBUG_DRIVER("Previous reset failed, promote to full reset\n");
> +			goto out;
> +		}
>  	}
>  
> +	if (__i915_gem_request_completed(active_request, engine->hangcheck.seqno)) {

Hmm, this is very incorrect. (Do it correctly as part of
prepare_engine.)

> +		DRM_DEBUG_DRIVER("request completed, skip the reset\n");
> +		goto canceled;
> +	}

This is part of the pardon check above. My comment was if we store the
active_request under engine->hangcheck for the global reset case, then
we would need to double check it wasn't completed after the delays of
setting up the global reset. Here, the check that the seqno is still
active is sufficient.

> +
> +
>  	/*
> -	 * the request that caused the hang is stuck on elsp, identify the
> -	 * active request and drop it, adjust head to skip the offending
> +	 * the request that caused the hang is stuck on elsp, we know the
> +	 * active request and can drop it, adjust head to skip the offending
>  	 * request to resume executing remaining requests in the queue.
>  	 */
> -	i915_gem_reset_engine(engine);
> +	i915_gem_reset_engine(engine, active_request);
>  
>  	/* forcing engine to idle */
>  	ret = intel_reset_engine_start(engine);
> @@ -1928,6 +1945,10 @@ int i915_reset_engine(struct intel_engine_cs *engine)
>  
>  out:
>  	return ret;
> +
> +canceled:
> +	i915_gem_reset_finish_engine(engine);
> +	return 0;
>  }
>  
>  static int i915_pm_suspend(struct device *kdev)
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index efbf34318893..8e93189c2104 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -3439,7 +3439,8 @@ static inline u32 i915_reset_count(struct i915_gpu_error *error)
>  	return READ_ONCE(error->reset_count);
>  }
>  
> -int i915_gem_reset_prepare_engine(struct intel_engine_cs *engine);
> +struct drm_i915_gem_request *
> +i915_gem_reset_prepare_engine(struct intel_engine_cs *engine);
>  int i915_gem_reset_prepare(struct drm_i915_private *dev_priv,
>  			   unsigned int engine_mask);
>  void i915_gem_reset(struct drm_i915_private *dev_priv);
> @@ -3448,7 +3449,8 @@ void i915_gem_reset_finish(struct drm_i915_private *dev_priv,
>  			   unsigned int engine_mask);
>  void i915_gem_set_wedged(struct drm_i915_private *dev_priv);
>  bool i915_gem_unset_wedged(struct drm_i915_private *dev_priv);
> -void i915_gem_reset_engine(struct intel_engine_cs *engine);
> +void i915_gem_reset_engine(struct intel_engine_cs *engine,
> +			   struct drm_i915_gem_request *request);
>  
>  void i915_gem_init_mmio(struct drm_i915_private *i915);
>  int __must_check i915_gem_init(struct drm_i915_private *dev_priv);
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index bce38062f94e..4e357d333cc2 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2793,12 +2793,15 @@ static bool engine_stalled(struct intel_engine_cs *engine)
>  	return true;
>  }
>  
> -/* Ensure irq handler finishes, and not run again. */
> -int i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
> +/*
> + * Ensure irq handler finishes, and not run again.
> + * For reset-engine we also store the active request so that we only search
> + * for it once.
> + */
> +struct drm_i915_gem_request *
> +i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
>  {
> -	struct drm_i915_gem_request *request;
> -	int err = 0;
> -
> +	struct drm_i915_gem_request *request = NULL;
>  
>  	/* Prevent the signaler thread from updating the request
>  	 * state (by calling dma_fence_signal) as we are processing
> @@ -2827,22 +2830,29 @@ int i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
>  
>  	if (engine_stalled(engine)) {
>  		request = i915_gem_find_active_request(engine);
> +		if (!request)
> +			return ERR_PTR(-ECANCELED); /* Can't find a request, abort! */
> +
>  		if (request && request->fence.error == -EIO)
> -			err = -EIO; /* Previous reset failed! */
> +			return ERR_PTR(-EIO); /* Previous reset failed! */
>  	}
>  
> -	return err;
> +	return request;
>  }
>  
>  int i915_gem_reset_prepare(struct drm_i915_private *dev_priv,
>  			   unsigned int engine_mask)
>  {
>  	struct intel_engine_cs *engine;
> +	struct drm_i915_gem_request *request;
>  	unsigned int tmp;
>  	int err = 0;
>  
> -	for_each_engine_masked(engine, dev_priv, engine_mask, tmp)
> -		err = i915_gem_reset_prepare_engine(engine);
> +	for_each_engine_masked(engine, dev_priv, engine_mask, tmp) {
> +		request = i915_gem_reset_prepare_engine(engine);
> +		if (request && IS_ERR(request))

Can just be IS_ERR(). And this doesn't want to report -ECANCELED!

> +			err = PTR_ERR(request);
> +	}
>  
>  	i915_gem_revoke_fences(dev_priv);
>  
> @@ -2929,11 +2939,12 @@ static bool i915_gem_reset_request(struct drm_i915_gem_request *request)
>  	return guilty;
>  }
>  
> -void i915_gem_reset_engine(struct intel_engine_cs *engine)
> +void i915_gem_reset_engine(struct intel_engine_cs *engine,
> +			   struct drm_i915_gem_request *request)
>  {
> -	struct drm_i915_gem_request *request;
> +	if (!request)
> +		request = i915_gem_find_active_request(engine);
>  
> -	request = i915_gem_find_active_request(engine);
>  	if (request && i915_gem_reset_request(request)) {
>  		DRM_DEBUG_DRIVER("resetting %s to restart from tail of request 0x%x\n",
>  				 engine->name, request->global_seqno);
> @@ -2959,7 +2970,7 @@ void i915_gem_reset(struct drm_i915_private *dev_priv)
>  	for_each_engine(engine, dev_priv, id) {
>  		struct i915_gem_context *ctx;
>  
> -		i915_gem_reset_engine(engine);
> +		i915_gem_reset_engine(engine, NULL);
>  		ctx = fetch_and_zero(&engine->last_retired_context);
>  		if (ctx)
>  			engine->context_unpin(engine, ctx);
> -- 
> 2.11.0
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx