drm/i915: Fix up -EIO ABI per igt/gem_eio

Submitted by Daniel Vetter on Nov. 26, 2015, 10:51 a.m.

Details

Message ID 1448535069-31876-1-git-send-email-daniel.vetter@ffwll.ch
State New
Headers show
Series "drm/i915: Fix up -EIO ABI per igt/gem_eio" ( rev: 2 ) in Intel GFX

Not browsing as part of any series.

Commit Message

Daniel Vetter Nov. 26, 2015, 10:51 a.m.
My apologies to Chris Wilson for being dense on this topic for
essentially for years.

This patch doesn't do any big redesign of the overall reset flow, but
instead just applies changes where it's needed to obey gem_eio. We can
redesign later on, but for now I prefer to make the testcase happy
with some minimally invasive changes:

- Ensure that set_domain_ioctl never returns -EIO. Tricky part there
  is that we might race with the reset handler when doing the lockless
  wait.

- Make sure debugfs/i915_wedged is actually useful to reset a wedged
  gpu - atm it bails out with -EAGAIN for a terminally wedged gpu.
  That's because reset_in_progress actually includes that terminal
  state, which is super-confusing (and most callers got this wrong).
  Fix this by changing the semantics of reset_in_progress to do what
  it says on the tin (and fixup all other callers).

  Also make sure that reset_in_progress is cleared when we reach the
  terminal "wedged" state. Without this we kept returning -EAGAIN in
  some places forever.

- Second problem with debugfs/i915_wedge was that we never got out of
  the terminal wedged state. Hence manually set the reset counter out
  of that state before starting the hung gpu recovery.

  For safety also make sure that we are consintent with resetting the
  gpu between i915_reset_and_wakeup and i915_handler_error by just
  passing the boolean "wedged" around instead of trying to recompute
  it wrongly. This isn't an issue for gem_eio with the debugfs fix,
  but just general robustness of the code.

- Finally make sure that when gpu reset is disabled through the module
  paramter the kernel doesn't generate dmesg noise that would upset
  our CI/piglit. Simplest solution for that was to lift the i915.reset
  check up into i915_reset().

With all these changes, plus the igt fixes I've posted, gem_eio is now
happy on many snb.

v2: Don't upset lockdep with my set_domain_ioctl changes.

Cc: Chris Wilson <chris@chris-wilson.co.uk>
Testcase: igt/gem_eio/*
Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c |  4 ++++
 drivers/gpu/drm/i915/i915_drv.c     |  6 ++++++
 drivers/gpu/drm/i915/i915_drv.h     |  2 +-
 drivers/gpu/drm/i915/i915_gem.c     | 23 +++++++++++------------
 drivers/gpu/drm/i915/i915_irq.c     | 11 ++++++-----
 drivers/gpu/drm/i915/intel_uncore.c |  3 ---
 6 files changed, 28 insertions(+), 21 deletions(-)

Patch hide | download patch | download mbox

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 411a9c68b4ee..c4006c09ef1b 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -4744,6 +4744,10 @@  i915_wedged_set(void *data, u64 val)
 	if (i915_reset_in_progress(&dev_priv->gpu_error))
 		return -EAGAIN;
 
+	/* Already wedged, force us out of this terminal state. */
+	if (i915_terminally_wedged(&dev_priv->gpu_error))
+		atomic_set(&dev_priv->gpu_error.reset_counter, 0);
+
 	intel_runtime_pm_get(dev_priv);
 
 	i915_handle_error(dev, val,
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index ec1e877c4a36..1f5386bb78f4 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -909,6 +909,12 @@  int i915_reset(struct drm_device *dev)
 
 	simulated = dev_priv->gpu_error.stop_rings != 0;
 
+	if (!i915.reset) {
+		DRM_INFO("GPU reset disabled in module option, not resetting\n");
+		mutex_unlock(&dev->struct_mutex);
+		return -ENODEV;
+	}
+
 	ret = intel_gpu_reset(dev);
 
 	/* Also reset the gpu hangman. */
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index a47e0f4fab56..8876b4891d56 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2939,7 +2939,7 @@  int __must_check i915_gem_check_wedge(struct i915_gpu_error *error,
 static inline bool i915_reset_in_progress(struct i915_gpu_error *error)
 {
 	return unlikely(atomic_read(&error->reset_counter)
-			& (I915_RESET_IN_PROGRESS_FLAG | I915_WEDGED));
+			& I915_RESET_IN_PROGRESS_FLAG);
 }
 
 static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index f10a5d57377e..64c63409b9d0 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -85,8 +85,7 @@  i915_gem_wait_for_error(struct i915_gpu_error *error)
 {
 	int ret;
 
-#define EXIT_COND (!i915_reset_in_progress(error) || \
-		   i915_terminally_wedged(error))
+#define EXIT_COND (!i915_reset_in_progress(error))
 	if (EXIT_COND)
 		return 0;
 
@@ -1113,16 +1112,16 @@  int
 i915_gem_check_wedge(struct i915_gpu_error *error,
 		     bool interruptible)
 {
+	/* Recovery complete, but the reset failed ... */
+	if (i915_terminally_wedged(error))
+		return -EIO;
+
 	if (i915_reset_in_progress(error)) {
 		/* Non-interruptible callers can't handle -EAGAIN, hence return
 		 * -EIO unconditionally for these. */
 		if (!interruptible)
 			return -EIO;
 
-		/* Recovery complete, but the reset failed ... */
-		if (i915_terminally_wedged(error))
-			return -EIO;
-
 		/*
 		 * Check if GPU Reset is in progress - we need intel_ring_begin
 		 * to work properly to reinit the hw state while the gpu is
@@ -1239,11 +1238,7 @@  int __i915_wait_request(struct drm_i915_gem_request *req,
 		/* We need to check whether any gpu reset happened in between
 		 * the caller grabbing the seqno and now ... */
 		if (reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter)) {
-			/* ... but upgrade the -EAGAIN to an -EIO if the gpu
-			 * is truely gone. */
-			ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible);
-			if (ret == 0)
-				ret = -EAGAIN;
+			ret = -EAGAIN;
 			break;
 		}
 
@@ -1577,7 +1572,7 @@  i915_gem_set_domain_ioctl(struct drm_device *dev, void *data,
 
 	ret = i915_mutex_lock_interruptible(dev);
 	if (ret)
-		return ret;
+		goto out;
 
 	obj = to_intel_bo(drm_gem_object_lookup(dev, file, args->handle));
 	if (&obj->base == NULL) {
@@ -1609,6 +1604,10 @@  unref:
 	drm_gem_object_unreference(&obj->base);
 unlock:
 	mutex_unlock(&dev->struct_mutex);
+out:
+	/* ABI: set_domain_ioctl must not fail even when the gpu is wedged. */
+	WARN_ON(ret == -EIO);
+
 	return ret;
 }
 
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index c8ba94968aaf..dbbc6159584b 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2401,7 +2401,8 @@  static void i915_error_wake_up(struct drm_i915_private *dev_priv,
  * Fire an error uevent so userspace can see that a hang or error
  * was detected.
  */
-static void i915_reset_and_wakeup(struct drm_device *dev)
+static void i915_reset_and_wakeup(struct drm_device *dev,
+				  bool wedged)
 {
 	struct drm_i915_private *dev_priv = to_i915(dev);
 	struct i915_gpu_error *error = &dev_priv->gpu_error;
@@ -2422,7 +2423,7 @@  static void i915_reset_and_wakeup(struct drm_device *dev)
 	 * the reset in-progress bit is only ever set by code outside of this
 	 * work we don't need to worry about any other races.
 	 */
-	if (i915_reset_in_progress(error) && !i915_terminally_wedged(error)) {
+	if (wedged) {
 		DRM_DEBUG_DRIVER("resetting chip\n");
 		kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE,
 				   reset_event);
@@ -2432,7 +2433,7 @@  static void i915_reset_and_wakeup(struct drm_device *dev)
 		 * reference held, for example because there is a pending GPU
 		 * request that won't finish until the reset is done. This
 		 * isn't the case at least when we get here by doing a
-		 * simulated reset via debugs, so get an RPM reference.
+		 * simulated reset via debugfs, so get an RPM reference.
 		 */
 		intel_runtime_pm_get(dev_priv);
 
@@ -2467,7 +2468,7 @@  static void i915_reset_and_wakeup(struct drm_device *dev)
 			kobject_uevent_env(&dev->primary->kdev->kobj,
 					   KOBJ_CHANGE, reset_done_event);
 		} else {
-			atomic_or(I915_WEDGED, &error->reset_counter);
+			atomic_set(&error->reset_counter, I915_WEDGED);
 		}
 
 		/*
@@ -2614,7 +2615,7 @@  void i915_handle_error(struct drm_device *dev, bool wedged,
 		i915_error_wake_up(dev_priv, false);
 	}
 
-	i915_reset_and_wakeup(dev);
+	i915_reset_and_wakeup(dev, wedged);
 }
 
 /* Called from drm generic code, passed 'crtc' which
diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
index c2358ba78b30..4c5ae1154dd1 100644
--- a/drivers/gpu/drm/i915/intel_uncore.c
+++ b/drivers/gpu/drm/i915/intel_uncore.c
@@ -1543,9 +1543,6 @@  not_ready:
 
 static int (*intel_get_gpu_reset(struct drm_device *dev))(struct drm_device *)
 {
-	if (!i915.reset)
-		return NULL;
-
 	if (INTEL_INFO(dev)->gen >= 8)
 		return gen8_do_reset;
 	else if (INTEL_INFO(dev)->gen >= 6)

Comments

On Thu, Nov 26, 2015 at 11:51:09AM +0100, Daniel Vetter wrote:
> My apologies to Chris Wilson for being dense on this topic for
> essentially for years.
> 
> This patch doesn't do any big redesign of the overall reset flow, but
> instead just applies changes where it's needed to obey gem_eio. We can
> redesign later on, but for now I prefer to make the testcase happy
> with some minimally invasive changes:
> 
> - Ensure that set_domain_ioctl never returns -EIO. Tricky part there
>   is that we might race with the reset handler when doing the lockless
>   wait.
> 
> - Make sure debugfs/i915_wedged is actually useful to reset a wedged
>   gpu - atm it bails out with -EAGAIN for a terminally wedged gpu.
>   That's because reset_in_progress actually includes that terminal
>   state, which is super-confusing (and most callers got this wrong).
>   Fix this by changing the semantics of reset_in_progress to do what
>   it says on the tin (and fixup all other callers).
> 
>   Also make sure that reset_in_progress is cleared when we reach the
>   terminal "wedged" state. Without this we kept returning -EAGAIN in
>   some places forever.
> 
> - Second problem with debugfs/i915_wedge was that we never got out of
>   the terminal wedged state. Hence manually set the reset counter out
>   of that state before starting the hung gpu recovery.
> 
>   For safety also make sure that we are consintent with resetting the
>   gpu between i915_reset_and_wakeup and i915_handler_error by just
>   passing the boolean "wedged" around instead of trying to recompute
>   it wrongly. This isn't an issue for gem_eio with the debugfs fix,
>   but just general robustness of the code.
> 
> - Finally make sure that when gpu reset is disabled through the module
>   paramter the kernel doesn't generate dmesg noise that would upset
>   our CI/piglit. Simplest solution for that was to lift the i915.reset
>   check up into i915_reset().
> 
> With all these changes, plus the igt fixes I've posted, gem_eio is now
> happy on many snb.
> 
> v2: Don't upset lockdep with my set_domain_ioctl changes.

Blergh, forgotten to update the commit message:

v3: Don't special case set_domain_ioctl, instead curb all -EIO at the
source in wait_request, like in Chris' patch. That means anyone waiting
for a request will not notice the -EIO and fall over due to that. We
definitely want that for modeset code.

> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Testcase: igt/gem_eio/*
> Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_debugfs.c |  4 ++++
>  drivers/gpu/drm/i915/i915_drv.c     |  6 ++++++
>  drivers/gpu/drm/i915/i915_drv.h     |  2 +-
>  drivers/gpu/drm/i915/i915_gem.c     | 23 +++++++++++------------
>  drivers/gpu/drm/i915/i915_irq.c     | 11 ++++++-----
>  drivers/gpu/drm/i915/intel_uncore.c |  3 ---
>  6 files changed, 28 insertions(+), 21 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 411a9c68b4ee..c4006c09ef1b 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -4744,6 +4744,10 @@ i915_wedged_set(void *data, u64 val)
>  	if (i915_reset_in_progress(&dev_priv->gpu_error))
>  		return -EAGAIN;
>  
> +	/* Already wedged, force us out of this terminal state. */
> +	if (i915_terminally_wedged(&dev_priv->gpu_error))
> +		atomic_set(&dev_priv->gpu_error.reset_counter, 0);
> +
>  	intel_runtime_pm_get(dev_priv);
>  
>  	i915_handle_error(dev, val,
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index ec1e877c4a36..1f5386bb78f4 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -909,6 +909,12 @@ int i915_reset(struct drm_device *dev)
>  
>  	simulated = dev_priv->gpu_error.stop_rings != 0;
>  
> +	if (!i915.reset) {
> +		DRM_INFO("GPU reset disabled in module option, not resetting\n");
> +		mutex_unlock(&dev->struct_mutex);
> +		return -ENODEV;
> +	}
> +
>  	ret = intel_gpu_reset(dev);
>  
>  	/* Also reset the gpu hangman. */
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index a47e0f4fab56..8876b4891d56 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2939,7 +2939,7 @@ int __must_check i915_gem_check_wedge(struct i915_gpu_error *error,
>  static inline bool i915_reset_in_progress(struct i915_gpu_error *error)
>  {
>  	return unlikely(atomic_read(&error->reset_counter)
> -			& (I915_RESET_IN_PROGRESS_FLAG | I915_WEDGED));
> +			& I915_RESET_IN_PROGRESS_FLAG);
>  }
>  
>  static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index f10a5d57377e..64c63409b9d0 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -85,8 +85,7 @@ i915_gem_wait_for_error(struct i915_gpu_error *error)
>  {
>  	int ret;
>  
> -#define EXIT_COND (!i915_reset_in_progress(error) || \
> -		   i915_terminally_wedged(error))
> +#define EXIT_COND (!i915_reset_in_progress(error))
>  	if (EXIT_COND)
>  		return 0;
>  
> @@ -1113,16 +1112,16 @@ int
>  i915_gem_check_wedge(struct i915_gpu_error *error,
>  		     bool interruptible)
>  {
> +	/* Recovery complete, but the reset failed ... */
> +	if (i915_terminally_wedged(error))
> +		return -EIO;
> +
>  	if (i915_reset_in_progress(error)) {
>  		/* Non-interruptible callers can't handle -EAGAIN, hence return
>  		 * -EIO unconditionally for these. */
>  		if (!interruptible)
>  			return -EIO;
>  
> -		/* Recovery complete, but the reset failed ... */
> -		if (i915_terminally_wedged(error))
> -			return -EIO;
> -
>  		/*
>  		 * Check if GPU Reset is in progress - we need intel_ring_begin
>  		 * to work properly to reinit the hw state while the gpu is
> @@ -1239,11 +1238,7 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
>  		/* We need to check whether any gpu reset happened in between
>  		 * the caller grabbing the seqno and now ... */
>  		if (reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter)) {
> -			/* ... but upgrade the -EAGAIN to an -EIO if the gpu
> -			 * is truely gone. */
> -			ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible);
> -			if (ret == 0)
> -				ret = -EAGAIN;
> +			ret = -EAGAIN;
>  			break;
>  		}
>  
> @@ -1577,7 +1572,7 @@ i915_gem_set_domain_ioctl(struct drm_device *dev, void *data,
>  
>  	ret = i915_mutex_lock_interruptible(dev);
>  	if (ret)
> -		return ret;
> +		goto out;
>  
>  	obj = to_intel_bo(drm_gem_object_lookup(dev, file, args->handle));
>  	if (&obj->base == NULL) {
> @@ -1609,6 +1604,10 @@ unref:
>  	drm_gem_object_unreference(&obj->base);
>  unlock:
>  	mutex_unlock(&dev->struct_mutex);
> +out:
> +	/* ABI: set_domain_ioctl must not fail even when the gpu is wedged. */
> +	WARN_ON(ret == -EIO);
> +
>  	return ret;
>  }
>  
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index c8ba94968aaf..dbbc6159584b 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2401,7 +2401,8 @@ static void i915_error_wake_up(struct drm_i915_private *dev_priv,
>   * Fire an error uevent so userspace can see that a hang or error
>   * was detected.
>   */
> -static void i915_reset_and_wakeup(struct drm_device *dev)
> +static void i915_reset_and_wakeup(struct drm_device *dev,
> +				  bool wedged)
>  {
>  	struct drm_i915_private *dev_priv = to_i915(dev);
>  	struct i915_gpu_error *error = &dev_priv->gpu_error;
> @@ -2422,7 +2423,7 @@ static void i915_reset_and_wakeup(struct drm_device *dev)
>  	 * the reset in-progress bit is only ever set by code outside of this
>  	 * work we don't need to worry about any other races.
>  	 */
> -	if (i915_reset_in_progress(error) && !i915_terminally_wedged(error)) {
> +	if (wedged) {
>  		DRM_DEBUG_DRIVER("resetting chip\n");
>  		kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE,
>  				   reset_event);
> @@ -2432,7 +2433,7 @@ static void i915_reset_and_wakeup(struct drm_device *dev)
>  		 * reference held, for example because there is a pending GPU
>  		 * request that won't finish until the reset is done. This
>  		 * isn't the case at least when we get here by doing a
> -		 * simulated reset via debugs, so get an RPM reference.
> +		 * simulated reset via debugfs, so get an RPM reference.
>  		 */
>  		intel_runtime_pm_get(dev_priv);
>  
> @@ -2467,7 +2468,7 @@ static void i915_reset_and_wakeup(struct drm_device *dev)
>  			kobject_uevent_env(&dev->primary->kdev->kobj,
>  					   KOBJ_CHANGE, reset_done_event);
>  		} else {
> -			atomic_or(I915_WEDGED, &error->reset_counter);
> +			atomic_set(&error->reset_counter, I915_WEDGED);
>  		}
>  
>  		/*
> @@ -2614,7 +2615,7 @@ void i915_handle_error(struct drm_device *dev, bool wedged,
>  		i915_error_wake_up(dev_priv, false);
>  	}
>  
> -	i915_reset_and_wakeup(dev);
> +	i915_reset_and_wakeup(dev, wedged);
>  }
>  
>  /* Called from drm generic code, passed 'crtc' which
> diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
> index c2358ba78b30..4c5ae1154dd1 100644
> --- a/drivers/gpu/drm/i915/intel_uncore.c
> +++ b/drivers/gpu/drm/i915/intel_uncore.c
> @@ -1543,9 +1543,6 @@ not_ready:
>  
>  static int (*intel_get_gpu_reset(struct drm_device *dev))(struct drm_device *)
>  {
> -	if (!i915.reset)
> -		return NULL;
> -
>  	if (INTEL_INFO(dev)->gen >= 8)
>  		return gen8_do_reset;
>  	else if (INTEL_INFO(dev)->gen >= 6)
> -- 
> 2.1.0
>