drm/i915/gt: Only unwedge if we can reset first

Submitted by Chris Wilson on Sept. 13, 2019, 7:47 a.m.

Details

Message ID 20190913074720.7718-1-chris@chris-wilson.co.uk
State New
Headers show
Series "drm/i915/gt: Only unwedge if we can reset first" ( rev: 1 ) in Intel GFX

Not browsing as part of any series.

Commit Message

Chris Wilson Sept. 13, 2019, 7:47 a.m.
Unwedging the GPU requires a successful GPU reset before we restore the
default submission, or else we may see residual context switch events
that we were not expecting.

v2: Pull in the special-case reset_clobbers_display, and explain why it
should be safe in the context of unwedging.

Reported-by: Janusz Krzysztofik <janusz.krzysztofik@linux.intel.com>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Janusz Krzysztofik <janusz.krzysztofik@linux.intel.com>
Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
Reviewed-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_reset.c | 30 ++++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

Patch hide | download patch | download mbox

diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c
index ee52947eb31d..d3b1cdafd4c2 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -7,6 +7,7 @@ 
 #include <linux/sched/mm.h>
 #include <linux/stop_machine.h>
 
+#include "display/intel_display.h"
 #include "display/intel_display_types.h"
 #include "display/intel_overlay.h"
 
@@ -729,6 +730,28 @@  static void nop_submit_request(struct i915_request *request)
 	intel_engine_queue_breadcrumbs(engine);
 }
 
+static bool reset_clobbers_display(struct drm_i915_private *i915)
+{
+	struct intel_crtc *crtc;
+
+	if (!INTEL_INFO(i915)->gpu_reset_clobbers_display)
+		return false;
+
+	/*
+	 * While this appears racy, we should only be inspecting the display
+	 * state at runtime from inside a GPU reset, which will be serialized
+	 * with modesets on affected machines. For a full device reset,
+	 * we should already have cleared the active CRTC state in
+	 * intel_prepare_reset().
+	 */
+	for_each_intel_crtc(&i915->drm, crtc) {
+		if (crtc->active)
+			return true;
+	}
+
+	return false;
+}
+
 static void __intel_gt_set_wedged(struct intel_gt *gt)
 {
 	struct intel_engine_cs *engine;
@@ -793,6 +816,7 @@  static bool __intel_gt_unset_wedged(struct intel_gt *gt)
 	struct intel_gt_timelines *timelines = &gt->timelines;
 	struct intel_timeline *tl;
 	unsigned long flags;
+	bool ok;
 
 	if (!test_bit(I915_WEDGED, &gt->reset.flags))
 		return true;
@@ -838,7 +862,11 @@  static bool __intel_gt_unset_wedged(struct intel_gt *gt)
 	}
 	spin_unlock_irqrestore(&timelines->lock, flags);
 
-	intel_gt_sanitize(gt, false);
+	ok = false;
+	if (!reset_clobbers_display(gt->i915))
+		ok = __intel_gt_reset(gt, ALL_ENGINES) == 0;
+	if (!ok)
+		return false;
 
 	/*
 	 * Undo nop_submit_request. We prevent all new i915 requests from

Comments

On Fri, Sep 13, 2019 at 08:47:20AM +0100, Chris Wilson wrote:
> Unwedging the GPU requires a successful GPU reset before we restore the
> default submission, or else we may see residual context switch events
> that we were not expecting.
> 
> v2: Pull in the special-case reset_clobbers_display, and explain why it
> should be safe in the context of unwedging.
> 
> Reported-by: Janusz Krzysztofik <janusz.krzysztofik@linux.intel.com>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Janusz Krzysztofik <janusz.krzysztofik@linux.intel.com>
> Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
> Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
> Reviewed-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
> ---
>  drivers/gpu/drm/i915/gt/intel_reset.c | 30 ++++++++++++++++++++++++++-
>  1 file changed, 29 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c
> index ee52947eb31d..d3b1cdafd4c2 100644
> --- a/drivers/gpu/drm/i915/gt/intel_reset.c
> +++ b/drivers/gpu/drm/i915/gt/intel_reset.c
> @@ -7,6 +7,7 @@
>  #include <linux/sched/mm.h>
>  #include <linux/stop_machine.h>
>  
> +#include "display/intel_display.h"
>  #include "display/intel_display_types.h"
>  #include "display/intel_overlay.h"
>  
> @@ -729,6 +730,28 @@ static void nop_submit_request(struct i915_request *request)
>  	intel_engine_queue_breadcrumbs(engine);
>  }
>  
> +static bool reset_clobbers_display(struct drm_i915_private *i915)
> +{
> +	struct intel_crtc *crtc;
> +
> +	if (!INTEL_INFO(i915)->gpu_reset_clobbers_display)
> +		return false;
> +
> +	/*
> +	 * While this appears racy, we should only be inspecting the display
> +	 * state at runtime from inside a GPU reset, which will be serialized
> +	 * with modesets on affected machines. For a full device reset,
> +	 * we should already have cleared the active CRTC state in
> +	 * intel_prepare_reset().
> +	 */
> +	for_each_intel_crtc(&i915->drm, crtc) {
> +		if (crtc->active)
> +			return true;
> +	}
> +
> +	return false;
> +}
> +
>  static void __intel_gt_set_wedged(struct intel_gt *gt)
>  {
>  	struct intel_engine_cs *engine;
> @@ -793,6 +816,7 @@ static bool __intel_gt_unset_wedged(struct intel_gt *gt)
>  	struct intel_gt_timelines *timelines = &gt->timelines;
>  	struct intel_timeline *tl;
>  	unsigned long flags;
> +	bool ok;
>  
>  	if (!test_bit(I915_WEDGED, &gt->reset.flags))
>  		return true;
> @@ -838,7 +862,11 @@ static bool __intel_gt_unset_wedged(struct intel_gt *gt)
>  	}
>  	spin_unlock_irqrestore(&timelines->lock, flags);
>  
> -	intel_gt_sanitize(gt, false);
> +	ok = false;
> +	if (!reset_clobbers_display(gt->i915))
> +		ok = __intel_gt_reset(gt, ALL_ENGINES) == 0;

/me re-reading a lot of the reset code..

I'm rather confused about the whole reset flow. We now do a reset here,
but then we still do another one later on? Except for i915_gem_sanitize(),
which gets called during probe and resume so only does the single reset
I guess.

Hopefully we can't be marked as wedged during probe because I think
this gets called before crtc->active is populated so we'd just do the
reset anyway.

As for the resume cases, I think the display should be off already
when this gets called.

So I guess I'm not really sure what this check is meant to do for us.

> +	if (!ok)
> +		return false;
>  
>  	/*
>  	 * Undo nop_submit_request. We prevent all new i915 requests from
> -- 
> 2.23.0
Quoting Ville Syrjälä (2019-09-13 17:03:34)
> On Fri, Sep 13, 2019 at 08:47:20AM +0100, Chris Wilson wrote:
> > Unwedging the GPU requires a successful GPU reset before we restore the
> > default submission, or else we may see residual context switch events
> > that we were not expecting.
> > 
> > v2: Pull in the special-case reset_clobbers_display, and explain why it
> > should be safe in the context of unwedging.
> > 
> > Reported-by: Janusz Krzysztofik <janusz.krzysztofik@linux.intel.com>
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Janusz Krzysztofik <janusz.krzysztofik@linux.intel.com>
> > Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
> > Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
> > Reviewed-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
> > ---
> >  drivers/gpu/drm/i915/gt/intel_reset.c | 30 ++++++++++++++++++++++++++-
> >  1 file changed, 29 insertions(+), 1 deletion(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c
> > index ee52947eb31d..d3b1cdafd4c2 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_reset.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_reset.c
> > @@ -7,6 +7,7 @@
> >  #include <linux/sched/mm.h>
> >  #include <linux/stop_machine.h>
> >  
> > +#include "display/intel_display.h"
> >  #include "display/intel_display_types.h"
> >  #include "display/intel_overlay.h"
> >  
> > @@ -729,6 +730,28 @@ static void nop_submit_request(struct i915_request *request)
> >       intel_engine_queue_breadcrumbs(engine);
> >  }
> >  
> > +static bool reset_clobbers_display(struct drm_i915_private *i915)
> > +{
> > +     struct intel_crtc *crtc;
> > +
> > +     if (!INTEL_INFO(i915)->gpu_reset_clobbers_display)
> > +             return false;
> > +
> > +     /*
> > +      * While this appears racy, we should only be inspecting the display
> > +      * state at runtime from inside a GPU reset, which will be serialized
> > +      * with modesets on affected machines. For a full device reset,
> > +      * we should already have cleared the active CRTC state in
> > +      * intel_prepare_reset().
> > +      */
> > +     for_each_intel_crtc(&i915->drm, crtc) {
> > +             if (crtc->active)
> > +                     return true;
> > +     }
> > +
> > +     return false;
> > +}
> > +
> >  static void __intel_gt_set_wedged(struct intel_gt *gt)
> >  {
> >       struct intel_engine_cs *engine;
> > @@ -793,6 +816,7 @@ static bool __intel_gt_unset_wedged(struct intel_gt *gt)
> >       struct intel_gt_timelines *timelines = &gt->timelines;
> >       struct intel_timeline *tl;
> >       unsigned long flags;
> > +     bool ok;
> >  
> >       if (!test_bit(I915_WEDGED, &gt->reset.flags))
> >               return true;
> > @@ -838,7 +862,11 @@ static bool __intel_gt_unset_wedged(struct intel_gt *gt)
> >       }
> >       spin_unlock_irqrestore(&timelines->lock, flags);
> >  
> > -     intel_gt_sanitize(gt, false);
> > +     ok = false;
> > +     if (!reset_clobbers_display(gt->i915))
> > +             ok = __intel_gt_reset(gt, ALL_ENGINES) == 0;
> 
> /me re-reading a lot of the reset code..
> 
> I'm rather confused about the whole reset flow. We now do a reset here,
> but then we still do another one later on? Except for i915_gem_sanitize(),
> which gets called during probe and resume so only does the single reset
> I guess.

The idea was to keep wedging as a separate process (since that gives
nice symmetry because set-wedge unset-wedge). We can merge it so
that we only do the single reset, but we still should keep the request
flushing in place, or we need to play games with rcu barriers again.
 
> Hopefully we can't be marked as wedged during probe because I think
> this gets called before crtc->active is populated so we'd just do the
> reset anyway.

I was hoping the crtcs would be off at that point... They weren't.

> As for the resume cases, I think the display should be off already
> when this gets called.
> 
> So I guess I'm not really sure what this check is meant to do for us.

I gave up,

-       intel_gt_sanitize(gt, false);
+       /* We must reset pending GPU events before restoring our submission */
+       ok = !HAS_EXECLISTS(gt->i915);
+       if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
+               ok = __intel_gt_reset(gt, ALL_ENGINES) == 0;
+       if (!ok)
+               return false;

-Chris