drm/i915: Don't ignore STOP_RING failures during reset

Submitted by Janusz Krzysztofik on Sept. 9, 2019, 4:11 p.m.

Details

Message ID 20190909161152.14804-1-janusz.krzysztofik@linux.intel.com
State New
Headers show
Series "drm/i915: Don't ignore STOP_RING failures during reset" ( rev: 1 ) in Intel GFX - Try Bot

Not browsing as part of any series.

Commit Message

Janusz Krzysztofik Sept. 9, 2019, 4:11 p.m.
Otherwise kernel panic may occur when process_csb() calls trace_ports()
on an unsuccessfully reset and still non-idle engine.

Signed-off-by: Janusz Krzysztofik <janusz.krzysztofik@linux.intel.com>
---
 drivers/gpu/drm/i915/gt/intel_engine_types.h  |  2 +-
 drivers/gpu/drm/i915/gt/intel_lrc.c           |  4 +-
 drivers/gpu/drm/i915/gt/intel_reset.c         | 37 +++++++++++--------
 drivers/gpu/drm/i915/gt/intel_ringbuffer.c    | 14 +++++--
 drivers/gpu/drm/i915/gt/mock_engine.c         |  3 +-
 drivers/gpu/drm/i915/gt/selftest_reset.c      |  5 ++-
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c |  4 +-
 7 files changed, 45 insertions(+), 24 deletions(-)

Patch hide | download patch | download mbox

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
index 943f0663837e..e488813275dc 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
@@ -407,7 +407,7 @@  struct intel_engine_cs {
 	int		(*resume)(struct intel_engine_cs *engine);
 
 	struct {
-		void (*prepare)(struct intel_engine_cs *engine);
+		int (*prepare)(struct intel_engine_cs *engine);
 		void (*reset)(struct intel_engine_cs *engine, bool stalled);
 		void (*finish)(struct intel_engine_cs *engine);
 	} reset;
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index 0ddfbebbcbbc..c8fae411e0c2 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -2287,7 +2287,7 @@  static int execlists_resume(struct intel_engine_cs *engine)
 	return 0;
 }
 
-static void execlists_reset_prepare(struct intel_engine_cs *engine)
+static int execlists_reset_prepare(struct intel_engine_cs *engine)
 {
 	struct intel_engine_execlists * const execlists = &engine->execlists;
 	unsigned long flags;
@@ -2323,7 +2323,7 @@  static void execlists_reset_prepare(struct intel_engine_cs *engine)
 	 *
 	 * FIXME: Wa for more modern gens needs to be validated
 	 */
-	intel_engine_stop_cs(engine);
+	return intel_engine_stop_cs(engine);
 }
 
 static void reset_csb_pointers(struct intel_engine_cs *engine)
diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c
index b9d84d52e986..387fe30c110b 100644
--- a/drivers/gpu/drm/i915/gt/intel_reset.c
+++ b/drivers/gpu/drm/i915/gt/intel_reset.c
@@ -609,7 +609,7 @@  int intel_reset_guc(struct intel_gt *gt)
  * Ensure irq handler finishes, and not run again.
  * Also return the active request so that we only search for it once.
  */
-static void reset_prepare_engine(struct intel_engine_cs *engine)
+static int reset_prepare_engine(struct intel_engine_cs *engine)
 {
 	/*
 	 * During the reset sequence, we must prevent the engine from
@@ -619,7 +619,7 @@  static void reset_prepare_engine(struct intel_engine_cs *engine)
 	 * GPU state upon resume, i.e. fail to restart after a reset.
 	 */
 	intel_uncore_forcewake_get(engine->uncore, FORCEWAKE_ALL);
-	engine->reset.prepare(engine);
+	return engine->reset.prepare(engine);
 }
 
 static void revoke_mmaps(struct intel_gt *gt)
@@ -648,21 +648,25 @@  static void revoke_mmaps(struct intel_gt *gt)
 	}
 }
 
-static intel_engine_mask_t reset_prepare(struct intel_gt *gt)
+static int reset_prepare(struct intel_gt *gt, intel_engine_mask_t *awake)
 {
 	struct intel_engine_cs *engine;
-	intel_engine_mask_t awake = 0;
 	enum intel_engine_id id;
+	int ret, err = 0;
+
+	*awake = 0;
 
 	for_each_engine(engine, gt->i915, id) {
 		if (intel_engine_pm_get_if_awake(engine))
-			awake |= engine->mask;
-		reset_prepare_engine(engine);
+			*awake |= engine->mask;
+		ret = reset_prepare_engine(engine);
+		if (ret && !err)
+			err = ret;
 	}
 
 	intel_uc_reset_prepare(&gt->uc);
 
-	return awake;
+	return err;
 }
 
 static void gt_revoke(struct intel_gt *gt)
@@ -752,7 +756,7 @@  static void __intel_gt_set_wedged(struct intel_gt *gt)
 	 * rolling the global seqno forward (since this would complete requests
 	 * for which we haven't set the fence error to EIO yet).
 	 */
-	awake = reset_prepare(gt);
+	reset_prepare(gt, &awake);
 
 	/* Even if the GPU reset fails, it should still stop the engines */
 	if (!INTEL_INFO(gt->i915)->gpu_reset_clobbers_display)
@@ -932,16 +936,18 @@  void intel_gt_reset(struct intel_gt *gt,
 	GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &gt->reset.flags));
 	mutex_lock(&gt->reset.mutex);
 
-	/* Clear any previous failed attempts at recovery. Time to try again. */
-	if (!__intel_gt_unset_wedged(gt))
-		goto unlock;
-
 	if (reason)
 		dev_notice(gt->i915->drm.dev,
 			   "Resetting chip for %s\n", reason);
 	atomic_inc(&gt->i915->gpu_error.reset_count);
 
-	awake = reset_prepare(gt);
+	ret = reset_prepare(gt, &awake);
+	if (ret)
+		goto finish;
+
+	/* Clear any previous failed attempts at recovery. Time to try again. */
+	if (!__intel_gt_unset_wedged(gt))
+		goto finish;
 
 	if (!intel_has_gpu_reset(gt->i915)) {
 		if (i915_modparams.reset)
@@ -987,7 +993,6 @@  void intel_gt_reset(struct intel_gt *gt,
 
 finish:
 	reset_finish(gt, awake);
-unlock:
 	mutex_unlock(&gt->reset.mutex);
 	return;
 
@@ -1039,7 +1044,9 @@  int intel_engine_reset(struct intel_engine_cs *engine, const char *msg)
 	if (!intel_engine_pm_get_if_awake(engine))
 		return 0;
 
-	reset_prepare_engine(engine);
+	ret = reset_prepare_engine(engine);
+	if (ret)
+		goto out;
 
 	if (msg)
 		dev_notice(engine->i915->drm.dev,
diff --git a/drivers/gpu/drm/i915/gt/intel_ringbuffer.c b/drivers/gpu/drm/i915/gt/intel_ringbuffer.c
index ac55a0d054bd..1c1d4dbbed78 100644
--- a/drivers/gpu/drm/i915/gt/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/gt/intel_ringbuffer.c
@@ -736,10 +736,11 @@  static int xcs_resume(struct intel_engine_cs *engine)
 	return ret;
 }
 
-static void reset_prepare(struct intel_engine_cs *engine)
+static int reset_prepare(struct intel_engine_cs *engine)
 {
 	struct intel_uncore *uncore = engine->uncore;
 	const u32 base = engine->mmio_base;
+	int err;
 
 	/*
 	 * We stop engines, otherwise we might get failed reset and a
@@ -757,8 +758,11 @@  static void reset_prepare(struct intel_engine_cs *engine)
 	 */
 	GEM_TRACE("%s\n", engine->name);
 
-	if (intel_engine_stop_cs(engine))
+	err = intel_engine_stop_cs(engine);
+	if (err) {
 		GEM_TRACE("%s: timed out on STOP_RING\n", engine->name);
+		return err;
+	}
 
 	intel_uncore_write_fw(uncore,
 			      RING_HEAD(base),
@@ -773,10 +777,14 @@  static void reset_prepare(struct intel_engine_cs *engine)
 	intel_uncore_write_fw(uncore, RING_CTL(base), 0);
 
 	/* Check acts as a post */
-	if (intel_uncore_read_fw(uncore, RING_HEAD(base)))
+	if (intel_uncore_read_fw(uncore, RING_HEAD(base))) {
 		GEM_TRACE("%s: ring head [%x] not parked\n",
 			  engine->name,
 			  intel_uncore_read_fw(uncore, RING_HEAD(base)));
+		err = -EIO;
+	}
+
+	return err;
 }
 
 static void reset_ring(struct intel_engine_cs *engine, bool stalled)
diff --git a/drivers/gpu/drm/i915/gt/mock_engine.c b/drivers/gpu/drm/i915/gt/mock_engine.c
index 5d43cbc3f345..a04e69c81fb6 100644
--- a/drivers/gpu/drm/i915/gt/mock_engine.c
+++ b/drivers/gpu/drm/i915/gt/mock_engine.c
@@ -202,8 +202,9 @@  static void mock_submit_request(struct i915_request *request)
 	spin_unlock_irqrestore(&engine->hw_lock, flags);
 }
 
-static void mock_reset_prepare(struct intel_engine_cs *engine)
+static int mock_reset_prepare(struct intel_engine_cs *engine)
 {
+	return 0;
 }
 
 static void mock_reset(struct intel_engine_cs *engine, bool stalled)
diff --git a/drivers/gpu/drm/i915/gt/selftest_reset.c b/drivers/gpu/drm/i915/gt/selftest_reset.c
index 00a4f60cdfd5..fefdececd680 100644
--- a/drivers/gpu/drm/i915/gt/selftest_reset.c
+++ b/drivers/gpu/drm/i915/gt/selftest_reset.c
@@ -78,7 +78,10 @@  static int igt_atomic_reset(void *arg)
 
 		GEM_TRACE("__intel_gt_reset under %s\n", p->name);
 
-		awake = reset_prepare(gt);
+		err = reset_prepare(gt, &awake);
+		if (err)
+			goto unlock;
+
 		p->critical_section_begin();
 
 		err = __intel_gt_reset(gt, ALL_ENGINES);
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index f325d3dd564f..0ed8e89774f7 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -620,7 +620,7 @@  static void guc_submission_tasklet(unsigned long data)
 	spin_unlock_irqrestore(&engine->active.lock, flags);
 }
 
-static void guc_reset_prepare(struct intel_engine_cs *engine)
+static int guc_reset_prepare(struct intel_engine_cs *engine)
 {
 	struct intel_engine_execlists * const execlists = &engine->execlists;
 
@@ -636,6 +636,8 @@  static void guc_reset_prepare(struct intel_engine_cs *engine)
 	 * prevents the race.
 	 */
 	__tasklet_disable_sync_once(&execlists->tasklet);
+
+	return 0;
 }
 
 static void