drm/i915/lrc: Stop writing to ELSP until HW has processed the previous write

Submitted by Michel Thierry on Nov. 18, 2017, 12:30 a.m.

Details

Message ID 20171118003038.7935-1-michel.thierry@intel.com
State New
Headers show
Series "drm/i915/lrc: Stop writing to ELSP until HW has processed the previous write" ( rev: 1 ) in Intel GFX

Not browsing as part of any series.

Commit Message

Michel Thierry Nov. 18, 2017, 12:30 a.m.
The hardware needs some time to process the information received in the
ExecList Submission Port, and expects us to don't write anything new until
it has 'acknowledged' this new execlist by sending an IDLE_ACTIVE or
PREEMPTED CSB event.

If we do not follow this, the driver could write new data into the ELSP
before HW had finishing fetching the previous one, putting us in
'undefined behaviour' space.

This seems to be the problem causing the spurious PREEMPTED & COMPLETE
events after a COMPLETE like the one below:

[] vcs0: sw rd pointer = 2, hw wr pointer = 0, current 'head' = 3.
[] vcs0:  Execlist CSB[0]: 0x00000018 _ 0x00000007
[] vcs0:  Execlist CSB[1]: 0x00000001 _ 0x00000000
[] vcs0:  Execlist CSB[2]: 0x00000018 _ 0x00000007  <<< COMPLETE
[] vcs0:  Execlist CSB[3]: 0x00000012 _ 0x00000007  <<< PREEMPTED & COMPLETE
[] vcs0:  Execlist CSB[4]: 0x00008002 _ 0x00000006
[] vcs0:  Execlist CSB[5]: 0x00000014 _ 0x00000006

The ELSP writes that lead to this CSB sequence show that the HW hadn't
started executing the previous execlist (the one with only ctx 0x6) by the
time the new one was submitted; this is a bit more clear in the data
show in the EXECLIST_STATUS register at the time of the ELSP write.

[] vcs0: ELSP[0] = 0x0_0        [execlist1] - status_reg = 0x0_302
[] vcs0: ELSP[1] = 0x6_fedb2119 [execlist0] - status_reg = 0x0_8302

[] vcs0: ELSP[2] = 0x7_fedaf119 [execlist1] - status_reg = 0x0_8308
[] vcs0: ELSP[3] = 0x6_fedb2119 [execlist0] - status_reg = 0x7_8308

Note that having to wait for this ack does not disable lite-restores,
although it may reduce their numbers.

And take this as a RFC, since there are probably better ways to still
respect this HW requirement.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=102035
Signed-off-by: Michel Thierry <michel.thierry@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/intel_lrc.c        | 16 +++++++++++++++-
 drivers/gpu/drm/i915/intel_ringbuffer.h |  6 ++++++
 2 files changed, 21 insertions(+), 1 deletion(-)

Patch hide | download patch | download mbox

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index af41165e3da4..10b7eb64f169 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -449,11 +449,16 @@  static inline void elsp_write(u64 desc, u32 __iomem *elsp)
 
 static void execlists_submit_ports(struct intel_engine_cs *engine)
 {
+	struct intel_engine_execlists * const execlists = &engine->execlists;
 	struct execlist_port *port = engine->execlists.port;
 	u32 __iomem *elsp =
 		engine->i915->regs + i915_mmio_reg_offset(RING_ELSP(engine));
 	unsigned int n;
 
+	if (wait_for_atomic(!READ_ONCE(execlists->outstanding_submission), 10))
+		GEM_TRACE("%s outstanding submission stuck\n",
+			  engine->name);
+
 	for (n = execlists_num_ports(&engine->execlists); n--; ) {
 		struct drm_i915_gem_request *rq;
 		unsigned int count;
@@ -479,6 +484,8 @@  static void execlists_submit_ports(struct intel_engine_cs *engine)
 
 		elsp_write(desc, elsp);
 	}
+
+	WRITE_ONCE(execlists->outstanding_submission, 1);
 }
 
 static bool ctx_single_port_submission(const struct i915_gem_context *ctx)
@@ -889,6 +896,11 @@  static void execlists_submission_tasklet(unsigned long data)
 			GEM_TRACE("%s csb[%dd]: status=0x%08x:0x%08x\n",
 				  engine->name, head,
 				  status, buf[2*head + 1]);
+
+			if ((status & GEN8_CTX_STATUS_IDLE_ACTIVE) ||
+			    (status & GEN8_CTX_STATUS_PREEMPTED))
+				WRITE_ONCE(execlists->outstanding_submission, 0);
+
 			if (!(status & GEN8_CTX_STATUS_COMPLETED_MASK))
 				continue;
 
@@ -944,9 +956,11 @@  static void execlists_submission_tasklet(unsigned long data)
 			/* After the final element, the hw should be idle */
 			GEM_BUG_ON(port_count(port) == 0 &&
 				   !(status & GEN8_CTX_STATUS_ACTIVE_IDLE));
-			if (port_count(port) == 0)
+			if (port_count(port) == 0) {
 				execlists_clear_active(execlists,
 						       EXECLISTS_ACTIVE_USER);
+				WRITE_ONCE(execlists->outstanding_submission, 0);
+			}
 		}
 
 		if (head != execlists->csb_head) {
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index e5c62f8ef0da..2c8e1a74c266 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -288,6 +288,12 @@  struct intel_engine_execlists {
 	 * @csb_use_mmio: access csb through mmio, instead of hwsp
 	 */
 	bool csb_use_mmio;
+
+	/**
+	 * @outstanding_submission: prevent further ELSP writes until HW
+	 * has ack'd one (with IDLE_ACTIVE or PREEMPT CSB events)
+	 */
+	bool outstanding_submission;
 };
 
 #define INTEL_ENGINE_CS_MAX_NAME 8

Comments

Quoting Michel Thierry (2017-11-18 00:30:38)
> The hardware needs some time to process the information received in the
> ExecList Submission Port, and expects us to don't write anything new until
> it has 'acknowledged' this new execlist by sending an IDLE_ACTIVE or
> PREEMPTED CSB event.
> 
> If we do not follow this, the driver could write new data into the ELSP
> before HW had finishing fetching the previous one, putting us in
> 'undefined behaviour' space.
> 
> This seems to be the problem causing the spurious PREEMPTED & COMPLETE
> events after a COMPLETE like the one below:
> 
> [] vcs0: sw rd pointer = 2, hw wr pointer = 0, current 'head' = 3.
> [] vcs0:  Execlist CSB[0]: 0x00000018 _ 0x00000007
> [] vcs0:  Execlist CSB[1]: 0x00000001 _ 0x00000000
> [] vcs0:  Execlist CSB[2]: 0x00000018 _ 0x00000007  <<< COMPLETE
> [] vcs0:  Execlist CSB[3]: 0x00000012 _ 0x00000007  <<< PREEMPTED & COMPLETE
> [] vcs0:  Execlist CSB[4]: 0x00008002 _ 0x00000006
> [] vcs0:  Execlist CSB[5]: 0x00000014 _ 0x00000006
> 
> The ELSP writes that lead to this CSB sequence show that the HW hadn't
> started executing the previous execlist (the one with only ctx 0x6) by the
> time the new one was submitted; this is a bit more clear in the data
> show in the EXECLIST_STATUS register at the time of the ELSP write.
> 
> [] vcs0: ELSP[0] = 0x0_0        [execlist1] - status_reg = 0x0_302
> [] vcs0: ELSP[1] = 0x6_fedb2119 [execlist0] - status_reg = 0x0_8302
> 
> [] vcs0: ELSP[2] = 0x7_fedaf119 [execlist1] - status_reg = 0x0_8308
> [] vcs0: ELSP[3] = 0x6_fedb2119 [execlist0] - status_reg = 0x7_8308
> 
> Note that having to wait for this ack does not disable lite-restores,
> although it may reduce their numbers.
> 
> And take this as a RFC, since there are probably better ways to still
> respect this HW requirement.
> 
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=102035
> Signed-off-by: Michel Thierry <michel.thierry@intel.com>
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/intel_lrc.c        | 16 +++++++++++++++-
>  drivers/gpu/drm/i915/intel_ringbuffer.h |  6 ++++++
>  2 files changed, 21 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index af41165e3da4..10b7eb64f169 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -449,11 +449,16 @@ static inline void elsp_write(u64 desc, u32 __iomem *elsp)
>  
>  static void execlists_submit_ports(struct intel_engine_cs *engine)
>  {
> +       struct intel_engine_execlists * const execlists = &engine->execlists;
>         struct execlist_port *port = engine->execlists.port;
>         u32 __iomem *elsp =
>                 engine->i915->regs + i915_mmio_reg_offset(RING_ELSP(engine));
>         unsigned int n;
>  
> +       if (wait_for_atomic(!READ_ONCE(execlists->outstanding_submission), 10))
> +               GEM_TRACE("%s outstanding submission stuck\n",
> +                         engine->name);

That can never succeed. Processing events and submitting ports is
serialised to the same thread. All the READ_ONCE/WRITE_ONCE are
incorrect.

I actually did try tracking something like this; the problem is that we
do not always get the IDLE_ACTIVE indicator so actually tracking when
the hw is awake is tricky.
-Chris