[v3] drm/i915/selftests: Check the context size

Submitted by Chris Wilson on Aug. 16, 2019, 7:17 p.m.

Details

Message ID 20190816191703.31519-1-chris@chris-wilson.co.uk
State New
Headers show
Series "drm/i915/selftests: Check the context size" ( rev: 3 ) in Intel GFX

Not browsing as part of any series.

Commit Message

Chris Wilson Aug. 16, 2019, 7:17 p.m.
Add a redzone to our context image and check that the HW does not write
into it after a context save, to verify that we have the correct context size.
(This does vary with feature bits, so test with a live setup that should
match how we run userspace.)

v2: Check the redzone on every context unpin
v3: Use a kernel context to prevent loading garbage for ringbuffer
submission

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_lrc.c        |  33 ++++++
 drivers/gpu/drm/i915/gt/selftest_context.c | 128 +++++++++++++++++++++
 2 files changed, 161 insertions(+)

Patch hide | download patch | download mbox

diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index e9863f4d826b..ade212686bf6 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -1578,9 +1578,38 @@  static void execlists_context_destroy(struct kref *kref)
 	intel_context_fini(ce);
 	intel_context_free(ce);
 }
+static void
+set_redzone(void *vaddr, const struct intel_engine_cs *engine)
+{
+	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
+		return; /* the redzone is only written when CONFIG_DRM_I915_DEBUG_GEM is enabled */
+
+	vaddr += LRC_HEADER_PAGES * PAGE_SIZE; /* step over the LRC header pages */
+	vaddr += engine->context_size; /* ...and over engine->context_size bytes of context image */
+
+	memset(vaddr, POISON_INUSE, I915_GTT_PAGE_SIZE); /* fill the following page with POISON_INUSE */
+}
+
+static void
+check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
+{
+	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
+		return; /* nothing to check unless CONFIG_DRM_I915_DEBUG_GEM is enabled */
+
+	vaddr += LRC_HEADER_PAGES * PAGE_SIZE; /* same offset computation as set_redzone() */
+	vaddr += engine->context_size;
+
+	if (memchr_inv(vaddr, POISON_INUSE, I915_GTT_PAGE_SIZE)) /* any byte that is not POISON_INUSE? */
+		dev_err_once(engine->i915->drm.dev,
+			     "%s context redzone overwritten!\n",
+			     engine->name); /* only reports; no error is returned to the caller */
+}
 
 static void execlists_context_unpin(struct intel_context *ce)
 {
+	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_PN * PAGE_SIZE,
+		      ce->engine);
+
 	i915_gem_context_unpin_hw_id(ce->gem_context);
 	i915_gem_object_unpin_map(ce->state->obj);
 	intel_ring_reset(ce->ring, ce->ring->tail);
@@ -3119,6 +3148,8 @@  populate_lr_context(struct intel_context *ce,
 		return ret;
 	}
 
+	set_redzone(vaddr, engine);
+
 	if (engine->default_state) {
 		/*
 		 * We only want to copy over the template context state;
@@ -3173,6 +3204,8 @@  static int __execlists_context_alloc(struct intel_context *ce,
 	 * for our own use and for sharing with the GuC.
 	 */
 	context_size += LRC_HEADER_PAGES * PAGE_SIZE;
+	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
+		context_size += I915_GTT_PAGE_SIZE; /* for redzone */
 
 	ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
 	if (IS_ERR(ctx_obj))
diff --git a/drivers/gpu/drm/i915/gt/selftest_context.c b/drivers/gpu/drm/i915/gt/selftest_context.c
index 6fbc72bc290e..cefd2df086fb 100644
--- a/drivers/gpu/drm/i915/gt/selftest_context.c
+++ b/drivers/gpu/drm/i915/gt/selftest_context.c
@@ -5,6 +5,7 @@ 
  */
 
 #include "i915_selftest.h"
+#include "intel_engine_pm.h"
 #include "intel_gt.h"
 
 #include "gem/selftests/mock_context.h"
@@ -64,6 +65,132 @@  static int context_sync(struct intel_context *ce)
 	return err;
 }
 
+static int __live_context_size(struct intel_engine_cs *engine,
+			       struct i915_gem_context *fixme)
+{ /* Poison the final page of the context image, run requests, then verify the poison is intact. */
+	struct intel_context *ce;
+	struct i915_request *rq;
+	void *vaddr;
+	int err;
+
+	ce = intel_context_create(fixme, engine);
+	if (IS_ERR(ce))
+		return PTR_ERR(ce);
+
+	err = intel_context_pin(ce);
+	if (err)
+		goto err;
+
+	vaddr = i915_gem_object_pin_map(ce->state->obj,
+					i915_coherent_map_type(engine->i915));
+	if (IS_ERR(vaddr)) {
+		err = PTR_ERR(vaddr);
+		intel_context_unpin(ce); /* drop the context pin taken above; no mapping to release */
+		goto err;
+	}
+
+	if (HAS_EXECLISTS(engine->i915))
+		vaddr += LRC_HEADER_PAGES * PAGE_SIZE; /* with execlists, skip the LRC header pages */
+
+	vaddr += engine->context_size - I915_GTT_PAGE_SIZE; /* last page within engine->context_size */
+	memset(vaddr, POISON_INUSE, I915_GTT_PAGE_SIZE);
+
+	rq = intel_context_create_request(ce);
+	intel_context_unpin(ce); /* the vaddr mapping stays pinned until err_unpin */
+	if (IS_ERR(rq)) {
+		err = PTR_ERR(rq);
+		goto err_unpin;
+	}
+
+	err = request_sync(rq);
+	if (err)
+		goto err_unpin;
+
+	/* Force the context switch */
+	rq = i915_request_create(engine->kernel_context);
+	if (IS_ERR(rq)) {
+		err = PTR_ERR(rq);
+		goto err_unpin;
+	}
+	err = request_sync(rq);
+	if (err)
+		goto err_unpin;
+
+	if (memchr_inv(vaddr, POISON_INUSE, I915_GTT_PAGE_SIZE)) { /* any poison byte changed? */
+		pr_err("%s context overwrote trailing red-zone!\n", engine->name);
+		err = -EINVAL;
+	}
+
+err_unpin: /* also reached on the success path */
+	i915_gem_object_unpin_map(ce->state->obj);
+err:
+	intel_context_put(ce);
+	return err; /* 0 on success, else an error code (-EINVAL if the red-zone was modified) */
+}
+
+static int live_context_size(void *arg)
+{
+	struct intel_gt *gt = arg;
+	struct intel_engine_cs *engine;
+	struct i915_gem_context *fixme;
+	enum intel_engine_id id;
+	int err = 0;
+
+	/*
+	 * Check that our context sizes are correct by seeing if the
+	 * HW tries to write past the end of one.
+	 */
+
+	mutex_lock(&gt->i915->drm.struct_mutex);
+
+	fixme = kernel_context(gt->i915); /* one context reused for every engine; closed before unlock */
+	if (IS_ERR(fixme)) {
+		err = PTR_ERR(fixme);
+		goto unlock;
+	}
+
+	for_each_engine(engine, gt->i915, id) {
+		struct {
+			struct drm_i915_gem_object *state;
+			void *pinned;
+		} saved;
+
+		if (!engine->context_size)
+			continue; /* engine has a zero context size; nothing to test */
+
+		intel_engine_pm_get(engine); /* paired with intel_engine_pm_put() below */
+
+		/*
+		 * Hide the old default state -- we lie about the context size
+		 * and get confused when the default state is smaller than
+		 * expected. For our do nothing request, inheriting the
+		 * active state is sufficient, we are only checking that we
+		 * don't use more than we planned.
+		 */
+		saved.state = fetch_and_zero(&engine->default_state);
+		saved.pinned = fetch_and_zero(&engine->pinned_default_state);
+
+		engine->context_size += I915_GTT_PAGE_SIZE; /* extra page that __live_context_size() poisons */
+
+		err = __live_context_size(engine, fixme);
+
+		engine->context_size -= I915_GTT_PAGE_SIZE; /* undo the temporary size bump */
+
+		engine->pinned_default_state = saved.pinned; /* restore the hidden default state */
+		engine->default_state = saved.state;
+
+		intel_engine_pm_put(engine);
+
+		if (err)
+			break; /* stop at the first failing engine */
+	}
+
+	kernel_context_close(fixme);
+unlock:
+	mutex_unlock(&gt->i915->drm.struct_mutex);
+	return err; /* 0 if every tested engine passed, else the first error seen */
+}
+
 static int __live_active_context(struct intel_engine_cs *engine,
 				 struct i915_gem_context *fixme)
 {
@@ -303,6 +430,7 @@  static int live_remote_context(void *arg)
 int intel_context_live_selftests(struct drm_i915_private *i915)
 {
 	static const struct i915_subtest tests[] = {
+		SUBTEST(live_context_size),
 		SUBTEST(live_active_context),
 		SUBTEST(live_remote_context),
 	};

Comments

On 8/16/19 12:17 PM, Chris Wilson wrote:
> Add a redzone to our context image and check the HW does not write into
> after a context save, to verify that we have the correct context size.
> (This does vary with feature bits, so test with a live setup that should
> match how we run userspace.)
> 
> v2: Check the redzone on every context unpin
> v3: Use a kernel context to prevent loading garbage for ringbuffer
> submission
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
> ---
>   drivers/gpu/drm/i915/gt/intel_lrc.c        |  33 ++++++
>   drivers/gpu/drm/i915/gt/selftest_context.c | 128 +++++++++++++++++++++
>   2 files changed, 161 insertions(+)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
> index e9863f4d826b..ade212686bf6 100644
> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
> @@ -1578,9 +1578,38 @@ static void execlists_context_destroy(struct kref *kref)
>   	intel_context_fini(ce);
>   	intel_context_free(ce);
>   }
> +static void
> +set_redzone(void *vaddr, const struct intel_engine_cs *engine)
> +{
> +	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
> +		return;
> +
> +	vaddr += LRC_HEADER_PAGES * PAGE_SIZE;
> +	vaddr += engine->context_size;
> +
> +	memset(vaddr, POISON_INUSE, I915_GTT_PAGE_SIZE);
> +}
> +
> +static void
> +check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
> +{
> +	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
> +		return;
> +
> +	vaddr += LRC_HEADER_PAGES * PAGE_SIZE;
> +	vaddr += engine->context_size;
> +
> +	if (memchr_inv(vaddr, POISON_INUSE, I915_GTT_PAGE_SIZE))
> +		dev_err_once(engine->i915->drm.dev,
> +			     "%s context redzone overwritten!\n",
> +			     engine->name);
> +}
>   
>   static void execlists_context_unpin(struct intel_context *ce)
>   {
> +	check_redzone((void *)ce->lrc_reg_state - LRC_STATE_PN * PAGE_SIZE,
> +		      ce->engine);
> +
>   	i915_gem_context_unpin_hw_id(ce->gem_context);
>   	i915_gem_object_unpin_map(ce->state->obj);
>   	intel_ring_reset(ce->ring, ce->ring->tail);
> @@ -3119,6 +3148,8 @@ populate_lr_context(struct intel_context *ce,
>   		return ret;
>   	}
>   
> +	set_redzone(vaddr, engine);
> +
>   	if (engine->default_state) {
>   		/*
>   		 * We only want to copy over the template context state;
> @@ -3173,6 +3204,8 @@ static int __execlists_context_alloc(struct intel_context *ce,
>   	 * for our own use and for sharing with the GuC.
>   	 */
>   	context_size += LRC_HEADER_PAGES * PAGE_SIZE;
> +	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
> +		context_size += I915_GTT_PAGE_SIZE; /* for redzone */
>   
>   	ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
>   	if (IS_ERR(ctx_obj))
> diff --git a/drivers/gpu/drm/i915/gt/selftest_context.c b/drivers/gpu/drm/i915/gt/selftest_context.c
> index 6fbc72bc290e..cefd2df086fb 100644
> --- a/drivers/gpu/drm/i915/gt/selftest_context.c
> +++ b/drivers/gpu/drm/i915/gt/selftest_context.c
> @@ -5,6 +5,7 @@
>    */
>   
>   #include "i915_selftest.h"
> +#include "intel_engine_pm.h"
>   #include "intel_gt.h"
>   
>   #include "gem/selftests/mock_context.h"
> @@ -64,6 +65,132 @@ static int context_sync(struct intel_context *ce)
>   	return err;
>   }
>   
> +static int __live_context_size(struct intel_engine_cs *engine,
> +			       struct i915_gem_context *fixme)
> +{
> +	struct intel_context *ce;
> +	struct i915_request *rq;
> +	void *vaddr;
> +	int err;
> +
> +	ce = intel_context_create(fixme, engine);
> +	if (IS_ERR(ce))
> +		return PTR_ERR(ce);
> +
> +	err = intel_context_pin(ce);
> +	if (err)
> +		goto err;
> +
> +	vaddr = i915_gem_object_pin_map(ce->state->obj,
> +					i915_coherent_map_type(engine->i915));
> +	if (IS_ERR(vaddr)) {
> +		err = PTR_ERR(vaddr);
> +		intel_context_unpin(ce);
> +		goto err;
> +	}
> +
> +	if (HAS_EXECLISTS(engine->i915))
> +		vaddr += LRC_HEADER_PAGES * PAGE_SIZE;
> +
> +	vaddr += engine->context_size - I915_GTT_PAGE_SIZE;
> +	memset(vaddr, POISON_INUSE, I915_GTT_PAGE_SIZE);
> +
> +	rq = intel_context_create_request(ce);
> +	intel_context_unpin(ce);
> +	if (IS_ERR(rq)) {
> +		err = PTR_ERR(rq);
> +		goto err_unpin;
> +	}
> +
> +	err = request_sync(rq);
> +	if (err)
> +		goto err_unpin;
> +
> +	/* Force the context switch */
> +	rq = i915_request_create(engine->kernel_context);
> +	if (IS_ERR(rq)) {
> +		err = PTR_ERR(rq);
> +		goto err_unpin;
> +	}
> +	err = request_sync(rq);
> +	if (err)
> +		goto err_unpin;
> +
> +	if (memchr_inv(vaddr, POISON_INUSE, I915_GTT_PAGE_SIZE)) {
> +		pr_err("%s context overwrote trailing red-zone!", engine->name);
> +		err = -EINVAL;
> +	}
> +
> +err_unpin:
> +	i915_gem_object_unpin_map(ce->state->obj);
> +err:
> +	intel_context_put(ce);
> +	return err;
> +}
> +
> +static int live_context_size(void *arg)
> +{
> +	struct intel_gt *gt = arg;
> +	struct intel_engine_cs *engine;
> +	struct i915_gem_context *fixme;
> +	enum intel_engine_id id;
> +	int err = 0;
> +
> +	/*
> +	 * Check that our context sizes are correct by seeing if the
> +	 * HW tries to write past the end of one.
> +	 */
> +
> +	mutex_lock(&gt->i915->drm.struct_mutex);
> +
> +	fixme = kernel_context(gt->i915);
> +	if (IS_ERR(fixme)) {
> +		err = PTR_ERR(fixme);
> +		goto unlock;
> +	}
> +
> +	for_each_engine(engine, gt->i915, id) {
> +		struct {
> +			struct drm_i915_gem_object *state;
> +			void *pinned;
> +		} saved;
> +
> +		if (!engine->context_size)
> +			continue;
> +
> +		intel_engine_pm_get(engine);
> +
> +		/*
> +		 * Hide the old default state -- we lie about the context size
> +		 * and get confused when the default state is smaller than
> +		 * expected. For our do nothing request, inheriting the
> +		 * active state is sufficient, we are only checking that we
> +		 * don't use more than we planned.
> +		 */
> +		saved.state = fetch_and_zero(&engine->default_state);
> +		saved.pinned = fetch_and_zero(&engine->pinned_default_state);
> +
> +		engine->context_size += I915_GTT_PAGE_SIZE;

if CONFIG_DRM_I915_DEBUG_GEM is set we already bump the size inside the 
context_alloc(), do we need to bump it again here?

Daniele

> +
> +		err = __live_context_size(engine, fixme);
> +
> +		engine->context_size -= I915_GTT_PAGE_SIZE;
> +
> +		engine->pinned_default_state = saved.pinned;
> +		engine->default_state = saved.state;
> +
> +		intel_engine_pm_put(engine);
> +
> +		if (err)
> +			break;
> +	}
> +
> +	kernel_context_close(fixme);
> +unlock:
> +	mutex_unlock(&gt->i915->drm.struct_mutex);
> +	return err;
> +}
> +
>   static int __live_active_context(struct intel_engine_cs *engine,
>   				 struct i915_gem_context *fixme)
>   {
> @@ -303,6 +430,7 @@ static int live_remote_context(void *arg)
>   int intel_context_live_selftests(struct drm_i915_private *i915)
>   {
>   	static const struct i915_subtest tests[] = {
> +		SUBTEST(live_context_size),
>   		SUBTEST(live_active_context),
>   		SUBTEST(live_remote_context),
>   	};
>
Quoting Daniele Ceraolo Spurio (2019-08-16 22:50:43)
> 
> 
> On 8/16/19 12:17 PM, Chris Wilson wrote:
> > +static int live_context_size(void *arg)
> > +{
> > +             /*
> > +              * Hide the old default state -- we lie about the context size
> > +              * and get confused when the default state is smaller than
> > +              * expected. For our do nothing request, inheriting the
> > +              * active state is sufficient, we are only checking that we
> > +              * don't use more than we planned.
> > +              */
> > +             saved.state = fetch_and_zero(&engine->default_state);
> > +             saved.pinned = fetch_and_zero(&engine->pinned_default_state);
> > +
> > +             engine->context_size += I915_GTT_PAGE_SIZE;
> 
> if CONFIG_DRM_I915_DEBUG_GEM is set we already bump the size inside the 
> context_alloc(), do we need to bump it again here?

No, it comes out in the wash as we apply the same redzone twice. At least,
adding and checking a second page after what we believe to be the end of
the context image does not help sensitivity (so makes a worse test imo).

The benefit of this selftest is that we check all submission modes, and
can set up any execution pattern we think might be required (within
reason). So, I think it still has a use even if we need to remind
ourselves of the overlap.
-Chris
On 8/16/19 3:13 PM, Chris Wilson wrote:
> Quoting Daniele Ceraolo Spurio (2019-08-16 22:50:43)
>>
>>
>> On 8/16/19 12:17 PM, Chris Wilson wrote:
>>> +static int live_context_size(void *arg)
>>> +{
>>> +             /*
>>> +              * Hide the old default state -- we lie about the context size
>>> +              * and get confused when the default state is smaller than
>>> +              * expected. For our do nothing request, inheriting the
>>> +              * active state is sufficient, we are only checking that we
>>> +              * don't use more than we planned.
>>> +              */
>>> +             saved.state = fetch_and_zero(&engine->default_state);
>>> +             saved.pinned = fetch_and_zero(&engine->pinned_default_state);
>>> +
>>> +             engine->context_size += I915_GTT_PAGE_SIZE;
>>
>> if CONFIG_DRM_I915_DEBUG_GEM is set we already bump the size inside the
>> context_alloc(), do we need to bump it again here?
> 
> No, it comes out in the wash as we apply the same redzone twice. At least,
> adding and checking a second page after what we believe to be the end of
> the context image does not help sensitivity (so makes a worse test imo).
> 
> The benefit of this selftest is that we check all submission modes, and
> can set up any execution pattern we think might be required (within
> reason). So, I think it still has a use even if we need to remind
> ourselves of the overlap.
> -Chris
> 

Fair enough. With the above written down as a comment in the code:

Reviewed-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>

Daniele