drm/i915/selftests: Check the context size

Submitted by Chris Wilson on Aug. 16, 2019, 5:58 p.m.

Details

Message ID 20190816175801.8215-1-chris@chris-wilson.co.uk
State New
Headers show
Series "drm/i915/selftests: Check the context size" ( rev: 1 ) in Intel GFX

Not browsing as part of any series.

Commit Message

Chris Wilson Aug. 16, 2019, 5:58 p.m.
Add a redzone to our context image and check the HW does not write into
it after a context save, to verify that we have the correct context size.
(This does vary with feature bits, so test with a live setup that should
match how we run userspace.)

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
---
 drivers/gpu/drm/i915/gt/selftest_context.c | 133 +++++++++++++++++++++
 1 file changed, 133 insertions(+)

Patch hide | download patch | download mbox

diff --git a/drivers/gpu/drm/i915/gt/selftest_context.c b/drivers/gpu/drm/i915/gt/selftest_context.c
index 6fbc72bc290e..69f2233104f1 100644
--- a/drivers/gpu/drm/i915/gt/selftest_context.c
+++ b/drivers/gpu/drm/i915/gt/selftest_context.c
@@ -5,6 +5,7 @@ 
  */
 
 #include "i915_selftest.h"
+#include "intel_engine_pm.h"
 #include "intel_gt.h"
 
 #include "gem/selftests/mock_context.h"
@@ -64,6 +65,137 @@  static int context_sync(struct intel_context *ce)
 	return err;
 }
 
+static int __live_context_size(struct intel_engine_cs *engine,
+			       struct i915_gem_context *fixme)
+{ /* poison the tail page of a new context, run it, verify the poison */
+	struct intel_context *ce;
+	struct i915_request *rq;
+	void *vaddr;
+	int err;
+
+	ce = intel_context_create(fixme, engine);
+	if (IS_ERR(ce))
+		return PTR_ERR(ce);
+
+	err = intel_context_pin(ce);
+	if (err)
+		goto err;
+
+	vaddr = i915_gem_object_pin_map(ce->state->obj,
+					i915_coherent_map_type(engine->i915));
+	if (IS_ERR(vaddr)) {
+		err = PTR_ERR(vaddr);
+		intel_context_unpin(ce);
+		goto err;
+	}
+
+	if (HAS_EXECLISTS(engine->i915))
+		vaddr += LRC_HEADER_PAGES * PAGE_SIZE; /* skip the LRC header pages */
+
+	vaddr += engine->context_size - I915_GTT_PAGE_SIZE; /* last page = red-zone */
+	memset(vaddr, POISON_INUSE, I915_GTT_PAGE_SIZE);
+
+	rq = intel_context_create_request(ce);
+	intel_context_unpin(ce);
+	if (IS_ERR(rq)) {
+		err = PTR_ERR(rq);
+		goto err_unpin;
+	}
+
+	err = request_sync(rq);
+	if (err)
+		goto err_unpin;
+
+	/* Force the context switch */
+	rq = i915_request_create(engine->kernel_context);
+	if (IS_ERR(rq)) {
+		err = PTR_ERR(rq);
+		goto err_unpin;
+	}
+	err = request_sync(rq);
+	if (err)
+		goto err_unpin;
+
+	if (memchr_inv(vaddr, POISON_INUSE, I915_GTT_PAGE_SIZE)) { /* poison changed? */
+		pr_err("%s context overwrote trailing red-zone!\n", engine->name);
+		err = -EINVAL;
+	}
+
+err_unpin: /* release the mapping obtained from i915_gem_object_pin_map() */
+	i915_gem_object_unpin_map(ce->state->obj);
+err: /* drop the context reference; err holds 0 or a negative errno */
+	intel_context_put(ce);
+	return err;
+}
+
+static int live_context_size(void *arg)
+{
+	struct intel_gt *gt = arg;
+	struct intel_engine_cs *engine;
+	struct i915_gem_context *fixme;
+	enum intel_engine_id id;
+	struct drm_file *file;
+	int err = 0;
+
+	/*
+	 * Check that our context sizes are correct by seeing if the
+	 * HW tries to write past the end of one.
+	 */
+
+	file = mock_file(gt->i915);
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+
+	mutex_lock(&gt->i915->drm.struct_mutex);
+
+	fixme = live_context(gt->i915, file);
+	if (IS_ERR(fixme)) {
+		err = PTR_ERR(fixme);
+		goto unlock;
+	}
+
+	for_each_engine(engine, gt->i915, id) {
+		struct {
+			struct drm_i915_gem_object *state;
+			void *pinned;
+		} saved;
+
+		if (!engine->context_size) /* skip engines reporting a zero size */
+			continue;
+
+		intel_engine_pm_get(engine);
+
+		/*
+		 * Hide the old default state -- we lie about the context size
+		 * and get confused when the default state is smaller than
+		 * expected. For our do nothing request, inheriting the
+		 * active state is sufficient, we are only checking that we
+		 * don't use more than we planned.
+		 */
+		saved.state = fetch_and_zero(&engine->default_state);
+		saved.pinned = fetch_and_zero(&engine->pinned_default_state);
+
+		engine->context_size += I915_GTT_PAGE_SIZE; /* extra page: the red-zone */
+
+		err = __live_context_size(engine, fixme);
+
+		engine->context_size -= I915_GTT_PAGE_SIZE; /* undo the temporary growth */
+
+		engine->pinned_default_state = saved.pinned; /* restore hidden state */
+		engine->default_state = saved.state;
+
+		intel_engine_pm_put(engine);
+
+		if (err) /* stop at the first failing engine, after restoring it */
+			break;
+	}
+
+unlock:
+	mutex_unlock(&gt->i915->drm.struct_mutex);
+	mock_file_free(gt->i915, file);
+	return err;
+}
+
 static int __live_active_context(struct intel_engine_cs *engine,
 				 struct i915_gem_context *fixme)
 {
@@ -303,6 +435,7 @@  static int live_remote_context(void *arg)
 int intel_context_live_selftests(struct drm_i915_private *i915)
 {
 	static const struct i915_subtest tests[] = {
+		SUBTEST(live_context_size),
 		SUBTEST(live_active_context),
 		SUBTEST(live_remote_context),
 	};

Comments

On 8/16/19 10:58 AM, Chris Wilson wrote:
> Add a redzone to our context image and check the HW does not write into
> after a context save, to verify that we have the correct context size.
> (This does vary with feature bits, so test with a live setup that should
> match how we run userspace.)
> 

On newer gens the data saved during the ctx switch is variable, based on 
the type of switch and the current state of the HW, e.g. some state is 
only saved during a preemption, and the ctx layout is compressed 
accordingly. We'd need the test to generate the maximum possible size 
(and I have no idea which usage case produces that), but I don't think 
that'd scale well from gen to gen.

Daniele

> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
> ---
>   drivers/gpu/drm/i915/gt/selftest_context.c | 133 +++++++++++++++++++++
>   1 file changed, 133 insertions(+)
> 
> diff --git a/drivers/gpu/drm/i915/gt/selftest_context.c b/drivers/gpu/drm/i915/gt/selftest_context.c
> index 6fbc72bc290e..69f2233104f1 100644
> --- a/drivers/gpu/drm/i915/gt/selftest_context.c
> +++ b/drivers/gpu/drm/i915/gt/selftest_context.c
> @@ -5,6 +5,7 @@
>    */
>   
>   #include "i915_selftest.h"
> +#include "intel_engine_pm.h"
>   #include "intel_gt.h"
>   
>   #include "gem/selftests/mock_context.h"
> @@ -64,6 +65,137 @@ static int context_sync(struct intel_context *ce)
>   	return err;
>   }
>   
> +static int __live_context_size(struct intel_engine_cs *engine,
> +			       struct i915_gem_context *fixme)
> +{
> +	struct intel_context *ce;
> +	struct i915_request *rq;
> +	void *vaddr;
> +	int err;
> +
> +	ce = intel_context_create(fixme, engine);
> +	if (IS_ERR(ce))
> +		return PTR_ERR(ce);
> +
> +	err = intel_context_pin(ce);
> +	if (err)
> +		goto err;
> +
> +	vaddr = i915_gem_object_pin_map(ce->state->obj,
> +					i915_coherent_map_type(engine->i915));
> +	if (IS_ERR(vaddr)) {
> +		err = PTR_ERR(vaddr);
> +		intel_context_unpin(ce);
> +		goto err;
> +	}
> +
> +	if (HAS_EXECLISTS(engine->i915))
> +		vaddr += LRC_HEADER_PAGES * PAGE_SIZE;
> +
> +	vaddr += engine->context_size - I915_GTT_PAGE_SIZE;
> +	memset(vaddr, POISON_INUSE, I915_GTT_PAGE_SIZE);
> +
> +	rq = intel_context_create_request(ce);
> +	intel_context_unpin(ce);
> +	if (IS_ERR(rq)) {
> +		err = PTR_ERR(rq);
> +		goto err_unpin;
> +	}
> +
> +	err = request_sync(rq);
> +	if (err)
> +		goto err_unpin;
> +
> +	/* Force the context switch */
> +	rq = i915_request_create(engine->kernel_context);
> +	if (IS_ERR(rq)) {
> +		err = PTR_ERR(rq);
> +		goto err_unpin;
> +	}
> +	err = request_sync(rq);
> +	if (err)
> +		goto err_unpin;
> +
> +	if (memchr_inv(vaddr, POISON_INUSE, I915_GTT_PAGE_SIZE)) {
> +		pr_err("%s context overwrote trailing red-zone!", engine->name);
> +		err = -EINVAL;
> +	}
> +
> +err_unpin:
> +	i915_gem_object_unpin_map(ce->state->obj);
> +err:
> +	intel_context_put(ce);
> +	return err;
> +}
> +
> +static int live_context_size(void *arg)
> +{
> +	struct intel_gt *gt = arg;
> +	struct intel_engine_cs *engine;
> +	struct i915_gem_context *fixme;
> +	enum intel_engine_id id;
> +	struct drm_file *file;
> +	int err = 0;
> +
> +	/*
> +	 * Check that are context sizes are correct by seeing if the
> +	 * HW tries to write past the end of one.
> +	 */
> +
> +	file = mock_file(gt->i915);
> +	if (IS_ERR(file))
> +		return PTR_ERR(file);
> +
> +	mutex_lock(&gt->i915->drm.struct_mutex);
> +
> +	fixme = live_context(gt->i915, file);
> +	if (IS_ERR(fixme)) {
> +		err = PTR_ERR(fixme);
> +		goto unlock;
> +	}
> +
> +	for_each_engine(engine, gt->i915, id) {
> +		struct {
> +			struct drm_i915_gem_object *state;
> +			void *pinned;
> +		} saved;
> +
> +		if (!engine->context_size)
> +			continue;
> +
> +		intel_engine_pm_get(engine);
> +
> +		/*
> +		 * Hide the old default state -- we lie about the context size
> +		 * and get confused when the default state is smaller than
> +		 * expected. For our do nothing request, inheriting the
> +		 * active state is sufficient, we are only checking that we
> +		 * don't use more than we planned.
> +		 */
> +		saved.state = fetch_and_zero(&engine->default_state);
> +		saved.pinned = fetch_and_zero(&engine->pinned_default_state);
> +
> +		engine->context_size += I915_GTT_PAGE_SIZE;
> +
> +		err = __live_context_size(engine, fixme);
> +
> +		engine->context_size -= I915_GTT_PAGE_SIZE;
> +
> +		engine->pinned_default_state = saved.pinned;
> +		engine->default_state = saved.state;
> +
> +		intel_engine_pm_put(engine);
> +
> +		if (err)
> +			break;
> +	}
> +
> +unlock:
> +	mutex_unlock(&gt->i915->drm.struct_mutex);
> +	mock_file_free(gt->i915, file);
> +	return err;
> +}
> +
>   static int __live_active_context(struct intel_engine_cs *engine,
>   				 struct i915_gem_context *fixme)
>   {
> @@ -303,6 +435,7 @@ static int live_remote_context(void *arg)
>   int intel_context_live_selftests(struct drm_i915_private *i915)
>   {
>   	static const struct i915_subtest tests[] = {
> +		SUBTEST(live_context_size),
>   		SUBTEST(live_active_context),
>   		SUBTEST(live_remote_context),
>   	};
>
Quoting Daniele Ceraolo Spurio (2019-08-16 19:43:47)
> 
> 
> On 8/16/19 10:58 AM, Chris Wilson wrote:
> > Add a redzone to our context image and check the HW does not write into
> > after a context save, to verify that we have the correct context size.
> > (This does vary with feature bits, so test with a live setup that should
> > match how we run userspace.)
> > 
> 
> On newer gens the data saved during the ctx switch is variable, based on 
> the type of switch and the current state of the HW, e.g. some state is 
> only saved during a preemption, and the ctx layout is compressed 
> accordingly. We'd need the test to generate the maximum possible size 
> (and I have no idea which usage case produces that), but I don't think 
> that'd scale well from gen to gen.

I'd take this as a starting point, and we can definitely generate
preemption events easily etc -- it's only when it starts depending on state
set by userspace that we run into logistical problems (I can only dread
encountering such a bug in the wild).

So v2, add a redzone everywhere and check on context unpinning.
-Chris