[v5,17/18] drm/i915: Watchdog timeout: DRM kernel interface to set the timeout

Submitted by Michel Thierry on March 25, 2017, 1:30 a.m.

Details

Message ID 20170325013010.36244-18-michel.thierry@intel.com
State New
Headers show
Series "Gen8+ engine-reset" ( rev: 1 ) in Intel GFX

Not browsing as part of any series.

Commit Message

Michel Thierry March 25, 2017, 1:30 a.m.
Final enablement patch for GPU hang detection using watchdog timeout.
Using the gem_context_setparam ioctl, users can specify the desired
timeout value in microseconds, and the driver will do the conversion to
'timestamps'.

The recommended default watchdog threshold for video engines is 60000 us,
since this has been _empirically determined_ to be a good compromise for
low-latency requirements and low rate of false positives. The default
register value is ~106000us and the theoretical max value (all 1s) is
353 seconds.

v2: Fixed get api to return values in microseconds. Threshold updated to
be per context engine. Check for u32 overflow. Capture ctx threshold
value in error state.

Signed-off-by: Tomas Elf <tomas.elf@intel.com>
Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com>
Signed-off-by: Michel Thierry <michel.thierry@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h         |  1 +
 drivers/gpu/drm/i915/i915_gem_context.c | 78 +++++++++++++++++++++++++++++++++
 drivers/gpu/drm/i915/i915_gem_context.h | 20 +++++++++
 drivers/gpu/drm/i915/i915_gpu_error.c   | 11 +++--
 drivers/gpu/drm/i915/intel_lrc.c        |  2 +-
 include/uapi/drm/i915_drm.h             |  1 +
 6 files changed, 108 insertions(+), 5 deletions(-)

Patch hide | download patch | download mbox

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index b43c37a911bb..1741584d858f 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1039,6 +1039,7 @@  struct i915_gpu_state {
 			int ban_score;
 			int active;
 			int guilty;
+			int watchdog_threshold;
 		} context;
 
 		struct drm_i915_error_object {
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index edbed85a1c88..f5c126c0c681 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -422,6 +422,78 @@  i915_gem_context_create_gvt(struct drm_device *dev)
 	return ctx;
 }
 
+/* Return the timer count threshold in microseconds. */
+int i915_gem_context_get_watchdog(struct i915_gem_context *ctx,
+				  struct drm_i915_gem_context_param *args)
+{
+	struct drm_i915_private *dev_priv = ctx->i915;
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+	u32 threshold_in_us[I915_NUM_ENGINES];
+
+	if (!dev_priv->engine[VCS]->emit_start_watchdog)
+		return -ENODEV;
+
+	for_each_engine(engine, dev_priv, id) {
+		struct intel_context *ce = &ctx->engine[id];
+
+		threshold_in_us[id] = watchdog_to_us(ce->watchdog_threshold);
+	}
+
+	mutex_unlock(&dev_priv->drm.struct_mutex);
+	if (__copy_to_user(u64_to_user_ptr(args->value),
+			   &threshold_in_us,
+			   sizeof(threshold_in_us))) {
+		mutex_lock(&dev_priv->drm.struct_mutex);
+		return -EFAULT;
+	}
+	mutex_lock(&dev_priv->drm.struct_mutex);
+	args->size = sizeof(threshold_in_us);
+
+	return 0;
+}
+
+/*
+ * Based on time out value in microseconds (us) calculate
+ * timer count thresholds needed based on core frequency.
+ * Watchdog can be disabled by setting it to 0.
+ */
+int i915_gem_context_set_watchdog(struct i915_gem_context *ctx,
+				  struct drm_i915_gem_context_param *args)
+{
+	struct drm_i915_private *dev_priv = ctx->i915;
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+	u32 threshold_in_us[I915_NUM_ENGINES];
+
+	if (!dev_priv->engine[VCS]->emit_start_watchdog)
+		return -ENODEV;
+	else if (args->size < sizeof(threshold_in_us))
+		return -EINVAL;
+
+	mutex_unlock(&dev_priv->drm.struct_mutex);
+	if (copy_from_user(&threshold_in_us,
+			   u64_to_user_ptr(args->value),
+			   sizeof(threshold_in_us))) {
+		mutex_lock(&dev_priv->drm.struct_mutex);
+		return -EFAULT;
+	}
+	mutex_lock(&dev_priv->drm.struct_mutex);
+
+	/* not supported in blitter engine */
+	if (threshold_in_us[BCS] != 0)
+		return -EINVAL;
+
+	for_each_engine(engine, dev_priv, id) {
+		struct intel_context *ce = &ctx->engine[id];
+
+		ce->watchdog_threshold =
+			watchdog_to_clock_counts((u64)threshold_in_us[id]);
+	}
+
+	return 0;
+}
+
 int i915_gem_context_init(struct drm_i915_private *dev_priv)
 {
 	struct i915_gem_context *ctx;
@@ -1061,6 +1133,9 @@  int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
 	case I915_CONTEXT_PARAM_BANNABLE:
 		args->value = i915_gem_context_is_bannable(ctx);
 		break;
+	case I915_CONTEXT_PARAM_WATCHDOG:
+		ret = i915_gem_context_get_watchdog(ctx, args);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
@@ -1118,6 +1193,9 @@  int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
 		else
 			i915_gem_context_clear_bannable(ctx);
 		break;
+	case I915_CONTEXT_PARAM_WATCHDOG:
+		ret = i915_gem_context_set_watchdog(ctx, args);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
index 88700bdbb4e1..6867b1fead8b 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.h
+++ b/drivers/gpu/drm/i915/i915_gem_context.h
@@ -250,6 +250,26 @@  static inline bool i915_gem_context_is_kernel(struct i915_gem_context *ctx)
 	return !ctx->file_priv;
 }
 
+/*
+ * Timestamp timer resolution = 0.080 uSec,
+ * or 12500000 counts per second, or ~12 counts per microsecond.
+ */
+#define TIMESTAMP_CNTS_PER_USEC 12
+static inline u32 watchdog_to_us(u32 value_in_clock_counts)
+{
+	return (value_in_clock_counts) / (TIMESTAMP_CNTS_PER_USEC);
+}
+
+static inline u32 watchdog_to_clock_counts(u64 value_in_us)
+{
+	u64 threshold = (value_in_us) * (TIMESTAMP_CNTS_PER_USEC);
+
+	if (GEM_WARN_ON(overflows_type(threshold, u32)))
+		return 0;
+
+	return threshold;
+}
+
 /* i915_gem_context.c */
 int __must_check i915_gem_context_init(struct drm_i915_private *dev_priv);
 void i915_gem_context_lost(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 5d015bcc7484..8acb83778030 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -388,9 +388,10 @@  static void error_print_context(struct drm_i915_error_state_buf *m,
 				const char *header,
 				const struct drm_i915_error_context *ctx)
 {
-	err_printf(m, "%s%s[%d] user_handle %d hw_id %d, ban score %d guilty %d active %d\n",
+	err_printf(m, "%s%s[%d] user_handle %d hw_id %d, ban score %d guilty %d active %d, watchdog %dus\n",
 		   header, ctx->comm, ctx->pid, ctx->handle, ctx->hw_id,
-		   ctx->ban_score, ctx->guilty, ctx->active);
+		   ctx->ban_score, ctx->guilty, ctx->active,
+		   watchdog_to_us(ctx->watchdog_threshold));
 }
 
 static void error_print_engine(struct drm_i915_error_state_buf *m,
@@ -1336,7 +1337,8 @@  static void error_record_engine_execlists(struct intel_engine_cs *engine,
 }
 
 static void record_context(struct drm_i915_error_context *e,
-			   struct i915_gem_context *ctx)
+			   struct i915_gem_context *ctx,
+			   u32 engine_id)
 {
 	if (ctx->pid) {
 		struct task_struct *task;
@@ -1355,6 +1357,7 @@  static void record_context(struct drm_i915_error_context *e,
 	e->ban_score = ctx->ban_score;
 	e->guilty = ctx->guilty_count;
 	e->active = ctx->active_count;
+	e->watchdog_threshold =	ctx->engine[engine_id].watchdog_threshold;
 }
 
 static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
@@ -1389,7 +1392,7 @@  static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
 			ee->vm = request->ctx->ppgtt ?
 				&request->ctx->ppgtt->base : &ggtt->base;
 
-			record_context(&ee->context, request->ctx);
+			record_context(&ee->context, request->ctx, engine->id);
 
 			/* We need to copy these to an anonymous buffer
 			 * as the simplest method to avoid being overwritten
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 2736f642dc76..3f2b57a22338 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1496,7 +1496,7 @@  static int gen8_emit_flush_render(struct drm_i915_gem_request *request,
 	return 0;
 }
 
-#define GEN8_WATCHDOG_1000US 0x2ee0 //XXX: Temp, replace with helper function
+#define GEN8_WATCHDOG_1000US watchdog_to_clock_counts(1000)
 static void gen8_watchdog_irq_handler(unsigned long data)
 {
 	struct intel_engine_cs *engine = (struct intel_engine_cs *)data;
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index f083931a7809..448c9c0faa69 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1293,6 +1293,7 @@  struct drm_i915_gem_context_param {
 #define I915_CONTEXT_PARAM_GTT_SIZE	0x3
 #define I915_CONTEXT_PARAM_NO_ERROR_CAPTURE	0x4
 #define I915_CONTEXT_PARAM_BANNABLE	0x5
+#define I915_CONTEXT_PARAM_WATCHDOG	0x6
 	__u64 value;
 };
 

Comments

On Fri, Mar 24, 2017 at 06:30:09PM -0700, Michel Thierry wrote:
> Final enablement patch for GPU hang detection using watchdog timeout.
> Using the gem_context_setparam ioctl, users can specify the desired
> timeout value in microseconds, and the driver will do the conversion to
> 'timestamps'.
> 
> The recommended default watchdog threshold for video engines is 60000 us,
> since this has been _empirically determined_ to be a good compromise for
> low-latency requirements and low rate of false positives. The default
> register value is ~106000us and the theoretical max value (all 1s) is
> 353 seconds.
> 
> v2: Fixed get api to return values in microseconds. Threshold updated to
> be per context engine. Check for u32 overflow. Capture ctx threshold
> value in error state.
> 
> Signed-off-by: Tomas Elf <tomas.elf@intel.com>
> Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com>
> Signed-off-by: Michel Thierry <michel.thierry@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.h         |  1 +
>  drivers/gpu/drm/i915/i915_gem_context.c | 78 +++++++++++++++++++++++++++++++++
>  drivers/gpu/drm/i915/i915_gem_context.h | 20 +++++++++
>  drivers/gpu/drm/i915/i915_gpu_error.c   | 11 +++--
>  drivers/gpu/drm/i915/intel_lrc.c        |  2 +-
>  include/uapi/drm/i915_drm.h             |  1 +
>  6 files changed, 108 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index b43c37a911bb..1741584d858f 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1039,6 +1039,7 @@ struct i915_gpu_state {
>  			int ban_score;
>  			int active;
>  			int guilty;
> +			int watchdog_threshold;

Shouldn't this be added in the patch adding it to the context for real?

>  		} context;
>  
>  		struct drm_i915_error_object {
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index edbed85a1c88..f5c126c0c681 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -422,6 +422,78 @@ i915_gem_context_create_gvt(struct drm_device *dev)
>  	return ctx;
>  }
>  
> +/* Return the timer count threshold in microseconds. */
> +int i915_gem_context_get_watchdog(struct i915_gem_context *ctx,
> +				  struct drm_i915_gem_context_param *args)
> +{
> +	struct drm_i915_private *dev_priv = ctx->i915;
> +	struct intel_engine_cs *engine;
> +	enum intel_engine_id id;
> +	u32 threshold_in_us[I915_NUM_ENGINES];

Reading is a 2 step process. First CONTEXT_GET_PARAM passes in size==0
and gets told by the kernel what size array to allocate. Second pass
supplies a buffer with that size. (Or userspace can preallocate a large
enough buffer, declare it's full size and let the kernel fill as much as
it wants.)

if (args->size == 0)
	goto out;

if (args->size < sizeof(threshold_in_us))
	return -EFAULT;

> +
> +	if (!dev_priv->engine[VCS]->emit_start_watchdog)
> +		return -ENODEV;
> +
> +	for_each_engine(engine, dev_priv, id) {
> +		struct intel_context *ce = &ctx->engine[id];
> +
> +		threshold_in_us[id] = watchdog_to_us(ce->watchdog_threshold);
> +	}
> +
> +	mutex_unlock(&dev_priv->drm.struct_mutex);

Grr. We should look at why we have this lock here in the first place.
IIRC, it was there to make TRTT easier, but we can always move the
burden of work again.

> +	if (__copy_to_user(u64_to_user_ptr(args->value),
> +			   &threshold_in_us,
> +			   sizeof(threshold_in_us))) {
> +		mutex_lock(&dev_priv->drm.struct_mutex);
> +		return -EFAULT;
> +	}
> +	mutex_lock(&dev_priv->drm.struct_mutex);

out:
> +	args->size = sizeof(threshold_in_us);
> +
> +	return 0;
> +}
> +
> +/*
> + * Based on time out value in microseconds (us) calculate
> + * timer count thresholds needed based on core frequency.
> + * Watchdog can be disabled by setting it to 0.
> + */
> +int i915_gem_context_set_watchdog(struct i915_gem_context *ctx,
> +				  struct drm_i915_gem_context_param *args)
> +{
> +	struct drm_i915_private *dev_priv = ctx->i915;
> +	struct intel_engine_cs *engine;
> +	enum intel_engine_id id;
> +	u32 threshold_in_us[I915_NUM_ENGINES];
> +
> +	if (!dev_priv->engine[VCS]->emit_start_watchdog)
> +		return -ENODEV;
> +	else if (args->size < sizeof(threshold_in_us))
> +		return -EINVAL;

args->size == 0 here could be used to mean one value for all? That might
be a little clumsy compared to the getter, but easy for example for
disabling the watchdog on all engines.

if (args->size == 0) do_set_all()
if (args->size < sizeof())
	return -EFAULT;

> +
> +	mutex_unlock(&dev_priv->drm.struct_mutex);
> +	if (copy_from_user(&threshold_in_us,
> +			   u64_to_user_ptr(args->value),
> +			   sizeof(threshold_in_us))) {
> +		mutex_lock(&dev_priv->drm.struct_mutex);
> +		return -EFAULT;
> +	}
> +	mutex_lock(&dev_priv->drm.struct_mutex);
> +
> +	/* not supported in blitter engine */
> +	if (threshold_in_us[BCS] != 0)
> +		return -EINVAL;
> +
> +	for_each_engine(engine, dev_priv, id) {
> +		struct intel_context *ce = &ctx->engine[id];
> +
> +		ce->watchdog_threshold =
> +			watchdog_to_clock_counts((u64)threshold_in_us[id]);

Cast is superfluous.

> +	}
> +
> +	return 0;
> +}
> +
>  int i915_gem_context_init(struct drm_i915_private *dev_priv)
>  {
>  	struct i915_gem_context *ctx;
> @@ -1061,6 +1133,9 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
>  	case I915_CONTEXT_PARAM_BANNABLE:
>  		args->value = i915_gem_context_is_bannable(ctx);
>  		break;
> +	case I915_CONTEXT_PARAM_WATCHDOG:
> +		ret = i915_gem_context_get_watchdog(ctx, args);
> +		break;
>  	default:
>  		ret = -EINVAL;
>  		break;
> @@ -1118,6 +1193,9 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
>  		else
>  			i915_gem_context_clear_bannable(ctx);
>  		break;
> +	case I915_CONTEXT_PARAM_WATCHDOG:
> +		ret = i915_gem_context_set_watchdog(ctx, args);
> +		break;
>  	default:
>  		ret = -EINVAL;
>  		break;
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
> index 88700bdbb4e1..6867b1fead8b 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.h
> +++ b/drivers/gpu/drm/i915/i915_gem_context.h
> @@ -250,6 +250,26 @@ static inline bool i915_gem_context_is_kernel(struct i915_gem_context *ctx)
>  	return !ctx->file_priv;
>  }
>  
> +/*
> + * Timestamp timer resolution = 0.080 uSec,
> + * or 12500000 counts per second, or ~12 counts per microsecond.
> + */
> +#define TIMESTAMP_CNTS_PER_USEC 12
> +static inline u32 watchdog_to_us(u32 value_in_clock_counts)
> +{
> +	return (value_in_clock_counts) / (TIMESTAMP_CNTS_PER_USEC);
> +}

(brackets) (for fun) (?)

> +
> +static inline u32 watchdog_to_clock_counts(u64 value_in_us)
> +{
> +	u64 threshold = (value_in_us) * (TIMESTAMP_CNTS_PER_USEC);
> +
> +	if (GEM_WARN_ON(overflows_type(threshold, u32)))

Nice idea, and yes we should do the range check. But this is a userspace
interface, so always do it and don't warn, just -EINVAL.
-Chris
On 25/03/17 02:38, Chris Wilson wrote:
> On Fri, Mar 24, 2017 at 06:30:09PM -0700, Michel Thierry wrote:
>> Final enablement patch for GPU hang detection using watchdog timeout.
>> Using the gem_context_setparam ioctl, users can specify the desired
>> timeout value in microseconds, and the driver will do the conversion to
>> 'timestamps'.
>>
>> The recommended default watchdog threshold for video engines is 60000 us,
>> since this has been _empirically determined_ to be a good compromise for
>> low-latency requirements and low rate of false positives. The default
>> register value is ~106000us and the theoretical max value (all 1s) is
>> 353 seconds.
>>
>> v2: Fixed get api to return values in microseconds. Threshold updated to
>> be per context engine. Check for u32 overflow. Capture ctx threshold
>> value in error state.
>>
>> Signed-off-by: Tomas Elf <tomas.elf@intel.com>
>> Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com>
>> Signed-off-by: Michel Thierry <michel.thierry@intel.com>
>> ---
>>  drivers/gpu/drm/i915/i915_drv.h         |  1 +
>>  drivers/gpu/drm/i915/i915_gem_context.c | 78 +++++++++++++++++++++++++++++++++
>>  drivers/gpu/drm/i915/i915_gem_context.h | 20 +++++++++
>>  drivers/gpu/drm/i915/i915_gpu_error.c   | 11 +++--
>>  drivers/gpu/drm/i915/intel_lrc.c        |  2 +-
>>  include/uapi/drm/i915_drm.h             |  1 +
>>  6 files changed, 108 insertions(+), 5 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
>> index b43c37a911bb..1741584d858f 100644
>> --- a/drivers/gpu/drm/i915/i915_drv.h
>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>> @@ -1039,6 +1039,7 @@ struct i915_gpu_state {
>>  			int ban_score;
>>  			int active;
>>  			int guilty;
>> +			int watchdog_threshold;
>
> Shouldn't this be added in the patch adding it to the context for real?
>

I wanted to wait until I could print it in the error_state using 
watchdog_to_us (which is not added until this patch).
I can also move all the i915_gpu_error.c changes to a new patch.

>>  		} context;
>>
>>  		struct drm_i915_error_object {
>> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
>> index edbed85a1c88..f5c126c0c681 100644
>> --- a/drivers/gpu/drm/i915/i915_gem_context.c
>> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
>> @@ -422,6 +422,78 @@ i915_gem_context_create_gvt(struct drm_device *dev)
>>  	return ctx;
>>  }
>>
>> +/* Return the timer count threshold in microseconds. */
>> +int i915_gem_context_get_watchdog(struct i915_gem_context *ctx,
>> +				  struct drm_i915_gem_context_param *args)
>> +{
>> +	struct drm_i915_private *dev_priv = ctx->i915;
>> +	struct intel_engine_cs *engine;
>> +	enum intel_engine_id id;
>> +	u32 threshold_in_us[I915_NUM_ENGINES];
>
> Reading is a 2 step process. First CONTEXT_GET_PARAM passes in size==0
> and gets told by the kernel what size array to allocate. Second pass
> supplies a buffer with that size. (Or userspace can preallocate a large
> enough buffer, declare it's full size and let the kernel fill as much as
> it wants.)
>
> if (args->size == 0)
> 	goto out;
>
> if (args->size < sizeof(threshold_in_us))
> 	return -EFAULT;
EFAULT or EINVAL?

>> +
>> +	if (!dev_priv->engine[VCS]->emit_start_watchdog)
>> +		return -ENODEV;
>> +
>> +	for_each_engine(engine, dev_priv, id) {
>> +		struct intel_context *ce = &ctx->engine[id];
>> +
>> +		threshold_in_us[id] = watchdog_to_us(ce->watchdog_threshold);
>> +	}
>> +
>> +	mutex_unlock(&dev_priv->drm.struct_mutex);
>
> Grr. We should look at why we have this lock here in the first place.
> IIRC, it was there to make TRTT easier, but we can always move the
> burden of work again.
>

It helps TRTT (that code also takes an extra reference of the ctx)... 
but it all started in i915_gem_context_lookup (499f2697da1d)

>> +	if (__copy_to_user(u64_to_user_ptr(args->value),
>> +			   &threshold_in_us,
>> +			   sizeof(threshold_in_us))) {
>> +		mutex_lock(&dev_priv->drm.struct_mutex);
>> +		return -EFAULT;
>> +	}
>> +	mutex_lock(&dev_priv->drm.struct_mutex);
>
> out:
>> +	args->size = sizeof(threshold_in_us);
>> +
>> +	return 0;
>> +}
>> +
>> +/*
>> + * Based on time out value in microseconds (us) calculate
>> + * timer count thresholds needed based on core frequency.
>> + * Watchdog can be disabled by setting it to 0.
>> + */
>> +int i915_gem_context_set_watchdog(struct i915_gem_context *ctx,
>> +				  struct drm_i915_gem_context_param *args)
>> +{
>> +	struct drm_i915_private *dev_priv = ctx->i915;
>> +	struct intel_engine_cs *engine;
>> +	enum intel_engine_id id;
>> +	u32 threshold_in_us[I915_NUM_ENGINES];
>> +
>> +	if (!dev_priv->engine[VCS]->emit_start_watchdog)
>> +		return -ENODEV;
>> +	else if (args->size < sizeof(threshold_in_us))
>> +		return -EINVAL;
>
> args->size == 0 here could be used to mean one value for all? That might
> be a little clumsy compared to the getter, but easy for example for
> disabling the watchdog on all engines.
>
> if (args->size == 0) do_set_all()
> if (args->size < sizeof())
> 	return -EFAULT;
>
>> +
>> +	mutex_unlock(&dev_priv->drm.struct_mutex);
>> +	if (copy_from_user(&threshold_in_us,
>> +			   u64_to_user_ptr(args->value),
>> +			   sizeof(threshold_in_us))) {
>> +		mutex_lock(&dev_priv->drm.struct_mutex);
>> +		return -EFAULT;
>> +	}
>> +	mutex_lock(&dev_priv->drm.struct_mutex);
>> +
>> +	/* not supported in blitter engine */
>> +	if (threshold_in_us[BCS] != 0)
>> +		return -EINVAL;
>> +
>> +	for_each_engine(engine, dev_priv, id) {
>> +		struct intel_context *ce = &ctx->engine[id];
>> +
>> +		ce->watchdog_threshold =
>> +			watchdog_to_clock_counts((u64)threshold_in_us[id]);
>
> Cast is superfluous.
>

without it, the operation in watchdog_to_clock_counts was always cast 
to u32.

>> +	}
>> +
>> +	return 0;
>> +}
>> +
>>  int i915_gem_context_init(struct drm_i915_private *dev_priv)
>>  {
>>  	struct i915_gem_context *ctx;
>> @@ -1061,6 +1133,9 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
>>  	case I915_CONTEXT_PARAM_BANNABLE:
>>  		args->value = i915_gem_context_is_bannable(ctx);
>>  		break;
>> +	case I915_CONTEXT_PARAM_WATCHDOG:
>> +		ret = i915_gem_context_get_watchdog(ctx, args);
>> +		break;
>>  	default:
>>  		ret = -EINVAL;
>>  		break;
>> @@ -1118,6 +1193,9 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
>>  		else
>>  			i915_gem_context_clear_bannable(ctx);
>>  		break;
>> +	case I915_CONTEXT_PARAM_WATCHDOG:
>> +		ret = i915_gem_context_set_watchdog(ctx, args);
>> +		break;
>>  	default:
>>  		ret = -EINVAL;
>>  		break;
>> diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
>> index 88700bdbb4e1..6867b1fead8b 100644
>> --- a/drivers/gpu/drm/i915/i915_gem_context.h
>> +++ b/drivers/gpu/drm/i915/i915_gem_context.h
>> @@ -250,6 +250,26 @@ static inline bool i915_gem_context_is_kernel(struct i915_gem_context *ctx)
>>  	return !ctx->file_priv;
>>  }
>>
>> +/*
>> + * Timestamp timer resolution = 0.080 uSec,
>> + * or 12500000 counts per second, or ~12 counts per microsecond.
>> + */
>> +#define TIMESTAMP_CNTS_PER_USEC 12
>> +static inline u32 watchdog_to_us(u32 value_in_clock_counts)
>> +{
>> +	return (value_in_clock_counts) / (TIMESTAMP_CNTS_PER_USEC);
>> +}
>
> (brackets) (for fun) (?)
>
>> +
>> +static inline u32 watchdog_to_clock_counts(u64 value_in_us)
>> +{
>> +	u64 threshold = (value_in_us) * (TIMESTAMP_CNTS_PER_USEC);
>> +
>> +	if (GEM_WARN_ON(overflows_type(threshold, u32)))
>
> Nice idea, and yes we should do the range check. But this is a userspace
> interface, so always do it and don't warn, just -EINVAL.

What if it fails/overflows after some engines' thresholds have been set, 
should it set them back to 0's or leave them enabled?

All other fixes added.

Thanks
On Mon, Mar 27, 2017 at 06:03:37PM -0700, Michel Thierry wrote:
> On 25/03/17 02:38, Chris Wilson wrote:
> >On Fri, Mar 24, 2017 at 06:30:09PM -0700, Michel Thierry wrote:
> >>diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> >>index b43c37a911bb..1741584d858f 100644
> >>--- a/drivers/gpu/drm/i915/i915_drv.h
> >>+++ b/drivers/gpu/drm/i915/i915_drv.h
> >>@@ -1039,6 +1039,7 @@ struct i915_gpu_state {
> >> 			int ban_score;
> >> 			int active;
> >> 			int guilty;
> >>+			int watchdog_threshold;
> >
> >Shouldn't this be added in the patch adding it to the context for real?
> >
> 
> I wanted to wait until I could print it in the error_state using
> watchdog_to_us (which is not added until this patch).
> I can also move all the i915_gpu_error.c changes to a new patch.

Probably the best compromise.

> >>+/* Return the timer count threshold in microseconds. */
> >>+int i915_gem_context_get_watchdog(struct i915_gem_context *ctx,
> >>+				  struct drm_i915_gem_context_param *args)
> >>+{
> >>+	struct drm_i915_private *dev_priv = ctx->i915;
> >>+	struct intel_engine_cs *engine;
> >>+	enum intel_engine_id id;
> >>+	u32 threshold_in_us[I915_NUM_ENGINES];
> >
> >Reading is a 2 step process. First CONTEXT_GET_PARAM passes in size==0
> >and gets told by the kernel what size array to allocate. Second pass
> >supplies a buffer with that size. (Or userspace can preallocate a large
> >enough buffer, declare it's full size and let the kernel fill as much as
> >it wants.)
> >
> >if (args->size == 0)
> >	goto out;
> >
> >if (args->size < sizeof(threshold_in_us))
> >	return -EFAULT;
> EFAULT or EINVAL?

EFAULT. The user buffer is not an absolute fixed requirement, so too
small a buffer is an overrun -> fault.
 
> >>+
> >>+	if (!dev_priv->engine[VCS]->emit_start_watchdog)
> >>+		return -ENODEV;
> >>+
> >>+	for_each_engine(engine, dev_priv, id) {
> >>+		struct intel_context *ce = &ctx->engine[id];
> >>+
> >>+		threshold_in_us[id] = watchdog_to_us(ce->watchdog_threshold);
> >>+	}
> >>+
> >>+	mutex_unlock(&dev_priv->drm.struct_mutex);
> >
> >Grr. We should look at why we have this lock here in the first place.
> >IIRC, it was there to make TRTT easier, but we can always move the
> >burden of work again.
> >
> 
> It helps TRTT (that code also takes an extra reference of the
> ctx)... but it all started in i915_gem_context_lookup (499f2697da1d)

It is just laziness that we kept it around the switch, and part of the
reason for that laziness to remain was trtt. Note, we don't need the
mutex around the context lookup (or unref) with a bit of jiggery-pokery.

> >>+	mutex_unlock(&dev_priv->drm.struct_mutex);
> >>+	if (copy_from_user(&threshold_in_us,
> >>+			   u64_to_user_ptr(args->value),
> >>+			   sizeof(threshold_in_us))) {
> >>+		mutex_lock(&dev_priv->drm.struct_mutex);
> >>+		return -EFAULT;
> >>+	}
> >>+	mutex_lock(&dev_priv->drm.struct_mutex);
> >>+
> >>+	/* not supported in blitter engine */
> >>+	if (threshold_in_us[BCS] != 0)
> >>+		return -EINVAL;
> >>+
> >>+	for_each_engine(engine, dev_priv, id) {
> >>+		struct intel_context *ce = &ctx->engine[id];
> >>+
> >>+		ce->watchdog_threshold =
> >>+			watchdog_to_clock_counts((u64)threshold_in_us[id]);
> >
> >Cast is superfluous.
> >
> 
> without it, the operation in watchdog_to_clock_counts was always
> cast to u32.

static inline u32 watchdog_to_clock_counts(u64 value_in_us) ? 

> >>+static inline u32 watchdog_to_clock_counts(u64 value_in_us)
> >>+{
> >>+	u64 threshold = (value_in_us) * (TIMESTAMP_CNTS_PER_USEC);
> >>+
> >>+	if (GEM_WARN_ON(overflows_type(threshold, u32)))
> >
> >Nice idea, and yes we should do the range check. But this is a userspace
> >interface, so always do it and don't warn, just -EINVAL.
> 
> What if it fails/overflows after some engines' thresholds have been
> set, should it set them back to 0's or leave them enabled?

Yes. Validate userspace first, then apply, so the set of changes is
atomic and the ret is either success or EINVAL.
-Chris
On Tue, Mar 28, 2017 at 09:31:16AM +0100, Chris Wilson wrote:
> On Mon, Mar 27, 2017 at 06:03:37PM -0700, Michel Thierry wrote:
> > What if it fails/overflows after some engines' thresholds have been
> > set, should it set them back to 0's or leave them enabled?
> 
> Yes. Validate userspace first, then apply, so the set of changes is
> atomic and the ret is either success or EINVAL.

Oh, it is much worse than that. We can't use engine->id as the UABI
index. We need to fix the exec_id indexing first and rename it to uabi_id.
-Chris
On 24/03/17 18:30, Michel Thierry wrote:
> Final enablement patch for GPU hang detection using watchdog timeout.
> Using the gem_context_setparam ioctl, users can specify the desired
> timeout value in microseconds, and the driver will do the conversion to
> 'timestamps'.
>
> The recommended default watchdog threshold for video engines is 60000 us,
> since this has been _empirically determined_ to be a good compromise for
> low-latency requirements and low rate of false positives. The default
> register value is ~106000us and the theoretical max value (all 1s) is
> 353 seconds.
>
> v2: Fixed get api to return values in microseconds. Threshold updated to
> be per context engine. Check for u32 overflow. Capture ctx threshold
> value in error state.
>
> Signed-off-by: Tomas Elf <tomas.elf@intel.com>
> Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com>
> Signed-off-by: Michel Thierry <michel.thierry@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.h         |  1 +
>  drivers/gpu/drm/i915/i915_gem_context.c | 78 +++++++++++++++++++++++++++++++++
>  drivers/gpu/drm/i915/i915_gem_context.h | 20 +++++++++
>  drivers/gpu/drm/i915/i915_gpu_error.c   | 11 +++--
>  drivers/gpu/drm/i915/intel_lrc.c        |  2 +-
>  include/uapi/drm/i915_drm.h             |  1 +
>  6 files changed, 108 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index b43c37a911bb..1741584d858f 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1039,6 +1039,7 @@ struct i915_gpu_state {
>  			int ban_score;
>  			int active;
>  			int guilty;
> +			int watchdog_threshold;
>  		} context;
>
>  		struct drm_i915_error_object {
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index edbed85a1c88..f5c126c0c681 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -422,6 +422,78 @@ i915_gem_context_create_gvt(struct drm_device *dev)
>  	return ctx;
>  }
>
> +/* Return the timer count threshold in microseconds. */
> +int i915_gem_context_get_watchdog(struct i915_gem_context *ctx,
> +				  struct drm_i915_gem_context_param *args)
> +{
> +	struct drm_i915_private *dev_priv = ctx->i915;
> +	struct intel_engine_cs *engine;
> +	enum intel_engine_id id;
> +	u32 threshold_in_us[I915_NUM_ENGINES];
> +
> +	if (!dev_priv->engine[VCS]->emit_start_watchdog)
> +		return -ENODEV;
> +
> +	for_each_engine(engine, dev_priv, id) {
> +		struct intel_context *ce = &ctx->engine[id];
> +
> +		threshold_in_us[id] = watchdog_to_us(ce->watchdog_threshold);
> +	}
> +
> +	mutex_unlock(&dev_priv->drm.struct_mutex);
> +	if (__copy_to_user(u64_to_user_ptr(args->value),
> +			   &threshold_in_us,
> +			   sizeof(threshold_in_us))) {
> +		mutex_lock(&dev_priv->drm.struct_mutex);
> +		return -EFAULT;
> +	}
> +	mutex_lock(&dev_priv->drm.struct_mutex);
> +	args->size = sizeof(threshold_in_us);
> +
> +	return 0;
> +}
> +
> +/*
> + * Based on time out value in microseconds (us) calculate
> + * timer count thresholds needed based on core frequency.
> + * Watchdog can be disabled by setting it to 0.
> + */
> +int i915_gem_context_set_watchdog(struct i915_gem_context *ctx,
> +				  struct drm_i915_gem_context_param *args)
> +{
> +	struct drm_i915_private *dev_priv = ctx->i915;
> +	struct intel_engine_cs *engine;
> +	enum intel_engine_id id;
> +	u32 threshold_in_us[I915_NUM_ENGINES];
> +
> +	if (!dev_priv->engine[VCS]->emit_start_watchdog)
> +		return -ENODEV;
> +	else if (args->size < sizeof(threshold_in_us))
> +		return -EINVAL;

won't we break userspace with this check if we ever get more engines on 
a new platform and thus bump I915_NUM_ENGINES?

Thanks,
Daniele

> +
> +	mutex_unlock(&dev_priv->drm.struct_mutex);
> +	if (copy_from_user(&threshold_in_us,
> +			   u64_to_user_ptr(args->value),
> +			   sizeof(threshold_in_us))) {
> +		mutex_lock(&dev_priv->drm.struct_mutex);
> +		return -EFAULT;
> +	}
> +	mutex_lock(&dev_priv->drm.struct_mutex);
> +
> +	/* not supported in blitter engine */
> +	if (threshold_in_us[BCS] != 0)
> +		return -EINVAL;
> +
> +	for_each_engine(engine, dev_priv, id) {
> +		struct intel_context *ce = &ctx->engine[id];
> +
> +		ce->watchdog_threshold =
> +			watchdog_to_clock_counts((u64)threshold_in_us[id]);
> +	}
> +
> +	return 0;
> +}
> +
>  int i915_gem_context_init(struct drm_i915_private *dev_priv)
>  {
>  	struct i915_gem_context *ctx;
> @@ -1061,6 +1133,9 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
>  	case I915_CONTEXT_PARAM_BANNABLE:
>  		args->value = i915_gem_context_is_bannable(ctx);
>  		break;
> +	case I915_CONTEXT_PARAM_WATCHDOG:
> +		ret = i915_gem_context_get_watchdog(ctx, args);
> +		break;
>  	default:
>  		ret = -EINVAL;
>  		break;
> @@ -1118,6 +1193,9 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
>  		else
>  			i915_gem_context_clear_bannable(ctx);
>  		break;
> +	case I915_CONTEXT_PARAM_WATCHDOG:
> +		ret = i915_gem_context_set_watchdog(ctx, args);
> +		break;
>  	default:
>  		ret = -EINVAL;
>  		break;
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
> index 88700bdbb4e1..6867b1fead8b 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.h
> +++ b/drivers/gpu/drm/i915/i915_gem_context.h
> @@ -250,6 +250,26 @@ static inline bool i915_gem_context_is_kernel(struct i915_gem_context *ctx)
>  	return !ctx->file_priv;
>  }
>
> +/*
> + * Timestamp timer resolution = 0.080 uSec,
> + * or 12500000 counts per second, or ~12 counts per microsecond.
> + */
> +#define TIMESTAMP_CNTS_PER_USEC 12
> +static inline u32 watchdog_to_us(u32 value_in_clock_counts)
> +{
> +	return (value_in_clock_counts) / (TIMESTAMP_CNTS_PER_USEC);
> +}
> +
> +static inline u32 watchdog_to_clock_counts(u64 value_in_us)
> +{
> +	u64 threshold = (value_in_us) * (TIMESTAMP_CNTS_PER_USEC);
> +
> +	if (GEM_WARN_ON(overflows_type(threshold, u32)))
> +		return 0;
> +
> +	return threshold;
> +}
> +
>  /* i915_gem_context.c */
>  int __must_check i915_gem_context_init(struct drm_i915_private *dev_priv);
>  void i915_gem_context_lost(struct drm_i915_private *dev_priv);
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 5d015bcc7484..8acb83778030 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -388,9 +388,10 @@ static void error_print_context(struct drm_i915_error_state_buf *m,
>  				const char *header,
>  				const struct drm_i915_error_context *ctx)
>  {
> -	err_printf(m, "%s%s[%d] user_handle %d hw_id %d, ban score %d guilty %d active %d\n",
> +	err_printf(m, "%s%s[%d] user_handle %d hw_id %d, ban score %d guilty %d active %d, watchdog %dus\n",
>  		   header, ctx->comm, ctx->pid, ctx->handle, ctx->hw_id,
> -		   ctx->ban_score, ctx->guilty, ctx->active);
> +		   ctx->ban_score, ctx->guilty, ctx->active,
> +		   watchdog_to_us(ctx->watchdog_threshold));
>  }
>
>  static void error_print_engine(struct drm_i915_error_state_buf *m,
> @@ -1336,7 +1337,8 @@ static void error_record_engine_execlists(struct intel_engine_cs *engine,
>  }
>
>  static void record_context(struct drm_i915_error_context *e,
> -			   struct i915_gem_context *ctx)
> +			   struct i915_gem_context *ctx,
> +			   u32 engine_id)
>  {
>  	if (ctx->pid) {
>  		struct task_struct *task;
> @@ -1355,6 +1357,7 @@ static void record_context(struct drm_i915_error_context *e,
>  	e->ban_score = ctx->ban_score;
>  	e->guilty = ctx->guilty_count;
>  	e->active = ctx->active_count;
> +	e->watchdog_threshold =	ctx->engine[engine_id].watchdog_threshold;
>  }
>
>  static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
> @@ -1389,7 +1392,7 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
>  			ee->vm = request->ctx->ppgtt ?
>  				&request->ctx->ppgtt->base : &ggtt->base;
>
> -			record_context(&ee->context, request->ctx);
> +			record_context(&ee->context, request->ctx, engine->id);
>
>  			/* We need to copy these to an anonymous buffer
>  			 * as the simplest method to avoid being overwritten
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 2736f642dc76..3f2b57a22338 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -1496,7 +1496,7 @@ static int gen8_emit_flush_render(struct drm_i915_gem_request *request,
>  	return 0;
>  }
>
> -#define GEN8_WATCHDOG_1000US 0x2ee0 //XXX: Temp, replace with helper function
> +#define GEN8_WATCHDOG_1000US watchdog_to_clock_counts(1000)
>  static void gen8_watchdog_irq_handler(unsigned long data)
>  {
>  	struct intel_engine_cs *engine = (struct intel_engine_cs *)data;
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index f083931a7809..448c9c0faa69 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -1293,6 +1293,7 @@ struct drm_i915_gem_context_param {
>  #define I915_CONTEXT_PARAM_GTT_SIZE	0x3
>  #define I915_CONTEXT_PARAM_NO_ERROR_CAPTURE	0x4
>  #define I915_CONTEXT_PARAM_BANNABLE	0x5
> +#define I915_CONTEXT_PARAM_WATCHDOG	0x6
>  	__u64 value;
>  };
>
>
On Fri, Apr 14, 2017 at 09:05:06AM -0700, Daniele Ceraolo Spurio wrote:
> On 24/03/17 18:30, Michel Thierry wrote:
> >+/*
> >+ * Based on time out value in microseconds (us) calculate
> >+ * timer count thresholds needed based on core frequency.
> >+ * Watchdog can be disabled by setting it to 0.
> >+ */
> >+int i915_gem_context_set_watchdog(struct i915_gem_context *ctx,
> >+				  struct drm_i915_gem_context_param *args)
> >+{
> >+	struct drm_i915_private *dev_priv = ctx->i915;
> >+	struct intel_engine_cs *engine;
> >+	enum intel_engine_id id;
> >+	u32 threshold_in_us[I915_NUM_ENGINES];
> >+
> >+	if (!dev_priv->engine[VCS]->emit_start_watchdog)
> >+		return -ENODEV;
> >+	else if (args->size < sizeof(threshold_in_us))
> >+		return -EINVAL;
> 
> won't we break userspace with this check if we ever get more engines
> on a new platform and thus bump I915_NUM_ENGINES?

Not really. Userspace uses a 2 step process to first determine the array
length it needs to pass to the kernel. It will just fill those rings it
doesn't know about with 0.

The alternative to using a fixed length array is using an array of
(engine-id, threshold) pairs. Which is probably going to be more
convenient to userspace.
-Chris
On 14/04/17 09:05, Daniele Ceraolo Spurio wrote:
>
>
> On 24/03/17 18:30, Michel Thierry wrote:
>> Final enablement patch for GPU hang detection using watchdog timeout.
>> Using the gem_context_setparam ioctl, users can specify the desired
>> timeout value in microseconds, and the driver will do the conversion to
>> 'timestamps'.
>>
>> The recommended default watchdog threshold for video engines is 60000 us,
>> since this has been _empirically determined_ to be a good compromise for
>> low-latency requirements and low rate of false positives. The default
>> register value is ~106000us and the theoretical max value (all 1s) is
>> 353 seconds.
>>
>> v2: Fixed get api to return values in microseconds. Threshold updated to
>> be per context engine. Check for u32 overflow. Capture ctx threshold
>> value in error state.
>>
>> Signed-off-by: Tomas Elf <tomas.elf@intel.com>
>> Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com>
>> Signed-off-by: Michel Thierry <michel.thierry@intel.com>
>> ---
>>  drivers/gpu/drm/i915/i915_drv.h         |  1 +
>>  drivers/gpu/drm/i915/i915_gem_context.c | 78
>> +++++++++++++++++++++++++++++++++
>>  drivers/gpu/drm/i915/i915_gem_context.h | 20 +++++++++
>>  drivers/gpu/drm/i915/i915_gpu_error.c   | 11 +++--
>>  drivers/gpu/drm/i915/intel_lrc.c        |  2 +-
>>  include/uapi/drm/i915_drm.h             |  1 +
>>  6 files changed, 108 insertions(+), 5 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_drv.h
>> b/drivers/gpu/drm/i915/i915_drv.h
>> index b43c37a911bb..1741584d858f 100644
>> --- a/drivers/gpu/drm/i915/i915_drv.h
>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>> @@ -1039,6 +1039,7 @@ struct i915_gpu_state {
>>              int ban_score;
>>              int active;
>>              int guilty;
>> +            int watchdog_threshold;
>>          } context;
>>
>>          struct drm_i915_error_object {
>> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c
>> b/drivers/gpu/drm/i915/i915_gem_context.c
>> index edbed85a1c88..f5c126c0c681 100644
>> --- a/drivers/gpu/drm/i915/i915_gem_context.c
>> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
>> @@ -422,6 +422,78 @@ i915_gem_context_create_gvt(struct drm_device *dev)
>>      return ctx;
>>  }
>>
>> +/* Return the timer count threshold in microseconds. */
>> +int i915_gem_context_get_watchdog(struct i915_gem_context *ctx,
>> +                  struct drm_i915_gem_context_param *args)
>> +{
>> +    struct drm_i915_private *dev_priv = ctx->i915;
>> +    struct intel_engine_cs *engine;
>> +    enum intel_engine_id id;
>> +    u32 threshold_in_us[I915_NUM_ENGINES];
>> +
>> +    if (!dev_priv->engine[VCS]->emit_start_watchdog)
>> +        return -ENODEV;
>> +
>> +    for_each_engine(engine, dev_priv, id) {
>> +        struct intel_context *ce = &ctx->engine[id];
>> +
>> +        threshold_in_us[id] = watchdog_to_us(ce->watchdog_threshold);
>> +    }
>> +
>> +    mutex_unlock(&dev_priv->drm.struct_mutex);
>> +    if (__copy_to_user(u64_to_user_ptr(args->value),
>> +               &threshold_in_us,
>> +               sizeof(threshold_in_us))) {
>> +        mutex_lock(&dev_priv->drm.struct_mutex);
>> +        return -EFAULT;
>> +    }
>> +    mutex_lock(&dev_priv->drm.struct_mutex);
>> +    args->size = sizeof(threshold_in_us);
>> +
>> +    return 0;
>> +}
>> +
>> +/*
>> + * Based on time out value in microseconds (us) calculate
>> + * timer count thresholds needed based on core frequency.
>> + * Watchdog can be disabled by setting it to 0.
>> + */
>> +int i915_gem_context_set_watchdog(struct i915_gem_context *ctx,
>> +                  struct drm_i915_gem_context_param *args)
>> +{
>> +    struct drm_i915_private *dev_priv = ctx->i915;
>> +    struct intel_engine_cs *engine;
>> +    enum intel_engine_id id;
>> +    u32 threshold_in_us[I915_NUM_ENGINES];
>> +
>> +    if (!dev_priv->engine[VCS]->emit_start_watchdog)
>> +        return -ENODEV;
>> +    else if (args->size < sizeof(threshold_in_us))
>> +        return -EINVAL;
>
> won't we break userspace with this check if we ever get more engines on
> a new platform and thus bump I915_NUM_ENGINES?
>
> Thanks,
> Daniele
>

There's a v3 of this patch with Chris feedback,
https://patchwork.freedesktop.org/patch/148805/