[RFC,3/3] drm/i915: Watchdog timeout: DRM kernel interface to set the timeout

Submitted by Michel Thierry on Feb. 23, 2017, 7:44 p.m.

Details

Message ID 20170223194421.28463-3-michel.thierry@intel.com
State New
Headers show
Series "Series without cover letter" ( rev: 1 ) in Intel GFX

Not browsing as part of any series.

Commit Message

Michel Thierry Feb. 23, 2017, 7:44 p.m.
Final enablement patch for GPU hang detection using watchdog timeout.
Using the gem_context_setparam ioctl, users can specify the desired
timeout value in milliseconds, and the driver will do the conversion to
'timestamps'.

The _recommended_ default watchdog threshold for video engines is 60 ms,
since this has been _empirically determined_ to be a good compromise for
low-latency requirements and low rate of false positives. The default
register value is ~106ms and the theoretical max value (all 1s) is
353 seconds.

Signed-off-by: Tomas Elf <tomas.elf@intel.com>
Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com>
Signed-off-by: Michel Thierry <michel.thierry@intel.com>
---
 drivers/gpu/drm/i915/i915_gem_context.c    | 46 ++++++++++++++++++++++++++++++
 drivers/gpu/drm/i915/i915_gem_context.h    |  3 ++
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |  8 +-----
 include/uapi/drm/i915_drm.h                |  1 +
 4 files changed, 51 insertions(+), 7 deletions(-)

Patch hide | download patch | download mbox

diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 99c46f4dbde6..c3748878e64c 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -440,6 +440,32 @@  i915_gem_context_create_gvt(struct drm_device *dev)
 	return ctx;
 }
 
+void i915_context_watchdog_setup(struct i915_gem_context *ctx, u32 value_in_ms)
+{
+	/*
+	 * Based on time out value (ms) calculate
+	 * timer count thresholds needed based on core frequency.
+	 */
+#define TIMER_MILLISECOND 1000
+
+	/*
+	 * Timestamp timer resolution = 0.080 uSec,
+	 * or 12500000 counts per second
+	 */
+#define TIMESTAMP_CNTS_PER_SEC_80NS 12500000
+
+	ctx->watchdog_threshold =
+		((value_in_ms) *
+		 ((TIMESTAMP_CNTS_PER_SEC_80NS) / (TIMER_MILLISECOND)));
+
+	/*
+	 * watchdog register must never be programmed to zero. This would
+	 * cause the watchdog counter to exceed and not allow the engine to
+	 * go into IDLE state
+	 */
+	GEM_BUG_ON(ctx->watchdog_threshold == 0);
+}
+
 int i915_gem_context_init(struct drm_i915_private *dev_priv)
 {
 	struct i915_gem_context *ctx;
@@ -1056,6 +1082,7 @@  int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
 	struct drm_i915_file_private *file_priv = file->driver_priv;
 	struct drm_i915_gem_context_param *args = data;
 	struct i915_gem_context *ctx;
+	struct intel_engine_cs *engine;
 	int ret;
 
 	ret = i915_mutex_lock_interruptible(dev);
@@ -1090,6 +1117,15 @@  int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
 	case I915_CONTEXT_PARAM_BANNABLE:
 		args->value = i915_gem_context_is_bannable(ctx);
 		break;
+	case I915_CONTEXT_PARAM_WATCHDOG:
+		engine = to_i915(dev)->engine[VCS];
+		if (!engine->emit_start_watchdog)
+			ret = -EINVAL;
+		else if (args->value)
+			ret = -EINVAL;
+		else
+			args->value = ctx->watchdog_threshold;
+		break;
 	default:
 		ret = -EINVAL;
 		break;
@@ -1105,6 +1141,7 @@  int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
 	struct drm_i915_file_private *file_priv = file->driver_priv;
 	struct drm_i915_gem_context_param *args = data;
 	struct i915_gem_context *ctx;
+	struct intel_engine_cs *engine;
 	int ret;
 
 	ret = i915_mutex_lock_interruptible(dev);
@@ -1147,6 +1184,15 @@  int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
 		else
 			i915_gem_context_clear_bannable(ctx);
 		break;
+	case I915_CONTEXT_PARAM_WATCHDOG:
+		engine = to_i915(dev)->engine[VCS];
+		if (!engine->emit_start_watchdog)
+			ret = -EINVAL;
+		else if (!args->value)
+			ret = -EINVAL;
+		else
+			i915_context_watchdog_setup(ctx, args->value);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
index 0ac750b90f3d..133ed7b413aa 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.h
+++ b/drivers/gpu/drm/i915/i915_gem_context.h
@@ -176,6 +176,9 @@  struct i915_gem_context {
 	/** ban_score: Accumulated score of all hangs caused by this context. */
 	int ban_score;
 
+	/** watchdog_threshold: hw watchdog threshold value, in clock counts */
+	u32 watchdog_threshold;
+
 	/** remap_slice: Bitmask of cache lines that need remapping */
 	u8 remap_slice;
 };
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 348d81c40e81..26c50a6d6158 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1419,12 +1419,6 @@  execbuf_submit(struct i915_execbuffer_params *params,
 	bool watchdog_running = false;
 	int ret;
 
-	/*
-	 * NB: Place-holder until watchdog timeout is enabled through DRM
-	 * interface
-	 */
-	bool enable_watchdog = false;
-
 	ret = i915_gem_execbuffer_move_to_gpu(params->request, vmas);
 	if (ret)
 		return ret;
@@ -1488,7 +1482,7 @@  execbuf_submit(struct i915_execbuffer_params *params,
 	}
 
 	/* Start watchdog timer */
-	if (enable_watchdog) {
+	if (params->ctx->watchdog_threshold != 0) {
 		if (!params->engine->emit_start_watchdog)
 			return -EINVAL;
 
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 3554495bef13..e318c4f53a9e 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1289,6 +1289,7 @@  struct drm_i915_gem_context_param {
 #define I915_CONTEXT_PARAM_GTT_SIZE	0x3
 #define I915_CONTEXT_PARAM_NO_ERROR_CAPTURE	0x4
 #define I915_CONTEXT_PARAM_BANNABLE	0x5
+#define I915_CONTEXT_PARAM_WATCHDOG	0x6
 	__u64 value;
 };
 

Comments

On Thu, Feb 23, 2017 at 11:44:19AM -0800, Michel Thierry wrote:
> Final enablement patch for GPU hang detection using watchdog timeout.
> Using the gem_context_setparam ioctl, users can specify the desired
> timeout value in milliseconds, and the driver will do the conversion to
> 'timestamps'.
> 
> The _recommended_ default watchdog threshold for video engines is 60 ms,
> since this has been _empirically determined_ to be a good compromise for
> low-latency requirements and low rate of false positives. The default
> register value is ~106ms and the theoretical max value (all 1s) is
> 353 seconds.
> 
> Signed-off-by: Tomas Elf <tomas.elf@intel.com>
> Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com>
> Signed-off-by: Michel Thierry <michel.thierry@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_gem_context.c    | 46 ++++++++++++++++++++++++++++++
>  drivers/gpu/drm/i915/i915_gem_context.h    |  3 ++
>  drivers/gpu/drm/i915/i915_gem_execbuffer.c |  8 +-----
>  include/uapi/drm/i915_drm.h                |  1 +
>  4 files changed, 51 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index 99c46f4dbde6..c3748878e64c 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -440,6 +440,32 @@ i915_gem_context_create_gvt(struct drm_device *dev)
>  	return ctx;
>  }
>  
> +void i915_context_watchdog_setup(struct i915_gem_context *ctx, u32 value_in_ms)
Ahem.

> +{
> +	/*
> +	 * Based on time out value (ms) calculate
> +	 * timer count thresholds needed based on core frequency.
> +	 */
> +#define TIMER_MILLISECOND 1000
> +
> +	/*
> +	 * Timestamp timer resolution = 0.080 uSec,
> +	 * or 12500000 counts per second
> +	 */
> +#define TIMESTAMP_CNTS_PER_SEC_80NS 12500000
> +
> +	ctx->watchdog_threshold =
> +		((value_in_ms) *
> +		 ((TIMESTAMP_CNTS_PER_SEC_80NS) / (TIMER_MILLISECOND)));

Rescale to ns (u64 math), check for overflows before assigning to u32.

> +	/*
> +	 * watchdog register must never be programmed to zero. This would
> +	 * cause the watchdog counter to exceed and not allow the engine to
> +	 * go into IDLE state
> +	 */
> +	GEM_BUG_ON(ctx->watchdog_threshold == 0);

This throws a bug when the user tries to disable the watchdog
afterwards. [ctx->watchdog_threshold = 0 => disable watchdog around this
context, default].

> +}
> +
>  int i915_gem_context_init(struct drm_i915_private *dev_priv)
>  {
>  	struct i915_gem_context *ctx;
> @@ -1056,6 +1082,7 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
>  	struct drm_i915_file_private *file_priv = file->driver_priv;
>  	struct drm_i915_gem_context_param *args = data;
>  	struct i915_gem_context *ctx;
> +	struct intel_engine_cs *engine;
>  	int ret;
>  
>  	ret = i915_mutex_lock_interruptible(dev);
> @@ -1090,6 +1117,15 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
>  	case I915_CONTEXT_PARAM_BANNABLE:
>  		args->value = i915_gem_context_is_bannable(ctx);
>  		break;
> +	case I915_CONTEXT_PARAM_WATCHDOG:
> +		engine = to_i915(dev)->engine[VCS];
> +		if (!engine->emit_start_watchdog)
> +			ret = -EINVAL;
> +		else if (args->value)
> +			ret = -EINVAL;
> +		else
> +			args->value = ctx->watchdog_threshold;

Just 
	args->value = watchdog_to_ns(ctx->watchdog_threshold);
will be 0 where unset. On setting we complain if not supported.

Do we really want the user API to be in clock ticks rather than say ns?
I think we want ns, and don't forget to include that information in the
error state. Do we want to capture error state? Do we want this to
contribute to banning?

> +		break;
>  	default:
>  		ret = -EINVAL;
>  		break;
> @@ -1105,6 +1141,7 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
>  	struct drm_i915_file_private *file_priv = file->driver_priv;
>  	struct drm_i915_gem_context_param *args = data;
>  	struct i915_gem_context *ctx;
> +	struct intel_engine_cs *engine;
>  	int ret;
>  
>  	ret = i915_mutex_lock_interruptible(dev);
> @@ -1147,6 +1184,15 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
>  		else
>  			i915_gem_context_clear_bannable(ctx);
>  		break;
> +	case I915_CONTEXT_PARAM_WATCHDOG:
> +		engine = to_i915(dev)->engine[VCS];
> +		if (!engine->emit_start_watchdog)
> +			ret = -EINVAL;

ret = -ENODEV;

> +		else if (!args->value)
> +			ret = -EINVAL;
> +		else
> +			i915_context_watchdog_setup(ctx, args->value);

I'm pushing for ns. And this should be
i915_gem_context_set_watchdog(ctx, args->value);

> +		break;
>  	default:
>  		ret = -EINVAL;
>  		break;
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
> index 0ac750b90f3d..133ed7b413aa 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.h
> +++ b/drivers/gpu/drm/i915/i915_gem_context.h
> @@ -176,6 +176,9 @@ struct i915_gem_context {
>  	/** ban_score: Accumulated score of all hangs caused by this context. */
>  	int ban_score;
>  
> +	/** watchdog_threshold: hw watchdog threshold value, in clock counts */
> +	u32 watchdog_threshold;
> +
>  	/** remap_slice: Bitmask of cache lines that need remapping */
>  	u8 remap_slice;
>  };
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index 348d81c40e81..26c50a6d6158 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -1419,12 +1419,6 @@ execbuf_submit(struct i915_execbuffer_params *params,
>  	bool watchdog_running = false;
>  	int ret;
>  
> -	/*
> -	 * NB: Place-holder until watchdog timeout is enabled through DRM
> -	 * interface
> -	 */
> -	bool enable_watchdog = false;
> -
>  	ret = i915_gem_execbuffer_move_to_gpu(params->request, vmas);
>  	if (ret)
>  		return ret;
> @@ -1488,7 +1482,7 @@ execbuf_submit(struct i915_execbuffer_params *params,
>  	}
>  
>  	/* Start watchdog timer */
> -	if (enable_watchdog) {
> +	if (params->ctx->watchdog_threshold != 0) {
>  		if (!params->engine->emit_start_watchdog)
>  			return -EINVAL;

No. If we set a watchdog on a ctx, this then causes all use of BCS to
fail. We need saner API than that - either per-engine thresholds in the
ctx, or just document that the watchdog is only supported where
available by hw.
-Chris