[v5,18/18] drm/i915: Watchdog timeout: Export media reset count from GuC to debugfs

Submitted by Michel Thierry on March 25, 2017, 1:30 a.m.

Details

Message ID 20170325013010.36244-19-michel.thierry@intel.com
State New
Headers show
Series "Gen8+ engine-reset" ( rev: 1 ) in Intel GFX

Not browsing as part of any series.

Commit Message

Michel Thierry March 25, 2017, 1:30 a.m.
From firmware v8.8, GuC provides the count of media engine resets
(watchdog timeout). This information is available in the GuC shared
context data struct, which resides in the first page of the default
(kernel) lrc context.

Since GuC-handled engine resets are transparent to both the kernel and
userspace, provide a simple debugfs entry that shows how many times a
media reset has happened.

Signed-off-by: Michel Thierry <michel.thierry@intel.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c   | 28 ++++++++++++++++++++++++++++
 drivers/gpu/drm/i915/intel_guc_fwif.h | 18 ++++++++++++++++++
 2 files changed, 46 insertions(+)

Patch hide | download patch | download mbox

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 8db850541e03..f40a2c84b423 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1400,6 +1400,32 @@  static int i915_hangcheck_info(struct seq_file *m, void *unused)
 	return 0;
 }
 
+static u32 i915_watchdog_reset_count(struct drm_i915_private *dev_priv)
+{	/* Return media_reset_count read from the kernel context's GuC shared page. */
+	struct drm_device *dev = &dev_priv->drm;
+	struct i915_gem_context *ctx;
+	struct page *page;
+	struct guc_shared_ctx_data *guc_shared_data;
+	u32 guc_media_reset_count;
+
+	if (!i915.enable_guc_submission)
+		return 0; /* GuC submission disabled: report 0 */
+
+	if (mutex_lock_interruptible(&dev->struct_mutex))
+		return 0; /* lock not taken: 0 is reported, indistinguishable from "no resets" */
+
+	ctx = dev_priv->kernel_context; /* default (kernel) context */
+	page = i915_gem_object_get_dirty_page(ctx->engine[RCS].state->obj, /* NOTE(review): dirty-page getter, but the page is only read below */
+					      LRC_GUCSHR_PN); /* page index of the GuC shared data in the RCS context state */
+	guc_shared_data = kmap_atomic(page); /* NOTE(review): reviewer questioned whether an atomic mapping is needed */
+	guc_media_reset_count = guc_shared_data->media_reset_count; /* NOTE(review): plain load; reviewer asked for READ_ONCE() */
+	kunmap_atomic(guc_shared_data);
+
+	mutex_unlock(&dev->struct_mutex);
+
+	return guc_media_reset_count;
+}
+
 static int i915_reset_info(struct seq_file *m, void *unused)
 {
 	struct drm_i915_private *dev_priv = node_to_i915(m->private);
@@ -1408,6 +1434,8 @@  static int i915_reset_info(struct seq_file *m, void *unused)
 	enum intel_engine_id id;
 
 	seq_printf(m, "full gpu reset = %u\n", i915_reset_count(error));
+	seq_printf(m, "GuC watchdog/media reset = %u\n",
+		   i915_watchdog_reset_count(dev_priv));
 
 	for_each_engine(engine, dev_priv, id) {
 		seq_printf(m, "%s = %u\n", engine->name,
diff --git a/drivers/gpu/drm/i915/intel_guc_fwif.h b/drivers/gpu/drm/i915/intel_guc_fwif.h
index 5db3def5f74e..d9dc844fcce0 100644
--- a/drivers/gpu/drm/i915/intel_guc_fwif.h
+++ b/drivers/gpu/drm/i915/intel_guc_fwif.h
@@ -503,6 +503,24 @@  union guc_log_control {
 	u32 value;
 } __packed;
 
+/* GuC Shared Context Data Struct: overlay for the LRC_GUCSHR_PN page of an LRC context state object */
+struct guc_shared_ctx_data {
+	u32 addr_of_last_preempted_data_low;
+	u32 addr_of_last_preempted_data_high;
+	u32 addr_of_last_preempted_data_high_tmp;
+	u32 padding;
+	u32 is_mapped_to_proxy;
+	u32 proxy_ctx_id;
+	u32 engine_reset_ctx_id;
+	u32 media_reset_count; /* media engine (watchdog timeout) resets; exported through debugfs */
+	u32 reserved[8];
+	u32 uk_last_ctx_switch_reason;
+	u32 was_reset;
+	u32 lrca_gpu_addr;
+	u32 execlist_ctx;
+	u32 reserved1[32];
+} __packed; /* packed: no compiler-inserted padding between fields */
+
 /* This Action will be programmed in C180 - SOFT_SCRATCH_O_REG */
 enum intel_guc_action {
 	INTEL_GUC_ACTION_DEFAULT = 0x0,

Comments

On Fri, Mar 24, 2017 at 06:30:10PM -0700, Michel Thierry wrote:
> From firmware v8.8, GuC provides the count of media engine resets
> (watchdog timeout). This information is available in the GuC shared
> context data struct, which resides in the first page of the default
> (kernel) lrc context.
> 
> Since GuC handled engine resets are transparent for kernel and user,
> provide a simple debugfs entry to see the number of times media reset
> has happened.
> 
> Signed-off-by: Michel Thierry <michel.thierry@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_debugfs.c   | 28 ++++++++++++++++++++++++++++
>  drivers/gpu/drm/i915/intel_guc_fwif.h | 18 ++++++++++++++++++
>  2 files changed, 46 insertions(+)
> 
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 8db850541e03..f40a2c84b423 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -1400,6 +1400,32 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
>  	return 0;
>  }
>  
> +static u32 i915_watchdog_reset_count(struct drm_i915_private *dev_priv)
> +{
> +	struct drm_device *dev = &dev_priv->drm;
> +	struct i915_gem_context *ctx;
> +	struct page *page;
> +	struct guc_shared_ctx_data *guc_shared_data;
> +	u32 guc_media_reset_count;
> +
> +	if (!i915.enable_guc_submission)
> +		return 0;
> +
> +	if (mutex_lock_interruptible(&dev->struct_mutex))
> +		return 0;

Do you need the mutex?

> +	ctx = dev_priv->kernel_context;
> +	page = i915_gem_object_get_dirty_page(ctx->engine[RCS].state->obj,
> +					      LRC_GUCSHR_PN);

Are you writing?

> +	guc_shared_data = kmap_atomic(page);

Atomic?

> +	guc_media_reset_count = guc_shared_data->media_reset_count;

This is an unserialised access, mark it so (READ_ONCE).
-Chris