[1/3] drm/i915: Skip the ERR_PTR error state

Submitted by Chris Wilson on Dec. 6, 2018, 8:44 a.m.

Details

Message ID 20181206084431.9805-1-chris@chris-wilson.co.uk
State New
Series "Series without cover letter"
Headers show

Commit Message

Chris Wilson Dec. 6, 2018, 8:44 a.m.
Although commit fb6f0b64e455 ("drm/i915: Prevent machine hang from
Broxton's vtd w/a and error capture") applied cleanly after a 24 month
hiatus, the code had moved on with new methods for peeking and fetching
the captured gpu info. Make sure we catch all uses of the stashed error
state and avoid dereferencing the error pointer.

Fixes: fb6f0b64e455 ("drm/i915: Prevent machine hang from Broxton's vtd w/a and error capture")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c   | 11 ++++++++++-
 drivers/gpu/drm/i915/i915_gpu_error.c | 12 +++++++++---
 drivers/gpu/drm/i915/i915_sysfs.c     |  3 +++
 3 files changed, 22 insertions(+), 4 deletions(-)

Patch hide | download patch | download mbox

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 38dcee1ca062..f8aa8586c9a6 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -984,6 +984,9 @@  static int i915_gpu_info_open(struct inode *inode, struct file *file)
 	intel_runtime_pm_get(i915);
 	gpu = i915_capture_gpu_state(i915);
 	intel_runtime_pm_put(i915);
+
+	if (IS_ERR(gpu))
+		return PTR_ERR(gpu);
 	if (!gpu)
 		return -ENOMEM;
 
@@ -1018,7 +1021,13 @@  i915_error_state_write(struct file *filp,
 
 static int i915_error_state_open(struct inode *inode, struct file *file)
 {
-	file->private_data = i915_first_error_state(inode->i_private);
+	struct i915_gpu_state *error;
+
+	error = i915_first_error_state(inode->i_private);
+	if (IS_ERR(error))
+		return PTR_ERR(error);
+
+	file->private_data  = error;
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 07465123c166..7ca6f3f31d41 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -1907,6 +1907,11 @@  i915_capture_gpu_state(struct drm_i915_private *i915)
 {
 	struct i915_gpu_state *error;
 
+	/* Check if GPU capture has been disabled */
+	error = READ_ONCE(i915->gpu_error.first_error);
+	if (IS_ERR(error))
+		return error;
+
 	error = kzalloc(sizeof(*error), GFP_ATOMIC);
 	if (!error)
 		return NULL;
@@ -1987,7 +1992,7 @@  i915_first_error_state(struct drm_i915_private *i915)
 
 	spin_lock_irq(&i915->gpu_error.lock);
 	error = i915->gpu_error.first_error;
-	if (error)
+	if (!IS_ERR_OR_NULL(error))
 		i915_gpu_state_get(error);
 	spin_unlock_irq(&i915->gpu_error.lock);
 
@@ -2000,10 +2005,11 @@  void i915_reset_error_state(struct drm_i915_private *i915)
 
 	spin_lock_irq(&i915->gpu_error.lock);
 	error = i915->gpu_error.first_error;
-	i915->gpu_error.first_error = NULL;
+	if (!IS_ERR_OR_NULL(error)) /* once disabled, always disabled */
+		i915->gpu_error.first_error = NULL;
 	spin_unlock_irq(&i915->gpu_error.lock);
 
-	if (!IS_ERR(error))
+	if (!IS_ERR_OR_NULL(error))
 		i915_gpu_state_put(error);
 }
 
diff --git a/drivers/gpu/drm/i915/i915_sysfs.c b/drivers/gpu/drm/i915/i915_sysfs.c
index 535caebd9813..370b7497e9df 100644
--- a/drivers/gpu/drm/i915/i915_sysfs.c
+++ b/drivers/gpu/drm/i915/i915_sysfs.c
@@ -521,6 +521,9 @@  static ssize_t error_state_read(struct file *filp, struct kobject *kobj,
 	ssize_t ret;
 
 	gpu = i915_first_error_state(i915);
+	if (IS_ERR(gpu))
+		return PTR_ERR(gpu);
+
 	if (gpu) {
 		ret = i915_gpu_state_copy_to_buffer(gpu, buf, off, count);
 		i915_gpu_state_put(gpu);

Comments

Tvrtko Ursulin Dec. 6, 2018, 12:57 p.m.
On 06/12/2018 08:44, Chris Wilson wrote:
> Although commit fb6f0b64e455 ("drm/i915: Prevent machine hang from
> Broxton's vtd w/a and error capture") applied cleanly after a 24 month
> hiatus, the code had moved on with new methods for peeking and fetching
> the captured gpu info. Make sure we catch all uses of the stashed error
> state and avoid dereferencing the error pointer.
> 
> Fixes: fb6f0b64e455 ("drm/i915: Prevent machine hang from Broxton's vtd w/a and error capture")
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> ---
>   drivers/gpu/drm/i915/i915_debugfs.c   | 11 ++++++++++-
>   drivers/gpu/drm/i915/i915_gpu_error.c | 12 +++++++++---
>   drivers/gpu/drm/i915/i915_sysfs.c     |  3 +++
>   3 files changed, 22 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 38dcee1ca062..f8aa8586c9a6 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -984,6 +984,9 @@ static int i915_gpu_info_open(struct inode *inode, struct file *file)
>   	intel_runtime_pm_get(i915);
>   	gpu = i915_capture_gpu_state(i915);
>   	intel_runtime_pm_put(i915);
> +
> +	if (IS_ERR(gpu))
> +		return PTR_ERR(gpu);
>   	if (!gpu)
>   		return -ENOMEM;
>   
> @@ -1018,7 +1021,13 @@ i915_error_state_write(struct file *filp,
>   
>   static int i915_error_state_open(struct inode *inode, struct file *file)
>   {
> -	file->private_data = i915_first_error_state(inode->i_private);
> +	struct i915_gpu_state *error;
> +
> +	error = i915_first_error_state(inode->i_private);
> +	if (IS_ERR(error))
> +		return PTR_ERR(error);
> +
> +	file->private_data  = error;
>   	return 0;
>   }
>   
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 07465123c166..7ca6f3f31d41 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -1907,6 +1907,11 @@ i915_capture_gpu_state(struct drm_i915_private *i915)
>   {
>   	struct i915_gpu_state *error;
>   
> +	/* Check if GPU capture has been disabled */
> +	error = READ_ONCE(i915->gpu_error.first_error);
> +	if (IS_ERR(error))
> +		return error;
> +
>   	error = kzalloc(sizeof(*error), GFP_ATOMIC);
>   	if (!error)
>   		return NULL;

Looks like it would be cleaner to return ERR_PTR(-ENOMEM) here. At least 
the i915_gpu_info_open hunk would be.

Then in drm-tip I see i915_capture_error_state as a caller which needs 
to handle ERR_PTR now as well, no?

> @@ -1987,7 +1992,7 @@ i915_first_error_state(struct drm_i915_private *i915)
>   
>   	spin_lock_irq(&i915->gpu_error.lock);
>   	error = i915->gpu_error.first_error;
> -	if (error)
> +	if (!IS_ERR_OR_NULL(error))
>   		i915_gpu_state_get(error);
>   	spin_unlock_irq(&i915->gpu_error.lock);
>   
> @@ -2000,10 +2005,11 @@ void i915_reset_error_state(struct drm_i915_private *i915)
>   
>   	spin_lock_irq(&i915->gpu_error.lock);
>   	error = i915->gpu_error.first_error;
> -	i915->gpu_error.first_error = NULL;
> +	if (!IS_ERR_OR_NULL(error)) /* once disabled, always disabled */

Hm, this will apply on transient -ENOMEM as well.

> +		i915->gpu_error.first_error = NULL;
>   	spin_unlock_irq(&i915->gpu_error.lock);
>   
> -	if (!IS_ERR(error))
> +	if (!IS_ERR_OR_NULL(error))
>   		i915_gpu_state_put(error);
>   }
>   
> diff --git a/drivers/gpu/drm/i915/i915_sysfs.c b/drivers/gpu/drm/i915/i915_sysfs.c
> index 535caebd9813..370b7497e9df 100644
> --- a/drivers/gpu/drm/i915/i915_sysfs.c
> +++ b/drivers/gpu/drm/i915/i915_sysfs.c
> @@ -521,6 +521,9 @@ static ssize_t error_state_read(struct file *filp, struct kobject *kobj,
>   	ssize_t ret;
>   
>   	gpu = i915_first_error_state(i915);
> +	if (IS_ERR(gpu))
> +		return PTR_ERR(gpu);
> +
>   	if (gpu) {
>   		ret = i915_gpu_state_copy_to_buffer(gpu, buf, off, count);
>   		i915_gpu_state_put(gpu);#
> 

A single control block:

if (!IS_ERR_OR_NULL) {
	...
} else {
	...
}

seems like it would do the trick.

Regards,

Tvrtko
Chris Wilson Dec. 6, 2018, 9:09 p.m.
Quoting Tvrtko Ursulin (2018-12-06 12:57:23)
> 
> On 06/12/2018 08:44, Chris Wilson wrote:
> > diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> > index 07465123c166..7ca6f3f31d41 100644
> > --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> > +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> > @@ -1907,6 +1907,11 @@ i915_capture_gpu_state(struct drm_i915_private *i915)
> >   {
> >       struct i915_gpu_state *error;
> >   
> > +     /* Check if GPU capture has been disabled */
> > +     error = READ_ONCE(i915->gpu_error.first_error);
> > +     if (IS_ERR(error))
> > +             return error;
> > +
> >       error = kzalloc(sizeof(*error), GFP_ATOMIC);
> >       if (!error)
> >               return NULL;
> 
> Looks like it would be cleaner to return ERR_PTR(-ENOMEM) here. At least 
> the i915_gpu_info_open hunk would be.
> 
> Then in drm-tip I see i915_capture_error_state as a caller which needs 
> to handle ERR_PTR now as well, no?

It shouldn't due to the earlier check, but it would work better if it
tool the ERR_PTR pointer from here.

> > @@ -1987,7 +1992,7 @@ i915_first_error_state(struct drm_i915_private *i915)
> >   
> >       spin_lock_irq(&i915->gpu_error.lock);
> >       error = i915->gpu_error.first_error;
> > -     if (error)
> > +     if (!IS_ERR_OR_NULL(error))
> >               i915_gpu_state_get(error);
> >       spin_unlock_irq(&i915->gpu_error.lock);
> >   
> > @@ -2000,10 +2005,11 @@ void i915_reset_error_state(struct drm_i915_private *i915)
> >   
> >       spin_lock_irq(&i915->gpu_error.lock);
> >       error = i915->gpu_error.first_error;
> > -     i915->gpu_error.first_error = NULL;
> > +     if (!IS_ERR_OR_NULL(error)) /* once disabled, always disabled */
> 
> Hm, this will apply on transient -ENOMEM as well.

Yeah. The best compromise is to say only ENODEV is special.

> > +             i915->gpu_error.first_error = NULL;
> >       spin_unlock_irq(&i915->gpu_error.lock);
> >   
> > -     if (!IS_ERR(error))
> > +     if (!IS_ERR_OR_NULL(error))
> >               i915_gpu_state_put(error);
> >   }
> >   
> > diff --git a/drivers/gpu/drm/i915/i915_sysfs.c b/drivers/gpu/drm/i915/i915_sysfs.c
> > index 535caebd9813..370b7497e9df 100644
> > --- a/drivers/gpu/drm/i915/i915_sysfs.c
> > +++ b/drivers/gpu/drm/i915/i915_sysfs.c
> > @@ -521,6 +521,9 @@ static ssize_t error_state_read(struct file *filp, struct kobject *kobj,
> >       ssize_t ret;
> >   
> >       gpu = i915_first_error_state(i915);
> > +     if (IS_ERR(gpu))
> > +             return PTR_ERR(gpu);
> > +
> >       if (gpu) {
> >               ret = i915_gpu_state_copy_to_buffer(gpu, buf, off, count);
> >               i915_gpu_state_put(gpu);#
> > 
> 
> A single control block:
> 
> if (!IS_ERR_OR_NULL) {
>         ...
> } else {
>         ...
> }
> 
> seems like it would do the trick.

Not quite unless you were thinking of if { } else if { } else { };
-Chris