[v3,3/3] dmr/amdgpu: Add system auto reboot to RAS.

Submitted by Andrey Grodzovsky on Aug. 30, 2019, 4:39 p.m.

Details

Message ID 1567183153-11014-3-git-send-email-andrey.grodzovsky@amd.com
State New
Headers show
Series "Series without cover letter" ( rev: 1 ) in AMD X.Org drivers

Not browsing as part of any series.

Commit Message

Andrey Grodzovsky Aug. 30, 2019, 4:39 p.m.
In case of a RAS error, allow the user to configure automatic system
reboot through ras_ctrl.
This is also part of the temporary workaround for the RAS
hang problem.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 18 ++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c    | 10 +++++++++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h    |  1 +
 3 files changed, 28 insertions(+), 1 deletion(-)

Patch hide | download patch | download mbox

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index c9825ae..e26f2e9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3760,6 +3760,24 @@  int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	int i, r = 0;
 	bool in_ras_intr = amdgpu_ras_intr_triggered();
 
+	/*
+	 * Flush RAM to disk so that after reboot
+	 * the user can read log and see why the system rebooted.
+	 *
+	 * Using user mode app call instead of kernel APIs such as
+	 * ksys_sync_helper for backward compatibility with earlier
+	 * kernels into which this is also intended.
+	 */
+	if (in_ras_intr && amdgpu_ras_get_context(adev)->reboot) {
+		char *envp[] = { "HOME=/", NULL };
+		char *argv[] = { "/bin/sync", NULL };
+
+		DRM_WARN("Emergency reboot.");
+
+		call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+		emergency_restart();
+	}
+
 	need_full_reset = job_signaled = false;
 	INIT_LIST_HEAD(&device_list);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 1cc34de..bbcfb4f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -30,6 +30,7 @@ 
 #include "amdgpu_ras.h"
 #include "amdgpu_atomfirmware.h"
 #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
+#include <linux/kmod.h>
 
 const char *ras_error_string[] = {
 	"none",
@@ -154,6 +155,8 @@  static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
 		op = 1;
 	else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
 		op = 2;
+	else if (sscanf(str, "reboot %32s", block_name) == 1)
+		op = 3;
 	else if (str[0] && str[1] && str[2] && str[3])
 		/* ascii string, but commands are not matched. */
 		return -EINVAL;
@@ -287,6 +290,9 @@  static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
 		/* data.inject.address is offset instead of absolute gpu address */
 		ret = amdgpu_ras_error_inject(adev, &data.inject);
 		break;
+	case 3:
+		amdgpu_ras_get_context(adev)->reboot = true;
+		break;
 	default:
 		ret = -EINVAL;
 		break;
@@ -1733,6 +1739,8 @@  int amdgpu_ras_fini(struct amdgpu_device *adev)
 void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
 {
 	if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
-		DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected! Stopping all GPU jobs.\n");
+		DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected!\n");
+
+		amdgpu_ras_reset_gpu(adev, false);
 	}
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 3ec2a87..a83ec99 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -333,6 +333,7 @@  struct amdgpu_ras {
 	struct mutex recovery_lock;
 
 	uint32_t flags;
+	bool reboot;
 };
 
 struct ras_fs_data {

Comments

On Fri, Aug 30, 2019 at 12:39 PM Andrey Grodzovsky
<andrey.grodzovsky@amd.com> wrote:
>
> In case of RAS error allow user configure auto system
> reboot through ras_ctrl.
> This is also part of the temproray work around for the RAS
> hang problem.
>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>

Typo in title: dmr -> drm

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 18 ++++++++++++++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c    | 10 +++++++++-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h    |  1 +
>  3 files changed, 28 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index c9825ae..e26f2e9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3760,6 +3760,24 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>         int i, r = 0;
>         bool in_ras_intr = amdgpu_ras_intr_triggered();
>
> +       /*
> +        * Flush RAM to disk so that after reboot
> +        * the user can read log and see why the system rebooted.
> +        *
> +        * Using user mode app call instead of kernel APIs such as
> +        * ksys_sync_helper for backward comparability with earlier
> +        * kernels into which this is also intended.
> +        */
> +       if (in_ras_intr && amdgpu_ras_get_context(adev)->reboot) {
> +               char *envp[] = { "HOME=/", NULL };
> +               char *argv[] = { "/bin/sync", NULL };
> +
> +               DRM_WARN("Emergency reboot.");
> +
> +               call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
> +               emergency_restart();
> +       }
> +

This is fine for dkms, but for upstream/amd-staging, we probably want
to call the appropriate APIs directly.

>         need_full_reset = job_signaled = false;
>         INIT_LIST_HEAD(&device_list);
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 1cc34de..bbcfb4f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -30,6 +30,7 @@
>  #include "amdgpu_ras.h"
>  #include "amdgpu_atomfirmware.h"
>  #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
> +#include <linux/kmod.h>
>
>  const char *ras_error_string[] = {
>         "none",
> @@ -154,6 +155,8 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
>                 op = 1;
>         else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
>                 op = 2;
> +       else if (sscanf(str, "reboot %32s", block_name) == 1)
> +               op = 3;
>         else if (str[0] && str[1] && str[2] && str[3])
>                 /* ascii string, but commands are not matched. */
>                 return -EINVAL;
> @@ -287,6 +290,9 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
>                 /* data.inject.address is offset instead of absolute gpu address */
>                 ret = amdgpu_ras_error_inject(adev, &data.inject);
>                 break;
> +       case 3:
> +               amdgpu_ras_get_context(adev)->reboot = true;
> +               break;
>         default:
>                 ret = -EINVAL;
>                 break;
> @@ -1733,6 +1739,8 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
>  void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
>  {
>         if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
> -               DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected! Stopping all GPU jobs.\n");
> +               DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected!\n");
> +
> +               amdgpu_ras_reset_gpu(adev, false);
>         }
>  }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 3ec2a87..a83ec99 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -333,6 +333,7 @@ struct amdgpu_ras {
>         struct mutex recovery_lock;
>
>         uint32_t flags;
> +       bool reboot;
>  };
>
>  struct ras_fs_data {
> --
> 2.7.4
>
But I am not the one cherry-picking to DKMS. Should I just let that person know that this is the code to use for DKMS when the appropriate API doesn't exist?

Andrey
yeah, that's fine.

Alex

On Fri, Aug 30, 2019 at 8:21 PM Grodzovsky, Andrey
<Andrey.Grodzovsky@amd.com> wrote:
>
> But I am not the one cherry-picking to DKMS, should I just let this person know this is the DKMS code he should use for when appropriate API doesn't exist ?
>
> Andrey
>
> ________________________________________
> From: Alex Deucher <alexdeucher@gmail.com>
> Sent: 30 August 2019 15:55:03
> To: Grodzovsky, Andrey
> Cc: amd-gfx list; Zhang, Hawking; Christian König; Zhou1, Tao; Kuehling, Felix
> Subject: Re: [PATCH v3 3/3] dmr/amdgpu: Add system auto reboot to RAS.
>
> On Fri, Aug 30, 2019 at 12:39 PM Andrey Grodzovsky
> <andrey.grodzovsky@amd.com> wrote:
> >
> > In case of RAS error allow user configure auto system
> > reboot through ras_ctrl.
> > This is also part of the temproray work around for the RAS
> > hang problem.
> >
> > Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>
> Typo in title: dmr -> drm
>
> > ---
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 18 ++++++++++++++++++
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c    | 10 +++++++++-
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h    |  1 +
> >  3 files changed, 28 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > index c9825ae..e26f2e9 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > @@ -3760,6 +3760,24 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
> >         int i, r = 0;
> >         bool in_ras_intr = amdgpu_ras_intr_triggered();
> >
> > +       /*
> > +        * Flush RAM to disk so that after reboot
> > +        * the user can read log and see why the system rebooted.
> > +        *
> > +        * Using user mode app call instead of kernel APIs such as
> > +        * ksys_sync_helper for backward comparability with earlier
> > +        * kernels into which this is also intended.
> > +        */
> > +       if (in_ras_intr && amdgpu_ras_get_context(adev)->reboot) {
> > +               char *envp[] = { "HOME=/", NULL };
> > +               char *argv[] = { "/bin/sync", NULL };
> > +
> > +               DRM_WARN("Emergency reboot.");
> > +
> > +               call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
> > +               emergency_restart();
> > +       }
> > +
>
> This is fine for dkms, but for upstream/amd-staging, we probably want
> to call the appropriate APIs directly.
>
> >         need_full_reset = job_signaled = false;
> >         INIT_LIST_HEAD(&device_list);
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > index 1cc34de..bbcfb4f 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> > @@ -30,6 +30,7 @@
> >  #include "amdgpu_ras.h"
> >  #include "amdgpu_atomfirmware.h"
> >  #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
> > +#include <linux/kmod.h>
> >
> >  const char *ras_error_string[] = {
> >         "none",
> > @@ -154,6 +155,8 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
> >                 op = 1;
> >         else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
> >                 op = 2;
> > +       else if (sscanf(str, "reboot %32s", block_name) == 1)
> > +               op = 3;
> >         else if (str[0] && str[1] && str[2] && str[3])
> >                 /* ascii string, but commands are not matched. */
> >                 return -EINVAL;
> > @@ -287,6 +290,9 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
> >                 /* data.inject.address is offset instead of absolute gpu address */
> >                 ret = amdgpu_ras_error_inject(adev, &data.inject);
> >                 break;
> > +       case 3:
> > +               amdgpu_ras_get_context(adev)->reboot = true;
> > +               break;
> >         default:
> >                 ret = -EINVAL;
> >                 break;
> > @@ -1733,6 +1739,8 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
> >  void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
> >  {
> >         if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
> > -               DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected! Stopping all GPU jobs.\n");
> > +               DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected!\n");
> > +
> > +               amdgpu_ras_reset_gpu(adev, false);
> >         }
> >  }
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > index 3ec2a87..a83ec99 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> > @@ -333,6 +333,7 @@ struct amdgpu_ras {
> >         struct mutex recovery_lock;
> >
> >         uint32_t flags;
> > +       bool reboot;
> >  };
> >
> >  struct ras_fs_data {
> > --
> > 2.7.4
> >