[v2,2/4] gpio: fail if gpu external power is missing

Submitted by Mark Menzynski on July 15, 2019, 9:42 a.m.

Details

Message ID 20190715094247.8331-3-mmenzyns@redhat.com
State New
Headers show
Series "Refuse to load if power cables are not connected" ( rev: 1 ) in Nouveau

Commit Message

Mark Menzynski July 15, 2019, 9:42 a.m.
Currently, nouveau doesn't check if GPU is missing power. This
patch makes nouveau fail when this happens on latest GPUs.

It checks GPIO function 121 (External Power Emergency), which
should detect power problems on GPU initialization.

Tested on TU104, GP106 and GF100.

Signed-off-by: Mark Menzynski <mmenzyns@redhat.com>
---
 drm/nouveau/include/nvkm/subdev/bios/gpio.h |  1 +
 drm/nouveau/nvkm/subdev/gpio/base.c         | 23 +++++++++++++++++++++
 2 files changed, 24 insertions(+)

Patch hide | download patch | download mbox

diff --git a/drm/nouveau/include/nvkm/subdev/bios/gpio.h b/drm/nouveau/include/nvkm/subdev/bios/gpio.h
index 2f40935f..a70ec9e8 100644
--- a/drm/nouveau/include/nvkm/subdev/bios/gpio.h
+++ b/drm/nouveau/include/nvkm/subdev/bios/gpio.h
@@ -7,6 +7,7 @@  enum dcb_gpio_func_name {
 	DCB_GPIO_TVDAC0 = 0x0c,
 	DCB_GPIO_TVDAC1 = 0x2d,
 	DCB_GPIO_FAN_SENSE = 0x3d,
+	DCB_GPIO_EXT_POWER_LOW = 0x79,
 	DCB_GPIO_LOGO_LED_PWM = 0x84,
 	DCB_GPIO_UNUSED = 0xff,
 	DCB_GPIO_VID0 = 0x04,
diff --git a/drm/nouveau/nvkm/subdev/gpio/base.c b/drm/nouveau/nvkm/subdev/gpio/base.c
index 1399d923..c4685807 100644
--- a/drm/nouveau/nvkm/subdev/gpio/base.c
+++ b/drm/nouveau/nvkm/subdev/gpio/base.c
@@ -182,12 +182,35 @@  static const struct dmi_system_id gpio_reset_ids[] = {
 	{ }
 };
 
+static enum dcb_gpio_func_name power_checks[] = {
+	DCB_GPIO_EXT_POWER_LOW,
+};
+
 static int
 nvkm_gpio_init(struct nvkm_subdev *subdev)
 {
 	struct nvkm_gpio *gpio = nvkm_gpio(subdev);
+	struct dcb_gpio_func func;
+	int ret;
+	int i;
+
 	if (dmi_check_system(gpio_reset_ids))
 		nvkm_gpio_reset(gpio, DCB_GPIO_UNUSED);
+
+	for (i = 0; i < ARRAY_SIZE(power_checks); ++i) {
+		ret = nvkm_gpio_find(gpio, 0, power_checks[i], DCB_GPIO_UNUSED,
+				     &func);
+		if (ret)
+			continue;
+
+		ret = nvkm_gpio_get(gpio, 0, func.func, func.line);
+		if (ret) {
+			nvkm_error(&gpio->subdev,
+				   "not enough power, check GPU power cable\n");
+			return -EINVAL;
+		}
+	}
+
 	return 0;
 }
 

Comments

Please add a config override to skip this, since we'll invariably get
it wrong for some setup, and should be able to provide users with
workarounds while the issue is being worked out.

On Mon, Jul 15, 2019 at 5:43 AM Mark Menzynski <mmenzyns@redhat.com> wrote:
>
> Currently, nouveau doesn't check if GPU is missing power. This
> patch makes nouveau fail when this happens on latest GPUs.
>
> It checks GPIO function 121 (External Power Emergency), which
> should detect power problems on GPU initialization.
>
> Tested on TU104, GP106 and GF100.
>
> Signed-off-by: Mark Menzynski <mmenzyns@redhat.com>
> ---
>  drm/nouveau/include/nvkm/subdev/bios/gpio.h |  1 +
>  drm/nouveau/nvkm/subdev/gpio/base.c         | 23 +++++++++++++++++++++
>  2 files changed, 24 insertions(+)
>
> diff --git a/drm/nouveau/include/nvkm/subdev/bios/gpio.h b/drm/nouveau/include/nvkm/subdev/bios/gpio.h
> index 2f40935f..a70ec9e8 100644
> --- a/drm/nouveau/include/nvkm/subdev/bios/gpio.h
> +++ b/drm/nouveau/include/nvkm/subdev/bios/gpio.h
> @@ -7,6 +7,7 @@ enum dcb_gpio_func_name {
>         DCB_GPIO_TVDAC0 = 0x0c,
>         DCB_GPIO_TVDAC1 = 0x2d,
>         DCB_GPIO_FAN_SENSE = 0x3d,
> +       DCB_GPIO_EXT_POWER_LOW = 0x79,
>         DCB_GPIO_LOGO_LED_PWM = 0x84,
>         DCB_GPIO_UNUSED = 0xff,
>         DCB_GPIO_VID0 = 0x04,
> diff --git a/drm/nouveau/nvkm/subdev/gpio/base.c b/drm/nouveau/nvkm/subdev/gpio/base.c
> index 1399d923..c4685807 100644
> --- a/drm/nouveau/nvkm/subdev/gpio/base.c
> +++ b/drm/nouveau/nvkm/subdev/gpio/base.c
> @@ -182,12 +182,35 @@ static const struct dmi_system_id gpio_reset_ids[] = {
>         { }
>  };
>
> +static enum dcb_gpio_func_name power_checks[] = {
> +       DCB_GPIO_EXT_POWER_LOW,
> +};
> +
>  static int
>  nvkm_gpio_init(struct nvkm_subdev *subdev)
>  {
>         struct nvkm_gpio *gpio = nvkm_gpio(subdev);
> +       struct dcb_gpio_func func;
> +       int ret;
> +       int i;
> +
>         if (dmi_check_system(gpio_reset_ids))
>                 nvkm_gpio_reset(gpio, DCB_GPIO_UNUSED);
> +
> +       for (i = 0; i < ARRAY_SIZE(power_checks); ++i) {
> +               ret = nvkm_gpio_find(gpio, 0, power_checks[i], DCB_GPIO_UNUSED,
> +                                    &func);
> +               if (ret)
> +                       continue;
> +
> +               ret = nvkm_gpio_get(gpio, 0, func.func, func.line);
> +               if (ret) {
> +                       nvkm_error(&gpio->subdev,
> +                                  "not enough power, check GPU power cable\n");
> +                       return -EINVAL;
> +               }
> +       }
> +
>         return 0;
>  }
>
> --
> 2.21.0
>
> _______________________________________________
> Nouveau mailing list
> Nouveau@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/nouveau
On Mon, 15 Jul 2019 at 22:26, Ilia Mirkin <imirkin@alum.mit.edu> wrote:
>
> Please add a config override to skip this, since we'll invariably get
> it wrong for some setup, and should be able to provide users with
> workarounds while the issue is being worked out.
Yeah, this makes me nervous as well.  In the very least, I'd like a
config option, but I'm still wondering if perhaps we shouldn't limit
this to a warning (which people tend to report) for a while first too.

Also, what's NV's behaviour here?  Do they refuse to load, or do they
do something like force the GPU into its lowest pstate?

Ben.

>
> On Mon, Jul 15, 2019 at 5:43 AM Mark Menzynski <mmenzyns@redhat.com> wrote:
> >
> > Currently, nouveau doesn't check if GPU is missing power. This
> > patch makes nouveau fail when this happens on latest GPUs.
> >
> > It checks GPIO function 121 (External Power Emergency), which
> > should detect power problems on GPU initialization.
> >
> > Tested on TU104, GP106 and GF100.
> >
> > Signed-off-by: Mark Menzynski <mmenzyns@redhat.com>
> > ---
> >  drm/nouveau/include/nvkm/subdev/bios/gpio.h |  1 +
> >  drm/nouveau/nvkm/subdev/gpio/base.c         | 23 +++++++++++++++++++++
> >  2 files changed, 24 insertions(+)
> >
> > diff --git a/drm/nouveau/include/nvkm/subdev/bios/gpio.h b/drm/nouveau/include/nvkm/subdev/bios/gpio.h
> > index 2f40935f..a70ec9e8 100644
> > --- a/drm/nouveau/include/nvkm/subdev/bios/gpio.h
> > +++ b/drm/nouveau/include/nvkm/subdev/bios/gpio.h
> > @@ -7,6 +7,7 @@ enum dcb_gpio_func_name {
> >         DCB_GPIO_TVDAC0 = 0x0c,
> >         DCB_GPIO_TVDAC1 = 0x2d,
> >         DCB_GPIO_FAN_SENSE = 0x3d,
> > +       DCB_GPIO_EXT_POWER_LOW = 0x79,
> >         DCB_GPIO_LOGO_LED_PWM = 0x84,
> >         DCB_GPIO_UNUSED = 0xff,
> >         DCB_GPIO_VID0 = 0x04,
> > diff --git a/drm/nouveau/nvkm/subdev/gpio/base.c b/drm/nouveau/nvkm/subdev/gpio/base.c
> > index 1399d923..c4685807 100644
> > --- a/drm/nouveau/nvkm/subdev/gpio/base.c
> > +++ b/drm/nouveau/nvkm/subdev/gpio/base.c
> > @@ -182,12 +182,35 @@ static const struct dmi_system_id gpio_reset_ids[] = {
> >         { }
> >  };
> >
> > +static enum dcb_gpio_func_name power_checks[] = {
> > +       DCB_GPIO_EXT_POWER_LOW,
> > +};
> > +
> >  static int
> >  nvkm_gpio_init(struct nvkm_subdev *subdev)
> >  {
> >         struct nvkm_gpio *gpio = nvkm_gpio(subdev);
> > +       struct dcb_gpio_func func;
> > +       int ret;
> > +       int i;
> > +
> >         if (dmi_check_system(gpio_reset_ids))
> >                 nvkm_gpio_reset(gpio, DCB_GPIO_UNUSED);
> > +
> > +       for (i = 0; i < ARRAY_SIZE(power_checks); ++i) {
> > +               ret = nvkm_gpio_find(gpio, 0, power_checks[i], DCB_GPIO_UNUSED,
> > +                                    &func);
> > +               if (ret)
> > +                       continue;
> > +
> > +               ret = nvkm_gpio_get(gpio, 0, func.func, func.line);
> > +               if (ret) {
> > +                       nvkm_error(&gpio->subdev,
> > +                                  "not enough power, check GPU power cable\n");
> > +                       return -EINVAL;
> > +               }
> > +       }
> > +
> >         return 0;
> >  }
> >
> > --
> > 2.21.0
> >
> > _______________________________________________
> > Nouveau mailing list
> > Nouveau@lists.freedesktop.org
> > https://lists.freedesktop.org/mailman/listinfo/nouveau
> _______________________________________________
> Nouveau mailing list
> Nouveau@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/nouveau
This is what Nvidia did after using nvidia-smi, which is not very far
from what happens now with the patch.

kernel: nvidia-nvlink: Nvlink Core is being initialized, major device number 235
kernel: nvidia 0000:01:00.0: vgaarb: changed VGA decodes:
olddecodes=io+mem,decodes=none:owns=none
kernel: NVRM: loading NVIDIA UNIX x86_64 Kernel Module  430.34  Wed
Jun 26 12:19:48 CDT 2019
kernel: NVRM: GPU 0000:01:00.0: GPU does not have the necessary power
cables connected.
kernel: NVRM: GPU 0000:01:00.0: RmInitAdapter failed! (0x25:0x1c:1133)
kernel: NVRM: GPU 0000:01:00.0: rm_init_adapter failed, device minor number 0

Also, when booting, POST refuses to boot if power cables are not
connected, but there are scenarios where you don't boot with the
Nvidia GPU.
I am not sure about limiting it to warning, it makes sense but I also
think it should fail.

But I wanted to ask about the error messages. At this moment, this is
current output:

[17383.042727] nouveau 0000:12:00.0: gpio: GPU is missing power, check
its power cables. Boot with nouveau.config=NvPowerChecks=0 to disable.
[17383.042728] nouveau 0000:12:00.0: gpio: init failed, -22
[17383.042986] nouveau 0000:12:00.0: init failed with -22
[17383.042987] nouveau: DRM-master:00000000:00000080: init failed with -22
[17383.042990] nouveau 0000:12:00.0: DRM-master: Device allocation failed: -22
[17383.043458] nouveau: probe of 0000:12:00.0 failed with error -22

Isn't it a wrong place to implement the checks? Maybe I should put it
somewhere else?

On Tue, Jul 16, 2019 at 5:47 AM Ben Skeggs <skeggsb@gmail.com> wrote:
>
> On Mon, 15 Jul 2019 at 22:26, Ilia Mirkin <imirkin@alum.mit.edu> wrote:
> >
> > Please add a config override to skip this, since we'll invariably get
> > it wrong for some setup, and should be able to provide users with
> > workarounds while the issue is being worked out.
> Yeah, this makes me nervous as well.  In the very least, I'd like a
> config option, but I'm still wondering if perhaps we shouldn't limit
> this to a warning (which people tend to report) for a while first too.
>
> Also, what's NV's behaviour here?  Do they refuse to load, or do they
> do something like force the GPU into its lowest pstate?
>
> Ben.
>
> >
> > On Mon, Jul 15, 2019 at 5:43 AM Mark Menzynski <mmenzyns@redhat.com> wrote:
> > >
> > > Currently, nouveau doesn't check if GPU is missing power. This
> > > patch makes nouveau fail when this happens on latest GPUs.
> > >
> > > It checks GPIO function 121 (External Power Emergency), which
> > > should detect power problems on GPU initialization.
> > >
> > > Tested on TU104, GP106 and GF100.
> > >
> > > Signed-off-by: Mark Menzynski <mmenzyns@redhat.com>
> > > ---
> > >  drm/nouveau/include/nvkm/subdev/bios/gpio.h |  1 +
> > >  drm/nouveau/nvkm/subdev/gpio/base.c         | 23 +++++++++++++++++++++
> > >  2 files changed, 24 insertions(+)
> > >
> > > diff --git a/drm/nouveau/include/nvkm/subdev/bios/gpio.h b/drm/nouveau/include/nvkm/subdev/bios/gpio.h
> > > index 2f40935f..a70ec9e8 100644
> > > --- a/drm/nouveau/include/nvkm/subdev/bios/gpio.h
> > > +++ b/drm/nouveau/include/nvkm/subdev/bios/gpio.h
> > > @@ -7,6 +7,7 @@ enum dcb_gpio_func_name {
> > >         DCB_GPIO_TVDAC0 = 0x0c,
> > >         DCB_GPIO_TVDAC1 = 0x2d,
> > >         DCB_GPIO_FAN_SENSE = 0x3d,
> > > +       DCB_GPIO_EXT_POWER_LOW = 0x79,
> > >         DCB_GPIO_LOGO_LED_PWM = 0x84,
> > >         DCB_GPIO_UNUSED = 0xff,
> > >         DCB_GPIO_VID0 = 0x04,
> > > diff --git a/drm/nouveau/nvkm/subdev/gpio/base.c b/drm/nouveau/nvkm/subdev/gpio/base.c
> > > index 1399d923..c4685807 100644
> > > --- a/drm/nouveau/nvkm/subdev/gpio/base.c
> > > +++ b/drm/nouveau/nvkm/subdev/gpio/base.c
> > > @@ -182,12 +182,35 @@ static const struct dmi_system_id gpio_reset_ids[] = {
> > >         { }
> > >  };
> > >
> > > +static enum dcb_gpio_func_name power_checks[] = {
> > > +       DCB_GPIO_EXT_POWER_LOW,
> > > +};
> > > +
> > >  static int
> > >  nvkm_gpio_init(struct nvkm_subdev *subdev)
> > >  {
> > >         struct nvkm_gpio *gpio = nvkm_gpio(subdev);
> > > +       struct dcb_gpio_func func;
> > > +       int ret;
> > > +       int i;
> > > +
> > >         if (dmi_check_system(gpio_reset_ids))
> > >                 nvkm_gpio_reset(gpio, DCB_GPIO_UNUSED);
> > > +
> > > +       for (i = 0; i < ARRAY_SIZE(power_checks); ++i) {
> > > +               ret = nvkm_gpio_find(gpio, 0, power_checks[i], DCB_GPIO_UNUSED,
> > > +                                    &func);
> > > +               if (ret)
> > > +                       continue;
> > > +
> > > +               ret = nvkm_gpio_get(gpio, 0, func.func, func.line);
> > > +               if (ret) {
> > > +                       nvkm_error(&gpio->subdev,
> > > +                                  "not enough power, check GPU power cable\n");
> > > +                       return -EINVAL;
> > > +               }
> > > +       }
> > > +
> > >         return 0;
> > >  }
> > >
> > > --
> > > 2.21.0
> > >
> > > _______________________________________________
> > > Nouveau mailing list
> > > Nouveau@lists.freedesktop.org
> > > https://lists.freedesktop.org/mailman/listinfo/nouveau
> > _______________________________________________
> > Nouveau mailing list
> > Nouveau@lists.freedesktop.org
> > https://lists.freedesktop.org/mailman/listinfo/nouveau