drm/amd/powerplay: do proper cleanups on hw_fini

Submitted by Quan, Evan on Sept. 3, 2019, 3:44 a.m.

Details

Message ID 20190903034336.16005-1-evan.quan@amd.com
State New
Headers show
Series "drm/amd/powerplay: do proper cleanups on hw_fini" ( rev: 1 ) in AMD X.Org drivers

Not browsing as part of any series.

Commit Message

Quan, Evan Sept. 3, 2019, 3:44 a.m.
These are needed for smu_reset support.

Change-Id: If29ede4b99758adb08fd4e16665f44fd893ec99b
Signed-off-by: Evan Quan <evan.quan@amd.com>
---
 drivers/gpu/drm/amd/powerplay/amdgpu_smu.c     | 17 +++++++++++++++++
 drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h |  3 +++
 drivers/gpu/drm/amd/powerplay/smu_v11_0.c      | 10 ++++++++++
 3 files changed, 30 insertions(+)

Patch hide | download patch | download mbox

diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
index d5ee13a78eb7..3cf8d944f890 100644
--- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
@@ -1286,6 +1286,11 @@  static int smu_hw_init(void *handle)
 	return ret;
 }
 
+static int smu_stop_dpms(struct smu_context *smu)
+{
+	return smu_send_smc_msg(smu, SMU_MSG_DisableAllSmuFeatures);
+}
+
 static int smu_hw_fini(void *handle)
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
@@ -1298,6 +1303,18 @@  static int smu_hw_fini(void *handle)
 		smu_powergate_vcn(&adev->smu, true);
 	}
 
+	ret = smu_stop_thermal_control(smu);
+	if (ret) {
+		pr_warn("Fail to stop thermal control!\n");
+		return ret;
+	}
+
+	ret = smu_stop_dpms(smu);
+	if (ret) {
+		pr_warn("Fail to stop Dpms!\n");
+		return ret;
+	}
+
 	kfree(table_context->driver_pptable);
 	table_context->driver_pptable = NULL;
 
diff --git a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
index b19224cb6d6d..8e4b0ad24712 100644
--- a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
+++ b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
@@ -498,6 +498,7 @@  struct smu_funcs
 	int (*get_current_clk_freq)(struct smu_context *smu, enum smu_clk_type clk_id, uint32_t *value);
 	int (*init_max_sustainable_clocks)(struct smu_context *smu);
 	int (*start_thermal_control)(struct smu_context *smu);
+	int (*stop_thermal_control)(struct smu_context *smu);
 	int (*read_sensor)(struct smu_context *smu, enum amd_pp_sensors sensor,
 			   void *data, uint32_t *size);
 	int (*set_deep_sleep_dcefclk)(struct smu_context *smu, uint32_t clk);
@@ -647,6 +648,8 @@  struct smu_funcs
 	((smu)->ppt_funcs->set_thermal_fan_table ? (smu)->ppt_funcs->set_thermal_fan_table((smu)) : 0)
 #define smu_start_thermal_control(smu) \
 	((smu)->funcs->start_thermal_control? (smu)->funcs->start_thermal_control((smu)) : 0)
+#define smu_stop_thermal_control(smu) \
+	((smu)->funcs->stop_thermal_control? (smu)->funcs->stop_thermal_control((smu)) : 0)
 #define smu_read_sensor(smu, sensor, data, size) \
 	((smu)->ppt_funcs->read_sensor? (smu)->ppt_funcs->read_sensor((smu), (sensor), (data), (size)) : 0)
 #define smu_smc_read_sensor(smu, sensor, data, size) \
diff --git a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
index db5e94ce54af..1a38af84394e 100644
--- a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
+++ b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
@@ -1209,6 +1209,15 @@  static int smu_v11_0_start_thermal_control(struct smu_context *smu)
 	return ret;
 }
 
+static int smu_v11_0_stop_thermal_control(struct smu_context *smu)
+{
+	struct amdgpu_device *adev = smu->adev;
+
+	WREG32_SOC15(THM, 0, mmTHM_THERMAL_INT_ENA, 0);
+
+	return 0;
+}
+
 static uint16_t convert_to_vddc(uint8_t vid)
 {
 	return (uint16_t) ((6200 - (vid * 25)) / SMU11_VOLTAGE_SCALE);
@@ -1783,6 +1792,7 @@  static const struct smu_funcs smu_v11_0_funcs = {
 	.get_current_clk_freq = smu_v11_0_get_current_clk_freq,
 	.init_max_sustainable_clocks = smu_v11_0_init_max_sustainable_clocks,
 	.start_thermal_control = smu_v11_0_start_thermal_control,
+	.stop_thermal_control = smu_v11_0_stop_thermal_control,
 	.read_sensor = smu_v11_0_read_sensor,
 	.set_deep_sleep_dcefclk = smu_v11_0_set_deep_sleep_dcefclk,
 	.display_clock_voltage_request = smu_v11_0_display_clock_voltage_request,

Comments

Hey guys,



This patch caused some kind of reversion with smu_reset on Navi10. I'm
no expert since everything I know comes from just reading through the
code, so this could be some kind of intended behavior, but after this
patch, if you write a pptable to the sysfs pp_table interface on navi10,
then the SMU will fail to reset successfully, and the result is
seemingly an unrecoverable situation.



I put in a report on bugzilla with dmesg logs
:
https://bugs.freedesktop.org/show_bug.cgi?id=112234


Finding this change was the result of a bisect to find where the issue
started, and reverting the changes to smu_hw_fini resolved the issue.
Any advice on possible proper fixes?

Thanks in advance,

Matt

On 9/2/19 9:44 PM, Quan, Evan wrote:
> These are needed for smu_reset support.
> 
> Change-Id: If29ede4b99758adb08fd4e16665f44fd893ec99b
> Signed-off-by: Evan Quan <evan.quan@amd.com>
> ---
>  drivers/gpu/drm/amd/powerplay/amdgpu_smu.c     | 17 +++++++++++++++++
>  drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h |  3 +++
>  drivers/gpu/drm/amd/powerplay/smu_v11_0.c      | 10 ++++++++++
>  3 files changed, 30 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
> index d5ee13a78eb7..3cf8d944f890 100644
> --- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
> +++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
> @@ -1286,6 +1286,11 @@ static int smu_hw_init(void *handle)
>  	return ret;
>  }
>  
> +static int smu_stop_dpms(struct smu_context *smu)
> +{
> +	return smu_send_smc_msg(smu, SMU_MSG_DisableAllSmuFeatures);
> +}
> +
>  static int smu_hw_fini(void *handle)
>  {
>  	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> @@ -1298,6 +1303,18 @@ static int smu_hw_fini(void *handle)
>  		smu_powergate_vcn(&adev->smu, true);
>  	}
>  
> +	ret = smu_stop_thermal_control(smu);
> +	if (ret) {
> +		pr_warn("Fail to stop thermal control!\n");
> +		return ret;
> +	}
> +
> +	ret = smu_stop_dpms(smu);
> +	if (ret) {
> +		pr_warn("Fail to stop Dpms!\n");
> +		return ret;
> +	}
> +
>  	kfree(table_context->driver_pptable);
>  	table_context->driver_pptable = NULL;
>  
> diff --git a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
> index b19224cb6d6d..8e4b0ad24712 100644
> --- a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
> +++ b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h
> @@ -498,6 +498,7 @@ struct smu_funcs
>  	int (*get_current_clk_freq)(struct smu_context *smu, enum smu_clk_type clk_id, uint32_t *value);
>  	int (*init_max_sustainable_clocks)(struct smu_context *smu);
>  	int (*start_thermal_control)(struct smu_context *smu);
> +	int (*stop_thermal_control)(struct smu_context *smu);
>  	int (*read_sensor)(struct smu_context *smu, enum amd_pp_sensors sensor,
>  			   void *data, uint32_t *size);
>  	int (*set_deep_sleep_dcefclk)(struct smu_context *smu, uint32_t clk);
> @@ -647,6 +648,8 @@ struct smu_funcs
>  	((smu)->ppt_funcs->set_thermal_fan_table ? (smu)->ppt_funcs->set_thermal_fan_table((smu)) : 0)
>  #define smu_start_thermal_control(smu) \
>  	((smu)->funcs->start_thermal_control? (smu)->funcs->start_thermal_control((smu)) : 0)
> +#define smu_stop_thermal_control(smu) \
> +	((smu)->funcs->stop_thermal_control? (smu)->funcs->stop_thermal_control((smu)) : 0)
>  #define smu_read_sensor(smu, sensor, data, size) \
>  	((smu)->ppt_funcs->read_sensor? (smu)->ppt_funcs->read_sensor((smu), (sensor), (data), (size)) : 0)
>  #define smu_smc_read_sensor(smu, sensor, data, size) \
> diff --git a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
> index db5e94ce54af..1a38af84394e 100644
> --- a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
> +++ b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
> @@ -1209,6 +1209,15 @@ static int smu_v11_0_start_thermal_control(struct smu_context *smu)
>  	return ret;
>  }
>  
> +static int smu_v11_0_stop_thermal_control(struct smu_context *smu)
> +{
> +	struct amdgpu_device *adev = smu->adev;
> +
> +	WREG32_SOC15(THM, 0, mmTHM_THERMAL_INT_ENA, 0);
> +
> +	return 0;
> +}
> +
>  static uint16_t convert_to_vddc(uint8_t vid)
>  {
>  	return (uint16_t) ((6200 - (vid * 25)) / SMU11_VOLTAGE_SCALE);
> @@ -1783,6 +1792,7 @@ static const struct smu_funcs smu_v11_0_funcs = {
>  	.get_current_clk_freq = smu_v11_0_get_current_clk_freq,
>  	.init_max_sustainable_clocks = smu_v11_0_init_max_sustainable_clocks,
>  	.start_thermal_control = smu_v11_0_start_thermal_control,
> +	.stop_thermal_control = smu_v11_0_stop_thermal_control,
>  	.read_sensor = smu_v11_0_read_sensor,
>  	.set_deep_sleep_dcefclk = smu_v11_0_set_deep_sleep_dcefclk,
>  	.display_clock_voltage_request = smu_v11_0_display_clock_voltage_request,
>
Just sent out a patch which should be able to address this issue.
https://lists.freedesktop.org/archives/amd-gfx/2019-November/042458.html

Regards,
Evan
> -----Original Message-----

> From: Matt Coffin <mcoffin13@gmail.com>

> Sent: Saturday, November 9, 2019 4:50 AM

> To: Quan, Evan <Evan.Quan@amd.com>; amd-gfx@lists.freedesktop.org

> Cc: Li, Candice <Candice.Li@amd.com>; Gui, Jack <Jack.Gui@amd.com>; Alex

> Deucher <alexdeucher@gmail.com>

> Subject: Re: [PATCH] drm/amd/powerplay: do proper cleanups on hw_fini

> 

> Hey guys,

> 

> 

> 

> This patch caused some kind of reversion with smu_reset on Navi10. I'm no

> expert since everything I know comes from just reading through the code, so

> this could be some kind of intended behavior, but after this patch, if you write a

> pptable to the sysfs pp_table interface on navi10, then the SMU will fail to reset

> successfully, and the result is seemingly an unrecoverable situation.

> 

> 

> 

> I put in a report on bugzilla with dmesg logs

> :

> https://bugs.freedesktop.org/show_bug.cgi?id=112234

> 

> 

> Finding this change was the result of a bisect to find where the issue started,

> and reverting the changes to smu_hw_fini resolved the issue.

> Any advice on possible proper fixes?

> 

> Thanks in advance,

> 

> Matt

> 

> On 9/2/19 9:44 PM, Quan, Evan wrote:

> > These are needed for smu_reset support.

> >

> > Change-Id: If29ede4b99758adb08fd4e16665f44fd893ec99b

> > Signed-off-by: Evan Quan <evan.quan@amd.com>

> > ---

> >  drivers/gpu/drm/amd/powerplay/amdgpu_smu.c     | 17

> +++++++++++++++++

> >  drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h |  3 +++

> >  drivers/gpu/drm/amd/powerplay/smu_v11_0.c      | 10 ++++++++++

> >  3 files changed, 30 insertions(+)

> >

> > diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c

> > b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c

> > index d5ee13a78eb7..3cf8d944f890 100644

> > --- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c

> > +++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c

> > @@ -1286,6 +1286,11 @@ static int smu_hw_init(void *handle)

> >  	return ret;

> >  }

> >

> > +static int smu_stop_dpms(struct smu_context *smu) {

> > +	return smu_send_smc_msg(smu, SMU_MSG_DisableAllSmuFeatures); }

> > +

> >  static int smu_hw_fini(void *handle)

> >  {

> >  	struct amdgpu_device *adev = (struct amdgpu_device *)handle; @@

> > -1298,6 +1303,18 @@ static int smu_hw_fini(void *handle)

> >  		smu_powergate_vcn(&adev->smu, true);

> >  	}

> >

> > +	ret = smu_stop_thermal_control(smu);

> > +	if (ret) {

> > +		pr_warn("Fail to stop thermal control!\n");

> > +		return ret;

> > +	}

> > +

> > +	ret = smu_stop_dpms(smu);

> > +	if (ret) {

> > +		pr_warn("Fail to stop Dpms!\n");

> > +		return ret;

> > +	}

> > +

> >  	kfree(table_context->driver_pptable);

> >  	table_context->driver_pptable = NULL;

> >

> > diff --git a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h

> > b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h

> > index b19224cb6d6d..8e4b0ad24712 100644

> > --- a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h

> > +++ b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h

> > @@ -498,6 +498,7 @@ struct smu_funcs

> >  	int (*get_current_clk_freq)(struct smu_context *smu, enum

> smu_clk_type clk_id, uint32_t *value);

> >  	int (*init_max_sustainable_clocks)(struct smu_context *smu);

> >  	int (*start_thermal_control)(struct smu_context *smu);

> > +	int (*stop_thermal_control)(struct smu_context *smu);

> >  	int (*read_sensor)(struct smu_context *smu, enum amd_pp_sensors

> sensor,

> >  			   void *data, uint32_t *size);

> >  	int (*set_deep_sleep_dcefclk)(struct smu_context *smu, uint32_t

> > clk); @@ -647,6 +648,8 @@ struct smu_funcs

> >  	((smu)->ppt_funcs->set_thermal_fan_table ?

> > (smu)->ppt_funcs->set_thermal_fan_table((smu)) : 0)  #define

> smu_start_thermal_control(smu) \

> >  	((smu)->funcs->start_thermal_control?

> > (smu)->funcs->start_thermal_control((smu)) : 0)

> > +#define smu_stop_thermal_control(smu) \

> > +	((smu)->funcs->stop_thermal_control?

> > +(smu)->funcs->stop_thermal_control((smu)) : 0)

> >  #define smu_read_sensor(smu, sensor, data, size) \

> >  	((smu)->ppt_funcs->read_sensor? (smu)->ppt_funcs-

> >read_sensor((smu),

> > (sensor), (data), (size)) : 0)  #define smu_smc_read_sensor(smu,

> > sensor, data, size) \ diff --git

> > a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c

> > b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c

> > index db5e94ce54af..1a38af84394e 100644

> > --- a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c

> > +++ b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c

> > @@ -1209,6 +1209,15 @@ static int

> smu_v11_0_start_thermal_control(struct smu_context *smu)

> >  	return ret;

> >  }

> >

> > +static int smu_v11_0_stop_thermal_control(struct smu_context *smu) {

> > +	struct amdgpu_device *adev = smu->adev;

> > +

> > +	WREG32_SOC15(THM, 0, mmTHM_THERMAL_INT_ENA, 0);

> > +

> > +	return 0;

> > +}

> > +

> >  static uint16_t convert_to_vddc(uint8_t vid)  {

> >  	return (uint16_t) ((6200 - (vid * 25)) / SMU11_VOLTAGE_SCALE); @@

> > -1783,6 +1792,7 @@ static const struct smu_funcs smu_v11_0_funcs = {

> >  	.get_current_clk_freq = smu_v11_0_get_current_clk_freq,

> >  	.init_max_sustainable_clocks =

> smu_v11_0_init_max_sustainable_clocks,

> >  	.start_thermal_control = smu_v11_0_start_thermal_control,

> > +	.stop_thermal_control = smu_v11_0_stop_thermal_control,

> >  	.read_sensor = smu_v11_0_read_sensor,

> >  	.set_deep_sleep_dcefclk = smu_v11_0_set_deep_sleep_dcefclk,

> >  	.display_clock_voltage_request =

> > smu_v11_0_display_clock_voltage_request,

> >