[v2] drm/amdgpu: fix the memory corruption on S3

Submitted by Huang, Ray on June 29, 2017, 8:09 a.m.

Details

Message ID 1498723761-6723-1-git-send-email-ray.huang@amd.com
State New
Headers show
Series "drm/amdgpu: fix the memory corruption on S3" ( rev: 2 ) in AMD X.Org drivers

Not browsing as part of any series.

Commit Message

Huang, Ray June 29, 2017, 8:09 a.m.
psp->cmd will be used in the resume phase, so we cannot free it in hw_init.
Otherwise, memory corruption will be triggered.

Signed-off-by: Huang Rui <ray.huang@amd.com>
---

V1 -> V2:
- remove "cmd" variable.
- fix typo of check.

Alex, Christian,

This is the final fix for vega10 S3. The random memory corruption issue has
been root-caused.

Thanks,
Ray

---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

Patch hide | download patch | download mbox

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 5bed483..711476792 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -330,14 +330,11 @@  static int psp_load_fw(struct amdgpu_device *adev)
 {
 	int ret;
 	struct psp_context *psp = &adev->psp;
-	struct psp_gfx_cmd_resp *cmd;
 
-	cmd = kzalloc(sizeof(struct psp_gfx_cmd_resp), GFP_KERNEL);
-	if (!cmd)
+	psp->cmd = kzalloc(sizeof(struct psp_gfx_cmd_resp), GFP_KERNEL);
+	if (!psp->cmd)
 		return -ENOMEM;
 
-	psp->cmd = cmd;
-
 	ret = amdgpu_bo_create_kernel(adev, PSP_1_MEG, PSP_1_MEG,
 				      AMDGPU_GEM_DOMAIN_GTT,
 				      &psp->fw_pri_bo,
@@ -376,8 +373,6 @@  static int psp_load_fw(struct amdgpu_device *adev)
 	if (ret)
 		goto failed_mem;
 
-	kfree(cmd);
-
 	return 0;
 
 failed_mem:
@@ -387,7 +382,8 @@  static int psp_load_fw(struct amdgpu_device *adev)
 	amdgpu_bo_free_kernel(&psp->fw_pri_bo,
 			      &psp->fw_pri_mc_addr, &psp->fw_pri_buf);
 failed:
-	kfree(cmd);
+	kfree(psp->cmd);
+	psp->cmd = NULL;
 	return ret;
 }
 
@@ -447,6 +443,11 @@  static int psp_hw_fini(void *handle)
 		amdgpu_bo_free_kernel(&psp->fence_buf_bo,
 				      &psp->fence_buf_mc_addr, &psp->fence_buf);
 
+	if (psp->cmd) {
+		kfree(psp->cmd);
+		psp->cmd = NULL;
+	}
+
 	return 0;
 }
 

Comments

Am 29.06.2017 um 10:09 schrieb Huang Rui:
> psp->cmd will be used on resume phase, so we can not free it on hw_init.
> Otherwise, a memory corruption will be triggered.
>
> Signed-off-by: Huang Rui <ray.huang@amd.com>
> ---
>
> V1 -> V2:
> - remove "cmd" variable.
> - fix typo of check.
>
> Alex, Christian,
>
> This is the final fix for vega10 S3. The random memory corruption issue is root
> caused.
>
> Thanks,
> Ray
>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 17 +++++++++--------
>   1 file changed, 9 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> index 5bed483..711476792 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> @@ -330,14 +330,11 @@ static int psp_load_fw(struct amdgpu_device *adev)
>   {
>   	int ret;
>   	struct psp_context *psp = &adev->psp;
> -	struct psp_gfx_cmd_resp *cmd;
>   
> -	cmd = kzalloc(sizeof(struct psp_gfx_cmd_resp), GFP_KERNEL);
> -	if (!cmd)
> +	psp->cmd = kzalloc(sizeof(struct psp_gfx_cmd_resp), GFP_KERNEL);
> +	if (!psp->cmd)
>   		return -ENOMEM;
>   
> -	psp->cmd = cmd;
> -
>   	ret = amdgpu_bo_create_kernel(adev, PSP_1_MEG, PSP_1_MEG,
>   				      AMDGPU_GEM_DOMAIN_GTT,
>   				      &psp->fw_pri_bo,
> @@ -376,8 +373,6 @@ static int psp_load_fw(struct amdgpu_device *adev)
>   	if (ret)
>   		goto failed_mem;
>   
> -	kfree(cmd);
> -
>   	return 0;
>   
>   failed_mem:
> @@ -387,7 +382,8 @@ static int psp_load_fw(struct amdgpu_device *adev)
>   	amdgpu_bo_free_kernel(&psp->fw_pri_bo,
>   			      &psp->fw_pri_mc_addr, &psp->fw_pri_buf);
>   failed:
> -	kfree(cmd);
> +	kfree(psp->cmd);
> +	psp->cmd = NULL;
>   	return ret;
>   }
>   
> @@ -447,6 +443,11 @@ static int psp_hw_fini(void *handle)
>   		amdgpu_bo_free_kernel(&psp->fence_buf_bo,
>   				      &psp->fence_buf_mc_addr, &psp->fence_buf);
>   
> +	if (psp->cmd) {

As Michel noted as well, please drop this extra check; kfree(NULL) is
perfectly safe.

With that fixed the patch is Reviewed-by: Christian König 
<christian.koenig@amd.com> for now, but I still think we could do better 
by only allocating the temporary command buffer when it is needed.

Regards,
Christian.

> +		kfree(psp->cmd);
> +		psp->cmd = NULL;
> +	}
> +
>   	return 0;
>   }
>
On Thu, Jun 29, 2017 at 04:16:53PM +0800, Christian König wrote:
> Am 29.06.2017 um 10:09 schrieb Huang Rui:
> > psp->cmd will be used on resume phase, so we can not free it on hw_init.
> > Otherwise, a memory corruption will be triggered.
> >
> > Signed-off-by: Huang Rui <ray.huang@amd.com>
> > ---
> >
> > V1 -> V2:
> > - remove "cmd" variable.
> > - fix typo of check.
> >
> > Alex, Christian,
> >
> > This is the final fix for vega10 S3. The random memory corruption issue is
> root
> > caused.
> >
> > Thanks,
> > Ray
> >
> > ---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 17 +++++++++--------
> >   1 file changed, 9 insertions(+), 8 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/
> amdgpu/amdgpu_psp.c
> > index 5bed483..711476792 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> > @@ -330,14 +330,11 @@ static int psp_load_fw(struct amdgpu_device *adev)
> >   {
> >        int ret;
> >        struct psp_context *psp = &adev->psp;
> > -     struct psp_gfx_cmd_resp *cmd;
> >  
> > -     cmd = kzalloc(sizeof(struct psp_gfx_cmd_resp), GFP_KERNEL);
> > -     if (!cmd)
> > +     psp->cmd = kzalloc(sizeof(struct psp_gfx_cmd_resp), GFP_KERNEL);
> > +     if (!psp->cmd)
> >                return -ENOMEM;
> >  
> > -     psp->cmd = cmd;
> > -
> >        ret = amdgpu_bo_create_kernel(adev, PSP_1_MEG, PSP_1_MEG,
> >                                      AMDGPU_GEM_DOMAIN_GTT,
> >                                      &psp->fw_pri_bo,
> > @@ -376,8 +373,6 @@ static int psp_load_fw(struct amdgpu_device *adev)
> >        if (ret)
> >                goto failed_mem;
> >  
> > -     kfree(cmd);
> > -
> >        return 0;
> >  
> >   failed_mem:
> > @@ -387,7 +382,8 @@ static int psp_load_fw(struct amdgpu_device *adev)
> >        amdgpu_bo_free_kernel(&psp->fw_pri_bo,
> >                              &psp->fw_pri_mc_addr, &psp->fw_pri_buf);
> >   failed:
> > -     kfree(cmd);
> > +     kfree(psp->cmd);
> > +     psp->cmd = NULL;
> >        return ret;
> >   }
> >  
> > @@ -447,6 +443,11 @@ static int psp_hw_fini(void *handle)
> >                amdgpu_bo_free_kernel(&psp->fence_buf_bo,
> >                                      &psp->fence_buf_mc_addr, &psp->
> fence_buf);
> >  
> > +     if (psp->cmd) {
> 
> As Michel noted as well please drop this extra check, kfree(NULL) is
> perfectly save.
> 
> With that fixed the patch is Reviewed-by: Christian König
> <christian.koenig@amd.com> for now, but I still think we could do better
> by only allocating the temporary command buffer when it is needed.
> 

Thanks. This is the quick fix for release. You know, it was a tragedy until
I found the root cause for S3 suspend/resume and made it stable; now it's
able to go through more than 30 S3 cycles without ever crashing.

I am planning to refine the psp code; any suggestions are welcome. I
will refer to the comments, such as those on the fence and the "temporary
command buffer", to modify it in the following days. :-)

Thanks,
Ray
> -----Original Message-----

> From: Christian König [mailto:deathsimple@vodafone.de]

> Sent: Thursday, June 29, 2017 4:17 AM

> To: Huang, Ray; amd-gfx@lists.freedesktop.org; Deucher, Alexander; Koenig,

> Christian

> Cc: Huan, Alvin; Qiao, Joe(Markham); Jiang, Sonny; Wang, Ken; Yuan, Xiaojie

> Subject: Re: [PATCH v2] drm/amdgpu: fix the memory corruption on S3

> 

> Am 29.06.2017 um 10:09 schrieb Huang Rui:

> > psp->cmd will be used on resume phase, so we can not free it on hw_init.

> > Otherwise, a memory corruption will be triggered.

> >

> > Signed-off-by: Huang Rui <ray.huang@amd.com>

> > ---

> >

> > V1 -> V2:

> > - remove "cmd" variable.

> > - fix typo of check.

> >

> > Alex, Christian,

> >

> > This is the final fix for vega10 S3. The random memory corruption issue is

> root

> > caused.

> >

> > Thanks,

> > Ray

> >

> > ---

> >   drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 17 +++++++++--------

> >   1 file changed, 9 insertions(+), 8 deletions(-)

> >

> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c

> b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c

> > index 5bed483..711476792 100644

> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c

> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c

> > @@ -330,14 +330,11 @@ static int psp_load_fw(struct amdgpu_device

> *adev)

> >   {

> >   	int ret;

> >   	struct psp_context *psp = &adev->psp;

> > -	struct psp_gfx_cmd_resp *cmd;

> >

> > -	cmd = kzalloc(sizeof(struct psp_gfx_cmd_resp), GFP_KERNEL);

> > -	if (!cmd)

> > +	psp->cmd = kzalloc(sizeof(struct psp_gfx_cmd_resp), GFP_KERNEL);

> > +	if (!psp->cmd)

> >   		return -ENOMEM;

> >

> > -	psp->cmd = cmd;

> > -

> >   	ret = amdgpu_bo_create_kernel(adev, PSP_1_MEG, PSP_1_MEG,

> >   				      AMDGPU_GEM_DOMAIN_GTT,

> >   				      &psp->fw_pri_bo,

> > @@ -376,8 +373,6 @@ static int psp_load_fw(struct amdgpu_device

> *adev)

> >   	if (ret)

> >   		goto failed_mem;

> >

> > -	kfree(cmd);

> > -

> >   	return 0;

> >

> >   failed_mem:

> > @@ -387,7 +382,8 @@ static int psp_load_fw(struct amdgpu_device

> *adev)

> >   	amdgpu_bo_free_kernel(&psp->fw_pri_bo,

> >   			      &psp->fw_pri_mc_addr, &psp->fw_pri_buf);

> >   failed:

> > -	kfree(cmd);

> > +	kfree(psp->cmd);

> > +	psp->cmd = NULL;

> >   	return ret;

> >   }

> >

> > @@ -447,6 +443,11 @@ static int psp_hw_fini(void *handle)

> >   		amdgpu_bo_free_kernel(&psp->fence_buf_bo,

> >   				      &psp->fence_buf_mc_addr, &psp-

> >fence_buf);

> >

> > +	if (psp->cmd) {

> 

> As Michel noted as well please drop this extra check, kfree(NULL) is

> perfectly save.

> 

> With that fixed the patch is Reviewed-by: Christian König

> <christian.koenig@amd.com> for now, but I still think we could do better

> by only allocating the temporary command buffer when it is needed.


Yes, nice find Ray!  Glad to finally have this one solved!  With the extra check fixed:
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>


> 

> Regards,

> Christian.

> 

> > +		kfree(psp->cmd);

> > +		psp->cmd = NULL;

> > +	}

> > +

> >   	return 0;

> >   }

> >

>