[4/4] drm/amdgpu: move the call of ras recovery_init and bad page reserve to proper place

Submitted by Zhou1, Tao on Aug. 30, 2019, 12:24 p.m.

Details

Message ID 20190830122453.19703-5-tao.zhou1@amd.com
State New
Headers show
Series "add support for ras page retirement" ( rev: 3 2 1 ) in AMD X.Org drivers

Not browsing as part of any series.

Commit Message

Zhou1, Tao Aug. 30, 2019, 12:24 p.m.
ras recovery_init should be called after ttm and smu init,
bad page reserve should be put in front of gpu reset since i2c
may be unstable during gpu reset
add cleanup for recovery_init and recovery_fini

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 16 +++++++++----
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c    | 28 +++++++++++++---------
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h    |  5 ++++
 3 files changed, 33 insertions(+), 16 deletions(-)

Patch hide | download patch | download mbox

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 67b09cb2a9e2..4e4895ac926d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2894,6 +2894,17 @@  int amdgpu_device_init(struct amdgpu_device *adev,
 		goto failed;
 	}
 
+	/* retired pages will be loaded from eeprom and reserved here,
+	 * new bo may be created, it should be called after ttm init,
+	 * accessing eeprom also relies on smu (unlock i2c bus) and it
+	 * should be called after smu init
+	 *
+	 * Note: theoretically, this should be called before all vram allocations
+	 * to protect retired page from abusing, but there are some allocations
+	 * in front of smu fw loading
+	 */
+	amdgpu_ras_recovery_init(adev);
+
 	/* must succeed. */
 	amdgpu_ras_resume(adev);
 
@@ -3627,11 +3638,6 @@  static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 						break;
 				}
 			}
-
-			list_for_each_entry(tmp_adev, device_list_handle,
-					gmc.xgmi.head) {
-				amdgpu_ras_reserve_bad_pages(tmp_adev);
-			}
 		}
 	}
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 02120aa3cb5d..ad67a109122f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1477,14 +1477,13 @@  static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
 	return 0;
 }
 
-static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
+int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_err_handler_data **data = &con->eh_data;
 	int ret;
 
-	*data = kmalloc(sizeof(**data),
-			GFP_KERNEL|__GFP_ZERO);
+	*data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO);
 	if (!*data)
 		return -ENOMEM;
 
@@ -1495,18 +1494,29 @@  static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 
 	ret = amdgpu_ras_eeprom_init(&adev->psp.ras.ras->eeprom_control);
 	if (ret)
-		return ret;
+		goto cancel_work;
 
 	if (adev->psp.ras.ras->eeprom_control.num_recs) {
 		ret = amdgpu_ras_load_bad_pages(adev);
 		if (ret)
-			return ret;
+			goto cancel_work;
 		ret = amdgpu_ras_reserve_bad_pages(adev);
 		if (ret)
-			return ret;
+			goto release;
 	}
 
 	return 0;
+
+release:
+	amdgpu_ras_release_bad_pages(adev);
+cancel_work:
+	cancel_work_sync(&con->recovery_work);
+	con->eh_data = NULL;
+	kfree((*data)->bps);
+	kfree((*data)->bps_bo);
+	kfree(*data);
+
+	return ret;
 }
 
 static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
@@ -1520,6 +1530,7 @@  static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
 	mutex_lock(&con->recovery_lock);
 	con->eh_data = NULL;
 	kfree(data->bps);
+	kfree(data->bps_bo);
 	kfree(data);
 	mutex_unlock(&con->recovery_lock);
 
@@ -1611,9 +1622,6 @@  int amdgpu_ras_init(struct amdgpu_device *adev)
 			return r;
 	}
 
-	if (amdgpu_ras_recovery_init(adev))
-		goto recovery_out;
-
 	amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK;
 
 	if (amdgpu_ras_fs_init(adev))
@@ -1628,8 +1636,6 @@  int amdgpu_ras_init(struct amdgpu_device *adev)
 			con->hw_supported, con->supported);
 	return 0;
 fs_out:
-	amdgpu_ras_recovery_fini(adev);
-recovery_out:
 	amdgpu_ras_set_context(adev, NULL);
 	kfree(con);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 42e1d379e21c..6d00f79b612b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -480,6 +480,7 @@  static inline int amdgpu_ras_is_supported(struct amdgpu_device *adev,
 	return ras && (ras->supported & (1 << block));
 }
 
+int amdgpu_ras_recovery_init(struct amdgpu_device *adev);
 int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
 		unsigned int block);
 
@@ -500,6 +501,10 @@  static inline int amdgpu_ras_reset_gpu(struct amdgpu_device *adev,
 {
 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 
+	/* save bad page to eeprom before gpu reset,
+	 * i2c may be unstable in gpu reset
+	 */
+	amdgpu_ras_reserve_bad_pages(adev);
 	if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
 		schedule_work(&ras->recovery_work);
 	return 0;

Comments

On 8/30/19 8:24 AM, Tao Zhou wrote:
> ras recovery_init should be called after ttm and smu init,

> bad page reserve should be put in front of gpu reset since i2c

> may be unstable during gpu reset

> add cleanup for recovery_init and recovery_fini

>

> Signed-off-by: Tao Zhou <tao.zhou1@amd.com>

> ---

>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 16 +++++++++----

>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c    | 28 +++++++++++++---------

>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h    |  5 ++++

>   3 files changed, 33 insertions(+), 16 deletions(-)

>

> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

> index 67b09cb2a9e2..4e4895ac926d 100644

> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

> @@ -2894,6 +2894,17 @@ int amdgpu_device_init(struct amdgpu_device *adev,

>   		goto failed;

>   	}

>   

> +	/* retired pages will be loaded from eeprom and reserved here,

> +	 * new bo may be created, it should be called after ttm init,

> +	 * accessing eeprom also relies on smu (unlock i2c bus) and it

> +	 * should be called after smu init

> +	 *

> +	 * Note: theoretically, this should be called before all vram allocations

> +	 * to protect retired page from abusing, but there are some allocations

> +	 * in front of smu fw loading

> +	 */

> +	amdgpu_ras_recovery_init(adev);



You probably should check for return value here.


> +

>   	/* must succeed. */

>   	amdgpu_ras_resume(adev);

>   

> @@ -3627,11 +3638,6 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,

>   						break;

>   				}

>   			}

> -

> -			list_for_each_entry(tmp_adev, device_list_handle,

> -					gmc.xgmi.head) {

> -				amdgpu_ras_reserve_bad_pages(tmp_adev);

> -			}

>   		}

>   	}

>   

> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

> index 02120aa3cb5d..ad67a109122f 100644

> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

> @@ -1477,14 +1477,13 @@ static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)

>   	return 0;

>   }

>   

> -static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)

> +int amdgpu_ras_recovery_init(struct amdgpu_device *adev)

>   {

>   	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

>   	struct ras_err_handler_data **data = &con->eh_data;

>   	int ret;

>   

> -	*data = kmalloc(sizeof(**data),

> -			GFP_KERNEL|__GFP_ZERO);

> +	*data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO);

>   	if (!*data)

>   		return -ENOMEM;

>   

> @@ -1495,18 +1494,29 @@ static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)

>   

>   	ret = amdgpu_ras_eeprom_init(&adev->psp.ras.ras->eeprom_control);

>   	if (ret)

> -		return ret;

> +		goto cancel_work;



Why you need do go to cancel_work_sync(&con->recovery_work) at such 
early stage of device init, is RAS active already by this time and RAS 
interrupt might fire triggering reset  ?

Andrey


>   

>   	if (adev->psp.ras.ras->eeprom_control.num_recs) {

>   		ret = amdgpu_ras_load_bad_pages(adev);

>   		if (ret)

> -			return ret;

> +			goto cancel_work;

>   		ret = amdgpu_ras_reserve_bad_pages(adev);

>   		if (ret)

> -			return ret;

> +			goto release;

>   	}

>   

>   	return 0;

> +

> +release:

> +	amdgpu_ras_release_bad_pages(adev);

> +cancel_work:

> +	cancel_work_sync(&con->recovery_work);

> +	con->eh_data = NULL;

> +	kfree((*data)->bps);

> +	kfree((*data)->bps_bo);

> +	kfree(*data);

> +

> +	return ret;

>   }

>   

>   static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)

> @@ -1520,6 +1530,7 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)

>   	mutex_lock(&con->recovery_lock);

>   	con->eh_data = NULL;

>   	kfree(data->bps);

> +	kfree(data->bps_bo);

>   	kfree(data);

>   	mutex_unlock(&con->recovery_lock);

>   

> @@ -1611,9 +1622,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev)

>   			return r;

>   	}

>   

> -	if (amdgpu_ras_recovery_init(adev))

> -		goto recovery_out;

> -

>   	amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK;

>   

>   	if (amdgpu_ras_fs_init(adev))

> @@ -1628,8 +1636,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev)

>   			con->hw_supported, con->supported);

>   	return 0;

>   fs_out:

> -	amdgpu_ras_recovery_fini(adev);

> -recovery_out:

>   	amdgpu_ras_set_context(adev, NULL);

>   	kfree(con);

>   

> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h

> index 42e1d379e21c..6d00f79b612b 100644

> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h

> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h

> @@ -480,6 +480,7 @@ static inline int amdgpu_ras_is_supported(struct amdgpu_device *adev,

>   	return ras && (ras->supported & (1 << block));

>   }

>   

> +int amdgpu_ras_recovery_init(struct amdgpu_device *adev);

>   int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,

>   		unsigned int block);

>   

> @@ -500,6 +501,10 @@ static inline int amdgpu_ras_reset_gpu(struct amdgpu_device *adev,

>   {

>   	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

>   

> +	/* save bad page to eeprom before gpu reset,

> +	 * i2c may be unstable in gpu reset

> +	 */

> +	amdgpu_ras_reserve_bad_pages(adev);

>   	if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)

>   		schedule_work(&ras->recovery_work);

>   	return 0;
> -----Original Message-----

> From: Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>

> Sent: 2019年8月30日 22:03

> To: Zhou1, Tao <Tao.Zhou1@amd.com>; amd-gfx@lists.freedesktop.org;

> Chen, Guchun <Guchun.Chen@amd.com>; Li, Dennis <Dennis.Li@amd.com>;

> Zhang, Hawking <Hawking.Zhang@amd.com>

> Subject: Re: [PATCH 4/4] drm/amdgpu: move the call of ras recovery_init and

> bad page reserve to proper place

> 

> 

> On 8/30/19 8:24 AM, Tao Zhou wrote:

> > ras recovery_init should be called after ttm and smu init, bad page

> > reserve should be put in front of gpu reset since i2c may be unstable

> > during gpu reset add cleanup for recovery_init and recovery_fini

> >

> > Signed-off-by: Tao Zhou <tao.zhou1@amd.com>

> > ---

> >   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 16 +++++++++----

> >   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c    | 28 +++++++++++++-------

> --

> >   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h    |  5 ++++

> >   3 files changed, 33 insertions(+), 16 deletions(-)

> >

> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

> > index 67b09cb2a9e2..4e4895ac926d 100644

> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

> > @@ -2894,6 +2894,17 @@ int amdgpu_device_init(struct amdgpu_device

> *adev,

> >   		goto failed;

> >   	}

> >

> > +	/* retired pages will be loaded from eeprom and reserved here,

> > +	 * new bo may be created, it should be called after ttm init,

> > +	 * accessing eeprom also relies on smu (unlock i2c bus) and it

> > +	 * should be called after smu init

> > +	 *

> > +	 * Note: theoretically, this should be called before all vram allocations

> > +	 * to protect retired page from abusing, but there are some

> allocations

> > +	 * in front of smu fw loading

> > +	 */

> > +	amdgpu_ras_recovery_init(adev);

> 

> 

> You probably should check for return value here.

[Tao] No check here is intentional, the failure of recovery_init should not stop amdgpu init process and recovery_init could free resources allocated by itself if it fails. I'll add comment here and add a DRM_WARN for recovery_init.

> 

> 

> > +

> >   	/* must succeed. */

> >   	amdgpu_ras_resume(adev);

> >

> > @@ -3627,11 +3638,6 @@ static int amdgpu_do_asic_reset(struct

> amdgpu_hive_info *hive,

> >   						break;

> >   				}

> >   			}

> > -

> > -			list_for_each_entry(tmp_adev, device_list_handle,

> > -					gmc.xgmi.head) {

> > -				amdgpu_ras_reserve_bad_pages(tmp_adev);

> > -			}

> >   		}

> >   	}

> >

> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

> > index 02120aa3cb5d..ad67a109122f 100644

> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

> > @@ -1477,14 +1477,13 @@ static int

> amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)

> >   	return 0;

> >   }

> >

> > -static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)

> > +int amdgpu_ras_recovery_init(struct amdgpu_device *adev)

> >   {

> >   	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

> >   	struct ras_err_handler_data **data = &con->eh_data;

> >   	int ret;

> >

> > -	*data = kmalloc(sizeof(**data),

> > -			GFP_KERNEL|__GFP_ZERO);

> > +	*data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO);

> >   	if (!*data)

> >   		return -ENOMEM;

> >

> > @@ -1495,18 +1494,29 @@ static int amdgpu_ras_recovery_init(struct

> > amdgpu_device *adev)

> >

> >   	ret = amdgpu_ras_eeprom_init(&adev->psp.ras.ras-

> >eeprom_control);

> >   	if (ret)

> > -		return ret;

> > +		goto cancel_work;

> 

> 

> Why you need do go to cancel_work_sync(&con->recovery_work) at such

> early stage of device init, is RAS active already by this time and RAS interrupt

> might fire triggering reset  ?

> 

> Andrey

> 

[Tao] Good point, I'll remove cancel_work_sync.

> 

> >

> >   	if (adev->psp.ras.ras->eeprom_control.num_recs) {

> >   		ret = amdgpu_ras_load_bad_pages(adev);

> >   		if (ret)

> > -			return ret;

> > +			goto cancel_work;

> >   		ret = amdgpu_ras_reserve_bad_pages(adev);

> >   		if (ret)

> > -			return ret;

> > +			goto release;

> >   	}

> >

> >   	return 0;

> > +

> > +release:

> > +	amdgpu_ras_release_bad_pages(adev);

> > +cancel_work:

> > +	cancel_work_sync(&con->recovery_work);

> > +	con->eh_data = NULL;

> > +	kfree((*data)->bps);

> > +	kfree((*data)->bps_bo);

> > +	kfree(*data);

> > +

> > +	return ret;

> >   }

> >

> >   static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev) @@

> > -1520,6 +1530,7 @@ static int amdgpu_ras_recovery_fini(struct

> amdgpu_device *adev)

> >   	mutex_lock(&con->recovery_lock);

> >   	con->eh_data = NULL;

> >   	kfree(data->bps);

> > +	kfree(data->bps_bo);

> >   	kfree(data);

> >   	mutex_unlock(&con->recovery_lock);

> >

> > @@ -1611,9 +1622,6 @@ int amdgpu_ras_init(struct amdgpu_device

> *adev)

> >   			return r;

> >   	}

> >

> > -	if (amdgpu_ras_recovery_init(adev))

> > -		goto recovery_out;

> > -

> >   	amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK;

> >

> >   	if (amdgpu_ras_fs_init(adev))

> > @@ -1628,8 +1636,6 @@ int amdgpu_ras_init(struct amdgpu_device

> *adev)

> >   			con->hw_supported, con->supported);

> >   	return 0;

> >   fs_out:

> > -	amdgpu_ras_recovery_fini(adev);

> > -recovery_out:

> >   	amdgpu_ras_set_context(adev, NULL);

> >   	kfree(con);

> >

> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h

> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h

> > index 42e1d379e21c..6d00f79b612b 100644

> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h

> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h

> > @@ -480,6 +480,7 @@ static inline int amdgpu_ras_is_supported(struct

> amdgpu_device *adev,

> >   	return ras && (ras->supported & (1 << block));

> >   }

> >

> > +int amdgpu_ras_recovery_init(struct amdgpu_device *adev);

> >   int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,

> >   		unsigned int block);

> >

> > @@ -500,6 +501,10 @@ static inline int amdgpu_ras_reset_gpu(struct

> amdgpu_device *adev,

> >   {

> >   	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

> >

> > +	/* save bad page to eeprom before gpu reset,

> > +	 * i2c may be unstable in gpu reset

> > +	 */

> > +	amdgpu_ras_reserve_bad_pages(adev);

> >   	if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)

> >   		schedule_work(&ras->recovery_work);

> >   	return 0;