[2/4] drm/amdgpu: Hook EEPROM table to RAS

Submitted by Chen, Guchun on Sept. 2, 2019, 2:11 a.m.

Details

Message ID SN6PR12MB2813A05D3E8BCC723AE50308F1BE0@SN6PR12MB2813.namprd12.prod.outlook.com
State New
Headers show
Series "add support for ras page retirement" ( rev: 3 2 ) in AMD X.Org drivers

Not browsing as part of any series.

Commit Message

Chen, Guchun Sept. 2, 2019, 2:11 a.m.
-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Tao Zhou

Sent: Friday, August 30, 2019 8:25 PM
To: amd-gfx@lists.freedesktop.org; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Chen, Guchun <Guchun.Chen@amd.com>; Li, Dennis <Dennis.Li@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>
Cc: Zhou1, Tao <Tao.Zhou1@amd.com>
Subject: [PATCH 2/4] drm/amdgpu: Hook EEPROM table to RAS

Support loading and saving EEPROM records for RAS, and move the storing of EEPROM records into bad page reservation.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>

---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 111 ++++++++++++++++++------
 1 file changed, 83 insertions(+), 28 deletions(-)

 	struct amdgpu_bo *bo;
-	int i;
+	int i, ret = 0;
 
 	if (!con || !con->eh_data)
 		return 0;
@@ -1375,9 +1441,11 @@ int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
 		data->bps_bo[i] = bo;
 		data->last_reserved = i + 1;
 	}
+
+	ret = amdgpu_ras_save_bad_pages(adev);
 out:
 	mutex_unlock(&con->recovery_lock);
-	return 0;
+	return ret;
 }
 
 /* called when driver unload */
@@ -1409,33 +1477,11 @@ static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
 	return 0;
 }
 
-static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
-{
-	/* TODO
-	 * write the array to eeprom when SMU disabled.
-	 */
-	return 0;
-}
-
-/*
- * read error record array in eeprom and reserve enough space for
- * storing new bad pages
- */
-static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
-{
-	struct eeprom_table_record *bps = NULL;
-	int ret;
-
-	ret = amdgpu_ras_add_bad_pages(adev, bps,
-				adev->umc.max_ras_err_cnt_per_query);
-
-	return ret;
-}
-
 static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_err_handler_data **data = &con->eh_data;
+	int ret;
 
 	*data = kmalloc(sizeof(**data),
 			GFP_KERNEL|__GFP_ZERO);
@@ -1447,8 +1493,18 @@ static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 	atomic_set(&con->in_recovery, 0);
 	con->adev = adev;
 
-	amdgpu_ras_load_bad_pages(adev);
-	amdgpu_ras_reserve_bad_pages(adev);
+	ret = amdgpu_ras_eeprom_init(&adev->psp.ras.ras->eeprom_control);
+	if (ret)
+		return ret;
+
+	if (adev->psp.ras.ras->eeprom_control.num_recs) {
+		ret = amdgpu_ras_load_bad_pages(adev);
+		if (ret)
+			return ret;
+		ret = amdgpu_ras_reserve_bad_pages(adev);
+		if (ret)
+			return ret;
+	}
 
 	return 0;
 }
@@ -1459,7 +1515,6 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
 	struct ras_err_handler_data *data = con->eh_data;
 
 	cancel_work_sync(&con->recovery_work);
-	amdgpu_ras_save_bad_pages(adev);
 	amdgpu_ras_release_bad_pages(adev);
 
 	mutex_lock(&con->recovery_lock);
--
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Patch hide | download patch | download mbox

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 24663ec41248..02120aa3cb5d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1348,6 +1348,72 @@  int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
 	return ret;
 }
 
+/*
+ * write error record array to eeprom, the function should be
+ * protected by recovery_lock
+ */
+static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
+{
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	struct ras_err_handler_data *data;
+	struct amdgpu_ras_eeprom_control *control =
+					&adev->psp.ras.ras->eeprom_control;
+	int save_count;
+
+	if (!con || !con->eh_data)
+		return 0;
+
+	data = con->eh_data;
+	if (!data)
+		return 0;
[Guchun] This (!data) check is redundant and can be removed: !con->eh_data has already been checked above, and the whole function is protected by recovery_lock.

+	save_count = data->count - control->num_recs;
+	/* only new entries are saved */
+	if (save_count > 0)
+		if (amdgpu_ras_eeprom_process_recods(&con->eeprom_control,
+							&data->bps[control->num_recs],
+							true,
+							save_count)) {
+			DRM_ERROR("Failed to save EEPROM table data!");
+			return -EIO;
+		}
+
+	return 0;
+}
+
+/*
+ * read error record array in eeprom and reserve enough space for
+ * storing new bad pages
+ */
+static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
+{
+	struct amdgpu_ras_eeprom_control *control =
+					&adev->psp.ras.ras->eeprom_control;
+	struct eeprom_table_record *bps = NULL;
+	int ret = 0;
+
+	/* no bad page record, skip eeprom access */
+	if (!control->num_recs)
+		return ret;
+
+	bps = kcalloc(control->num_recs, sizeof(*bps), GFP_KERNEL);
+	if (!bps)
+		return -ENOMEM;
+
+	if (amdgpu_ras_eeprom_process_recods(control, bps, false,
+		control->num_recs)) {
+		DRM_ERROR("Failed to load EEPROM table records!");
+		ret = -EIO;
+		goto out;
+	}
+
+	ret = amdgpu_ras_add_bad_pages(adev, bps, control->num_recs);
+
+out:
+	kfree(bps);
+	return ret;
+}
+
 /* called in gpu recovery/init */
 int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
 {
@@ -1355,7 +1421,7 @@ int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
 	struct ras_err_handler_data *data;
 	uint64_t bp;

Comments

> -----Original Message-----

> From: Chen, Guchun <Guchun.Chen@amd.com>

> Sent: 2019年9月2日 10:11

> To: Zhou1, Tao <Tao.Zhou1@amd.com>; amd-gfx@lists.freedesktop.org;

> Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Li, Dennis

> <Dennis.Li@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>

> Cc: Zhou1, Tao <Tao.Zhou1@amd.com>

> Subject: RE: [PATCH 2/4] drm/amdgpu: Hook EEPROM table to RAS

> 

> 

> 

> -----Original Message-----

> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Tao

> Zhou

> Sent: Friday, August 30, 2019 8:25 PM

> To: amd-gfx@lists.freedesktop.org; Grodzovsky, Andrey

> <Andrey.Grodzovsky@amd.com>; Chen, Guchun <Guchun.Chen@amd.com>;

> Li, Dennis <Dennis.Li@amd.com>; Zhang, Hawking

> <Hawking.Zhang@amd.com>

> Cc: Zhou1, Tao <Tao.Zhou1@amd.com>

> Subject: [PATCH 2/4] drm/amdgpu: Hook EEPROM table to RAS

> 

> support eeprom records load and save for ras, move EEPROM records

> storing to bad page reserving

> 

> Signed-off-by: Tao Zhou <tao.zhou1@amd.com>

> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>

> ---

>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 111 ++++++++++++++++++--

> ----

>  1 file changed, 83 insertions(+), 28 deletions(-)

> 

> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

> index 24663ec41248..02120aa3cb5d 100644

> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

> @@ -1348,6 +1348,72 @@ int amdgpu_ras_add_bad_pages(struct

> amdgpu_device *adev,

>  	return ret;

>  }

> 

> +/*

> + * write error record array to eeprom, the function should be

> + * protected by recovery_lock

> + */

> +static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev) {

> +	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

> +	struct ras_err_handler_data *data;

> +	struct amdgpu_ras_eeprom_control *control =

> +					&adev->psp.ras.ras->eeprom_control;

> +	int save_count;

> +

> +	if (!con || !con->eh_data)

> +		return 0;

> +

> +	data = con->eh_data;

> +	if (!data)

> +		return 0;

> [Guchun]Such check (!data) is redundant and not needed. As we have

> checked !con->eh_data earlier, and the whole function is protected by

> recovery_lock.


[Tao] OK, I'll remove it.

> 

> +	save_count = data->count - control->num_recs;

> +	/* only new entries are saved */

> +	if (save_count > 0)

> +		if (amdgpu_ras_eeprom_process_recods(&con-

> >eeprom_control,

> +							&data->bps[control-

> >num_recs],

> +							true,

> +							save_count)) {

> +			DRM_ERROR("Failed to save EEPROM table data!");

> +			return -EIO;

> +		}

> +

> +	return 0;

> +}

> +

> +/*

> + * read error record array in eeprom and reserve enough space for

> + * storing new bad pages

> + */

> +static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) {

> +	struct amdgpu_ras_eeprom_control *control =

> +					&adev->psp.ras.ras->eeprom_control;

> +	struct eeprom_table_record *bps = NULL;

> +	int ret = 0;

> +

> +	/* no bad page record, skip eeprom access */

> +	if (!control->num_recs)

> +		return ret;

> +

> +	bps = kcalloc(control->num_recs, sizeof(*bps), GFP_KERNEL);

> +	if (!bps)

> +		return -ENOMEM;

> +

> +	if (amdgpu_ras_eeprom_process_recods(control, bps, false,

> +		control->num_recs)) {

> +		DRM_ERROR("Failed to load EEPROM table records!");

> +		ret = -EIO;

> +		goto out;

> +	}

> +

> +	ret = amdgpu_ras_add_bad_pages(adev, bps, control->num_recs);

> +

> +out:

> +	kfree(bps);

> +	return ret;

> +}

> +

>  /* called in gpu recovery/init */

>  int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)  { @@ -

> 1355,7 +1421,7 @@ int amdgpu_ras_reserve_bad_pages(struct

> amdgpu_device *adev)

>  	struct ras_err_handler_data *data;

>  	uint64_t bp;

>  	struct amdgpu_bo *bo;

> -	int i;

> +	int i, ret = 0;

> 

>  	if (!con || !con->eh_data)

>  		return 0;

> @@ -1375,9 +1441,11 @@ int amdgpu_ras_reserve_bad_pages(struct

> amdgpu_device *adev)

>  		data->bps_bo[i] = bo;

>  		data->last_reserved = i + 1;

>  	}

> +

> +	ret = amdgpu_ras_save_bad_pages(adev);

>  out:

>  	mutex_unlock(&con->recovery_lock);

> -	return 0;

> +	return ret;

>  }

> 

>  /* called when driver unload */

> @@ -1409,33 +1477,11 @@ static int amdgpu_ras_release_bad_pages(struct

> amdgpu_device *adev)

>  	return 0;

>  }

> 

> -static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev) -{

> -	/* TODO

> -	 * write the array to eeprom when SMU disabled.

> -	 */

> -	return 0;

> -}

> -

> -/*

> - * read error record array in eeprom and reserve enough space for

> - * storing new bad pages

> - */

> -static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) -{

> -	struct eeprom_table_record *bps = NULL;

> -	int ret;

> -

> -	ret = amdgpu_ras_add_bad_pages(adev, bps,

> -				adev->umc.max_ras_err_cnt_per_query);

> -

> -	return ret;

> -}

> -

>  static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)  {

>  	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

>  	struct ras_err_handler_data **data = &con->eh_data;

> +	int ret;

> 

>  	*data = kmalloc(sizeof(**data),

>  			GFP_KERNEL|__GFP_ZERO);

> @@ -1447,8 +1493,18 @@ static int amdgpu_ras_recovery_init(struct

> amdgpu_device *adev)

>  	atomic_set(&con->in_recovery, 0);

>  	con->adev = adev;

> 

> -	amdgpu_ras_load_bad_pages(adev);

> -	amdgpu_ras_reserve_bad_pages(adev);

> +	ret = amdgpu_ras_eeprom_init(&adev->psp.ras.ras-

> >eeprom_control);

> +	if (ret)

> +		return ret;

> +

> +	if (adev->psp.ras.ras->eeprom_control.num_recs) {

> +		ret = amdgpu_ras_load_bad_pages(adev);

> +		if (ret)

> +			return ret;

> +		ret = amdgpu_ras_reserve_bad_pages(adev);

> +		if (ret)

> +			return ret;

> +	}

> 

>  	return 0;

>  }

> @@ -1459,7 +1515,6 @@ static int amdgpu_ras_recovery_fini(struct

> amdgpu_device *adev)

>  	struct ras_err_handler_data *data = con->eh_data;

> 

>  	cancel_work_sync(&con->recovery_work);

> -	amdgpu_ras_save_bad_pages(adev);

>  	amdgpu_ras_release_bad_pages(adev);

> 

>  	mutex_lock(&con->recovery_lock);

> --

> 2.17.1

> 

> _______________________________________________

> amd-gfx mailing list

> amd-gfx@lists.freedesktop.org

> https://lists.freedesktop.org/mailman/listinfo/amd-gfx