[1/4] drm/amdgpu: change ras bps type to eeprom table record structure

Submitted by Chen, Guchun on Sept. 2, 2019, 2:13 a.m.

Details

Message ID SN6PR12MB2813C145D6004891FAF398B5F1BE0@SN6PR12MB2813.namprd12.prod.outlook.com
State New
Headers show
Series "add support for ras page retirement" ( rev: 3 ) in AMD X.Org drivers

Not browsing as part of any series.

Commit Message

Chen, Guchun Sept. 2, 2019, 2:13 a.m.
-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Tao Zhou

Sent: Friday, August 30, 2019 8:25 PM
To: amd-gfx@lists.freedesktop.org; Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Chen, Guchun <Guchun.Chen@amd.com>; Li, Dennis <Dennis.Li@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>
Cc: Zhou1, Tao <Tao.Zhou1@amd.com>
Subject: [PATCH 1/4] drm/amdgpu: change ras bps type to eeprom table record structure

change bps type from retired page to eeprom table record, prepare for saving error records to eeprom

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>

---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 59 ++++++++++++++++---------  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 11 +++--
 2 files changed, 43 insertions(+), 27 deletions(-)

 
 /* error handling functions */
 int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
-		unsigned long *bps, int pages);
+		struct eeprom_table_record *bps, int pages);
 
 int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev);
 
--
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Patch hide | download patch | download mbox

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 2ca3997d4b3a..24663ec41248 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1187,14 +1187,14 @@  static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
 
 	for (; i < data->count; i++) {
 		(*bps)[i] = (struct ras_badpage){
-			.bp = data->bps[i].bp,
+			.bp = data->bps[i].retired_page,
 			.size = AMDGPU_GPU_PAGE_SIZE,
 			.flags = 0,
 		};
 
 		if (data->last_reserved <= i)
 			(*bps)[i].flags = 1;
-		else if (data->bps[i].bo == NULL)
+		else if (data->bps_bo[i] == NULL)
 			(*bps)[i].flags = 2;
 	}
 
@@ -1288,30 +1288,40 @@  static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,  {
 	unsigned int old_space = data->count + data->space_left;
 	unsigned int new_space = old_space + pages;
-	unsigned int align_space = ALIGN(new_space, 1024);
-	void *tmp = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
-
-	if (!tmp)
+	unsigned int align_space = ALIGN(new_space, 512);
[Guchun]Any special reason to change alignment from 512 to 1024?

+	void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
+	struct amdgpu_bo **bps_bo =
+			kmalloc(align_space * sizeof(*data->bps_bo), GFP_KERNEL);
+
+	if (!bps || !bps_bo) {
+		kfree(bps);
+		kfree(bps_bo);
 		return -ENOMEM;
+	}
 
 	if (data->bps) {
-		memcpy(tmp, data->bps,
+		memcpy(bps, data->bps,
 				data->count * sizeof(*data->bps));
 		kfree(data->bps);
 	}
+	if (data->bps_bo) {
+		memcpy(bps_bo, data->bps_bo,
+				data->count * sizeof(*data->bps_bo));
+		kfree(data->bps_bo);
+	}
 
-	data->bps = tmp;
+	data->bps = bps;
+	data->bps_bo = bps_bo;
 	data->space_left += align_space - old_space;
 	return 0;
 }
 
 /* it deal with vram only. */
 int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
-		unsigned long *bps, int pages)
+		struct eeprom_table_record *bps, int pages)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_err_handler_data *data;
-	int i = pages;
 	int ret = 0;
 
 	if (!con || !con->eh_data || !bps || pages <= 0) @@ -1328,10 +1338,10 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
 			goto out;
 		}
 
-	while (i--)
-		data->bps[data->count++].bp = bps[i];
-
+	memcpy(&data->bps[data->count], bps, pages * sizeof(*data->bps));
+	data->count += pages;
 	data->space_left -= pages;
+
 out:
 	mutex_unlock(&con->recovery_lock);
 
@@ -1356,13 +1366,13 @@  int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
 		goto out;
 	/* reserve vram at driver post stage. */
 	for (i = data->last_reserved; i < data->count; i++) {
-		bp = data->bps[i].bp;
+		bp = data->bps[i].retired_page;
 
 		if (amdgpu_ras_reserve_vram(adev, bp << PAGE_SHIFT,
 					PAGE_SIZE, &bo))
 			DRM_ERROR("RAS ERROR: reserve vram %llx fail\n", bp);
 
-		data->bps[i].bo = bo;
+		data->bps_bo[i] = bo;
 		data->last_reserved = i + 1;
 	}
 out:
@@ -1387,11 +1397,11 @@  static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
 		goto out;
 
 	for (i = data->last_reserved - 1; i >= 0; i--) {
-		bo = data->bps[i].bo;
+		bo = data->bps_bo[i];
 
 		amdgpu_ras_release_vram(adev, &bo);
 
-		data->bps[i].bo = bo;
+		data->bps_bo[i] = bo;
 		data->last_reserved = i;
 	}
 out:
@@ -1407,12 +1417,19 @@  static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
 	return 0;
 }
 
+/*
+ * read error record array in eeprom and reserve enough space for
+ * storing new bad pages
+ */
 static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)  {
-	/* TODO
-	 * read the array to eeprom when SMU disabled.
-	 */
-	return 0;
+	struct eeprom_table_record *bps = NULL;
+	int ret;
+
+	ret = amdgpu_ras_add_bad_pages(adev, bps,
+				adev->umc.max_ras_err_cnt_per_query);
+
+	return ret;
 }
 
 static int amdgpu_ras_recovery_init(struct amdgpu_device *adev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 66b71525446e..b6bac873c588 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -351,11 +351,10 @@  struct ras_err_data {  };
 
 struct ras_err_handler_data {
-	/* point to bad pages array */
-	struct {
-		unsigned long bp;
-		struct amdgpu_bo *bo;
-	} *bps;
+	/* point to bad page records array */
+	struct eeprom_table_record *bps;
+	/* point to reserved bo array */
+	struct amdgpu_bo **bps_bo;
 	/* the count of entries */
 	int count;
 	/* the space can place new entries */
@@ -492,7 +491,7 @@  unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,

Comments

> -----Original Message-----

> From: Chen, Guchun <Guchun.Chen@amd.com>

> Sent: 2019年9月2日 10:13

> To: Zhou1, Tao <Tao.Zhou1@amd.com>; amd-gfx@lists.freedesktop.org;

> Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>; Li, Dennis

> <Dennis.Li@amd.com>; Zhang, Hawking <Hawking.Zhang@amd.com>

> Cc: Zhou1, Tao <Tao.Zhou1@amd.com>

> Subject: RE: [PATCH 1/4] drm/amdgpu: change ras bps type to eeprom table

> record structure

> 

> 

> 

> -----Original Message-----

> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Tao

> Zhou

> Sent: Friday, August 30, 2019 8:25 PM

> To: amd-gfx@lists.freedesktop.org; Grodzovsky, Andrey

> <Andrey.Grodzovsky@amd.com>; Chen, Guchun <Guchun.Chen@amd.com>;

> Li, Dennis <Dennis.Li@amd.com>; Zhang, Hawking

> <Hawking.Zhang@amd.com>

> Cc: Zhou1, Tao <Tao.Zhou1@amd.com>

> Subject: [PATCH 1/4] drm/amdgpu: change ras bps type to eeprom table

> record structure

> 

> change bps type from retired page to eeprom table record, prepare for

> saving error records to eeprom

> 

> Signed-off-by: Tao Zhou <tao.zhou1@amd.com>

> ---

>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 59 ++++++++++++++++-------

> --  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 11 +++--

>  2 files changed, 43 insertions(+), 27 deletions(-)

> 

> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

> index 2ca3997d4b3a..24663ec41248 100644

> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

> @@ -1187,14 +1187,14 @@ static int amdgpu_ras_badpages_read(struct

> amdgpu_device *adev,

> 

>  	for (; i < data->count; i++) {

>  		(*bps)[i] = (struct ras_badpage){

> -			.bp = data->bps[i].bp,

> +			.bp = data->bps[i].retired_page,

>  			.size = AMDGPU_GPU_PAGE_SIZE,

>  			.flags = 0,

>  		};

> 

>  		if (data->last_reserved <= i)

>  			(*bps)[i].flags = 1;

> -		else if (data->bps[i].bo == NULL)

> +		else if (data->bps_bo[i] == NULL)

>  			(*bps)[i].flags = 2;

>  	}

> 

> @@ -1288,30 +1288,40 @@ static int

> amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,  {

>  	unsigned int old_space = data->count + data->space_left;

>  	unsigned int new_space = old_space + pages;

> -	unsigned int align_space = ALIGN(new_space, 1024);

> -	void *tmp = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);

> -

> -	if (!tmp)

> +	unsigned int align_space = ALIGN(new_space, 512);

> [Guchun]Any special reason to change alignment from 512 to 1024?


[Tao] The old "data->bps" is 16 byte and new " struct eeprom_table_record bps" is 31 bytes on 64bit system, I'd like to lower the pressure on memory system. The value can be adjusted according to feedback in the future.

> 

> +	void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);

> +	struct amdgpu_bo **bps_bo =

> +			kmalloc(align_space * sizeof(*data->bps_bo),

> GFP_KERNEL);

> +

> +	if (!bps || !bps_bo) {

> +		kfree(bps);

> +		kfree(bps_bo);

>  		return -ENOMEM;

> +	}

> 

>  	if (data->bps) {

> -		memcpy(tmp, data->bps,

> +		memcpy(bps, data->bps,

>  				data->count * sizeof(*data->bps));

>  		kfree(data->bps);

>  	}

> +	if (data->bps_bo) {

> +		memcpy(bps_bo, data->bps_bo,

> +				data->count * sizeof(*data->bps_bo));

> +		kfree(data->bps_bo);

> +	}

> 

> -	data->bps = tmp;

> +	data->bps = bps;

> +	data->bps_bo = bps_bo;

>  	data->space_left += align_space - old_space;

>  	return 0;

>  }

> 

>  /* it deal with vram only. */

>  int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,

> -		unsigned long *bps, int pages)

> +		struct eeprom_table_record *bps, int pages)

>  {

>  	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

>  	struct ras_err_handler_data *data;

> -	int i = pages;

>  	int ret = 0;

> 

>  	if (!con || !con->eh_data || !bps || pages <= 0) @@ -1328,10

> +1338,10 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,

>  			goto out;

>  		}

> 

> -	while (i--)

> -		data->bps[data->count++].bp = bps[i];

> -

> +	memcpy(&data->bps[data->count], bps, pages * sizeof(*data->bps));

> +	data->count += pages;

>  	data->space_left -= pages;

> +

>  out:

>  	mutex_unlock(&con->recovery_lock);

> 

> @@ -1356,13 +1366,13 @@ int amdgpu_ras_reserve_bad_pages(struct

> amdgpu_device *adev)

>  		goto out;

>  	/* reserve vram at driver post stage. */

>  	for (i = data->last_reserved; i < data->count; i++) {

> -		bp = data->bps[i].bp;

> +		bp = data->bps[i].retired_page;

> 

>  		if (amdgpu_ras_reserve_vram(adev, bp << PAGE_SHIFT,

>  					PAGE_SIZE, &bo))

>  			DRM_ERROR("RAS ERROR: reserve vram %llx fail\n",

> bp);

> 

> -		data->bps[i].bo = bo;

> +		data->bps_bo[i] = bo;

>  		data->last_reserved = i + 1;

>  	}

>  out:

> @@ -1387,11 +1397,11 @@ static int amdgpu_ras_release_bad_pages(struct

> amdgpu_device *adev)

>  		goto out;

> 

>  	for (i = data->last_reserved - 1; i >= 0; i--) {

> -		bo = data->bps[i].bo;

> +		bo = data->bps_bo[i];

> 

>  		amdgpu_ras_release_vram(adev, &bo);

> 

> -		data->bps[i].bo = bo;

> +		data->bps_bo[i] = bo;

>  		data->last_reserved = i;

>  	}

>  out:

> @@ -1407,12 +1417,19 @@ static int amdgpu_ras_save_bad_pages(struct

> amdgpu_device *adev)

>  	return 0;

>  }

> 

> +/*

> + * read error record array in eeprom and reserve enough space for

> + * storing new bad pages

> + */

>  static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)  {

> -	/* TODO

> -	 * read the array to eeprom when SMU disabled.

> -	 */

> -	return 0;

> +	struct eeprom_table_record *bps = NULL;

> +	int ret;

> +

> +	ret = amdgpu_ras_add_bad_pages(adev, bps,

> +				adev->umc.max_ras_err_cnt_per_query);

> +

> +	return ret;

>  }

> 

>  static int amdgpu_ras_recovery_init(struct amdgpu_device *adev) diff --git

> a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h

> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h

> index 66b71525446e..b6bac873c588 100644

> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h

> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h

> @@ -351,11 +351,10 @@ struct ras_err_data {  };

> 

>  struct ras_err_handler_data {

> -	/* point to bad pages array */

> -	struct {

> -		unsigned long bp;

> -		struct amdgpu_bo *bo;

> -	} *bps;

> +	/* point to bad page records array */

> +	struct eeprom_table_record *bps;

> +	/* point to reserved bo array */

> +	struct amdgpu_bo **bps_bo;

>  	/* the count of entries */

>  	int count;

>  	/* the space can place new entries */

> @@ -492,7 +491,7 @@ unsigned long

> amdgpu_ras_query_error_count(struct amdgpu_device *adev,

> 

>  /* error handling functions */

>  int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,

> -		unsigned long *bps, int pages);

> +		struct eeprom_table_record *bps, int pages);

> 

>  int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev);

> 

> --

> 2.17.1

> 

> _______________________________________________

> amd-gfx mailing list

> amd-gfx@lists.freedesktop.org

> https://lists.freedesktop.org/mailman/listinfo/amd-gfx