[3/4] drm/amdgpu: save umc error records

Submitted by Zhou1, Tao on Aug. 30, 2019, 12:24 p.m.

Details

Message ID 20190830122453.19703-4-tao.zhou1@amd.com
State New
Headers show
Series "add support for ras page retirement" ( rev: 3 2 1 ) in AMD X.Org drivers

Not browsing as part of any series.

Commit Message

Zhou1, Tao Aug. 30, 2019, 12:24 p.m.
save umc error records to ras bad page array

v2: add bad pages before gpu reset

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  2 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   | 29 ++++++++++++++----
 drivers/gpu/drm/amd/amdgpu/umc_v6_1.c   | 39 ++++++++++++++++++++-----
 3 files changed, 57 insertions(+), 13 deletions(-)

Patch hide | download patch | download mbox

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index b6bac873c588..42e1d379e21c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -347,7 +347,7 @@  struct ras_err_data {
 	unsigned long ue_count;
 	unsigned long ce_count;
 	unsigned long err_addr_cnt;
-	uint64_t *err_addr;
+	struct eeprom_table_record *err_addr;
 };
 
 struct ras_err_handler_data {
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 70a05e302d60..5015a07dcb3c 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -246,16 +246,35 @@  static int gmc_v9_0_process_ras_data_cb(struct amdgpu_device *adev,
 	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
 	if (adev->umc.funcs->query_ras_error_count)
 		adev->umc.funcs->query_ras_error_count(adev, err_data);
-	/* umc query_ras_error_address is also responsible for clearing
-	 * error status
-	 */
-	if (adev->umc.funcs->query_ras_error_address)
+
+	if (adev->umc.funcs->query_ras_error_address &&
+		adev->umc.max_ras_err_cnt_per_query) {
+		err_data->err_addr =
+			kcalloc(adev->umc.max_ras_err_cnt_per_query,
+				sizeof(struct eeprom_table_record), GFP_KERNEL);
+		/* still call query_ras_error_address to clear error status
+		 * even NOMEM error is encountered
+		 */
+		if(!err_data->err_addr)
+			DRM_WARN("Failed to alloc memory for umc error address record!\n");
+
+		/* umc query_ras_error_address is also responsible for clearing
+		 * error status
+		 */
 		adev->umc.funcs->query_ras_error_address(adev, err_data);
+	}
 
 	/* only uncorrectable error needs gpu reset */
-	if (err_data->ue_count)
+	if (err_data->ue_count) {
+		if (err_data->err_addr_cnt &&
+			amdgpu_ras_add_bad_pages(adev, err_data->err_addr,
+						err_data->err_addr_cnt))
+			DRM_WARN("Failed to add ras bad page!\n");
+
 		amdgpu_ras_reset_gpu(adev, 0);
+	}
 
+	kfree(err_data->err_addr);
 	return AMDGPU_RAS_SUCCESS;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c b/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c
index 8502e736f721..a54ff170591f 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c
@@ -75,6 +75,17 @@  static void umc_v6_1_disable_umc_index_mode(struct amdgpu_device *adev)
 			RSMU_UMC_INDEX_MODE_EN, 0);
 }
 
+static uint32_t umc_v6_1_get_umc_inst(struct amdgpu_device *adev)
+{
+	uint32_t rsmu_umc_index;
+
+	rsmu_umc_index = RREG32_SOC15(RSMU, 0,
+			mmRSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU);
+	return REG_GET_FIELD(rsmu_umc_index,
+				RSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU,
+				RSMU_UMC_INDEX_INSTANCE);
+}
+
 static void umc_v6_1_query_correctable_error_count(struct amdgpu_device *adev,
 						   uint32_t umc_reg_offset,
 						   unsigned long *error_count)
@@ -165,7 +176,8 @@  static void umc_v6_1_query_error_address(struct amdgpu_device *adev,
 					 uint32_t umc_reg_offset, uint32_t channel_index)
 {
 	uint32_t lsb, mc_umc_status_addr;
-	uint64_t mc_umc_status, err_addr;
+	uint64_t mc_umc_status, err_addr, retired_page;
+	struct eeprom_table_record *err_rec;
 
 	mc_umc_status_addr =
 		SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0);
@@ -177,6 +189,7 @@  static void umc_v6_1_query_error_address(struct amdgpu_device *adev,
 		return;
 	}
 
+	err_rec = &err_data->err_addr[err_data->err_addr_cnt];
 	mc_umc_status = RREG64_UMC(mc_umc_status_addr + umc_reg_offset);
 
 	/* calculate error address if ue/ce error is detected */
@@ -191,12 +204,24 @@  static void umc_v6_1_query_error_address(struct amdgpu_device *adev,
 		err_addr &= ~((0x1ULL << lsb) - 1);
 
 		/* translate umc channel address to soc pa, 3 parts are included */
-		err_data->err_addr[err_data->err_addr_cnt] =
-						ADDR_OF_8KB_BLOCK(err_addr) |
-						ADDR_OF_256B_BLOCK(channel_index) |
-						OFFSET_IN_256B_BLOCK(err_addr);
-
-		err_data->err_addr_cnt++;
+		retired_page = ADDR_OF_8KB_BLOCK(err_addr) |
+				ADDR_OF_256B_BLOCK(channel_index) |
+				OFFSET_IN_256B_BLOCK(err_addr);
+
+		/* we only save ue error information currently, ce is skipped */
+		if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC)
+				== 1) {
+			err_rec->address = err_addr;
+			/* page frame address is saved */
+			err_rec->retired_page = retired_page >> PAGE_SHIFT;
+			err_rec->ts = (uint64_t)ktime_get_real_seconds();
+			err_rec->err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
+			err_rec->cu = 0;
+			err_rec->mem_channel = channel_index;
+			err_rec->mcumc_id = umc_v6_1_get_umc_inst(adev);
+
+			err_data->err_addr_cnt++;
+		}
 	}
 
 	/* clear umc status */