[3/3] drm/amdgpu: adding xgmi error monitoring

Submitted by Kim, Jonathan on July 16, 2019, 5:12 p.m.

Details

Message ID CH2PR12MB3831ACCA7B895EB9A604F99385CE0@CH2PR12MB3831.namprd12.prod.outlook.com
State New
Headers show
Series "Series without cover letter" ( rev: 3 2 ) in AMD X.Org drivers

Not browsing as part of any series.

Commit Message

Kim, Jonathan July 16, 2019, 5:12 p.m.
Note patch 1 sent to internal amd mailing list for review.

-----Original Message-----
From: Kim, Jonathan <Jonathan.Kim@amd.com> 
Sent: Tuesday, July 16, 2019 1:09 PM
To: amd-gfx@lists.freedesktop.org
Cc: Kim, Jonathan <Jonathan.Kim@amd.com>; Kim, Jonathan <Jonathan.Kim@amd.com>
Subject: [PATCH 3/3] drm/amdgpu: adding xgmi error monitoring

monitor xgmi errors via mc pie status through fica registers.

Change-Id: Id80b6c2f635a294afe343cf55a03902e9a1787a5
Signed-off-by: Jonathan Kim <Jonathan.Kim@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 38 ++++++++++++++++++++++--
 1 file changed, 36 insertions(+), 2 deletions(-)

--
2.17.1

Patch hide | download patch | download mbox

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index d11eba09eadd..4b87fda15ac5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -25,7 +25,7 @@ 
 #include "amdgpu.h"
 #include "amdgpu_xgmi.h"
 #include "amdgpu_smu.h"
-
+#include "df/df_3_6_offset.h"
 
 static DEFINE_MUTEX(xgmi_mutex);
 
@@ -131,9 +131,37 @@  static ssize_t amdgpu_xgmi_show_device_id(struct device *dev,
 
 }
 
+#define AMDGPU_XGMI_SET_FICAA(o)	((o) | 0x456801)
+static ssize_t amdgpu_xgmi_show_error(struct device *dev,
+				      struct device_attribute *attr,
+				      char *buf)
+{
+	struct drm_device *ddev = dev_get_drvdata(dev);
+	struct amdgpu_device *adev = ddev->dev_private;
+	uint32_t ficaa_pie_ctl_in, ficaa_pie_status_in;
+	uint64_t fica_out;
+	unsigned int error_count = 0;
+
+	ficaa_pie_ctl_in = AMDGPU_XGMI_SET_FICAA(0x200);
+	ficaa_pie_status_in = AMDGPU_XGMI_SET_FICAA(0x208);
 
-static DEVICE_ATTR(xgmi_device_id, S_IRUGO, amdgpu_xgmi_show_device_id, NULL);
+	fica_out = adev->df_funcs->get_fica(adev, ficaa_pie_ctl_in);
+	if (fica_out != 0x1f)
+		pr_err("xGMI error counters not enabled!\n");
+
+	fica_out = adev->df_funcs->get_fica(adev, ficaa_pie_status_in);
+
+	if ((fica_out & 0xffff) == 2)
+		error_count = ((fica_out >> 62) & 0x1) + (fica_out >> 63);
 
+	adev->df_funcs->set_fica(adev, ficaa_pie_status_in, 0, 0);
+
+	return snprintf(buf, PAGE_SIZE, "%d\n", error_count); }
+
+
+static DEVICE_ATTR(xgmi_device_id, S_IRUGO, amdgpu_xgmi_show_device_id, 
+NULL); static DEVICE_ATTR(xgmi_error, S_IRUGO, amdgpu_xgmi_show_error, 
+NULL);
 
 static int amdgpu_xgmi_sysfs_add_dev_info(struct amdgpu_device *adev,
 					 struct amdgpu_hive_info *hive)
@@ -148,6 +176,12 @@  static int amdgpu_xgmi_sysfs_add_dev_info(struct amdgpu_device *adev,
 		return ret;
 	}
 
+	/* Create xgmi error file */
+	ret = device_create_file(adev->dev, &dev_attr_xgmi_error);
+	if (ret)
+		pr_err("failed to create xgmi_error\n");
+
+
 	/* Create sysfs link to hive info folder on the first device */
 	if (adev != hive->adev) {
 		ret = sysfs_create_link(&adev->dev->kobj, hive->kobj,