[v2,1/2] drm/amdgpu: Avoid HW GPU reset for RAS.

Submitted by Andrey Grodzovsky on Aug. 30, 2019, 12:53 a.m.

Details

Message ID 1567126427-13912-1-git-send-email-andrey.grodzovsky@amd.com
State New
Headers show
Series "Series without cover letter" ( rev: 1 ) in AMD X.Org drivers

Not browsing as part of any series.

Commit Message

Andrey Grodzovsky Aug. 30, 2019, 12:53 a.m.
Problem:
Under certain conditions, when some IP blocks take a RAS error,
we can get into a situation where a GPU reset is not possible
due to issues in RAS in SMU/PSP.

Temporary fix until proper solution in PSP/SMU is ready:
When uncorrectable error happens the DF will unconditionally
broadcast error event packets to all its clients/slaves upon
receiving fatal error event and freeze all its outbound queues,
err_event_athub interrupt will be triggered.
In such a case we use this interrupt
to issue GPU reset. The GPU reset code is modified for such a case to avoid HW
reset, only stops schedulers, detaches all in-progress and not yet scheduled
jobs' fences, sets an error code on them and signals them.
Also reject any new incoming job submissions from user space.
All this is done to notify the applications of the problem.

v2:
Extract amdgpu_amdkfd_pre/post_reset from amdgpu_device_lock/unlock_adev
Move amdgpu_job_stop_all_jobs_on_sched to amdgpu_job.c
Remove print param from amdgpu_ras_query_error_count

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c     |  4 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 46 +++++++++++++++++++++++-------
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  5 ++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    | 38 ++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.h    |  3 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c    |  6 ++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c    | 22 ++++++++++++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h    | 10 +++++++
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c      | 10 ++++---
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c      | 24 +++++++++-------
 drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c     |  5 ++++
 drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c     | 32 +++++++++++----------
 12 files changed, 163 insertions(+), 42 deletions(-)

Patch hide | download patch | download mbox

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 9da681e..300adb8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -38,6 +38,7 @@ 
 #include "amdgpu_gmc.h"
 #include "amdgpu_gem.h"
 #include "amdgpu_display.h"
+#include "amdgpu_ras.h"
 
 #if defined(HAVE_DRM_FREE_LARGE)
 #define kvfree drm_free_large
@@ -1461,6 +1462,9 @@  int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
 	bool reserved_buffers = false;
 	int i, r;
 
+	if (amdgpu_ras_intr_triggered())
+		return -EHWPOISON;
+
 	if (!adev->accel_working)
 		return -EBUSY;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index a5daccc..d3a078b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3727,25 +3727,18 @@  static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock)
 		adev->mp1_state = PP_MP1_STATE_NONE;
 		break;
 	}
-	/* Block kfd: SRIOV would do it separately */
-	if (!amdgpu_sriov_vf(adev))
-                amdgpu_amdkfd_pre_reset(adev);
 
 	return true;
 }
 
 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
 {
-	/*unlock kfd: SRIOV would do it separately */
-	if (!amdgpu_sriov_vf(adev))
-                amdgpu_amdkfd_post_reset(adev);
 	amdgpu_vf_error_trans_all(adev);
 	adev->mp1_state = PP_MP1_STATE_NONE;
 	adev->in_gpu_reset = 0;
 	mutex_unlock(&adev->lock_reset);
 }
 
-
 /**
  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
  *
@@ -3765,11 +3758,12 @@  int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	struct amdgpu_hive_info *hive = NULL;
 	struct amdgpu_device *tmp_adev = NULL;
 	int i, r = 0;
+	bool in_ras_intr = amdgpu_ras_intr_triggered();
 
 	need_full_reset = job_signaled = false;
 	INIT_LIST_HEAD(&device_list);
 
-	dev_info(adev->dev, "GPU reset begin!\n");
+	dev_info(adev->dev, "GPU %s begin!\n", in_ras_intr ? "jobs stop":"reset");
 
 	cancel_delayed_work_sync(&adev->delayed_init_work);
 
@@ -3796,9 +3790,16 @@  int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 		return 0;
 	}
 
+	/* Block kfd: SRIOV would do it separately */
+	if (!amdgpu_sriov_vf(adev))
+                amdgpu_amdkfd_pre_reset(adev);
+
 	/* Build list of devices to reset */
 	if  (adev->gmc.xgmi.num_physical_nodes > 1) {
 		if (!hive) {
+			/*unlock kfd: SRIOV would do it separately */
+			if (!amdgpu_sriov_vf(adev))
+		                amdgpu_amdkfd_post_reset(adev);
 			amdgpu_device_unlock_adev(adev);
 			return -ENODEV;
 		}
@@ -3824,7 +3825,7 @@  int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	/* block all schedulers and reset given job's ring */
 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
 		/* disable ras on ALL IPs */
-		if (amdgpu_device_ip_need_full_reset(tmp_adev))
+		if (!in_ras_intr && amdgpu_device_ip_need_full_reset(tmp_adev))
 			amdgpu_ras_suspend(tmp_adev);
 
 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
@@ -3834,10 +3835,26 @@  int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 				continue;
 
 			drm_sched_stop(&ring->sched, job ? &job->base : NULL);
+
+			if (in_ras_intr)
+				amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
 		}
 	}
 
 
+	if (in_ras_intr) {
+		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
+			if (tmp_adev == adev)
+				continue;
+
+			if (amdgpu_device_lock_adev(tmp_adev, false) && !amdgpu_sriov_vf(tmp_adev))
+				amdgpu_amdkfd_pre_reset(tmp_adev);
+
+		}
+
+		goto skip_sched_resume;
+	}
+
 	/*
 	 * Must check guilty signal here since after this point all old
 	 * HW fences are force signaled.
@@ -3872,7 +3889,9 @@  int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 		if (tmp_adev == adev)
 			continue;
 
-		amdgpu_device_lock_adev(tmp_adev, false);
+		if (amdgpu_device_lock_adev(tmp_adev, false) && !amdgpu_sriov_vf(tmp_adev))
+			amdgpu_amdkfd_pre_reset(tmp_adev);
+
 		r = amdgpu_device_pre_asic_reset(tmp_adev,
 						 NULL,
 						 &need_full_reset);
@@ -3900,6 +3919,7 @@  int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 
 	/* Post ASIC reset for all devs .*/
 	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
+
 		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
 			struct amdgpu_ring *ring = tmp_adev->rings[i];
 
@@ -3926,7 +3946,13 @@  int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 		} else {
 			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&adev->gpu_reset_counter));
 		}
+	}
 
+skip_sched_resume:
+	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
+		/*unlock kfd: SRIOV would do it separately */
+		if (!in_ras_intr && !amdgpu_sriov_vf(tmp_adev))
+	                amdgpu_amdkfd_post_reset(tmp_adev);
 		amdgpu_device_unlock_adev(tmp_adev);
 	}
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 151d7f2..757fd6d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -40,6 +40,8 @@ 
 
 #include "amdgpu_amdkfd.h"
 
+#include "amdgpu_ras.h"
+
 /*
  * KMS wrapper.
  * - 3.0.0 - initial driver
@@ -1179,6 +1181,9 @@  amdgpu_pci_shutdown(struct pci_dev *pdev)
 	struct drm_device *dev = pci_get_drvdata(pdev);
 	struct amdgpu_device *adev = dev->dev_private;
 
+	if (amdgpu_ras_intr_triggered())
+		return;
+
 	/* if we are running in a VM, make sure the device
 	 * torn down properly on reboot/shutdown.
 	 * unfortunately we can't detect certain
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 4d67b77..b12981e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -250,6 +250,44 @@  static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
 	return fence;
 }
 
+#define to_drm_sched_job(sched_job)		\
+		container_of((sched_job), struct drm_sched_job, queue_node)
+
+void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched)
+{
+	struct drm_sched_job *s_job;
+	struct drm_sched_entity *s_entity = NULL;
+	int i;
+
+	/* Signal all jobs not yet scheduled */
+	for (i = DRM_SCHED_PRIORITY_MAX - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
+		struct drm_sched_rq *rq = &sched->sched_rq[i];
+
+		if (!rq)
+			continue;
+
+		spin_lock(&rq->lock);
+		list_for_each_entry(s_entity, &rq->entities, list) {
+			while ((s_job = to_drm_sched_job(spsc_queue_pop(&s_entity->job_queue)))) {
+				struct drm_sched_fence *s_fence = s_job->s_fence;
+
+				dma_fence_signal(&s_fence->scheduled);
+				dma_fence_set_error(&s_fence->finished, -EHWPOISON);
+				dma_fence_signal(&s_fence->finished);
+			}
+		}
+		spin_unlock(&rq->lock);
+	}
+
+	/* Signal all jobs already scheduled to HW */
+	list_for_each_entry(s_job, &sched->ring_mirror_list, node) {
+		struct drm_sched_fence *s_fence = s_job->s_fence;
+
+		dma_fence_set_error(&s_fence->finished, -EHWPOISON);
+		dma_fence_signal(&s_fence->finished);
+	}
+}
+
 const struct drm_sched_backend_ops amdgpu_sched_ops = {
 	.dependency = amdgpu_job_dependency,
 	.run_job = amdgpu_job_run,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
index 51e6250..dc7ee93 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
@@ -76,4 +76,7 @@  int amdgpu_job_submit(struct amdgpu_job *job, struct drm_sched_entity *entity,
 		      void *owner, struct dma_fence **f);
 int amdgpu_job_submit_direct(struct amdgpu_job *job, struct amdgpu_ring *ring,
 			     struct dma_fence **fence);
+
+void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched);
+
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index da2143d..ced766c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -1046,6 +1046,12 @@  int amdgpu_driver_open_kms(struct drm_device *dev, struct drm_file *file_priv)
 	/* Ensure IB tests are run on ring */
 	flush_delayed_work(&adev->delayed_init_work);
 
+
+	if (amdgpu_ras_intr_triggered()) {
+		DRM_ERROR("RAS Intr triggered, device disabled!!");
+		return -EHWPOISON;
+	}
+
 	file_priv->driver_priv = NULL;
 
 	r = pm_runtime_get_sync(dev->dev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 2d5897a..7b00ac6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -24,6 +24,8 @@ 
 #include <linux/debugfs.h>
 #include <linux/list.h>
 #include <linux/module.h>
+#include <linux/reboot.h>
+#include <linux/syscalls.h>
 #include "amdgpu.h"
 #include "amdgpu_ras.h"
 #include "amdgpu_atomfirmware.h"
@@ -64,6 +66,9 @@  const char *ras_block_string[] = {
 /* inject address is 52 bits */
 #define	RAS_UMC_INJECT_ADDR_LIMIT	(0x1ULL << 52)
 
+
+atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);
+
 static int amdgpu_ras_reserve_vram(struct amdgpu_device *adev,
 		uint64_t offset, uint64_t size,
 		struct amdgpu_bo **bo_ptr);
@@ -188,6 +193,10 @@  static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
 
 	return 0;
 }
+
+static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
+		struct ras_common_if *head);
+
 /**
  * DOC: AMDGPU RAS debugfs control interface
  *
@@ -627,12 +636,14 @@  int amdgpu_ras_error_query(struct amdgpu_device *adev,
 	info->ue_count = obj->err_data.ue_count;
 	info->ce_count = obj->err_data.ce_count;
 
-	if (err_data.ce_count)
+	if (err_data.ce_count) {
 		dev_info(adev->dev, "%ld correctable errors detected in %s block\n",
 			 obj->err_data.ce_count, ras_block_str(info->head.block));
-	if (err_data.ue_count)
+	}
+	if (err_data.ue_count) {
 		dev_info(adev->dev, "%ld uncorrectable errors detected in %s block\n",
 			 obj->err_data.ue_count, ras_block_str(info->head.block));
+	}
 
 	return 0;
 }
@@ -1718,3 +1729,10 @@  int amdgpu_ras_fini(struct amdgpu_device *adev)
 
 	return 0;
 }
+
+void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
+{
+	if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
+		DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected! Stopping all GPU jobs.\n");
+	}
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 5a0df73..cf5ffb6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -600,4 +600,14 @@  int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
 
 int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
 		struct ras_dispatch_if *info);
+
+extern atomic_t amdgpu_ras_in_intr;
+
+static inline bool amdgpu_ras_intr_triggered(void)
+{
+	return !!atomic_read(&amdgpu_ras_in_intr);
+}
+
+void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev);
+
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index b2c86a0..e7a83f6 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -5669,10 +5669,12 @@  static int gfx_v9_0_process_ras_data_cb(struct amdgpu_device *adev,
 		struct amdgpu_iv_entry *entry)
 {
 	/* TODO ue will trigger an interrupt. */
-	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
-	if (adev->gfx.funcs->query_ras_error_count)
-		adev->gfx.funcs->query_ras_error_count(adev, err_data);
-	amdgpu_ras_reset_gpu(adev, 0);
+	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {
+		kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+		if (adev->gfx.funcs->query_ras_error_count)
+			adev->gfx.funcs->query_ras_error_count(adev, err_data);
+		amdgpu_ras_reset_gpu(adev, 0);
+	}
 	return AMDGPU_RAS_SUCCESS;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 43b4fbc..87a66c2 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -243,18 +243,20 @@  static int gmc_v9_0_process_ras_data_cb(struct amdgpu_device *adev,
 		struct ras_err_data *err_data,
 		struct amdgpu_iv_entry *entry)
 {
-	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
-	if (adev->umc.funcs->query_ras_error_count)
-		adev->umc.funcs->query_ras_error_count(adev, err_data);
-	/* umc query_ras_error_address is also responsible for clearing
-	 * error status
-	 */
-	if (adev->umc.funcs->query_ras_error_address)
-		adev->umc.funcs->query_ras_error_address(adev, err_data);
+	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {
+		kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+		if (adev->umc.funcs->query_ras_error_count)
+			adev->umc.funcs->query_ras_error_count(adev, err_data);
+		/* umc query_ras_error_address is also responsible for clearing
+		 * error status
+		 */
+		if (adev->umc.funcs->query_ras_error_address)
+			adev->umc.funcs->query_ras_error_address(adev, err_data);
 
-	/* only uncorrectable error needs gpu reset */
-	if (err_data->ue_count)
-		amdgpu_ras_reset_gpu(adev, 0);
+		/* only uncorrectable error needs gpu reset */
+		if (err_data->ue_count)
+			amdgpu_ras_reset_gpu(adev, 0);
+	}
 
 	return AMDGPU_RAS_SUCCESS;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
index 367f9d6..545990c 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
@@ -30,6 +30,7 @@ 
 #include "nbio/nbio_7_4_0_smn.h"
 #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
 #include <uapi/linux/kfd_ioctl.h>
+#include "amdgpu_ras.h"
 
 #define smnNBIF_MGCG_CTRL_LCLK	0x1013a21c
 
@@ -329,6 +330,8 @@  static void nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device
 						BIF_DOORBELL_INT_CNTL,
 						RAS_CNTLR_INTERRUPT_CLEAR, 1);
 		WREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl);
+
+		amdgpu_ras_global_ras_isr(adev);
 	}
 }
 
@@ -344,6 +347,8 @@  static void nbio_v7_4_handle_ras_err_event_athub_intr_no_bifring(struct amdgpu_d
 						BIF_DOORBELL_INT_CNTL,
 						RAS_ATHUB_ERR_EVENT_INTERRUPT_CLEAR, 1);
 		WREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl);
+
+		amdgpu_ras_global_ras_isr(adev);
 	}
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
index 956432f..438e504 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
@@ -1972,24 +1972,26 @@  static int sdma_v4_0_process_ras_data_cb(struct amdgpu_device *adev,
 	uint32_t err_source;
 	int instance;
 
-	instance = sdma_v4_0_irq_id_to_seq(entry->client_id);
-	if (instance < 0)
-		return 0;
+	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {
+		instance = sdma_v4_0_irq_id_to_seq(entry->client_id);
+		if (instance < 0)
+			return 0;
 
-	switch (entry->src_id) {
-	case SDMA0_4_0__SRCID__SDMA_SRAM_ECC:
-		err_source = 0;
-		break;
-	case SDMA0_4_0__SRCID__SDMA_ECC:
-		err_source = 1;
-		break;
-	default:
-		return 0;
-	}
+		switch (entry->src_id) {
+		case SDMA0_4_0__SRCID__SDMA_SRAM_ECC:
+			err_source = 0;
+			break;
+		case SDMA0_4_0__SRCID__SDMA_ECC:
+			err_source = 1;
+			break;
+		default:
+			return 0;
+		}
 
-	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+		kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
 
-	amdgpu_ras_reset_gpu(adev, 0);
+		amdgpu_ras_reset_gpu(adev, 0);
+	}
 
 	return AMDGPU_RAS_SUCCESS;
 }

Comments

On 2019-08-29 8:53 p.m., Andrey Grodzovsky wrote:
> Problem:

> Under certain conditions, when some IP bocks take a RAS error,

> we can get into a situation where a GPU reset is not possible

> due to issues in RAS in SMU/PSP.

>

> Temporary fix until proper solution in PSP/SMU is ready:

> When uncorrectable error happens the DF will unconditionally

> broadcast error event packets to all its clients/slave upon

> receiving fatal error event and freeze all its outbound queues,

> err_event_athub interrupt  will be triggered.

> In such case and we use this interrupt

> to issue GPU reset. THe GPU reset code is modified for such case to avoid HW

> reset, only stops schedulers, deatches all in progress and not yet scheduled

> job's fences, set error code on them and signals.

> Also reject any new incoming job submissions from user space.

> All this is done to notify the applications of the problem.

>

> v2:

> Extract amdgpu_amdkfd_pre/post_reset from amdgpu_device_lock/unlock_adev

> Move amdgpu_job_stop_all_jobs_on_sched to amdgpu_job.c

> Remove print param from amdgpu_ras_query_error_count

>

> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>

> ---

>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c     |  4 +++

>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 46 +++++++++++++++++++++++-------

>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  5 ++++

>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    | 38 ++++++++++++++++++++++++

>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.h    |  3 ++

>   drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c    |  6 ++++

>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c    | 22 ++++++++++++--

>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h    | 10 +++++++

>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c      | 10 ++++---

>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c      | 24 +++++++++-------

>   drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c     |  5 ++++

>   drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c     | 32 +++++++++++----------

>   12 files changed, 163 insertions(+), 42 deletions(-)

>

> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c

> index 9da681e..300adb8 100644

> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c

> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c

> @@ -38,6 +38,7 @@

>   #include "amdgpu_gmc.h"

>   #include "amdgpu_gem.h"

>   #include "amdgpu_display.h"

> +#include "amdgpu_ras.h"

>   

>   #if defined(HAVE_DRM_FREE_LARGE)

>   #define kvfree drm_free_large

> @@ -1461,6 +1462,9 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)

>   	bool reserved_buffers = false;

>   	int i, r;

>   

> +	if (amdgpu_ras_intr_triggered())

> +		return -EHWPOISON;

> +

>   	if (!adev->accel_working)

>   		return -EBUSY;

>   

> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

> index a5daccc..d3a078b 100644

> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

> @@ -3727,25 +3727,18 @@ static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock)

>   		adev->mp1_state = PP_MP1_STATE_NONE;

>   		break;

>   	}

> -	/* Block kfd: SRIOV would do it separately */

> -	if (!amdgpu_sriov_vf(adev))

> -                amdgpu_amdkfd_pre_reset(adev);

>   

>   	return true;

>   }

>   

>   static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)

>   {

> -	/*unlock kfd: SRIOV would do it separately */

> -	if (!amdgpu_sriov_vf(adev))

> -                amdgpu_amdkfd_post_reset(adev);

>   	amdgpu_vf_error_trans_all(adev);

>   	adev->mp1_state = PP_MP1_STATE_NONE;

>   	adev->in_gpu_reset = 0;

>   	mutex_unlock(&adev->lock_reset);

>   }

>   

> -

>   /**

>    * amdgpu_device_gpu_recover - reset the asic and recover scheduler

>    *

> @@ -3765,11 +3758,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,

>   	struct amdgpu_hive_info *hive = NULL;

>   	struct amdgpu_device *tmp_adev = NULL;

>   	int i, r = 0;

> +	bool in_ras_intr = amdgpu_ras_intr_triggered();

>   

>   	need_full_reset = job_signaled = false;

>   	INIT_LIST_HEAD(&device_list);

>   

> -	dev_info(adev->dev, "GPU reset begin!\n");

> +	dev_info(adev->dev, "GPU %s begin!\n", in_ras_intr ? "jobs stop":"reset");

>   

>   	cancel_delayed_work_sync(&adev->delayed_init_work);

>   

> @@ -3796,9 +3790,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,

>   		return 0;

>   	}

>   

> +	/* Block kfd: SRIOV would do it separately */

> +	if (!amdgpu_sriov_vf(adev))

> +                amdgpu_amdkfd_pre_reset(adev);

> +

>   	/* Build list of devices to reset */

>   	if  (adev->gmc.xgmi.num_physical_nodes > 1) {

>   		if (!hive) {

> +			/*unlock kfd: SRIOV would do it separately */

> +			if (!amdgpu_sriov_vf(adev))

> +		                amdgpu_amdkfd_post_reset(adev);

>   			amdgpu_device_unlock_adev(adev);

>   			return -ENODEV;

>   		}

> @@ -3824,7 +3825,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,

>   	/* block all schedulers and reset given job's ring */

>   	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {

>   		/* disable ras on ALL IPs */

> -		if (amdgpu_device_ip_need_full_reset(tmp_adev))

> +		if (!in_ras_intr && amdgpu_device_ip_need_full_reset(tmp_adev))

>   			amdgpu_ras_suspend(tmp_adev);

>   

>   		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {

> @@ -3834,10 +3835,26 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,

>   				continue;

>   

>   			drm_sched_stop(&ring->sched, job ? &job->base : NULL);

> +

> +			if (in_ras_intr)

> +				amdgpu_job_stop_all_jobs_on_sched(&ring->sched);

>   		}

>   	}

>   

>   

> +	if (in_ras_intr) {

> +		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {

> +			if (tmp_adev == adev)

> +				continue;

> +

> +			if (amdgpu_device_lock_adev(tmp_adev, false) && !amdgpu_sriov_vf(tmp_adev))

> +				amdgpu_amdkfd_pre_reset(tmp_adev);

> +

> +		}

> +

> +		goto skip_sched_resume;

> +	}

> +

>   	/*

>   	 * Must check guilty signal here since after this point all old

>   	 * HW fences are force signaled.

> @@ -3872,7 +3889,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,

>   		if (tmp_adev == adev)

>   			continue;

>   

> -		amdgpu_device_lock_adev(tmp_adev, false);

> +		if (amdgpu_device_lock_adev(tmp_adev, false) && !amdgpu_sriov_vf(tmp_adev))

> +			amdgpu_amdkfd_pre_reset(tmp_adev);

> +

>   		r = amdgpu_device_pre_asic_reset(tmp_adev,

>   						 NULL,

>   						 &need_full_reset);

> @@ -3900,6 +3919,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,

>   

>   	/* Post ASIC reset for all devs .*/

>   	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {

> +

>   		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {

>   			struct amdgpu_ring *ring = tmp_adev->rings[i];

>   

> @@ -3926,7 +3946,13 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,

>   		} else {

>   			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&adev->gpu_reset_counter));

>   		}

> +	}

>   

> +skip_sched_resume:

> +	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {

> +		/*unlock kfd: SRIOV would do it separately */

> +		if (!in_ras_intr && !amdgpu_sriov_vf(tmp_adev))

> +	                amdgpu_amdkfd_post_reset(tmp_adev);


I think this doesn't handle the case correctly that lock_adev failed. In 
that case pre_reset wasn't called, so you should not call post_reset either.

Regards,
   Felix


>   		amdgpu_device_unlock_adev(tmp_adev);

>   	}

>   

> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

> index 151d7f2..757fd6d 100644

> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

> @@ -40,6 +40,8 @@

>   

>   #include "amdgpu_amdkfd.h"

>   

> +#include "amdgpu_ras.h"

> +

>   /*

>    * KMS wrapper.

>    * - 3.0.0 - initial driver

> @@ -1179,6 +1181,9 @@ amdgpu_pci_shutdown(struct pci_dev *pdev)

>   	struct drm_device *dev = pci_get_drvdata(pdev);

>   	struct amdgpu_device *adev = dev->dev_private;

>   

> +	if (amdgpu_ras_intr_triggered())

> +		return;

> +

>   	/* if we are running in a VM, make sure the device

>   	 * torn down properly on reboot/shutdown.

>   	 * unfortunately we can't detect certain

> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c

> index 4d67b77..b12981e 100644

> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c

> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c

> @@ -250,6 +250,44 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)

>   	return fence;

>   }

>   

> +#define to_drm_sched_job(sched_job)		\

> +		container_of((sched_job), struct drm_sched_job, queue_node)

> +

> +void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched)

> +{

> +	struct drm_sched_job *s_job;

> +	struct drm_sched_entity *s_entity = NULL;

> +	int i;

> +

> +	/* Signal all jobs not yet scheduled */

> +	for (i = DRM_SCHED_PRIORITY_MAX - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {

> +		struct drm_sched_rq *rq = &sched->sched_rq[i];

> +

> +		if (!rq)

> +			continue;

> +

> +		spin_lock(&rq->lock);

> +		list_for_each_entry(s_entity, &rq->entities, list) {

> +			while ((s_job = to_drm_sched_job(spsc_queue_pop(&s_entity->job_queue)))) {

> +				struct drm_sched_fence *s_fence = s_job->s_fence;

> +

> +				dma_fence_signal(&s_fence->scheduled);

> +				dma_fence_set_error(&s_fence->finished, -EHWPOISON);

> +				dma_fence_signal(&s_fence->finished);

> +			}

> +		}

> +		spin_unlock(&rq->lock);

> +	}

> +

> +	/* Signal all jobs already scheduled to HW */

> +	list_for_each_entry(s_job, &sched->ring_mirror_list, node) {

> +		struct drm_sched_fence *s_fence = s_job->s_fence;

> +

> +		dma_fence_set_error(&s_fence->finished, -EHWPOISON);

> +		dma_fence_signal(&s_fence->finished);

> +	}

> +}

> +

>   const struct drm_sched_backend_ops amdgpu_sched_ops = {

>   	.dependency = amdgpu_job_dependency,

>   	.run_job = amdgpu_job_run,

> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h

> index 51e6250..dc7ee93 100644

> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h

> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h

> @@ -76,4 +76,7 @@ int amdgpu_job_submit(struct amdgpu_job *job, struct drm_sched_entity *entity,

>   		      void *owner, struct dma_fence **f);

>   int amdgpu_job_submit_direct(struct amdgpu_job *job, struct amdgpu_ring *ring,

>   			     struct dma_fence **fence);

> +

> +void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched);

> +

>   #endif

> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c

> index da2143d..ced766c 100644

> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c

> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c

> @@ -1046,6 +1046,12 @@ int amdgpu_driver_open_kms(struct drm_device *dev, struct drm_file *file_priv)

>   	/* Ensure IB tests are run on ring */

>   	flush_delayed_work(&adev->delayed_init_work);

>   

> +

> +	if (amdgpu_ras_intr_triggered()) {

> +		DRM_ERROR("RAS Intr triggered, device disabled!!");

> +		return -EHWPOISON;

> +	}

> +

>   	file_priv->driver_priv = NULL;

>   

>   	r = pm_runtime_get_sync(dev->dev);

> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

> index 2d5897a..7b00ac6 100644

> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

> @@ -24,6 +24,8 @@

>   #include <linux/debugfs.h>

>   #include <linux/list.h>

>   #include <linux/module.h>

> +#include <linux/reboot.h>

> +#include <linux/syscalls.h>

>   #include "amdgpu.h"

>   #include "amdgpu_ras.h"

>   #include "amdgpu_atomfirmware.h"

> @@ -64,6 +66,9 @@ const char *ras_block_string[] = {

>   /* inject address is 52 bits */

>   #define	RAS_UMC_INJECT_ADDR_LIMIT	(0x1ULL << 52)

>   

> +

> +atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);

> +

>   static int amdgpu_ras_reserve_vram(struct amdgpu_device *adev,

>   		uint64_t offset, uint64_t size,

>   		struct amdgpu_bo **bo_ptr);

> @@ -188,6 +193,10 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,

>   

>   	return 0;

>   }

> +

> +static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,

> +		struct ras_common_if *head);

> +

>   /**

>    * DOC: AMDGPU RAS debugfs control interface

>    *

> @@ -627,12 +636,14 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev,

>   	info->ue_count = obj->err_data.ue_count;

>   	info->ce_count = obj->err_data.ce_count;

>   

> -	if (err_data.ce_count)

> +	if (err_data.ce_count) {

>   		dev_info(adev->dev, "%ld correctable errors detected in %s block\n",

>   			 obj->err_data.ce_count, ras_block_str(info->head.block));

> -	if (err_data.ue_count)

> +	}

> +	if (err_data.ue_count) {

>   		dev_info(adev->dev, "%ld uncorrectable errors detected in %s block\n",

>   			 obj->err_data.ue_count, ras_block_str(info->head.block));

> +	}

>   

>   	return 0;

>   }

> @@ -1718,3 +1729,10 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)

>   

>   	return 0;

>   }

> +

> +void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)

> +{

> +	if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {

> +		DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected! Stopping all GPU jobs.\n");

> +	}

> +}

> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h

> index 5a0df73..cf5ffb6 100644

> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h

> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h

> @@ -600,4 +600,14 @@ int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,

>   

>   int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,

>   		struct ras_dispatch_if *info);

> +

> +extern atomic_t amdgpu_ras_in_intr;

> +

> +static inline bool amdgpu_ras_intr_triggered(void)

> +{

> +	return !!atomic_read(&amdgpu_ras_in_intr);

> +}

> +

> +void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev);

> +

>   #endif

> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c

> index b2c86a0..e7a83f6 100644

> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c

> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c

> @@ -5669,10 +5669,12 @@ static int gfx_v9_0_process_ras_data_cb(struct amdgpu_device *adev,

>   		struct amdgpu_iv_entry *entry)

>   {

>   	/* TODO ue will trigger an interrupt. */

> -	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);

> -	if (adev->gfx.funcs->query_ras_error_count)

> -		adev->gfx.funcs->query_ras_error_count(adev, err_data);

> -	amdgpu_ras_reset_gpu(adev, 0);

> +	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {

> +		kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);

> +		if (adev->gfx.funcs->query_ras_error_count)

> +			adev->gfx.funcs->query_ras_error_count(adev, err_data);

> +		amdgpu_ras_reset_gpu(adev, 0);

> +	}

>   	return AMDGPU_RAS_SUCCESS;

>   }

>   

> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c

> index 43b4fbc..87a66c2 100644

> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c

> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c

> @@ -243,18 +243,20 @@ static int gmc_v9_0_process_ras_data_cb(struct amdgpu_device *adev,

>   		struct ras_err_data *err_data,

>   		struct amdgpu_iv_entry *entry)

>   {

> -	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);

> -	if (adev->umc.funcs->query_ras_error_count)

> -		adev->umc.funcs->query_ras_error_count(adev, err_data);

> -	/* umc query_ras_error_address is also responsible for clearing

> -	 * error status

> -	 */

> -	if (adev->umc.funcs->query_ras_error_address)

> -		adev->umc.funcs->query_ras_error_address(adev, err_data);

> +	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {

> +		kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);

> +		if (adev->umc.funcs->query_ras_error_count)

> +			adev->umc.funcs->query_ras_error_count(adev, err_data);

> +		/* umc query_ras_error_address is also responsible for clearing

> +		 * error status

> +		 */

> +		if (adev->umc.funcs->query_ras_error_address)

> +			adev->umc.funcs->query_ras_error_address(adev, err_data);

>   

> -	/* only uncorrectable error needs gpu reset */

> -	if (err_data->ue_count)

> -		amdgpu_ras_reset_gpu(adev, 0);

> +		/* only uncorrectable error needs gpu reset */

> +		if (err_data->ue_count)

> +			amdgpu_ras_reset_gpu(adev, 0);

> +	}

>   

>   	return AMDGPU_RAS_SUCCESS;

>   }

> diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c

> index 367f9d6..545990c 100644

> --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c

> +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c

> @@ -30,6 +30,7 @@

>   #include "nbio/nbio_7_4_0_smn.h"

>   #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"

>   #include <uapi/linux/kfd_ioctl.h>

> +#include "amdgpu_ras.h"

>   

>   #define smnNBIF_MGCG_CTRL_LCLK	0x1013a21c

>   

> @@ -329,6 +330,8 @@ static void nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device

>   						BIF_DOORBELL_INT_CNTL,

>   						RAS_CNTLR_INTERRUPT_CLEAR, 1);

>   		WREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl);

> +

> +		amdgpu_ras_global_ras_isr(adev);

>   	}

>   }

>   

> @@ -344,6 +347,8 @@ static void nbio_v7_4_handle_ras_err_event_athub_intr_no_bifring(struct amdgpu_d

>   						BIF_DOORBELL_INT_CNTL,

>   						RAS_ATHUB_ERR_EVENT_INTERRUPT_CLEAR, 1);

>   		WREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl);

> +

> +		amdgpu_ras_global_ras_isr(adev);

>   	}

>   }

>   

> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c

> index 956432f..438e504 100644

> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c

> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c

> @@ -1972,24 +1972,26 @@ static int sdma_v4_0_process_ras_data_cb(struct amdgpu_device *adev,

>   	uint32_t err_source;

>   	int instance;

>   

> -	instance = sdma_v4_0_irq_id_to_seq(entry->client_id);

> -	if (instance < 0)

> -		return 0;

> +	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {

> +		instance = sdma_v4_0_irq_id_to_seq(entry->client_id);

> +		if (instance < 0)

> +			return 0;

>   

> -	switch (entry->src_id) {

> -	case SDMA0_4_0__SRCID__SDMA_SRAM_ECC:

> -		err_source = 0;

> -		break;

> -	case SDMA0_4_0__SRCID__SDMA_ECC:

> -		err_source = 1;

> -		break;

> -	default:

> -		return 0;

> -	}

> +		switch (entry->src_id) {

> +		case SDMA0_4_0__SRCID__SDMA_SRAM_ECC:

> +			err_source = 0;

> +			break;

> +		case SDMA0_4_0__SRCID__SDMA_ECC:

> +			err_source = 1;

> +			break;

> +		default:

> +			return 0;

> +		}

>   

> -	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);

> +		kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);

>   

> -	amdgpu_ras_reset_gpu(adev, 0);

> +		amdgpu_ras_reset_gpu(adev, 0);

> +	}

>   

>   	return AMDGPU_RAS_SUCCESS;

>   }
> -----Original Message-----

> From: Andrey Grodzovsky <andrey.grodzovsky@amd.com>

> Sent: 2019年8月30日 8:54

> To: amd-gfx@lists.freedesktop.org

> Cc: alexdeucher@gmail.com; Zhang, Hawking <Hawking.Zhang@amd.com>;

> ckoenig.leichtzumerken@gmail.com; Zhou1, Tao <Tao.Zhou1@amd.com>;

> Grodzovsky, Andrey <Andrey.Grodzovsky@amd.com>

> Subject: [PATCH v2 1/2] dmr/amdgpu: Avoid HW GPU reset for RAS.

> 

> Problem:

> Under certain conditions, when some IP bocks take a RAS error, we can get


[Tao] typo: "dmr/amdgpu" -> "drm/amdgpu", "IP bocks" -> "IP blocks". Also in the body: "THe GPU reset code" -> "The GPU reset code", "deatches" -> "detaches".

> into a situation where a GPU reset is not possible due to issues in RAS in

> SMU/PSP.

> 

> Temporary fix until proper solution in PSP/SMU is ready:

> When uncorrectable error happens the DF will unconditionally broadcast

> error event packets to all its clients/slave upon receiving fatal error event and

> freeze all its outbound queues, err_event_athub interrupt  will be triggered.

> In such case and we use this interrupt

> to issue GPU reset. THe GPU reset code is modified for such case to avoid HW

> reset, only stops schedulers, deatches all in progress and not yet scheduled

> job's fences, set error code on them and signals.

> Also reject any new incoming job submissions from user space.

> All this is done to notify the applications of the problem.

> 

> v2:

> Extract amdgpu_amdkfd_pre/post_reset from

> amdgpu_device_lock/unlock_adev Move

> amdgpu_job_stop_all_jobs_on_sched to amdgpu_job.c Remove print param

> from amdgpu_ras_query_error_count

> 

> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>

> ---

>  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c     |  4 +++

>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 46

> +++++++++++++++++++++++-------

>  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  5 ++++

>  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    | 38

> ++++++++++++++++++++++++

>  drivers/gpu/drm/amd/amdgpu/amdgpu_job.h    |  3 ++

>  drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c    |  6 ++++

>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c    | 22 ++++++++++++--

>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h    | 10 +++++++

>  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c      | 10 ++++---

>  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c      | 24 +++++++++-------

>  drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c     |  5 ++++

>  drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c     | 32 +++++++++++----------

>  12 files changed, 163 insertions(+), 42 deletions(-)

> 

> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c

> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c

> index 9da681e..300adb8 100644

> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c

> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c

> @@ -38,6 +38,7 @@

>  #include "amdgpu_gmc.h"

>  #include "amdgpu_gem.h"

>  #include "amdgpu_display.h"

> +#include "amdgpu_ras.h"

> 

>  #if defined(HAVE_DRM_FREE_LARGE)

>  #define kvfree drm_free_large

> @@ -1461,6 +1462,9 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void

> *data, struct drm_file *filp)

>  	bool reserved_buffers = false;

>  	int i, r;

> 

> +	if (amdgpu_ras_intr_triggered())

> +		return -EHWPOISON;

> +

>  	if (!adev->accel_working)

>  		return -EBUSY;

> 

> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

> index a5daccc..d3a078b 100644

> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

> @@ -3727,25 +3727,18 @@ static bool amdgpu_device_lock_adev(struct

> amdgpu_device *adev, bool trylock)

>  		adev->mp1_state = PP_MP1_STATE_NONE;

>  		break;

>  	}

> -	/* Block kfd: SRIOV would do it separately */

> -	if (!amdgpu_sriov_vf(adev))

> -                amdgpu_amdkfd_pre_reset(adev);

> 

>  	return true;

>  }

> 

>  static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)  {

> -	/*unlock kfd: SRIOV would do it separately */

> -	if (!amdgpu_sriov_vf(adev))

> -                amdgpu_amdkfd_post_reset(adev);

>  	amdgpu_vf_error_trans_all(adev);

>  	adev->mp1_state = PP_MP1_STATE_NONE;

>  	adev->in_gpu_reset = 0;

>  	mutex_unlock(&adev->lock_reset);

>  }

> 

> -

>  /**

>   * amdgpu_device_gpu_recover - reset the asic and recover scheduler

>   *

> @@ -3765,11 +3758,12 @@ int amdgpu_device_gpu_recover(struct

> amdgpu_device *adev,

>  	struct amdgpu_hive_info *hive = NULL;

>  	struct amdgpu_device *tmp_adev = NULL;

>  	int i, r = 0;

> +	bool in_ras_intr = amdgpu_ras_intr_triggered();

> 

>  	need_full_reset = job_signaled = false;

>  	INIT_LIST_HEAD(&device_list);

> 

> -	dev_info(adev->dev, "GPU reset begin!\n");

> +	dev_info(adev->dev, "GPU %s begin!\n", in_ras_intr ? "jobs

> +stop":"reset");

> 

>  	cancel_delayed_work_sync(&adev->delayed_init_work);

> 

> @@ -3796,9 +3790,16 @@ int amdgpu_device_gpu_recover(struct

> amdgpu_device *adev,

>  		return 0;

>  	}

> 

> +	/* Block kfd: SRIOV would do it separately */

> +	if (!amdgpu_sriov_vf(adev))

> +                amdgpu_amdkfd_pre_reset(adev);

> +

>  	/* Build list of devices to reset */

>  	if  (adev->gmc.xgmi.num_physical_nodes > 1) {

>  		if (!hive) {

> +			/*unlock kfd: SRIOV would do it separately */

> +			if (!amdgpu_sriov_vf(adev))

> +		                amdgpu_amdkfd_post_reset(adev);

>  			amdgpu_device_unlock_adev(adev);

>  			return -ENODEV;

>  		}

> @@ -3824,7 +3825,7 @@ int amdgpu_device_gpu_recover(struct

> amdgpu_device *adev,

>  	/* block all schedulers and reset given job's ring */

>  	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {

>  		/* disable ras on ALL IPs */

> -		if (amdgpu_device_ip_need_full_reset(tmp_adev))

> +		if (!in_ras_intr &&

> amdgpu_device_ip_need_full_reset(tmp_adev))

>  			amdgpu_ras_suspend(tmp_adev);

> 

>  		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { @@ -3834,10

> +3835,26 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,

>  				continue;

> 

>  			drm_sched_stop(&ring->sched, job ? &job->base :

> NULL);

> +

> +			if (in_ras_intr)

> +				amdgpu_job_stop_all_jobs_on_sched(&ring-

> >sched);

>  		}

>  	}

> 

> 

> +	if (in_ras_intr) {

> +		list_for_each_entry(tmp_adev, device_list_handle,

> gmc.xgmi.head) {

> +			if (tmp_adev == adev)

> +				continue;

> +

> +			if (amdgpu_device_lock_adev(tmp_adev, false)

> && !amdgpu_sriov_vf(tmp_adev))

> +				amdgpu_amdkfd_pre_reset(tmp_adev);

> +

> +		}

> +

> +		goto skip_sched_resume;

> +	}

> +

>  	/*

>  	 * Must check guilty signal here since after this point all old

>  	 * HW fences are force signaled.

> @@ -3872,7 +3889,9 @@ int amdgpu_device_gpu_recover(struct

> amdgpu_device *adev,

>  		if (tmp_adev == adev)

>  			continue;

> 

> -		amdgpu_device_lock_adev(tmp_adev, false);

> +		if (amdgpu_device_lock_adev(tmp_adev, false)

> && !amdgpu_sriov_vf(tmp_adev))

> +			amdgpu_amdkfd_pre_reset(tmp_adev);

> +

>  		r = amdgpu_device_pre_asic_reset(tmp_adev,

>  						 NULL,

>  						 &need_full_reset);

> @@ -3900,6 +3919,7 @@ int amdgpu_device_gpu_recover(struct

> amdgpu_device *adev,

> 

>  	/* Post ASIC reset for all devs .*/

>  	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {

> +

>  		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {

>  			struct amdgpu_ring *ring = tmp_adev->rings[i];

> 

> @@ -3926,7 +3946,13 @@ int amdgpu_device_gpu_recover(struct

> amdgpu_device *adev,

>  		} else {

>  			dev_info(tmp_adev->dev, "GPU reset(%d)

> succeeded!\n", atomic_read(&adev->gpu_reset_counter));

>  		}

> +	}

> 

> +skip_sched_resume:

> +	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {

> +		/*unlock kfd: SRIOV would do it separately */

> +		if (!in_ras_intr && !amdgpu_sriov_vf(tmp_adev))

> +	                amdgpu_amdkfd_post_reset(tmp_adev);

>  		amdgpu_device_unlock_adev(tmp_adev);

>  	}

> 

> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

> index 151d7f2..757fd6d 100644

> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

> @@ -40,6 +40,8 @@

> 

>  #include "amdgpu_amdkfd.h"

> 

> +#include "amdgpu_ras.h"

> +

>  /*

>   * KMS wrapper.

>   * - 3.0.0 - initial driver

> @@ -1179,6 +1181,9 @@ amdgpu_pci_shutdown(struct pci_dev *pdev)

>  	struct drm_device *dev = pci_get_drvdata(pdev);

>  	struct amdgpu_device *adev = dev->dev_private;

> 

> +	if (amdgpu_ras_intr_triggered())

> +		return;

> +

>  	/* if we are running in a VM, make sure the device

>  	 * torn down properly on reboot/shutdown.

>  	 * unfortunately we can't detect certain diff --git

> a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c

> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c

> index 4d67b77..b12981e 100644

> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c

> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c

> @@ -250,6 +250,44 @@ static struct dma_fence *amdgpu_job_run(struct

> drm_sched_job *sched_job)

>  	return fence;

>  }

> 

> +#define to_drm_sched_job(sched_job)		\

> +		container_of((sched_job), struct drm_sched_job,

> queue_node)

> +

> +void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler

> *sched)

> +{

> +	struct drm_sched_job *s_job;

> +	struct drm_sched_entity *s_entity = NULL;

> +	int i;

> +

> +	/* Signal all jobs not yet scheduled */

> +	for (i = DRM_SCHED_PRIORITY_MAX - 1; i >=

> DRM_SCHED_PRIORITY_MIN; i--) {

> +		struct drm_sched_rq *rq = &sched->sched_rq[i];

> +

> +		if (!rq)

> +			continue;

> +

> +		spin_lock(&rq->lock);

> +		list_for_each_entry(s_entity, &rq->entities, list) {

> +			while ((s_job =

> to_drm_sched_job(spsc_queue_pop(&s_entity->job_queue)))) {

> +				struct drm_sched_fence *s_fence = s_job-

> >s_fence;

> +

> +				dma_fence_signal(&s_fence->scheduled);

> +				dma_fence_set_error(&s_fence->finished, -

> EHWPOISON);

> +				dma_fence_signal(&s_fence->finished);

> +			}

> +		}

> +		spin_unlock(&rq->lock);

> +	}

> +

> +	/* Signal all jobs already scheduled to HW */

> +	list_for_each_entry(s_job, &sched->ring_mirror_list, node) {

> +		struct drm_sched_fence *s_fence = s_job->s_fence;

> +

> +		dma_fence_set_error(&s_fence->finished, -EHWPOISON);

> +		dma_fence_signal(&s_fence->finished);

> +	}

> +}

> +

>  const struct drm_sched_backend_ops amdgpu_sched_ops = {

>  	.dependency = amdgpu_job_dependency,

>  	.run_job = amdgpu_job_run,

> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h

> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h

> index 51e6250..dc7ee93 100644

> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h

> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h

> @@ -76,4 +76,7 @@ int amdgpu_job_submit(struct amdgpu_job *job, struct

> drm_sched_entity *entity,

>  		      void *owner, struct dma_fence **f);  int

> amdgpu_job_submit_direct(struct amdgpu_job *job, struct amdgpu_ring

> *ring,

>  			     struct dma_fence **fence);

> +

> +void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler

> +*sched);

> +

>  #endif

> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c

> b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c

> index da2143d..ced766c 100644

> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c

> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c

> @@ -1046,6 +1046,12 @@ int amdgpu_driver_open_kms(struct drm_device

> *dev, struct drm_file *file_priv)

>  	/* Ensure IB tests are run on ring */

>  	flush_delayed_work(&adev->delayed_init_work);

> 

> +

> +	if (amdgpu_ras_intr_triggered()) {

> +		DRM_ERROR("RAS Intr triggered, device disabled!!");

> +		return -EHWPOISON;

> +	}

> +

>  	file_priv->driver_priv = NULL;

> 

>  	r = pm_runtime_get_sync(dev->dev);

> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

> index 2d5897a..7b00ac6 100644

> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

> @@ -24,6 +24,8 @@

>  #include <linux/debugfs.h>

>  #include <linux/list.h>

>  #include <linux/module.h>

> +#include <linux/reboot.h>

> +#include <linux/syscalls.h>

>  #include "amdgpu.h"

>  #include "amdgpu_ras.h"

>  #include "amdgpu_atomfirmware.h"

> @@ -64,6 +66,9 @@ const char *ras_block_string[] = {

>  /* inject address is 52 bits */

>  #define	RAS_UMC_INJECT_ADDR_LIMIT	(0x1ULL << 52)

> 

> +

> +atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);

> +

>  static int amdgpu_ras_reserve_vram(struct amdgpu_device *adev,

>  		uint64_t offset, uint64_t size,

>  		struct amdgpu_bo **bo_ptr);

> @@ -188,6 +193,10 @@ static int

> amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,

> 

>  	return 0;

>  }

> +

> +static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device

> *adev,

> +		struct ras_common_if *head);

> +

>  /**

>   * DOC: AMDGPU RAS debugfs control interface

>   *

> @@ -627,12 +636,14 @@ int amdgpu_ras_error_query(struct

> amdgpu_device *adev,

>  	info->ue_count = obj->err_data.ue_count;

>  	info->ce_count = obj->err_data.ce_count;

> 

> -	if (err_data.ce_count)

> +	if (err_data.ce_count) {

>  		dev_info(adev->dev, "%ld correctable errors detected in %s

> block\n",

>  			 obj->err_data.ce_count, ras_block_str(info-

> >head.block));

> -	if (err_data.ue_count)

> +	}

> +	if (err_data.ue_count) {

>  		dev_info(adev->dev, "%ld uncorrectable errors detected

> in %s block\n",

>  			 obj->err_data.ue_count, ras_block_str(info-

> >head.block));

> +	}

> 

>  	return 0;

>  }

> @@ -1718,3 +1729,10 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)

> 

>  	return 0;

>  }

> +

> +void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) {

> +	if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {

> +		DRM_WARN("RAS event of type

> ERREVENT_ATHUB_INTERRUPT detected! Stopping all GPU jobs.\n");

> +	}

> +}

> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h

> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h

> index 5a0df73..cf5ffb6 100644

> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h

> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h

> @@ -600,4 +600,14 @@ int amdgpu_ras_interrupt_remove_handler(struct

> amdgpu_device *adev,

> 

>  int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,

>  		struct ras_dispatch_if *info);

> +

> +extern atomic_t amdgpu_ras_in_intr;

> +

> +static inline bool amdgpu_ras_intr_triggered(void) {

> +	return !!atomic_read(&amdgpu_ras_in_intr);

> +}

> +

> +void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev);

> +

>  #endif

> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c

> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c

> index b2c86a0..e7a83f6 100644

> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c

> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c

> @@ -5669,10 +5669,12 @@ static int gfx_v9_0_process_ras_data_cb(struct

> amdgpu_device *adev,

>  		struct amdgpu_iv_entry *entry)

>  {

>  	/* TODO ue will trigger an interrupt. */

> -	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);

> -	if (adev->gfx.funcs->query_ras_error_count)

> -		adev->gfx.funcs->query_ras_error_count(adev, err_data);

> -	amdgpu_ras_reset_gpu(adev, 0);

> +	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {

> +		kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);

> +		if (adev->gfx.funcs->query_ras_error_count)

> +			adev->gfx.funcs->query_ras_error_count(adev,

> err_data);

> +		amdgpu_ras_reset_gpu(adev, 0);

> +	}

>  	return AMDGPU_RAS_SUCCESS;

>  }

> 

> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c

> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c

> index 43b4fbc..87a66c2 100644

> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c

> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c

> @@ -243,18 +243,20 @@ static int gmc_v9_0_process_ras_data_cb(struct

> amdgpu_device *adev,

>  		struct ras_err_data *err_data,

>  		struct amdgpu_iv_entry *entry)

>  {

> -	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);

> -	if (adev->umc.funcs->query_ras_error_count)

> -		adev->umc.funcs->query_ras_error_count(adev, err_data);

> -	/* umc query_ras_error_address is also responsible for clearing

> -	 * error status

> -	 */

> -	if (adev->umc.funcs->query_ras_error_address)

> -		adev->umc.funcs->query_ras_error_address(adev, err_data);

> +	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {

[Tao] A comment explaining this if condition is recommended. Also, since this is the UMC/GMC handler, should the check be against AMDGPU_RAS_BLOCK__UMC rather than AMDGPU_RAS_BLOCK__GFX? The same AMDGPU_RAS_BLOCK__GFX check appears copied into sdma_v4_0_process_ras_data_cb below, where AMDGPU_RAS_BLOCK__SDMA looks intended.

> +		kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);

> +		if (adev->umc.funcs->query_ras_error_count)

> +			adev->umc.funcs->query_ras_error_count(adev,

> err_data);

> +		/* umc query_ras_error_address is also responsible for

> clearing

> +		 * error status

> +		 */

> +		if (adev->umc.funcs->query_ras_error_address)

> +			adev->umc.funcs->query_ras_error_address(adev,

> err_data);

> 

> -	/* only uncorrectable error needs gpu reset */

> -	if (err_data->ue_count)

> -		amdgpu_ras_reset_gpu(adev, 0);

> +		/* only uncorrectable error needs gpu reset */

> +		if (err_data->ue_count)

> +			amdgpu_ras_reset_gpu(adev, 0);

> +	}

> 

>  	return AMDGPU_RAS_SUCCESS;

>  }

> diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c

> b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c

> index 367f9d6..545990c 100644

> --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c

> +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c

> @@ -30,6 +30,7 @@

>  #include "nbio/nbio_7_4_0_smn.h"

>  #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"

>  #include <uapi/linux/kfd_ioctl.h>

> +#include "amdgpu_ras.h"

> 

>  #define smnNBIF_MGCG_CTRL_LCLK	0x1013a21c

> 

> @@ -329,6 +330,8 @@ static void

> nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device

>  						BIF_DOORBELL_INT_CNTL,

> 

> 	RAS_CNTLR_INTERRUPT_CLEAR, 1);

>  		WREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL,

> bif_doorbell_intr_cntl);

> +

> +		amdgpu_ras_global_ras_isr(adev);

>  	}

>  }

> 

> @@ -344,6 +347,8 @@ static void

> nbio_v7_4_handle_ras_err_event_athub_intr_no_bifring(struct amdgpu_d

>  						BIF_DOORBELL_INT_CNTL,

> 

> 	RAS_ATHUB_ERR_EVENT_INTERRUPT_CLEAR, 1);

>  		WREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL,

> bif_doorbell_intr_cntl);

> +

> +		amdgpu_ras_global_ras_isr(adev);

>  	}

>  }

> 

> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c

> b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c

> index 956432f..438e504 100644

> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c

> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c

> @@ -1972,24 +1972,26 @@ static int

> sdma_v4_0_process_ras_data_cb(struct amdgpu_device *adev,

>  	uint32_t err_source;

>  	int instance;

> 

> -	instance = sdma_v4_0_irq_id_to_seq(entry->client_id);

> -	if (instance < 0)

> -		return 0;

> +	if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {

> +		instance = sdma_v4_0_irq_id_to_seq(entry->client_id);

> +		if (instance < 0)

> +			return 0;

> 

> -	switch (entry->src_id) {

> -	case SDMA0_4_0__SRCID__SDMA_SRAM_ECC:

> -		err_source = 0;

> -		break;

> -	case SDMA0_4_0__SRCID__SDMA_ECC:

> -		err_source = 1;

> -		break;

> -	default:

> -		return 0;

> -	}

> +		switch (entry->src_id) {

> +		case SDMA0_4_0__SRCID__SDMA_SRAM_ECC:

> +			err_source = 0;

> +			break;

> +		case SDMA0_4_0__SRCID__SDMA_ECC:

> +			err_source = 1;

> +			break;

> +		default:

> +			return 0;

> +		}

> 

> -	kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);

> +		kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);

> 

> -	amdgpu_ras_reset_gpu(adev, 0);

> +		amdgpu_ras_reset_gpu(adev, 0);

> +	}

> 

>  	return AMDGPU_RAS_SUCCESS;

>  }

> --

> 2.7.4