[1/1] drm/amdkfd: Consistently apply noretry setting

Submitted by Kuehling, Felix on June 22, 2019, 12:20 a.m.

Details

Message ID 20190622001959.32290-1-Felix.Kuehling@amd.com
State New
Headers show
Series "Series without cover letter" ( rev: 1 ) in AMD X.Org drivers

Not browsing as part of any series.

Commit Message

Kuehling, Felix June 22, 2019, 12:20 a.m.
Apply the same setting to SH_MEM_CONFIG and VM_CONTEXT1_CNTL. This
makes the noretry param no longer KFD-specific. On GFX10 I'm not
changing SH_MEM_CONFIG in this commit because GFX10 has different
retry behaviour in the SQ and I don't have a way to test it at the
moment.

Suggested-by: Christian König <Christian.Koenig@amd.com>
CC: Philip Yang <Philip.Yang@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>

---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h              |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c          | 16 +++++-----------
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c            |  4 ++++
 drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c         |  3 ++-
 drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c         |  3 ++-
 drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c          |  3 ++-
 drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c          |  3 ++-
 .../drm/amd/amdkfd/kfd_device_queue_manager_v9.c |  2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h            |  2 +-
 9 files changed, 20 insertions(+), 17 deletions(-)

-- 
2.17.1

Patch hide | download patch | download mbox

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 9b1efdf94bdf..05875279c09e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -164,6 +164,7 @@  extern int amdgpu_async_gfx_ring;
 extern int amdgpu_mcbp;
 extern int amdgpu_discovery;
 extern int amdgpu_mes;
+extern int amdgpu_noretry;
 
 #ifdef CONFIG_DRM_AMDGPU_SI
 extern int amdgpu_si_support;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 7cf6ab07b113..0d578d95be93 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -140,6 +140,7 @@  int amdgpu_async_gfx_ring = 1;
 int amdgpu_mcbp = 0;
 int amdgpu_discovery = 0;
 int amdgpu_mes = 0;
+int amdgpu_noretry;
 
 struct amdgpu_mgpu_info mgpu_info = {
 	.mutex = __MUTEX_INITIALIZER(mgpu_info.mutex),
@@ -591,6 +592,10 @@  MODULE_PARM_DESC(mes,
 	"Enable Micro Engine Scheduler (0 = disabled (default), 1 = enabled)");
 module_param_named(mes, amdgpu_mes, int, 0444);
 
+MODULE_PARM_DESC(noretry,
+	"Disable retry faults (0 = retry enabled (default), 1 = retry disabled)");
+module_param_named(noretry, amdgpu_noretry, int, 0644);
+
 #ifdef CONFIG_HSA_AMD
 /**
  * DOC: sched_policy (int)
@@ -666,17 +671,6 @@  module_param(ignore_crat, int, 0444);
 MODULE_PARM_DESC(ignore_crat,
 	"Ignore CRAT table during KFD initialization (0 = use CRAT (default), 1 = ignore CRAT)");
 
-/**
- * DOC: noretry (int)
- * This parameter sets sh_mem_config.retry_disable. Default value, 0, enables retry.
- * Setting 1 disables retry.
- * Retry is needed for recoverable page faults.
- */
-int noretry;
-module_param(noretry, int, 0644);
-MODULE_PARM_DESC(noretry,
-	"Set sh_mem_config.retry_disable on Vega10 (0 = retry enabled (default), 1 = retry disabled)");
-
 /**
  * DOC: halt_if_hws_hang (int)
  * Halt if HWS hang is detected. Default value, 0, disables the halt on hang.
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index e0f3014e76ea..c4e715170bfe 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -1938,11 +1938,15 @@  static void gfx_v9_0_constants_init(struct amdgpu_device *adev)
 		if (i == 0) {
 			tmp = REG_SET_FIELD(0, SH_MEM_CONFIG, ALIGNMENT_MODE,
 					    SH_MEM_ALIGNMENT_MODE_UNALIGNED);
+			tmp = REG_SET_FIELD(tmp, SH_MEM_CONFIG, RETRY_DISABLE,
+					    !!amdgpu_noretry);
 			WREG32_SOC15_RLC(GC, 0, mmSH_MEM_CONFIG, tmp);
 			WREG32_SOC15_RLC(GC, 0, mmSH_MEM_BASES, 0);
 		} else {
 			tmp = REG_SET_FIELD(0, SH_MEM_CONFIG, ALIGNMENT_MODE,
 					    SH_MEM_ALIGNMENT_MODE_UNALIGNED);
+			tmp = REG_SET_FIELD(tmp, SH_MEM_CONFIG, RETRY_DISABLE,
+					    !!amdgpu_noretry);
 			WREG32_SOC15_RLC(GC, 0, mmSH_MEM_CONFIG, tmp);
 			tmp = REG_SET_FIELD(0, SH_MEM_BASES, PRIVATE_BASE,
 				(adev->gmc.private_aperture_start >> 48));
diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c
index 9f0f189fc111..15986748f59f 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c
@@ -236,7 +236,8 @@  static void gfxhub_v1_0_setup_vmid_config(struct amdgpu_device *adev)
 				    block_size);
 		/* Send no-retry XNACK on fault to suppress VM fault storm. */
 		tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL,
-				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, 1);
+				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT,
+				    !amdgpu_noretry);
 		WREG32_SOC15_OFFSET(GC, 0, mmVM_CONTEXT1_CNTL, i, tmp);
 		WREG32_SOC15_OFFSET(GC, 0, mmVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32, i*2, 0);
 		WREG32_SOC15_OFFSET(GC, 0, mmVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32, i*2, 0);
diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c b/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c
index b7de60a15623..d605b4963f8a 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c
@@ -215,7 +215,8 @@  static void gfxhub_v2_0_setup_vmid_config(struct amdgpu_device *adev)
 				adev->vm_manager.block_size - 9);
 		/* Send no-retry XNACK on fault to suppress VM fault storm. */
 		tmp = REG_SET_FIELD(tmp, GCVM_CONTEXT1_CNTL,
-				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, 0);
+				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT,
+				    !amdgpu_noretry);
 		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_CNTL, i, tmp);
 		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32, i*2, 0);
 		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32, i*2, 0);
diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
index 05d1d448c8f5..dc5ce03034d3 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c
@@ -265,7 +265,8 @@  static void mmhub_v1_0_setup_vmid_config(struct amdgpu_device *adev)
 				    block_size);
 		/* Send no-retry XNACK on fault to suppress VM fault storm. */
 		tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL,
-				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, 1);
+				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT,
+				    !amdgpu_noretry);
 		WREG32_SOC15_OFFSET(MMHUB, 0, mmVM_CONTEXT1_CNTL, i, tmp);
 		WREG32_SOC15_OFFSET(MMHUB, 0, mmVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32, i*2, 0);
 		WREG32_SOC15_OFFSET(MMHUB, 0, mmVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32, i*2, 0);
diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c
index 37a1a318ae63..0f9549f19ade 100644
--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c
@@ -205,7 +205,8 @@  static void mmhub_v2_0_setup_vmid_config(struct amdgpu_device *adev)
 				    adev->vm_manager.block_size - 9);
 		/* Send no-retry XNACK on fault to suppress VM fault storm. */
 		tmp = REG_SET_FIELD(tmp, MMVM_CONTEXT1_CNTL,
-				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, 0);
+				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT,
+				    !amdgpu_noretry);
 		WREG32_SOC15_OFFSET(MMHUB, 0, mmMMVM_CONTEXT1_CNTL, i, tmp);
 		WREG32_SOC15_OFFSET(MMHUB, 0, mmMMVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32, i*2, 0);
 		WREG32_SOC15_OFFSET(MMHUB, 0, mmMMVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32, i*2, 0);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c
index e9fe39382371..95a82ac455f2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c
@@ -61,7 +61,7 @@  static int update_qpd_v9(struct device_queue_manager *dqm,
 		qpd->sh_mem_config =
 				SH_MEM_ALIGNMENT_MODE_UNALIGNED <<
 					SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT;
-		if (noretry &&
+		if (amdgpu_noretry &&
 		    !dqm->dev->device_info->needs_iommu_device)
 			qpd->sh_mem_config |=
 				1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index d4bba0124d29..aa7bf20d20f8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -157,7 +157,7 @@  extern int ignore_crat;
 /*
  * Set sh_mem_config.retry_disable on Vega10
  */
-extern int noretry;
+extern int amdgpu_noretry;
 
 /*
  * Halt if HWS hang is detected

Comments

Ping.

Christian, Philip, any opinion about this patch?

On 2019-06-21 8:20 p.m., Kuehling, Felix wrote:
> Apply the same setting to SH_MEM_CONFIG and VM_CONTEXT1_CNTL. This

> makes the noretry param no longer KFD-specific. On GFX10 I'm not

> changing SH_MEM_CONFIG in this commit because GFX10 has different

> retry behaviour in the SQ and I don't have a way to test it at the

> moment.

>

> Suggested-by: Christian König <Christian.Koenig@amd.com>

> CC: Philip Yang <Philip.Yang@amd.com>

> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>

> ---

>   drivers/gpu/drm/amd/amdgpu/amdgpu.h              |  1 +

>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c          | 16 +++++-----------

>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c            |  4 ++++

>   drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c         |  3 ++-

>   drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c         |  3 ++-

>   drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c          |  3 ++-

>   drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c          |  3 ++-

>   .../drm/amd/amdkfd/kfd_device_queue_manager_v9.c |  2 +-

>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h            |  2 +-

>   9 files changed, 20 insertions(+), 17 deletions(-)

>

> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

> index 9b1efdf94bdf..05875279c09e 100644

> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h

> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

> @@ -164,6 +164,7 @@ extern int amdgpu_async_gfx_ring;

>   extern int amdgpu_mcbp;

>   extern int amdgpu_discovery;

>   extern int amdgpu_mes;

> +extern int amdgpu_noretry;

>   

>   #ifdef CONFIG_DRM_AMDGPU_SI

>   extern int amdgpu_si_support;

> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

> index 7cf6ab07b113..0d578d95be93 100644

> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

> @@ -140,6 +140,7 @@ int amdgpu_async_gfx_ring = 1;

>   int amdgpu_mcbp = 0;

>   int amdgpu_discovery = 0;

>   int amdgpu_mes = 0;

> +int amdgpu_noretry;

>   

>   struct amdgpu_mgpu_info mgpu_info = {

>   	.mutex = __MUTEX_INITIALIZER(mgpu_info.mutex),

> @@ -591,6 +592,10 @@ MODULE_PARM_DESC(mes,

>   	"Enable Micro Engine Scheduler (0 = disabled (default), 1 = enabled)");

>   module_param_named(mes, amdgpu_mes, int, 0444);

>   

> +MODULE_PARM_DESC(noretry,

> +	"Disable retry faults (0 = retry enabled (default), 1 = retry disabled)");

> +module_param_named(noretry, amdgpu_noretry, int, 0644);

> +

>   #ifdef CONFIG_HSA_AMD

>   /**

>    * DOC: sched_policy (int)

> @@ -666,17 +671,6 @@ module_param(ignore_crat, int, 0444);

>   MODULE_PARM_DESC(ignore_crat,

>   	"Ignore CRAT table during KFD initialization (0 = use CRAT (default), 1 = ignore CRAT)");

>   

> -/**

> - * DOC: noretry (int)

> - * This parameter sets sh_mem_config.retry_disable. Default value, 0, enables retry.

> - * Setting 1 disables retry.

> - * Retry is needed for recoverable page faults.

> - */

> -int noretry;

> -module_param(noretry, int, 0644);

> -MODULE_PARM_DESC(noretry,

> -	"Set sh_mem_config.retry_disable on Vega10 (0 = retry enabled (default), 1 = retry disabled)");

> -

>   /**

>    * DOC: halt_if_hws_hang (int)

>    * Halt if HWS hang is detected. Default value, 0, disables the halt on hang.

> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c

> index e0f3014e76ea..c4e715170bfe 100644

> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c

> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c

> @@ -1938,11 +1938,15 @@ static void gfx_v9_0_constants_init(struct amdgpu_device *adev)

>   		if (i == 0) {

>   			tmp = REG_SET_FIELD(0, SH_MEM_CONFIG, ALIGNMENT_MODE,

>   					    SH_MEM_ALIGNMENT_MODE_UNALIGNED);

> +			tmp = REG_SET_FIELD(tmp, SH_MEM_CONFIG, RETRY_DISABLE,

> +					    !!amdgpu_noretry);

>   			WREG32_SOC15_RLC(GC, 0, mmSH_MEM_CONFIG, tmp);

>   			WREG32_SOC15_RLC(GC, 0, mmSH_MEM_BASES, 0);

>   		} else {

>   			tmp = REG_SET_FIELD(0, SH_MEM_CONFIG, ALIGNMENT_MODE,

>   					    SH_MEM_ALIGNMENT_MODE_UNALIGNED);

> +			tmp = REG_SET_FIELD(tmp, SH_MEM_CONFIG, RETRY_DISABLE,

> +					    !!amdgpu_noretry);

>   			WREG32_SOC15_RLC(GC, 0, mmSH_MEM_CONFIG, tmp);

>   			tmp = REG_SET_FIELD(0, SH_MEM_BASES, PRIVATE_BASE,

>   				(adev->gmc.private_aperture_start >> 48));

> diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c

> index 9f0f189fc111..15986748f59f 100644

> --- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c

> +++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c

> @@ -236,7 +236,8 @@ static void gfxhub_v1_0_setup_vmid_config(struct amdgpu_device *adev)

>   				    block_size);

>   		/* Send no-retry XNACK on fault to suppress VM fault storm. */

>   		tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL,

> -				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, 1);

> +				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT,

> +				    !amdgpu_noretry);

>   		WREG32_SOC15_OFFSET(GC, 0, mmVM_CONTEXT1_CNTL, i, tmp);

>   		WREG32_SOC15_OFFSET(GC, 0, mmVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32, i*2, 0);

>   		WREG32_SOC15_OFFSET(GC, 0, mmVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32, i*2, 0);

> diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c b/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c

> index b7de60a15623..d605b4963f8a 100644

> --- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c

> +++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c

> @@ -215,7 +215,8 @@ static void gfxhub_v2_0_setup_vmid_config(struct amdgpu_device *adev)

>   				adev->vm_manager.block_size - 9);

>   		/* Send no-retry XNACK on fault to suppress VM fault storm. */

>   		tmp = REG_SET_FIELD(tmp, GCVM_CONTEXT1_CNTL,

> -				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, 0);

> +				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT,

> +				    !amdgpu_noretry);

>   		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_CNTL, i, tmp);

>   		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32, i*2, 0);

>   		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32, i*2, 0);

> diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c

> index 05d1d448c8f5..dc5ce03034d3 100644

> --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c

> +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c

> @@ -265,7 +265,8 @@ static void mmhub_v1_0_setup_vmid_config(struct amdgpu_device *adev)

>   				    block_size);

>   		/* Send no-retry XNACK on fault to suppress VM fault storm. */

>   		tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL,

> -				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, 1);

> +				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT,

> +				    !amdgpu_noretry);

>   		WREG32_SOC15_OFFSET(MMHUB, 0, mmVM_CONTEXT1_CNTL, i, tmp);

>   		WREG32_SOC15_OFFSET(MMHUB, 0, mmVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32, i*2, 0);

>   		WREG32_SOC15_OFFSET(MMHUB, 0, mmVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32, i*2, 0);

> diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c

> index 37a1a318ae63..0f9549f19ade 100644

> --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c

> +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c

> @@ -205,7 +205,8 @@ static void mmhub_v2_0_setup_vmid_config(struct amdgpu_device *adev)

>   				    adev->vm_manager.block_size - 9);

>   		/* Send no-retry XNACK on fault to suppress VM fault storm. */

>   		tmp = REG_SET_FIELD(tmp, MMVM_CONTEXT1_CNTL,

> -				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, 0);

> +				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT,

> +				    !amdgpu_noretry);

>   		WREG32_SOC15_OFFSET(MMHUB, 0, mmMMVM_CONTEXT1_CNTL, i, tmp);

>   		WREG32_SOC15_OFFSET(MMHUB, 0, mmMMVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32, i*2, 0);

>   		WREG32_SOC15_OFFSET(MMHUB, 0, mmMMVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32, i*2, 0);

> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c

> index e9fe39382371..95a82ac455f2 100644

> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c

> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c

> @@ -61,7 +61,7 @@ static int update_qpd_v9(struct device_queue_manager *dqm,

>   		qpd->sh_mem_config =

>   				SH_MEM_ALIGNMENT_MODE_UNALIGNED <<

>   					SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT;

> -		if (noretry &&

> +		if (amdgpu_noretry &&

>   		    !dqm->dev->device_info->needs_iommu_device)

>   			qpd->sh_mem_config |=

>   				1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT;

> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h

> index d4bba0124d29..aa7bf20d20f8 100644

> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h

> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h

> @@ -157,7 +157,7 @@ extern int ignore_crat;

>   /*

>    * Set sh_mem_config.retry_disable on Vega10

>    */

> -extern int noretry;

> +extern int amdgpu_noretry;

>   

>   /*

>    * Halt if HWS hang is detected
amdgpu_noretry default value is 0, this will generate VM fault storm 
because the vm fault is not recovered. It may slow down the machine and 
need reboot after application VM fault. Maybe change default value to 1?

Other than that, this is reviewed by Philip Yang <Philip.Yang@amd.com>

On 2019-07-02 3:05 p.m., Kuehling, Felix wrote:
> Ping.

> 

> Christian, Philip, any opinion about this patch?

> 

> On 2019-06-21 8:20 p.m., Kuehling, Felix wrote:

>> Apply the same setting to SH_MEM_CONFIG and VM_CONTEXT1_CNTL. This

>> makes the noretry param no longer KFD-specific. On GFX10 I'm not

>> changing SH_MEM_CONFIG in this commit because GFX10 has different

>> retry behaviour in the SQ and I don't have a way to test it at the

>> moment.

>>

>> Suggested-by: Christian König <Christian.Koenig@amd.com>

>> CC: Philip Yang <Philip.Yang@amd.com>

>> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>

>> ---

>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h              |  1 +

>>    drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c          | 16 +++++-----------

>>    drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c            |  4 ++++

>>    drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c         |  3 ++-

>>    drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c         |  3 ++-

>>    drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c          |  3 ++-

>>    drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c          |  3 ++-

>>    .../drm/amd/amdkfd/kfd_device_queue_manager_v9.c |  2 +-

>>    drivers/gpu/drm/amd/amdkfd/kfd_priv.h            |  2 +-

>>    9 files changed, 20 insertions(+), 17 deletions(-)

>>

>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

>> index 9b1efdf94bdf..05875279c09e 100644

>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h

>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

>> @@ -164,6 +164,7 @@ extern int amdgpu_async_gfx_ring;

>>    extern int amdgpu_mcbp;

>>    extern int amdgpu_discovery;

>>    extern int amdgpu_mes;

>> +extern int amdgpu_noretry;

>>    

>>    #ifdef CONFIG_DRM_AMDGPU_SI

>>    extern int amdgpu_si_support;

>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

>> index 7cf6ab07b113..0d578d95be93 100644

>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

>> @@ -140,6 +140,7 @@ int amdgpu_async_gfx_ring = 1;

>>    int amdgpu_mcbp = 0;

>>    int amdgpu_discovery = 0;

>>    int amdgpu_mes = 0;

>> +int amdgpu_noretry;

>>    

>>    struct amdgpu_mgpu_info mgpu_info = {

>>    	.mutex = __MUTEX_INITIALIZER(mgpu_info.mutex),

>> @@ -591,6 +592,10 @@ MODULE_PARM_DESC(mes,

>>    	"Enable Micro Engine Scheduler (0 = disabled (default), 1 = enabled)");

>>    module_param_named(mes, amdgpu_mes, int, 0444);

>>    

>> +MODULE_PARM_DESC(noretry,

>> +	"Disable retry faults (0 = retry enabled (default), 1 = retry disabled)");

>> +module_param_named(noretry, amdgpu_noretry, int, 0644);

>> +

>>    #ifdef CONFIG_HSA_AMD

>>    /**

>>     * DOC: sched_policy (int)

>> @@ -666,17 +671,6 @@ module_param(ignore_crat, int, 0444);

>>    MODULE_PARM_DESC(ignore_crat,

>>    	"Ignore CRAT table during KFD initialization (0 = use CRAT (default), 1 = ignore CRAT)");

>>    

>> -/**

>> - * DOC: noretry (int)

>> - * This parameter sets sh_mem_config.retry_disable. Default value, 0, enables retry.

>> - * Setting 1 disables retry.

>> - * Retry is needed for recoverable page faults.

>> - */

>> -int noretry;

>> -module_param(noretry, int, 0644);

>> -MODULE_PARM_DESC(noretry,

>> -	"Set sh_mem_config.retry_disable on Vega10 (0 = retry enabled (default), 1 = retry disabled)");

>> -

>>    /**

>>     * DOC: halt_if_hws_hang (int)

>>     * Halt if HWS hang is detected. Default value, 0, disables the halt on hang.

>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c

>> index e0f3014e76ea..c4e715170bfe 100644

>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c

>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c

>> @@ -1938,11 +1938,15 @@ static void gfx_v9_0_constants_init(struct amdgpu_device *adev)

>>    		if (i == 0) {

>>    			tmp = REG_SET_FIELD(0, SH_MEM_CONFIG, ALIGNMENT_MODE,

>>    					    SH_MEM_ALIGNMENT_MODE_UNALIGNED);

>> +			tmp = REG_SET_FIELD(tmp, SH_MEM_CONFIG, RETRY_DISABLE,

>> +					    !!amdgpu_noretry);

>>    			WREG32_SOC15_RLC(GC, 0, mmSH_MEM_CONFIG, tmp);

>>    			WREG32_SOC15_RLC(GC, 0, mmSH_MEM_BASES, 0);

>>    		} else {

>>    			tmp = REG_SET_FIELD(0, SH_MEM_CONFIG, ALIGNMENT_MODE,

>>    					    SH_MEM_ALIGNMENT_MODE_UNALIGNED);

>> +			tmp = REG_SET_FIELD(tmp, SH_MEM_CONFIG, RETRY_DISABLE,

>> +					    !!amdgpu_noretry);

>>    			WREG32_SOC15_RLC(GC, 0, mmSH_MEM_CONFIG, tmp);

>>    			tmp = REG_SET_FIELD(0, SH_MEM_BASES, PRIVATE_BASE,

>>    				(adev->gmc.private_aperture_start >> 48));

>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c

>> index 9f0f189fc111..15986748f59f 100644

>> --- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c

>> +++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c

>> @@ -236,7 +236,8 @@ static void gfxhub_v1_0_setup_vmid_config(struct amdgpu_device *adev)

>>    				    block_size);

>>    		/* Send no-retry XNACK on fault to suppress VM fault storm. */

>>    		tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL,

>> -				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, 1);

>> +				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT,

>> +				    !amdgpu_noretry);

>>    		WREG32_SOC15_OFFSET(GC, 0, mmVM_CONTEXT1_CNTL, i, tmp);

>>    		WREG32_SOC15_OFFSET(GC, 0, mmVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32, i*2, 0);

>>    		WREG32_SOC15_OFFSET(GC, 0, mmVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32, i*2, 0);

>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c b/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c

>> index b7de60a15623..d605b4963f8a 100644

>> --- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c

>> +++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c

>> @@ -215,7 +215,8 @@ static void gfxhub_v2_0_setup_vmid_config(struct amdgpu_device *adev)

>>    				adev->vm_manager.block_size - 9);

>>    		/* Send no-retry XNACK on fault to suppress VM fault storm. */

>>    		tmp = REG_SET_FIELD(tmp, GCVM_CONTEXT1_CNTL,

>> -				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, 0);

>> +				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT,

>> +				    !amdgpu_noretry);

>>    		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_CNTL, i, tmp);

>>    		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32, i*2, 0);

>>    		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32, i*2, 0);

>> diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c

>> index 05d1d448c8f5..dc5ce03034d3 100644

>> --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c

>> +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c

>> @@ -265,7 +265,8 @@ static void mmhub_v1_0_setup_vmid_config(struct amdgpu_device *adev)

>>    				    block_size);

>>    		/* Send no-retry XNACK on fault to suppress VM fault storm. */

>>    		tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL,

>> -				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, 1);

>> +				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT,

>> +				    !amdgpu_noretry);

>>    		WREG32_SOC15_OFFSET(MMHUB, 0, mmVM_CONTEXT1_CNTL, i, tmp);

>>    		WREG32_SOC15_OFFSET(MMHUB, 0, mmVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32, i*2, 0);

>>    		WREG32_SOC15_OFFSET(MMHUB, 0, mmVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32, i*2, 0);

>> diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c

>> index 37a1a318ae63..0f9549f19ade 100644

>> --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c

>> +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c

>> @@ -205,7 +205,8 @@ static void mmhub_v2_0_setup_vmid_config(struct amdgpu_device *adev)

>>    				    adev->vm_manager.block_size - 9);

>>    		/* Send no-retry XNACK on fault to suppress VM fault storm. */

>>    		tmp = REG_SET_FIELD(tmp, MMVM_CONTEXT1_CNTL,

>> -				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, 0);

>> +				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT,

>> +				    !amdgpu_noretry);

>>    		WREG32_SOC15_OFFSET(MMHUB, 0, mmMMVM_CONTEXT1_CNTL, i, tmp);

>>    		WREG32_SOC15_OFFSET(MMHUB, 0, mmMMVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32, i*2, 0);

>>    		WREG32_SOC15_OFFSET(MMHUB, 0, mmMMVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32, i*2, 0);

>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c

>> index e9fe39382371..95a82ac455f2 100644

>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c

>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c

>> @@ -61,7 +61,7 @@ static int update_qpd_v9(struct device_queue_manager *dqm,

>>    		qpd->sh_mem_config =

>>    				SH_MEM_ALIGNMENT_MODE_UNALIGNED <<

>>    					SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT;

>> -		if (noretry &&

>> +		if (amdgpu_noretry &&

>>    		    !dqm->dev->device_info->needs_iommu_device)

>>    			qpd->sh_mem_config |=

>>    				1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT;

>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h

>> index d4bba0124d29..aa7bf20d20f8 100644

>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h

>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h

>> @@ -157,7 +157,7 @@ extern int ignore_crat;

>>    /*

>>     * Set sh_mem_config.retry_disable on Vega10

>>     */

>> -extern int noretry;

>> +extern int amdgpu_noretry;

>>    

>>    /*

>>     * Halt if HWS hang is detected

On 2019-07-03 6:19 p.m., Yang, Philip wrote:
> amdgpu_noretry default value is 0, this will generate VM fault storm

> because the vm fault is not recovered. It may slow down the machine and

> need reboot after application VM fault. Maybe change default value to 1?


This is the same as the current behaviour. My change doesn't modify the 
default behaviour, it only makes it configurable in a sensible way.

If we want to change the default, I'd do that in a separate commit.

Regards,
   Felix

>

> Other than that, this is reviewed by Philip Yang <Philip.Yang@amd.com>

>

> On 2019-07-02 3:05 p.m., Kuehling, Felix wrote:

>> Ping.

>>

>> Christian, Philip, any opinion about this patch?

>>

>> On 2019-06-21 8:20 p.m., Kuehling, Felix wrote:

>>> Apply the same setting to SH_MEM_CONFIG and VM_CONTEXT1_CNTL. This

>>> makes the noretry param no longer KFD-specific. On GFX10 I'm not

>>> changing SH_MEM_CONFIG in this commit because GFX10 has different

>>> retry behaviour in the SQ and I don't have a way to test it at the

>>> moment.

>>>

>>> Suggested-by: Christian König <Christian.Koenig@amd.com>

>>> CC: Philip Yang <Philip.Yang@amd.com>

>>> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>

>>> ---

>>>     drivers/gpu/drm/amd/amdgpu/amdgpu.h              |  1 +

>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c          | 16 +++++-----------

>>>     drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c            |  4 ++++

>>>     drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c         |  3 ++-

>>>     drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c         |  3 ++-

>>>     drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c          |  3 ++-

>>>     drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c          |  3 ++-

>>>     .../drm/amd/amdkfd/kfd_device_queue_manager_v9.c |  2 +-

>>>     drivers/gpu/drm/amd/amdkfd/kfd_priv.h            |  2 +-

>>>     9 files changed, 20 insertions(+), 17 deletions(-)

>>>

>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

>>> index 9b1efdf94bdf..05875279c09e 100644

>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h

>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

>>> @@ -164,6 +164,7 @@ extern int amdgpu_async_gfx_ring;

>>>     extern int amdgpu_mcbp;

>>>     extern int amdgpu_discovery;

>>>     extern int amdgpu_mes;

>>> +extern int amdgpu_noretry;

>>>     

>>>     #ifdef CONFIG_DRM_AMDGPU_SI

>>>     extern int amdgpu_si_support;

>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

>>> index 7cf6ab07b113..0d578d95be93 100644

>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

>>> @@ -140,6 +140,7 @@ int amdgpu_async_gfx_ring = 1;

>>>     int amdgpu_mcbp = 0;

>>>     int amdgpu_discovery = 0;

>>>     int amdgpu_mes = 0;

>>> +int amdgpu_noretry;

>>>     

>>>     struct amdgpu_mgpu_info mgpu_info = {

>>>     	.mutex = __MUTEX_INITIALIZER(mgpu_info.mutex),

>>> @@ -591,6 +592,10 @@ MODULE_PARM_DESC(mes,

>>>     	"Enable Micro Engine Scheduler (0 = disabled (default), 1 = enabled)");

>>>     module_param_named(mes, amdgpu_mes, int, 0444);

>>>     

>>> +MODULE_PARM_DESC(noretry,

>>> +	"Disable retry faults (0 = retry enabled (default), 1 = retry disabled)");

>>> +module_param_named(noretry, amdgpu_noretry, int, 0644);

>>> +

>>>     #ifdef CONFIG_HSA_AMD

>>>     /**

>>>      * DOC: sched_policy (int)

>>> @@ -666,17 +671,6 @@ module_param(ignore_crat, int, 0444);

>>>     MODULE_PARM_DESC(ignore_crat,

>>>     	"Ignore CRAT table during KFD initialization (0 = use CRAT (default), 1 = ignore CRAT)");

>>>     

>>> -/**

>>> - * DOC: noretry (int)

>>> - * This parameter sets sh_mem_config.retry_disable. Default value, 0, enables retry.

>>> - * Setting 1 disables retry.

>>> - * Retry is needed for recoverable page faults.

>>> - */

>>> -int noretry;

>>> -module_param(noretry, int, 0644);

>>> -MODULE_PARM_DESC(noretry,

>>> -	"Set sh_mem_config.retry_disable on Vega10 (0 = retry enabled (default), 1 = retry disabled)");

>>> -

>>>     /**

>>>      * DOC: halt_if_hws_hang (int)

>>>      * Halt if HWS hang is detected. Default value, 0, disables the halt on hang.

>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c

>>> index e0f3014e76ea..c4e715170bfe 100644

>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c

>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c

>>> @@ -1938,11 +1938,15 @@ static void gfx_v9_0_constants_init(struct amdgpu_device *adev)

>>>     		if (i == 0) {

>>>     			tmp = REG_SET_FIELD(0, SH_MEM_CONFIG, ALIGNMENT_MODE,

>>>     					    SH_MEM_ALIGNMENT_MODE_UNALIGNED);

>>> +			tmp = REG_SET_FIELD(tmp, SH_MEM_CONFIG, RETRY_DISABLE,

>>> +					    !!amdgpu_noretry);

>>>     			WREG32_SOC15_RLC(GC, 0, mmSH_MEM_CONFIG, tmp);

>>>     			WREG32_SOC15_RLC(GC, 0, mmSH_MEM_BASES, 0);

>>>     		} else {

>>>     			tmp = REG_SET_FIELD(0, SH_MEM_CONFIG, ALIGNMENT_MODE,

>>>     					    SH_MEM_ALIGNMENT_MODE_UNALIGNED);

>>> +			tmp = REG_SET_FIELD(tmp, SH_MEM_CONFIG, RETRY_DISABLE,

>>> +					    !!amdgpu_noretry);

>>>     			WREG32_SOC15_RLC(GC, 0, mmSH_MEM_CONFIG, tmp);

>>>     			tmp = REG_SET_FIELD(0, SH_MEM_BASES, PRIVATE_BASE,

>>>     				(adev->gmc.private_aperture_start >> 48));

>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c

>>> index 9f0f189fc111..15986748f59f 100644

>>> --- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c

>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c

>>> @@ -236,7 +236,8 @@ static void gfxhub_v1_0_setup_vmid_config(struct amdgpu_device *adev)

>>>     				    block_size);

>>>     		/* Send no-retry XNACK on fault to suppress VM fault storm. */

>>>     		tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL,

>>> -				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, 1);

>>> +				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT,

>>> +				    !amdgpu_noretry);

>>>     		WREG32_SOC15_OFFSET(GC, 0, mmVM_CONTEXT1_CNTL, i, tmp);

>>>     		WREG32_SOC15_OFFSET(GC, 0, mmVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32, i*2, 0);

>>>     		WREG32_SOC15_OFFSET(GC, 0, mmVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32, i*2, 0);

>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c b/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c

>>> index b7de60a15623..d605b4963f8a 100644

>>> --- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c

>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c

>>> @@ -215,7 +215,8 @@ static void gfxhub_v2_0_setup_vmid_config(struct amdgpu_device *adev)

>>>     				adev->vm_manager.block_size - 9);

>>>     		/* Send no-retry XNACK on fault to suppress VM fault storm. */

>>>     		tmp = REG_SET_FIELD(tmp, GCVM_CONTEXT1_CNTL,

>>> -				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, 0);

>>> +				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT,

>>> +				    !amdgpu_noretry);

>>>     		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_CNTL, i, tmp);

>>>     		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32, i*2, 0);

>>>     		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32, i*2, 0);

>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c

>>> index 05d1d448c8f5..dc5ce03034d3 100644

>>> --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c

>>> +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c

>>> @@ -265,7 +265,8 @@ static void mmhub_v1_0_setup_vmid_config(struct amdgpu_device *adev)

>>>     				    block_size);

>>>     		/* Send no-retry XNACK on fault to suppress VM fault storm. */

>>>     		tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL,

>>> -				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, 1);

>>> +				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT,

>>> +				    !amdgpu_noretry);

>>>     		WREG32_SOC15_OFFSET(MMHUB, 0, mmVM_CONTEXT1_CNTL, i, tmp);

>>>     		WREG32_SOC15_OFFSET(MMHUB, 0, mmVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32, i*2, 0);

>>>     		WREG32_SOC15_OFFSET(MMHUB, 0, mmVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32, i*2, 0);

>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c

>>> index 37a1a318ae63..0f9549f19ade 100644

>>> --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c

>>> +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c

>>> @@ -205,7 +205,8 @@ static void mmhub_v2_0_setup_vmid_config(struct amdgpu_device *adev)

>>>     				    adev->vm_manager.block_size - 9);

>>>     		/* Send no-retry XNACK on fault to suppress VM fault storm. */

>>>     		tmp = REG_SET_FIELD(tmp, MMVM_CONTEXT1_CNTL,

>>> -				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, 0);

>>> +				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT,

>>> +				    !amdgpu_noretry);

>>>     		WREG32_SOC15_OFFSET(MMHUB, 0, mmMMVM_CONTEXT1_CNTL, i, tmp);

>>>     		WREG32_SOC15_OFFSET(MMHUB, 0, mmMMVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32, i*2, 0);

>>>     		WREG32_SOC15_OFFSET(MMHUB, 0, mmMMVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32, i*2, 0);

>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c

>>> index e9fe39382371..95a82ac455f2 100644

>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c

>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c

>>> @@ -61,7 +61,7 @@ static int update_qpd_v9(struct device_queue_manager *dqm,

>>>     		qpd->sh_mem_config =

>>>     				SH_MEM_ALIGNMENT_MODE_UNALIGNED <<

>>>     					SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT;

>>> -		if (noretry &&

>>> +		if (amdgpu_noretry &&

>>>     		    !dqm->dev->device_info->needs_iommu_device)

>>>     			qpd->sh_mem_config |=

>>>     				1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT;

>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h

>>> index d4bba0124d29..aa7bf20d20f8 100644

>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h

>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h

>>> @@ -157,7 +157,7 @@ extern int ignore_crat;

>>>     /*

>>>      * Set sh_mem_config.retry_disable on Vega10

>>>      */

>>> -extern int noretry;

>>> +extern int amdgpu_noretry;

>>>     

>>>     /*

>>>      * Halt if HWS hang is detected
On 2019-07-04 12:02 p.m., Kuehling, Felix wrote:
> On 2019-07-03 6:19 p.m., Yang, Philip wrote:

>> amdgpu_noretry default value is 0, this will generate VM fault storm

>> because the vm fault is not recovered. It may slow down the machine and

>> need reboot after application VM fault. Maybe change default value to 1?

> 

> This is the same as the current behaviour. My change doesn't modify the

> default behaviour, it only makes it configurable in a sensible way.

> 

> If we want to change the default, I'd do that in a separate commit.

>

I misunderstood VM_CONTEXT1_CNT/RETRY_PERMISSION_OR_INVALID_PAGE_FAULT 
change, yes, this is the same behavior. retry was already enabled by 
another patch "drm/amdgpu: re-enable retry faults" before. But I don't 
find the patch to silent retry storm on the mailing list.

Regards,
Philip

> Regards,

>     Felix

> 

>>

>> Other than that, this is reviewed by Philip Yang <Philip.Yang@amd.com>

>>

>> On 2019-07-02 3:05 p.m., Kuehling, Felix wrote:

>>> Ping.

>>>

>>> Christian, Philip, any opinion about this patch?

>>>

>>> On 2019-06-21 8:20 p.m., Kuehling, Felix wrote:

>>>> Apply the same setting to SH_MEM_CONFIG and VM_CONTEXT1_CNTL. This

>>>> makes the noretry param no longer KFD-specific. On GFX10 I'm not

>>>> changing SH_MEM_CONFIG in this commit because GFX10 has different

>>>> retry behaviour in the SQ and I don't have a way to test it at the

>>>> moment.

>>>>

>>>> Suggested-by: Christian König <Christian.Koenig@amd.com>

>>>> CC: Philip Yang <Philip.Yang@amd.com>

>>>> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>

>>>> ---

>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu.h              |  1 +

>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c          | 16 +++++-----------

>>>>      drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c            |  4 ++++

>>>>      drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c         |  3 ++-

>>>>      drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c         |  3 ++-

>>>>      drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c          |  3 ++-

>>>>      drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c          |  3 ++-

>>>>      .../drm/amd/amdkfd/kfd_device_queue_manager_v9.c |  2 +-

>>>>      drivers/gpu/drm/amd/amdkfd/kfd_priv.h            |  2 +-

>>>>      9 files changed, 20 insertions(+), 17 deletions(-)

>>>>

>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

>>>> index 9b1efdf94bdf..05875279c09e 100644

>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h

>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

>>>> @@ -164,6 +164,7 @@ extern int amdgpu_async_gfx_ring;

>>>>      extern int amdgpu_mcbp;

>>>>      extern int amdgpu_discovery;

>>>>      extern int amdgpu_mes;

>>>> +extern int amdgpu_noretry;

>>>>      

>>>>      #ifdef CONFIG_DRM_AMDGPU_SI

>>>>      extern int amdgpu_si_support;

>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

>>>> index 7cf6ab07b113..0d578d95be93 100644

>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

>>>> @@ -140,6 +140,7 @@ int amdgpu_async_gfx_ring = 1;

>>>>      int amdgpu_mcbp = 0;

>>>>      int amdgpu_discovery = 0;

>>>>      int amdgpu_mes = 0;

>>>> +int amdgpu_noretry;

>>>>      

>>>>      struct amdgpu_mgpu_info mgpu_info = {

>>>>      	.mutex = __MUTEX_INITIALIZER(mgpu_info.mutex),

>>>> @@ -591,6 +592,10 @@ MODULE_PARM_DESC(mes,

>>>>      	"Enable Micro Engine Scheduler (0 = disabled (default), 1 = enabled)");

>>>>      module_param_named(mes, amdgpu_mes, int, 0444);

>>>>      

>>>> +MODULE_PARM_DESC(noretry,

>>>> +	"Disable retry faults (0 = retry enabled (default), 1 = retry disabled)");

>>>> +module_param_named(noretry, amdgpu_noretry, int, 0644);

>>>> +

>>>>      #ifdef CONFIG_HSA_AMD

>>>>      /**

>>>>       * DOC: sched_policy (int)

>>>> @@ -666,17 +671,6 @@ module_param(ignore_crat, int, 0444);

>>>>      MODULE_PARM_DESC(ignore_crat,

>>>>      	"Ignore CRAT table during KFD initialization (0 = use CRAT (default), 1 = ignore CRAT)");

>>>>      

>>>> -/**

>>>> - * DOC: noretry (int)

>>>> - * This parameter sets sh_mem_config.retry_disable. Default value, 0, enables retry.

>>>> - * Setting 1 disables retry.

>>>> - * Retry is needed for recoverable page faults.

>>>> - */

>>>> -int noretry;

>>>> -module_param(noretry, int, 0644);

>>>> -MODULE_PARM_DESC(noretry,

>>>> -	"Set sh_mem_config.retry_disable on Vega10 (0 = retry enabled (default), 1 = retry disabled)");

>>>> -

>>>>      /**

>>>>       * DOC: halt_if_hws_hang (int)

>>>>       * Halt if HWS hang is detected. Default value, 0, disables the halt on hang.

>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c

>>>> index e0f3014e76ea..c4e715170bfe 100644

>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c

>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c

>>>> @@ -1938,11 +1938,15 @@ static void gfx_v9_0_constants_init(struct amdgpu_device *adev)

>>>>      		if (i == 0) {

>>>>      			tmp = REG_SET_FIELD(0, SH_MEM_CONFIG, ALIGNMENT_MODE,

>>>>      					    SH_MEM_ALIGNMENT_MODE_UNALIGNED);

>>>> +			tmp = REG_SET_FIELD(tmp, SH_MEM_CONFIG, RETRY_DISABLE,

>>>> +					    !!amdgpu_noretry);

>>>>      			WREG32_SOC15_RLC(GC, 0, mmSH_MEM_CONFIG, tmp);

>>>>      			WREG32_SOC15_RLC(GC, 0, mmSH_MEM_BASES, 0);

>>>>      		} else {

>>>>      			tmp = REG_SET_FIELD(0, SH_MEM_CONFIG, ALIGNMENT_MODE,

>>>>      					    SH_MEM_ALIGNMENT_MODE_UNALIGNED);

>>>> +			tmp = REG_SET_FIELD(tmp, SH_MEM_CONFIG, RETRY_DISABLE,

>>>> +					    !!amdgpu_noretry);

>>>>      			WREG32_SOC15_RLC(GC, 0, mmSH_MEM_CONFIG, tmp);

>>>>      			tmp = REG_SET_FIELD(0, SH_MEM_BASES, PRIVATE_BASE,

>>>>      				(adev->gmc.private_aperture_start >> 48));

>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c

>>>> index 9f0f189fc111..15986748f59f 100644

>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c

>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c

>>>> @@ -236,7 +236,8 @@ static void gfxhub_v1_0_setup_vmid_config(struct amdgpu_device *adev)

>>>>      				    block_size);

>>>>      		/* Send no-retry XNACK on fault to suppress VM fault storm. */

>>>>      		tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL,

>>>> -				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, 1);

>>>> +				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT,

>>>> +				    !amdgpu_noretry);

>>>>      		WREG32_SOC15_OFFSET(GC, 0, mmVM_CONTEXT1_CNTL, i, tmp);

>>>>      		WREG32_SOC15_OFFSET(GC, 0, mmVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32, i*2, 0);

>>>>      		WREG32_SOC15_OFFSET(GC, 0, mmVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32, i*2, 0);

>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c b/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c

>>>> index b7de60a15623..d605b4963f8a 100644

>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c

>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c

>>>> @@ -215,7 +215,8 @@ static void gfxhub_v2_0_setup_vmid_config(struct amdgpu_device *adev)

>>>>      				adev->vm_manager.block_size - 9);

>>>>      		/* Send no-retry XNACK on fault to suppress VM fault storm. */

>>>>      		tmp = REG_SET_FIELD(tmp, GCVM_CONTEXT1_CNTL,

>>>> -				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, 0);

>>>> +				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT,

>>>> +				    !amdgpu_noretry);

>>>>      		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_CNTL, i, tmp);

>>>>      		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32, i*2, 0);

>>>>      		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32, i*2, 0);

>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c

>>>> index 05d1d448c8f5..dc5ce03034d3 100644

>>>> --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c

>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c

>>>> @@ -265,7 +265,8 @@ static void mmhub_v1_0_setup_vmid_config(struct amdgpu_device *adev)

>>>>      				    block_size);

>>>>      		/* Send no-retry XNACK on fault to suppress VM fault storm. */

>>>>      		tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL,

>>>> -				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, 1);

>>>> +				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT,

>>>> +				    !amdgpu_noretry);

>>>>      		WREG32_SOC15_OFFSET(MMHUB, 0, mmVM_CONTEXT1_CNTL, i, tmp);

>>>>      		WREG32_SOC15_OFFSET(MMHUB, 0, mmVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32, i*2, 0);

>>>>      		WREG32_SOC15_OFFSET(MMHUB, 0, mmVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32, i*2, 0);

>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c

>>>> index 37a1a318ae63..0f9549f19ade 100644

>>>> --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c

>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c

>>>> @@ -205,7 +205,8 @@ static void mmhub_v2_0_setup_vmid_config(struct amdgpu_device *adev)

>>>>      				    adev->vm_manager.block_size - 9);

>>>>      		/* Send no-retry XNACK on fault to suppress VM fault storm. */

>>>>      		tmp = REG_SET_FIELD(tmp, MMVM_CONTEXT1_CNTL,

>>>> -				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, 0);

>>>> +				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT,

>>>> +				    !amdgpu_noretry);

>>>>      		WREG32_SOC15_OFFSET(MMHUB, 0, mmMMVM_CONTEXT1_CNTL, i, tmp);

>>>>      		WREG32_SOC15_OFFSET(MMHUB, 0, mmMMVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32, i*2, 0);

>>>>      		WREG32_SOC15_OFFSET(MMHUB, 0, mmMMVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32, i*2, 0);

>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c

>>>> index e9fe39382371..95a82ac455f2 100644

>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c

>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c

>>>> @@ -61,7 +61,7 @@ static int update_qpd_v9(struct device_queue_manager *dqm,

>>>>      		qpd->sh_mem_config =

>>>>      				SH_MEM_ALIGNMENT_MODE_UNALIGNED <<

>>>>      					SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT;

>>>> -		if (noretry &&

>>>> +		if (amdgpu_noretry &&

>>>>      		    !dqm->dev->device_info->needs_iommu_device)

>>>>      			qpd->sh_mem_config |=

>>>>      				1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT;

>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h

>>>> index d4bba0124d29..aa7bf20d20f8 100644

>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h

>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h

>>>> @@ -157,7 +157,7 @@ extern int ignore_crat;

>>>>      /*

>>>>       * Set sh_mem_config.retry_disable on Vega10

>>>>       */

>>>> -extern int noretry;

>>>> +extern int amdgpu_noretry;

>>>>      

>>>>      /*

>>>>       * Halt if HWS hang is detected
On 2019-07-04 1:16 p.m., Yang, Philip wrote:
> On 2019-07-04 12:02 p.m., Kuehling, Felix wrote:

>> On 2019-07-03 6:19 p.m., Yang, Philip wrote:

>>> amdgpu_noretry default value is 0, this will generate VM fault storm

>>> because the vm fault is not recovered. It may slow down the machine and

>>> need reboot after application VM fault. Maybe change default value to 1?

>> This is the same as the current behaviour. My change doesn't modify the

>> default behaviour, it only makes it configurable in a sensible way.

>>

>> If we want to change the default, I'd do that in a separate commit.

>>

> I misunderstood VM_CONTEXT1_CNT/RETRY_PERMISSION_OR_INVALID_PAGE_FAULT

> change, yes, this is the same behavior. retry was already enabled by

> another patch "drm/amdgpu: re-enable retry faults" before. But I don't

> find the patch to silent retry storm on the mailing list.


I think Christian is referring to his patch "drm/amdgpu: add graceful VM 
fault handling". I was kind of expecting to see something that would 
turn a retry fault into a non-retry fault by setting a special 
combination of PTE flags, as we discussed with the debugger team. That's 
would do two things:

 1. Stop the retry loop
 2. Allow the debugger to see where the VM fault occurred

But I don't quite see it in that patch. I think it could be easily 
modified to do that, though.

My bigger concern with that patch series is the whole concept of 
"direct" page table updates. It makes some questionable assumptions 
about page table residency, IMO.

Regards,
   Felix


>

> Regards,

> Philip

>

>> Regards,

>>      Felix

>>

>>> Other than that, this is reviewed by Philip Yang <Philip.Yang@amd.com>

>>>

>>> On 2019-07-02 3:05 p.m., Kuehling, Felix wrote:

>>>> Ping.

>>>>

>>>> Christian, Philip, any opinion about this patch?

>>>>

>>>> On 2019-06-21 8:20 p.m., Kuehling, Felix wrote:

>>>>> Apply the same setting to SH_MEM_CONFIG and VM_CONTEXT1_CNTL. This

>>>>> makes the noretry param no longer KFD-specific. On GFX10 I'm not

>>>>> changing SH_MEM_CONFIG in this commit because GFX10 has different

>>>>> retry behaviour in the SQ and I don't have a way to test it at the

>>>>> moment.

>>>>>

>>>>> Suggested-by: Christian König <Christian.Koenig@amd.com>

>>>>> CC: Philip Yang <Philip.Yang@amd.com>

>>>>> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>

>>>>> ---

>>>>>       drivers/gpu/drm/amd/amdgpu/amdgpu.h              |  1 +

>>>>>       drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c          | 16 +++++-----------

>>>>>       drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c            |  4 ++++

>>>>>       drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c         |  3 ++-

>>>>>       drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c         |  3 ++-

>>>>>       drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c          |  3 ++-

>>>>>       drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c          |  3 ++-

>>>>>       .../drm/amd/amdkfd/kfd_device_queue_manager_v9.c |  2 +-

>>>>>       drivers/gpu/drm/amd/amdkfd/kfd_priv.h            |  2 +-

>>>>>       9 files changed, 20 insertions(+), 17 deletions(-)

>>>>>

>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

>>>>> index 9b1efdf94bdf..05875279c09e 100644

>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h

>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

>>>>> @@ -164,6 +164,7 @@ extern int amdgpu_async_gfx_ring;

>>>>>       extern int amdgpu_mcbp;

>>>>>       extern int amdgpu_discovery;

>>>>>       extern int amdgpu_mes;

>>>>> +extern int amdgpu_noretry;

>>>>>       

>>>>>       #ifdef CONFIG_DRM_AMDGPU_SI

>>>>>       extern int amdgpu_si_support;

>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

>>>>> index 7cf6ab07b113..0d578d95be93 100644

>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

>>>>> @@ -140,6 +140,7 @@ int amdgpu_async_gfx_ring = 1;

>>>>>       int amdgpu_mcbp = 0;

>>>>>       int amdgpu_discovery = 0;

>>>>>       int amdgpu_mes = 0;

>>>>> +int amdgpu_noretry;

>>>>>       

>>>>>       struct amdgpu_mgpu_info mgpu_info = {

>>>>>       	.mutex = __MUTEX_INITIALIZER(mgpu_info.mutex),

>>>>> @@ -591,6 +592,10 @@ MODULE_PARM_DESC(mes,

>>>>>       	"Enable Micro Engine Scheduler (0 = disabled (default), 1 = enabled)");

>>>>>       module_param_named(mes, amdgpu_mes, int, 0444);

>>>>>       

>>>>> +MODULE_PARM_DESC(noretry,

>>>>> +	"Disable retry faults (0 = retry enabled (default), 1 = retry disabled)");

>>>>> +module_param_named(noretry, amdgpu_noretry, int, 0644);

>>>>> +

>>>>>       #ifdef CONFIG_HSA_AMD

>>>>>       /**

>>>>>        * DOC: sched_policy (int)

>>>>> @@ -666,17 +671,6 @@ module_param(ignore_crat, int, 0444);

>>>>>       MODULE_PARM_DESC(ignore_crat,

>>>>>       	"Ignore CRAT table during KFD initialization (0 = use CRAT (default), 1 = ignore CRAT)");

>>>>>       

>>>>> -/**

>>>>> - * DOC: noretry (int)

>>>>> - * This parameter sets sh_mem_config.retry_disable. Default value, 0, enables retry.

>>>>> - * Setting 1 disables retry.

>>>>> - * Retry is needed for recoverable page faults.

>>>>> - */

>>>>> -int noretry;

>>>>> -module_param(noretry, int, 0644);

>>>>> -MODULE_PARM_DESC(noretry,

>>>>> -	"Set sh_mem_config.retry_disable on Vega10 (0 = retry enabled (default), 1 = retry disabled)");

>>>>> -

>>>>>       /**

>>>>>        * DOC: halt_if_hws_hang (int)

>>>>>        * Halt if HWS hang is detected. Default value, 0, disables the halt on hang.

>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c

>>>>> index e0f3014e76ea..c4e715170bfe 100644

>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c

>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c

>>>>> @@ -1938,11 +1938,15 @@ static void gfx_v9_0_constants_init(struct amdgpu_device *adev)

>>>>>       		if (i == 0) {

>>>>>       			tmp = REG_SET_FIELD(0, SH_MEM_CONFIG, ALIGNMENT_MODE,

>>>>>       					    SH_MEM_ALIGNMENT_MODE_UNALIGNED);

>>>>> +			tmp = REG_SET_FIELD(tmp, SH_MEM_CONFIG, RETRY_DISABLE,

>>>>> +					    !!amdgpu_noretry);

>>>>>       			WREG32_SOC15_RLC(GC, 0, mmSH_MEM_CONFIG, tmp);

>>>>>       			WREG32_SOC15_RLC(GC, 0, mmSH_MEM_BASES, 0);

>>>>>       		} else {

>>>>>       			tmp = REG_SET_FIELD(0, SH_MEM_CONFIG, ALIGNMENT_MODE,

>>>>>       					    SH_MEM_ALIGNMENT_MODE_UNALIGNED);

>>>>> +			tmp = REG_SET_FIELD(tmp, SH_MEM_CONFIG, RETRY_DISABLE,

>>>>> +					    !!amdgpu_noretry);

>>>>>       			WREG32_SOC15_RLC(GC, 0, mmSH_MEM_CONFIG, tmp);

>>>>>       			tmp = REG_SET_FIELD(0, SH_MEM_BASES, PRIVATE_BASE,

>>>>>       				(adev->gmc.private_aperture_start >> 48));

>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c

>>>>> index 9f0f189fc111..15986748f59f 100644

>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c

>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c

>>>>> @@ -236,7 +236,8 @@ static void gfxhub_v1_0_setup_vmid_config(struct amdgpu_device *adev)

>>>>>       				    block_size);

>>>>>       		/* Send no-retry XNACK on fault to suppress VM fault storm. */

>>>>>       		tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL,

>>>>> -				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, 1);

>>>>> +				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT,

>>>>> +				    !amdgpu_noretry);

>>>>>       		WREG32_SOC15_OFFSET(GC, 0, mmVM_CONTEXT1_CNTL, i, tmp);

>>>>>       		WREG32_SOC15_OFFSET(GC, 0, mmVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32, i*2, 0);

>>>>>       		WREG32_SOC15_OFFSET(GC, 0, mmVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32, i*2, 0);

>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c b/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c

>>>>> index b7de60a15623..d605b4963f8a 100644

>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c

>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c

>>>>> @@ -215,7 +215,8 @@ static void gfxhub_v2_0_setup_vmid_config(struct amdgpu_device *adev)

>>>>>       				adev->vm_manager.block_size - 9);

>>>>>       		/* Send no-retry XNACK on fault to suppress VM fault storm. */

>>>>>       		tmp = REG_SET_FIELD(tmp, GCVM_CONTEXT1_CNTL,

>>>>> -				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, 0);

>>>>> +				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT,

>>>>> +				    !amdgpu_noretry);

>>>>>       		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_CNTL, i, tmp);

>>>>>       		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32, i*2, 0);

>>>>>       		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32, i*2, 0);

>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c

>>>>> index 05d1d448c8f5..dc5ce03034d3 100644

>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c

>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c

>>>>> @@ -265,7 +265,8 @@ static void mmhub_v1_0_setup_vmid_config(struct amdgpu_device *adev)

>>>>>       				    block_size);

>>>>>       		/* Send no-retry XNACK on fault to suppress VM fault storm. */

>>>>>       		tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL,

>>>>> -				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, 1);

>>>>> +				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT,

>>>>> +				    !amdgpu_noretry);

>>>>>       		WREG32_SOC15_OFFSET(MMHUB, 0, mmVM_CONTEXT1_CNTL, i, tmp);

>>>>>       		WREG32_SOC15_OFFSET(MMHUB, 0, mmVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32, i*2, 0);

>>>>>       		WREG32_SOC15_OFFSET(MMHUB, 0, mmVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32, i*2, 0);

>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c

>>>>> index 37a1a318ae63..0f9549f19ade 100644

>>>>> --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c

>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c

>>>>> @@ -205,7 +205,8 @@ static void mmhub_v2_0_setup_vmid_config(struct amdgpu_device *adev)

>>>>>       				    adev->vm_manager.block_size - 9);

>>>>>       		/* Send no-retry XNACK on fault to suppress VM fault storm. */

>>>>>       		tmp = REG_SET_FIELD(tmp, MMVM_CONTEXT1_CNTL,

>>>>> -				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, 0);

>>>>> +				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT,

>>>>> +				    !amdgpu_noretry);

>>>>>       		WREG32_SOC15_OFFSET(MMHUB, 0, mmMMVM_CONTEXT1_CNTL, i, tmp);

>>>>>       		WREG32_SOC15_OFFSET(MMHUB, 0, mmMMVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32, i*2, 0);

>>>>>       		WREG32_SOC15_OFFSET(MMHUB, 0, mmMMVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32, i*2, 0);

>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c

>>>>> index e9fe39382371..95a82ac455f2 100644

>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c

>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c

>>>>> @@ -61,7 +61,7 @@ static int update_qpd_v9(struct device_queue_manager *dqm,

>>>>>       		qpd->sh_mem_config =

>>>>>       				SH_MEM_ALIGNMENT_MODE_UNALIGNED <<

>>>>>       					SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT;

>>>>> -		if (noretry &&

>>>>> +		if (amdgpu_noretry &&

>>>>>       		    !dqm->dev->device_info->needs_iommu_device)

>>>>>       			qpd->sh_mem_config |=

>>>>>       				1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT;

>>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h

>>>>> index d4bba0124d29..aa7bf20d20f8 100644

>>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h

>>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h

>>>>> @@ -157,7 +157,7 @@ extern int ignore_crat;

>>>>>       /*

>>>>>        * Set sh_mem_config.retry_disable on Vega10

>>>>>        */

>>>>> -extern int noretry;

>>>>> +extern int amdgpu_noretry;

>>>>>       

>>>>>       /*

>>>>>        * Halt if HWS hang is detected
Reviewed-by : Shaoyun.liu < Shaoyun.liu@amd.com>

I manually applied the change and  verified on vega10 on visualization  
which we do have  retry vm fault issue . With  this patch , the no-retry 
interrupt delivered as expected and I don't see  the interrupt storm.

Regards

shaoyun.liu


On 2019-06-21 8:19 p.m., Felix Kuehling wrote:
> Apply the same setting to SH_MEM_CONFIG and VM_CONTEXT1_CNTL. This

> makes the noretry param no longer KFD-specific. On GFX10 I'm not

> changing SH_MEM_CONFIG in this commit because GFX10 has different

> retry behaviour in the SQ and I don't have a way to test it at the

> moment.

>

> Suggested-by: Christian König <Christian.Koenig@amd.com>

> CC: Philip Yang <Philip.Yang@amd.com>

> Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>

> ---

>   drivers/gpu/drm/amd/amdgpu/amdgpu.h              |  1 +

>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c          | 16 +++++-----------

>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c            |  4 ++++

>   drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c         |  3 ++-

>   drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c         |  3 ++-

>   drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c          |  3 ++-

>   drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c          |  3 ++-

>   .../drm/amd/amdkfd/kfd_device_queue_manager_v9.c |  2 +-

>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h            |  2 +-

>   9 files changed, 20 insertions(+), 17 deletions(-)

>

> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

> index 9b1efdf94bdf..05875279c09e 100644

> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h

> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

> @@ -164,6 +164,7 @@ extern int amdgpu_async_gfx_ring;

>   extern int amdgpu_mcbp;

>   extern int amdgpu_discovery;

>   extern int amdgpu_mes;

> +extern int amdgpu_noretry;

>   

>   #ifdef CONFIG_DRM_AMDGPU_SI

>   extern int amdgpu_si_support;

> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

> index 7cf6ab07b113..0d578d95be93 100644

> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

> @@ -140,6 +140,7 @@ int amdgpu_async_gfx_ring = 1;

>   int amdgpu_mcbp = 0;

>   int amdgpu_discovery = 0;

>   int amdgpu_mes = 0;

> +int amdgpu_noretry;

>   

>   struct amdgpu_mgpu_info mgpu_info = {

>   	.mutex = __MUTEX_INITIALIZER(mgpu_info.mutex),

> @@ -591,6 +592,10 @@ MODULE_PARM_DESC(mes,

>   	"Enable Micro Engine Scheduler (0 = disabled (default), 1 = enabled)");

>   module_param_named(mes, amdgpu_mes, int, 0444);

>   

> +MODULE_PARM_DESC(noretry,

> +	"Disable retry faults (0 = retry enabled (default), 1 = retry disabled)");

> +module_param_named(noretry, amdgpu_noretry, int, 0644);

> +

>   #ifdef CONFIG_HSA_AMD

>   /**

>    * DOC: sched_policy (int)

> @@ -666,17 +671,6 @@ module_param(ignore_crat, int, 0444);

>   MODULE_PARM_DESC(ignore_crat,

>   	"Ignore CRAT table during KFD initialization (0 = use CRAT (default), 1 = ignore CRAT)");

>   

> -/**

> - * DOC: noretry (int)

> - * This parameter sets sh_mem_config.retry_disable. Default value, 0, enables retry.

> - * Setting 1 disables retry.

> - * Retry is needed for recoverable page faults.

> - */

> -int noretry;

> -module_param(noretry, int, 0644);

> -MODULE_PARM_DESC(noretry,

> -	"Set sh_mem_config.retry_disable on Vega10 (0 = retry enabled (default), 1 = retry disabled)");

> -

>   /**

>    * DOC: halt_if_hws_hang (int)

>    * Halt if HWS hang is detected. Default value, 0, disables the halt on hang.

> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c

> index e0f3014e76ea..c4e715170bfe 100644

> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c

> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c

> @@ -1938,11 +1938,15 @@ static void gfx_v9_0_constants_init(struct amdgpu_device *adev)

>   		if (i == 0) {

>   			tmp = REG_SET_FIELD(0, SH_MEM_CONFIG, ALIGNMENT_MODE,

>   					    SH_MEM_ALIGNMENT_MODE_UNALIGNED);

> +			tmp = REG_SET_FIELD(tmp, SH_MEM_CONFIG, RETRY_DISABLE,

> +					    !!amdgpu_noretry);

>   			WREG32_SOC15_RLC(GC, 0, mmSH_MEM_CONFIG, tmp);

>   			WREG32_SOC15_RLC(GC, 0, mmSH_MEM_BASES, 0);

>   		} else {

>   			tmp = REG_SET_FIELD(0, SH_MEM_CONFIG, ALIGNMENT_MODE,

>   					    SH_MEM_ALIGNMENT_MODE_UNALIGNED);

> +			tmp = REG_SET_FIELD(tmp, SH_MEM_CONFIG, RETRY_DISABLE,

> +					    !!amdgpu_noretry);

>   			WREG32_SOC15_RLC(GC, 0, mmSH_MEM_CONFIG, tmp);

>   			tmp = REG_SET_FIELD(0, SH_MEM_BASES, PRIVATE_BASE,

>   				(adev->gmc.private_aperture_start >> 48));

> diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c

> index 9f0f189fc111..15986748f59f 100644

> --- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c

> +++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c

> @@ -236,7 +236,8 @@ static void gfxhub_v1_0_setup_vmid_config(struct amdgpu_device *adev)

>   				    block_size);

>   		/* Send no-retry XNACK on fault to suppress VM fault storm. */

>   		tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL,

> -				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, 1);

> +				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT,

> +				    !amdgpu_noretry);

>   		WREG32_SOC15_OFFSET(GC, 0, mmVM_CONTEXT1_CNTL, i, tmp);

>   		WREG32_SOC15_OFFSET(GC, 0, mmVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32, i*2, 0);

>   		WREG32_SOC15_OFFSET(GC, 0, mmVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32, i*2, 0);

> diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c b/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c

> index b7de60a15623..d605b4963f8a 100644

> --- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c

> +++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_0.c

> @@ -215,7 +215,8 @@ static void gfxhub_v2_0_setup_vmid_config(struct amdgpu_device *adev)

>   				adev->vm_manager.block_size - 9);

>   		/* Send no-retry XNACK on fault to suppress VM fault storm. */

>   		tmp = REG_SET_FIELD(tmp, GCVM_CONTEXT1_CNTL,

> -				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, 0);

> +				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT,

> +				    !amdgpu_noretry);

>   		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_CNTL, i, tmp);

>   		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32, i*2, 0);

>   		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32, i*2, 0);

> diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c

> index 05d1d448c8f5..dc5ce03034d3 100644

> --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c

> +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c

> @@ -265,7 +265,8 @@ static void mmhub_v1_0_setup_vmid_config(struct amdgpu_device *adev)

>   				    block_size);

>   		/* Send no-retry XNACK on fault to suppress VM fault storm. */

>   		tmp = REG_SET_FIELD(tmp, VM_CONTEXT1_CNTL,

> -				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, 1);

> +				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT,

> +				    !amdgpu_noretry);

>   		WREG32_SOC15_OFFSET(MMHUB, 0, mmVM_CONTEXT1_CNTL, i, tmp);

>   		WREG32_SOC15_OFFSET(MMHUB, 0, mmVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32, i*2, 0);

>   		WREG32_SOC15_OFFSET(MMHUB, 0, mmVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32, i*2, 0);

> diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c

> index 37a1a318ae63..0f9549f19ade 100644

> --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c

> +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v2_0.c

> @@ -205,7 +205,8 @@ static void mmhub_v2_0_setup_vmid_config(struct amdgpu_device *adev)

>   				    adev->vm_manager.block_size - 9);

>   		/* Send no-retry XNACK on fault to suppress VM fault storm. */

>   		tmp = REG_SET_FIELD(tmp, MMVM_CONTEXT1_CNTL,

> -				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT, 0);

> +				    RETRY_PERMISSION_OR_INVALID_PAGE_FAULT,

> +				    !amdgpu_noretry);

>   		WREG32_SOC15_OFFSET(MMHUB, 0, mmMMVM_CONTEXT1_CNTL, i, tmp);

>   		WREG32_SOC15_OFFSET(MMHUB, 0, mmMMVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32, i*2, 0);

>   		WREG32_SOC15_OFFSET(MMHUB, 0, mmMMVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32, i*2, 0);

> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c

> index e9fe39382371..95a82ac455f2 100644

> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c

> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager_v9.c

> @@ -61,7 +61,7 @@ static int update_qpd_v9(struct device_queue_manager *dqm,

>   		qpd->sh_mem_config =

>   				SH_MEM_ALIGNMENT_MODE_UNALIGNED <<

>   					SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT;

> -		if (noretry &&

> +		if (amdgpu_noretry &&

>   		    !dqm->dev->device_info->needs_iommu_device)

>   			qpd->sh_mem_config |=

>   				1 << SH_MEM_CONFIG__RETRY_DISABLE__SHIFT;

> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h

> index d4bba0124d29..aa7bf20d20f8 100644

> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h

> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h

> @@ -157,7 +157,7 @@ extern int ignore_crat;

>   /*

>    * Set sh_mem_config.retry_disable on Vega10

>    */

> -extern int noretry;

> +extern int amdgpu_noretry;

>   

>   /*

>    * Halt if HWS hang is detected