drm/amdgpu/gmc9: give more chance for tlb flush if failed(v2)

Submitted by Deng, Emily on April 4, 2018, 5:01 a.m.

Details

Message ID 1522818115-20328-1-git-send-email-Emily.Deng@amd.com
State New
Headers show
Series "drm/amdgpu/gmc9: give more chance for tlb flush if failed(v2)" ( rev: 1 ) in AMD X.Org drivers

Not browsing as part of any series.

Commit Message

Deng, Emily April 4, 2018, 5:01 a.m.
Under SR-IOV, a CPU-based TLB flush sometimes times out within
the given 100ms period. Instead of letting it fail and continuing,
we can give it more chances by repeating the TLB flush on the failed VMHUB.

This could fix the massive number of "Timeout waiting for VM flush ACK"
errors seen during the vk_encoder test.

v2: refine the code

Signed-off-by: Monk Liu <Monk.Liu@amd.com>
Signed-off-by: Emily Deng <Emily.Deng@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 50 ++++++++++++++++++++---------------
 1 file changed, 28 insertions(+), 22 deletions(-)

Patch hide | download patch | download mbox

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 503070f..44602d4 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -328,7 +328,8 @@  static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev,
 {
 	/* Use register 17 for GART */
 	const unsigned eng = 17;
-	unsigned i, j;
+	unsigned retry = 3;
+	unsigned i, j, k;
 
 	spin_lock(&adev->gmc.invalidate_lock);
 
@@ -336,31 +337,36 @@  static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev,
 		struct amdgpu_vmhub *hub = &adev->vmhub[i];
 		u32 tmp = gmc_v9_0_get_invalidate_req(vmid);
 
-		WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
+		for (k = 0; k < retry; ++k) {
 
-		/* Busy wait for ACK.*/
-		for (j = 0; j < 100; j++) {
-			tmp = RREG32_NO_KIQ(hub->vm_inv_eng0_ack + eng);
-			tmp &= 1 << vmid;
-			if (tmp)
+			WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
+
+			/* Busy wait for ACK.*/
+			for (j = 0; j < 100; j++) {
+				tmp = RREG32_NO_KIQ(hub->vm_inv_eng0_ack + eng);
+				tmp &= 1 << vmid;
+				if (tmp)
+					break;
+				cpu_relax();
+			}
+			if (j < 100)
 				break;
-			cpu_relax();
-		}
-		if (j < 100)
-			continue;
-
-		/* Wait for ACK with a delay.*/
-		for (j = 0; j < adev->usec_timeout; j++) {
-			tmp = RREG32_NO_KIQ(hub->vm_inv_eng0_ack + eng);
-			tmp &= 1 << vmid;
-			if (tmp)
+
+			/* Wait for ACK with a delay.*/
+			for (j = 0; j < adev->usec_timeout; j++) {
+				tmp = RREG32_NO_KIQ(hub->vm_inv_eng0_ack + eng);
+				tmp &= 1 << vmid;
+				if (tmp)
+					break;
+				udelay(1);
+			}
+			if (j < adev->usec_timeout)
 				break;
-			udelay(1);
+			if (k == retry)
+				DRM_ERROR("Timeout waiting for VM flush ACK!\n");
+			else
+				DRM_ERROR("Need one more try to write the VMHUB flush request!");
 		}
-		if (j < adev->usec_timeout)
-			continue;
-
-		DRM_ERROR("Timeout waiting for VM flush ACK!\n");
 	}
 
 	spin_unlock(&adev->gmc.invalidate_lock);

Comments

Am 04.04.2018 um 07:01 schrieb Emily Deng:
> under SR-IOV sometimes CPU based tlb flush would timeout within
> the given 100ms period, instead let it fail and continue we can
> give it more chance to repeat the tlb flush on the failed VMHUB
>
> this could fix the massive "Timeout waiting for VM flush ACK"
> error during vk_encoder test.
>
> v2:refine the code
>
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
> Signed-off-by: Emily Deng <Emily.Deng@amd.com>

Acked-by: Christian König <christian.koenig@amd.com>

But that is still a rather ugly workaround, we should probably not 
upstream that and wait for the RLC fix instead.

Christian.

> ---
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 50 ++++++++++++++++++++---------------
>   1 file changed, 28 insertions(+), 22 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 503070f..44602d4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -328,7 +328,8 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev,
>   {
>   	/* Use register 17 for GART */
>   	const unsigned eng = 17;
> -	unsigned i, j;
> +	unsigned retry = 3;
> +	unsigned i, j, k;
>   
>   	spin_lock(&adev->gmc.invalidate_lock);
>   
> @@ -336,31 +337,36 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev,
>   		struct amdgpu_vmhub *hub = &adev->vmhub[i];
>   		u32 tmp = gmc_v9_0_get_invalidate_req(vmid);
>   
> -		WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
> +		for (k = 0; k < retry; ++k) {
>   
> -		/* Busy wait for ACK.*/
> -		for (j = 0; j < 100; j++) {
> -			tmp = RREG32_NO_KIQ(hub->vm_inv_eng0_ack + eng);
> -			tmp &= 1 << vmid;
> -			if (tmp)
> +			WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
> +
> +			/* Busy wait for ACK.*/
> +			for (j = 0; j < 100; j++) {
> +				tmp = RREG32_NO_KIQ(hub->vm_inv_eng0_ack + eng);
> +				tmp &= 1 << vmid;
> +				if (tmp)
> +					break;
> +				cpu_relax();
> +			}
> +			if (j < 100)
>   				break;
> -			cpu_relax();
> -		}
> -		if (j < 100)
> -			continue;
> -
> -		/* Wait for ACK with a delay.*/
> -		for (j = 0; j < adev->usec_timeout; j++) {
> -			tmp = RREG32_NO_KIQ(hub->vm_inv_eng0_ack + eng);
> -			tmp &= 1 << vmid;
> -			if (tmp)
> +
> +			/* Wait for ACK with a delay.*/
> +			for (j = 0; j < adev->usec_timeout; j++) {
> +				tmp = RREG32_NO_KIQ(hub->vm_inv_eng0_ack + eng);
> +				tmp &= 1 << vmid;
> +				if (tmp)
> +					break;
> +				udelay(1);
> +			}
> +			if (j < adev->usec_timeout)
>   				break;
> -			udelay(1);
> +			if (k == retry)
> +				DRM_ERROR("Timeout waiting for VM flush ACK!\n");
> +			else
> +				DRM_ERROR("Need one more try to write the VMHUB flush request!");
>   		}
> -		if (j < adev->usec_timeout)
> -			continue;
> -
> -		DRM_ERROR("Timeout waiting for VM flush ACK!\n");
>   	}
>   
>   	spin_unlock(&adev->gmc.invalidate_lock);