amdgpu: fix multi-process hang issue

Submitted by Deng, Emily on Aug. 20, 2018, 3:35 a.m.

Details

Message ID 1534736155-14643-1-git-send-email-Emily.Deng@amd.com
State New
Headers show
Series "amdgpu: fix multi-process hang issue" ( rev: 2 ) in AMD X.Org drivers

Not browsing as part of any series.

Commit Message

Deng, Emily Aug. 20, 2018, 3:35 a.m.
From: Monk Liu <Monk.Liu@amd.com>

SWDEV-146499: hang during multi vulkan process testing

cause:
the second frame's PREAMBLE_IB have clear-state
and LOAD actions, those actions ruin the pipeline
that is still doing process in the previous frame's
work-load IB.

fix:
need insert pipeline sync if have context switch for
SRIOV (because only SRIOV will report PREEMPTION flag
to UMD)

Change-Id: If676da7a9b0147114cd76d19b6035ed8033de449
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

Patch hide | download patch | download mbox

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
index 5c22cfd..66efa85 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
@@ -95,6 +95,12 @@  void amdgpu_ib_free(struct amdgpu_device *adev, struct amdgpu_ib *ib,
 	amdgpu_sa_bo_free(adev, &ib->sa_bo, f);
 }
 
+static bool amdgpu_ib_has_preamble(struct amdgpu_ring *ring, bool ctx_switch) {
+	return (amdgpu_sriov_vf(ring->adev) &&
+			ring->funcs->type == AMDGPU_RING_TYPE_GFX &&
+			ctx_switch);
+}
+
 /**
  * amdgpu_ib_schedule - schedule an IB (Indirect Buffer) on the ring
  *
@@ -123,7 +129,7 @@  int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
 	struct amdgpu_device *adev = ring->adev;
 	struct amdgpu_ib *ib = &ibs[0];
 	struct dma_fence *tmp = NULL;
-	bool skip_preamble, need_ctx_switch;
+	bool need_ctx_switch;
 	unsigned patch_offset = ~0;
 	struct amdgpu_vm *vm;
 	uint64_t fence_ctx;
@@ -156,6 +162,8 @@  int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
 		return -EINVAL;
 	}
 
+	need_ctx_switch = ring->current_ctx != fence_ctx;
+
 	alloc_size = ring->funcs->emit_frame_size + num_ibs *
 		ring->funcs->emit_ib_size;
 
@@ -167,6 +175,7 @@  int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
 
 	if (ring->funcs->emit_pipeline_sync && job &&
 	    ((tmp = amdgpu_sync_get_fence(&job->sched_sync, NULL)) ||
+		 amdgpu_ib_has_preamble(ring, need_ctx_switch) ||
 	     amdgpu_vm_need_pipeline_sync(ring, job))) {
 		need_pipe_sync = true;
 
@@ -200,8 +209,6 @@  int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
 			amdgpu_asic_flush_hdp(adev, ring);
 	}
 
-	skip_preamble = ring->current_ctx == fence_ctx;
-	need_ctx_switch = ring->current_ctx != fence_ctx;
 	if (job && ring->funcs->emit_cntxcntl) {
 		if (need_ctx_switch)
 			status |= AMDGPU_HAVE_CTX_SWITCH;
@@ -215,7 +222,7 @@  int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
 
 		/* drop preamble IBs if we don't have a context switch */
 		if ((ib->flags & AMDGPU_IB_FLAG_PREAMBLE) &&
-			skip_preamble &&
+			!need_ctx_switch &&
 			!(status & AMDGPU_PREAMBLE_IB_PRESENT_FIRST) &&
 			!amdgpu_sriov_vf(adev)) /* for SRIOV preemption, Preamble CE ib must be inserted anyway */
 			continue;

Comments

Am 20.08.2018 um 05:35 schrieb Emily Deng:
> From: Monk Liu <Monk.Liu@amd.com>
>
> SWDEV-146499: hang during multi vulkan process testing
>
> cause:
> the second frame's PREAMBLE_IB have clear-state
> and LOAD actions, those actions ruin the pipeline
> that is still doing process in the previous frame's
> work-load IB.
>
> fix:
> need insert pipeline sync if have context switch for
> SRIOV (because only SRIOV will report PREEMPTION flag
> to UMD)
>
> Change-Id: If676da7a9b0147114cd76d19b6035ed8033de449
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c | 15 +++++++++++----
>   1 file changed, 11 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> index 5c22cfd..66efa85 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
> @@ -95,6 +95,12 @@ void amdgpu_ib_free(struct amdgpu_device *adev, struct amdgpu_ib *ib,
>   	amdgpu_sa_bo_free(adev, &ib->sa_bo, f);
>   }
>   
> +static bool amdgpu_ib_has_preamble(struct amdgpu_ring *ring, bool ctx_switch) {
> +	return (amdgpu_sriov_vf(ring->adev) &&
> +			ring->funcs->type == AMDGPU_RING_TYPE_GFX &&
> +			ctx_switch);
> +}
> +

Well NAK, please merge that into amdgpu_ib_schedule() and drop the extra 
check for GFX, that will apply to compute queues as well.

>   /**
>    * amdgpu_ib_schedule - schedule an IB (Indirect Buffer) on the ring
>    *
> @@ -123,7 +129,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>   	struct amdgpu_device *adev = ring->adev;
>   	struct amdgpu_ib *ib = &ibs[0];
>   	struct dma_fence *tmp = NULL;
> -	bool skip_preamble, need_ctx_switch;
> +	bool need_ctx_switch;
>   	unsigned patch_offset = ~0;
>   	struct amdgpu_vm *vm;
>   	uint64_t fence_ctx;
> @@ -156,6 +162,8 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>   		return -EINVAL;
>   	}
>   
> +	need_ctx_switch = ring->current_ctx != fence_ctx;
> +
>   	alloc_size = ring->funcs->emit_frame_size + num_ibs *
>   		ring->funcs->emit_ib_size;
>   
> @@ -167,6 +175,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>   
>   	if (ring->funcs->emit_pipeline_sync && job &&
>   	    ((tmp = amdgpu_sync_get_fence(&job->sched_sync, NULL)) ||
> +		 amdgpu_ib_has_preamble(ring, need_ctx_switch) ||
>   	     amdgpu_vm_need_pipeline_sync(ring, job))) {
>   		need_pipe_sync = true;
>   
> @@ -200,8 +209,6 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>   			amdgpu_asic_flush_hdp(adev, ring);
>   	}
>   
> -	skip_preamble = ring->current_ctx == fence_ctx;
> -	need_ctx_switch = ring->current_ctx != fence_ctx;
>   	if (job && ring->funcs->emit_cntxcntl) {
>   		if (need_ctx_switch)
>   			status |= AMDGPU_HAVE_CTX_SWITCH;
> @@ -215,7 +222,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,
>   
>   		/* drop preamble IBs if we don't have a context switch */
>   		if ((ib->flags & AMDGPU_IB_FLAG_PREAMBLE) &&
> -			skip_preamble &&
> +			!need_ctx_switch &&
>   			!(status & AMDGPU_PREAMBLE_IB_PRESENT_FIRST) &&
>   			!amdgpu_sriov_vf(adev)) /* for SRIOV preemption, Preamble CE ib must be inserted anyway */

Ok, please clean that up more carefully. All prerequisites which are not 
IB dependent should come before the loop.

BTW: The handling of AMDGPU_PREAMBLE_IB_PRESENT_FIRST in amdgpu_cs.c is 
completely broken as well.

Regards,
Christian.

>   			continue;
>-----Original Message-----

>From: Christian König <ckoenig.leichtzumerken@gmail.com>

>Sent: Monday, August 20, 2018 9:15 PM

>To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org

>Cc: Liu, Monk <Monk.Liu@amd.com>

>Subject: Re: [PATCH] amdgpu: fix multi-process hang issue

>

>Am 20.08.2018 um 05:35 schrieb Emily Deng:

>> From: Monk Liu <Monk.Liu@amd.com>

>>

>> SWDEV-146499: hang during multi vulkan process testing

>>

>> cause:

>> the second frame's PREAMBLE_IB have clear-state and LOAD actions,

>> those actions ruin the pipeline that is still doing process in the

>> previous frame's work-load IB.

>>

>> fix:

>> need insert pipeline sync if have context switch for SRIOV (because

>> only SRIOV will report PREEMPTION flag to UMD)

>>

>> Change-Id: If676da7a9b0147114cd76d19b6035ed8033de449

>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>

>> ---

>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c | 15 +++++++++++----

>>   1 file changed, 11 insertions(+), 4 deletions(-)

>>

>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c

>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c

>> index 5c22cfd..66efa85 100644

>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c

>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c

>> @@ -95,6 +95,12 @@ void amdgpu_ib_free(struct amdgpu_device *adev,

>struct amdgpu_ib *ib,

>>   	amdgpu_sa_bo_free(adev, &ib->sa_bo, f);

>>   }

>>

>> +static bool amdgpu_ib_has_preamble(struct amdgpu_ring *ring, bool

>ctx_switch) {

>> +	return (amdgpu_sriov_vf(ring->adev) &&

>> +			ring->funcs->type == AMDGPU_RING_TYPE_GFX &&

>> +			ctx_switch);

>> +}

>> +

>

>Well NAK, please merge that into amdgpu_ib_schedule() and drop the extra

>check for GFX, that will apply to compute queues as well.

>

>>   /**

>>    * amdgpu_ib_schedule - schedule an IB (Indirect Buffer) on the ring

>>    *

>> @@ -123,7 +129,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring,

>unsigned num_ibs,

>>   	struct amdgpu_device *adev = ring->adev;

>>   	struct amdgpu_ib *ib = &ibs[0];

>>   	struct dma_fence *tmp = NULL;

>> -	bool skip_preamble, need_ctx_switch;

>> +	bool need_ctx_switch;

>>   	unsigned patch_offset = ~0;

>>   	struct amdgpu_vm *vm;

>>   	uint64_t fence_ctx;

>> @@ -156,6 +162,8 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring,

>unsigned num_ibs,

>>   		return -EINVAL;

>>   	}

>>

>> +	need_ctx_switch = ring->current_ctx != fence_ctx;

>> +

>>   	alloc_size = ring->funcs->emit_frame_size + num_ibs *

>>   		ring->funcs->emit_ib_size;

>>

>> @@ -167,6 +175,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring,

>> unsigned num_ibs,

>>

>>   	if (ring->funcs->emit_pipeline_sync && job &&

>>   	    ((tmp = amdgpu_sync_get_fence(&job->sched_sync, NULL)) ||

>> +		 amdgpu_ib_has_preamble(ring, need_ctx_switch) ||

>>   	     amdgpu_vm_need_pipeline_sync(ring, job))) {

>>   		need_pipe_sync = true;

>>

>> @@ -200,8 +209,6 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring,

>unsigned num_ibs,

>>   			amdgpu_asic_flush_hdp(adev, ring);

>>   	}

>>

>> -	skip_preamble = ring->current_ctx == fence_ctx;

>> -	need_ctx_switch = ring->current_ctx != fence_ctx;

>>   	if (job && ring->funcs->emit_cntxcntl) {

>>   		if (need_ctx_switch)

>>   			status |= AMDGPU_HAVE_CTX_SWITCH; @@ -215,7

>+222,7 @@ int

>> amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs,

>>

>>   		/* drop preamble IBs if we don't have a context switch */

>>   		if ((ib->flags & AMDGPU_IB_FLAG_PREAMBLE) &&

>> -			skip_preamble &&

>> +			!need_ctx_switch &&

>>   			!(status & AMDGPU_PREAMBLE_IB_PRESENT_FIRST)

>&&

>>   			!amdgpu_sriov_vf(adev)) /* for SRIOV preemption,

>Preamble CE ib

>> must be inserted anyway */

>

>Ok, please clean that up more carefully. All prerequisites which are not IB

>dependent should come before the loop.

>

>BTW: The handling of AMDGPU_PREAMBLE_IB_PRESENT_FIRST in amdgpu_cs.c

>is completely broken as well.

Thanks, will refine more carefully.
>

>Regards,

>Christian.

>

>>   			continue;