drm/amdgpu: fix a kcq hang issue for SRIOV

Submitted by Deng, Emily on March 27, 2018, 5:58 a.m.

Details

Message ID 1522130286-25401-1-git-send-email-Emily.Deng@amd.com
State New
Headers show
Series "drm/amdgpu: fix a kcq hang issue for SRIOV" ( rev: 2 ) in AMD X.Org drivers

Not browsing as part of any series.

Commit Message

Deng, Emily March 27, 2018, 5:58 a.m.
issue:
the vmflush in KCQ could be preempted (not like GFX ring
which doesn't allow preemption in ring buffer) and this lead
to vm flush fail when there is a world switch during
the vm flush procedure (between write invalidate request
and query invalidate ack)

fix:
separate vm flush for gfx and compute ring, and use
the new format command in compute's vm flush which
use only one package so no preemption could allowed

Signed-off-by: Monk Liu <Monk.Liu@amd.com>
Signed-off-by: Emily Deng <Emily.Deng@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h      |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  2 ++
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 10 +++++++++-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    | 18 +++++++++++++-----
 4 files changed, 25 insertions(+), 6 deletions(-)

Patch hide | download patch | download mbox

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index a7e2229..986659f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1790,6 +1790,7 @@  amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
 #define amdgpu_ring_emit_rreg(r, d) (r)->funcs->emit_rreg((r), (d))
 #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d), (v))
 #define amdgpu_ring_emit_reg_wait(r, d, v, m) (r)->funcs->emit_reg_wait((r), (d), (v), (m))
+#define amdgpu_ring_emit_reg_wait1(r, d0, d1, v, m) (r)->funcs->emit_reg_wait1((r), (d0), (d1), (v), (m))
 #define amdgpu_ring_emit_tmz(r, b) (r)->funcs->emit_tmz((r), (b))
 #define amdgpu_ring_pad_ib(r, ib) ((r)->funcs->pad_ib((r), (ib)))
 #define amdgpu_ring_init_cond_exec(r) (r)->funcs->init_cond_exec((r))
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 1d0d250..d85df5d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -152,6 +152,8 @@  struct amdgpu_ring_funcs {
 	void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t val);
 	void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg,
 			      uint32_t val, uint32_t mask);
+	void (*emit_reg_wait1)(struct amdgpu_ring *ring, uint32_t reg0,
+				uint32_t reg1, uint32_t val, uint32_t mask);
 	void (*emit_tmz)(struct amdgpu_ring *ring, bool start);
 	/* priority functions */
 	void (*set_priority) (struct amdgpu_ring *ring,
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 1ae3de1..509c9d2 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -4078,6 +4078,13 @@  static void gfx_v9_0_ring_emit_reg_wait(struct amdgpu_ring *ring, uint32_t reg,
 	gfx_v9_0_wait_reg_mem(ring, 0, 0, 0, reg, 0, val, mask, 0x20);
 }
 
+static void gfx_v9_0_ring_emit_reg_wait_compute(struct amdgpu_ring *ring,
+					uint32_t reg0, uint32_t reg1,
+					uint32_t val, uint32_t mask)
+{
+	gfx_v9_0_wait_reg_mem(ring, 0, 0, 1, reg0, reg1, val, mask, 0x20);
+}
+
 static void gfx_v9_0_set_gfx_eop_interrupt_state(struct amdgpu_device *adev,
 						 enum amdgpu_interrupt_state state)
 {
@@ -4415,7 +4422,7 @@  static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
 		7 + /* gfx_v9_0_ring_emit_hdp_flush */
 		5 + /* hdp invalidate */
 		7 + /* gfx_v9_0_ring_emit_pipeline_sync */
-		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
+		(SOC15_FLUSH_GPU_TLB_NUM_WREG - 1) * 5 +
 		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
 		2 + /* gfx_v9_0_ring_emit_vm_flush */
 		8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
@@ -4433,6 +4440,7 @@  static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
 	.set_priority = gfx_v9_0_ring_set_priority_compute,
 	.emit_wreg = gfx_v9_0_ring_emit_wreg,
 	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
+	.emit_reg_wait1 = gfx_v9_0_ring_emit_reg_wait_compute,
 };
 
 static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_kiq = {
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index e687363..968447d 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -385,11 +385,19 @@  static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
 	amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
 			      upper_32_bits(pd_addr));
 
-	amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
-
-	/* wait for the invalidate to complete */
-	amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
-				  1 << vmid, 1 << vmid);
+    /* The world switch cannot be allowed to occur while
+       some invalidation controller code is waiting for an ack.
+       To workaround the hardware restriction, replace the original
+       two command with one command for compute ring */
+	if (ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE && amdgpu_sriov_vf(adev)) {
+		amdgpu_ring_emit_reg_wait1(ring, hub->vm_inv_eng0_req + eng,
+				   hub->vm_inv_eng0_ack + eng, req, 1 << vmid);
+	} else {
+		amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
+		/* wait for the invalidate to complete */
+		amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
+			   1 << vmid, 1 << vmid);
+	}
 
 	return pd_addr;
 }

Comments

Am 27.03.2018 um 07:58 schrieb Emily Deng:
> issue:
> the vmflush in KCQ could be preempted (not like GFX ring
> which doesn't allow preemption in ring buffer) and this lead
> to vm flush fail when there is a world switch during
> the vm flush procedure (between write invalidate request
> and query invalidate ack)
>
> fix:
> separate vm flush for gfx and compute ring, and use
> the new format command in compute's vm flush which
> use only one package so no preemption could allowed

NAK, as already discussed multiple times now that only circumvents the 
problem, but not really fixes it.

Just executing the "amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + 
eng, req);" multiple times has the same effect and we need to figure out 
why.

Regards,
Christian.

>
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h      |  1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  2 ++
>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 10 +++++++++-
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    | 18 +++++++++++++-----
>   4 files changed, 25 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index a7e2229..986659f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1790,6 +1790,7 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
>   #define amdgpu_ring_emit_rreg(r, d) (r)->funcs->emit_rreg((r), (d))
>   #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d), (v))
>   #define amdgpu_ring_emit_reg_wait(r, d, v, m) (r)->funcs->emit_reg_wait((r), (d), (v), (m))
> +#define amdgpu_ring_emit_reg_wait1(r, d0, d1, v, m) (r)->funcs->emit_reg_wait1((r), (d0), (d1), (v), (m))
>   #define amdgpu_ring_emit_tmz(r, b) (r)->funcs->emit_tmz((r), (b))
>   #define amdgpu_ring_pad_ib(r, ib) ((r)->funcs->pad_ib((r), (ib)))
>   #define amdgpu_ring_init_cond_exec(r) (r)->funcs->init_cond_exec((r))
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index 1d0d250..d85df5d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -152,6 +152,8 @@ struct amdgpu_ring_funcs {
>   	void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t val);
>   	void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg,
>   			      uint32_t val, uint32_t mask);
> +	void (*emit_reg_wait1)(struct amdgpu_ring *ring, uint32_t reg0,
> +				uint32_t reg1, uint32_t val, uint32_t mask);
>   	void (*emit_tmz)(struct amdgpu_ring *ring, bool start);
>   	/* priority functions */
>   	void (*set_priority) (struct amdgpu_ring *ring,
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 1ae3de1..509c9d2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -4078,6 +4078,13 @@ static void gfx_v9_0_ring_emit_reg_wait(struct amdgpu_ring *ring, uint32_t reg,
>   	gfx_v9_0_wait_reg_mem(ring, 0, 0, 0, reg, 0, val, mask, 0x20);
>   }
>   
> +static void gfx_v9_0_ring_emit_reg_wait_compute(struct amdgpu_ring *ring,
> +					uint32_t reg0, uint32_t reg1,
> +					uint32_t val, uint32_t mask)
> +{
> +	gfx_v9_0_wait_reg_mem(ring, 0, 0, 1, reg0, reg1, val, mask, 0x20);
> +}
> +
>   static void gfx_v9_0_set_gfx_eop_interrupt_state(struct amdgpu_device *adev,
>   						 enum amdgpu_interrupt_state state)
>   {
> @@ -4415,7 +4422,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>   		7 + /* gfx_v9_0_ring_emit_hdp_flush */
>   		5 + /* hdp invalidate */
>   		7 + /* gfx_v9_0_ring_emit_pipeline_sync */
> -		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> +		(SOC15_FLUSH_GPU_TLB_NUM_WREG - 1) * 5 +
>   		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>   		2 + /* gfx_v9_0_ring_emit_vm_flush */
>   		8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
> @@ -4433,6 +4440,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>   	.set_priority = gfx_v9_0_ring_set_priority_compute,
>   	.emit_wreg = gfx_v9_0_ring_emit_wreg,
>   	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
> +	.emit_reg_wait1 = gfx_v9_0_ring_emit_reg_wait_compute,
>   };
>   
>   static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_kiq = {
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index e687363..968447d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -385,11 +385,19 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>   	amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
>   			      upper_32_bits(pd_addr));
>   
> -	amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
> -
> -	/* wait for the invalidate to complete */
> -	amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
> -				  1 << vmid, 1 << vmid);
> +    /* The world switch cannot be allowed to occur while
> +       some invalidation controller code is waiting for an ack.
> +       To workaround the hardware restriction, replace the original
> +       two command with one command for compute ring */
> +	if (ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE && amdgpu_sriov_vf(adev)) {
> +		amdgpu_ring_emit_reg_wait1(ring, hub->vm_inv_eng0_req + eng,
> +				   hub->vm_inv_eng0_ack + eng, req, 1 << vmid);
> +	} else {
> +		amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
> +		/* wait for the invalidate to complete */
> +		amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
> +			   1 << vmid, 1 << vmid);
> +	}
>   
>   	return pd_addr;
>   }
Hi Christian,
    I think the root cause has been found out. It is the hardware restriction that couldn't do world switch between
invalidate request, and ack. We couldn't control world switch in guest driver, but could replace the origin two command 
with one command to avoid world switch.

    For the multiple times writing, it is also the same root cause, when the world switch happened between the  invalidate request, and ack,
it will have problem. So we could give more chance for SRIOV to do the invalidate request to workaround the case that world switch happens
between invalidate request, and ack

Best Wishes,
Emily Deng

> -----Original Message-----

> From: Christian König [mailto:ckoenig.leichtzumerken@gmail.com]

> Sent: Tuesday, March 27, 2018 3:48 PM

> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org

> Cc: Liu, Monk <Monk.Liu@amd.com>

> Subject: Re: [PATCH] drm/amdgpu: fix a kcq hang issue for SRIOV

> 

> Am 27.03.2018 um 07:58 schrieb Emily Deng:

> > issue:

> > the vmflush in KCQ could be preempted (not like GFX ring which doesn't

> > allow preemption in ring buffer) and this lead to vm flush fail when

> > there is a world switch during the vm flush procedure (between write

> > invalidate request and query invalidate ack)

> >

> > fix:

> > separate vm flush for gfx and compute ring, and use the new format

> > command in compute's vm flush which use only one package so no

> > preemption could allowed

> 

> NAK, as already discussed multiple times now that only circumvents the

> problem, but not really fixes it.

> 

> Just executing the "amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req +

> eng, req);" multiple times has the same effect and we need to figure out why.

> 

> Regards,

> Christian.

> 

> >

> > Signed-off-by: Monk Liu <Monk.Liu@amd.com>

> > Signed-off-by: Emily Deng <Emily.Deng@amd.com>

> > ---

> >   drivers/gpu/drm/amd/amdgpu/amdgpu.h      |  1 +

> >   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  2 ++

> >   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 10 +++++++++-

> >   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    | 18 +++++++++++++-----

> >   4 files changed, 25 insertions(+), 6 deletions(-)

> >

> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h

> > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

> > index a7e2229..986659f 100644

> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h

> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

> > @@ -1790,6 +1790,7 @@ amdgpu_get_sdma_instance(struct

> amdgpu_ring *ring)

> >   #define amdgpu_ring_emit_rreg(r, d) (r)->funcs->emit_rreg((r), (d))

> >   #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d), (v))

> >   #define amdgpu_ring_emit_reg_wait(r, d, v, m)

> > (r)->funcs->emit_reg_wait((r), (d), (v), (m))

> > +#define amdgpu_ring_emit_reg_wait1(r, d0, d1, v, m)

> > +(r)->funcs->emit_reg_wait1((r), (d0), (d1), (v), (m))

> >   #define amdgpu_ring_emit_tmz(r, b) (r)->funcs->emit_tmz((r), (b))

> >   #define amdgpu_ring_pad_ib(r, ib) ((r)->funcs->pad_ib((r), (ib)))

> >   #define amdgpu_ring_init_cond_exec(r)

> > (r)->funcs->init_cond_exec((r)) diff --git

> > a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h

> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h

> > index 1d0d250..d85df5d 100644

> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h

> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h

> > @@ -152,6 +152,8 @@ struct amdgpu_ring_funcs {

> >   	void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t

> val);

> >   	void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg,

> >   			      uint32_t val, uint32_t mask);

> > +	void (*emit_reg_wait1)(struct amdgpu_ring *ring, uint32_t reg0,

> > +				uint32_t reg1, uint32_t val, uint32_t mask);

> >   	void (*emit_tmz)(struct amdgpu_ring *ring, bool start);

> >   	/* priority functions */

> >   	void (*set_priority) (struct amdgpu_ring *ring, diff --git

> > a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c

> > b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c

> > index 1ae3de1..509c9d2 100644

> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c

> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c

> > @@ -4078,6 +4078,13 @@ static void gfx_v9_0_ring_emit_reg_wait(struct

> amdgpu_ring *ring, uint32_t reg,

> >   	gfx_v9_0_wait_reg_mem(ring, 0, 0, 0, reg, 0, val, mask, 0x20);

> >   }

> >

> > +static void gfx_v9_0_ring_emit_reg_wait_compute(struct amdgpu_ring

> *ring,

> > +					uint32_t reg0, uint32_t reg1,

> > +					uint32_t val, uint32_t mask)

> > +{

> > +	gfx_v9_0_wait_reg_mem(ring, 0, 0, 1, reg0, reg1, val, mask, 0x20); }

> > +

> >   static void gfx_v9_0_set_gfx_eop_interrupt_state(struct amdgpu_device

> *adev,

> >   						 enum

> amdgpu_interrupt_state state)

> >   {

> > @@ -4415,7 +4422,7 @@ static const struct amdgpu_ring_funcs

> gfx_v9_0_ring_funcs_compute = {

> >   		7 + /* gfx_v9_0_ring_emit_hdp_flush */

> >   		5 + /* hdp invalidate */

> >   		7 + /* gfx_v9_0_ring_emit_pipeline_sync */

> > -		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +

> > +		(SOC15_FLUSH_GPU_TLB_NUM_WREG - 1) * 5 +

> >   		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +

> >   		2 + /* gfx_v9_0_ring_emit_vm_flush */

> >   		8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm

> fence

> > */ @@ -4433,6 +4440,7 @@ static const struct amdgpu_ring_funcs

> gfx_v9_0_ring_funcs_compute = {

> >   	.set_priority = gfx_v9_0_ring_set_priority_compute,

> >   	.emit_wreg = gfx_v9_0_ring_emit_wreg,

> >   	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,

> > +	.emit_reg_wait1 = gfx_v9_0_ring_emit_reg_wait_compute,

> >   };

> >

> >   static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_kiq = {

> > diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c

> > b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c

> > index e687363..968447d 100644

> > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c

> > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c

> > @@ -385,11 +385,19 @@ static uint64_t

> gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,

> >   	amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),

> >   			      upper_32_bits(pd_addr));

> >

> > -	amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);

> > -

> > -	/* wait for the invalidate to complete */

> > -	amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,

> > -				  1 << vmid, 1 << vmid);

> > +    /* The world switch cannot be allowed to occur while

> > +       some invalidation controller code is waiting for an ack.

> > +       To workaround the hardware restriction, replace the original

> > +       two command with one command for compute ring */

> > +	if (ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE &&

> amdgpu_sriov_vf(adev)) {

> > +		amdgpu_ring_emit_reg_wait1(ring, hub->vm_inv_eng0_req

> + eng,

> > +				   hub->vm_inv_eng0_ack + eng, req, 1 <<

> vmid);

> > +	} else {

> > +		amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng,

> req);

> > +		/* wait for the invalidate to complete */

> > +		amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack +

> eng,

> > +			   1 << vmid, 1 << vmid);

> > +	}

> >

> >   	return pd_addr;

> >   }
Hi Emily,

Am 27.03.2018 um 10:23 schrieb Deng, Emily:
> Hi Christian,
>      I think the root cause has been found out. It is the hardware restriction that couldn't do world switch between
> invalidate request, and ack. We couldn't control world switch in guest driver, but could replace the origin two command
> with one command to avoid world switch.

Unfortunately the problem was reproduced completely without world switch.

So the world switch is not the root cause, but only another trigger to 
the issue.

We need to find the root cause or otherwise will not fix the real 
problem but only mitigate the effects.

Regards,
Christian.

>
>      For the multiple times writing, it is also the same root cause, when the world switch happened between the  invalidate request, and ack,
> it will have problem. So we could give more chance for SRIOV to do the invalidate request to workaround the case that world switch happens
> between invalidate request, and ack
>
> Best Wishes,
> Emily Deng
>
>> -----Original Message-----
>> From: Christian König [mailto:ckoenig.leichtzumerken@gmail.com]
>> Sent: Tuesday, March 27, 2018 3:48 PM
>> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org
>> Cc: Liu, Monk <Monk.Liu@amd.com>
>> Subject: Re: [PATCH] drm/amdgpu: fix a kcq hang issue for SRIOV
>>
>> Am 27.03.2018 um 07:58 schrieb Emily Deng:
>>> issue:
>>> the vmflush in KCQ could be preempted (not like GFX ring which doesn't
>>> allow preemption in ring buffer) and this lead to vm flush fail when
>>> there is a world switch during the vm flush procedure (between write
>>> invalidate request and query invalidate ack)
>>>
>>> fix:
>>> separate vm flush for gfx and compute ring, and use the new format
>>> command in compute's vm flush which use only one package so no
>>> preemption could allowed
>> NAK, as already discussed multiple times now that only circumvents the
>> problem, but not really fixes it.
>>
>> Just executing the "amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req +
>> eng, req);" multiple times has the same effect and we need to figure out why.
>>
>> Regards,
>> Christian.
>>
>>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h      |  1 +
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  2 ++
>>>    drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 10 +++++++++-
>>>    drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    | 18 +++++++++++++-----
>>>    4 files changed, 25 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> index a7e2229..986659f 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> @@ -1790,6 +1790,7 @@ amdgpu_get_sdma_instance(struct
>> amdgpu_ring *ring)
>>>    #define amdgpu_ring_emit_rreg(r, d) (r)->funcs->emit_rreg((r), (d))
>>>    #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d), (v))
>>>    #define amdgpu_ring_emit_reg_wait(r, d, v, m)
>>> (r)->funcs->emit_reg_wait((r), (d), (v), (m))
>>> +#define amdgpu_ring_emit_reg_wait1(r, d0, d1, v, m)
>>> +(r)->funcs->emit_reg_wait1((r), (d0), (d1), (v), (m))
>>>    #define amdgpu_ring_emit_tmz(r, b) (r)->funcs->emit_tmz((r), (b))
>>>    #define amdgpu_ring_pad_ib(r, ib) ((r)->funcs->pad_ib((r), (ib)))
>>>    #define amdgpu_ring_init_cond_exec(r)
>>> (r)->funcs->init_cond_exec((r)) diff --git
>>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> index 1d0d250..d85df5d 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> @@ -152,6 +152,8 @@ struct amdgpu_ring_funcs {
>>>    	void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t
>> val);
>>>    	void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg,
>>>    			      uint32_t val, uint32_t mask);
>>> +	void (*emit_reg_wait1)(struct amdgpu_ring *ring, uint32_t reg0,
>>> +				uint32_t reg1, uint32_t val, uint32_t mask);
>>>    	void (*emit_tmz)(struct amdgpu_ring *ring, bool start);
>>>    	/* priority functions */
>>>    	void (*set_priority) (struct amdgpu_ring *ring, diff --git
>>> a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> index 1ae3de1..509c9d2 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> @@ -4078,6 +4078,13 @@ static void gfx_v9_0_ring_emit_reg_wait(struct
>> amdgpu_ring *ring, uint32_t reg,
>>>    	gfx_v9_0_wait_reg_mem(ring, 0, 0, 0, reg, 0, val, mask, 0x20);
>>>    }
>>>
>>> +static void gfx_v9_0_ring_emit_reg_wait_compute(struct amdgpu_ring
>> *ring,
>>> +					uint32_t reg0, uint32_t reg1,
>>> +					uint32_t val, uint32_t mask)
>>> +{
>>> +	gfx_v9_0_wait_reg_mem(ring, 0, 0, 1, reg0, reg1, val, mask, 0x20); }
>>> +
>>>    static void gfx_v9_0_set_gfx_eop_interrupt_state(struct amdgpu_device
>> *adev,
>>>    						 enum
>> amdgpu_interrupt_state state)
>>>    {
>>> @@ -4415,7 +4422,7 @@ static const struct amdgpu_ring_funcs
>> gfx_v9_0_ring_funcs_compute = {
>>>    		7 + /* gfx_v9_0_ring_emit_hdp_flush */
>>>    		5 + /* hdp invalidate */
>>>    		7 + /* gfx_v9_0_ring_emit_pipeline_sync */
>>> -		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>> +		(SOC15_FLUSH_GPU_TLB_NUM_WREG - 1) * 5 +
>>>    		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>    		2 + /* gfx_v9_0_ring_emit_vm_flush */
>>>    		8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm
>> fence
>>> */ @@ -4433,6 +4440,7 @@ static const struct amdgpu_ring_funcs
>> gfx_v9_0_ring_funcs_compute = {
>>>    	.set_priority = gfx_v9_0_ring_set_priority_compute,
>>>    	.emit_wreg = gfx_v9_0_ring_emit_wreg,
>>>    	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
>>> +	.emit_reg_wait1 = gfx_v9_0_ring_emit_reg_wait_compute,
>>>    };
>>>
>>>    static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_kiq = {
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> index e687363..968447d 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> @@ -385,11 +385,19 @@ static uint64_t
>> gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>>    	amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
>>>    			      upper_32_bits(pd_addr));
>>>
>>> -	amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>>> -
>>> -	/* wait for the invalidate to complete */
>>> -	amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
>>> -				  1 << vmid, 1 << vmid);
>>> +    /* The world switch cannot be allowed to occur while
>>> +       some invalidation controller code is waiting for an ack.
>>> +       To workaround the hardware restriction, replace the original
>>> +       two command with one command for compute ring */
>>> +	if (ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE &&
>> amdgpu_sriov_vf(adev)) {
>>> +		amdgpu_ring_emit_reg_wait1(ring, hub->vm_inv_eng0_req
>> + eng,
>>> +				   hub->vm_inv_eng0_ack + eng, req, 1 <<
>> vmid);
>>> +	} else {
>>> +		amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng,
>> req);
>>> +		/* wait for the invalidate to complete */
>>> +		amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack +
>> eng,
>>> +			   1 << vmid, 1 << vmid);
>>> +	}
>>>
>>>    	return pd_addr;
>>>    }
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Hi Christian,
     I only found the issue on SRIOV environment, does it also occurs on other cases, for example, thanks.

Best Wishes,
Emily Deng

> -----Original Message-----

> From: Christian König [mailto:ckoenig.leichtzumerken@gmail.com]

> Sent: Tuesday, March 27, 2018 4:27 PM

> To: Deng, Emily <Emily.Deng@amd.com>; Koenig, Christian

> <Christian.Koenig@amd.com>; amd-gfx@lists.freedesktop.org

> Cc: Liu, Monk <Monk.Liu@amd.com>

> Subject: Re: [PATCH] drm/amdgpu: fix a kcq hang issue for SRIOV

> 

> Hi Emily,

> 

> Am 27.03.2018 um 10:23 schrieb Deng, Emily:

> > Hi Christian,

> >      I think the root cause has been found out. It is the hardware

> > restriction that couldn't do world switch between invalidate request,

> > and ack. We couldn't control world switch in guest driver, but could replace

> the origin two command with one command to avoid world switch.

> 

> Unfortunately the problem was reproduced completely without world switch.

> 

> So the world switch is not the root cause, but only another trigger to the

> issue.

> 

> We need to find the root cause or otherwise will not fix the real problem but

> only mitigate the effects.

> 

> Regards,

> Christian.

> 

> >

> >      For the multiple times writing, it is also the same root cause,

> > when the world switch happened between the  invalidate request, and

> > ack, it will have problem. So we could give more chance for SRIOV to

> > do the invalidate request to workaround the case that world switch

> > happens between invalidate request, and ack

> >

> > Best Wishes,

> > Emily Deng

> >

> >> -----Original Message-----

> >> From: Christian König [mailto:ckoenig.leichtzumerken@gmail.com]

> >> Sent: Tuesday, March 27, 2018 3:48 PM

> >> To: Deng, Emily <Emily.Deng@amd.com>; amd-gfx@lists.freedesktop.org

> >> Cc: Liu, Monk <Monk.Liu@amd.com>

> >> Subject: Re: [PATCH] drm/amdgpu: fix a kcq hang issue for SRIOV

> >>

> >> Am 27.03.2018 um 07:58 schrieb Emily Deng:

> >>> issue:

> >>> the vmflush in KCQ could be preempted (not like GFX ring which

> >>> doesn't allow preemption in ring buffer) and this lead to vm flush

> >>> fail when there is a world switch during the vm flush procedure

> >>> (between write invalidate request and query invalidate ack)

> >>>

> >>> fix:

> >>> separate vm flush for gfx and compute ring, and use the new format

> >>> command in compute's vm flush which use only one package so no

> >>> preemption could allowed

> >> NAK, as already discussed multiple times now that only circumvents

> >> the problem, but not really fixes it.

> >>

> >> Just executing the "amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req

> >> + eng, req);" multiple times has the same effect and we need to figure out

> why.

> >>

> >> Regards,

> >> Christian.

> >>

> >>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>

> >>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>

> >>> ---

> >>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h      |  1 +

> >>>    drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  2 ++

> >>>    drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 10 +++++++++-

> >>>    drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    | 18 +++++++++++++-----

> >>>    4 files changed, 25 insertions(+), 6 deletions(-)

> >>>

> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h

> >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

> >>> index a7e2229..986659f 100644

> >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h

> >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

> >>> @@ -1790,6 +1790,7 @@ amdgpu_get_sdma_instance(struct

> >> amdgpu_ring *ring)

> >>>    #define amdgpu_ring_emit_rreg(r, d) (r)->funcs->emit_rreg((r), (d))

> >>>    #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d),

> (v))

> >>>    #define amdgpu_ring_emit_reg_wait(r, d, v, m)

> >>> (r)->funcs->emit_reg_wait((r), (d), (v), (m))

> >>> +#define amdgpu_ring_emit_reg_wait1(r, d0, d1, v, m)

> >>> +(r)->funcs->emit_reg_wait1((r), (d0), (d1), (v), (m))

> >>>    #define amdgpu_ring_emit_tmz(r, b) (r)->funcs->emit_tmz((r), (b))

> >>>    #define amdgpu_ring_pad_ib(r, ib) ((r)->funcs->pad_ib((r), (ib)))

> >>>    #define amdgpu_ring_init_cond_exec(r)

> >>> (r)->funcs->init_cond_exec((r)) diff --git

> >>> a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h

> >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h

> >>> index 1d0d250..d85df5d 100644

> >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h

> >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h

> >>> @@ -152,6 +152,8 @@ struct amdgpu_ring_funcs {

> >>>    	void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg,

> >>> uint32_t

> >> val);

> >>>    	void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg,

> >>>    			      uint32_t val, uint32_t mask);

> >>> +	void (*emit_reg_wait1)(struct amdgpu_ring *ring, uint32_t reg0,

> >>> +				uint32_t reg1, uint32_t val, uint32_t mask);

> >>>    	void (*emit_tmz)(struct amdgpu_ring *ring, bool start);

> >>>    	/* priority functions */

> >>>    	void (*set_priority) (struct amdgpu_ring *ring, diff --git

> >>> a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c

> >>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c

> >>> index 1ae3de1..509c9d2 100644

> >>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c

> >>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c

> >>> @@ -4078,6 +4078,13 @@ static void

> >>> gfx_v9_0_ring_emit_reg_wait(struct

> >> amdgpu_ring *ring, uint32_t reg,

> >>>    	gfx_v9_0_wait_reg_mem(ring, 0, 0, 0, reg, 0, val, mask, 0x20);

> >>>    }

> >>>

> >>> +static void gfx_v9_0_ring_emit_reg_wait_compute(struct amdgpu_ring

> >> *ring,

> >>> +					uint32_t reg0, uint32_t reg1,

> >>> +					uint32_t val, uint32_t mask)

> >>> +{

> >>> +	gfx_v9_0_wait_reg_mem(ring, 0, 0, 1, reg0, reg1, val, mask, 0x20);

> >>> +}

> >>> +

> >>>    static void gfx_v9_0_set_gfx_eop_interrupt_state(struct

> >>> amdgpu_device

> >> *adev,

> >>>    						 enum

> >> amdgpu_interrupt_state state)

> >>>    {

> >>> @@ -4415,7 +4422,7 @@ static const struct amdgpu_ring_funcs

> >> gfx_v9_0_ring_funcs_compute = {

> >>>    		7 + /* gfx_v9_0_ring_emit_hdp_flush */

> >>>    		5 + /* hdp invalidate */

> >>>    		7 + /* gfx_v9_0_ring_emit_pipeline_sync */

> >>> -		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +

> >>> +		(SOC15_FLUSH_GPU_TLB_NUM_WREG - 1) * 5 +

> >>>    		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +

> >>>    		2 + /* gfx_v9_0_ring_emit_vm_flush */

> >>>    		8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm

> >> fence

> >>> */ @@ -4433,6 +4440,7 @@ static const struct amdgpu_ring_funcs

> >> gfx_v9_0_ring_funcs_compute = {

> >>>    	.set_priority = gfx_v9_0_ring_set_priority_compute,

> >>>    	.emit_wreg = gfx_v9_0_ring_emit_wreg,

> >>>    	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,

> >>> +	.emit_reg_wait1 = gfx_v9_0_ring_emit_reg_wait_compute,

> >>>    };

> >>>

> >>>    static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_kiq = {

> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c

> >>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c

> >>> index e687363..968447d 100644

> >>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c

> >>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c

> >>> @@ -385,11 +385,19 @@ static uint64_t

> >> gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,

> >>>    	amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),

> >>>    			      upper_32_bits(pd_addr));

> >>>

> >>> -	amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);

> >>> -

> >>> -	/* wait for the invalidate to complete */

> >>> -	amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,

> >>> -				  1 << vmid, 1 << vmid);

> >>> +    /* The world switch cannot be allowed to occur while

> >>> +       some invalidation controller code is waiting for an ack.

> >>> +       To workaround the hardware restriction, replace the original

> >>> +       two command with one command for compute ring */

> >>> +	if (ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE &&

> >> amdgpu_sriov_vf(adev)) {

> >>> +		amdgpu_ring_emit_reg_wait1(ring, hub->vm_inv_eng0_req

> >> + eng,

> >>> +				   hub->vm_inv_eng0_ack + eng, req, 1 <<

> >> vmid);

> >>> +	} else {

> >>> +		amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng,

> >> req);

> >>> +		/* wait for the invalidate to complete */

> >>> +		amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack +

> >> eng,

> >>> +			   1 << vmid, 1 << vmid);

> >>> +	}

> >>>

> >>>    	return pd_addr;

> >>>    }

> > _______________________________________________

> > amd-gfx mailing list

> > amd-gfx@lists.freedesktop.org

> > https://lists.freedesktop.org/mailman/listinfo/amd-gfx
On Tue, Mar 27, 2018 at 1:58 AM, Emily Deng <Emily.Deng@amd.com> wrote:
> issue:
> the vmflush in KCQ could be preempted (not like GFX ring
> which doesn't allow preemption in ring buffer) and this lead
> to vm flush fail when there is a world switch during
> the vm flush procedure (between write invalidate request
> and query invalidate ack)
>
> fix:
> separate vm flush for gfx and compute ring, and use
> the new format command in compute's vm flush which
> use only one package so no preemption could allowed
>
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h      |  1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  2 ++
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 10 +++++++++-
>  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    | 18 +++++++++++++-----
>  4 files changed, 25 insertions(+), 6 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index a7e2229..986659f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1790,6 +1790,7 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
>  #define amdgpu_ring_emit_rreg(r, d) (r)->funcs->emit_rreg((r), (d))
>  #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d), (v))
>  #define amdgpu_ring_emit_reg_wait(r, d, v, m) (r)->funcs->emit_reg_wait((r), (d), (v), (m))
> +#define amdgpu_ring_emit_reg_wait1(r, d0, d1, v, m) (r)->funcs->emit_reg_wait1((r), (d0), (d1), (v), (m))
>  #define amdgpu_ring_emit_tmz(r, b) (r)->funcs->emit_tmz((r), (b))
>  #define amdgpu_ring_pad_ib(r, ib) ((r)->funcs->pad_ib((r), (ib)))
>  #define amdgpu_ring_init_cond_exec(r) (r)->funcs->init_cond_exec((r))
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index 1d0d250..d85df5d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -152,6 +152,8 @@ struct amdgpu_ring_funcs {
>         void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t val);
>         void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg,
>                               uint32_t val, uint32_t mask);
> +       void (*emit_reg_wait1)(struct amdgpu_ring *ring, uint32_t reg0,
> +                               uint32_t reg1, uint32_t val, uint32_t mask);

This is pretty ugly.  We don't need a new callback for this
workaround.  It will just confuse things for other IPs.  Just call
gfx_v9_0_wait_reg_mem() directly if you need it in
gmc_v9_0_emit_flush_gpu_tlb().

Alex

>         void (*emit_tmz)(struct amdgpu_ring *ring, bool start);
>         /* priority functions */
>         void (*set_priority) (struct amdgpu_ring *ring,
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 1ae3de1..509c9d2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -4078,6 +4078,13 @@ static void gfx_v9_0_ring_emit_reg_wait(struct amdgpu_ring *ring, uint32_t reg,
>         gfx_v9_0_wait_reg_mem(ring, 0, 0, 0, reg, 0, val, mask, 0x20);
>  }
>
> +static void gfx_v9_0_ring_emit_reg_wait_compute(struct amdgpu_ring *ring,
> +                                       uint32_t reg0, uint32_t reg1,
> +                                       uint32_t val, uint32_t mask)
> +{
> +       gfx_v9_0_wait_reg_mem(ring, 0, 0, 1, reg0, reg1, val, mask, 0x20);
> +}
> +
>  static void gfx_v9_0_set_gfx_eop_interrupt_state(struct amdgpu_device *adev,
>                                                  enum amdgpu_interrupt_state state)
>  {
> @@ -4415,7 +4422,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>                 7 + /* gfx_v9_0_ring_emit_hdp_flush */
>                 5 + /* hdp invalidate */
>                 7 + /* gfx_v9_0_ring_emit_pipeline_sync */
> -               SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> +               (SOC15_FLUSH_GPU_TLB_NUM_WREG - 1) * 5 +
>                 SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>                 2 + /* gfx_v9_0_ring_emit_vm_flush */
>                 8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
> @@ -4433,6 +4440,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>         .set_priority = gfx_v9_0_ring_set_priority_compute,
>         .emit_wreg = gfx_v9_0_ring_emit_wreg,
>         .emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
> +       .emit_reg_wait1 = gfx_v9_0_ring_emit_reg_wait_compute,
>  };
>
>  static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_kiq = {
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index e687363..968447d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -385,11 +385,19 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>         amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
>                               upper_32_bits(pd_addr));
>
> -       amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
> -
> -       /* wait for the invalidate to complete */
> -       amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
> -                                 1 << vmid, 1 << vmid);
> +    /* The world switch cannot be allowed to occur while
> +       some invalidation controller code is waiting for an ack.
> +       To workaround the hardware restriction, replace the original
> +       two command with one command for compute ring */
> +       if (ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE && amdgpu_sriov_vf(adev)) {
> +               amdgpu_ring_emit_reg_wait1(ring, hub->vm_inv_eng0_req + eng,
> +                                  hub->vm_inv_eng0_ack + eng, req, 1 << vmid);
> +       } else {
> +               amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
> +               /* wait for the invalidate to complete */
> +               amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
> +                          1 << vmid, 1 << vmid);
> +       }
>
>         return pd_addr;
>  }
> --
> 2.7.4
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
On Tue, Mar 27, 2018 at 11:15 AM, Alex Deucher <alexdeucher@gmail.com> wrote:
> On Tue, Mar 27, 2018 at 1:58 AM, Emily Deng <Emily.Deng@amd.com> wrote:
>> issue:
>> the vmflush in KCQ could be preempted (not like GFX ring
>> which doesn't allow preemption in ring buffer) and this lead
>> to vm flush fail when there is a world switch during
>> the vm flush procedure (between write invalidate request
>> and query invalidate ack)
>>
>> fix:
>> separate vm flush for gfx and compute ring, and use
>> the new format command in compute's vm flush which
>> use only one package so no preemption could allowed
>>
>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>> ---
>>  drivers/gpu/drm/amd/amdgpu/amdgpu.h      |  1 +
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  2 ++
>>  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 10 +++++++++-
>>  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    | 18 +++++++++++++-----
>>  4 files changed, 25 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index a7e2229..986659f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -1790,6 +1790,7 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
>>  #define amdgpu_ring_emit_rreg(r, d) (r)->funcs->emit_rreg((r), (d))
>>  #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d), (v))
>>  #define amdgpu_ring_emit_reg_wait(r, d, v, m) (r)->funcs->emit_reg_wait((r), (d), (v), (m))
>> +#define amdgpu_ring_emit_reg_wait1(r, d0, d1, v, m) (r)->funcs->emit_reg_wait1((r), (d0), (d1), (v), (m))
>>  #define amdgpu_ring_emit_tmz(r, b) (r)->funcs->emit_tmz((r), (b))
>>  #define amdgpu_ring_pad_ib(r, ib) ((r)->funcs->pad_ib((r), (ib)))
>>  #define amdgpu_ring_init_cond_exec(r) (r)->funcs->init_cond_exec((r))
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> index 1d0d250..d85df5d 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> @@ -152,6 +152,8 @@ struct amdgpu_ring_funcs {
>>         void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t val);
>>         void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg,
>>                               uint32_t val, uint32_t mask);
>> +       void (*emit_reg_wait1)(struct amdgpu_ring *ring, uint32_t reg0,
>> +                               uint32_t reg1, uint32_t val, uint32_t mask);
>
> This is pretty ugly.  We don't need a new callback for this
> workaround.  It will just confuse things for other IPs.  Just call
> gfx_v9_0_wait_reg_mem() directly if you need it in
> gmc_v9_0_emit_flush_gpu_tlb().
>
> Alex
>
>>         void (*emit_tmz)(struct amdgpu_ring *ring, bool start);
>>         /* priority functions */
>>         void (*set_priority) (struct amdgpu_ring *ring,
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> index 1ae3de1..509c9d2 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> @@ -4078,6 +4078,13 @@ static void gfx_v9_0_ring_emit_reg_wait(struct amdgpu_ring *ring, uint32_t reg,
>>         gfx_v9_0_wait_reg_mem(ring, 0, 0, 0, reg, 0, val, mask, 0x20);
>>  }
>>
>> +static void gfx_v9_0_ring_emit_reg_wait_compute(struct amdgpu_ring *ring,
>> +                                       uint32_t reg0, uint32_t reg1,
>> +                                       uint32_t val, uint32_t mask)
>> +{
>> +       gfx_v9_0_wait_reg_mem(ring, 0, 0, 1, reg0, reg1, val, mask, 0x20);
>> +}
>> +
>>  static void gfx_v9_0_set_gfx_eop_interrupt_state(struct amdgpu_device *adev,
>>                                                  enum amdgpu_interrupt_state state)
>>  {
>> @@ -4415,7 +4422,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>>                 7 + /* gfx_v9_0_ring_emit_hdp_flush */
>>                 5 + /* hdp invalidate */
>>                 7 + /* gfx_v9_0_ring_emit_pipeline_sync */
>> -               SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>> +               (SOC15_FLUSH_GPU_TLB_NUM_WREG - 1) * 5 +
>>                 SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>                 2 + /* gfx_v9_0_ring_emit_vm_flush */
>>                 8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
>> @@ -4433,6 +4440,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>>         .set_priority = gfx_v9_0_ring_set_priority_compute,
>>         .emit_wreg = gfx_v9_0_ring_emit_wreg,
>>         .emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
>> +       .emit_reg_wait1 = gfx_v9_0_ring_emit_reg_wait_compute,
>>  };
>>
>>  static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_kiq = {
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> index e687363..968447d 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> @@ -385,11 +385,19 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>         amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
>>                               upper_32_bits(pd_addr));
>>
>> -       amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>> -
>> -       /* wait for the invalidate to complete */
>> -       amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
>> -                                 1 << vmid, 1 << vmid);
>> +    /* The world switch cannot be allowed to occur while
>> +       some invalidation controller code is waiting for an ack.
>> +       To workaround the hardware restriction, replace the original
>> +       two command with one command for compute ring */
>> +       if (ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE && amdgpu_sriov_vf(adev)) {
>> +               amdgpu_ring_emit_reg_wait1(ring, hub->vm_inv_eng0_req + eng,
>> +                                  hub->vm_inv_eng0_ack + eng, req, 1 << vmid);

Also, it looks like this should be safe to do for all cases.  It
basically just moves from a two packet solution to a one packet
solution.

Alex


>> +       } else {
>> +               amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>> +               /* wait for the invalidate to complete */
>> +               amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
>> +                          1 << vmid, 1 << vmid);
>> +       }
>>
>>         return pd_addr;
>>  }
>> --
>> 2.7.4
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
On Tue, Mar 27, 2018 at 11:15 AM, Alex Deucher <alexdeucher@gmail.com> wrote:
> On Tue, Mar 27, 2018 at 1:58 AM, Emily Deng <Emily.Deng@amd.com> wrote:
>> issue:
>> the vmflush in KCQ could be preempted (not like GFX ring
>> which doesn't allow preemption in ring buffer) and this lead
>> to vm flush fail when there is a world switch during
>> the vm flush procedure (between write invalidate request
>> and query invalidate ack)
>>
>> fix:
>> separate vm flush for gfx and compute ring, and use
>> the new format command in compute's vm flush which
>> use only one package so no preemption could allowed
>>
>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>> ---
>>  drivers/gpu/drm/amd/amdgpu/amdgpu.h      |  1 +
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  2 ++
>>  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 10 +++++++++-
>>  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    | 18 +++++++++++++-----
>>  4 files changed, 25 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index a7e2229..986659f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -1790,6 +1790,7 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
>>  #define amdgpu_ring_emit_rreg(r, d) (r)->funcs->emit_rreg((r), (d))
>>  #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d), (v))
>>  #define amdgpu_ring_emit_reg_wait(r, d, v, m) (r)->funcs->emit_reg_wait((r), (d), (v), (m))
>> +#define amdgpu_ring_emit_reg_wait1(r, d0, d1, v, m) (r)->funcs->emit_reg_wait1((r), (d0), (d1), (v), (m))
>>  #define amdgpu_ring_emit_tmz(r, b) (r)->funcs->emit_tmz((r), (b))
>>  #define amdgpu_ring_pad_ib(r, ib) ((r)->funcs->pad_ib((r), (ib)))
>>  #define amdgpu_ring_init_cond_exec(r) (r)->funcs->init_cond_exec((r))
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> index 1d0d250..d85df5d 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>> @@ -152,6 +152,8 @@ struct amdgpu_ring_funcs {
>>         void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t val);
>>         void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg,
>>                               uint32_t val, uint32_t mask);
>> +       void (*emit_reg_wait1)(struct amdgpu_ring *ring, uint32_t reg0,
>> +                               uint32_t reg1, uint32_t val, uint32_t mask);
>
> This is pretty ugly.  We don't need a new callback for this
> workaround.  It will just confuse things for other IPs.  Just call
> gfx_v9_0_wait_reg_mem() directly if you need it in
> gmc_v9_0_emit_flush_gpu_tlb().

Nevermind, I misread the patch and thought the change was in the gfx9
module.  In this case, I'd rather split this into three patches.

1. add the new ring callback with a better name.  E.g., emit_reg_wait_oneshot()
2. add the new callback implementation to gfx9 and gfx8 (I think gfx8
will need this as well since we support sr-iov there too)
3. in the gmc8 and 9 code tlb_flush callbacks, check if the ring
supports the new callback and if so, use that, otherwise, fall back to
the existing code.


>
> Alex
>
>>         void (*emit_tmz)(struct amdgpu_ring *ring, bool start);
>>         /* priority functions */
>>         void (*set_priority) (struct amdgpu_ring *ring,
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> index 1ae3de1..509c9d2 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>> @@ -4078,6 +4078,13 @@ static void gfx_v9_0_ring_emit_reg_wait(struct amdgpu_ring *ring, uint32_t reg,
>>         gfx_v9_0_wait_reg_mem(ring, 0, 0, 0, reg, 0, val, mask, 0x20);
>>  }
>>
>> +static void gfx_v9_0_ring_emit_reg_wait_compute(struct amdgpu_ring *ring,
>> +                                       uint32_t reg0, uint32_t reg1,
>> +                                       uint32_t val, uint32_t mask)
>> +{
>> +       gfx_v9_0_wait_reg_mem(ring, 0, 0, 1, reg0, reg1, val, mask, 0x20);
>> +}
>> +
>>  static void gfx_v9_0_set_gfx_eop_interrupt_state(struct amdgpu_device *adev,
>>                                                  enum amdgpu_interrupt_state state)
>>  {
>> @@ -4415,7 +4422,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>>                 7 + /* gfx_v9_0_ring_emit_hdp_flush */
>>                 5 + /* hdp invalidate */
>>                 7 + /* gfx_v9_0_ring_emit_pipeline_sync */
>> -               SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>> +               (SOC15_FLUSH_GPU_TLB_NUM_WREG - 1) * 5 +
>>                 SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>                 2 + /* gfx_v9_0_ring_emit_vm_flush */
>>                 8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
>> @@ -4433,6 +4440,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>>         .set_priority = gfx_v9_0_ring_set_priority_compute,
>>         .emit_wreg = gfx_v9_0_ring_emit_wreg,
>>         .emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
>> +       .emit_reg_wait1 = gfx_v9_0_ring_emit_reg_wait_compute,
>>  };
>>
>>  static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_kiq = {
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> index e687363..968447d 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> @@ -385,11 +385,19 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>         amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
>>                               upper_32_bits(pd_addr));
>>
>> -       amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>> -
>> -       /* wait for the invalidate to complete */
>> -       amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
>> -                                 1 << vmid, 1 << vmid);
>> +    /* The world switch cannot be allowed to occur while
>> +       some invalidation controller code is waiting for an ack.
>> +       To workaround the hardware restriction, replace the original
>> +       two command with one command for compute ring */
>> +       if (ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE && amdgpu_sriov_vf(adev)) {

This will break gfx 8 with sr-iov.

Alex

>> +               amdgpu_ring_emit_reg_wait1(ring, hub->vm_inv_eng0_req + eng,
>> +                                  hub->vm_inv_eng0_ack + eng, req, 1 << vmid);
>> +       } else {
>> +               amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>> +               /* wait for the invalidate to complete */
>> +               amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
>> +                          1 << vmid, 1 << vmid);
>> +       }
>>
>>         return pd_addr;
>>  }
>> --
>> 2.7.4
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Am 27.03.2018 um 17:37 schrieb Alex Deucher:
> On Tue, Mar 27, 2018 at 11:15 AM, Alex Deucher <alexdeucher@gmail.com> wrote:
>> On Tue, Mar 27, 2018 at 1:58 AM, Emily Deng <Emily.Deng@amd.com> wrote:
>>> issue:
>>> the vmflush in KCQ could be preempted (not like GFX ring
>>> which doesn't allow preemption in ring buffer) and this lead
>>> to vm flush fail when there is a world switch during
>>> the vm flush procedure (between write invalidate request
>>> and query invalidate ack)
>>>
>>> fix:
>>> separate vm flush for gfx and compute ring, and use
>>> the new format command in compute's vm flush which
>>> use only one package so no preemption could allowed
>>>
>>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h      |  1 +
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  2 ++
>>>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 10 +++++++++-
>>>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    | 18 +++++++++++++-----
>>>   4 files changed, 25 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> index a7e2229..986659f 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> @@ -1790,6 +1790,7 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
>>>   #define amdgpu_ring_emit_rreg(r, d) (r)->funcs->emit_rreg((r), (d))
>>>   #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d), (v))
>>>   #define amdgpu_ring_emit_reg_wait(r, d, v, m) (r)->funcs->emit_reg_wait((r), (d), (v), (m))
>>> +#define amdgpu_ring_emit_reg_wait1(r, d0, d1, v, m) (r)->funcs->emit_reg_wait1((r), (d0), (d1), (v), (m))
>>>   #define amdgpu_ring_emit_tmz(r, b) (r)->funcs->emit_tmz((r), (b))
>>>   #define amdgpu_ring_pad_ib(r, ib) ((r)->funcs->pad_ib((r), (ib)))
>>>   #define amdgpu_ring_init_cond_exec(r) (r)->funcs->init_cond_exec((r))
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> index 1d0d250..d85df5d 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> @@ -152,6 +152,8 @@ struct amdgpu_ring_funcs {
>>>          void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg, uint32_t val);
>>>          void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg,
>>>                                uint32_t val, uint32_t mask);
>>> +       void (*emit_reg_wait1)(struct amdgpu_ring *ring, uint32_t reg0,
>>> +                               uint32_t reg1, uint32_t val, uint32_t mask);
>> This is pretty ugly.  We don't need a new callback for this
>> workaround.  It will just confuse things for other IPs.  Just call
>> gfx_v9_0_wait_reg_mem() directly if you need it in
>> gmc_v9_0_emit_flush_gpu_tlb().
> Nevermind, I misread the patch and thought the change was in the gfx9
> module.  In this case, I'd rather split this into three patches.
>
> 1. add the new ring callback with a better name.  E.g., emit_reg_wait_oneshot()

Maybe "emit_reg_write_wait()", but I still think that is an absolutely 
worse idea.

> 2. add the new callback implementation to gfx9 and gfx8 (I think gfx8
> will need this as well since we support sr-iov there too)

gfx8 doesn't have the hardware bug which seems to make this necessary, 
not does it have the same VMHUB design as gfx9.

> 3. in the gmc8 and 9 code tlb_flush callbacks, check if the ring
> supports the new callback and if so, use that, otherwise, fall back to
> the existing code.

Actually I would rather say we provide a stub in amdgpu_ring.c which 
just uses the separate write and wait callbacks.

Christian.

>
>
>> Alex
>>
>>>          void (*emit_tmz)(struct amdgpu_ring *ring, bool start);
>>>          /* priority functions */
>>>          void (*set_priority) (struct amdgpu_ring *ring,
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> index 1ae3de1..509c9d2 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> @@ -4078,6 +4078,13 @@ static void gfx_v9_0_ring_emit_reg_wait(struct amdgpu_ring *ring, uint32_t reg,
>>>          gfx_v9_0_wait_reg_mem(ring, 0, 0, 0, reg, 0, val, mask, 0x20);
>>>   }
>>>
>>> +static void gfx_v9_0_ring_emit_reg_wait_compute(struct amdgpu_ring *ring,
>>> +                                       uint32_t reg0, uint32_t reg1,
>>> +                                       uint32_t val, uint32_t mask)
>>> +{
>>> +       gfx_v9_0_wait_reg_mem(ring, 0, 0, 1, reg0, reg1, val, mask, 0x20);
>>> +}
>>> +
>>>   static void gfx_v9_0_set_gfx_eop_interrupt_state(struct amdgpu_device *adev,
>>>                                                   enum amdgpu_interrupt_state state)
>>>   {
>>> @@ -4415,7 +4422,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>>>                  7 + /* gfx_v9_0_ring_emit_hdp_flush */
>>>                  5 + /* hdp invalidate */
>>>                  7 + /* gfx_v9_0_ring_emit_pipeline_sync */
>>> -               SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>> +               (SOC15_FLUSH_GPU_TLB_NUM_WREG - 1) * 5 +
>>>                  SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>                  2 + /* gfx_v9_0_ring_emit_vm_flush */
>>>                  8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
>>> @@ -4433,6 +4440,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>>>          .set_priority = gfx_v9_0_ring_set_priority_compute,
>>>          .emit_wreg = gfx_v9_0_ring_emit_wreg,
>>>          .emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
>>> +       .emit_reg_wait1 = gfx_v9_0_ring_emit_reg_wait_compute,
>>>   };
>>>
>>>   static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_kiq = {
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> index e687363..968447d 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> @@ -385,11 +385,19 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>>          amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
>>>                                upper_32_bits(pd_addr));
>>>
>>> -       amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>>> -
>>> -       /* wait for the invalidate to complete */
>>> -       amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
>>> -                                 1 << vmid, 1 << vmid);
>>> +    /* The world switch cannot be allowed to occur while
>>> +       some invalidation controller code is waiting for an ack.
>>> +       To workaround the hardware restriction, replace the original
>>> +       two command with one command for compute ring */
>>> +       if (ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE && amdgpu_sriov_vf(adev)) {
> This will break gfx 8 with sr-iov.
>
> Alex
>
>>> +               amdgpu_ring_emit_reg_wait1(ring, hub->vm_inv_eng0_req + eng,
>>> +                                  hub->vm_inv_eng0_ack + eng, req, 1 << vmid);
>>> +       } else {
>>> +               amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>>> +               /* wait for the invalidate to complete */
>>> +               amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
>>> +                          1 << vmid, 1 << vmid);
>>> +       }
>>>
>>>          return pd_addr;
>>>   }
>>> --
>>> 2.7.4
>>>
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
On Tue, Mar 27, 2018 at 11:43 AM, Christian König
<ckoenig.leichtzumerken@gmail.com> wrote:
> Am 27.03.2018 um 17:37 schrieb Alex Deucher:
>>
>> On Tue, Mar 27, 2018 at 11:15 AM, Alex Deucher <alexdeucher@gmail.com>
>> wrote:
>>>
>>> On Tue, Mar 27, 2018 at 1:58 AM, Emily Deng <Emily.Deng@amd.com> wrote:
>>>>
>>>> issue:
>>>> the vmflush in KCQ could be preempted (not like GFX ring
>>>> which doesn't allow preemption in ring buffer) and this lead
>>>> to vm flush fail when there is a world switch during
>>>> the vm flush procedure (between write invalidate request
>>>> and query invalidate ack)
>>>>
>>>> fix:
>>>> separate vm flush for gfx and compute ring, and use
>>>> the new format command in compute's vm flush which
>>>> use only one package so no preemption could allowed
>>>>
>>>> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
>>>> Signed-off-by: Emily Deng <Emily.Deng@amd.com>
>>>> ---
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h      |  1 +
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  2 ++
>>>>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    | 10 +++++++++-
>>>>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c    | 18 +++++++++++++-----
>>>>   4 files changed, 25 insertions(+), 6 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> index a7e2229..986659f 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> @@ -1790,6 +1790,7 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring)
>>>>   #define amdgpu_ring_emit_rreg(r, d) (r)->funcs->emit_rreg((r), (d))
>>>>   #define amdgpu_ring_emit_wreg(r, d, v) (r)->funcs->emit_wreg((r), (d),
>>>> (v))
>>>>   #define amdgpu_ring_emit_reg_wait(r, d, v, m)
>>>> (r)->funcs->emit_reg_wait((r), (d), (v), (m))
>>>> +#define amdgpu_ring_emit_reg_wait1(r, d0, d1, v, m)
>>>> (r)->funcs->emit_reg_wait1((r), (d0), (d1), (v), (m))
>>>>   #define amdgpu_ring_emit_tmz(r, b) (r)->funcs->emit_tmz((r), (b))
>>>>   #define amdgpu_ring_pad_ib(r, ib) ((r)->funcs->pad_ib((r), (ib)))
>>>>   #define amdgpu_ring_init_cond_exec(r) (r)->funcs->init_cond_exec((r))
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>> index 1d0d250..d85df5d 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>>> @@ -152,6 +152,8 @@ struct amdgpu_ring_funcs {
>>>>          void (*emit_wreg)(struct amdgpu_ring *ring, uint32_t reg,
>>>> uint32_t val);
>>>>          void (*emit_reg_wait)(struct amdgpu_ring *ring, uint32_t reg,
>>>>                                uint32_t val, uint32_t mask);
>>>> +       void (*emit_reg_wait1)(struct amdgpu_ring *ring, uint32_t reg0,
>>>> +                               uint32_t reg1, uint32_t val, uint32_t
>>>> mask);
>>>
>>> This is pretty ugly.  We don't need a new callback for this
>>> workaround.  It will just confuse things for other IPs.  Just call
>>> gfx_v9_0_wait_reg_mem() directly if you need it in
>>> gmc_v9_0_emit_flush_gpu_tlb().
>>
>> Nevermind, I misread the patch and thought the change was in the gfx9
>> module.  In this case, I'd rather split this into three patches.
>>
>> 1. add the new ring callback with a better name.  E.g.,
>> emit_reg_wait_oneshot()
>
>
> Maybe "emit_reg_write_wait()", but I still think that is an absolutely worse
> idea.

Well apparently the issue is that world switches can't happen between
the req and the ack, so this would consolidate the operation to one
packet which should avoid that issue.

>
>> 2. add the new callback implementation to gfx9 and gfx8 (I think gfx8
>> will need this as well since we support sr-iov there too)
>
>
> gfx8 doesn't have the hardware bug which seems to make this necessary, not
> does it have the same VMHUB design as gfx9.

Oh, right, in this case it's the req/ack engines which were new for
soc15.  We may want the same fix for sdma4 though.

>
>> 3. in the gmc8 and 9 code tlb_flush callbacks, check if the ring
>> supports the new callback and if so, use that, otherwise, fall back to
>> the existing code.
>
>
> Actually I would rather say we provide a stub in amdgpu_ring.c which just
> uses the separate write and wait callbacks.

yeah, that may be cleaner.

Alex

>
> Christian.
>
>
>>
>>
>>> Alex
>>>
>>>>          void (*emit_tmz)(struct amdgpu_ring *ring, bool start);
>>>>          /* priority functions */
>>>>          void (*set_priority) (struct amdgpu_ring *ring,
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>> index 1ae3de1..509c9d2 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>>> @@ -4078,6 +4078,13 @@ static void gfx_v9_0_ring_emit_reg_wait(struct
>>>> amdgpu_ring *ring, uint32_t reg,
>>>>          gfx_v9_0_wait_reg_mem(ring, 0, 0, 0, reg, 0, val, mask, 0x20);
>>>>   }
>>>>
>>>> +static void gfx_v9_0_ring_emit_reg_wait_compute(struct amdgpu_ring
>>>> *ring,
>>>> +                                       uint32_t reg0, uint32_t reg1,
>>>> +                                       uint32_t val, uint32_t mask)
>>>> +{
>>>> +       gfx_v9_0_wait_reg_mem(ring, 0, 0, 1, reg0, reg1, val, mask,
>>>> 0x20);
>>>> +}
>>>> +
>>>>   static void gfx_v9_0_set_gfx_eop_interrupt_state(struct amdgpu_device
>>>> *adev,
>>>>                                                   enum
>>>> amdgpu_interrupt_state state)
>>>>   {
>>>> @@ -4415,7 +4422,7 @@ static const struct amdgpu_ring_funcs
>>>> gfx_v9_0_ring_funcs_compute = {
>>>>                  7 + /* gfx_v9_0_ring_emit_hdp_flush */
>>>>                  5 + /* hdp invalidate */
>>>>                  7 + /* gfx_v9_0_ring_emit_pipeline_sync */
>>>> -               SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>>> +               (SOC15_FLUSH_GPU_TLB_NUM_WREG - 1) * 5 +
>>>>                  SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>>>                  2 + /* gfx_v9_0_ring_emit_vm_flush */
>>>>                  8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user
>>>> fence, vm fence */
>>>> @@ -4433,6 +4440,7 @@ static const struct amdgpu_ring_funcs
>>>> gfx_v9_0_ring_funcs_compute = {
>>>>          .set_priority = gfx_v9_0_ring_set_priority_compute,
>>>>          .emit_wreg = gfx_v9_0_ring_emit_wreg,
>>>>          .emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
>>>> +       .emit_reg_wait1 = gfx_v9_0_ring_emit_reg_wait_compute,
>>>>   };
>>>>
>>>>   static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_kiq = {
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>> index e687363..968447d 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>> @@ -385,11 +385,19 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct
>>>> amdgpu_ring *ring,
>>>>          amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 *
>>>> vmid),
>>>>                                upper_32_bits(pd_addr));
>>>>
>>>> -       amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>>>> -
>>>> -       /* wait for the invalidate to complete */
>>>> -       amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
>>>> -                                 1 << vmid, 1 << vmid);
>>>> +    /* The world switch cannot be allowed to occur while
>>>> +       some invalidation controller code is waiting for an ack.
>>>> +       To workaround the hardware restriction, replace the original
>>>> +       two command with one command for compute ring */
>>>> +       if (ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE &&
>>>> amdgpu_sriov_vf(adev)) {
>>
>> This will break gfx 8 with sr-iov.
>>
>> Alex
>>
>>>> +               amdgpu_ring_emit_reg_wait1(ring, hub->vm_inv_eng0_req +
>>>> eng,
>>>> +                                  hub->vm_inv_eng0_ack + eng, req, 1 <<
>>>> vmid);
>>>> +       } else {
>>>> +               amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng,
>>>> req);
>>>> +               /* wait for the invalidate to complete */
>>>> +               amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack +
>>>> eng,
>>>> +                          1 << vmid, 1 << vmid);
>>>> +       }
>>>>
>>>>          return pd_addr;
>>>>   }
>>>> --
>>>> 2.7.4
>>>>
>>>> _______________________________________________
>>>> amd-gfx mailing list
>>>> amd-gfx@lists.freedesktop.org
>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>
>
Am 27.03.2018 um 17:52 schrieb Alex Deucher:
> [SNIP]
>>> 2. add the new callback implementation to gfx9 and gfx8 (I think gfx8
>>> will need this as well since we support sr-iov there too)
>>
>> gfx8 doesn't have the hardware bug which seems to make this necessary, not
>> does it have the same VMHUB design as gfx9.
> Oh, right, in this case it's the req/ack engines which were new for
> soc15.  We may want the same fix for sdma4 though.

And exactly that is one of the reasons why this workaround doesn't work 
correctly.

The SDMA is not directly connected to the GFXHUB, so even if the SDMA 
would provide a single command for this the write/wait would still be 
executed as two operations.

In other words we can again run into the problem and the same thing 
applies for CPU based updates.

The only real workaround would be to write the request, read the 
register back and if the write didn't succeeded write it again.

But seriously remember that this issue is not limited to the VMHUB 
registers. Do you want to write and read back every register to make 
sure the write succeeded?

Regards,
Christian.
On Tue, Mar 27, 2018 at 12:30 PM, Christian König
<christian.koenig@amd.com> wrote:
> Am 27.03.2018 um 17:52 schrieb Alex Deucher:
>>
>> [SNIP]
>>>>
>>>> 2. add the new callback implementation to gfx9 and gfx8 (I think gfx8
>>>> will need this as well since we support sr-iov there too)
>>>
>>>
>>> gfx8 doesn't have the hardware bug which seems to make this necessary,
>>> not
>>> does it have the same VMHUB design as gfx9.
>>
>> Oh, right, in this case it's the req/ack engines which were new for
>> soc15.  We may want the same fix for sdma4 though.
>
>
> And exactly that is one of the reasons why this workaround doesn't work
> correctly.
>
> The SDMA is not directly connected to the GFXHUB, so even if the SDMA would
> provide a single command for this the write/wait would still be executed as
> two operations.

I'm not sure I follow.  I think there are two issues: the hw bug you
are referring to and the SR-IOV requirement that the req and the ack
can't be split by a world switch.  I believe the world switch happens
at at least packet granularity so I think for the SR-IOV requirement
using a single packet should handle it.

>
> In other words we can again run into the problem and the same thing applies
> for CPU based updates.

yeah, CPU based updates could indeed be an issue for the SR-IOV
requirement, but in that case it's easier to read back and retry.

Alex


>
> The only real workaround would be to write the request, read the register
> back and if the write didn't succeeded write it again.
>
> But seriously remember that this issue is not limited to the VMHUB
> registers. Do you want to write and read back every register to make sure
> the write succeeded?
>
> Regards,
> Christian.
Am 27.03.2018 um 18:56 schrieb Alex Deucher:
> On Tue, Mar 27, 2018 at 12:30 PM, Christian König
> <christian.koenig@amd.com> wrote:
>> Am 27.03.2018 um 17:52 schrieb Alex Deucher:
>>> [SNIP]
>>>>> 2. add the new callback implementation to gfx9 and gfx8 (I think gfx8
>>>>> will need this as well since we support sr-iov there too)
>>>>
>>>> gfx8 doesn't have the hardware bug which seems to make this necessary,
>>>> not
>>>> does it have the same VMHUB design as gfx9.
>>> Oh, right, in this case it's the req/ack engines which were new for
>>> soc15.  We may want the same fix for sdma4 though.
>>
>> And exactly that is one of the reasons why this workaround doesn't work
>> correctly.
>>
>> The SDMA is not directly connected to the GFXHUB, so even if the SDMA would
>> provide a single command for this the write/wait would still be executed as
>> two operations.
> I'm not sure I follow.  I think there are two issues: the hw bug you
> are referring to and the SR-IOV requirement that the req and the ack
> can't be split by a world switch.  I believe the world switch happens
> at at least packet granularity so I think for the SR-IOV requirement
> using a single packet should handle it.

The problem is to me it looks like there is no SR-IOV requirement to not 
split the req and ack. The hardware is duplicated per VF and I suggested 
to Emily to test my multiple write workaround.

Since I didn't heard back I strongly assume that this worked as well and 
that can only mean that we are indeed running into the same hw issue again.

That in turn means that not only the GFXHUB is affect, but ANY (GRBM?) 
register write could be silently dropped. I can't imagine how we want to 
build a stable driver around this.

I unfortunately can't reliable reproduce the issue on bare metal any 
more. It would probably best if we could setup a call with some of the 
hardware guys to come up with a plan to narrow down this issue further.

Regards,
Christian.

>
>> In other words we can again run into the problem and the same thing applies
>> for CPU based updates.
> yeah, CPU based updates could indeed be an issue for the SR-IOV
> requirement, but in that case it's easier to read back and retry.
>
> Alex
>
>
>> The only real workaround would be to write the request, read the register
>> back and if the write didn't succeeded write it again.
>>
>> But seriously remember that this issue is not limited to the VMHUB
>> registers. Do you want to write and read back every register to make sure
>> the write succeeded?
>>
>> Regards,
>> Christian.
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Thanks you all

Let's keep pushing on hardware team ... hope MC or RLCV side could discover something interesting 

/Monk

-----Original Message-----
From: Christian König [mailto:ckoenig.leichtzumerken@gmail.com] 

Sent: 2018年3月28日 1:18
To: Alex Deucher <alexdeucher@gmail.com>; Koenig, Christian <Christian.Koenig@amd.com>
Cc: Deng, Emily <Emily.Deng@amd.com>; amd-gfx list <amd-gfx@lists.freedesktop.org>; Liu, Monk <Monk.Liu@amd.com>
Subject: Re: [PATCH] drm/amdgpu: fix a kcq hang issue for SRIOV

Am 27.03.2018 um 18:56 schrieb Alex Deucher:
> On Tue, Mar 27, 2018 at 12:30 PM, Christian König 

> <christian.koenig@amd.com> wrote:

>> Am 27.03.2018 um 17:52 schrieb Alex Deucher:

>>> [SNIP]

>>>>> 2. add the new callback implementation to gfx9 and gfx8 (I think 

>>>>> gfx8 will need this as well since we support sr-iov there too)

>>>>

>>>> gfx8 doesn't have the hardware bug which seems to make this 

>>>> necessary, not does it have the same VMHUB design as gfx9.

>>> Oh, right, in this case it's the req/ack engines which were new for 

>>> soc15.  We may want the same fix for sdma4 though.

>>

>> And exactly that is one of the reasons why this workaround doesn't 

>> work correctly.

>>

>> The SDMA is not directly connected to the GFXHUB, so even if the SDMA 

>> would provide a single command for this the write/wait would still be 

>> executed as two operations.

> I'm not sure I follow.  I think there are two issues: the hw bug you 

> are referring to and the SR-IOV requirement that the req and the ack 

> can't be split by a world switch.  I believe the world switch happens 

> at at least packet granularity so I think for the SR-IOV requirement 

> using a single packet should handle it.


The problem is to me it looks like there is no SR-IOV requirement to not split the req and ack. The hardware is duplicated per VF and I suggested to Emily to test my multiple write workaround.

Since I didn't heard back I strongly assume that this worked as well and that can only mean that we are indeed running into the same hw issue again.

That in turn means that not only the GFXHUB is affect, but ANY (GRBM?) register write could be silently dropped. I can't imagine how we want to build a stable driver around this.

I unfortunately can't reliable reproduce the issue on bare metal any more. It would probably best if we could setup a call with some of the hardware guys to come up with a plan to narrow down this issue further.

Regards,
Christian.

>

>> In other words we can again run into the problem and the same thing 

>> applies for CPU based updates.

> yeah, CPU based updates could indeed be an issue for the SR-IOV 

> requirement, but in that case it's easier to read back and retry.

>

> Alex

>

>

>> The only real workaround would be to write the request, read the 

>> register back and if the write didn't succeeded write it again.

>>

>> But seriously remember that this issue is not limited to the VMHUB 

>> registers. Do you want to write and read back every register to make 

>> sure the write succeeded?

>>

>> Regards,

>> Christian.

> _______________________________________________

> amd-gfx mailing list

> amd-gfx@lists.freedesktop.org

> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> 

The SDMA is not directly connected to the GFXHUB, so even if the SDMA would provide a single command for this the write/wait would still be executed as two operations.

I don't understand this point, more details may be ??

For SDMA from v148 ucode, it'll ignore PREEMPT command when it is doing SRBM_WRITE and POLL_MEM_REG on registers, so as long as SDMA is dong vm invalidate the world switch will not 
Interrupt it 

/Monk

-----Original Message-----
From: Koenig, Christian 

Sent: 2018年3月28日 0:30
To: Alex Deucher <alexdeucher@gmail.com>
Cc: Deng, Emily <Emily.Deng@amd.com>; Liu, Monk <Monk.Liu@amd.com>; amd-gfx list <amd-gfx@lists.freedesktop.org>
Subject: Re: [PATCH] drm/amdgpu: fix a kcq hang issue for SRIOV

Am 27.03.2018 um 17:52 schrieb Alex Deucher:
> [SNIP]

>>> 2. add the new callback implementation to gfx9 and gfx8 (I think 

>>> gfx8 will need this as well since we support sr-iov there too)

>>

>> gfx8 doesn't have the hardware bug which seems to make this 

>> necessary, not does it have the same VMHUB design as gfx9.

> Oh, right, in this case it's the req/ack engines which were new for 

> soc15.  We may want the same fix for sdma4 though.


And exactly that is one of the reasons why this workaround doesn't work correctly.

The SDMA is not directly connected to the GFXHUB, so even if the SDMA would provide a single command for this the write/wait would still be executed as two operations.

In other words we can again run into the problem and the same thing applies for CPU based updates.

The only real workaround would be to write the request, read the register back and if the write didn't succeeded write it again.

But seriously remember that this issue is not limited to the VMHUB registers. Do you want to write and read back every register to make sure the write succeeded?

Regards,
Christian.
Am 28.03.2018 um 06:36 schrieb Liu, Monk:
> The SDMA is not directly connected to the GFXHUB, so even if the SDMA would provide a single command for this the write/wait would still be executed as two operations.
>
> I don't understand this point, more details may be ??
>
> For SDMA from v148 ucode, it'll ignore PREEMPT command when it is doing SRBM_WRITE and POLL_MEM_REG on registers, so as long as SDMA is dong vm invalidate the world switch will not
> Interrupt it

Ah! Good to know, I was assuming that the GFX block might actually 
switch anyway in this case.

Christian.

>
> /Monk
>
> -----Original Message-----
> From: Koenig, Christian
> Sent: 2018年3月28日 0:30
> To: Alex Deucher <alexdeucher@gmail.com>
> Cc: Deng, Emily <Emily.Deng@amd.com>; Liu, Monk <Monk.Liu@amd.com>; amd-gfx list <amd-gfx@lists.freedesktop.org>
> Subject: Re: [PATCH] drm/amdgpu: fix a kcq hang issue for SRIOV
>
> Am 27.03.2018 um 17:52 schrieb Alex Deucher:
>> [SNIP]
>>>> 2. add the new callback implementation to gfx9 and gfx8 (I think
>>>> gfx8 will need this as well since we support sr-iov there too)
>>> gfx8 doesn't have the hardware bug which seems to make this
>>> necessary, not does it have the same VMHUB design as gfx9.
>> Oh, right, in this case it's the req/ack engines which were new for
>> soc15.  We may want the same fix for sdma4 though.
> And exactly that is one of the reasons why this workaround doesn't work correctly.
>
> The SDMA is not directly connected to the GFXHUB, so even if the SDMA would provide a single command for this the write/wait would still be executed as two operations.
>
> In other words we can again run into the problem and the same thing applies for CPU based updates.
>
> The only real workaround would be to write the request, read the register back and if the write didn't succeeded write it again.
>
> But seriously remember that this issue is not limited to the VMHUB registers. Do you want to write and read back every register to make sure the write succeeded?
>
> Regards,
> Christian.