drm/amdgpu: add a workaround for GDS ordered append hangs with compute queues

Submitted by Marek Olšák on Jan. 21, 2019, 11:46 p.m.

Details

Message ID 20190121234647.3995-1-maraeo@gmail.com
State New
Headers show
Series "drm/amdgpu: add a workaround for GDS ordered append hangs with compute queues" ( rev: 1 ) in AMD X.Org drivers

Not browsing as part of any series.

Commit Message

Marek Olšák Jan. 21, 2019, 11:46 p.m.
From: Marek Olšák <marek.olsak@amd.com>

I'm not increasing the DRM version because GDS isn't totally without bugs yet.

Signed-off-by: Marek Olšák <marek.olsak@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h |  2 ++
 drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c   | 17 ++++++++++++
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c   | 17 ++++++++++++
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c   | 36 +++++++++++++++++++++++++
 include/uapi/drm/amdgpu_drm.h           |  5 ++++
 5 files changed, 77 insertions(+)

Patch hide | download patch | download mbox

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
index ecbcefe49a98..f89f5734d985 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
@@ -30,20 +30,22 @@  struct amdgpu_bo;
 struct amdgpu_gds_asic_info {
 	uint32_t	total_size;
 	uint32_t	gfx_partition_size;
 	uint32_t	cs_partition_size;
 };
 
 struct amdgpu_gds {
 	struct amdgpu_gds_asic_info	mem;
 	struct amdgpu_gds_asic_info	gws;
 	struct amdgpu_gds_asic_info	oa;
+	uint32_t			gds_compute_max_wave_id;
+
 	/* At present, GDS, GWS and OA resources for gfx (graphics)
 	 * is always pre-allocated and available for graphics operation.
 	 * Such resource is shared between all gfx clients.
 	 * TODO: move this operation to user space
 	 * */
 	struct amdgpu_bo*		gds_gfx_bo;
 	struct amdgpu_bo*		gws_gfx_bo;
 	struct amdgpu_bo*		oa_gfx_bo;
 };
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
index 7984292f9282..d971ea914755 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
@@ -2257,20 +2257,36 @@  static void gfx_v7_0_ring_emit_ib_gfx(struct amdgpu_ring *ring,
 }
 
 static void gfx_v7_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
 					  struct amdgpu_job *job,
 					  struct amdgpu_ib *ib,
 					  uint32_t flags)
 {
 	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
 	u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24);
 
+	/* Currently, there is a high possibility to get wave ID mismatch
+	 * between ME and GDS, leading to a hw deadlock, because ME generates
+	 * different wave IDs than the GDS expects. This situation happens
+	 * randomly when at least 5 compute pipes use GDS ordered append.
+	 * The wave IDs generated by ME are also wrong after suspend/resume.
+	 * Those are probably bugs somewhere else in the kernel driver.
+	 *
+	 * Writing GDS_COMPUTE_MAX_WAVE_ID resets wave ID counters in ME and
+	 * GDS to 0 for this ring (me/pipe).
+	 */
+	if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) {
+		amdgpu_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, 1));
+		amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID - PACKET3_SET_CONFIG_REG_START);
+		amdgpu_ring_write(ring, ring->adev->gds.gds_compute_max_wave_id);
+	}
+
 	amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2));
 	amdgpu_ring_write(ring,
 #ifdef __BIG_ENDIAN
 					  (2 << 0) |
 #endif
 					  (ib->gpu_addr & 0xFFFFFFFC));
 	amdgpu_ring_write(ring, upper_32_bits(ib->gpu_addr) & 0xFFFF);
 	amdgpu_ring_write(ring, control);
 }
 
@@ -5050,20 +5066,21 @@  static void gfx_v7_0_set_irq_funcs(struct amdgpu_device *adev)
 	adev->gfx.priv_inst_irq.num_types = 1;
 	adev->gfx.priv_inst_irq.funcs = &gfx_v7_0_priv_inst_irq_funcs;
 }
 
 static void gfx_v7_0_set_gds_init(struct amdgpu_device *adev)
 {
 	/* init asci gds info */
 	adev->gds.mem.total_size = RREG32(mmGDS_VMID0_SIZE);
 	adev->gds.gws.total_size = 64;
 	adev->gds.oa.total_size = 16;
+	adev->gds.gds_compute_max_wave_id = RREG32(mmGDS_COMPUTE_MAX_WAVE_ID);
 
 	if (adev->gds.mem.total_size == 64 * 1024) {
 		adev->gds.mem.gfx_partition_size = 4096;
 		adev->gds.mem.cs_partition_size = 4096;
 
 		adev->gds.gws.gfx_partition_size = 4;
 		adev->gds.gws.cs_partition_size = 4;
 
 		adev->gds.oa.gfx_partition_size = 4;
 		adev->gds.oa.cs_partition_size = 1;
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index a26747681ed6..dcdae74fc0e1 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -6077,20 +6077,36 @@  static void gfx_v8_0_ring_emit_ib_gfx(struct amdgpu_ring *ring,
 }
 
 static void gfx_v8_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
 					  struct amdgpu_job *job,
 					  struct amdgpu_ib *ib,
 					  uint32_t flags)
 {
 	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
 	u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24);
 
+	/* Currently, there is a high possibility to get wave ID mismatch
+	 * between ME and GDS, leading to a hw deadlock, because ME generates
+	 * different wave IDs than the GDS expects. This situation happens
+	 * randomly when at least 5 compute pipes use GDS ordered append.
+	 * The wave IDs generated by ME are also wrong after suspend/resume.
+	 * Those are probably bugs somewhere else in the kernel driver.
+	 *
+	 * Writing GDS_COMPUTE_MAX_WAVE_ID resets wave ID counters in ME and
+	 * GDS to 0 for this ring (me/pipe).
+	 */
+	if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) {
+		amdgpu_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, 1));
+		amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID - PACKET3_SET_CONFIG_REG_START);
+		amdgpu_ring_write(ring, ring->adev->gds.gds_compute_max_wave_id);
+	}
+
 	amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2));
 	amdgpu_ring_write(ring,
 #ifdef __BIG_ENDIAN
 				(2 << 0) |
 #endif
 				(ib->gpu_addr & 0xFFFFFFFC));
 	amdgpu_ring_write(ring, upper_32_bits(ib->gpu_addr) & 0xFFFF);
 	amdgpu_ring_write(ring, control);
 }
 
@@ -6989,20 +7005,21 @@  static void gfx_v8_0_set_rlc_funcs(struct amdgpu_device *adev)
 {
 	adev->gfx.rlc.funcs = &iceland_rlc_funcs;
 }
 
 static void gfx_v8_0_set_gds_init(struct amdgpu_device *adev)
 {
 	/* init asci gds info */
 	adev->gds.mem.total_size = RREG32(mmGDS_VMID0_SIZE);
 	adev->gds.gws.total_size = 64;
 	adev->gds.oa.total_size = 16;
+	adev->gds.gds_compute_max_wave_id = RREG32(mmGDS_COMPUTE_MAX_WAVE_ID);
 
 	if (adev->gds.mem.total_size == 64 * 1024) {
 		adev->gds.mem.gfx_partition_size = 4096;
 		adev->gds.mem.cs_partition_size = 4096;
 
 		adev->gds.gws.gfx_partition_size = 4;
 		adev->gds.gws.cs_partition_size = 4;
 
 		adev->gds.oa.gfx_partition_size = 4;
 		adev->gds.oa.cs_partition_size = 1;
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 262ee3cf6f1c..63b898fc0467 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -4003,20 +4003,36 @@  static void gfx_v9_0_ring_emit_ib_gfx(struct amdgpu_ring *ring,
 }
 
 static void gfx_v9_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
 					  struct amdgpu_job *job,
 					  struct amdgpu_ib *ib,
 					  uint32_t flags)
 {
 	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
 	u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24);
 
+	/* Currently, there is a high possibility to get wave ID mismatch
+	 * between ME and GDS, leading to a hw deadlock, because ME generates
+	 * different wave IDs than the GDS expects. This situation happens
+	 * randomly when at least 5 compute pipes use GDS ordered append.
+	 * The wave IDs generated by ME are also wrong after suspend/resume.
+	 * Those are probably bugs somewhere else in the kernel driver.
+	 *
+	 * Writing GDS_COMPUTE_MAX_WAVE_ID resets wave ID counters in ME and
+	 * GDS to 0 for this ring (me/pipe).
+	 */
+	if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) {
+		amdgpu_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, 1));
+		amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID);
+		amdgpu_ring_write(ring, ring->adev->gds.gds_compute_max_wave_id);
+	}
+
 	amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2));
 	BUG_ON(ib->gpu_addr & 0x3); /* Dword align */
 	amdgpu_ring_write(ring,
 #ifdef __BIG_ENDIAN
 				(2 << 0) |
 #endif
 				lower_32_bits(ib->gpu_addr));
 	amdgpu_ring_write(ring, upper_32_bits(ib->gpu_addr));
 	amdgpu_ring_write(ring, control);
 }
@@ -4839,20 +4855,40 @@  static void gfx_v9_0_set_gds_init(struct amdgpu_device *adev)
 		adev->gds.mem.total_size = 0x10000;
 		break;
 	case CHIP_RAVEN:
 		adev->gds.mem.total_size = 0x1000;
 		break;
 	default:
 		adev->gds.mem.total_size = 0x10000;
 		break;
 	}
 
+	switch (adev->asic_type) {
+	case CHIP_VEGA10:
+	case CHIP_VEGA20:
+		adev->gds.gds_compute_max_wave_id = 0x7ff;
+		break;
+	case CHIP_VEGA12:
+		adev->gds.gds_compute_max_wave_id = 0x27f;
+		break;
+	case CHIP_RAVEN:
+		if (adev->rev_id >= 0x8)
+			adev->gds.gds_compute_max_wave_id = 0x77; /* raven2 */
+		else
+			adev->gds.gds_compute_max_wave_id = 0x15f; /* raven1 */
+		break;
+	default:
+		/* this really depends on the chip */
+		adev->gds.gds_compute_max_wave_id = 0x7ff;
+		break;
+	}
+
 	adev->gds.gws.total_size = 64;
 	adev->gds.oa.total_size = 16;
 
 	if (adev->gds.mem.total_size == 64 * 1024) {
 		adev->gds.mem.gfx_partition_size = 4096;
 		adev->gds.mem.cs_partition_size = 4096;
 
 		adev->gds.gws.gfx_partition_size = 4;
 		adev->gds.gws.cs_partition_size = 4;
 
diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
index faaad04814e4..662d379ea624 100644
--- a/include/uapi/drm/amdgpu_drm.h
+++ b/include/uapi/drm/amdgpu_drm.h
@@ -561,20 +561,25 @@  union drm_amdgpu_cs {
 /* Preamble flag, which means the IB could be dropped if no context switch */
 #define AMDGPU_IB_FLAG_PREAMBLE (1<<1)
 
 /* Preempt flag, IB should set Pre_enb bit if PREEMPT flag detected */
 #define AMDGPU_IB_FLAG_PREEMPT (1<<2)
 
 /* The IB fence should do the L2 writeback but not invalidate any shader
  * caches (L2/vL1/sL1/I$). */
 #define AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE (1 << 3)
 
+/* Set GDS_COMPUTE_MAX_WAVE_ID = DEFAULT before PACKET3_INDIRECT_BUFFER.
+ * This will reset wave ID counters for the IB.
+ */
+#define AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID (1 << 4)
+
 struct drm_amdgpu_cs_chunk_ib {
 	__u32 _pad;
 	/** AMDGPU_IB_FLAG_* */
 	__u32 flags;
 	/** Virtual address to begin IB execution */
 	__u64 va_start;
 	/** Size of submission */
 	__u32 ib_bytes;
 	/** HW IP to submit to */
 	__u32 ip_type;

Comments

Am 22.01.19 um 00:46 schrieb Marek Olšák:
> From: Marek Olšák <marek.olsak@amd.com>
>
> I'm not increasing the DRM version because GDS isn't totally without bugs yet.

Looks mostly good on first glance.

But one thing that is certainly wrong: when you add any 
amdgpu_ring_write() call to the emit_ibs callback, you also need to 
update the estimate of how many DWs can be used by an IB.

Look out for the structure where the gfx_*_ring_emit_ib is used.

Regards,
Christian.

>
> Signed-off-by: Marek Olšák <marek.olsak@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h |  2 ++
>   drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c   | 17 ++++++++++++
>   drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c   | 17 ++++++++++++
>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c   | 36 +++++++++++++++++++++++++
>   include/uapi/drm/amdgpu_drm.h           |  5 ++++
>   5 files changed, 77 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
> index ecbcefe49a98..f89f5734d985 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
> @@ -30,20 +30,22 @@ struct amdgpu_bo;
>   struct amdgpu_gds_asic_info {
>   	uint32_t	total_size;
>   	uint32_t	gfx_partition_size;
>   	uint32_t	cs_partition_size;
>   };
>   
>   struct amdgpu_gds {
>   	struct amdgpu_gds_asic_info	mem;
>   	struct amdgpu_gds_asic_info	gws;
>   	struct amdgpu_gds_asic_info	oa;
> +	uint32_t			gds_compute_max_wave_id;
> +
>   	/* At present, GDS, GWS and OA resources for gfx (graphics)
>   	 * is always pre-allocated and available for graphics operation.
>   	 * Such resource is shared between all gfx clients.
>   	 * TODO: move this operation to user space
>   	 * */
>   	struct amdgpu_bo*		gds_gfx_bo;
>   	struct amdgpu_bo*		gws_gfx_bo;
>   	struct amdgpu_bo*		oa_gfx_bo;
>   };
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> index 7984292f9282..d971ea914755 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> @@ -2257,20 +2257,36 @@ static void gfx_v7_0_ring_emit_ib_gfx(struct amdgpu_ring *ring,
>   }
>   
>   static void gfx_v7_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
>   					  struct amdgpu_job *job,
>   					  struct amdgpu_ib *ib,
>   					  uint32_t flags)
>   {
>   	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>   	u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24);
>   
> +	/* Currently, there is a high possibility to get wave ID mismatch
> +	 * between ME and GDS, leading to a hw deadlock, because ME generates
> +	 * different wave IDs than the GDS expects. This situation happens
> +	 * randomly when at least 5 compute pipes use GDS ordered append.
> +	 * The wave IDs generated by ME are also wrong after suspend/resume.
> +	 * Those are probably bugs somewhere else in the kernel driver.
> +	 *
> +	 * Writing GDS_COMPUTE_MAX_WAVE_ID resets wave ID counters in ME and
> +	 * GDS to 0 for this ring (me/pipe).
> +	 */
> +	if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) {
> +		amdgpu_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, 1));
> +		amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID - PACKET3_SET_CONFIG_REG_START);
> +		amdgpu_ring_write(ring, ring->adev->gds.gds_compute_max_wave_id);
> +	}
> +
>   	amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2));
>   	amdgpu_ring_write(ring,
>   #ifdef __BIG_ENDIAN
>   					  (2 << 0) |
>   #endif
>   					  (ib->gpu_addr & 0xFFFFFFFC));
>   	amdgpu_ring_write(ring, upper_32_bits(ib->gpu_addr) & 0xFFFF);
>   	amdgpu_ring_write(ring, control);
>   }
>   
> @@ -5050,20 +5066,21 @@ static void gfx_v7_0_set_irq_funcs(struct amdgpu_device *adev)
>   	adev->gfx.priv_inst_irq.num_types = 1;
>   	adev->gfx.priv_inst_irq.funcs = &gfx_v7_0_priv_inst_irq_funcs;
>   }
>   
>   static void gfx_v7_0_set_gds_init(struct amdgpu_device *adev)
>   {
>   	/* init asci gds info */
>   	adev->gds.mem.total_size = RREG32(mmGDS_VMID0_SIZE);
>   	adev->gds.gws.total_size = 64;
>   	adev->gds.oa.total_size = 16;
> +	adev->gds.gds_compute_max_wave_id = RREG32(mmGDS_COMPUTE_MAX_WAVE_ID);
>   
>   	if (adev->gds.mem.total_size == 64 * 1024) {
>   		adev->gds.mem.gfx_partition_size = 4096;
>   		adev->gds.mem.cs_partition_size = 4096;
>   
>   		adev->gds.gws.gfx_partition_size = 4;
>   		adev->gds.gws.cs_partition_size = 4;
>   
>   		adev->gds.oa.gfx_partition_size = 4;
>   		adev->gds.oa.cs_partition_size = 1;
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> index a26747681ed6..dcdae74fc0e1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> @@ -6077,20 +6077,36 @@ static void gfx_v8_0_ring_emit_ib_gfx(struct amdgpu_ring *ring,
>   }
>   
>   static void gfx_v8_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
>   					  struct amdgpu_job *job,
>   					  struct amdgpu_ib *ib,
>   					  uint32_t flags)
>   {
>   	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>   	u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24);
>   
> +	/* Currently, there is a high possibility to get wave ID mismatch
> +	 * between ME and GDS, leading to a hw deadlock, because ME generates
> +	 * different wave IDs than the GDS expects. This situation happens
> +	 * randomly when at least 5 compute pipes use GDS ordered append.
> +	 * The wave IDs generated by ME are also wrong after suspend/resume.
> +	 * Those are probably bugs somewhere else in the kernel driver.
> +	 *
> +	 * Writing GDS_COMPUTE_MAX_WAVE_ID resets wave ID counters in ME and
> +	 * GDS to 0 for this ring (me/pipe).
> +	 */
> +	if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) {
> +		amdgpu_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, 1));
> +		amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID - PACKET3_SET_CONFIG_REG_START);
> +		amdgpu_ring_write(ring, ring->adev->gds.gds_compute_max_wave_id);
> +	}
> +
>   	amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2));
>   	amdgpu_ring_write(ring,
>   #ifdef __BIG_ENDIAN
>   				(2 << 0) |
>   #endif
>   				(ib->gpu_addr & 0xFFFFFFFC));
>   	amdgpu_ring_write(ring, upper_32_bits(ib->gpu_addr) & 0xFFFF);
>   	amdgpu_ring_write(ring, control);
>   }
>   
> @@ -6989,20 +7005,21 @@ static void gfx_v8_0_set_rlc_funcs(struct amdgpu_device *adev)
>   {
>   	adev->gfx.rlc.funcs = &iceland_rlc_funcs;
>   }
>   
>   static void gfx_v8_0_set_gds_init(struct amdgpu_device *adev)
>   {
>   	/* init asci gds info */
>   	adev->gds.mem.total_size = RREG32(mmGDS_VMID0_SIZE);
>   	adev->gds.gws.total_size = 64;
>   	adev->gds.oa.total_size = 16;
> +	adev->gds.gds_compute_max_wave_id = RREG32(mmGDS_COMPUTE_MAX_WAVE_ID);
>   
>   	if (adev->gds.mem.total_size == 64 * 1024) {
>   		adev->gds.mem.gfx_partition_size = 4096;
>   		adev->gds.mem.cs_partition_size = 4096;
>   
>   		adev->gds.gws.gfx_partition_size = 4;
>   		adev->gds.gws.cs_partition_size = 4;
>   
>   		adev->gds.oa.gfx_partition_size = 4;
>   		adev->gds.oa.cs_partition_size = 1;
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 262ee3cf6f1c..63b898fc0467 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -4003,20 +4003,36 @@ static void gfx_v9_0_ring_emit_ib_gfx(struct amdgpu_ring *ring,
>   }
>   
>   static void gfx_v9_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
>   					  struct amdgpu_job *job,
>   					  struct amdgpu_ib *ib,
>   					  uint32_t flags)
>   {
>   	unsigned vmid = AMDGPU_JOB_GET_VMID(job);
>   	u32 control = INDIRECT_BUFFER_VALID | ib->length_dw | (vmid << 24);
>   
> +	/* Currently, there is a high possibility to get wave ID mismatch
> +	 * between ME and GDS, leading to a hw deadlock, because ME generates
> +	 * different wave IDs than the GDS expects. This situation happens
> +	 * randomly when at least 5 compute pipes use GDS ordered append.
> +	 * The wave IDs generated by ME are also wrong after suspend/resume.
> +	 * Those are probably bugs somewhere else in the kernel driver.
> +	 *
> +	 * Writing GDS_COMPUTE_MAX_WAVE_ID resets wave ID counters in ME and
> +	 * GDS to 0 for this ring (me/pipe).
> +	 */
> +	if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) {
> +		amdgpu_ring_write(ring, PACKET3(PACKET3_SET_CONFIG_REG, 1));
> +		amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID);
> +		amdgpu_ring_write(ring, ring->adev->gds.gds_compute_max_wave_id);
> +	}
> +
>   	amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2));
>   	BUG_ON(ib->gpu_addr & 0x3); /* Dword align */
>   	amdgpu_ring_write(ring,
>   #ifdef __BIG_ENDIAN
>   				(2 << 0) |
>   #endif
>   				lower_32_bits(ib->gpu_addr));
>   	amdgpu_ring_write(ring, upper_32_bits(ib->gpu_addr));
>   	amdgpu_ring_write(ring, control);
>   }
> @@ -4839,20 +4855,40 @@ static void gfx_v9_0_set_gds_init(struct amdgpu_device *adev)
>   		adev->gds.mem.total_size = 0x10000;
>   		break;
>   	case CHIP_RAVEN:
>   		adev->gds.mem.total_size = 0x1000;
>   		break;
>   	default:
>   		adev->gds.mem.total_size = 0x10000;
>   		break;
>   	}
>   
> +	switch (adev->asic_type) {
> +	case CHIP_VEGA10:
> +	case CHIP_VEGA20:
> +		adev->gds.gds_compute_max_wave_id = 0x7ff;
> +		break;
> +	case CHIP_VEGA12:
> +		adev->gds.gds_compute_max_wave_id = 0x27f;
> +		break;
> +	case CHIP_RAVEN:
> +		if (adev->rev_id >= 0x8)
> +			adev->gds.gds_compute_max_wave_id = 0x77; /* raven2 */
> +		else
> +			adev->gds.gds_compute_max_wave_id = 0x15f; /* raven1 */
> +		break;
> +	default:
> +		/* this really depends on the chip */
> +		adev->gds.gds_compute_max_wave_id = 0x7ff;
> +		break;
> +	}
> +
>   	adev->gds.gws.total_size = 64;
>   	adev->gds.oa.total_size = 16;
>   
>   	if (adev->gds.mem.total_size == 64 * 1024) {
>   		adev->gds.mem.gfx_partition_size = 4096;
>   		adev->gds.mem.cs_partition_size = 4096;
>   
>   		adev->gds.gws.gfx_partition_size = 4;
>   		adev->gds.gws.cs_partition_size = 4;
>   
> diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
> index faaad04814e4..662d379ea624 100644
> --- a/include/uapi/drm/amdgpu_drm.h
> +++ b/include/uapi/drm/amdgpu_drm.h
> @@ -561,20 +561,25 @@ union drm_amdgpu_cs {
>   /* Preamble flag, which means the IB could be dropped if no context switch */
>   #define AMDGPU_IB_FLAG_PREAMBLE (1<<1)
>   
>   /* Preempt flag, IB should set Pre_enb bit if PREEMPT flag detected */
>   #define AMDGPU_IB_FLAG_PREEMPT (1<<2)
>   
>   /* The IB fence should do the L2 writeback but not invalidate any shader
>    * caches (L2/vL1/sL1/I$). */
>   #define AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE (1 << 3)
>   
> +/* Set GDS_COMPUTE_MAX_WAVE_ID = DEFAULT before PACKET3_INDIRECT_BUFFER.
> + * This will reset wave ID counters for the IB.
> + */
> +#define AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID (1 << 4)
> +
>   struct drm_amdgpu_cs_chunk_ib {
>   	__u32 _pad;
>   	/** AMDGPU_IB_FLAG_* */
>   	__u32 flags;
>   	/** Virtual address to begin IB execution */
>   	__u64 va_start;
>   	/** Size of submission */
>   	__u32 ib_bytes;
>   	/** HW IP to submit to */
>   	__u32 ip_type;