radeonsi: use CP DMA for the null const buffer clear on CIK

Submitted by Marek Olšák on April 12, 2019, 3:17 p.m.

Details

Message ID 20190412151719.24446-1-maraeo@gmail.com
State New
Headers show
Series "radeonsi: use CP DMA for the null const buffer clear on CIK" ( rev: 1 ) in Mesa

Not browsing as part of any series.

Commit Message

Marek Olšák April 12, 2019, 3:17 p.m.
From: Marek Olšák <marek.olsak@amd.com>

This is a workaround for a thread deadlock that I have no idea
why it occurs.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=108879
Fixes: 9b331e462e5021d994859756d46cd2519d9c9c6e
---
 src/gallium/drivers/radeonsi/si_clear.c        | 6 +++---
 src/gallium/drivers/radeonsi/si_compute_blit.c | 8 +++++---
 src/gallium/drivers/radeonsi/si_pipe.c         | 2 +-
 src/gallium/drivers/radeonsi/si_pipe.h         | 3 ++-
 src/gallium/drivers/radeonsi/si_test_dma.c     | 2 +-
 5 files changed, 12 insertions(+), 9 deletions(-)

Patch hide | download patch | download mbox

diff --git a/src/gallium/drivers/radeonsi/si_clear.c b/src/gallium/drivers/radeonsi/si_clear.c
index e1805f2a1c9..ead680b857b 100644
--- a/src/gallium/drivers/radeonsi/si_clear.c
+++ b/src/gallium/drivers/radeonsi/si_clear.c
@@ -256,21 +256,21 @@  void vi_dcc_clear_level(struct si_context *sctx,
 		 * would be more efficient than separate per-layer clear operations.
 		 */
 		assert(tex->buffer.b.b.nr_storage_samples <= 2 || num_layers == 1);
 
 		dcc_offset += tex->surface.u.legacy.level[level].dcc_offset;
 		clear_size = tex->surface.u.legacy.level[level].dcc_fast_clear_size *
 			     num_layers;
 	}
 
 	si_clear_buffer(sctx, dcc_buffer, dcc_offset, clear_size,
-			&clear_value, 4, SI_COHERENCY_CB_META);
+			&clear_value, 4, SI_COHERENCY_CB_META, false);
 }
 
 /* Set the same micro tile mode as the destination of the last MSAA resolve.
  * This allows hitting the MSAA resolve fast path, which requires that both
  * src and dst micro tile modes match.
  */
 static void si_set_optimal_micro_tile_mode(struct si_screen *sscreen,
 					   struct si_texture *tex)
 {
 	if (tex->buffer.b.is_shared ||
@@ -489,21 +489,21 @@  static void si_do_fast_color_clear(struct si_context *sctx,
 
 			/* DCC fast clear with MSAA should clear CMASK to 0xC. */
 			if (tex->buffer.b.b.nr_samples >= 2 && tex->cmask_buffer) {
 				/* TODO: This doesn't work with MSAA. */
 				if (eliminate_needed)
 					continue;
 
 				uint32_t clear_value = 0xCCCCCCCC;
 				si_clear_buffer(sctx, &tex->cmask_buffer->b.b,
 						tex->cmask_offset, tex->surface.cmask_size,
-						&clear_value, 4, SI_COHERENCY_CB_META);
+						&clear_value, 4, SI_COHERENCY_CB_META, false);
 				fmask_decompress_needed = true;
 			}
 
 			vi_dcc_clear_level(sctx, tex, 0, reset_value);
 			tex->separate_dcc_dirty = true;
 		} else {
 			if (too_small)
 				continue;
 
 			/* 128-bit formats are unusupported */
@@ -517,21 +517,21 @@  static void si_do_fast_color_clear(struct si_context *sctx,
 
 			/* ensure CMASK is enabled */
 			si_alloc_separate_cmask(sctx->screen, tex);
 			if (!tex->cmask_buffer)
 				continue;
 
 			/* Do the fast clear. */
 			uint32_t clear_value = 0;
 			si_clear_buffer(sctx, &tex->cmask_buffer->b.b,
 					tex->cmask_offset, tex->surface.cmask_size,
-					&clear_value, 4, SI_COHERENCY_CB_META);
+					&clear_value, 4, SI_COHERENCY_CB_META, false);
 			eliminate_needed = true;
 		}
 
 		if ((eliminate_needed || fmask_decompress_needed) &&
 		    !(tex->dirty_level_mask & (1 << level))) {
 			tex->dirty_level_mask |= 1 << level;
 			p_atomic_inc(&sctx->screen->compressed_colortex_counter);
 		}
 
 		/* We can change the micro tile mode before a full clear. */
diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c
index 1abeac6adb0..fb0d8d2f1b6 100644
--- a/src/gallium/drivers/radeonsi/si_compute_blit.c
+++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
@@ -179,21 +179,22 @@  static void si_compute_do_clear_or_copy(struct si_context *sctx,
 
 	/* Restore states. */
 	ctx->bind_compute_state(ctx, saved_cs);
 	ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb,
 				saved_writable_mask);
 	si_compute_internal_end(sctx);
 }
 
 void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
 		     uint64_t offset, uint64_t size, uint32_t *clear_value,
-		     uint32_t clear_value_size, enum si_coherency coher)
+		     uint32_t clear_value_size, enum si_coherency coher,
+		     bool force_cpdma)
 {
 	if (!size)
 		return;
 
 	unsigned clear_alignment = MIN2(clear_value_size, 4);
 
 	assert(clear_value_size != 3 && clear_value_size != 6); /* 12 is allowed. */
 	assert(offset % clear_alignment == 0);
 	assert(size % clear_alignment == 0);
 	assert(size < (UINT_MAX & ~0xf)); /* TODO: test 64-bit sizes in all codepaths */
@@ -243,21 +244,22 @@  void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
 		return;
 	}
 
 	uint64_t aligned_size = size & ~3ull;
 	if (aligned_size >= 4) {
 		/* Before GFX9, CP DMA was very slow when clearing GTT, so never
 		 * use CP DMA clears on those chips, because we can't be certain
 		 * about buffer placements.
 		 */
 		if (clear_value_size > 4 ||
-		    (clear_value_size == 4 &&
+		    (!force_cpdma &&
+		     clear_value_size == 4 &&
 		     offset % 4 == 0 &&
 		     (size > 32*1024 || sctx->chip_class <= VI))) {
 			si_compute_do_clear_or_copy(sctx, dst, offset, NULL, 0,
 						    aligned_size, clear_value,
 						    clear_value_size, coher);
 		} else {
 			assert(clear_value_size == 4);
 			si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, offset,
 					       aligned_size, *clear_value, 0, coher,
 					       get_cache_policy(sctx, coher, size));
@@ -277,21 +279,21 @@  void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
 	}
 }
 
 static void si_pipe_clear_buffer(struct pipe_context *ctx,
 				 struct pipe_resource *dst,
 				 unsigned offset, unsigned size,
 				 const void *clear_value,
 				 int clear_value_size)
 {
 	si_clear_buffer((struct si_context*)ctx, dst, offset, size, (uint32_t*)clear_value,
-			clear_value_size, SI_COHERENCY_SHADER);
+			clear_value_size, SI_COHERENCY_SHADER, false);
 }
 
 void si_copy_buffer(struct si_context *sctx,
 		    struct pipe_resource *dst, struct pipe_resource *src,
 		    uint64_t dst_offset, uint64_t src_offset, unsigned size)
 {
 	if (!size)
 		return;
 
 	enum si_coherency coher = SI_COHERENCY_SHADER;
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 5caeb575623..5d376e6181a 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -634,21 +634,21 @@  static struct pipe_context *si_create_context(struct pipe_screen *screen,
 			  sizeof(sctx->sample_positions), &sctx->sample_positions);
 
 	/* this must be last */
 	si_begin_new_gfx_cs(sctx);
 
 	if (sctx->chip_class == CIK) {
 		/* Clear the NULL constant buffer, because loads should return zeros. */
 		uint32_t clear_value = 0;
 		si_clear_buffer(sctx, sctx->null_const_buf.buffer, 0,
 				sctx->null_const_buf.buffer->width0,
-				&clear_value, 4, SI_COHERENCY_SHADER);
+				&clear_value, 4, SI_COHERENCY_SHADER, true);
 	}
 	return &sctx->b;
 fail:
 	fprintf(stderr, "radeonsi: Failed to create a context.\n");
 	si_destroy_context(&sctx->b);
 	return NULL;
 }
 
 static struct pipe_context *si_pipe_create_context(struct pipe_screen *screen,
 						   void *priv, unsigned flags)
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 301d38649bf..aaa95f32d20 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -1182,21 +1182,22 @@  bool vi_alpha_is_on_msb(enum pipe_format format);
 void vi_dcc_clear_level(struct si_context *sctx,
 			struct si_texture *tex,
 			unsigned level, unsigned clear_value);
 void si_init_clear_functions(struct si_context *sctx);
 
 /* si_compute_blit.c */
 unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher,
 			    enum si_cache_policy cache_policy);
 void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
 		     uint64_t offset, uint64_t size, uint32_t *clear_value,
-		     uint32_t clear_value_size, enum si_coherency coher);
+		     uint32_t clear_value_size, enum si_coherency coher,
+		     bool force_cpdma);
 void si_copy_buffer(struct si_context *sctx,
 		    struct pipe_resource *dst, struct pipe_resource *src,
 		    uint64_t dst_offset, uint64_t src_offset, unsigned size);
 void si_compute_copy_image(struct si_context *sctx,
 			   struct pipe_resource *dst,
 			   unsigned dst_level,
 			   struct pipe_resource *src,
 			   unsigned src_level,
 			   unsigned dstx, unsigned dsty, unsigned dstz,
 			   const struct pipe_box *src_box);
diff --git a/src/gallium/drivers/radeonsi/si_test_dma.c b/src/gallium/drivers/radeonsi/si_test_dma.c
index 90a2032cd80..7e396e671be 100644
--- a/src/gallium/drivers/radeonsi/si_test_dma.c
+++ b/src/gallium/drivers/radeonsi/si_test_dma.c
@@ -302,21 +302,21 @@  void si_test_dma(struct si_screen *sscreen)
 		       tsrc.width0, tsrc.height0, tsrc.array_size,
 		       array_mode_to_string(sscreen, &ssrc->surface), bpp);
 		fflush(stdout);
 
 		/* set src pixels */
 		set_random_pixels(ctx, src, &src_cpu);
 
 		/* clear dst pixels */
 		uint32_t zero = 0;
 		si_clear_buffer(sctx, dst, 0, sdst->surface.surf_size, &zero, 4,
-		                SI_COHERENCY_SHADER);
+		                SI_COHERENCY_SHADER, false);
 		memset(dst_cpu.ptr, 0, dst_cpu.layer_stride * tdst.array_size);
 
 		/* preparation */
 		max_width = MIN2(tsrc.width0, tdst.width0);
 		max_height = MIN2(tsrc.height0, tdst.height0);
 		max_depth = MIN2(tsrc.array_size, tdst.array_size);
 
 		num = do_partial_copies ? num_partial_copies : 1;
 		for (j = 0; j < num; j++) {
 			int width, height, depth;

Comments

I would suggest to document that workaround somewhere in the code.

On 4/12/19 5:17 PM, Marek Olšák wrote:
> From: Marek Olšák <marek.olsak@amd.com>
>
> This is a workaround for a thread deadlock that I have no idea
> why it occurs.
>
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=108879
> Fixes: 9b331e462e5021d994859756d46cd2519d9c9c6e
> ---
>   src/gallium/drivers/radeonsi/si_clear.c        | 6 +++---
>   src/gallium/drivers/radeonsi/si_compute_blit.c | 8 +++++---
>   src/gallium/drivers/radeonsi/si_pipe.c         | 2 +-
>   src/gallium/drivers/radeonsi/si_pipe.h         | 3 ++-
>   src/gallium/drivers/radeonsi/si_test_dma.c     | 2 +-
>   5 files changed, 12 insertions(+), 9 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_clear.c b/src/gallium/drivers/radeonsi/si_clear.c
> index e1805f2a1c9..ead680b857b 100644
> --- a/src/gallium/drivers/radeonsi/si_clear.c
> +++ b/src/gallium/drivers/radeonsi/si_clear.c
> @@ -256,21 +256,21 @@ void vi_dcc_clear_level(struct si_context *sctx,
>   		 * would be more efficient than separate per-layer clear operations.
>   		 */
>   		assert(tex->buffer.b.b.nr_storage_samples <= 2 || num_layers == 1);
>   
>   		dcc_offset += tex->surface.u.legacy.level[level].dcc_offset;
>   		clear_size = tex->surface.u.legacy.level[level].dcc_fast_clear_size *
>   			     num_layers;
>   	}
>   
>   	si_clear_buffer(sctx, dcc_buffer, dcc_offset, clear_size,
> -			&clear_value, 4, SI_COHERENCY_CB_META);
> +			&clear_value, 4, SI_COHERENCY_CB_META, false);
>   }
>   
>   /* Set the same micro tile mode as the destination of the last MSAA resolve.
>    * This allows hitting the MSAA resolve fast path, which requires that both
>    * src and dst micro tile modes match.
>    */
>   static void si_set_optimal_micro_tile_mode(struct si_screen *sscreen,
>   					   struct si_texture *tex)
>   {
>   	if (tex->buffer.b.is_shared ||
> @@ -489,21 +489,21 @@ static void si_do_fast_color_clear(struct si_context *sctx,
>   
>   			/* DCC fast clear with MSAA should clear CMASK to 0xC. */
>   			if (tex->buffer.b.b.nr_samples >= 2 && tex->cmask_buffer) {
>   				/* TODO: This doesn't work with MSAA. */
>   				if (eliminate_needed)
>   					continue;
>   
>   				uint32_t clear_value = 0xCCCCCCCC;
>   				si_clear_buffer(sctx, &tex->cmask_buffer->b.b,
>   						tex->cmask_offset, tex->surface.cmask_size,
> -						&clear_value, 4, SI_COHERENCY_CB_META);
> +						&clear_value, 4, SI_COHERENCY_CB_META, false);
>   				fmask_decompress_needed = true;
>   			}
>   
>   			vi_dcc_clear_level(sctx, tex, 0, reset_value);
>   			tex->separate_dcc_dirty = true;
>   		} else {
>   			if (too_small)
>   				continue;
>   
>   			/* 128-bit formats are unusupported */
> @@ -517,21 +517,21 @@ static void si_do_fast_color_clear(struct si_context *sctx,
>   
>   			/* ensure CMASK is enabled */
>   			si_alloc_separate_cmask(sctx->screen, tex);
>   			if (!tex->cmask_buffer)
>   				continue;
>   
>   			/* Do the fast clear. */
>   			uint32_t clear_value = 0;
>   			si_clear_buffer(sctx, &tex->cmask_buffer->b.b,
>   					tex->cmask_offset, tex->surface.cmask_size,
> -					&clear_value, 4, SI_COHERENCY_CB_META);
> +					&clear_value, 4, SI_COHERENCY_CB_META, false);
>   			eliminate_needed = true;
>   		}
>   
>   		if ((eliminate_needed || fmask_decompress_needed) &&
>   		    !(tex->dirty_level_mask & (1 << level))) {
>   			tex->dirty_level_mask |= 1 << level;
>   			p_atomic_inc(&sctx->screen->compressed_colortex_counter);
>   		}
>   
>   		/* We can change the micro tile mode before a full clear. */
> diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c
> index 1abeac6adb0..fb0d8d2f1b6 100644
> --- a/src/gallium/drivers/radeonsi/si_compute_blit.c
> +++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
> @@ -179,21 +179,22 @@ static void si_compute_do_clear_or_copy(struct si_context *sctx,
>   
>   	/* Restore states. */
>   	ctx->bind_compute_state(ctx, saved_cs);
>   	ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb,
>   				saved_writable_mask);
>   	si_compute_internal_end(sctx);
>   }
>   
>   void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
>   		     uint64_t offset, uint64_t size, uint32_t *clear_value,
> -		     uint32_t clear_value_size, enum si_coherency coher)
> +		     uint32_t clear_value_size, enum si_coherency coher,
> +		     bool force_cpdma)
>   {
>   	if (!size)
>   		return;
>   
>   	unsigned clear_alignment = MIN2(clear_value_size, 4);
>   
>   	assert(clear_value_size != 3 && clear_value_size != 6); /* 12 is allowed. */
>   	assert(offset % clear_alignment == 0);
>   	assert(size % clear_alignment == 0);
>   	assert(size < (UINT_MAX & ~0xf)); /* TODO: test 64-bit sizes in all codepaths */
> @@ -243,21 +244,22 @@ void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
>   		return;
>   	}
>   
>   	uint64_t aligned_size = size & ~3ull;
>   	if (aligned_size >= 4) {
>   		/* Before GFX9, CP DMA was very slow when clearing GTT, so never
>   		 * use CP DMA clears on those chips, because we can't be certain
>   		 * about buffer placements.
>   		 */
>   		if (clear_value_size > 4 ||
> -		    (clear_value_size == 4 &&
> +		    (!force_cpdma &&
> +		     clear_value_size == 4 &&
>   		     offset % 4 == 0 &&
>   		     (size > 32*1024 || sctx->chip_class <= VI))) {
>   			si_compute_do_clear_or_copy(sctx, dst, offset, NULL, 0,
>   						    aligned_size, clear_value,
>   						    clear_value_size, coher);
>   		} else {
>   			assert(clear_value_size == 4);
>   			si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, offset,
>   					       aligned_size, *clear_value, 0, coher,
>   					       get_cache_policy(sctx, coher, size));
> @@ -277,21 +279,21 @@ void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
>   	}
>   }
>   
>   static void si_pipe_clear_buffer(struct pipe_context *ctx,
>   				 struct pipe_resource *dst,
>   				 unsigned offset, unsigned size,
>   				 const void *clear_value,
>   				 int clear_value_size)
>   {
>   	si_clear_buffer((struct si_context*)ctx, dst, offset, size, (uint32_t*)clear_value,
> -			clear_value_size, SI_COHERENCY_SHADER);
> +			clear_value_size, SI_COHERENCY_SHADER, false);
>   }
>   
>   void si_copy_buffer(struct si_context *sctx,
>   		    struct pipe_resource *dst, struct pipe_resource *src,
>   		    uint64_t dst_offset, uint64_t src_offset, unsigned size)
>   {
>   	if (!size)
>   		return;
>   
>   	enum si_coherency coher = SI_COHERENCY_SHADER;
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
> index 5caeb575623..5d376e6181a 100644
> --- a/src/gallium/drivers/radeonsi/si_pipe.c
> +++ b/src/gallium/drivers/radeonsi/si_pipe.c
> @@ -634,21 +634,21 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
>   			  sizeof(sctx->sample_positions), &sctx->sample_positions);
>   
>   	/* this must be last */
>   	si_begin_new_gfx_cs(sctx);
>   
>   	if (sctx->chip_class == CIK) {
>   		/* Clear the NULL constant buffer, because loads should return zeros. */
>   		uint32_t clear_value = 0;
>   		si_clear_buffer(sctx, sctx->null_const_buf.buffer, 0,
>   				sctx->null_const_buf.buffer->width0,
> -				&clear_value, 4, SI_COHERENCY_SHADER);
> +				&clear_value, 4, SI_COHERENCY_SHADER, true);
>   	}
>   	return &sctx->b;
>   fail:
>   	fprintf(stderr, "radeonsi: Failed to create a context.\n");
>   	si_destroy_context(&sctx->b);
>   	return NULL;
>   }
>   
>   static struct pipe_context *si_pipe_create_context(struct pipe_screen *screen,
>   						   void *priv, unsigned flags)
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
> index 301d38649bf..aaa95f32d20 100644
> --- a/src/gallium/drivers/radeonsi/si_pipe.h
> +++ b/src/gallium/drivers/radeonsi/si_pipe.h
> @@ -1182,21 +1182,22 @@ bool vi_alpha_is_on_msb(enum pipe_format format);
>   void vi_dcc_clear_level(struct si_context *sctx,
>   			struct si_texture *tex,
>   			unsigned level, unsigned clear_value);
>   void si_init_clear_functions(struct si_context *sctx);
>   
>   /* si_compute_blit.c */
>   unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher,
>   			    enum si_cache_policy cache_policy);
>   void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
>   		     uint64_t offset, uint64_t size, uint32_t *clear_value,
> -		     uint32_t clear_value_size, enum si_coherency coher);
> +		     uint32_t clear_value_size, enum si_coherency coher,
> +		     bool force_cpdma);
>   void si_copy_buffer(struct si_context *sctx,
>   		    struct pipe_resource *dst, struct pipe_resource *src,
>   		    uint64_t dst_offset, uint64_t src_offset, unsigned size);
>   void si_compute_copy_image(struct si_context *sctx,
>   			   struct pipe_resource *dst,
>   			   unsigned dst_level,
>   			   struct pipe_resource *src,
>   			   unsigned src_level,
>   			   unsigned dstx, unsigned dsty, unsigned dstz,
>   			   const struct pipe_box *src_box);
> diff --git a/src/gallium/drivers/radeonsi/si_test_dma.c b/src/gallium/drivers/radeonsi/si_test_dma.c
> index 90a2032cd80..7e396e671be 100644
> --- a/src/gallium/drivers/radeonsi/si_test_dma.c
> +++ b/src/gallium/drivers/radeonsi/si_test_dma.c
> @@ -302,21 +302,21 @@ void si_test_dma(struct si_screen *sscreen)
>   		       tsrc.width0, tsrc.height0, tsrc.array_size,
>   		       array_mode_to_string(sscreen, &ssrc->surface), bpp);
>   		fflush(stdout);
>   
>   		/* set src pixels */
>   		set_random_pixels(ctx, src, &src_cpu);
>   
>   		/* clear dst pixels */
>   		uint32_t zero = 0;
>   		si_clear_buffer(sctx, dst, 0, sdst->surface.surf_size, &zero, 4,
> -		                SI_COHERENCY_SHADER);
> +		                SI_COHERENCY_SHADER, false);
>   		memset(dst_cpu.ptr, 0, dst_cpu.layer_stride * tdst.array_size);
>   
>   		/* preparation */
>   		max_width = MIN2(tsrc.width0, tdst.width0);
>   		max_height = MIN2(tsrc.height0, tdst.height0);
>   		max_depth = MIN2(tsrc.array_size, tdst.array_size);
>   
>   		num = do_partial_copies ? num_partial_copies : 1;
>   		for (j = 0; j < num; j++) {
>   			int width, height, depth;


Looks good to me.

On 4/23/19 8:51 PM, Dylan Baker wrote:
> Hi Marek and Samuel,
>
> I've staged this for 19.0, but I had to fix some very minor rebase conflicts.
> I'm getting ready to make a release, could one of you take a peak at the tip of
> the staging/19.0 branch and let me know if what I did looks okay?
>
> Thanks,
> Dylan
>
> Quoting Samuel Pitoiset (2019-04-16 08:24:01)
>> I don't have much context for that issue, so:
>>
>> Acked-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
>>
>> On 4/12/19 10:15 PM, Marek Ol\u0161 k wrote:
>>
>>      Done locally.
>>
>>      Marek
>>
>>      On Fri, Apr 12, 2019 at 12:20 PM Samuel Pitoiset <samuel.pitoiset@gmail.com
>>      > wrote:
>>
>>          I would suggest to document that workaround somewhere in the code.
>>
>>          On 4/12/19 5:17 PM, Marek Ol\u0161 k wrote:
>>          > From: Marek Ol\u0161 k <marek.olsak@amd.com>
>>          >
>>          > This is a workaround for a thread deadlock that I have no idea
>>          > why it occurs.
>>          >
>>          > Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=108879
>>          > Fixes: 9b331e462e5021d994859756d46cd2519d9c9c6e
>>          > ---
>>          >   src/gallium/drivers/radeonsi/si_clear.c        | 6 +++---
>>          >   src/gallium/drivers/radeonsi/si_compute_blit.c | 8 +++++---
>>          >   src/gallium/drivers/radeonsi/si_pipe.c         | 2 +-
>>          >   src/gallium/drivers/radeonsi/si_pipe.h         | 3 ++-
>>          >   src/gallium/drivers/radeonsi/si_test_dma.c     | 2 +-
>>          >   5 files changed, 12 insertions(+), 9 deletions(-)
>>          >
>>          > diff --git a/src/gallium/drivers/radeonsi/si_clear.c b/src/gallium/
>>          drivers/radeonsi/si_clear.c
>>          > index e1805f2a1c9..ead680b857b 100644
>>          > --- a/src/gallium/drivers/radeonsi/si_clear.c
>>          > +++ b/src/gallium/drivers/radeonsi/si_clear.c
>>          > @@ -256,21 +256,21 @@ void vi_dcc_clear_level(struct si_context
>>          *sctx,
>>          >                * would be more efficient than separate per-layer
>>          clear operations.
>>          >                */
>>          >               assert(tex->buffer.b.b.nr_storage_samples <= 2 ||
>>          num_layers == 1);
>>          >
>>          >               dcc_offset += tex->surface.u.legacy.level
>>          [level].dcc_offset;
>>          >               clear_size = tex->surface.u.legacy.level
>>          [level].dcc_fast_clear_size *
>>          >                            num_layers;
>>          >       }
>>          >
>>          >       si_clear_buffer(sctx, dcc_buffer, dcc_offset, clear_size,
>>          > -                     &clear_value, 4, SI_COHERENCY_CB_META);
>>          > +                     &clear_value, 4, SI_COHERENCY_CB_META, false);
>>          >   }
>>          >
>>          >   /* Set the same micro tile mode as the destination of the last MSAA
>>          resolve.
>>          >    * This allows hitting the MSAA resolve fast path, which requires
>>          that both
>>          >    * src and dst micro tile modes match.
>>          >    */
>>          >   static void si_set_optimal_micro_tile_mode(struct si_screen
>>          *sscreen,
>>          >                                          struct si_texture *tex)
>>          >   {
>>          >       if (tex->buffer.b.is_shared ||
>>          > @@ -489,21 +489,21 @@ static void si_do_fast_color_clear(struct
>>          si_context *sctx,
>>          >
>>          >                       /* DCC fast clear with MSAA should clear CMASK
>>          to 0xC. */
>>          >                       if (tex->buffer.b.b.nr_samples >= 2 && tex->
>>          cmask_buffer) {
>>          >                               /* TODO: This doesn't work with MSAA. *
>>          /
>>          >                               if (eliminate_needed)
>>          >                                       continue;
>>          >
>>          >                               uint32_t clear_value = 0xCCCCCCCC;
>>          >                               si_clear_buffer(sctx, &tex->
>>          cmask_buffer->b.b,
>>          >                                               tex->cmask_offset,
>>          tex->surface.cmask_size,
>>          > -                                             &clear_value, 4,
>>          SI_COHERENCY_CB_META);
>>          > +                                             &clear_value, 4,
>>          SI_COHERENCY_CB_META, false);
>>          >                               fmask_decompress_needed = true;
>>          >                       }
>>          >
>>          >                       vi_dcc_clear_level(sctx, tex, 0, reset_value);
>>          >                       tex->separate_dcc_dirty = true;
>>          >               } else {
>>          >                       if (too_small)
>>          >                               continue;
>>          >
>>          >                       /* 128-bit formats are unusupported */
>>          > @@ -517,21 +517,21 @@ static void si_do_fast_color_clear(struct
>>          si_context *sctx,
>>          >
>>          >                       /* ensure CMASK is enabled */
>>          >                       si_alloc_separate_cmask(sctx->screen, tex);
>>          >                       if (!tex->cmask_buffer)
>>          >                               continue;
>>          >
>>          >                       /* Do the fast clear. */
>>          >                       uint32_t clear_value = 0;
>>          >                       si_clear_buffer(sctx, &tex->cmask_buffer->b.b,
>>          >                                       tex->cmask_offset, tex->
>>          surface.cmask_size,
>>          > -                                     &clear_value, 4,
>>          SI_COHERENCY_CB_META);
>>          > +                                     &clear_value, 4,
>>          SI_COHERENCY_CB_META, false);
>>          >                       eliminate_needed = true;
>>          >               }
>>          >
>>          >               if ((eliminate_needed || fmask_decompress_needed) &&
>>          >                   !(tex->dirty_level_mask & (1 << level))) {
>>          >                       tex->dirty_level_mask |= 1 << level;
>>          >                       p_atomic_inc(&sctx->screen->
>>          compressed_colortex_counter);
>>          >               }
>>          >
>>          >               /* We can change the micro tile mode before a full
>>          clear. */
>>          > diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/
>>          gallium/drivers/radeonsi/si_compute_blit.c
>>          > index 1abeac6adb0..fb0d8d2f1b6 100644
>>          > --- a/src/gallium/drivers/radeonsi/si_compute_blit.c
>>          > +++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
>>          > @@ -179,21 +179,22 @@ static void si_compute_do_clear_or_copy(struct
>>          si_context *sctx,
>>          >
>>          >       /* Restore states. */
>>          >       ctx->bind_compute_state(ctx, saved_cs);
>>          >       ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, src ? 2 :
>>          1, saved_sb,
>>          >                               saved_writable_mask);
>>          >       si_compute_internal_end(sctx);
>>          >   }
>>          >
>>          >   void si_clear_buffer(struct si_context *sctx, struct pipe_resource
>>          *dst,
>>          >                    uint64_t offset, uint64_t size, uint32_t
>>          *clear_value,
>>          > -                  uint32_t clear_value_size, enum si_coherency
>>          coher)
>>          > +                  uint32_t clear_value_size, enum si_coherency
>>          coher,
>>          > +                  bool force_cpdma)
>>          >   {
>>          >       if (!size)
>>          >               return;
>>          >
>>          >       unsigned clear_alignment = MIN2(clear_value_size, 4);
>>          >
>>          >       assert(clear_value_size != 3 && clear_value_size != 6); /* 12
>>          is allowed. */
>>          >       assert(offset % clear_alignment == 0);
>>          >       assert(size % clear_alignment == 0);
>>          >       assert(size < (UINT_MAX & ~0xf)); /* TODO: test 64-bit sizes in
>>          all codepaths */
>>          > @@ -243,21 +244,22 @@ void si_clear_buffer(struct si_context *sctx,
>>          struct pipe_resource *dst,
>>          >               return;
>>          >       }
>>          >
>>          >       uint64_t aligned_size = size & ~3ull;
>>          >       if (aligned_size >= 4) {
>>          >               /* Before GFX9, CP DMA was very slow when clearing GTT,
>>          so never
>>          >                * use CP DMA clears on those chips, because we can't
>>          be certain
>>          >                * about buffer placements.
>>          >                */
>>          >               if (clear_value_size > 4 ||
>>          > -                 (clear_value_size == 4 &&
>>          > +                 (!force_cpdma &&
>>          > +                  clear_value_size == 4 &&
>>          >                    offset % 4 == 0 &&
>>          >                    (size > 32*1024 || sctx->chip_class <= VI))) {
>>          >                       si_compute_do_clear_or_copy(sctx, dst, offset,
>>          NULL, 0,
>>          >                                                   aligned_size,
>>          clear_value,
>>          >                                                   clear_value_size,
>>          coher);
>>          >               } else {
>>          >                       assert(clear_value_size == 4);
>>          >                       si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst,
>>          offset,
>>          >                                              aligned_size,
>>          *clear_value, 0, coher,
>>          >                                              get_cache_policy(sctx,
>>          coher, size));
>>          > @@ -277,21 +279,21 @@ void si_clear_buffer(struct si_context *sctx,
>>          struct pipe_resource *dst,
>>          >       }
>>          >   }
>>          >
>>          >   static void si_pipe_clear_buffer(struct pipe_context *ctx,
>>          >                                struct pipe_resource *dst,
>>          >                                unsigned offset, unsigned size,
>>          >                                const void *clear_value,
>>          >                                int clear_value_size)
>>          >   {
>>          >       si_clear_buffer((struct si_context*)ctx, dst, offset, size,
>>          (uint32_t*)clear_value,
>>          > -                     clear_value_size, SI_COHERENCY_SHADER);
>>          > +                     clear_value_size, SI_COHERENCY_SHADER, false);
>>          >   }
>>          >
>>          >   void si_copy_buffer(struct si_context *sctx,
>>          >                   struct pipe_resource *dst, struct pipe_resource
>>          *src,
>>          >                   uint64_t dst_offset, uint64_t src_offset, unsigned
>>          size)
>>          >   {
>>          >       if (!size)
>>          >               return;
>>          >
>>          >       enum si_coherency coher = SI_COHERENCY_SHADER;
>>          > diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/
>>          drivers/radeonsi/si_pipe.c
>>          > index 5caeb575623..5d376e6181a 100644
>>          > --- a/src/gallium/drivers/radeonsi/si_pipe.c
>>          > +++ b/src/gallium/drivers/radeonsi/si_pipe.c
>>          > @@ -634,21 +634,21 @@ static struct pipe_context *si_create_context
>>          (struct pipe_screen *screen,
>>          >                         sizeof(sctx->sample_positions), &sctx->
>>          sample_positions);
>>          >
>>          >       /* this must be last */
>>          >       si_begin_new_gfx_cs(sctx);
>>          >
>>          >       if (sctx->chip_class == CIK) {
>>          >               /* Clear the NULL constant buffer, because loads should
>>          return zeros. */
>>          >               uint32_t clear_value = 0;
>>          >               si_clear_buffer(sctx, sctx->null_const_buf.buffer, 0,
>>          >                               sctx->null_const_buf.buffer->width0,
>>          > -                             &clear_value, 4, SI_COHERENCY_SHADER);
>>          > +                             &clear_value, 4, SI_COHERENCY_SHADER,
>>          true);
>>          >       }
>>          >       return &sctx->b;
>>          >   fail:
>>          >       fprintf(stderr, "radeonsi: Failed to create a context.\n");
>>          >       si_destroy_context(&sctx->b);
>>          >       return NULL;
>>          >   }
>>          >
>>          >   static struct pipe_context *si_pipe_create_context(struct
>>          pipe_screen *screen,
>>          >                                                  void *priv, unsigned
>>          flags)
>>          > diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/
>>          drivers/radeonsi/si_pipe.h
>>          > index 301d38649bf..aaa95f32d20 100644
>>          > --- a/src/gallium/drivers/radeonsi/si_pipe.h
>>          > +++ b/src/gallium/drivers/radeonsi/si_pipe.h
>>          > @@ -1182,21 +1182,22 @@ bool vi_alpha_is_on_msb(enum pipe_format
>>          format);
>>          >   void vi_dcc_clear_level(struct si_context *sctx,
>>          >                       struct si_texture *tex,
>>          >                       unsigned level, unsigned clear_value);
>>          >   void si_init_clear_functions(struct si_context *sctx);
>>          >
>>          >   /* si_compute_blit.c */
>>          >   unsigned si_get_flush_flags(struct si_context *sctx, enum
>>          si_coherency coher,
>>          >                           enum si_cache_policy cache_policy);
>>          >   void si_clear_buffer(struct si_context *sctx, struct pipe_resource
>>          *dst,
>>          >                    uint64_t offset, uint64_t size, uint32_t
>>          *clear_value,
>>          > -                  uint32_t clear_value_size, enum si_coherency
>>          coher);
>>          > +                  uint32_t clear_value_size, enum si_coherency
>>          coher,
>>          > +                  bool force_cpdma);
>>          >   void si_copy_buffer(struct si_context *sctx,
>>          >                   struct pipe_resource *dst, struct pipe_resource
>>          *src,
>>          >                   uint64_t dst_offset, uint64_t src_offset, unsigned
>>          size);
>>          >   void si_compute_copy_image(struct si_context *sctx,
>>          >                          struct pipe_resource *dst,
>>          >                          unsigned dst_level,
>>          >                          struct pipe_resource *src,
>>          >                          unsigned src_level,
>>          >                          unsigned dstx, unsigned dsty, unsigned dstz,
>>          >                          const struct pipe_box *src_box);
>>          > diff --git a/src/gallium/drivers/radeonsi/si_test_dma.c b/src/gallium
>>          /drivers/radeonsi/si_test_dma.c
>>          > index 90a2032cd80..7e396e671be 100644
>>          > --- a/src/gallium/drivers/radeonsi/si_test_dma.c
>>          > +++ b/src/gallium/drivers/radeonsi/si_test_dma.c
>>          > @@ -302,21 +302,21 @@ void si_test_dma(struct si_screen *sscreen)
>>          >                      tsrc.width0, tsrc.height0, tsrc.array_size,
>>          >                      array_mode_to_string(sscreen, &ssrc->surface),
>>          bpp);
>>          >               fflush(stdout);
>>          >
>>          >               /* set src pixels */
>>          >               set_random_pixels(ctx, src, &src_cpu);
>>          >
>>          >               /* clear dst pixels */
>>          >               uint32_t zero = 0;
>>          >               si_clear_buffer(sctx, dst, 0, sdst->surface.surf_size,
>>          &zero, 4,
>>          > -                             SI_COHERENCY_SHADER);
>>          > +                             SI_COHERENCY_SHADER, false);
>>          >               memset(dst_cpu.ptr, 0, dst_cpu.layer_stride *
>>          tdst.array_size);
>>          >
>>          >               /* preparation */
>>          >               max_width = MIN2(tsrc.width0, tdst.width0);
>>          >               max_height = MIN2(tsrc.height0, tdst.height0);
>>          >               max_depth = MIN2(tsrc.array_size, tdst.array_size);
>>          >
>>          >               num = do_partial_copies ? num_partial_copies : 1;
>>          >               for (j = 0; j < num; j++) {
>>          >                       int width, height, depth;
>>