radeonsi: use CP DMA for the null const buffer clear on CIK

Submitted by Marek Olšák on April 23, 2019, 8:16 p.m.

Details

Message ID CAAxE2A4sxziqk2J1e5Zy7c-4O_0iLr4itTpSKhaMaMHHoD5G4A@mail.gmail.com
State New
Headers show
Series "radeonsi: use CP DMA for the null const buffer clear on CIK" ( rev: 2 ) in Mesa

Not browsing as part of any series.

Commit Message

Marek Olšák April 23, 2019, 8:16 p.m.
No, the correct backport is attached.

Marek

On Tue, Apr 23, 2019 at 2:51 PM Dylan Baker <dylan@pnwbakers.com> wrote:

> Hi Marek and Samuel,
>
> I've staged this for 19.0, but I had to fix some very minor rebase
> conflicts.
> I'm getting ready to make a release, could one of you take a peak at the
> tip of
> the staging/19.0 branch and let me know if what I did looks okay?
>
> Thanks,
> Dylan
>
> Quoting Samuel Pitoiset (2019-04-16 08:24:01)
> > I don't have much context for that issue, so:
> >
> > Acked-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
> >
> > On 4/12/19 10:15 PM, Marek Ol\u0161 k wrote:
> >
> >     Done locally.
> >
> >     Marek
> >
> >     On Fri, Apr 12, 2019 at 12:20 PM Samuel Pitoiset <
> samuel.pitoiset@gmail.com
> >     > wrote:
> >
> >         I would suggest to document that workaround somewhere in the
> code.
> >
> >         On 4/12/19 5:17 PM, Marek Ol\u0161 k wrote:
> >         > From: Marek Ol\u0161 k <marek.olsak@amd.com>
> >         >
> >         > This is a workaround for a thread deadlock that I have no idea
> >         > why it occurs.
> >         >
> >         > Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=108879
> >         > Fixes: 9b331e462e5021d994859756d46cd2519d9c9c6e
> >         > ---
> >         >   src/gallium/drivers/radeonsi/si_clear.c        | 6 +++---
> >         >   src/gallium/drivers/radeonsi/si_compute_blit.c | 8 +++++---
> >         >   src/gallium/drivers/radeonsi/si_pipe.c         | 2 +-
> >         >   src/gallium/drivers/radeonsi/si_pipe.h         | 3 ++-
> >         >   src/gallium/drivers/radeonsi/si_test_dma.c     | 2 +-
> >         >   5 files changed, 12 insertions(+), 9 deletions(-)
> >         >
> >         > diff --git a/src/gallium/drivers/radeonsi/si_clear.c
> b/src/gallium/
> >         drivers/radeonsi/si_clear.c
> >         > index e1805f2a1c9..ead680b857b 100644
> >         > --- a/src/gallium/drivers/radeonsi/si_clear.c
> >         > +++ b/src/gallium/drivers/radeonsi/si_clear.c
> >         > @@ -256,21 +256,21 @@ void vi_dcc_clear_level(struct si_context
> >         *sctx,
> >         >                * would be more efficient than separate
> per-layer
> >         clear operations.
> >         >                */
> >         >               assert(tex->buffer.b.b.nr_storage_samples <= 2 ||
> >         num_layers == 1);
> >         >
> >         >               dcc_offset += tex->surface.u.legacy.level
> >         [level].dcc_offset;
> >         >               clear_size = tex->surface.u.legacy.level
> >         [level].dcc_fast_clear_size *
> >         >                            num_layers;
> >         >       }
> >         >
> >         >       si_clear_buffer(sctx, dcc_buffer, dcc_offset, clear_size,
> >         > -                     &clear_value, 4, SI_COHERENCY_CB_META);
> >         > +                     &clear_value, 4, SI_COHERENCY_CB_META,
> false);
> >         >   }
> >         >
> >         >   /* Set the same micro tile mode as the destination of the
> last MSAA
> >         resolve.
> >         >    * This allows hitting the MSAA resolve fast path, which
> requires
> >         that both
> >         >    * src and dst micro tile modes match.
> >         >    */
> >         >   static void si_set_optimal_micro_tile_mode(struct si_screen
> >         *sscreen,
> >         >                                          struct si_texture
> *tex)
> >         >   {
> >         >       if (tex->buffer.b.is_shared ||
> >         > @@ -489,21 +489,21 @@ static void si_do_fast_color_clear(struct
> >         si_context *sctx,
> >         >
> >         >                       /* DCC fast clear with MSAA should clear
> CMASK
> >         to 0xC. */
> >         >                       if (tex->buffer.b.b.nr_samples >= 2 &&
> tex->
> >         cmask_buffer) {
> >         >                               /* TODO: This doesn't work with
> MSAA. *
> >         /
> >         >                               if (eliminate_needed)
> >         >                                       continue;
> >         >
> >         >                               uint32_t clear_value =
> 0xCCCCCCCC;
> >         >                               si_clear_buffer(sctx, &tex->
> >         cmask_buffer->b.b,
> >         >
>  tex->cmask_offset,
> >         tex->surface.cmask_size,
> >         > -                                             &clear_value, 4,
> >         SI_COHERENCY_CB_META);
> >         > +                                             &clear_value, 4,
> >         SI_COHERENCY_CB_META, false);
> >         >                               fmask_decompress_needed = true;
> >         >                       }
> >         >
> >         >                       vi_dcc_clear_level(sctx, tex, 0,
> reset_value);
> >         >                       tex->separate_dcc_dirty = true;
> >         >               } else {
> >         >                       if (too_small)
> >         >                               continue;
> >         >
> >         >                       /* 128-bit formats are unusupported */
> >         > @@ -517,21 +517,21 @@ static void si_do_fast_color_clear(struct
> >         si_context *sctx,
> >         >
> >         >                       /* ensure CMASK is enabled */
> >         >                       si_alloc_separate_cmask(sctx->screen,
> tex);
> >         >                       if (!tex->cmask_buffer)
> >         >                               continue;
> >         >
> >         >                       /* Do the fast clear. */
> >         >                       uint32_t clear_value = 0;
> >         >                       si_clear_buffer(sctx,
> &tex->cmask_buffer->b.b,
> >         >                                       tex->cmask_offset, tex->
> >         surface.cmask_size,
> >         > -                                     &clear_value, 4,
> >         SI_COHERENCY_CB_META);
> >         > +                                     &clear_value, 4,
> >         SI_COHERENCY_CB_META, false);
> >         >                       eliminate_needed = true;
> >         >               }
> >         >
> >         >               if ((eliminate_needed ||
> fmask_decompress_needed) &&
> >         >                   !(tex->dirty_level_mask & (1 << level))) {
> >         >                       tex->dirty_level_mask |= 1 << level;
> >         >                       p_atomic_inc(&sctx->screen->
> >         compressed_colortex_counter);
> >         >               }
> >         >
> >         >               /* We can change the micro tile mode before a
> full
> >         clear. */
> >         > diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c
> b/src/
> >         gallium/drivers/radeonsi/si_compute_blit.c
> >         > index 1abeac6adb0..fb0d8d2f1b6 100644
> >         > --- a/src/gallium/drivers/radeonsi/si_compute_blit.c
> >         > +++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
> >         > @@ -179,21 +179,22 @@ static void
> si_compute_do_clear_or_copy(struct
> >         si_context *sctx,
> >         >
> >         >       /* Restore states. */
> >         >       ctx->bind_compute_state(ctx, saved_cs);
> >         >       ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, src
> ? 2 :
> >         1, saved_sb,
> >         >                               saved_writable_mask);
> >         >       si_compute_internal_end(sctx);
> >         >   }
> >         >
> >         >   void si_clear_buffer(struct si_context *sctx, struct
> pipe_resource
> >         *dst,
> >         >                    uint64_t offset, uint64_t size, uint32_t
> >         *clear_value,
> >         > -                  uint32_t clear_value_size, enum si_coherency
> >         coher)
> >         > +                  uint32_t clear_value_size, enum si_coherency
> >         coher,
> >         > +                  bool force_cpdma)
> >         >   {
> >         >       if (!size)
> >         >               return;
> >         >
> >         >       unsigned clear_alignment = MIN2(clear_value_size, 4);
> >         >
> >         >       assert(clear_value_size != 3 && clear_value_size != 6);
> /* 12
> >         is allowed. */
> >         >       assert(offset % clear_alignment == 0);
> >         >       assert(size % clear_alignment == 0);
> >         >       assert(size < (UINT_MAX & ~0xf)); /* TODO: test 64-bit
> sizes in
> >         all codepaths */
> >         > @@ -243,21 +244,22 @@ void si_clear_buffer(struct si_context
> *sctx,
> >         struct pipe_resource *dst,
> >         >               return;
> >         >       }
> >         >
> >         >       uint64_t aligned_size = size & ~3ull;
> >         >       if (aligned_size >= 4) {
> >         >               /* Before GFX9, CP DMA was very slow when
> clearing GTT,
> >         so never
> >         >                * use CP DMA clears on those chips, because we
> can't
> >         be certain
> >         >                * about buffer placements.
> >         >                */
> >         >               if (clear_value_size > 4 ||
> >         > -                 (clear_value_size == 4 &&
> >         > +                 (!force_cpdma &&
> >         > +                  clear_value_size == 4 &&
> >         >                    offset % 4 == 0 &&
> >         >                    (size > 32*1024 || sctx->chip_class <=
> VI))) {
> >         >                       si_compute_do_clear_or_copy(sctx, dst,
> offset,
> >         NULL, 0,
> >         >                                                   aligned_size,
> >         clear_value,
> >         >
>  clear_value_size,
> >         coher);
> >         >               } else {
> >         >                       assert(clear_value_size == 4);
> >         >                       si_cp_dma_clear_buffer(sctx,
> sctx->gfx_cs, dst,
> >         offset,
> >         >                                              aligned_size,
> >         *clear_value, 0, coher,
> >         >
> get_cache_policy(sctx,
> >         coher, size));
> >         > @@ -277,21 +279,21 @@ void si_clear_buffer(struct si_context
> *sctx,
> >         struct pipe_resource *dst,
> >         >       }
> >         >   }
> >         >
> >         >   static void si_pipe_clear_buffer(struct pipe_context *ctx,
> >         >                                struct pipe_resource *dst,
> >         >                                unsigned offset, unsigned size,
> >         >                                const void *clear_value,
> >         >                                int clear_value_size)
> >         >   {
> >         >       si_clear_buffer((struct si_context*)ctx, dst, offset,
> size,
> >         (uint32_t*)clear_value,
> >         > -                     clear_value_size, SI_COHERENCY_SHADER);
> >         > +                     clear_value_size, SI_COHERENCY_SHADER,
> false);
> >         >   }
> >         >
> >         >   void si_copy_buffer(struct si_context *sctx,
> >         >                   struct pipe_resource *dst, struct
> pipe_resource
> >         *src,
> >         >                   uint64_t dst_offset, uint64_t src_offset,
> unsigned
> >         size)
> >         >   {
> >         >       if (!size)
> >         >               return;
> >         >
> >         >       enum si_coherency coher = SI_COHERENCY_SHADER;
> >         > diff --git a/src/gallium/drivers/radeonsi/si_pipe.c
> b/src/gallium/
> >         drivers/radeonsi/si_pipe.c
> >         > index 5caeb575623..5d376e6181a 100644
> >         > --- a/src/gallium/drivers/radeonsi/si_pipe.c
> >         > +++ b/src/gallium/drivers/radeonsi/si_pipe.c
> >         > @@ -634,21 +634,21 @@ static struct pipe_context
> *si_create_context
> >         (struct pipe_screen *screen,
> >         >                         sizeof(sctx->sample_positions), &sctx->
> >         sample_positions);
> >         >
> >         >       /* this must be last */
> >         >       si_begin_new_gfx_cs(sctx);
> >         >
> >         >       if (sctx->chip_class == CIK) {
> >         >               /* Clear the NULL constant buffer, because loads
> should
> >         return zeros. */
> >         >               uint32_t clear_value = 0;
> >         >               si_clear_buffer(sctx,
> sctx->null_const_buf.buffer, 0,
> >         >
>  sctx->null_const_buf.buffer->width0,
> >         > -                             &clear_value, 4,
> SI_COHERENCY_SHADER);
> >         > +                             &clear_value, 4,
> SI_COHERENCY_SHADER,
> >         true);
> >         >       }
> >         >       return &sctx->b;
> >         >   fail:
> >         >       fprintf(stderr, "radeonsi: Failed to create a
> context.\n");
> >         >       si_destroy_context(&sctx->b);
> >         >       return NULL;
> >         >   }
> >         >
> >         >   static struct pipe_context *si_pipe_create_context(struct
> >         pipe_screen *screen,
> >         >                                                  void *priv,
> unsigned
> >         flags)
> >         > diff --git a/src/gallium/drivers/radeonsi/si_pipe.h
> b/src/gallium/
> >         drivers/radeonsi/si_pipe.h
> >         > index 301d38649bf..aaa95f32d20 100644
> >         > --- a/src/gallium/drivers/radeonsi/si_pipe.h
> >         > +++ b/src/gallium/drivers/radeonsi/si_pipe.h
> >         > @@ -1182,21 +1182,22 @@ bool vi_alpha_is_on_msb(enum
> pipe_format
> >         format);
> >         >   void vi_dcc_clear_level(struct si_context *sctx,
> >         >                       struct si_texture *tex,
> >         >                       unsigned level, unsigned clear_value);
> >         >   void si_init_clear_functions(struct si_context *sctx);
> >         >
> >         >   /* si_compute_blit.c */
> >         >   unsigned si_get_flush_flags(struct si_context *sctx, enum
> >         si_coherency coher,
> >         >                           enum si_cache_policy cache_policy);
> >         >   void si_clear_buffer(struct si_context *sctx, struct
> pipe_resource
> >         *dst,
> >         >                    uint64_t offset, uint64_t size, uint32_t
> >         *clear_value,
> >         > -                  uint32_t clear_value_size, enum si_coherency
> >         coher);
> >         > +                  uint32_t clear_value_size, enum si_coherency
> >         coher,
> >         > +                  bool force_cpdma);
> >         >   void si_copy_buffer(struct si_context *sctx,
> >         >                   struct pipe_resource *dst, struct
> pipe_resource
> >         *src,
> >         >                   uint64_t dst_offset, uint64_t src_offset,
> unsigned
> >         size);
> >         >   void si_compute_copy_image(struct si_context *sctx,
> >         >                          struct pipe_resource *dst,
> >         >                          unsigned dst_level,
> >         >                          struct pipe_resource *src,
> >         >                          unsigned src_level,
> >         >                          unsigned dstx, unsigned dsty,
> unsigned dstz,
> >         >                          const struct pipe_box *src_box);
> >         > diff --git a/src/gallium/drivers/radeonsi/si_test_dma.c
> b/src/gallium
> >         /drivers/radeonsi/si_test_dma.c
> >         > index 90a2032cd80..7e396e671be 100644
> >         > --- a/src/gallium/drivers/radeonsi/si_test_dma.c
> >         > +++ b/src/gallium/drivers/radeonsi/si_test_dma.c
> >         > @@ -302,21 +302,21 @@ void si_test_dma(struct si_screen
> *sscreen)
> >         >                      tsrc.width0, tsrc.height0,
> tsrc.array_size,
> >         >                      array_mode_to_string(sscreen,
> &ssrc->surface),
> >         bpp);
> >         >               fflush(stdout);
> >         >
> >         >               /* set src pixels */
> >         >               set_random_pixels(ctx, src, &src_cpu);
> >         >
> >         >               /* clear dst pixels */
> >         >               uint32_t zero = 0;
> >         >               si_clear_buffer(sctx, dst, 0,
> sdst->surface.surf_size,
> >         &zero, 4,
> >         > -                             SI_COHERENCY_SHADER);
> >         > +                             SI_COHERENCY_SHADER, false);
> >         >               memset(dst_cpu.ptr, 0, dst_cpu.layer_stride *
> >         tdst.array_size);
> >         >
> >         >               /* preparation */
> >         >               max_width = MIN2(tsrc.width0, tdst.width0);
> >         >               max_height = MIN2(tsrc.height0, tdst.height0);
> >         >               max_depth = MIN2(tsrc.array_size,
> tdst.array_size);
> >         >
> >         >               num = do_partial_copies ? num_partial_copies : 1;
> >         >               for (j = 0; j < num; j++) {
> >         >                       int width, height, depth;
> >
>

Patch hide | download patch | download mbox

From 579072d03ab79bd46a19e0ae10f02211df76ccff Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Fri, 12 Apr 2019 11:12:34 -0400
Subject: [PATCH] radeonsi: use CP DMA for the null const buffer clear on CIK

This is a workaround for a thread deadlock that I have no idea
why it occurs.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=108879
Fixes: 9b331e462e5021d994859756d46cd2519d9c9c6e

Acked-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
(cherry picked from commit b58e5fb6f317be771326f98d498483e45942beaf)
---
 src/gallium/drivers/radeonsi/si_clear.c        | 6 +++---
 src/gallium/drivers/radeonsi/si_compute_blit.c | 8 +++++---
 src/gallium/drivers/radeonsi/si_pipe.c         | 7 +++++--
 src/gallium/drivers/radeonsi/si_pipe.h         | 3 ++-
 src/gallium/drivers/radeonsi/si_test_dma.c     | 2 +-
 5 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_clear.c b/src/gallium/drivers/radeonsi/si_clear.c
index 9026f61dc0a..ff98452a5bc 100644
--- a/src/gallium/drivers/radeonsi/si_clear.c
+++ b/src/gallium/drivers/radeonsi/si_clear.c
@@ -272,7 +272,7 @@  void vi_dcc_clear_level(struct si_context *sctx,
 	}
 
 	si_clear_buffer(sctx, dcc_buffer, dcc_offset, clear_size,
-			&clear_value, 4, SI_COHERENCY_CB_META);
+			&clear_value, 4, SI_COHERENCY_CB_META, false);
 }
 
 /* Set the same micro tile mode as the destination of the last MSAA resolve.
@@ -505,7 +505,7 @@  static void si_do_fast_color_clear(struct si_context *sctx,
 				uint32_t clear_value = 0xCCCCCCCC;
 				si_clear_buffer(sctx, &tex->cmask_buffer->b.b,
 						tex->cmask_offset, tex->surface.cmask_size,
-						&clear_value, 4, SI_COHERENCY_CB_META);
+						&clear_value, 4, SI_COHERENCY_CB_META, false);
 				fmask_decompress_needed = true;
 			}
 
@@ -533,7 +533,7 @@  static void si_do_fast_color_clear(struct si_context *sctx,
 			uint32_t clear_value = 0;
 			si_clear_buffer(sctx, &tex->cmask_buffer->b.b,
 					tex->cmask_offset, tex->surface.cmask_size,
-					&clear_value, 4, SI_COHERENCY_CB_META);
+					&clear_value, 4, SI_COHERENCY_CB_META, false);
 			eliminate_needed = true;
 		}
 
diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c
index 38c48c30be9..304296c4a52 100644
--- a/src/gallium/drivers/radeonsi/si_compute_blit.c
+++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
@@ -177,7 +177,8 @@  static void si_compute_do_clear_or_copy(struct si_context *sctx,
 
 void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
 		     uint64_t offset, uint64_t size, uint32_t *clear_value,
-		     uint32_t clear_value_size, enum si_coherency coher)
+		     uint32_t clear_value_size, enum si_coherency coher,
+		     bool force_cpdma)
 {
 	if (!size)
 		return;
@@ -241,7 +242,8 @@  void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
 		 * about buffer placements.
 		 */
 		if (clear_value_size > 4 ||
-		    (clear_value_size == 4 &&
+		    (!force_cpdma &&
+		     clear_value_size == 4 &&
 		     offset % 4 == 0 &&
 		     (size > 32*1024 || sctx->chip_class <= VI))) {
 			si_compute_do_clear_or_copy(sctx, dst, offset, NULL, 0,
@@ -282,7 +284,7 @@  static void si_pipe_clear_buffer(struct pipe_context *ctx,
 		coher = SI_COHERENCY_SHADER;
 
 	si_clear_buffer((struct si_context*)ctx, dst, offset, size, (uint32_t*)clear_value,
-			clear_value_size, coher);
+			clear_value_size, coher, false);
 }
 
 void si_copy_buffer(struct si_context *sctx,
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 2656bdc2068..5312ff99e62 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -609,11 +609,14 @@  static struct pipe_context *si_create_context(struct pipe_screen *screen,
 	si_begin_new_gfx_cs(sctx);
 
 	if (sctx->chip_class == CIK) {
-		/* Clear the NULL constant buffer, because loads should return zeros. */
+		/* Clear the NULL constant buffer, because loads should return zeros.
+		 * Note that this forces CP DMA to be used, because clover deadlocks
+		 * for some reason when the compute codepath is used.
+		 */
 		uint32_t clear_value = 0;
 		si_clear_buffer(sctx, sctx->null_const_buf.buffer, 0,
 				sctx->null_const_buf.buffer->width0,
-				&clear_value, 4, SI_COHERENCY_SHADER);
+				&clear_value, 4, SI_COHERENCY_SHADER, true);
 	}
 	return &sctx->b;
 fail:
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index eb3ba951dae..d63943c19ed 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -1168,7 +1168,8 @@  unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher,
 			    enum si_cache_policy cache_policy);
 void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
 		     uint64_t offset, uint64_t size, uint32_t *clear_value,
-		     uint32_t clear_value_size, enum si_coherency coher);
+		     uint32_t clear_value_size, enum si_coherency coher,
+		     bool force_cpdma);
 void si_copy_buffer(struct si_context *sctx,
 		    struct pipe_resource *dst, struct pipe_resource *src,
 		    uint64_t dst_offset, uint64_t src_offset, unsigned size);
diff --git a/src/gallium/drivers/radeonsi/si_test_dma.c b/src/gallium/drivers/radeonsi/si_test_dma.c
index 90a2032cd80..7e396e671be 100644
--- a/src/gallium/drivers/radeonsi/si_test_dma.c
+++ b/src/gallium/drivers/radeonsi/si_test_dma.c
@@ -309,7 +309,7 @@  void si_test_dma(struct si_screen *sscreen)
 		/* clear dst pixels */
 		uint32_t zero = 0;
 		si_clear_buffer(sctx, dst, 0, sdst->surface.surf_size, &zero, 4,
-		                SI_COHERENCY_SHADER);
+		                SI_COHERENCY_SHADER, false);
 		memset(dst_cpu.ptr, 0, dst_cpu.layer_stride * tdst.array_size);
 
 		/* preparation */
-- 
2.17.1