[Mesa-dev] radeonsi: add basic glClearBufferSubData acceleration

Submitted by Marek Olšák on Nov. 3, 2015, 11:47 p.m.

Details

Message ID 1446594431-4080-1-git-send-email-maraeo@gmail.com
State New
Headers show
Series "radeonsi: add basic glClearBufferSubData acceleration" ( rev: 1 ) in Mesa

Not browsing as part of any series.

Commit Message

Marek Olšák Nov. 3, 2015, 11:47 p.m.
From: Marek Olšák <marek.olsak@amd.com>

---
 src/gallium/drivers/radeonsi/si_blit.c | 55 ++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

Patch hide | download patch | download mbox

diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c
index fce014a..e934146 100644
--- a/src/gallium/drivers/radeonsi/si_blit.c
+++ b/src/gallium/drivers/radeonsi/si_blit.c
@@ -731,9 +731,64 @@  static void si_flush_resource(struct pipe_context *ctx,
 	}
 }
 
+static void si_pipe_clear_buffer(struct pipe_context *ctx,
+				 struct pipe_resource *dst,
+				 unsigned offset, unsigned size,
+				 const void *clear_value,
+				 int clear_value_size)
+{
+	struct si_context *sctx = (struct si_context*)ctx;
+	const uint32_t *u32 = clear_value;
+	unsigned i;
+	bool clear_value_fits_dword = true;
+	uint8_t *map;
+
+	if (clear_value_size > 4)
+		for (i = 1; i < clear_value_size / 4; i++)
+			if (u32[0] != u32[i]) {
+				clear_value_fits_dword = false;
+				break;
+			}
+
+	/* Use CP DMA for the simple case. */
+	if (offset % 4 == 0 && size % 4 == 0 && clear_value_fits_dword) {
+		uint32_t value = u32[0];
+
+		switch (clear_value_size) {
+		case 1:
+			value &= 0xff;
+			value |= (value << 8) | (value << 16) | (value << 24);
+			break;
+		case 2:
+			value &= 0xffff;
+			value |= value << 16;
+			break;
+		}
+
+		sctx->b.clear_buffer(ctx, dst, offset, size, value, false);
+		return;
+	}
+
+	/* TODO: use a compute shader for other cases. */
+
+	/* Software fallback. */
+	map = r600_buffer_map_sync_with_rings(&sctx->b, r600_resource(dst),
+					      PIPE_TRANSFER_WRITE);
+	if (!map)
+		return;
+
+	map += offset;
+	size /= clear_value_size;
+	for (i = 0; i < size; i++) {
+		memcpy(map, clear_value, clear_value_size);
+		map += clear_value_size;
+	}
+}
+
 void si_init_blit_functions(struct si_context *sctx)
 {
 	sctx->b.b.clear = si_clear;
+	sctx->b.b.clear_buffer = si_pipe_clear_buffer;
 	sctx->b.b.clear_render_target = si_clear_render_target;
 	sctx->b.b.clear_depth_stencil = si_clear_depth_stencil;
 	sctx->b.b.resource_copy_region = si_resource_copy_region;

Comments

On Tue, Nov 3, 2015 at 6:47 PM, Marek Olšák <maraeo@gmail.com> wrote:
> From: Marek Olšák <marek.olsak@amd.com>
>
> ---
>  src/gallium/drivers/radeonsi/si_blit.c | 55 ++++++++++++++++++++++++++++++++++
>  1 file changed, 55 insertions(+)
>
> diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c
> index fce014a..e934146 100644
> --- a/src/gallium/drivers/radeonsi/si_blit.c
> +++ b/src/gallium/drivers/radeonsi/si_blit.c
> @@ -731,9 +731,64 @@ static void si_flush_resource(struct pipe_context *ctx,
>         }
>  }
>
> +static void si_pipe_clear_buffer(struct pipe_context *ctx,
> +                                struct pipe_resource *dst,
> +                                unsigned offset, unsigned size,
> +                                const void *clear_value,
> +                                int clear_value_size)
> +{
> +       struct si_context *sctx = (struct si_context*)ctx;
> +       const uint32_t *u32 = clear_value;
> +       unsigned i;
> +       bool clear_value_fits_dword = true;
> +       uint8_t *map;
> +
> +       if (clear_value_size > 4)
> +               for (i = 1; i < clear_value_size / 4; i++)
> +                       if (u32[0] != u32[i]) {
> +                               clear_value_fits_dword = false;
> +                               break;
> +                       }
> +
> +       /* Use CP DMA for the simple case. */
> +       if (offset % 4 == 0 && size % 4 == 0 && clear_value_fits_dword) {
> +               uint32_t value = u32[0];
> +
> +               switch (clear_value_size) {
> +               case 1:
> +                       value &= 0xff;
> +                       value |= (value << 8) | (value << 16) | (value << 24);
> +                       break;
> +               case 2:
> +                       value &= 0xffff;
> +                       value |= value << 16;
> +                       break;
> +               }
> +
> +               sctx->b.clear_buffer(ctx, dst, offset, size, value, false);
> +               return;
> +       }
> +
> +       /* TODO: use a compute shader for other cases. */

What about using SDMA?  It supports byte aligned constant fills at
least on CIK+.

Alex


> +
> +       /* Software fallback. */
> +       map = r600_buffer_map_sync_with_rings(&sctx->b, r600_resource(dst),
> +                                             PIPE_TRANSFER_WRITE);
> +       if (!map)
> +               return;
> +
> +       map += offset;
> +       size /= clear_value_size;
> +       for (i = 0; i < size; i++) {
> +               memcpy(map, clear_value, clear_value_size);
> +               map += clear_value_size;
> +       }
> +}
> +
>  void si_init_blit_functions(struct si_context *sctx)
>  {
>         sctx->b.b.clear = si_clear;
> +       sctx->b.b.clear_buffer = si_pipe_clear_buffer;
>         sctx->b.b.clear_render_target = si_clear_render_target;
>         sctx->b.b.clear_depth_stencil = si_clear_depth_stencil;
>         sctx->b.b.resource_copy_region = si_resource_copy_region;
> --
> 2.1.4
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev
On Nov 4, 2015 4:02 AM, "Alex Deucher" <alexdeucher@gmail.com> wrote:
>
> On Tue, Nov 3, 2015 at 6:47 PM, Marek Olšák <maraeo@gmail.com> wrote:
> > From: Marek Olšák <marek.olsak@amd.com>
> >
> > ---
> >  src/gallium/drivers/radeonsi/si_blit.c | 55
++++++++++++++++++++++++++++++++++
> >  1 file changed, 55 insertions(+)
> >
> > diff --git a/src/gallium/drivers/radeonsi/si_blit.c
b/src/gallium/drivers/radeonsi/si_blit.c
> > index fce014a..e934146 100644
> > --- a/src/gallium/drivers/radeonsi/si_blit.c
> > +++ b/src/gallium/drivers/radeonsi/si_blit.c
> > @@ -731,9 +731,64 @@ static void si_flush_resource(struct pipe_context
*ctx,
> >         }
> >  }
> >
> > +static void si_pipe_clear_buffer(struct pipe_context *ctx,
> > +                                struct pipe_resource *dst,
> > +                                unsigned offset, unsigned size,
> > +                                const void *clear_value,
> > +                                int clear_value_size)
> > +{
> > +       struct si_context *sctx = (struct si_context*)ctx;
> > +       const uint32_t *u32 = clear_value;
> > +       unsigned i;
> > +       bool clear_value_fits_dword = true;
> > +       uint8_t *map;
> > +
> > +       if (clear_value_size > 4)
> > +               for (i = 1; i < clear_value_size / 4; i++)
> > +                       if (u32[0] != u32[i]) {
> > +                               clear_value_fits_dword = false;
> > +                               break;
> > +                       }
> > +
> > +       /* Use CP DMA for the simple case. */
> > +       if (offset % 4 == 0 && size % 4 == 0 && clear_value_fits_dword)
{
> > +               uint32_t value = u32[0];
> > +
> > +               switch (clear_value_size) {
> > +               case 1:
> > +                       value &= 0xff;
> > +                       value |= (value << 8) | (value << 16) | (value
<< 24);
> > +                       break;
> > +               case 2:
> > +                       value &= 0xffff;
> > +                       value |= value << 16;
> > +                       break;
> > +               }
> > +
> > +               sctx->b.clear_buffer(ctx, dst, offset, size, value,
false);
> > +               return;
> > +       }
> > +
> > +       /* TODO: use a compute shader for other cases. */
>
> What about using SDMA?  It supports byte aligned constant fills at
> least on CIK+.

I think CP DMA supports byte aligned constant fills as well, I just need to
test it. The bigger problem is 64 and 128 bit fills. Those can only be done
with a shader AFAIK.

>
>
> > +
> > +       /* Software fallback. */
> > +       map = r600_buffer_map_sync_with_rings(&sctx->b,
r600_resource(dst),
> > +                                             PIPE_TRANSFER_WRITE);
> > +       if (!map)
> > +               return;
> > +
> > +       map += offset;
> > +       size /= clear_value_size;
> > +       for (i = 0; i < size; i++) {
> > +               memcpy(map, clear_value, clear_value_size);
> > +               map += clear_value_size;
> > +       }
> > +}
> > +
> >  void si_init_blit_functions(struct si_context *sctx)
> >  {
> >         sctx->b.b.clear = si_clear;
> > +       sctx->b.b.clear_buffer = si_pipe_clear_buffer;
> >         sctx->b.b.clear_render_target = si_clear_render_target;
> >         sctx->b.b.clear_depth_stencil = si_clear_depth_stencil;
> >         sctx->b.b.resource_copy_region = si_resource_copy_region;
> > --
> > 2.1.4
> >
> > _______________________________________________
> > mesa-dev mailing list
> > mesa-dev@lists.freedesktop.org
> > http://lists.freedesktop.org/mailman/listinfo/mesa-dev
On Wed, Nov 4, 2015 at 4:08 AM, Marek Olšák <maraeo@gmail.com> wrote:
> On Nov 4, 2015 4:02 AM, "Alex Deucher" <alexdeucher@gmail.com> wrote:
>>
>> On Tue, Nov 3, 2015 at 6:47 PM, Marek Olšák <maraeo@gmail.com> wrote:
>> > From: Marek Olšák <marek.olsak@amd.com>
>> >
>> > ---
>> >  src/gallium/drivers/radeonsi/si_blit.c | 55
>> > ++++++++++++++++++++++++++++++++++
>> >  1 file changed, 55 insertions(+)
>> >
>> > diff --git a/src/gallium/drivers/radeonsi/si_blit.c
>> > b/src/gallium/drivers/radeonsi/si_blit.c
>> > index fce014a..e934146 100644
>> > --- a/src/gallium/drivers/radeonsi/si_blit.c
>> > +++ b/src/gallium/drivers/radeonsi/si_blit.c
>> > @@ -731,9 +731,64 @@ static void si_flush_resource(struct pipe_context
>> > *ctx,
>> >         }
>> >  }
>> >
>> > +static void si_pipe_clear_buffer(struct pipe_context *ctx,
>> > +                                struct pipe_resource *dst,
>> > +                                unsigned offset, unsigned size,
>> > +                                const void *clear_value,
>> > +                                int clear_value_size)
>> > +{
>> > +       struct si_context *sctx = (struct si_context*)ctx;
>> > +       const uint32_t *u32 = clear_value;
>> > +       unsigned i;
>> > +       bool clear_value_fits_dword = true;
>> > +       uint8_t *map;
>> > +
>> > +       if (clear_value_size > 4)
>> > +               for (i = 1; i < clear_value_size / 4; i++)
>> > +                       if (u32[0] != u32[i]) {
>> > +                               clear_value_fits_dword = false;
>> > +                               break;
>> > +                       }
>> > +
>> > +       /* Use CP DMA for the simple case. */
>> > +       if (offset % 4 == 0 && size % 4 == 0 && clear_value_fits_dword)
>> > {
>> > +               uint32_t value = u32[0];
>> > +
>> > +               switch (clear_value_size) {
>> > +               case 1:
>> > +                       value &= 0xff;
>> > +                       value |= (value << 8) | (value << 16) | (value
>> > << 24);
>> > +                       break;
>> > +               case 2:
>> > +                       value &= 0xffff;
>> > +                       value |= value << 16;
>> > +                       break;
>> > +               }
>> > +
>> > +               sctx->b.clear_buffer(ctx, dst, offset, size, value,
>> > false);
>> > +               return;
>> > +       }
>> > +
>> > +       /* TODO: use a compute shader for other cases. */
>>
>> What about using SDMA?  It supports byte aligned constant fills at
>> least on CIK+.
>
> I think CP DMA supports byte aligned constant fills as well, I just need to
> test it. The bigger problem is 64 and 128 bit fills. Those can only be done
> with a shader AFAIK.

Just to briefly interject, don't forget abotu 96-bit fills for
tbo_rgb32 buffers. I assume this won't really matter if you're using a
shader.

Cheers,

  -ilia
On 04.11.2015 00:47, Marek Olšák wrote:
> From: Marek Olšák <marek.olsak@amd.com>
>
> ---
>   src/gallium/drivers/radeonsi/si_blit.c | 55 ++++++++++++++++++++++++++++++++++
>   1 file changed, 55 insertions(+)
>
> diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c
> index fce014a..e934146 100644
> --- a/src/gallium/drivers/radeonsi/si_blit.c
> +++ b/src/gallium/drivers/radeonsi/si_blit.c
> @@ -731,9 +731,64 @@ static void si_flush_resource(struct pipe_context *ctx,
>   	}
>   }
>
> +static void si_pipe_clear_buffer(struct pipe_context *ctx,
> +				 struct pipe_resource *dst,
> +				 unsigned offset, unsigned size,
> +				 const void *clear_value,
> +				 int clear_value_size)
> +{
> +	struct si_context *sctx = (struct si_context*)ctx;
> +	const uint32_t *u32 = clear_value;
> +	unsigned i;
> +	bool clear_value_fits_dword = true;
> +	uint8_t *map;
> +
> +	if (clear_value_size > 4)
> +		for (i = 1; i < clear_value_size / 4; i++)
> +			if (u32[0] != u32[i]) {
> +				clear_value_fits_dword = false;
> +				break;
> +			}
> +
> +	/* Use CP DMA for the simple case. */
> +	if (offset % 4 == 0 && size % 4 == 0 && clear_value_fits_dword) {
> +		uint32_t value = u32[0];
> +
> +		switch (clear_value_size) {
> +		case 1:
> +			value &= 0xff;
> +			value |= (value << 8) | (value << 16) | (value << 24);
> +			break;
> +		case 2:
> +			value &= 0xffff;
> +			value |= value << 16;
> +			break;
> +		}

To reduce the chance of complaints by valgrind et al:

switch (clear_value_size) {
case 1:
	value = *(uint8_t *)u32;
	value |= (value << 8) | (value << 16) | (value << 24);
	break;
case 2:
	value = *(uint16_t *)u32;
	value |= value << 16;
	break;
default:
	value = *u32;
	break;
}

Cheers,
Nicolai

> +
> +		sctx->b.clear_buffer(ctx, dst, offset, size, value, false);
> +		return;
> +	}
> +
> +	/* TODO: use a compute shader for other cases. */
> +
> +	/* Software fallback. */
> +	map = r600_buffer_map_sync_with_rings(&sctx->b, r600_resource(dst),
> +					      PIPE_TRANSFER_WRITE);
> +	if (!map)
> +		return;
> +
> +	map += offset;
> +	size /= clear_value_size;
> +	for (i = 0; i < size; i++) {
> +		memcpy(map, clear_value, clear_value_size);
> +		map += clear_value_size;
> +	}
> +}
> +
>   void si_init_blit_functions(struct si_context *sctx)
>   {
>   	sctx->b.b.clear = si_clear;
> +	sctx->b.b.clear_buffer = si_pipe_clear_buffer;
>   	sctx->b.b.clear_render_target = si_clear_render_target;
>   	sctx->b.b.clear_depth_stencil = si_clear_depth_stencil;
>   	sctx->b.b.resource_copy_region = si_resource_copy_region;
>
On Thu, Nov 5, 2015 at 7:58 PM, Nicolai Hähnle <nhaehnle@gmail.com> wrote:
> On 04.11.2015 00:47, Marek Olšák wrote:
>>
>> From: Marek Olšák <marek.olsak@amd.com>
>>
>> ---
>>   src/gallium/drivers/radeonsi/si_blit.c | 55
>> ++++++++++++++++++++++++++++++++++
>>   1 file changed, 55 insertions(+)
>>
>> diff --git a/src/gallium/drivers/radeonsi/si_blit.c
>> b/src/gallium/drivers/radeonsi/si_blit.c
>> index fce014a..e934146 100644
>> --- a/src/gallium/drivers/radeonsi/si_blit.c
>> +++ b/src/gallium/drivers/radeonsi/si_blit.c
>> @@ -731,9 +731,64 @@ static void si_flush_resource(struct pipe_context
>> *ctx,
>>         }
>>   }
>>
>> +static void si_pipe_clear_buffer(struct pipe_context *ctx,
>> +                                struct pipe_resource *dst,
>> +                                unsigned offset, unsigned size,
>> +                                const void *clear_value,
>> +                                int clear_value_size)
>> +{
>> +       struct si_context *sctx = (struct si_context*)ctx;
>> +       const uint32_t *u32 = clear_value;
>> +       unsigned i;
>> +       bool clear_value_fits_dword = true;
>> +       uint8_t *map;
>> +
>> +       if (clear_value_size > 4)
>> +               for (i = 1; i < clear_value_size / 4; i++)
>> +                       if (u32[0] != u32[i]) {
>> +                               clear_value_fits_dword = false;
>> +                               break;
>> +                       }
>> +
>> +       /* Use CP DMA for the simple case. */
>> +       if (offset % 4 == 0 && size % 4 == 0 && clear_value_fits_dword) {
>> +               uint32_t value = u32[0];
>> +
>> +               switch (clear_value_size) {
>> +               case 1:
>> +                       value &= 0xff;
>> +                       value |= (value << 8) | (value << 16) | (value <<
>> 24);
>> +                       break;
>> +               case 2:
>> +                       value &= 0xffff;
>> +                       value |= value << 16;
>> +                       break;
>> +               }
>
>
> To reduce the chance of complaints by valgrind et al:
>
> switch (clear_value_size) {
> case 1:
>         value = *(uint8_t *)u32;
>         value |= (value << 8) | (value << 16) | (value << 24);
>         break;
> case 2:
>         value = *(uint16_t *)u32;
>         value |= value << 16;
>         break;
> default:
>         value = *u32;
>         break;
> }

Thanks.

The preliminary plan is to use transform feedback for fills>=64 bits
(already implemented by u_blitter), and CP DMA should be used for
32-bit fills and any fills that can be lowered to 32-bit.

Unaligned 8-bit and 16-bit fills should fill the largest aligned
subrange using CP DMA. Then, the unaligned beginning and ending dwords
will be filled separately using:
COPY_DATA from mem to reg
REG_RMW to fill the requested bytes
COPY_DATA from reg to mem

Marek