gallium: add pipe_grid_info::partial_block

Submitted by Jiang, Sonny on Jan. 8, 2019, 7:47 p.m.

Details

Message ID 20190108194634.10694-1-sonny.jiang@amd.com
State New
Headers show
Series "gallium: add pipe_grid_info::partial_block" ( rev: 1 ) in Mesa

Not browsing as part of any series.

Commit Message

Jiang, Sonny Jan. 8, 2019, 7:47 p.m.
and add radeonsi support. This will be used by radeonsi internally.

Signed-off-by: Sonny Jiang <sonny.jiang@amd.com>
---
 src/gallium/drivers/radeonsi/si_compute.c | 33 +++++++++++++++++++----
 src/gallium/include/pipe/p_state.h        |  7 +++++
 2 files changed, 35 insertions(+), 5 deletions(-)

Patch hide | download patch | download mbox

diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index cbcd8e79c7b..69ffad45cd9 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -797,11 +797,6 @@  static void si_emit_dispatch_packets(struct si_context *sctx,
 	radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
 			  compute_resource_limits);
 
-	radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
-	radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(info->block[0]));
-	radeon_emit(cs, S_00B820_NUM_THREAD_FULL(info->block[1]));
-	radeon_emit(cs, S_00B824_NUM_THREAD_FULL(info->block[2]));
-
 	unsigned dispatch_initiator =
 		S_00B800_COMPUTE_SHADER_EN(1) |
 		S_00B800_FORCE_START_AT_000(1) |
@@ -809,6 +804,34 @@  static void si_emit_dispatch_packets(struct si_context *sctx,
 		 * allow launching waves out-of-order. (same as Vulkan) */
 		S_00B800_ORDER_MODE(sctx->chip_class >= CIK);
 
+	bool partial_block_en = info->partial_block[0] ||
+				info->partial_block[1] ||
+				info->partial_block[2];
+
+	radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
+
+	if (partial_block_en) {
+		unsigned partial[3];
+
+		/* If no partial_block, these should be an entire block size, not 0. */
+		partial[0] = info->partial_block[0] ? info->partial_block[0] : info->block[0];
+		partial[1] = info->partial_block[1] ? info->partial_block[1] : info->block[1];
+		partial[2] = info->partial_block[2] ? info->partial_block[2] : info->block[2];
+
+		radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(info->block[0]) |
+				S_00B81C_NUM_THREAD_PARTIAL(partial[0]));
+		radeon_emit(cs, S_00B820_NUM_THREAD_FULL(info->block[1]) |
+				S_00B820_NUM_THREAD_PARTIAL(partial[1]));
+		radeon_emit(cs, S_00B824_NUM_THREAD_FULL(info->block[2]) |
+				S_00B824_NUM_THREAD_PARTIAL(partial[2]));
+
+		dispatch_initiator |= S_00B800_PARTIAL_TG_EN(1);
+	} else {
+		radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(info->block[0]));
+		radeon_emit(cs, S_00B820_NUM_THREAD_FULL(info->block[1]));
+		radeon_emit(cs, S_00B824_NUM_THREAD_FULL(info->block[2]));
+	}
+
 	if (info->indirect) {
 		uint64_t base_va = r600_resource(info->indirect)->gpu_address;
 
diff --git a/src/gallium/include/pipe/p_state.h b/src/gallium/include/pipe/p_state.h
index 38052e5fd3d..56f5bdd4c85 100644
--- a/src/gallium/include/pipe/p_state.h
+++ b/src/gallium/include/pipe/p_state.h
@@ -838,6 +838,13 @@  struct pipe_grid_info
     */
    uint block[3];
 
+   /**
+    * Number of threads to add to the grid in X, Y, and Z directions for
+    * compute dispatches that are not aligned to the block size.
+    * The added threads will be launched as partial thread blocks.
+    */
+   uint partial_block[3];
+
    /**
     * Determine the layout of the grid (in block units) to be used.
     */

Comments

Why does this need to be in p_state? And who is responsible for
setting it (and how will it be set)?

On Tue, Jan 8, 2019 at 2:47 PM Jiang, Sonny <Sonny.Jiang@amd.com> wrote:
>
> and add radeonsi support. This will be used by radeonsi internally.
>
> Signed-off-by: Sonny Jiang <sonny.jiang@amd.com>
> ---
>  src/gallium/drivers/radeonsi/si_compute.c | 33 +++++++++++++++++++----
>  src/gallium/include/pipe/p_state.h        |  7 +++++
>  2 files changed, 35 insertions(+), 5 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
> index cbcd8e79c7b..69ffad45cd9 100644
> --- a/src/gallium/drivers/radeonsi/si_compute.c
> +++ b/src/gallium/drivers/radeonsi/si_compute.c
> @@ -797,11 +797,6 @@ static void si_emit_dispatch_packets(struct si_context *sctx,
>         radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
>                           compute_resource_limits);
>
> -       radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
> -       radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(info->block[0]));
> -       radeon_emit(cs, S_00B820_NUM_THREAD_FULL(info->block[1]));
> -       radeon_emit(cs, S_00B824_NUM_THREAD_FULL(info->block[2]));
> -
>         unsigned dispatch_initiator =
>                 S_00B800_COMPUTE_SHADER_EN(1) |
>                 S_00B800_FORCE_START_AT_000(1) |
> @@ -809,6 +804,34 @@ static void si_emit_dispatch_packets(struct si_context *sctx,
>                  * allow launching waves out-of-order. (same as Vulkan) */
>                 S_00B800_ORDER_MODE(sctx->chip_class >= CIK);
>
> +       bool partial_block_en = info->partial_block[0] ||
> +                               info->partial_block[1] ||
> +                               info->partial_block[2];
> +
> +       radeon_set_sh_reg_seq(cs, R_00B81C_COMPUTE_NUM_THREAD_X, 3);
> +
> +       if (partial_block_en) {
> +               unsigned partial[3];
> +
> +               /* If no partial_block, these should be an entire block size, not 0. */
> +               partial[0] = info->partial_block[0] ? info->partial_block[0] : info->block[0];
> +               partial[1] = info->partial_block[1] ? info->partial_block[1] : info->block[1];
> +               partial[2] = info->partial_block[2] ? info->partial_block[2] : info->block[2];
> +
> +               radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(info->block[0]) |
> +                               S_00B81C_NUM_THREAD_PARTIAL(partial[0]));
> +               radeon_emit(cs, S_00B820_NUM_THREAD_FULL(info->block[1]) |
> +                               S_00B820_NUM_THREAD_PARTIAL(partial[1]));
> +               radeon_emit(cs, S_00B824_NUM_THREAD_FULL(info->block[2]) |
> +                               S_00B824_NUM_THREAD_PARTIAL(partial[2]));
> +
> +               dispatch_initiator |= S_00B800_PARTIAL_TG_EN(1);
> +       } else {
> +               radeon_emit(cs, S_00B81C_NUM_THREAD_FULL(info->block[0]));
> +               radeon_emit(cs, S_00B820_NUM_THREAD_FULL(info->block[1]));
> +               radeon_emit(cs, S_00B824_NUM_THREAD_FULL(info->block[2]));
> +       }
> +
>         if (info->indirect) {
>                 uint64_t base_va = r600_resource(info->indirect)->gpu_address;
>
> diff --git a/src/gallium/include/pipe/p_state.h b/src/gallium/include/pipe/p_state.h
> index 38052e5fd3d..56f5bdd4c85 100644
> --- a/src/gallium/include/pipe/p_state.h
> +++ b/src/gallium/include/pipe/p_state.h
> @@ -838,6 +838,13 @@ struct pipe_grid_info
>      */
>     uint block[3];
>
> +   /**
> +    * Number of threads to add to the grid in X, Y, and Z directions for
> +    * compute dispatches that are not aligned to the block size.
> +    * The added threads will be launched as partial thread blocks.
> +    */
> +   uint partial_block[3];
> +
>     /**
>      * Determine the layout of the grid (in block units) to be used.
>      */
> --
> 2.17.1
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
On Tue, Jan 8, 2019 at 5:25 PM Ilia Mirkin <imirkin@alum.mit.edu> wrote:

> Why does this need to be in p_state? And who is responsible for
> setting it (and how will it be set)?
>

Oh right, there is a way to get it out of p_state.h if needed.

It should be set to 0 by default.

If your thread block is 8x8x1, but you need to launch 10x8x1 threads, set
partial_block = {2, 0, 0}. It will launch the following thread blocks:
8x8x1
2x8x1

It's the same as launching 16x8x1 threads and doing this at the beginning
of the compute shader:
  if (globalThreadID.x >= 10) return;

Marek
On Tue, Jan 8, 2019 at 6:21 PM Marek Olšák <maraeo@gmail.com> wrote:
>
> On Tue, Jan 8, 2019 at 5:25 PM Ilia Mirkin <imirkin@alum.mit.edu> wrote:
>>
>> Why does this need to be in p_state? And who is responsible for
>> setting it (and how will it be set)?
>
>
> Oh right, there is a way to get it out of p_state.h if needed.
>
> It should be set to 0 by default.
>
> If your thread block is 8x8x1, but you need to launch 10x8x1 threads, set partial_block = {2, 0, 0}. It will launch the following thread blocks:
> 8x8x1
> 2x8x1
>
> It's the same as launching 16x8x1 threads and doing this at the beginning of the compute shader:
>   if (globalThreadID.x >= 10) return;

But that all sounds like something a state tracker wouldn't care
about, right? In e.g. GLSL you can specify the block to be 10x8x1 and
let the backend work it all out. Should st/mesa care about this (or
clover or whatever)?

  -ilia
On Tue, Jan 8, 2019 at 7:18 PM Ilia Mirkin <imirkin@alum.mit.edu> wrote:

> On Tue, Jan 8, 2019 at 6:21 PM Marek Olšák <maraeo@gmail.com> wrote:
> >
> > On Tue, Jan 8, 2019 at 5:25 PM Ilia Mirkin <imirkin@alum.mit.edu> wrote:
> >>
> >> Why does this need to be in p_state? And who is responsible for
> >> setting it (and how will it be set)?
> >
> >
> > Oh right, there is a way to get it out of p_state.h if needed.
> >
> > It should be set to 0 by default.
> >
> > If your thread block is 8x8x1, but you need to launch 10x8x1 threads,
> set partial_block = {2, 0, 0}. It will launch the following thread blocks:
> > 8x8x1
> > 2x8x1
> >
> > It's the same as launching 16x8x1 threads and doing this at the
> beginning of the compute shader:
> >   if (globalThreadID.x >= 10) return;
>
> But that all sounds like something a state tracker wouldn't care
> about, right? In e.g. GLSL you can specify the block to be 10x8x1 and
> let the backend work it all out. Should st/mesa care about this (or
> clover or whatever)?
>

The block size should be a multiple of 64 on radeonsi to utilize all SIMD
lanes. If you want to launch 8192+1 threads with the block size of 64, you
need to launch 1 partial block with the block size of 1 at the end. OpenGL
can't do this.

Marek
On Tue, Jan 8, 2019 at 7:26 PM Marek Olšák <maraeo@gmail.com> wrote:
>
> On Tue, Jan 8, 2019 at 7:18 PM Ilia Mirkin <imirkin@alum.mit.edu> wrote:
>>
>> On Tue, Jan 8, 2019 at 6:21 PM Marek Olšák <maraeo@gmail.com> wrote:
>> >
>> > On Tue, Jan 8, 2019 at 5:25 PM Ilia Mirkin <imirkin@alum.mit.edu> wrote:
>> >>
>> >> Why does this need to be in p_state? And who is responsible for
>> >> setting it (and how will it be set)?
>> >
>> >
>> > Oh right, there is a way to get it out of p_state.h if needed.
>> >
>> > It should be set to 0 by default.
>> >
>> > If your thread block is 8x8x1, but you need to launch 10x8x1 threads, set partial_block = {2, 0, 0}. It will launch the following thread blocks:
>> > 8x8x1
>> > 2x8x1
>> >
>> > It's the same as launching 16x8x1 threads and doing this at the beginning of the compute shader:
>> >   if (globalThreadID.x >= 10) return;
>>
>> But that all sounds like something a state tracker wouldn't care
>> about, right? In e.g. GLSL you can specify the block to be 10x8x1 and
>> let the backend work it all out. Should st/mesa care about this (or
>> clover or whatever)?
>
>
> The block size should be a multiple of 64 on radeonsi to utilize all SIMD lanes. If you want to launch 8192+1 threads with the block size of 64, you need to launch 1 partial block with the block size of 1 at the end. OpenGL can't do this.

Ohhhhhhhhhh. So the partial-ness applies to the last-executed block.
If you have a local_size=(2,2,1), and you want your global grid to be,
say, (5,4,1), with unextended GL you might run it as groups = (3,2,1)
which would end up invoking a bunch of bits you don't want, and so
this partial_size is a way to say that you don't want the last "line"
to be executed at all.

That makes sense, and seems like a reasonable thing to have in
pipe_grid_info. The documentation did not make that clear the first
time I read it, but now I'm having trouble suggesting improvements to
it. So I think it's fine.

The p_state.h bits are Acked-by: Ilia Mirkin <imirkin@alum.mit.edu> .
Can't speak as to the radeonsi bits.

Cheers,

  -ilia
On Tue, Jan 8, 2019, 7:55 PM Ilia Mirkin <imirkin@alum.mit.edu wrote:

> On Tue, Jan 8, 2019 at 7:26 PM Marek Olšák <maraeo@gmail.com> wrote:
> >
> > On Tue, Jan 8, 2019 at 7:18 PM Ilia Mirkin <imirkin@alum.mit.edu> wrote:
> >>
> >> On Tue, Jan 8, 2019 at 6:21 PM Marek Olšák <maraeo@gmail.com> wrote:
> >> >
> >> > On Tue, Jan 8, 2019 at 5:25 PM Ilia Mirkin <imirkin@alum.mit.edu>
> wrote:
> >> >>
> >> >> Why does this need to be in p_state? And who is responsible for
> >> >> setting it (and how will it be set)?
> >> >
> >> >
> >> > Oh right, there is a way to get it out of p_state.h if needed.
> >> >
> >> > It should be set to 0 by default.
> >> >
> >> > If your thread block is 8x8x1, but you need to launch 10x8x1 threads,
> set partial_block = {2, 0, 0}. It will launch the following thread blocks:
> >> > 8x8x1
> >> > 2x8x1
> >> >
> >> > It's the same as launching 16x8x1 threads and doing this at the
> beginning of the compute shader:
> >> >   if (globalThreadID.x >= 10) return;
> >>
> >> But that all sounds like something a state tracker wouldn't care
> >> about, right? In e.g. GLSL you can specify the block to be 10x8x1 and
> >> let the backend work it all out. Should st/mesa care about this (or
> >> clover or whatever)?
> >
> >
> > The block size should be a multiple of 64 on radeonsi to utilize all
> SIMD lanes. If you want to launch 8192+1 threads with the block size of 64,
> you need to launch 1 partial block with the block size of 1 at the end.
> OpenGL can't do this.
>
> Ohhhhhhhhhh. So the partial-ness applies to the last-executed block.
> If you have a local_size=(2,2,1), and you want your global grid to be,
> say, (5,4,1), with unextended GL you might run it as groups = (3,2,1)
> which would end up invoking a bunch of bits you don't want, and so
> this partial_size is a way to say that you don't want the last "line"
> to be executed at all.
>
> That makes sense, and seems like a reasonable thing to have in
> pipe_grid_info. The documentation did not make that clear the first
> time I read it, but now I'm having trouble suggesting improvements to
> it. So I think it's fine.
>

Yep. The partialness adds additional blocks to the grid with disabled
threads (lanes). I might rename it to grid_padding[3]. Or I might keep the
whole thing private in radeonsi.

It's useful for e.g. compute-based image blits when the blit box is not
aligned to the block size.

Marek

The p_state.h bits are Acked-by: Ilia Mirkin <imirkin@alum.mit.edu> .
> Can't speak as to the radeonsi bits.
>
> Cheers,
>
>   -ilia
>