[Mesa-dev,07/37] i965/gen6/gs: Implement GS_OPCODE_FF_SYNC.

Submitted by Iago Toral Quiroga on Aug. 14, 2014, 11:11 a.m.

Details

Message ID 1408014729-12708-8-git-send-email-itoral@igalia.com
State Accepted
Headers show

Not browsing as part of any series.

Commit Message

Iago Toral Quiroga Aug. 14, 2014, 11:11 a.m.
This implements the FF_SYNC message required in gen6  geometry shaders to
get the initial URB handle.
---
 src/mesa/drivers/dri/i965/brw_defines.h          | 14 +++++++++
 src/mesa/drivers/dri/i965/brw_shader.cpp         |  2 ++
 src/mesa/drivers/dri/i965/brw_vec4.h             |  3 ++
 src/mesa/drivers/dri/i965/brw_vec4_generator.cpp | 40 ++++++++++++++++++++++++
 4 files changed, 59 insertions(+)

Patch hide | download patch | download mbox

diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 3564041..125d728 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -1002,6 +1002,20 @@  enum opcode {
     * - dst is the GRF for gl_InvocationID.
     */
    GS_OPCODE_GET_INSTANCE_ID,
+
+   /**
+    * Send a FF_SYNC message to allocate initial URB handles (gen6).
+    *
+    * - dst will hold the newly allocated VUE handle. It is expected to be
+    *   be initialized so that it can be used to as the FF_SYNC message header
+    *   (that is, it won't do an implied move from R0).
+    *
+    * - src0 is a temporary that will be used as writeback register for the
+    *   FF_SYNC operation.
+    *
+    * - src1 is the number of primitives written.
+    */
+   GS_OPCODE_FF_SYNC,
 };
 
 enum brw_urb_write_flags {
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index 0033135..5749061 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -528,6 +528,8 @@  brw_instruction_name(enum opcode op)
       return "set_channel_masks";
    case GS_OPCODE_GET_INSTANCE_ID:
       return "get_instance_id";
+   case GS_OPCODE_FF_SYNC:
+      return "ff_sync";
 
    default:
       /* Yes, this leaks.  It's in debug code, it should never occur, and if
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
index 67132c0..72fabdd 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -659,6 +659,9 @@  private:
    void generate_gs_prepare_channel_masks(struct brw_reg dst);
    void generate_gs_set_channel_masks(struct brw_reg dst, struct brw_reg src);
    void generate_gs_get_instance_id(struct brw_reg dst);
+   void generate_gs_ff_sync(struct brw_reg dst,
+                            struct brw_reg src0,
+                            struct brw_reg src1);
    void generate_oword_dual_block_offsets(struct brw_reg m1,
 					  struct brw_reg index);
    void generate_scratch_write(vec4_instruction *inst,
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
index c63b47a..05f4892 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
@@ -621,6 +621,42 @@  vec4_generator::generate_gs_get_instance_id(struct brw_reg dst)
 }
 
 void
+vec4_generator::generate_gs_ff_sync(struct brw_reg dst,
+                                    struct brw_reg src0,
+                                    struct brw_reg src1)
+{
+   /* We use dst to setup the ff_sync header, so we expect it to be
+    * initialized to R0 by the caller. Here we overwrite dword 0 (cleared
+    * for now since we are not doing transform feedback) and dword 1
+    * (to hold the number of primitives written).
+    */
+   brw_push_insn_state(p);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_MOV(p, get_element_ud(dst, 0), brw_imm_ud(0));
+   brw_MOV(p, get_element_ud(dst, 1), get_element_ud(src1, 0));
+   brw_set_default_access_mode(p, BRW_ALIGN_16);
+   brw_pop_insn_state(p);
+
+   /* Write allocated URB handle to temporary passed in src0 */
+   brw_ff_sync(p,
+               src0,
+               0,
+               dst,
+               1, /* allocate */
+               1, /* response length */
+               0 /* eot */);
+
+   /* Now put allocated urb handle in dst.0 */
+   brw_push_insn_state(p);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+   brw_MOV(p, get_element_ud(dst, 0), get_element_ud(src0, 0));
+   brw_set_default_access_mode(p, BRW_ALIGN_16);
+   brw_pop_insn_state(p);
+}
+
+void
 vec4_generator::generate_oword_dual_block_offsets(struct brw_reg m1,
                                                   struct brw_reg index)
 {
@@ -1198,6 +1234,10 @@  vec4_generator::generate_vec4_instruction(vec4_instruction *instruction,
       generate_gs_get_instance_id(dst);
       break;
 
+   case GS_OPCODE_FF_SYNC:
+      generate_gs_ff_sync(dst, src[0], src[1]);
+      break;
+
    case SHADER_OPCODE_SHADER_TIME_ADD:
       brw_shader_time_add(p, src[0],
                           prog_data->base.binding_table.shader_time_start);

Comments

On Thursday, August 14, 2014 01:11:39 PM Iago Toral Quiroga wrote:
> This implements the FF_SYNC message required in gen6  geometry shaders to
> get the initial URB handle.
> ---
>  src/mesa/drivers/dri/i965/brw_defines.h          | 14 +++++++++
>  src/mesa/drivers/dri/i965/brw_shader.cpp         |  2 ++
>  src/mesa/drivers/dri/i965/brw_vec4.h             |  3 ++
>  src/mesa/drivers/dri/i965/brw_vec4_generator.cpp | 40 ++++++++++++++++++++++++
>  4 files changed, 59 insertions(+)
> 
> diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
> index 3564041..125d728 100644
> --- a/src/mesa/drivers/dri/i965/brw_defines.h
> +++ b/src/mesa/drivers/dri/i965/brw_defines.h
> @@ -1002,6 +1002,20 @@ enum opcode {
>      * - dst is the GRF for gl_InvocationID.
>      */
>     GS_OPCODE_GET_INSTANCE_ID,
> +
> +   /**
> +    * Send a FF_SYNC message to allocate initial URB handles (gen6).
> +    *
> +    * - dst will hold the newly allocated VUE handle. It is expected to be
> +    *   be initialized so that it can be used to as the FF_SYNC message header
> +    *   (that is, it won't do an implied move from R0).
> +    *
> +    * - src0 is a temporary that will be used as writeback register for the
> +    *   FF_SYNC operation.

I was originally a bit concerned that writing over a source register could get your into trouble - say, optimization passes not noticing that you're modifying src0, or the scheduler not noticing dependencies.  But, since you allocate a temporary, and only use it in this one instruction, there's not really any chance for optimizations to combine it with anything.

So, I suppose this will work.

Another way you might consider doing this...

1. Add GS_OPCODE_FF_SYNC to vec4_visitor::implied_mrf_writes, and return 1.
   This tells the scheduler etc. that you implicitly trash 1 MRF.
2. Make the visitor set inst->base_mrf, and MOV <base_mrf> g0.
3. Make the generator use brw_message_reg(inst->base_mrf) for the header MOVs.

Then, src0 can be the number of primitives written, and you don't need src1.
That seems simpler to me.

brw_ff_sync will resolve your GRF (dst) to an MRF anyway; the above approach would write directly into the MRF, saving you a GRF.

Thoughts?

> +    *
> +    * - src1 is the number of primitives written.
> +    */
> +   GS_OPCODE_FF_SYNC,
>  };
>  
>  enum brw_urb_write_flags {
> diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
> index 0033135..5749061 100644
> --- a/src/mesa/drivers/dri/i965/brw_shader.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
> @@ -528,6 +528,8 @@ brw_instruction_name(enum opcode op)
>        return "set_channel_masks";
>     case GS_OPCODE_GET_INSTANCE_ID:
>        return "get_instance_id";
> +   case GS_OPCODE_FF_SYNC:
> +      return "ff_sync";
>  
>     default:
>        /* Yes, this leaks.  It's in debug code, it should never occur, and if
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
> index 67132c0..72fabdd 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4.h
> +++ b/src/mesa/drivers/dri/i965/brw_vec4.h
> @@ -659,6 +659,9 @@ private:
>     void generate_gs_prepare_channel_masks(struct brw_reg dst);
>     void generate_gs_set_channel_masks(struct brw_reg dst, struct brw_reg src);
>     void generate_gs_get_instance_id(struct brw_reg dst);
> +   void generate_gs_ff_sync(struct brw_reg dst,
> +                            struct brw_reg src0,
> +                            struct brw_reg src1);
>     void generate_oword_dual_block_offsets(struct brw_reg m1,
>  					  struct brw_reg index);
>     void generate_scratch_write(vec4_instruction *inst,
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
> index c63b47a..05f4892 100644
> --- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
> @@ -621,6 +621,42 @@ vec4_generator::generate_gs_get_instance_id(struct brw_reg dst)
>  }
>  
>  void
> +vec4_generator::generate_gs_ff_sync(struct brw_reg dst,
> +                                    struct brw_reg src0,
> +                                    struct brw_reg src1)
> +{
> +   /* We use dst to setup the ff_sync header, so we expect it to be
> +    * initialized to R0 by the caller. Here we overwrite dword 0 (cleared
> +    * for now since we are not doing transform feedback) and dword 1
> +    * (to hold the number of primitives written).
> +    */
> +   brw_push_insn_state(p);
> +   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
> +   brw_set_default_access_mode(p, BRW_ALIGN_1);
> +   brw_MOV(p, get_element_ud(dst, 0), brw_imm_ud(0));
> +   brw_MOV(p, get_element_ud(dst, 1), get_element_ud(src1, 0));
> +   brw_set_default_access_mode(p, BRW_ALIGN_16);
> +   brw_pop_insn_state(p);
> +
> +   /* Write allocated URB handle to temporary passed in src0 */
> +   brw_ff_sync(p,
> +               src0,
> +               0,
> +               dst,
> +               1, /* allocate */
> +               1, /* response length */
> +               0 /* eot */);
> +
> +   /* Now put allocated urb handle in dst.0 */
> +   brw_push_insn_state(p);
> +   brw_set_default_access_mode(p, BRW_ALIGN_1);
> +   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
> +   brw_MOV(p, get_element_ud(dst, 0), get_element_ud(src0, 0));
> +   brw_set_default_access_mode(p, BRW_ALIGN_16);
> +   brw_pop_insn_state(p);
> +}
> +
> +void
>  vec4_generator::generate_oword_dual_block_offsets(struct brw_reg m1,
>                                                    struct brw_reg index)
>  {
> @@ -1198,6 +1234,10 @@ vec4_generator::generate_vec4_instruction(vec4_instruction *instruction,
>        generate_gs_get_instance_id(dst);
>        break;
>  
> +   case GS_OPCODE_FF_SYNC:
> +      generate_gs_ff_sync(dst, src[0], src[1]);
> +      break;
> +
>     case SHADER_OPCODE_SHADER_TIME_ADD:
>        brw_shader_time_add(p, src[0],
>                            prog_data->base.binding_table.shader_time_start);
>
On lun, 2014-09-01 at 11:17 -0700, Kenneth Graunke wrote:
> On Thursday, August 14, 2014 01:11:39 PM Iago Toral Quiroga wrote:
> > This implements the FF_SYNC message required in gen6  geometry shaders to
> > get the initial URB handle.
> > ---
> >  src/mesa/drivers/dri/i965/brw_defines.h          | 14 +++++++++
> >  src/mesa/drivers/dri/i965/brw_shader.cpp         |  2 ++
> >  src/mesa/drivers/dri/i965/brw_vec4.h             |  3 ++
> >  src/mesa/drivers/dri/i965/brw_vec4_generator.cpp | 40 ++++++++++++++++++++++++
> >  4 files changed, 59 insertions(+)
> > 
> > diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
> > index 3564041..125d728 100644
> > --- a/src/mesa/drivers/dri/i965/brw_defines.h
> > +++ b/src/mesa/drivers/dri/i965/brw_defines.h
> > @@ -1002,6 +1002,20 @@ enum opcode {
> >      * - dst is the GRF for gl_InvocationID.
> >      */
> >     GS_OPCODE_GET_INSTANCE_ID,
> > +
> > +   /**
> > +    * Send a FF_SYNC message to allocate initial URB handles (gen6).
> > +    *
> > +    * - dst will hold the newly allocated VUE handle. It is expected to be
> > +    *   be initialized so that it can be used to as the FF_SYNC message header
> > +    *   (that is, it won't do an implied move from R0).
> > +    *
> > +    * - src0 is a temporary that will be used as writeback register for the
> > +    *   FF_SYNC operation.
> 
> I was originally a bit concerned that writing over a source register could get your into trouble - say, optimization passes not noticing that you're modifying src0, or the scheduler not noticing dependencies.  But, since you allocate a temporary, and only use it in this one instruction, there's not really any chance for optimizations to combine it with anything.
> 
> So, I suppose this will work.
> 
> Another way you might consider doing this...
> 
> 1. Add GS_OPCODE_FF_SYNC to vec4_visitor::implied_mrf_writes, and return 1.
>    This tells the scheduler etc. that you implicitly trash 1 MRF.
> 2. Make the visitor set inst->base_mrf, and MOV <base_mrf> g0.
> 3. Make the generator use brw_message_reg(inst->base_mrf) for the header MOVs.
> 
> Then, src0 can be the number of primitives written, and you don't need src1.
> That seems simpler to me.
> 
> brw_ff_sync will resolve your GRF (dst) to an MRF anyway; the above approach would write directly into the MRF, saving you a GRF.
> 
> Thoughts?

That looks like a better idea. I'll give it a go.
Thanks!

Iago