[Mesa-dev] Remove unneeded stall calls from batches on Baytrail.

Submitted by Gregory Hunt on June 18, 2014, 10:21 a.m.

Details

Message ID 1403086860-28010-1-git-send-email-greg.hunt@mobica.com
State New
Headers show

Not browsing as part of any series.

Commit Message

Gregory Hunt June 18, 2014, 10:21 a.m.
From: Greg Hunt <greg.hunt@mobica.com>

These cause a small slowdown when we are sending a large number of small batches to the GPU.

Signed-off-by: Gregory Hunt <greg.hunt@mobica.com>
---
 src/mesa/drivers/dri/i965/gen6_vs_state.c      | 2 +-
 src/mesa/drivers/dri/i965/gen7_blorp.cpp       | 2 +-
 src/mesa/drivers/dri/i965/gen7_gs_state.c      | 2 +-
 src/mesa/drivers/dri/i965/gen7_sampler_state.c | 2 +-
 src/mesa/drivers/dri/i965/gen7_urb.c           | 6 +++---
 src/mesa/drivers/dri/i965/gen7_vs_state.c      | 2 +-
 6 files changed, 8 insertions(+), 8 deletions(-)

Patch hide | download patch | download mbox

diff --git a/src/mesa/drivers/dri/i965/gen6_vs_state.c b/src/mesa/drivers/dri/i965/gen6_vs_state.c
index 9764645..a46cc48 100644
--- a/src/mesa/drivers/dri/i965/gen6_vs_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_vs_state.c
@@ -100,7 +100,7 @@  gen6_upload_vs_push_constants(struct brw_context *brw)
                                    stage_state, AUB_TRACE_VS_CONSTANTS);
 
    if (brw->gen >= 7) {
-      if (brw->gen == 7 && !brw->is_haswell)
+      if (brw->gen == 7 && !brw->is_haswell && !brw->is_baytrail)
          gen7_emit_vs_workaround_flush(brw);
 
       gen7_upload_constant_state(brw, stage_state, true /* active */,
diff --git a/src/mesa/drivers/dri/i965/gen7_blorp.cpp b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
index 448b505..a1337fe 100644
--- a/src/mesa/drivers/dri/i965/gen7_blorp.cpp
+++ b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
@@ -414,7 +414,7 @@  gen7_blorp_emit_gs_disable(struct brw_context *brw,
     * whole fixed function pipeline" means to emit a PIPE_CONTROL with the "CS
     * Stall" bit set.
     */
-   if (!brw->is_haswell && brw->gt == 2 && brw->gs.enabled)
+   if (!brw->is_haswell && !brw->is_baytrail && brw->gt == 2 && brw->gs.enabled)
       gen7_emit_cs_stall_flush(brw);
 
    BEGIN_BATCH(7);
diff --git a/src/mesa/drivers/dri/i965/gen7_gs_state.c b/src/mesa/drivers/dri/i965/gen7_gs_state.c
index 30dfa6b..786e1fb 100644
--- a/src/mesa/drivers/dri/i965/gen7_gs_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_gs_state.c
@@ -82,7 +82,7 @@  upload_gs_state(struct brw_context *brw)
     * whole fixed function pipeline" means to emit a PIPE_CONTROL with the "CS
     * Stall" bit set.
     */
-   if (!brw->is_haswell && brw->gt == 2 && brw->gs.enabled != active)
+   if (!brw->is_haswell && !brw->is_baytrail && brw->gt == 2 && brw->gs.enabled != active)
       gen7_emit_cs_stall_flush(brw);
 
    if (active) {
diff --git a/src/mesa/drivers/dri/i965/gen7_sampler_state.c b/src/mesa/drivers/dri/i965/gen7_sampler_state.c
index 6077ff2..219a174 100644
--- a/src/mesa/drivers/dri/i965/gen7_sampler_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_sampler_state.c
@@ -212,7 +212,7 @@  gen7_upload_sampler_state_table(struct brw_context *brw,
       }
    }
 
-  if (brw->gen == 7 && !brw->is_haswell &&
+  if (brw->gen == 7 && !brw->is_haswell && !brw->is_baytrail &&
       stage_state->stage == MESA_SHADER_VERTEX) {
       gen7_emit_vs_workaround_flush(brw);
   }
diff --git a/src/mesa/drivers/dri/i965/gen7_urb.c b/src/mesa/drivers/dri/i965/gen7_urb.c
index 2653e9c..190d6f0 100644
--- a/src/mesa/drivers/dri/i965/gen7_urb.c
+++ b/src/mesa/drivers/dri/i965/gen7_urb.c
@@ -121,9 +121,9 @@  gen7_emit_push_constant_state(struct brw_context *brw, unsigned vs_size,
     *     A PIPE_CONTOL command with the CS Stall bit set must be programmed
     *     in the ring after this instruction.
     *
-    * No such restriction exists for Haswell.
+    * No such restriction exists for Haswell or Baytrail.
     */
-   if (brw->gen < 8 && !brw->is_haswell)
+   if (brw->gen < 8 && !brw->is_haswell && !brw->is_baytrail)
       gen7_emit_cs_stall_flush(brw);
 }
 
@@ -263,7 +263,7 @@  gen7_upload_urb(struct brw_context *brw)
    brw->urb.vs_start = push_constant_chunks;
    brw->urb.gs_start = push_constant_chunks + vs_chunks;
 
-   if (brw->gen == 7 && !brw->is_haswell)
+   if (brw->gen == 7 && !brw->is_haswell && !brw->is_baytrail)
       gen7_emit_vs_workaround_flush(brw);
    gen7_emit_urb_state(brw,
                        brw->urb.nr_vs_entries, vs_size, brw->urb.vs_start,
diff --git a/src/mesa/drivers/dri/i965/gen7_vs_state.c b/src/mesa/drivers/dri/i965/gen7_vs_state.c
index 4d99150..01be756 100644
--- a/src/mesa/drivers/dri/i965/gen7_vs_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_vs_state.c
@@ -72,7 +72,7 @@  upload_vs_state(struct brw_context *brw)
    const int max_threads_shift = brw->is_haswell ?
       HSW_VS_MAX_THREADS_SHIFT : GEN6_VS_MAX_THREADS_SHIFT;
 
-   if (!brw->is_haswell)
+   if (!brw->is_haswell && !brw->is_baytrail)
       gen7_emit_vs_workaround_flush(brw);
 
    /* Use ALT floating point mode for ARB vertex programs, because they

Comments

On Wednesday, June 18, 2014 11:21:00 AM Gregory Hunt wrote:
> From: Greg Hunt <greg.hunt@mobica.com>
> 
> These cause a small slowdown when we are sending a large number of small 
batches to the GPU.

Hello!

Do you have any more specific data?  For example, "improves performance by X% 
in application Y" would be great.  We don't need absolute FPS numbers, but 
relative percentages are useful.  I've heard 5% through the grapevine, but I 
don't know what application/test that was in.

> 
> Signed-off-by: Gregory Hunt <greg.hunt@mobica.com>
> ---
>  src/mesa/drivers/dri/i965/gen6_vs_state.c      | 2 +-
>  src/mesa/drivers/dri/i965/gen7_blorp.cpp       | 2 +-
>  src/mesa/drivers/dri/i965/gen7_gs_state.c      | 2 +-
>  src/mesa/drivers/dri/i965/gen7_sampler_state.c | 2 +-
>  src/mesa/drivers/dri/i965/gen7_urb.c           | 6 +++---
>  src/mesa/drivers/dri/i965/gen7_vs_state.c      | 2 +-
>  6 files changed, 8 insertions(+), 8 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/gen6_vs_state.c 
b/src/mesa/drivers/dri/i965/gen6_vs_state.c
> index 9764645..a46cc48 100644
> --- a/src/mesa/drivers/dri/i965/gen6_vs_state.c
> +++ b/src/mesa/drivers/dri/i965/gen6_vs_state.c
> @@ -100,7 +100,7 @@ gen6_upload_vs_push_constants(struct brw_context *brw)
>                                     stage_state, AUB_TRACE_VS_CONSTANTS);
>  
>     if (brw->gen >= 7) {
> -      if (brw->gen == 7 && !brw->is_haswell)
> +      if (brw->gen == 7 && !brw->is_haswell && !brw->is_baytrail)
>           gen7_emit_vs_workaround_flush(brw);
>  
>        gen7_upload_constant_state(brw, stage_state, true /* active */,
> diff --git a/src/mesa/drivers/dri/i965/gen7_blorp.cpp 
b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
> index 448b505..a1337fe 100644
> --- a/src/mesa/drivers/dri/i965/gen7_blorp.cpp
> +++ b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
> @@ -414,7 +414,7 @@ gen7_blorp_emit_gs_disable(struct brw_context *brw,
>      * whole fixed function pipeline" means to emit a PIPE_CONTROL with the 
"CS
>      * Stall" bit set.
>      */
> -   if (!brw->is_haswell && brw->gt == 2 && brw->gs.enabled)
> +   if (!brw->is_haswell && !brw->is_baytrail && brw->gt == 2 && brw-
>gs.enabled)
>        gen7_emit_cs_stall_flush(brw);

This hunk doesn't do anything: brw->gt is always 1 on Baytrail, so this should 
never have triggered in the first place.  I'd drop this hunk.

However, there's some evidence that we may actually need to do this flush: I 
believe the Windows VLV mobile driver does so.  Not sure if they're just 
carrying that over from IVB, or if it's actually necessary.

We can always investigate/fix that later.

>     BEGIN_BATCH(7);
> diff --git a/src/mesa/drivers/dri/i965/gen7_gs_state.c 
b/src/mesa/drivers/dri/i965/gen7_gs_state.c
> index 30dfa6b..786e1fb 100644
> --- a/src/mesa/drivers/dri/i965/gen7_gs_state.c
> +++ b/src/mesa/drivers/dri/i965/gen7_gs_state.c
> @@ -82,7 +82,7 @@ upload_gs_state(struct brw_context *brw)
>      * whole fixed function pipeline" means to emit a PIPE_CONTROL with the 
"CS
>      * Stall" bit set.
>      */
> -   if (!brw->is_haswell && brw->gt == 2 && brw->gs.enabled != active)
> +   if (!brw->is_haswell && !brw->is_baytrail && brw->gt == 2 && brw-
>gs.enabled != active)
>        gen7_emit_cs_stall_flush(brw);

Likewise, this hunk doesn't do anything either.  Please drop it.

>     if (active) {
> diff --git a/src/mesa/drivers/dri/i965/gen7_sampler_state.c 
b/src/mesa/drivers/dri/i965/gen7_sampler_state.c
> index 6077ff2..219a174 100644
> --- a/src/mesa/drivers/dri/i965/gen7_sampler_state.c
> +++ b/src/mesa/drivers/dri/i965/gen7_sampler_state.c
> @@ -212,7 +212,7 @@ gen7_upload_sampler_state_table(struct brw_context *brw,
>        }
>     }
>  
> -  if (brw->gen == 7 && !brw->is_haswell &&
> +  if (brw->gen == 7 && !brw->is_haswell && !brw->is_baytrail &&
>        stage_state->stage == MESA_SHADER_VERTEX) {
>        gen7_emit_vs_workaround_flush(brw);
>    }
> diff --git a/src/mesa/drivers/dri/i965/gen7_urb.c 
b/src/mesa/drivers/dri/i965/gen7_urb.c
> index 2653e9c..190d6f0 100644
> --- a/src/mesa/drivers/dri/i965/gen7_urb.c
> +++ b/src/mesa/drivers/dri/i965/gen7_urb.c
> @@ -121,9 +121,9 @@ gen7_emit_push_constant_state(struct brw_context *brw, 
unsigned vs_size,
>      *     A PIPE_CONTOL command with the CS Stall bit set must be 
programmed
>      *     in the ring after this instruction.
>      *
> -    * No such restriction exists for Haswell.
> +    * No such restriction exists for Haswell or Baytrail.
>      */
> -   if (brw->gen < 8 && !brw->is_haswell)
> +   if (brw->gen < 8 && !brw->is_haswell && !brw->is_baytrail)
>        gen7_emit_cs_stall_flush(brw);
>  }
>  
> @@ -263,7 +263,7 @@ gen7_upload_urb(struct brw_context *brw)
>     brw->urb.vs_start = push_constant_chunks;
>     brw->urb.gs_start = push_constant_chunks + vs_chunks;
>  
> -   if (brw->gen == 7 && !brw->is_haswell)
> +   if (brw->gen == 7 && !brw->is_haswell && !brw->is_baytrail)
>        gen7_emit_vs_workaround_flush(brw);
>     gen7_emit_urb_state(brw,
>                         brw->urb.nr_vs_entries, vs_size, brw->urb.vs_start,
> diff --git a/src/mesa/drivers/dri/i965/gen7_vs_state.c 
b/src/mesa/drivers/dri/i965/gen7_vs_state.c
> index 4d99150..01be756 100644
> --- a/src/mesa/drivers/dri/i965/gen7_vs_state.c
> +++ b/src/mesa/drivers/dri/i965/gen7_vs_state.c
> @@ -72,7 +72,7 @@ upload_vs_state(struct brw_context *brw)
>     const int max_threads_shift = brw->is_haswell ?
>        HSW_VS_MAX_THREADS_SHIFT : GEN6_VS_MAX_THREADS_SHIFT;
>  
> -   if (!brw->is_haswell)
> +   if (!brw->is_haswell && !brw->is_baytrail)
>        gen7_emit_vs_workaround_flush(brw);
>  
>     /* Use ALT floating point mode for ARB vertex programs, because they
> 

I dug around in the workarounds list, and it looks like 
WaPostSyncOpBeforeVSState isn't necessary on production Baytrail.  So, I think 
we can indeed drop these flushes.  Good catch, and thanks for the patch!

Please send a V2 of the patch with the "improves performance by 5% in <app>" 
note in the commit message and the extra hunks dropped.  Then, I believe we 
can commit this.

--Ken
Kenneth Graunke <kenneth@whitecape.org> writes:
> Please send a V2 of the patch with the "improves performance by 5% in <app>" 
> note in the commit message and the extra hunks dropped.  Then, I believe we 
> can commit this.

Also, with the updated version, you can drop the CC of
mesa-stable@. This looks like a performance optimization, not a bug fix.

-Carl
On 24 June 2014 22:25, Kenneth Graunke <kenneth@whitecape.org> wrote:
>
> On Wednesday, June 18, 2014 11:21:00 AM Gregory Hunt wrote:
> > From: Greg Hunt <greg.hunt@mobica.com>
> >
> > These cause a small slowdown when we are sending a large number of small
> batches to the GPU.
>
> Hello!
>
> Do you have any more specific data?  For example, "improves performance by X%
> in application Y" would be great.  We don't need absolute FPS numbers, but
> relative percentages are useful.  I've heard 5% through the grapevine, but I
> don't know what application/test that was in.
Synmark Batch tests 4-7 improve by 3.5 to 5.3%, a couple of other CPU
bound test cases also improve (DrvState1, HdrBloom, Multithread,
ShMapPcf)

>
> >
> > Signed-off-by: Gregory Hunt <greg.hunt@mobica.com>
> > ---
> >  src/mesa/drivers/dri/i965/gen6_vs_state.c      | 2 +-
> >  src/mesa/drivers/dri/i965/gen7_blorp.cpp       | 2 +-
> >  src/mesa/drivers/dri/i965/gen7_gs_state.c      | 2 +-
> >  src/mesa/drivers/dri/i965/gen7_sampler_state.c | 2 +-
> >  src/mesa/drivers/dri/i965/gen7_urb.c           | 6 +++---
> >  src/mesa/drivers/dri/i965/gen7_vs_state.c      | 2 +-
> >  6 files changed, 8 insertions(+), 8 deletions(-)
> >
> > diff --git a/src/mesa/drivers/dri/i965/gen6_vs_state.c
> b/src/mesa/drivers/dri/i965/gen6_vs_state.c
> > index 9764645..a46cc48 100644
> > --- a/src/mesa/drivers/dri/i965/gen6_vs_state.c
> > +++ b/src/mesa/drivers/dri/i965/gen6_vs_state.c
> > @@ -100,7 +100,7 @@ gen6_upload_vs_push_constants(struct brw_context *brw)
> >                                     stage_state, AUB_TRACE_VS_CONSTANTS);
> >
> >     if (brw->gen >= 7) {
> > -      if (brw->gen == 7 && !brw->is_haswell)
> > +      if (brw->gen == 7 && !brw->is_haswell && !brw->is_baytrail)
> >           gen7_emit_vs_workaround_flush(brw);
> >
> >        gen7_upload_constant_state(brw, stage_state, true /* active */,
> > diff --git a/src/mesa/drivers/dri/i965/gen7_blorp.cpp
> b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
> > index 448b505..a1337fe 100644
> > --- a/src/mesa/drivers/dri/i965/gen7_blorp.cpp
> > +++ b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
> > @@ -414,7 +414,7 @@ gen7_blorp_emit_gs_disable(struct brw_context *brw,
> >      * whole fixed function pipeline" means to emit a PIPE_CONTROL with the
> "CS
> >      * Stall" bit set.
> >      */
> > -   if (!brw->is_haswell && brw->gt == 2 && brw->gs.enabled)
> > +   if (!brw->is_haswell && !brw->is_baytrail && brw->gt == 2 && brw-
> >gs.enabled)
> >        gen7_emit_cs_stall_flush(brw);
>
> This hunk doesn't do anything: brw->gt is always 1 on Baytrail, so this should
> never have triggered in the first place.  I'd drop this hunk.
Ah yes, I didn't notice that.

>
> However, there's some evidence that we may actually need to do this flush: I
> believe the Windows VLV mobile driver does so.  Not sure if they're just
> carrying that over from IVB, or if it's actually necessary.
>
I've checked the bspec and also looked for piglit regressions, and it
seems unnecessary.

> We can always investigate/fix that later.
>
> >     BEGIN_BATCH(7);
> > diff --git a/src/mesa/drivers/dri/i965/gen7_gs_state.c
> b/src/mesa/drivers/dri/i965/gen7_gs_state.c
> > index 30dfa6b..786e1fb 100644
> > --- a/src/mesa/drivers/dri/i965/gen7_gs_state.c
> > +++ b/src/mesa/drivers/dri/i965/gen7_gs_state.c
> > @@ -82,7 +82,7 @@ upload_gs_state(struct brw_context *brw)
> >      * whole fixed function pipeline" means to emit a PIPE_CONTROL with the
> "CS
> >      * Stall" bit set.
> >      */
> > -   if (!brw->is_haswell && brw->gt == 2 && brw->gs.enabled != active)
> > +   if (!brw->is_haswell && !brw->is_baytrail && brw->gt == 2 && brw-
> >gs.enabled != active)
> >        gen7_emit_cs_stall_flush(brw);
>
> Likewise, this hunk doesn't do anything either.  Please drop it.
>
> >     if (active) {
> > diff --git a/src/mesa/drivers/dri/i965/gen7_sampler_state.c
> b/src/mesa/drivers/dri/i965/gen7_sampler_state.c
> > index 6077ff2..219a174 100644
> > --- a/src/mesa/drivers/dri/i965/gen7_sampler_state.c
> > +++ b/src/mesa/drivers/dri/i965/gen7_sampler_state.c
> > @@ -212,7 +212,7 @@ gen7_upload_sampler_state_table(struct brw_context *brw,
> >        }
> >     }
> >
> > -  if (brw->gen == 7 && !brw->is_haswell &&
> > +  if (brw->gen == 7 && !brw->is_haswell && !brw->is_baytrail &&
> >        stage_state->stage == MESA_SHADER_VERTEX) {
> >        gen7_emit_vs_workaround_flush(brw);
> >    }
> > diff --git a/src/mesa/drivers/dri/i965/gen7_urb.c
> b/src/mesa/drivers/dri/i965/gen7_urb.c
> > index 2653e9c..190d6f0 100644
> > --- a/src/mesa/drivers/dri/i965/gen7_urb.c
> > +++ b/src/mesa/drivers/dri/i965/gen7_urb.c
> > @@ -121,9 +121,9 @@ gen7_emit_push_constant_state(struct brw_context *brw,
> unsigned vs_size,
> >      *     A PIPE_CONTOL command with the CS Stall bit set must be
> programmed
> >      *     in the ring after this instruction.
> >      *
> > -    * No such restriction exists for Haswell.
> > +    * No such restriction exists for Haswell or Baytrail.
> >      */
> > -   if (brw->gen < 8 && !brw->is_haswell)
> > +   if (brw->gen < 8 && !brw->is_haswell && !brw->is_baytrail)
> >        gen7_emit_cs_stall_flush(brw);
> >  }
> >
> > @@ -263,7 +263,7 @@ gen7_upload_urb(struct brw_context *brw)
> >     brw->urb.vs_start = push_constant_chunks;
> >     brw->urb.gs_start = push_constant_chunks + vs_chunks;
> >
> > -   if (brw->gen == 7 && !brw->is_haswell)
> > +   if (brw->gen == 7 && !brw->is_haswell && !brw->is_baytrail)
> >        gen7_emit_vs_workaround_flush(brw);
> >     gen7_emit_urb_state(brw,
> >                         brw->urb.nr_vs_entries, vs_size, brw->urb.vs_start,
> > diff --git a/src/mesa/drivers/dri/i965/gen7_vs_state.c
> b/src/mesa/drivers/dri/i965/gen7_vs_state.c
> > index 4d99150..01be756 100644
> > --- a/src/mesa/drivers/dri/i965/gen7_vs_state.c
> > +++ b/src/mesa/drivers/dri/i965/gen7_vs_state.c
> > @@ -72,7 +72,7 @@ upload_vs_state(struct brw_context *brw)
> >     const int max_threads_shift = brw->is_haswell ?
> >        HSW_VS_MAX_THREADS_SHIFT : GEN6_VS_MAX_THREADS_SHIFT;
> >
> > -   if (!brw->is_haswell)
> > +   if (!brw->is_haswell && !brw->is_baytrail)
> >        gen7_emit_vs_workaround_flush(brw);
> >
> >     /* Use ALT floating point mode for ARB vertex programs, because they
> >
>
> I dug around in the workarounds list, and it looks like
> WaPostSyncOpBeforeVSState isn't necessary on production Baytrail.  So, I think
> we can indeed drop these flushes.  Good catch, and thanks for the patch!
>
> Please send a V2 of the patch with the "improves performance by 5% in <app>"
> note in the commit message and the extra hunks dropped.  Then, I believe we
> can commit this.

Ok thanks, I'll get another patch ready.

>
> --Ken