i965: Record NOS dependencies for shader programs and only check those.

Submitted by Kenneth Graunke on Aug. 23, 2017, 2:16 a.m.

Details

Message ID 20170823021609.11749-1-kenneth@whitecape.org
State New
Headers show
Series "i965: Record NOS dependencies for shader programs and only check those." ( rev: 1 ) in Mesa

Not browsing as part of any series.

Commit Message

Kenneth Graunke Aug. 23, 2017, 2:16 a.m.
Previously, the state upload code listened to a broad set of dirty flags
that corresponded to all possible state dependencies for a shader stage.
This is somewhat overkill.  For example, if a shader has no textures,
there is no need to listen to _NEW_TEXTURE.  Although these extra
dependencies are harmless for correctness, they cause us to recompute
the program keys and search the program cache more often than necessary.

This patch introduces a new brw_program::nos field, containing the dirty
flags which cover a shader's non-orthogonal state dependencies.  We look
at what the shader actually requires, and record that at link time (or
fixed function program generation time).  Then, the state upload code
simply checks the current dirty flags against those.

This should reduce CPU overhead when drawing with the same shaders
multiple times, but changing state or resources (such as binding new
textures or changing uniforms).

Improves performance in GFXBench4's gl_driver2_off on Apollolake at
1280x720 by 3.06834% +/- 0.722141% (n=25).
---
 src/mesa/drivers/dri/i965/brw_context.h | 11 ++++++
 src/mesa/drivers/dri/i965/brw_cs.c      |  7 +++-
 src/mesa/drivers/dri/i965/brw_gs.c      | 13 ++++---
 src/mesa/drivers/dri/i965/brw_link.cpp  |  2 ++
 src/mesa/drivers/dri/i965/brw_program.c | 32 ++++++++++++++++++
 src/mesa/drivers/dri/i965/brw_program.h | 10 ++++++
 src/mesa/drivers/dri/i965/brw_state.h   |  7 ++++
 src/mesa/drivers/dri/i965/brw_tcs.c     | 16 ++++++---
 src/mesa/drivers/dri/i965/brw_tes.c     | 10 ++++--
 src/mesa/drivers/dri/i965/brw_vs.c      | 37 +++++++++++++-------
 src/mesa/drivers/dri/i965/brw_wm.c      | 60 ++++++++++++++++++++++-----------
 11 files changed, 159 insertions(+), 46 deletions(-)

Applies on top of a bunch of my other patches.  'shader-nos' of my tree.

Patch hide | download patch | download mbox

diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index a227a61f654..ab4463e0870 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -320,6 +320,17 @@  struct brw_state_flags {
 /** Subclass of Mesa program */
 struct brw_program {
    struct gl_program program;
+
+   /**
+    * Non-orthogonal state bits, indicating which dirty bits we need to
+    * listen to in order to fill out this shader's program key.
+    *
+    * This also includes BRW_NEW_*_PROGRAM, so that the state upload code
+    * can listen to the bits stored in this field, without having to OR an
+    * additional bit in to detect program changes.
+    */
+   struct brw_state_flags nos;
+
    GLuint id;
 
    bool compiled_once;
diff --git a/src/mesa/drivers/dri/i965/brw_cs.c b/src/mesa/drivers/dri/i965/brw_cs.c
index cc564a012b6..6a6c1ba5bb0 100644
--- a/src/mesa/drivers/dri/i965/brw_cs.c
+++ b/src/mesa/drivers/dri/i965/brw_cs.c
@@ -171,6 +171,11 @@  brw_codegen_cs_prog(struct brw_context *brw,
    return true;
 }
 
+void
+brw_cs_record_nos(const struct brw_context *brw, struct brw_program *bprog)
+{
+   bprog->nos.brw = BRW_NEW_COMPUTE_PROGRAM;
+}
 
 static void
 brw_cs_populate_key(struct brw_context *brw, struct brw_cs_prog_key *key)
@@ -200,7 +205,7 @@  brw_upload_cs_prog(struct brw_context *brw)
    if (!cp)
       return;
 
-   if (!brw_state_dirty(brw, _NEW_TEXTURE, BRW_NEW_COMPUTE_PROGRAM))
+   if (!brw_shader_state_dirty(brw, cp))
       return;
 
    brw->cs.base.sampler_count =
diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c
index 179ccc4c6fb..5bb3fd72e9b 100644
--- a/src/mesa/drivers/dri/i965/brw_gs.c
+++ b/src/mesa/drivers/dri/i965/brw_gs.c
@@ -171,13 +171,12 @@  brw_codegen_gs_prog(struct brw_context *brw,
    return true;
 }
 
-static bool
-brw_gs_state_dirty(const struct brw_context *brw)
+void
+brw_gs_record_nos(const struct brw_context *brw, struct brw_program *bprog)
 {
-   return brw_state_dirty(brw,
-                          _NEW_TEXTURE,
-                          BRW_NEW_GEOMETRY_PROGRAM |
-                          BRW_NEW_TRANSFORM_FEEDBACK);
+   bprog->nos.brw = BRW_NEW_GEOMETRY_PROGRAM;
+
+   bprog->nos.brw |= BRW_NEW_TRANSFORM_FEEDBACK;
 }
 
 void
@@ -203,7 +202,7 @@  brw_upload_gs_prog(struct brw_context *brw)
    /* BRW_NEW_GEOMETRY_PROGRAM */
    struct brw_program *gp = (struct brw_program *) brw->geometry_program;
 
-   if (!brw_gs_state_dirty(brw))
+   if (!brw_shader_state_dirty(brw, gp))
       return;
 
    brw_gs_populate_key(brw, &key);
diff --git a/src/mesa/drivers/dri/i965/brw_link.cpp b/src/mesa/drivers/dri/i965/brw_link.cpp
index e9158c596c5..ec1f66906a0 100644
--- a/src/mesa/drivers/dri/i965/brw_link.cpp
+++ b/src/mesa/drivers/dri/i965/brw_link.cpp
@@ -249,6 +249,8 @@  brw_link_shader(struct gl_context *ctx, struct gl_shader_program *shProg)
                                  compiler->scalar_stage[stage]);
       infos[stage] = &prog->nir->info;
 
+      brw_record_nos(brw, brw_program(prog));
+
       /* Make a pass over the IR to add state references for any built-in
        * uniforms that are used.  This has to be done now (during linking).
        * Code generation doesn't happen until the first time this shader is
diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c
index 257a99bc946..febdbfcee6f 100644
--- a/src/mesa/drivers/dri/i965/brw_program.c
+++ b/src/mesa/drivers/dri/i965/brw_program.c
@@ -198,6 +198,34 @@  static void brwDeleteProgram( struct gl_context *ctx,
    _mesa_delete_program( ctx, prog );
 }
 
+void
+brw_record_nos(const struct brw_context *brw, struct brw_program *bprog)
+{
+   brw_sampler_record_nos(brw, bprog);
+
+   switch (bprog->program.nir->stage) {
+   case MESA_SHADER_VERTEX:
+      brw_vs_record_nos(brw, bprog);
+      break;
+   case MESA_SHADER_TESS_CTRL:
+      brw_tcs_record_nos(brw, bprog);
+      break;
+   case MESA_SHADER_TESS_EVAL:
+      brw_tes_record_nos(brw, bprog);
+      break;
+   case MESA_SHADER_GEOMETRY:
+      brw_gs_record_nos(brw, bprog);
+      break;
+   case MESA_SHADER_FRAGMENT:
+      brw_fs_record_nos(brw, bprog);
+      break;
+   case MESA_SHADER_COMPUTE:
+      brw_cs_record_nos(brw, bprog);
+      break;
+   default:
+      unreachable("invalid shader stage");
+   }
+}
 
 static GLboolean
 brwProgramStringNotify(struct gl_context *ctx,
@@ -221,6 +249,8 @@  brwProgramStringNotify(struct gl_context *ctx,
 
       prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_FRAGMENT, true);
 
+      brw_record_nos(brw, newFP);
+
       brw_fs_precompile(ctx, prog);
       break;
    }
@@ -243,6 +273,8 @@  brwProgramStringNotify(struct gl_context *ctx,
       prog->nir = brw_create_nir(brw, NULL, prog, MESA_SHADER_VERTEX,
                                  compiler->scalar_stage[MESA_SHADER_VERTEX]);
 
+      brw_record_nos(brw, newVP);
+
       brw_vs_precompile(ctx, prog);
       break;
    }
diff --git a/src/mesa/drivers/dri/i965/brw_program.h b/src/mesa/drivers/dri/i965/brw_program.h
index e62b7d366c8..c439f4e01a7 100644
--- a/src/mesa/drivers/dri/i965/brw_program.h
+++ b/src/mesa/drivers/dri/i965/brw_program.h
@@ -81,6 +81,16 @@  void brw_upload_tes_prog(struct brw_context *brw);
 void brw_tes_populate_key(struct brw_context *brw,
                           struct brw_tes_prog_key *key);
 
+void brw_record_nos(const struct brw_context *brw, struct brw_program *bp);
+void brw_sampler_record_nos(const struct brw_context *brw,
+                            struct brw_program *bp);
+void brw_vs_record_nos(const struct brw_context *brw, struct brw_program *bp);
+void brw_tcs_record_nos(const struct brw_context *brw, struct brw_program *bp);
+void brw_tes_record_nos(const struct brw_context *brw, struct brw_program *bp);
+void brw_gs_record_nos(const struct brw_context *brw, struct brw_program *bp);
+void brw_fs_record_nos(const struct brw_context *brw, struct brw_program *bp);
+void brw_cs_record_nos(const struct brw_context *brw, struct brw_program *bp);
+
 #ifdef __cplusplus
 } /* extern "C" */
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
index dc893e5b7bd..5d598733b19 100644
--- a/src/mesa/drivers/dri/i965/brw_state.h
+++ b/src/mesa/drivers/dri/i965/brw_state.h
@@ -111,6 +111,13 @@  brw_state_dirty(const struct brw_context *brw,
            (brw->ctx.NewDriverState & brw_flags)) != 0;
 }
 
+static inline bool
+brw_shader_state_dirty(const struct brw_context *brw,
+                       const struct brw_program *prog)
+{
+   return brw_state_dirty(brw, prog->nos.mesa, prog->nos.brw);
+}
+
 /* brw_binding_tables.c */
 void brw_upload_binding_table(struct brw_context *brw,
                               uint32_t packet_name,
diff --git a/src/mesa/drivers/dri/i965/brw_tcs.c b/src/mesa/drivers/dri/i965/brw_tcs.c
index 1ed622eebb1..799e935ae4b 100644
--- a/src/mesa/drivers/dri/i965/brw_tcs.c
+++ b/src/mesa/drivers/dri/i965/brw_tcs.c
@@ -290,6 +290,15 @@  brw_codegen_tcs_prog(struct brw_context *brw, struct brw_program *tcp,
    return true;
 }
 
+void
+brw_tcs_record_nos(const struct brw_context *brw, struct brw_program *bprog)
+{
+   bprog->nos.brw = BRW_NEW_TESS_PROGRAMS;
+
+   if (brw->gen < 8)
+      bprog->nos.brw |= BRW_NEW_PATCH_PRIMITIVE;
+}
+
 void
 brw_tcs_populate_key(struct brw_context *brw,
                      struct brw_tcs_prog_key *key)
@@ -341,10 +350,9 @@  brw_upload_tcs_prog(struct brw_context *brw)
       (struct brw_program *) brw->tess_eval_program;
    assert(tep);
 
-   if (!brw_state_dirty(brw,
-                        _NEW_TEXTURE,
-                        BRW_NEW_PATCH_PRIMITIVE |
-                        BRW_NEW_TESS_PROGRAMS))
+   if (!(tcp ? brw_shader_state_dirty(brw, tcp)
+             : brw_state_dirty(brw, 0, BRW_NEW_TESS_PROGRAMS |
+                                       BRW_NEW_PATCH_PRIMITIVE)))
       return;
 
    brw_tcs_populate_key(brw, &key);
diff --git a/src/mesa/drivers/dri/i965/brw_tes.c b/src/mesa/drivers/dri/i965/brw_tes.c
index 20ce1f4c15b..fd22fc4fdcc 100644
--- a/src/mesa/drivers/dri/i965/brw_tes.c
+++ b/src/mesa/drivers/dri/i965/brw_tes.c
@@ -161,6 +161,12 @@  brw_codegen_tes_prog(struct brw_context *brw,
    return true;
 }
 
+void
+brw_tes_record_nos(const struct brw_context *brw, struct brw_program *bprog)
+{
+   bprog->nos.brw = BRW_NEW_TESS_PROGRAMS;
+}
+
 void
 brw_tes_populate_key(struct brw_context *brw,
                      struct brw_tes_prog_key *key)
@@ -202,9 +208,7 @@  brw_upload_tes_prog(struct brw_context *brw)
    /* BRW_NEW_TESS_PROGRAMS */
    struct brw_program *tep = (struct brw_program *) brw->tess_eval_program;
 
-   if (!brw_state_dirty(brw,
-                        _NEW_TEXTURE,
-                        BRW_NEW_TESS_PROGRAMS))
+   if (!brw_shader_state_dirty(brw, tep))
       return;
 
    brw_tes_populate_key(brw, &key);
diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c
index c0a0a13f230..b616637456d 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.c
+++ b/src/mesa/drivers/dri/i965/brw_vs.c
@@ -284,18 +284,31 @@  brw_codegen_vs_prog(struct brw_context *brw,
    return true;
 }
 
-static bool
-brw_vs_state_dirty(const struct brw_context *brw)
+void
+brw_vs_record_nos(const struct brw_context *brw, struct brw_program *bprog)
 {
-   return brw_state_dirty(brw,
-                          _NEW_BUFFERS |
-                          _NEW_LIGHT |
-                          _NEW_POINT |
-                          _NEW_POLYGON |
-                          _NEW_TEXTURE |
-                          _NEW_TRANSFORM,
-                          BRW_NEW_VERTEX_PROGRAM |
-                          BRW_NEW_VS_ATTRIB_WORKAROUNDS);
+   const struct gl_context *ctx = &brw->ctx;
+   const struct shader_info *info = &bprog->program.info;
+
+   bprog->nos.brw = BRW_NEW_VERTEX_PROGRAM;
+
+   if (info->clip_distance_array_size == 0 &&
+       (ctx->API == API_OPENGL_COMPAT || ctx->API == API_OPENGLES)) {
+      bprog->nos.mesa |= _NEW_TRANSFORM;
+   }
+
+   if (brw->gen < 6) {
+      bprog->nos.mesa |= _NEW_POLYGON | _NEW_POINT;
+   }
+
+   if (info->outputs_written & (VARYING_BIT_COL0 | VARYING_BIT_COL1 |
+                                VARYING_BIT_BFC0 | VARYING_BIT_BFC1)) {
+      bprog->nos.mesa |= _NEW_LIGHT | _NEW_BUFFERS;
+   }
+
+   if (brw->gen < 8 && !brw->is_haswell) {
+      bprog->nos.brw |= BRW_NEW_VS_ATTRIB_WORKAROUNDS;
+   }
 }
 
 void
@@ -356,7 +369,7 @@  brw_upload_vs_prog(struct brw_context *brw)
    /* BRW_NEW_VERTEX_PROGRAM */
    struct brw_program *vp = (struct brw_program *)brw->vertex_program;
 
-   if (!brw_vs_state_dirty(brw))
+   if (!brw_shader_state_dirty(brw, vp))
       return;
 
    brw_vs_populate_key(brw, &key);
diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c
index e1555d60c56..680f478401b 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.c
+++ b/src/mesa/drivers/dri/i965/brw_wm.c
@@ -301,6 +301,17 @@  gen6_gather_workaround(GLenum internalformat)
    }
 }
 
+void
+brw_sampler_record_nos(const struct brw_context *brw,
+                       struct brw_program *bprog)
+{
+   struct gl_program *prog = &bprog->program;
+
+   if (prog->SamplersUsed) {
+      bprog->nos.mesa |= _NEW_TEXTURE;
+   }
+}
+
 void
 brw_populate_sampler_prog_key_data(struct gl_context *ctx,
                                    const struct gl_program *prog,
@@ -432,27 +443,38 @@  brw_populate_sampler_prog_key_data(struct gl_context *ctx,
    }
 }
 
-static bool
-brw_wm_state_dirty(const struct brw_context *brw)
+void
+brw_fs_record_nos(const struct brw_context *brw, struct brw_program *bprog)
 {
-   return brw_state_dirty(brw,
-                          _NEW_BUFFERS |
-                          _NEW_COLOR |
-                          _NEW_DEPTH |
-                          _NEW_FRAG_CLAMP |
-                          _NEW_HINT |
-                          _NEW_LIGHT |
-                          _NEW_LINE |
-                          _NEW_MULTISAMPLE |
-                          _NEW_POLYGON |
-                          _NEW_STENCIL |
-                          _NEW_TEXTURE,
-                          BRW_NEW_FRAGMENT_PROGRAM |
-                          BRW_NEW_REDUCED_PRIMITIVE |
-                          BRW_NEW_STATS_WM |
-                          BRW_NEW_VUE_MAP_GEOM_OUT);
+   const struct gl_context *ctx = &brw->ctx;
+   struct gl_program *prog = &bprog->program;
+   const struct shader_info *info = &prog->info;
+
+   bprog->nos.brw = BRW_NEW_FRAGMENT_PROGRAM;
+
+   bprog->nos.mesa |= _NEW_BUFFERS | _NEW_MULTISAMPLE;
+   bprog->nos.brw |= BRW_NEW_VUE_MAP_GEOM_OUT;
+
+   if (brw->gen < 6) {
+      bprog->nos.mesa |= _NEW_COLOR | _NEW_DEPTH | _NEW_STENCIL;
+   }
+
+   bprog->nos.mesa |= _NEW_LINE | _NEW_POLYGON;
+   bprog->nos.brw |= BRW_NEW_REDUCED_PRIMITIVE;
+
+   bprog->nos.mesa |= _NEW_HINT;
+
+   if (info->inputs_read & (VARYING_BIT_COL0 | VARYING_BIT_COL1))
+      bprog->nos.mesa |= _NEW_LIGHT;
+
+   if (ctx->API == API_OPENGL_COMPAT)
+      bprog->nos.mesa |= _NEW_FRAG_CLAMP;
+
+   if (brw->dual_color_blend_by_location)
+      bprog->nos.mesa |= _NEW_COLOR;
 }
 
+
 void
 brw_wm_populate_key(struct brw_context *brw, struct brw_wm_prog_key *key)
 {
@@ -595,7 +617,7 @@  brw_upload_wm_prog(struct brw_context *brw)
    struct brw_wm_prog_key key;
    struct brw_program *fp = (struct brw_program *) brw->fragment_program;
 
-   if (!brw_wm_state_dirty(brw))
+   if (!brw_shader_state_dirty(brw, fp))
       return;
 
    brw_wm_populate_key(brw, &key);

Comments

On 08/22, Kenneth Graunke wrote:
>Previously, the state upload code listened to a broad set of dirty flags
>that corresponded to all possible state dependencies for a shader stage.
>This is somewhat overkill.  For example, if a shader has no textures,
>there is no need to listen to _NEW_TEXTURE.  Although these extra
>dependencies are harmless for correctness, they cause us to recompute
>the program keys and search the program cache more often than necessary.
>
>This patch introduces a new brw_program::nos field, containing the dirty
>flags which cover a shader's non-orthogonal state dependencies.  We look
>at what the shader actually requires, and record that at link time (or
>fixed function program generation time).  Then, the state upload code
>simply checks the current dirty flags against those.
>
>This should reduce CPU overhead when drawing with the same shaders
>multiple times, but changing state or resources (such as binding new
>textures or changing uniforms).
>
>Improves performance in GFXBench4's gl_driver2_off on Apollolake at
>1280x720 by 3.06834% +/- 0.722141% (n=25).

That all makes sense and seems like an obviously good idea.

Reviewed-by: Matt Turner <mattst88@gmail.com>