[2/6] radeonsi:optimizing SET_CONTEXT_REG for shaders GS

Submitted by Jiang, Sonny on Sept. 18, 2018, 8:21 p.m.

Details

Message ID 20180918202115.9125-2-sonny.jiang@amd.com
State New
Headers show
Series "Series without cover letter" ( rev: 1 ) in Mesa

Not browsing as part of any series.

Commit Message

Jiang, Sonny Sept. 18, 2018, 8:21 p.m.
Signed-off-by: Sonny Jiang <sonny.jiang@amd.com>
---
 src/gallium/drivers/radeonsi/si_gfx_cs.c      |  15 ++-
 src/gallium/drivers/radeonsi/si_state.h       |  19 ++-
 .../drivers/radeonsi/si_state_shaders.c       | 112 ++++++++++++------
 3 files changed, 109 insertions(+), 37 deletions(-)

Patch hide | download patch | download mbox

diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c
index 99376c4847..39a97bf3da 100644
--- a/src/gallium/drivers/radeonsi/si_gfx_cs.c
+++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c
@@ -352,9 +352,22 @@  void si_begin_new_gfx_cs(struct si_context *ctx)
 		ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_HORZ_DISC_ADJ]	= 0x3f800000;
 		ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_CLIPRECT_RULE]	= 0xffff;
 		ctx->tracked_regs.reg_value[SI_TRACKED_VGT_ESGS_RING_ITEMSIZE]  = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_1]  = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_2]  = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_3]  = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_OUT_PRIM_TYPE]    = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_ITEMSIZE]  = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MAX_VERT_OUT]  = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE]  = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_1]  = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_2]  = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_3]  = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_INSTANCE_CNT]  = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_ONCHIP_CNTL]  = 0x00000000;
+		ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP]  = 0x00000000;
 
 		/* Set all saved registers state to saved. */
-		ctx->tracked_regs.reg_saved = 0xffffffff;
+		ctx->tracked_regs.reg_saved = 0xffffffffffffffff;
 	} else {
 		/* Set all saved registers state to unknown. */
 		ctx->tracked_regs.reg_saved = 0;
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index 7fb09ca953..9efb4be5a5 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -279,11 +279,28 @@  enum si_tracked_reg {
 
 	SI_TRACKED_VGT_ESGS_RING_ITEMSIZE,
 
+	SI_TRACKED_VGT_GSVS_RING_OFFSET_1, /* 4 consecutive registers */
+	SI_TRACKED_VGT_GSVS_RING_OFFSET_2,
+	SI_TRACKED_VGT_GSVS_RING_OFFSET_3,
+	SI_TRACKED_VGT_GS_OUT_PRIM_TYPE,
+
+	SI_TRACKED_VGT_GSVS_RING_ITEMSIZE,
+	SI_TRACKED_VGT_GS_MAX_VERT_OUT,
+
+	SI_TRACKED_VGT_GS_VERT_ITEMSIZE, /* 4 consecutive registers */
+	SI_TRACKED_VGT_GS_VERT_ITEMSIZE_1,
+	SI_TRACKED_VGT_GS_VERT_ITEMSIZE_2,
+	SI_TRACKED_VGT_GS_VERT_ITEMSIZE_3,
+
+	SI_TRACKED_VGT_GS_INSTANCE_CNT,
+	SI_TRACKED_VGT_GS_ONCHIP_CNTL,
+	SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP,
+
 	SI_NUM_TRACKED_REGS,
 };
 
 struct si_tracked_regs {
-	uint32_t		reg_saved;
+	uint64_t		reg_saved;
 	uint32_t		reg_value[SI_NUM_TRACKED_REGS];
 	uint32_t		spi_ps_input_cntl[32];
 };
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 0716021d85..87b58cd6e5 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -747,48 +747,99 @@  static void gfx9_get_gs_info(struct si_shader_selector *es,
 	assert(out->max_prims_per_subgroup <= max_out_prims);
 }
 
-static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
+static void si_emit_shader_gs(struct si_context *sctx)
 {
+	struct si_shader *shader = sctx->queued.named.gs->shader;
+	if (!shader)
+		return;
+
 	struct si_shader_selector *sel = shader->selector;
 	const ubyte *num_components = sel->info.num_stream_output_components;
 	unsigned gs_num_invocations = sel->gs_num_invocations;
-	struct si_pm4_state *pm4;
 	uint64_t va;
 	unsigned max_stream = sel->max_gs_stream;
-	unsigned offset;
+	unsigned offset_1, offset_2, offset_3, vgt_gsvs_ring_itemsize;
 
-	pm4 = si_get_shader_pm4_state(shader);
-	if (!pm4)
-		return;
-
-	offset = num_components[0] * sel->gs_max_out_vertices;
-	si_pm4_set_reg(pm4, R_028A60_VGT_GSVS_RING_OFFSET_1, offset);
+	offset_3 = offset_2 = offset_1 = num_components[0] * sel->gs_max_out_vertices;
 	if (max_stream >= 1)
-		offset += num_components[1] * sel->gs_max_out_vertices;
-	si_pm4_set_reg(pm4, R_028A64_VGT_GSVS_RING_OFFSET_2, offset);
+		offset_3 = offset_2 += num_components[1] * sel->gs_max_out_vertices;
 	if (max_stream >= 2)
-		offset += num_components[2] * sel->gs_max_out_vertices;
-	si_pm4_set_reg(pm4, R_028A68_VGT_GSVS_RING_OFFSET_3, offset);
-	si_pm4_set_reg(pm4, R_028A6C_VGT_GS_OUT_PRIM_TYPE,
-		       si_conv_prim_to_gs_out(sel->gs_output_prim));
+		offset_3 += num_components[2] * sel->gs_max_out_vertices;
+
+
+	/* R_028A60_VGT_GSVS_RING_OFFSET_1, R_028A64_VGT_GSVS_RING_OFFSET_2
+	 * R_028A68_VGT_GSVS_RING_OFFSET_3, R_028A6C_VGT_GS_OUT_PRIM_TYPE */
+	radeon_opt_set_context_reg4(sctx, R_028A60_VGT_GSVS_RING_OFFSET_1,
+				    SI_TRACKED_VGT_GSVS_RING_OFFSET_1,
+				    offset_1, offset_2, offset_3,
+				    si_conv_prim_to_gs_out(sel->gs_output_prim));
+
+	vgt_gsvs_ring_itemsize = offset_3;
 	if (max_stream >= 3)
-		offset += num_components[3] * sel->gs_max_out_vertices;
-	si_pm4_set_reg(pm4, R_028AB0_VGT_GSVS_RING_ITEMSIZE, offset);
+		vgt_gsvs_ring_itemsize += num_components[3] * sel->gs_max_out_vertices;
 
 	/* The GSVS_RING_ITEMSIZE register takes 15 bits */
-	assert(offset < (1 << 15));
+	assert(vgt_gsvs_ring_itemsize < (1 << 15));
+
+	/* R_028AB0_VGT_GSVS_RING_ITEMSIZE */
+	radeon_opt_set_context_reg(sctx, R_028AB0_VGT_GSVS_RING_ITEMSIZE,
+				   SI_TRACKED_VGT_GSVS_RING_ITEMSIZE,
+				   vgt_gsvs_ring_itemsize);
+
+	/* R_028B38_VGT_GS_MAX_VERT_OUT */
+	radeon_opt_set_context_reg(sctx, R_028B38_VGT_GS_MAX_VERT_OUT,
+				   SI_TRACKED_VGT_GS_MAX_VERT_OUT,
+				   sel->gs_max_out_vertices);
+
+	/* R_028B5C_VGT_GS_VERT_ITEMSIZE, R_028B60_VGT_GS_VERT_ITEMSIZE_1
+	 * R_028B64_VGT_GS_VERT_ITEMSIZE_2, R_028B68_VGT_GS_VERT_ITEMSIZE_3 */
+	radeon_opt_set_context_reg4(sctx, R_028B5C_VGT_GS_VERT_ITEMSIZE,
+				    SI_TRACKED_VGT_GS_VERT_ITEMSIZE,
+				     num_components[0],
+				     (max_stream >= 1) ? num_components[1] : 0,
+				     (max_stream >= 2) ? num_components[2] : 0,
+				     (max_stream >= 3) ? num_components[3] : 0);
+
+	/* R_028B90_VGT_GS_INSTANCE_CNT */
+	radeon_opt_set_context_reg(sctx, R_028B90_VGT_GS_INSTANCE_CNT,
+				   SI_TRACKED_VGT_GS_INSTANCE_CNT,
+				   S_028B90_CNT(MIN2(gs_num_invocations, 127)) |
+				   S_028B90_ENABLE(gs_num_invocations > 0));
 
-	si_pm4_set_reg(pm4, R_028B38_VGT_GS_MAX_VERT_OUT, sel->gs_max_out_vertices);
+	va = shader->bo->gpu_address;
 
-	si_pm4_set_reg(pm4, R_028B5C_VGT_GS_VERT_ITEMSIZE, num_components[0]);
-	si_pm4_set_reg(pm4, R_028B60_VGT_GS_VERT_ITEMSIZE_1, (max_stream >= 1) ? num_components[1] : 0);
-	si_pm4_set_reg(pm4, R_028B64_VGT_GS_VERT_ITEMSIZE_2, (max_stream >= 2) ? num_components[2] : 0);
-	si_pm4_set_reg(pm4, R_028B68_VGT_GS_VERT_ITEMSIZE_3, (max_stream >= 3) ? num_components[3] : 0);
+	if (sctx->chip_class >= GFX9) {
+		struct gfx9_gs_info gs_info;
+		gfx9_get_gs_info(shader->key.part.gs.es, sel, &gs_info);
 
-	si_pm4_set_reg(pm4, R_028B90_VGT_GS_INSTANCE_CNT,
-		       S_028B90_CNT(MIN2(gs_num_invocations, 127)) |
-		       S_028B90_ENABLE(gs_num_invocations > 0));
+		/* R_028A44_VGT_GS_ONCHIP_CNTL */
+		radeon_opt_set_context_reg(sctx, R_028A44_VGT_GS_ONCHIP_CNTL,
+				SI_TRACKED_VGT_GS_ONCHIP_CNTL,
+				S_028A44_ES_VERTS_PER_SUBGRP(gs_info.es_verts_per_subgroup) |
+				S_028A44_GS_PRIMS_PER_SUBGRP(gs_info.gs_prims_per_subgroup) |
+				S_028A44_GS_INST_PRIMS_IN_SUBGRP(gs_info.gs_inst_prims_in_subgroup));
+		/* R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP */
+		radeon_opt_set_context_reg(sctx, R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP,
+				SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP,
+				S_028A94_MAX_PRIMS_PER_SUBGROUP(gs_info.max_prims_per_subgroup));
+		/* R_028AAC_VGT_ESGS_RING_ITEMSIZE */
+		radeon_opt_set_context_reg(sctx, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
+				SI_TRACKED_VGT_ESGS_RING_ITEMSIZE,
+				shader->key.part.gs.es->esgs_itemsize / 4);
+	}
+}
+
+static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
+{
+	struct si_shader_selector *sel = shader->selector;
+	struct si_pm4_state *pm4;
+	uint64_t va;
 
+	pm4 = si_get_shader_pm4_state(shader);
+	if (!pm4)
+		return;
+
+	pm4->atom.emit = si_emit_shader_gs;
 	va = shader->bo->gpu_address;
 	si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY);
 
@@ -843,15 +894,6 @@  static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
 			       S_00B22C_LDS_SIZE(gs_info.lds_size) |
 			       S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
 
-		si_pm4_set_reg(pm4, R_028A44_VGT_GS_ONCHIP_CNTL,
-			       S_028A44_ES_VERTS_PER_SUBGRP(gs_info.es_verts_per_subgroup) |
-			       S_028A44_GS_PRIMS_PER_SUBGRP(gs_info.gs_prims_per_subgroup) |
-			       S_028A44_GS_INST_PRIMS_IN_SUBGRP(gs_info.gs_inst_prims_in_subgroup));
-		si_pm4_set_reg(pm4, R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP,
-			       S_028A94_MAX_PRIMS_PER_SUBGROUP(gs_info.max_prims_per_subgroup));
-		si_pm4_set_reg(pm4, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
-			       shader->key.part.gs.es->esgs_itemsize / 4);
-
 		if (es_type == PIPE_SHADER_TESS_EVAL)
 			si_set_tesseval_regs(sscreen, shader->key.part.gs.es, pm4);
 

Comments

On Tue, Sep 18, 2018 at 4:21 PM, Sonny Jiang <sonny.jiang@amd.com> wrote:
> Signed-off-by: Sonny Jiang <sonny.jiang@amd.com>
> ---
>  src/gallium/drivers/radeonsi/si_gfx_cs.c      |  15 ++-
>  src/gallium/drivers/radeonsi/si_state.h       |  19 ++-
>  .../drivers/radeonsi/si_state_shaders.c       | 112 ++++++++++++------
>  3 files changed, 109 insertions(+), 37 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c
> index 99376c4847..39a97bf3da 100644
> --- a/src/gallium/drivers/radeonsi/si_gfx_cs.c
> +++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c
> @@ -352,9 +352,22 @@ void si_begin_new_gfx_cs(struct si_context *ctx)
>                 ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_HORZ_DISC_ADJ]  = 0x3f800000;
>                 ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_CLIPRECT_RULE]     = 0xffff;
>                 ctx->tracked_regs.reg_value[SI_TRACKED_VGT_ESGS_RING_ITEMSIZE]  = 0x00000000;
> +               ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_1]  = 0x00000000;
> +               ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_2]  = 0x00000000;
> +               ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_3]  = 0x00000000;
> +               ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_OUT_PRIM_TYPE]    = 0x00000000;
> +               ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_ITEMSIZE]  = 0x00000000;
> +               ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MAX_VERT_OUT]  = 0x00000000;
> +               ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE]  = 0x00000000;
> +               ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_1]  = 0x00000000;
> +               ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_2]  = 0x00000000;
> +               ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_3]  = 0x00000000;
> +               ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_INSTANCE_CNT]  = 0x00000000;
> +               ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_ONCHIP_CNTL]  = 0x00000000;
> +               ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP]  = 0x00000000;
>
>                 /* Set all saved registers state to saved. */
> -               ctx->tracked_regs.reg_saved = 0xffffffff;
> +               ctx->tracked_regs.reg_saved = 0xffffffffffffffff;
>         } else {
>                 /* Set all saved registers state to unknown. */
>                 ctx->tracked_regs.reg_saved = 0;
> diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
> index 7fb09ca953..9efb4be5a5 100644
> --- a/src/gallium/drivers/radeonsi/si_state.h
> +++ b/src/gallium/drivers/radeonsi/si_state.h
> @@ -279,11 +279,28 @@ enum si_tracked_reg {
>
>         SI_TRACKED_VGT_ESGS_RING_ITEMSIZE,
>
> +       SI_TRACKED_VGT_GSVS_RING_OFFSET_1, /* 4 consecutive registers */
> +       SI_TRACKED_VGT_GSVS_RING_OFFSET_2,
> +       SI_TRACKED_VGT_GSVS_RING_OFFSET_3,
> +       SI_TRACKED_VGT_GS_OUT_PRIM_TYPE,
> +
> +       SI_TRACKED_VGT_GSVS_RING_ITEMSIZE,
> +       SI_TRACKED_VGT_GS_MAX_VERT_OUT,
> +
> +       SI_TRACKED_VGT_GS_VERT_ITEMSIZE, /* 4 consecutive registers */
> +       SI_TRACKED_VGT_GS_VERT_ITEMSIZE_1,
> +       SI_TRACKED_VGT_GS_VERT_ITEMSIZE_2,
> +       SI_TRACKED_VGT_GS_VERT_ITEMSIZE_3,
> +
> +       SI_TRACKED_VGT_GS_INSTANCE_CNT,
> +       SI_TRACKED_VGT_GS_ONCHIP_CNTL,
> +       SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP,
> +
>         SI_NUM_TRACKED_REGS,
>  };
>
>  struct si_tracked_regs {
> -       uint32_t                reg_saved;
> +       uint64_t                reg_saved;

This is good, but the functions in si_build_pm4.h also need to be
updated. They compute the masks with "1 << ...", which results in a
32-bit number. You need to use "1ull << ..." instead.


>         uint32_t                reg_value[SI_NUM_TRACKED_REGS];
>         uint32_t                spi_ps_input_cntl[32];
>  };
> diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
> index 0716021d85..87b58cd6e5 100644
> --- a/src/gallium/drivers/radeonsi/si_state_shaders.c
> +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
> @@ -747,48 +747,99 @@ static void gfx9_get_gs_info(struct si_shader_selector *es,
>         assert(out->max_prims_per_subgroup <= max_out_prims);
>  }
>
> -static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
> +static void si_emit_shader_gs(struct si_context *sctx)
>  {
> +       struct si_shader *shader = sctx->queued.named.gs->shader;
> +       if (!shader)
> +               return;
> +
>         struct si_shader_selector *sel = shader->selector;
>         const ubyte *num_components = sel->info.num_stream_output_components;
>         unsigned gs_num_invocations = sel->gs_num_invocations;
> -       struct si_pm4_state *pm4;
>         uint64_t va;
>         unsigned max_stream = sel->max_gs_stream;
> -       unsigned offset;
> +       unsigned offset_1, offset_2, offset_3, vgt_gsvs_ring_itemsize;
>
> -       pm4 = si_get_shader_pm4_state(shader);
> -       if (!pm4)
> -               return;
> -
> -       offset = num_components[0] * sel->gs_max_out_vertices;
> -       si_pm4_set_reg(pm4, R_028A60_VGT_GSVS_RING_OFFSET_1, offset);
> +       offset_3 = offset_2 = offset_1 = num_components[0] * sel->gs_max_out_vertices;
>         if (max_stream >= 1)
> -               offset += num_components[1] * sel->gs_max_out_vertices;
> -       si_pm4_set_reg(pm4, R_028A64_VGT_GSVS_RING_OFFSET_2, offset);
> +               offset_3 = offset_2 += num_components[1] * sel->gs_max_out_vertices;
>         if (max_stream >= 2)
> -               offset += num_components[2] * sel->gs_max_out_vertices;
> -       si_pm4_set_reg(pm4, R_028A68_VGT_GSVS_RING_OFFSET_3, offset);
> -       si_pm4_set_reg(pm4, R_028A6C_VGT_GS_OUT_PRIM_TYPE,
> -                      si_conv_prim_to_gs_out(sel->gs_output_prim));
> +               offset_3 += num_components[2] * sel->gs_max_out_vertices;
> +
> +
> +       /* R_028A60_VGT_GSVS_RING_OFFSET_1, R_028A64_VGT_GSVS_RING_OFFSET_2
> +        * R_028A68_VGT_GSVS_RING_OFFSET_3, R_028A6C_VGT_GS_OUT_PRIM_TYPE */
> +       radeon_opt_set_context_reg4(sctx, R_028A60_VGT_GSVS_RING_OFFSET_1,
> +                                   SI_TRACKED_VGT_GSVS_RING_OFFSET_1,
> +                                   offset_1, offset_2, offset_3,
> +                                   si_conv_prim_to_gs_out(sel->gs_output_prim));
> +
> +       vgt_gsvs_ring_itemsize = offset_3;
>         if (max_stream >= 3)
> -               offset += num_components[3] * sel->gs_max_out_vertices;
> -       si_pm4_set_reg(pm4, R_028AB0_VGT_GSVS_RING_ITEMSIZE, offset);
> +               vgt_gsvs_ring_itemsize += num_components[3] * sel->gs_max_out_vertices;
>
>         /* The GSVS_RING_ITEMSIZE register takes 15 bits */
> -       assert(offset < (1 << 15));
> +       assert(vgt_gsvs_ring_itemsize < (1 << 15));
> +
> +       /* R_028AB0_VGT_GSVS_RING_ITEMSIZE */
> +       radeon_opt_set_context_reg(sctx, R_028AB0_VGT_GSVS_RING_ITEMSIZE,
> +                                  SI_TRACKED_VGT_GSVS_RING_ITEMSIZE,
> +                                  vgt_gsvs_ring_itemsize);
> +
> +       /* R_028B38_VGT_GS_MAX_VERT_OUT */
> +       radeon_opt_set_context_reg(sctx, R_028B38_VGT_GS_MAX_VERT_OUT,
> +                                  SI_TRACKED_VGT_GS_MAX_VERT_OUT,
> +                                  sel->gs_max_out_vertices);
> +
> +       /* R_028B5C_VGT_GS_VERT_ITEMSIZE, R_028B60_VGT_GS_VERT_ITEMSIZE_1
> +        * R_028B64_VGT_GS_VERT_ITEMSIZE_2, R_028B68_VGT_GS_VERT_ITEMSIZE_3 */
> +       radeon_opt_set_context_reg4(sctx, R_028B5C_VGT_GS_VERT_ITEMSIZE,
> +                                   SI_TRACKED_VGT_GS_VERT_ITEMSIZE,
> +                                    num_components[0],
> +                                    (max_stream >= 1) ? num_components[1] : 0,
> +                                    (max_stream >= 2) ? num_components[2] : 0,
> +                                    (max_stream >= 3) ? num_components[3] : 0);
> +
> +       /* R_028B90_VGT_GS_INSTANCE_CNT */
> +       radeon_opt_set_context_reg(sctx, R_028B90_VGT_GS_INSTANCE_CNT,
> +                                  SI_TRACKED_VGT_GS_INSTANCE_CNT,
> +                                  S_028B90_CNT(MIN2(gs_num_invocations, 127)) |
> +                                  S_028B90_ENABLE(gs_num_invocations > 0));
>
> -       si_pm4_set_reg(pm4, R_028B38_VGT_GS_MAX_VERT_OUT, sel->gs_max_out_vertices);
> +       va = shader->bo->gpu_address;
>
> -       si_pm4_set_reg(pm4, R_028B5C_VGT_GS_VERT_ITEMSIZE, num_components[0]);
> -       si_pm4_set_reg(pm4, R_028B60_VGT_GS_VERT_ITEMSIZE_1, (max_stream >= 1) ? num_components[1] : 0);
> -       si_pm4_set_reg(pm4, R_028B64_VGT_GS_VERT_ITEMSIZE_2, (max_stream >= 2) ? num_components[2] : 0);
> -       si_pm4_set_reg(pm4, R_028B68_VGT_GS_VERT_ITEMSIZE_3, (max_stream >= 3) ? num_components[3] : 0);
> +       if (sctx->chip_class >= GFX9) {
> +               struct gfx9_gs_info gs_info;
> +               gfx9_get_gs_info(shader->key.part.gs.es, sel, &gs_info);

This call is expensive. The register values should ideally be
precomputed in si_shader_gs and saved in struct si_shader, so that
emitting the values here is cheap.

Marek
For the remaining patches (3-6), it would be better to precompute the
register values in struct si_shader, so that the emit functions are as
cheap as possible.

Marek

On Mon, Sep 24, 2018 at 10:51 PM, Marek Olšák <maraeo@gmail.com> wrote:
> On Tue, Sep 18, 2018 at 4:21 PM, Sonny Jiang <sonny.jiang@amd.com> wrote:
>> Signed-off-by: Sonny Jiang <sonny.jiang@amd.com>
>> ---
>>  src/gallium/drivers/radeonsi/si_gfx_cs.c      |  15 ++-
>>  src/gallium/drivers/radeonsi/si_state.h       |  19 ++-
>>  .../drivers/radeonsi/si_state_shaders.c       | 112 ++++++++++++------
>>  3 files changed, 109 insertions(+), 37 deletions(-)
>>
>> diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c
>> index 99376c4847..39a97bf3da 100644
>> --- a/src/gallium/drivers/radeonsi/si_gfx_cs.c
>> +++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c
>> @@ -352,9 +352,22 @@ void si_begin_new_gfx_cs(struct si_context *ctx)
>>                 ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_HORZ_DISC_ADJ]  = 0x3f800000;
>>                 ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_CLIPRECT_RULE]     = 0xffff;
>>                 ctx->tracked_regs.reg_value[SI_TRACKED_VGT_ESGS_RING_ITEMSIZE]  = 0x00000000;
>> +               ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_1]  = 0x00000000;
>> +               ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_2]  = 0x00000000;
>> +               ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_3]  = 0x00000000;
>> +               ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_OUT_PRIM_TYPE]    = 0x00000000;
>> +               ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_ITEMSIZE]  = 0x00000000;
>> +               ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MAX_VERT_OUT]  = 0x00000000;
>> +               ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE]  = 0x00000000;
>> +               ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_1]  = 0x00000000;
>> +               ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_2]  = 0x00000000;
>> +               ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_3]  = 0x00000000;
>> +               ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_INSTANCE_CNT]  = 0x00000000;
>> +               ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_ONCHIP_CNTL]  = 0x00000000;
>> +               ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP]  = 0x00000000;
>>
>>                 /* Set all saved registers state to saved. */
>> -               ctx->tracked_regs.reg_saved = 0xffffffff;
>> +               ctx->tracked_regs.reg_saved = 0xffffffffffffffff;
>>         } else {
>>                 /* Set all saved registers state to unknown. */
>>                 ctx->tracked_regs.reg_saved = 0;
>> diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
>> index 7fb09ca953..9efb4be5a5 100644
>> --- a/src/gallium/drivers/radeonsi/si_state.h
>> +++ b/src/gallium/drivers/radeonsi/si_state.h
>> @@ -279,11 +279,28 @@ enum si_tracked_reg {
>>
>>         SI_TRACKED_VGT_ESGS_RING_ITEMSIZE,
>>
>> +       SI_TRACKED_VGT_GSVS_RING_OFFSET_1, /* 4 consecutive registers */
>> +       SI_TRACKED_VGT_GSVS_RING_OFFSET_2,
>> +       SI_TRACKED_VGT_GSVS_RING_OFFSET_3,
>> +       SI_TRACKED_VGT_GS_OUT_PRIM_TYPE,
>> +
>> +       SI_TRACKED_VGT_GSVS_RING_ITEMSIZE,
>> +       SI_TRACKED_VGT_GS_MAX_VERT_OUT,
>> +
>> +       SI_TRACKED_VGT_GS_VERT_ITEMSIZE, /* 4 consecutive registers */
>> +       SI_TRACKED_VGT_GS_VERT_ITEMSIZE_1,
>> +       SI_TRACKED_VGT_GS_VERT_ITEMSIZE_2,
>> +       SI_TRACKED_VGT_GS_VERT_ITEMSIZE_3,
>> +
>> +       SI_TRACKED_VGT_GS_INSTANCE_CNT,
>> +       SI_TRACKED_VGT_GS_ONCHIP_CNTL,
>> +       SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP,
>> +
>>         SI_NUM_TRACKED_REGS,
>>  };
>>
>>  struct si_tracked_regs {
>> -       uint32_t                reg_saved;
>> +       uint64_t                reg_saved;
>
> This is good, but the functions in si_build_pm4.h also need to be
> updated. They compute the masks with "1 << ...", which results in a
> 32-bit number. You need to use "1ull << ..." instead.
>
>
>>         uint32_t                reg_value[SI_NUM_TRACKED_REGS];
>>         uint32_t                spi_ps_input_cntl[32];
>>  };
>> diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
>> index 0716021d85..87b58cd6e5 100644
>> --- a/src/gallium/drivers/radeonsi/si_state_shaders.c
>> +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
>> @@ -747,48 +747,99 @@ static void gfx9_get_gs_info(struct si_shader_selector *es,
>>         assert(out->max_prims_per_subgroup <= max_out_prims);
>>  }
>>
>> -static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
>> +static void si_emit_shader_gs(struct si_context *sctx)
>>  {
>> +       struct si_shader *shader = sctx->queued.named.gs->shader;
>> +       if (!shader)
>> +               return;
>> +
>>         struct si_shader_selector *sel = shader->selector;
>>         const ubyte *num_components = sel->info.num_stream_output_components;
>>         unsigned gs_num_invocations = sel->gs_num_invocations;
>> -       struct si_pm4_state *pm4;
>>         uint64_t va;
>>         unsigned max_stream = sel->max_gs_stream;
>> -       unsigned offset;
>> +       unsigned offset_1, offset_2, offset_3, vgt_gsvs_ring_itemsize;
>>
>> -       pm4 = si_get_shader_pm4_state(shader);
>> -       if (!pm4)
>> -               return;
>> -
>> -       offset = num_components[0] * sel->gs_max_out_vertices;
>> -       si_pm4_set_reg(pm4, R_028A60_VGT_GSVS_RING_OFFSET_1, offset);
>> +       offset_3 = offset_2 = offset_1 = num_components[0] * sel->gs_max_out_vertices;
>>         if (max_stream >= 1)
>> -               offset += num_components[1] * sel->gs_max_out_vertices;
>> -       si_pm4_set_reg(pm4, R_028A64_VGT_GSVS_RING_OFFSET_2, offset);
>> +               offset_3 = offset_2 += num_components[1] * sel->gs_max_out_vertices;
>>         if (max_stream >= 2)
>> -               offset += num_components[2] * sel->gs_max_out_vertices;
>> -       si_pm4_set_reg(pm4, R_028A68_VGT_GSVS_RING_OFFSET_3, offset);
>> -       si_pm4_set_reg(pm4, R_028A6C_VGT_GS_OUT_PRIM_TYPE,
>> -                      si_conv_prim_to_gs_out(sel->gs_output_prim));
>> +               offset_3 += num_components[2] * sel->gs_max_out_vertices;
>> +
>> +
>> +       /* R_028A60_VGT_GSVS_RING_OFFSET_1, R_028A64_VGT_GSVS_RING_OFFSET_2
>> +        * R_028A68_VGT_GSVS_RING_OFFSET_3, R_028A6C_VGT_GS_OUT_PRIM_TYPE */
>> +       radeon_opt_set_context_reg4(sctx, R_028A60_VGT_GSVS_RING_OFFSET_1,
>> +                                   SI_TRACKED_VGT_GSVS_RING_OFFSET_1,
>> +                                   offset_1, offset_2, offset_3,
>> +                                   si_conv_prim_to_gs_out(sel->gs_output_prim));
>> +
>> +       vgt_gsvs_ring_itemsize = offset_3;
>>         if (max_stream >= 3)
>> -               offset += num_components[3] * sel->gs_max_out_vertices;
>> -       si_pm4_set_reg(pm4, R_028AB0_VGT_GSVS_RING_ITEMSIZE, offset);
>> +               vgt_gsvs_ring_itemsize += num_components[3] * sel->gs_max_out_vertices;
>>
>>         /* The GSVS_RING_ITEMSIZE register takes 15 bits */
>> -       assert(offset < (1 << 15));
>> +       assert(vgt_gsvs_ring_itemsize < (1 << 15));
>> +
>> +       /* R_028AB0_VGT_GSVS_RING_ITEMSIZE */
>> +       radeon_opt_set_context_reg(sctx, R_028AB0_VGT_GSVS_RING_ITEMSIZE,
>> +                                  SI_TRACKED_VGT_GSVS_RING_ITEMSIZE,
>> +                                  vgt_gsvs_ring_itemsize);
>> +
>> +       /* R_028B38_VGT_GS_MAX_VERT_OUT */
>> +       radeon_opt_set_context_reg(sctx, R_028B38_VGT_GS_MAX_VERT_OUT,
>> +                                  SI_TRACKED_VGT_GS_MAX_VERT_OUT,
>> +                                  sel->gs_max_out_vertices);
>> +
>> +       /* R_028B5C_VGT_GS_VERT_ITEMSIZE, R_028B60_VGT_GS_VERT_ITEMSIZE_1
>> +        * R_028B64_VGT_GS_VERT_ITEMSIZE_2, R_028B68_VGT_GS_VERT_ITEMSIZE_3 */
>> +       radeon_opt_set_context_reg4(sctx, R_028B5C_VGT_GS_VERT_ITEMSIZE,
>> +                                   SI_TRACKED_VGT_GS_VERT_ITEMSIZE,
>> +                                    num_components[0],
>> +                                    (max_stream >= 1) ? num_components[1] : 0,
>> +                                    (max_stream >= 2) ? num_components[2] : 0,
>> +                                    (max_stream >= 3) ? num_components[3] : 0);
>> +
>> +       /* R_028B90_VGT_GS_INSTANCE_CNT */
>> +       radeon_opt_set_context_reg(sctx, R_028B90_VGT_GS_INSTANCE_CNT,
>> +                                  SI_TRACKED_VGT_GS_INSTANCE_CNT,
>> +                                  S_028B90_CNT(MIN2(gs_num_invocations, 127)) |
>> +                                  S_028B90_ENABLE(gs_num_invocations > 0));
>>
>> -       si_pm4_set_reg(pm4, R_028B38_VGT_GS_MAX_VERT_OUT, sel->gs_max_out_vertices);
>> +       va = shader->bo->gpu_address;
>>
>> -       si_pm4_set_reg(pm4, R_028B5C_VGT_GS_VERT_ITEMSIZE, num_components[0]);
>> -       si_pm4_set_reg(pm4, R_028B60_VGT_GS_VERT_ITEMSIZE_1, (max_stream >= 1) ? num_components[1] : 0);
>> -       si_pm4_set_reg(pm4, R_028B64_VGT_GS_VERT_ITEMSIZE_2, (max_stream >= 2) ? num_components[2] : 0);
>> -       si_pm4_set_reg(pm4, R_028B68_VGT_GS_VERT_ITEMSIZE_3, (max_stream >= 3) ? num_components[3] : 0);
>> +       if (sctx->chip_class >= GFX9) {
>> +               struct gfx9_gs_info gs_info;
>> +               gfx9_get_gs_info(shader->key.part.gs.es, sel, &gs_info);
>
> This call is expensive. The register values should ideally be
> precomputed in si_shader_gs and saved in struct si_shader, so that
> emitting the values here is cheap.
>
> Marek