[RFC,9/9] ac/nir: do not always preload PS inputs at beginning

Submitted by Samuel Pitoiset on March 8, 2018, 2:08 p.m.

Details

Message ID: 20180308140811.8538-9-samuel.pitoiset@gmail.com
State: New
Series: "Series without cover letter" (rev: 1) in Mesa

Commit Message

Samuel Pitoiset March 8, 2018, 2:08 p.m.
RadeonSI does something similar; the VGPR decrease is a win,
but I am not sure if we really want to implement this.

Polaris10:
Totals from affected shaders:
SGPRS: 116376 -> 116768 (0.34 %)
VGPRS: 76556 -> 74868 (-2.20 %)
Spilled SGPRs: 10347 -> 10466 (1.15 %)
Code Size: 5555072 -> 5569024 (0.25 %) bytes
Max Waves: 9854 -> 9951 (0.98 %)

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
---
 src/amd/common/ac_nir_to_llvm.c | 118 +++++++++++++++++++++++++++++++---------
 src/amd/common/ac_shader_abi.h  |   7 +++
 2 files changed, 98 insertions(+), 27 deletions(-)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index 644c85e2eb..eb0935972d 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -3131,6 +3131,7 @@  static LLVMValueRef visit_load_var(struct ac_nir_context *ctx,
 				   nir_intrinsic_instr *instr)
 {
 	LLVMValueRef values[8];
+	int location = instr->variables[0]->var->data.location;
 	int idx = instr->variables[0]->var->data.driver_location;
 	int ve = instr->dest.ssa.num_components;
 	unsigned comp = instr->variables[0]->var->data.location_frac;
@@ -3167,6 +3168,19 @@  static LLVMValueRef visit_load_var(struct ac_nir_context *ctx,
 						     instr->num_components, vertex_index, const_index, type);
 		}
 
+		LLVMValueRef inputs[4];
+
+		if (ctx->stage == MESA_SHADER_FRAGMENT) {
+			ctx->abi->load_fs_inputs(ctx->abi, location,
+						 indir_index, const_index,
+						 stride, inputs);
+		} else {
+			unsigned index = idx +
+				(indir_index ? 0 : const_index * stride);
+
+			memcpy(inputs, &ctx->abi->inputs[index], sizeof(inputs));
+		}
+
 		for (unsigned chan = comp; chan < ve + comp; chan++) {
 			if (indir_index) {
 				unsigned count = glsl_count_attribute_slots(
@@ -3174,14 +3188,15 @@  static LLVMValueRef visit_load_var(struct ac_nir_context *ctx,
 						ctx->stage == MESA_SHADER_VERTEX);
 				count -= chan / 4;
 				LLVMValueRef tmp_vec = ac_build_gather_values_extended(
-						&ctx->ac, ctx->abi->inputs + idx + chan, count,
+						&ctx->ac, inputs + chan, count,
 						stride, false, true);
 
 				values[chan] = LLVMBuildExtractElement(ctx->ac.builder,
 								       tmp_vec,
 								       indir_index, "");
-			} else
-				values[chan] = ctx->abi->inputs[idx + chan + const_index * stride];
+			} else {
+				values[chan] = inputs[chan];
+			}
 		}
 		break;
 	case nir_var_local:
@@ -5556,45 +5571,93 @@  prepare_interp_optimize(struct radv_shader_context *ctx,
 	}
 }
 
+static unsigned
+get_input_hw_index(struct radv_shader_context *ctx, unsigned idx)
+{
+	struct ac_shader_info *info = &ctx->shader_info->info;
+	uint64_t mask = info->input_mask & ((1ull << idx) - 1);
+
+	mask &= ~(1ull << VARYING_SLOT_POS);
+
+	return util_bitcount64(mask);
+}
+
+/* If this is true, preload FS inputs at the beginning of shaders. Otherwise,
+ * reload them at each use. This must be true if the shader is using
+ * derivatives and KILL, because KILL can leave the WQM and then a lazy
+ * input load isn't in the WQM anymore.
+ */
+static bool
+radv_preload_fs_inputs(struct radv_shader_context *ctx)
+{
+	return ctx->shader_info->info.ps.uses_derivatives &&
+	       ctx->shader_info->info.ps.uses_kill;
+}
+
 static void
-handle_fs_inputs(struct radv_shader_context *ctx,
-                 struct nir_shader *nir)
+radv_load_fs_inputs(struct radv_shader_context *ctx, unsigned idx,
+		    LLVMValueRef out[4])
 {
 	struct ac_shader_info *info = &ctx->shader_info->info;
 
+	if (idx >= VARYING_SLOT_VAR0 ||
+	    idx == VARYING_SLOT_PNTC ||
+	    idx == VARYING_SLOT_PRIMITIVE_ID ||
+	    idx == VARYING_SLOT_LAYER) {
+		unsigned interp_mode = info->ps.input_interp_mode[idx];
+		unsigned interp_loc = info->ps.input_interp_loc[idx];
+		unsigned hw_index = get_input_hw_index(ctx, idx);
+		LLVMValueRef interp_param =
+			lookup_interp_param(&ctx->abi, interp_mode, interp_loc);
+
+		interp_fs_input(ctx, hw_index, interp_param, ctx->abi.prim_mask,
+				&out[0]);
+	} else if (idx == VARYING_SLOT_POS) {
+		for (int i = 0; i < 3; ++i)
+			out[i] = ctx->abi.frag_pos[i];
+
+		out[3] = ac_build_fdiv(&ctx->ac, ctx->ac.f32_1,
+				       ctx->abi.frag_pos[3]);
+	}
+}
+
+static void
+load_fs_inputs(struct ac_shader_abi *abi,
+	       unsigned location,
+	       LLVMValueRef indir_index,
+	       unsigned const_index,
+	       unsigned stride,
+	       LLVMValueRef out[4])
+{
+	struct radv_shader_context *ctx = radv_shader_context_from_abi(abi);
+
+	if (!radv_preload_fs_inputs(ctx)) {
+		radv_load_fs_inputs(ctx, location, out);
+	} else {
+		unsigned index = radeon_llvm_reg_index_soa(location, 0);
+
+		index += (indir_index ? 0 : const_index * stride);
+
+		memcpy(out, &abi->inputs[index], sizeof(out[0]) * 4);
+	}
+}
+
+static void
+handle_fs_inputs(struct radv_shader_context *ctx,
+                 struct nir_shader *nir)
+{
 	prepare_interp_optimize(ctx, nir);
 
 	nir_foreach_variable(variable, &nir->inputs)
 		handle_fs_input_decl(ctx, variable);
 
-	unsigned index = 0;
-
 	for (unsigned i = 0; i < RADEON_LLVM_MAX_INPUTS; ++i) {
-		LLVMValueRef interp_param;
 		LLVMValueRef *inputs = ctx->inputs +radeon_llvm_reg_index_soa(i, 0);
 
 		if (!(ctx->shader_info->info.input_mask & (1ull << i)))
 			continue;
 
-		if (i >= VARYING_SLOT_VAR0 || i == VARYING_SLOT_PNTC ||
-		    i == VARYING_SLOT_PRIMITIVE_ID || i == VARYING_SLOT_LAYER) {
-			unsigned interp_mode = info->ps.input_interp_mode[i];
-			unsigned interp_loc = info->ps.input_interp_loc[i];
-
-			interp_param = lookup_interp_param(&ctx->abi, interp_mode,
-							   interp_loc);
-
-			interp_fs_input(ctx, index, interp_param, ctx->abi.prim_mask,
-					inputs);
-
-			++index;
-		} else if (i == VARYING_SLOT_POS) {
-			for(int i = 0; i < 3; ++i)
-				inputs[i] = ctx->abi.frag_pos[i];
-
-			inputs[3] = ac_build_fdiv(&ctx->ac, ctx->ac.f32_1,
-						  ctx->abi.frag_pos[3]);
-		}
+		radv_load_fs_inputs(ctx, i, inputs);
 	}
 
 	if (ctx->shader_info->info.needs_multiview_view_index)
@@ -6924,6 +6987,7 @@  LLVMModuleRef ac_translate_nir_to_llvm(LLVMTargetMachineRef tm,
 			ctx.abi.load_base_vertex = radv_load_base_vertex;
 		} else if (shaders[i]->info.stage == MESA_SHADER_FRAGMENT) {
 			shader_info->fs.can_discard = shaders[i]->info.fs.uses_discard;
+			ctx.abi.load_fs_inputs = load_fs_inputs;
 			ctx.abi.lookup_interp_param = lookup_interp_param;
 			ctx.abi.load_sample_position = load_sample_position;
 			ctx.abi.load_sample_mask_in = load_sample_mask_in;
diff --git a/src/amd/common/ac_shader_abi.h b/src/amd/common/ac_shader_abi.h
index 901e49b1f9..8e51ce9fdd 100644
--- a/src/amd/common/ac_shader_abi.h
+++ b/src/amd/common/ac_shader_abi.h
@@ -97,6 +97,13 @@  struct ac_shader_abi {
 				    unsigned const_index,
 				    LLVMTypeRef type);
 
+	void (*load_fs_inputs)(struct ac_shader_abi *abi,
+			       unsigned location,
+			       LLVMValueRef indir_index,
+			       unsigned const_index,
+			       unsigned stride,
+			       LLVMValueRef out[4]);
+
 	LLVMValueRef (*load_tess_varyings)(struct ac_shader_abi *abi,
 					   LLVMTypeRef type,
 					   LLVMValueRef vertex_index,

Comments

On Thu, Mar 8, 2018 at 9:08 AM, Samuel Pitoiset <samuel.pitoiset@gmail.com>
wrote:

> [...]
> +               if (ctx->stage == MESA_SHADER_FRAGMENT) {
> +                       ctx->abi->load_fs_inputs(ctx->abi, location,
> +                                                indir_index, const_index,
> +                                                stride, inputs);
>

load_fs_inputs is NULL for radeonsi. Are you sure that radeonsi doesn't get
here?

Marek


On 03/11/2018 04:07 PM, Marek Olšák wrote:
> [...]
> load_fs_inputs is NULL for radeonsi. Are you sure that radeonsi doesn't 
> get here?

Yes, I missed that; RadeonSI does actually get here.
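
One possible follow-up (not part of this patch) would be to make the new
callback optional, so that drivers which do not set it, such as radeonsi,
keep reading the inputs they preloaded. A minimal sketch of the
visit_load_var() hunk under that assumption:

	LLVMValueRef inputs[4];

	if (ctx->stage == MESA_SHADER_FRAGMENT && ctx->abi->load_fs_inputs) {
		/* The driver decides whether to return the preloaded value
		 * or to re-interpolate the input at this use (lazy load). */
		ctx->abi->load_fs_inputs(ctx->abi, location,
					 indir_index, const_index,
					 stride, inputs);
	} else {
		/* Fallback for drivers that leave the callback NULL
		 * (e.g. radeonsi): use the inputs preloaded at the
		 * beginning of the shader, as before this patch. */
		unsigned index = idx +
			(indir_index ? 0 : const_index * stride);

		memcpy(inputs, &ctx->abi->inputs[index], sizeof(inputs));
	}

The other option would be to implement load_fs_inputs in radeonsi as well.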
