[Mesa-dev,RFC,2/2] ac/nir: Don't treat each element as a vec4 in compute shared memory

Submitted by Alex Smith on June 27, 2017, 3:53 p.m.

Details

Message ID 20170627155319.23388-2-asmith@feralinteractive.com
State New
Headers show
Series "Series without cover letter" ( rev: 1 ) in Mesa

Not browsing as part of any series.

Commit Message

Alex Smith June 27, 2017, 3:53 p.m.
Currently every element in shared memory (including individual elements
of an array) is treated as a vec4. For example, the following:

    shared uint array[1024];

gets treated as an array of 1024 vec4s with only the first component
ever used.

This can be highly wasteful of LDS space. We have a shader which only
really requires ~24KB of shared memory, but since scalars are being
padded to vec4, it actually ends up needing 96KB. This is more than
the hardware supports, and leads to an LLVM compilation failure.

Therefore, instead pack data in LDS according to the std430 layout.

Signed-off-by: Alex Smith <asmith@feralinteractive.com>
---
RFC since I'm not sure whether this is the best approach to handle this,
and there may be something I've missed.

No regressions seen in Mad Max or Dawn of War 3, which both have a few
shaders which use shared memory. Also no regressions in the CTS
'dEQP-VK.compute.*' tests.

I've also checked over various test cases using arrays/structs/etc. in
shared memory, and as far as I can tell the generated LLVM/GCN code is
correct.
---
 src/amd/common/ac_nir_to_llvm.c | 55 ++++++++++++++++++++++-------------------
 1 file changed, 29 insertions(+), 26 deletions(-)

Patch hide | download patch | download mbox

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index 468ce4d..a48bb3b 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -397,7 +397,7 @@  static LLVMValueRef get_shared_memory_ptr(struct nir_to_llvm_context *ctx,
 	LLVMValueRef ptr;
 	int addr_space;
 
-	offset = LLVMConstInt(ctx->i32, idx * 16, false);
+	offset = LLVMConstInt(ctx->i32, idx, false);
 
 	ptr = ctx->shared_memory;
 	ptr = LLVMBuildGEP(ctx->builder, ptr, &offset, 1, "");
@@ -2404,7 +2404,7 @@  static LLVMValueRef visit_load_ubo_buffer(struct nir_to_llvm_context *ctx,
 
 static void
 radv_get_deref_offset(struct nir_to_llvm_context *ctx, nir_deref_var *deref,
-		      bool vs_in, unsigned *vertex_index_out,
+		      bool vs_in, bool compute_shared, unsigned *vertex_index_out,
 		      LLVMValueRef *vertex_index_ref,
 		      unsigned *const_out, LLVMValueRef *indir_out)
 {
@@ -2445,7 +2445,14 @@  radv_get_deref_offset(struct nir_to_llvm_context *ctx, nir_deref_var *deref,
 		if (tail->deref_type == nir_deref_type_array) {
 			nir_deref_array *deref_array = nir_deref_as_array(tail);
 			LLVMValueRef index, stride, local_offset;
-			unsigned size = glsl_count_attribute_slots(tail->type, vs_in);
+
+			/* For compute LDS, data is packed rather than treating
+			 * each individual element as a vec4. The returned
+			 * index is used to index into an i32 array so divide
+			 * by 4.
+			 */
+			unsigned size = compute_shared ? glsl_get_std430_array_stride(tail->type, false) / 4
+						       : glsl_count_attribute_slots(tail->type, vs_in);
 
 			const_offset += size * deref_array->base_offset;
 			if (deref_array->deref_array_type == nir_deref_array_type_direct)
@@ -2465,7 +2472,11 @@  radv_get_deref_offset(struct nir_to_llvm_context *ctx, nir_deref_var *deref,
 
 			for (unsigned i = 0; i < deref_struct->index; i++) {
 				const struct glsl_type *ft = glsl_get_struct_field(parent_type, i);
-				const_offset += glsl_count_attribute_slots(ft, vs_in);
+
+				/* Same as above. */
+				unsigned size = compute_shared ? glsl_get_std430_array_stride(ft, false) / 4
+							       : glsl_count_attribute_slots(tail->type, vs_in);
+				const_offset += size;
 			}
 		} else
 			unreachable("unsupported deref type");
@@ -2641,7 +2652,7 @@  load_tcs_input(struct nir_to_llvm_context *ctx,
 	const bool is_compact = instr->variables[0]->var->data.compact;
 	param = shader_io_get_unique_index(instr->variables[0]->var->data.location);
 	radv_get_deref_offset(ctx, instr->variables[0],
-			      false, NULL, per_vertex ? &vertex_index : NULL,
+			      false, false, NULL, per_vertex ? &vertex_index : NULL,
 			      &const_index, &indir_index);
 
 	stride = unpack_param(ctx, ctx->tcs_in_layout, 13, 8);
@@ -2673,7 +2684,7 @@  load_tcs_output(struct nir_to_llvm_context *ctx,
 	const bool is_compact = instr->variables[0]->var->data.compact;
 	param = shader_io_get_unique_index(instr->variables[0]->var->data.location);
 	radv_get_deref_offset(ctx, instr->variables[0],
-			      false, NULL, per_vertex ? &vertex_index : NULL,
+			      false, false, NULL, per_vertex ? &vertex_index : NULL,
 			      &const_index, &indir_index);
 
 	if (!instr->variables[0]->var->data.patch) {
@@ -2712,7 +2723,7 @@  store_tcs_output(struct nir_to_llvm_context *ctx,
 	const bool is_compact = instr->variables[0]->var->data.compact;
 
 	radv_get_deref_offset(ctx, instr->variables[0],
-			      false, NULL, per_vertex ? &vertex_index : NULL,
+			      false, false, NULL, per_vertex ? &vertex_index : NULL,
 			      &const_index, &indir_index);
 
 	param = shader_io_get_unique_index(instr->variables[0]->var->data.location);
@@ -2779,7 +2790,7 @@  load_tes_input(struct nir_to_llvm_context *ctx,
 	const bool is_compact = instr->variables[0]->var->data.compact;
 
 	radv_get_deref_offset(ctx, instr->variables[0],
-			      false, NULL, per_vertex ? &vertex_index : NULL,
+			      false, false, NULL, per_vertex ? &vertex_index : NULL,
 			      &const_index, &indir_index);
 	param = shader_io_get_unique_index(instr->variables[0]->var->data.location);
 	if (instr->variables[0]->var->data.location == VARYING_SLOT_CLIP_DIST0 &&
@@ -2808,7 +2819,7 @@  load_gs_input(struct nir_to_llvm_context *ctx,
 	LLVMValueRef value[4], result;
 	unsigned vertex_index;
 	radv_get_deref_offset(ctx, instr->variables[0],
-			      false, &vertex_index, NULL,
+			      false, false, &vertex_index, NULL,
 			      &const_index, &indir_index);
 	vtx_offset_param = vertex_index;
 	assert(vtx_offset_param < 6);
@@ -2849,8 +2860,9 @@  static LLVMValueRef visit_load_var(struct nir_to_llvm_context *ctx,
 	unsigned const_index;
 	bool vs_in = ctx->stage == MESA_SHADER_VERTEX &&
 	             instr->variables[0]->var->data.mode == nir_var_shader_in;
-	radv_get_deref_offset(ctx, instr->variables[0], vs_in, NULL, NULL,
-				      &const_index, &indir_index);
+	bool compute_shared = instr->variables[0]->var->data.mode == nir_var_shared;
+	radv_get_deref_offset(ctx, instr->variables[0], vs_in, compute_shared,
+			      NULL, NULL, &const_index, &indir_index);
 
 	if (instr->dest.ssa.bit_size == 64)
 		ve *= 2;
@@ -2925,9 +2937,6 @@  static LLVMValueRef visit_load_var(struct nir_to_llvm_context *ctx,
 		LLVMValueRef ptr = get_shared_memory_ptr(ctx, idx, ctx->i32);
 		LLVMValueRef derived_ptr;
 
-		if (indir_index)
-			indir_index = LLVMBuildMul(ctx->builder, indir_index, LLVMConstInt(ctx->i32, 4, false), "");
-
 		for (unsigned chan = 0; chan < ve; chan++) {
 			LLVMValueRef index = LLVMConstInt(ctx->i32, chan, false);
 			if (indir_index)
@@ -2955,7 +2964,8 @@  visit_store_var(struct nir_to_llvm_context *ctx,
 	int writemask = instr->const_index[0];
 	LLVMValueRef indir_index;
 	unsigned const_index;
-	radv_get_deref_offset(ctx, instr->variables[0], false,
+	bool compute_shared = instr->variables[0]->var->data.mode == nir_var_shared;
+	radv_get_deref_offset(ctx, instr->variables[0], false, compute_shared,
 	                      NULL, NULL, &const_index, &indir_index);
 
 	if (get_elem_bits(ctx, LLVMTypeOf(src)) == 64) {
@@ -3040,9 +3050,6 @@  visit_store_var(struct nir_to_llvm_context *ctx,
 	case nir_var_shared: {
 		LLVMValueRef ptr = get_shared_memory_ptr(ctx, idx, ctx->i32);
 
-		if (indir_index)
-			indir_index = LLVMBuildMul(ctx->builder, indir_index, LLVMConstInt(ctx->i32, 4, false), "");
-
 		for (unsigned chan = 0; chan < 8; chan++) {
 			if (!(writemask & (1 << chan)))
 				continue;
@@ -5756,9 +5763,9 @@  handle_shader_outputs_post(struct nir_to_llvm_context *ctx)
 
 static void
 handle_shared_compute_var(struct nir_to_llvm_context *ctx,
-			  struct nir_variable *variable, uint32_t *offset, int idx)
+			  struct nir_variable *variable, uint32_t *offset)
 {
-	unsigned size = glsl_count_attribute_slots(variable->type, false);
+	unsigned size = glsl_get_std430_array_stride(variable->type, false);
 	variable->data.driver_location = *offset;
 	*offset += size;
 }
@@ -5926,16 +5933,12 @@  LLVMModuleRef ac_translate_nir_to_llvm(LLVMTargetMachineRef tm,
 		nir_foreach_variable(variable, &nir->shared)
 			num_shared++;
 		if (num_shared) {
-			int idx = 0;
 			uint32_t shared_size = 0;
 			LLVMValueRef var;
 			LLVMTypeRef i8p = LLVMPointerType(ctx.i8, LOCAL_ADDR_SPACE);
-			nir_foreach_variable(variable, &nir->shared) {
-				handle_shared_compute_var(&ctx, variable, &shared_size, idx);
-				idx++;
-			}
+			nir_foreach_variable(variable, &nir->shared)
+				handle_shared_compute_var(&ctx, variable, &shared_size);
 
-			shared_size *= 16;
 			var = LLVMAddGlobalInAddressSpace(ctx.module,
 							  LLVMArrayType(ctx.i8, shared_size),
 							  "compute_lds",

Comments

I think a better approach would be to translate the NIR variables
directly to LLVM variables instead of creating one giant LLVM
variable. I have a series coming up that does a similar thing for
local variables, I can try and convert shared variables too.

On Tue, Jun 27, 2017 at 8:53 AM, Alex Smith <asmith@feralinteractive.com> wrote:
> Currently every element in shared memory (including individual elements
> of an array) is treated as a vec4. For example, the following:
>
>     shared uint array[1024];
>
> gets treated as an array of 1024 vec4s with only the first component
> ever used.
>
> This can be highly wasteful of LDS space. We have a shader which only
> really requires ~24KB of shared memory, but since scalars are being
> padded to vec4, it actually ends up needing 96KB. This is more than
> the hardware supports, and leads to an LLVM compilation failure.
>
> Therefore, instead pack data in LDS according to the std430 layout.
>
> Signed-off-by: Alex Smith <asmith@feralinteractive.com>
> ---
> RFC since I'm not sure whether this is the best approach to handle this,
> and there may be something I've missed.
>
> No regressions seen in Mad Max or Dawn of War 3, which both have a few
> shaders which use shared memory. Also no regressions in the CTS
> 'dEQP-VK.compute.*' tests.
>
> I've also checked over various test cases using arrays/structs/etc. in
> shared memory, and as far as I can tell the generated LLVM/GCN code is
> correct.
> ---
>  src/amd/common/ac_nir_to_llvm.c | 55 ++++++++++++++++++++++-------------------
>  1 file changed, 29 insertions(+), 26 deletions(-)
>
> diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
> index 468ce4d..a48bb3b 100644
> --- a/src/amd/common/ac_nir_to_llvm.c
> +++ b/src/amd/common/ac_nir_to_llvm.c
> @@ -397,7 +397,7 @@ static LLVMValueRef get_shared_memory_ptr(struct nir_to_llvm_context *ctx,
>         LLVMValueRef ptr;
>         int addr_space;
>
> -       offset = LLVMConstInt(ctx->i32, idx * 16, false);
> +       offset = LLVMConstInt(ctx->i32, idx, false);
>
>         ptr = ctx->shared_memory;
>         ptr = LLVMBuildGEP(ctx->builder, ptr, &offset, 1, "");
> @@ -2404,7 +2404,7 @@ static LLVMValueRef visit_load_ubo_buffer(struct nir_to_llvm_context *ctx,
>
>  static void
>  radv_get_deref_offset(struct nir_to_llvm_context *ctx, nir_deref_var *deref,
> -                     bool vs_in, unsigned *vertex_index_out,
> +                     bool vs_in, bool compute_shared, unsigned *vertex_index_out,
>                       LLVMValueRef *vertex_index_ref,
>                       unsigned *const_out, LLVMValueRef *indir_out)
>  {
> @@ -2445,7 +2445,14 @@ radv_get_deref_offset(struct nir_to_llvm_context *ctx, nir_deref_var *deref,
>                 if (tail->deref_type == nir_deref_type_array) {
>                         nir_deref_array *deref_array = nir_deref_as_array(tail);
>                         LLVMValueRef index, stride, local_offset;
> -                       unsigned size = glsl_count_attribute_slots(tail->type, vs_in);
> +
> +                       /* For compute LDS, data is packed rather than treating
> +                        * each individual element as a vec4. The returned
> +                        * index is used to index into an i32 array so divide
> +                        * by 4.
> +                        */
> +                       unsigned size = compute_shared ? glsl_get_std430_array_stride(tail->type, false) / 4
> +                                                      : glsl_count_attribute_slots(tail->type, vs_in);
>
>                         const_offset += size * deref_array->base_offset;
>                         if (deref_array->deref_array_type == nir_deref_array_type_direct)
> @@ -2465,7 +2472,11 @@ radv_get_deref_offset(struct nir_to_llvm_context *ctx, nir_deref_var *deref,
>
>                         for (unsigned i = 0; i < deref_struct->index; i++) {
>                                 const struct glsl_type *ft = glsl_get_struct_field(parent_type, i);
> -                               const_offset += glsl_count_attribute_slots(ft, vs_in);
> +
> +                               /* Same as above. */
> +                               unsigned size = compute_shared ? glsl_get_std430_array_stride(ft, false) / 4
> +                                                              : glsl_count_attribute_slots(tail->type, vs_in);
> +                               const_offset += size;
>                         }
>                 } else
>                         unreachable("unsupported deref type");
> @@ -2641,7 +2652,7 @@ load_tcs_input(struct nir_to_llvm_context *ctx,
>         const bool is_compact = instr->variables[0]->var->data.compact;
>         param = shader_io_get_unique_index(instr->variables[0]->var->data.location);
>         radv_get_deref_offset(ctx, instr->variables[0],
> -                             false, NULL, per_vertex ? &vertex_index : NULL,
> +                             false, false, NULL, per_vertex ? &vertex_index : NULL,
>                               &const_index, &indir_index);
>
>         stride = unpack_param(ctx, ctx->tcs_in_layout, 13, 8);
> @@ -2673,7 +2684,7 @@ load_tcs_output(struct nir_to_llvm_context *ctx,
>         const bool is_compact = instr->variables[0]->var->data.compact;
>         param = shader_io_get_unique_index(instr->variables[0]->var->data.location);
>         radv_get_deref_offset(ctx, instr->variables[0],
> -                             false, NULL, per_vertex ? &vertex_index : NULL,
> +                             false, false, NULL, per_vertex ? &vertex_index : NULL,
>                               &const_index, &indir_index);
>
>         if (!instr->variables[0]->var->data.patch) {
> @@ -2712,7 +2723,7 @@ store_tcs_output(struct nir_to_llvm_context *ctx,
>         const bool is_compact = instr->variables[0]->var->data.compact;
>
>         radv_get_deref_offset(ctx, instr->variables[0],
> -                             false, NULL, per_vertex ? &vertex_index : NULL,
> +                             false, false, NULL, per_vertex ? &vertex_index : NULL,
>                               &const_index, &indir_index);
>
>         param = shader_io_get_unique_index(instr->variables[0]->var->data.location);
> @@ -2779,7 +2790,7 @@ load_tes_input(struct nir_to_llvm_context *ctx,
>         const bool is_compact = instr->variables[0]->var->data.compact;
>
>         radv_get_deref_offset(ctx, instr->variables[0],
> -                             false, NULL, per_vertex ? &vertex_index : NULL,
> +                             false, false, NULL, per_vertex ? &vertex_index : NULL,
>                               &const_index, &indir_index);
>         param = shader_io_get_unique_index(instr->variables[0]->var->data.location);
>         if (instr->variables[0]->var->data.location == VARYING_SLOT_CLIP_DIST0 &&
> @@ -2808,7 +2819,7 @@ load_gs_input(struct nir_to_llvm_context *ctx,
>         LLVMValueRef value[4], result;
>         unsigned vertex_index;
>         radv_get_deref_offset(ctx, instr->variables[0],
> -                             false, &vertex_index, NULL,
> +                             false, false, &vertex_index, NULL,
>                               &const_index, &indir_index);
>         vtx_offset_param = vertex_index;
>         assert(vtx_offset_param < 6);
> @@ -2849,8 +2860,9 @@ static LLVMValueRef visit_load_var(struct nir_to_llvm_context *ctx,
>         unsigned const_index;
>         bool vs_in = ctx->stage == MESA_SHADER_VERTEX &&
>                      instr->variables[0]->var->data.mode == nir_var_shader_in;
> -       radv_get_deref_offset(ctx, instr->variables[0], vs_in, NULL, NULL,
> -                                     &const_index, &indir_index);
> +       bool compute_shared = instr->variables[0]->var->data.mode == nir_var_shared;
> +       radv_get_deref_offset(ctx, instr->variables[0], vs_in, compute_shared,
> +                             NULL, NULL, &const_index, &indir_index);
>
>         if (instr->dest.ssa.bit_size == 64)
>                 ve *= 2;
> @@ -2925,9 +2937,6 @@ static LLVMValueRef visit_load_var(struct nir_to_llvm_context *ctx,
>                 LLVMValueRef ptr = get_shared_memory_ptr(ctx, idx, ctx->i32);
>                 LLVMValueRef derived_ptr;
>
> -               if (indir_index)
> -                       indir_index = LLVMBuildMul(ctx->builder, indir_index, LLVMConstInt(ctx->i32, 4, false), "");
> -
>                 for (unsigned chan = 0; chan < ve; chan++) {
>                         LLVMValueRef index = LLVMConstInt(ctx->i32, chan, false);
>                         if (indir_index)
> @@ -2955,7 +2964,8 @@ visit_store_var(struct nir_to_llvm_context *ctx,
>         int writemask = instr->const_index[0];
>         LLVMValueRef indir_index;
>         unsigned const_index;
> -       radv_get_deref_offset(ctx, instr->variables[0], false,
> +       bool compute_shared = instr->variables[0]->var->data.mode == nir_var_shared;
> +       radv_get_deref_offset(ctx, instr->variables[0], false, compute_shared,
>                               NULL, NULL, &const_index, &indir_index);
>
>         if (get_elem_bits(ctx, LLVMTypeOf(src)) == 64) {
> @@ -3040,9 +3050,6 @@ visit_store_var(struct nir_to_llvm_context *ctx,
>         case nir_var_shared: {
>                 LLVMValueRef ptr = get_shared_memory_ptr(ctx, idx, ctx->i32);
>
> -               if (indir_index)
> -                       indir_index = LLVMBuildMul(ctx->builder, indir_index, LLVMConstInt(ctx->i32, 4, false), "");
> -
>                 for (unsigned chan = 0; chan < 8; chan++) {
>                         if (!(writemask & (1 << chan)))
>                                 continue;
> @@ -5756,9 +5763,9 @@ handle_shader_outputs_post(struct nir_to_llvm_context *ctx)
>
>  static void
>  handle_shared_compute_var(struct nir_to_llvm_context *ctx,
> -                         struct nir_variable *variable, uint32_t *offset, int idx)
> +                         struct nir_variable *variable, uint32_t *offset)
>  {
> -       unsigned size = glsl_count_attribute_slots(variable->type, false);
> +       unsigned size = glsl_get_std430_array_stride(variable->type, false);
>         variable->data.driver_location = *offset;
>         *offset += size;
>  }
> @@ -5926,16 +5933,12 @@ LLVMModuleRef ac_translate_nir_to_llvm(LLVMTargetMachineRef tm,
>                 nir_foreach_variable(variable, &nir->shared)
>                         num_shared++;
>                 if (num_shared) {
> -                       int idx = 0;
>                         uint32_t shared_size = 0;
>                         LLVMValueRef var;
>                         LLVMTypeRef i8p = LLVMPointerType(ctx.i8, LOCAL_ADDR_SPACE);
> -                       nir_foreach_variable(variable, &nir->shared) {
> -                               handle_shared_compute_var(&ctx, variable, &shared_size, idx);
> -                               idx++;
> -                       }
> +                       nir_foreach_variable(variable, &nir->shared)
> +                               handle_shared_compute_var(&ctx, variable, &shared_size);
>
> -                       shared_size *= 16;
>                         var = LLVMAddGlobalInAddressSpace(ctx.module,
>                                                           LLVMArrayType(ctx.i8, shared_size),
>                                                           "compute_lds",
> --
> 2.9.4
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
I'm going to do a full CTS run later this week before I send the
patches to the list, but can you confirm that this series fixes your
problems: https://cgit.freedesktop.org/~cwabbott0/mesa/log/?h=radv-rewrite-vars
(it passes all the CTS dEQP.compute.* tests, but that's not saying
much...)

On Mon, Jul 3, 2017 at 6:33 PM, Connor Abbott <cwabbott0@gmail.com> wrote:
> I think a better approach would be to translate the NIR variables
> directly to LLVM variables instead of creating one giant LLVM
> variable. I have a series coming up that does a similar thing for
> local variables, I can try and convert shared variables too.
>
> On Tue, Jun 27, 2017 at 8:53 AM, Alex Smith <asmith@feralinteractive.com> wrote:
>> Currently every element in shared memory (including individual elements
>> of an array) is treated as a vec4. For example, the following:
>>
>>     shared uint array[1024];
>>
>> gets treated as an array of 1024 vec4s with only the first component
>> ever used.
>>
>> This can be highly wasteful of LDS space. We have a shader which only
>> really requires ~24KB of shared memory, but since scalars are being
>> padded to vec4, it actually ends up needing 96KB. This is more than
>> the hardware supports, and leads to an LLVM compilation failure.
>>
>> Therefore, instead pack data in LDS according to the std430 layout.
>>
>> Signed-off-by: Alex Smith <asmith@feralinteractive.com>
>> ---
>> RFC since I'm not sure whether this is the best approach to handle this,
>> and there may be something I've missed.
>>
>> No regressions seen in Mad Max or Dawn of War 3, which both have a few
>> shaders which use shared memory. Also no regressions in the CTS
>> 'dEQP-VK.compute.*' tests.
>>
>> I've also checked over various test cases using arrays/structs/etc. in
>> shared memory, and as far as I can tell the generated LLVM/GCN code is
>> correct.
>> ---
>>  src/amd/common/ac_nir_to_llvm.c | 55 ++++++++++++++++++++++-------------------
>>  1 file changed, 29 insertions(+), 26 deletions(-)
>>
>> diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
>> index 468ce4d..a48bb3b 100644
>> --- a/src/amd/common/ac_nir_to_llvm.c
>> +++ b/src/amd/common/ac_nir_to_llvm.c
>> @@ -397,7 +397,7 @@ static LLVMValueRef get_shared_memory_ptr(struct nir_to_llvm_context *ctx,
>>         LLVMValueRef ptr;
>>         int addr_space;
>>
>> -       offset = LLVMConstInt(ctx->i32, idx * 16, false);
>> +       offset = LLVMConstInt(ctx->i32, idx, false);
>>
>>         ptr = ctx->shared_memory;
>>         ptr = LLVMBuildGEP(ctx->builder, ptr, &offset, 1, "");
>> @@ -2404,7 +2404,7 @@ static LLVMValueRef visit_load_ubo_buffer(struct nir_to_llvm_context *ctx,
>>
>>  static void
>>  radv_get_deref_offset(struct nir_to_llvm_context *ctx, nir_deref_var *deref,
>> -                     bool vs_in, unsigned *vertex_index_out,
>> +                     bool vs_in, bool compute_shared, unsigned *vertex_index_out,
>>                       LLVMValueRef *vertex_index_ref,
>>                       unsigned *const_out, LLVMValueRef *indir_out)
>>  {
>> @@ -2445,7 +2445,14 @@ radv_get_deref_offset(struct nir_to_llvm_context *ctx, nir_deref_var *deref,
>>                 if (tail->deref_type == nir_deref_type_array) {
>>                         nir_deref_array *deref_array = nir_deref_as_array(tail);
>>                         LLVMValueRef index, stride, local_offset;
>> -                       unsigned size = glsl_count_attribute_slots(tail->type, vs_in);
>> +
>> +                       /* For compute LDS, data is packed rather than treating
>> +                        * each individual element as a vec4. The returned
>> +                        * index is used to index into an i32 array so divide
>> +                        * by 4.
>> +                        */
>> +                       unsigned size = compute_shared ? glsl_get_std430_array_stride(tail->type, false) / 4
>> +                                                      : glsl_count_attribute_slots(tail->type, vs_in);
>>
>>                         const_offset += size * deref_array->base_offset;
>>                         if (deref_array->deref_array_type == nir_deref_array_type_direct)
>> @@ -2465,7 +2472,11 @@ radv_get_deref_offset(struct nir_to_llvm_context *ctx, nir_deref_var *deref,
>>
>>                         for (unsigned i = 0; i < deref_struct->index; i++) {
>>                                 const struct glsl_type *ft = glsl_get_struct_field(parent_type, i);
>> -                               const_offset += glsl_count_attribute_slots(ft, vs_in);
>> +
>> +                               /* Same as above. */
>> +                               unsigned size = compute_shared ? glsl_get_std430_array_stride(ft, false) / 4
>> +                                                              : glsl_count_attribute_slots(tail->type, vs_in);
>> +                               const_offset += size;
>>                         }
>>                 } else
>>                         unreachable("unsupported deref type");
>> @@ -2641,7 +2652,7 @@ load_tcs_input(struct nir_to_llvm_context *ctx,
>>         const bool is_compact = instr->variables[0]->var->data.compact;
>>         param = shader_io_get_unique_index(instr->variables[0]->var->data.location);
>>         radv_get_deref_offset(ctx, instr->variables[0],
>> -                             false, NULL, per_vertex ? &vertex_index : NULL,
>> +                             false, false, NULL, per_vertex ? &vertex_index : NULL,
>>                               &const_index, &indir_index);
>>
>>         stride = unpack_param(ctx, ctx->tcs_in_layout, 13, 8);
>> @@ -2673,7 +2684,7 @@ load_tcs_output(struct nir_to_llvm_context *ctx,
>>         const bool is_compact = instr->variables[0]->var->data.compact;
>>         param = shader_io_get_unique_index(instr->variables[0]->var->data.location);
>>         radv_get_deref_offset(ctx, instr->variables[0],
>> -                             false, NULL, per_vertex ? &vertex_index : NULL,
>> +                             false, false, NULL, per_vertex ? &vertex_index : NULL,
>>                               &const_index, &indir_index);
>>
>>         if (!instr->variables[0]->var->data.patch) {
>> @@ -2712,7 +2723,7 @@ store_tcs_output(struct nir_to_llvm_context *ctx,
>>         const bool is_compact = instr->variables[0]->var->data.compact;
>>
>>         radv_get_deref_offset(ctx, instr->variables[0],
>> -                             false, NULL, per_vertex ? &vertex_index : NULL,
>> +                             false, false, NULL, per_vertex ? &vertex_index : NULL,
>>                               &const_index, &indir_index);
>>
>>         param = shader_io_get_unique_index(instr->variables[0]->var->data.location);
>> @@ -2779,7 +2790,7 @@ load_tes_input(struct nir_to_llvm_context *ctx,
>>         const bool is_compact = instr->variables[0]->var->data.compact;
>>
>>         radv_get_deref_offset(ctx, instr->variables[0],
>> -                             false, NULL, per_vertex ? &vertex_index : NULL,
>> +                             false, false, NULL, per_vertex ? &vertex_index : NULL,
>>                               &const_index, &indir_index);
>>         param = shader_io_get_unique_index(instr->variables[0]->var->data.location);
>>         if (instr->variables[0]->var->data.location == VARYING_SLOT_CLIP_DIST0 &&
>> @@ -2808,7 +2819,7 @@ load_gs_input(struct nir_to_llvm_context *ctx,
>>         LLVMValueRef value[4], result;
>>         unsigned vertex_index;
>>         radv_get_deref_offset(ctx, instr->variables[0],
>> -                             false, &vertex_index, NULL,
>> +                             false, false, &vertex_index, NULL,
>>                               &const_index, &indir_index);
>>         vtx_offset_param = vertex_index;
>>         assert(vtx_offset_param < 6);
>> @@ -2849,8 +2860,9 @@ static LLVMValueRef visit_load_var(struct nir_to_llvm_context *ctx,
>>         unsigned const_index;
>>         bool vs_in = ctx->stage == MESA_SHADER_VERTEX &&
>>                      instr->variables[0]->var->data.mode == nir_var_shader_in;
>> -       radv_get_deref_offset(ctx, instr->variables[0], vs_in, NULL, NULL,
>> -                                     &const_index, &indir_index);
>> +       bool compute_shared = instr->variables[0]->var->data.mode == nir_var_shared;
>> +       radv_get_deref_offset(ctx, instr->variables[0], vs_in, compute_shared,
>> +                             NULL, NULL, &const_index, &indir_index);
>>
>>         if (instr->dest.ssa.bit_size == 64)
>>                 ve *= 2;
>> @@ -2925,9 +2937,6 @@ static LLVMValueRef visit_load_var(struct nir_to_llvm_context *ctx,
>>                 LLVMValueRef ptr = get_shared_memory_ptr(ctx, idx, ctx->i32);
>>                 LLVMValueRef derived_ptr;
>>
>> -               if (indir_index)
>> -                       indir_index = LLVMBuildMul(ctx->builder, indir_index, LLVMConstInt(ctx->i32, 4, false), "");
>> -
>>                 for (unsigned chan = 0; chan < ve; chan++) {
>>                         LLVMValueRef index = LLVMConstInt(ctx->i32, chan, false);
>>                         if (indir_index)
>> @@ -2955,7 +2964,8 @@ visit_store_var(struct nir_to_llvm_context *ctx,
>>         int writemask = instr->const_index[0];
>>         LLVMValueRef indir_index;
>>         unsigned const_index;
>> -       radv_get_deref_offset(ctx, instr->variables[0], false,
>> +       bool compute_shared = instr->variables[0]->var->data.mode == nir_var_shared;
>> +       radv_get_deref_offset(ctx, instr->variables[0], false, compute_shared,
>>                               NULL, NULL, &const_index, &indir_index);
>>
>>         if (get_elem_bits(ctx, LLVMTypeOf(src)) == 64) {
>> @@ -3040,9 +3050,6 @@ visit_store_var(struct nir_to_llvm_context *ctx,
>>         case nir_var_shared: {
>>                 LLVMValueRef ptr = get_shared_memory_ptr(ctx, idx, ctx->i32);
>>
>> -               if (indir_index)
>> -                       indir_index = LLVMBuildMul(ctx->builder, indir_index, LLVMConstInt(ctx->i32, 4, false), "");
>> -
>>                 for (unsigned chan = 0; chan < 8; chan++) {
>>                         if (!(writemask & (1 << chan)))
>>                                 continue;
>> @@ -5756,9 +5763,9 @@ handle_shader_outputs_post(struct nir_to_llvm_context *ctx)
>>
>>  static void
>>  handle_shared_compute_var(struct nir_to_llvm_context *ctx,
>> -                         struct nir_variable *variable, uint32_t *offset, int idx)
>> +                         struct nir_variable *variable, uint32_t *offset)
>>  {
>> -       unsigned size = glsl_count_attribute_slots(variable->type, false);
>> +       unsigned size = glsl_get_std430_array_stride(variable->type, false);
>>         variable->data.driver_location = *offset;
>>         *offset += size;
>>  }
>> @@ -5926,16 +5933,12 @@ LLVMModuleRef ac_translate_nir_to_llvm(LLVMTargetMachineRef tm,
>>                 nir_foreach_variable(variable, &nir->shared)
>>                         num_shared++;
>>                 if (num_shared) {
>> -                       int idx = 0;
>>                         uint32_t shared_size = 0;
>>                         LLVMValueRef var;
>>                         LLVMTypeRef i8p = LLVMPointerType(ctx.i8, LOCAL_ADDR_SPACE);
>> -                       nir_foreach_variable(variable, &nir->shared) {
>> -                               handle_shared_compute_var(&ctx, variable, &shared_size, idx);
>> -                               idx++;
>> -                       }
>> +                       nir_foreach_variable(variable, &nir->shared)
>> +                               handle_shared_compute_var(&ctx, variable, &shared_size);
>>
>> -                       shared_size *= 16;
>>                         var = LLVMAddGlobalInAddressSpace(ctx.module,
>>                                                           LLVMArrayType(ctx.i8, shared_size),
>>                                                           "compute_lds",
>> --
>> 2.9.4
>>
>> _______________________________________________
>> mesa-dev mailing list
>> mesa-dev@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Hi Connor,

Can confirm this does work here, thanks!

Makes this patch redundant too, since that problem's also fixed by
your changes: https://lists.freedesktop.org/archives/mesa-dev/2017-June/161559.html

Alex

On 4 July 2017 at 04:32, Connor Abbott <cwabbott0@gmail.com> wrote:
> I'm going to do a full CTS run later this week before I send the
> patches to the list, but can you confirm that this series fixes your
> problems: https://cgit.freedesktop.org/~cwabbott0/mesa/log/?h=radv-rewrite-vars
> (it passes all the CTS dEQP.compute.* tests, but that's not saying
> much...)
>
> On Mon, Jul 3, 2017 at 6:33 PM, Connor Abbott <cwabbott0@gmail.com> wrote:
>> I think a better approach would be to translate the NIR variables
>> directly to LLVM variables instead of creating one giant LLVM
>> variable. I have a series coming up that does a similar thing for
>> local variables, I can try and convert shared variables too.
>>
>> On Tue, Jun 27, 2017 at 8:53 AM, Alex Smith <asmith@feralinteractive.com> wrote:
>>> Currently every element in shared memory (including individual elements
>>> of an array) is treated as a vec4. For example, the following:
>>>
>>>     shared uint array[1024];
>>>
>>> gets treated as an array of 1024 vec4s with only the first component
>>> ever used.
>>>
>>> This can be highly wasteful of LDS space. We have a shader which only
>>> really requires ~24KB of shared memory, but since scalars are being
>>> padded to vec4, it actually ends up needing 96KB. This is more than
>>> the hardware supports, and leads to an LLVM compilation failure.
>>>
>>> Therefore, instead pack data in LDS according to the std430 layout.
>>>
>>> Signed-off-by: Alex Smith <asmith@feralinteractive.com>
>>> ---
>>> RFC since I'm not sure whether this is the best approach to handle this,
>>> and there may be something I've missed.
>>>
>>> No regressions seen in Mad Max or Dawn of War 3, which both have a few
>>> shaders which use shared memory. Also no regressions in the CTS
>>> 'dEQP-VK.compute.*' tests.
>>>
>>> I've also checked over various test cases using arrays/structs/etc. in
>>> shared memory, and as far as I can tell the generated LLVM/GCN code is
>>> correct.
>>> ---
>>>  src/amd/common/ac_nir_to_llvm.c | 55 ++++++++++++++++++++++-------------------
>>>  1 file changed, 29 insertions(+), 26 deletions(-)
>>>
>>> diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
>>> index 468ce4d..a48bb3b 100644
>>> --- a/src/amd/common/ac_nir_to_llvm.c
>>> +++ b/src/amd/common/ac_nir_to_llvm.c
>>> @@ -397,7 +397,7 @@ static LLVMValueRef get_shared_memory_ptr(struct nir_to_llvm_context *ctx,
>>>         LLVMValueRef ptr;
>>>         int addr_space;
>>>
>>> -       offset = LLVMConstInt(ctx->i32, idx * 16, false);
>>> +       offset = LLVMConstInt(ctx->i32, idx, false);
>>>
>>>         ptr = ctx->shared_memory;
>>>         ptr = LLVMBuildGEP(ctx->builder, ptr, &offset, 1, "");
>>> @@ -2404,7 +2404,7 @@ static LLVMValueRef visit_load_ubo_buffer(struct nir_to_llvm_context *ctx,
>>>
>>>  static void
>>>  radv_get_deref_offset(struct nir_to_llvm_context *ctx, nir_deref_var *deref,
>>> -                     bool vs_in, unsigned *vertex_index_out,
>>> +                     bool vs_in, bool compute_shared, unsigned *vertex_index_out,
>>>                       LLVMValueRef *vertex_index_ref,
>>>                       unsigned *const_out, LLVMValueRef *indir_out)
>>>  {
>>> @@ -2445,7 +2445,14 @@ radv_get_deref_offset(struct nir_to_llvm_context *ctx, nir_deref_var *deref,
>>>                 if (tail->deref_type == nir_deref_type_array) {
>>>                         nir_deref_array *deref_array = nir_deref_as_array(tail);
>>>                         LLVMValueRef index, stride, local_offset;
>>> -                       unsigned size = glsl_count_attribute_slots(tail->type, vs_in);
>>> +
>>> +                       /* For compute LDS, data is packed rather than treating
>>> +                        * each individual element as a vec4. The returned
>>> +                        * index is used to index into an i32 array so divide
>>> +                        * by 4.
>>> +                        */
>>> +                       unsigned size = compute_shared ? glsl_get_std430_array_stride(tail->type, false) / 4
>>> +                                                      : glsl_count_attribute_slots(tail->type, vs_in);
>>>
>>>                         const_offset += size * deref_array->base_offset;
>>>                         if (deref_array->deref_array_type == nir_deref_array_type_direct)
>>> @@ -2465,7 +2472,11 @@ radv_get_deref_offset(struct nir_to_llvm_context *ctx, nir_deref_var *deref,
>>>
>>>                         for (unsigned i = 0; i < deref_struct->index; i++) {
>>>                                 const struct glsl_type *ft = glsl_get_struct_field(parent_type, i);
>>> -                               const_offset += glsl_count_attribute_slots(ft, vs_in);
>>> +
>>> +                               /* Same as above. */
>>> +                               unsigned size = compute_shared ? glsl_get_std430_array_stride(ft, false) / 4
>>> +                                                              : glsl_count_attribute_slots(tail->type, vs_in);
>>> +                               const_offset += size;
>>>                         }
>>>                 } else
>>>                         unreachable("unsupported deref type");
>>> @@ -2641,7 +2652,7 @@ load_tcs_input(struct nir_to_llvm_context *ctx,
>>>         const bool is_compact = instr->variables[0]->var->data.compact;
>>>         param = shader_io_get_unique_index(instr->variables[0]->var->data.location);
>>>         radv_get_deref_offset(ctx, instr->variables[0],
>>> -                             false, NULL, per_vertex ? &vertex_index : NULL,
>>> +                             false, false, NULL, per_vertex ? &vertex_index : NULL,
>>>                               &const_index, &indir_index);
>>>
>>>         stride = unpack_param(ctx, ctx->tcs_in_layout, 13, 8);
>>> @@ -2673,7 +2684,7 @@ load_tcs_output(struct nir_to_llvm_context *ctx,
>>>         const bool is_compact = instr->variables[0]->var->data.compact;
>>>         param = shader_io_get_unique_index(instr->variables[0]->var->data.location);
>>>         radv_get_deref_offset(ctx, instr->variables[0],
>>> -                             false, NULL, per_vertex ? &vertex_index : NULL,
>>> +                             false, false, NULL, per_vertex ? &vertex_index : NULL,
>>>                               &const_index, &indir_index);
>>>
>>>         if (!instr->variables[0]->var->data.patch) {
>>> @@ -2712,7 +2723,7 @@ store_tcs_output(struct nir_to_llvm_context *ctx,
>>>         const bool is_compact = instr->variables[0]->var->data.compact;
>>>
>>>         radv_get_deref_offset(ctx, instr->variables[0],
>>> -                             false, NULL, per_vertex ? &vertex_index : NULL,
>>> +                             false, false, NULL, per_vertex ? &vertex_index : NULL,
>>>                               &const_index, &indir_index);
>>>
>>>         param = shader_io_get_unique_index(instr->variables[0]->var->data.location);
>>> @@ -2779,7 +2790,7 @@ load_tes_input(struct nir_to_llvm_context *ctx,
>>>         const bool is_compact = instr->variables[0]->var->data.compact;
>>>
>>>         radv_get_deref_offset(ctx, instr->variables[0],
>>> -                             false, NULL, per_vertex ? &vertex_index : NULL,
>>> +                             false, false, NULL, per_vertex ? &vertex_index : NULL,
>>>                               &const_index, &indir_index);
>>>         param = shader_io_get_unique_index(instr->variables[0]->var->data.location);
>>>         if (instr->variables[0]->var->data.location == VARYING_SLOT_CLIP_DIST0 &&
>>> @@ -2808,7 +2819,7 @@ load_gs_input(struct nir_to_llvm_context *ctx,
>>>         LLVMValueRef value[4], result;
>>>         unsigned vertex_index;
>>>         radv_get_deref_offset(ctx, instr->variables[0],
>>> -                             false, &vertex_index, NULL,
>>> +                             false, false, &vertex_index, NULL,
>>>                               &const_index, &indir_index);
>>>         vtx_offset_param = vertex_index;
>>>         assert(vtx_offset_param < 6);
>>> @@ -2849,8 +2860,9 @@ static LLVMValueRef visit_load_var(struct nir_to_llvm_context *ctx,
>>>         unsigned const_index;
>>>         bool vs_in = ctx->stage == MESA_SHADER_VERTEX &&
>>>                      instr->variables[0]->var->data.mode == nir_var_shader_in;
>>> -       radv_get_deref_offset(ctx, instr->variables[0], vs_in, NULL, NULL,
>>> -                                     &const_index, &indir_index);
>>> +       bool compute_shared = instr->variables[0]->var->data.mode == nir_var_shared;
>>> +       radv_get_deref_offset(ctx, instr->variables[0], vs_in, compute_shared,
>>> +                             NULL, NULL, &const_index, &indir_index);
>>>
>>>         if (instr->dest.ssa.bit_size == 64)
>>>                 ve *= 2;
>>> @@ -2925,9 +2937,6 @@ static LLVMValueRef visit_load_var(struct nir_to_llvm_context *ctx,
>>>                 LLVMValueRef ptr = get_shared_memory_ptr(ctx, idx, ctx->i32);
>>>                 LLVMValueRef derived_ptr;
>>>
>>> -               if (indir_index)
>>> -                       indir_index = LLVMBuildMul(ctx->builder, indir_index, LLVMConstInt(ctx->i32, 4, false), "");
>>> -
>>>                 for (unsigned chan = 0; chan < ve; chan++) {
>>>                         LLVMValueRef index = LLVMConstInt(ctx->i32, chan, false);
>>>                         if (indir_index)
>>> @@ -2955,7 +2964,8 @@ visit_store_var(struct nir_to_llvm_context *ctx,
>>>         int writemask = instr->const_index[0];
>>>         LLVMValueRef indir_index;
>>>         unsigned const_index;
>>> -       radv_get_deref_offset(ctx, instr->variables[0], false,
>>> +       bool compute_shared = instr->variables[0]->var->data.mode == nir_var_shared;
>>> +       radv_get_deref_offset(ctx, instr->variables[0], false, compute_shared,
>>>                               NULL, NULL, &const_index, &indir_index);
>>>
>>>         if (get_elem_bits(ctx, LLVMTypeOf(src)) == 64) {
>>> @@ -3040,9 +3050,6 @@ visit_store_var(struct nir_to_llvm_context *ctx,
>>>         case nir_var_shared: {
>>>                 LLVMValueRef ptr = get_shared_memory_ptr(ctx, idx, ctx->i32);
>>>
>>> -               if (indir_index)
>>> -                       indir_index = LLVMBuildMul(ctx->builder, indir_index, LLVMConstInt(ctx->i32, 4, false), "");
>>> -
>>>                 for (unsigned chan = 0; chan < 8; chan++) {
>>>                         if (!(writemask & (1 << chan)))
>>>                                 continue;
>>> @@ -5756,9 +5763,9 @@ handle_shader_outputs_post(struct nir_to_llvm_context *ctx)
>>>
>>>  static void
>>>  handle_shared_compute_var(struct nir_to_llvm_context *ctx,
>>> -                         struct nir_variable *variable, uint32_t *offset, int idx)
>>> +                         struct nir_variable *variable, uint32_t *offset)
>>>  {
>>> -       unsigned size = glsl_count_attribute_slots(variable->type, false);
>>> +       unsigned size = glsl_get_std430_array_stride(variable->type, false);
>>>         variable->data.driver_location = *offset;
>>>         *offset += size;
>>>  }
>>> @@ -5926,16 +5933,12 @@ LLVMModuleRef ac_translate_nir_to_llvm(LLVMTargetMachineRef tm,
>>>                 nir_foreach_variable(variable, &nir->shared)
>>>                         num_shared++;
>>>                 if (num_shared) {
>>> -                       int idx = 0;
>>>                         uint32_t shared_size = 0;
>>>                         LLVMValueRef var;
>>>                         LLVMTypeRef i8p = LLVMPointerType(ctx.i8, LOCAL_ADDR_SPACE);
>>> -                       nir_foreach_variable(variable, &nir->shared) {
>>> -                               handle_shared_compute_var(&ctx, variable, &shared_size, idx);
>>> -                               idx++;
>>> -                       }
>>> +                       nir_foreach_variable(variable, &nir->shared)
>>> +                               handle_shared_compute_var(&ctx, variable, &shared_size);
>>>
>>> -                       shared_size *= 16;
>>>                         var = LLVMAddGlobalInAddressSpace(ctx.module,
>>>                                                           LLVMArrayType(ctx.i8, shared_size),
>>>                                                           "compute_lds",
>>> --
>>> 2.9.4
>>>
>>> _______________________________________________
>>> mesa-dev mailing list
>>> mesa-dev@lists.freedesktop.org
>>> https://lists.freedesktop.org/mailman/listinfo/mesa-dev