[v2] radv: only load 2-dwords for vertex buffers when robustness is disabled

Submitted by Samuel Pitoiset on March 19, 2019, 8:45 a.m.

Details

Message ID 20190319084504.11814-1-samuel.pitoiset@gmail.com
State New
Headers show
Series "radv: only load 2-dwords for vertex buffers when robustness is disabled" ( rev: 2 ) in Mesa

Not browsing as part of any series.

Commit Message

Samuel Pitoiset March 19, 2019, 8:45 a.m.
This patch requires the typed vertex fetches series.

Totals from affected shaders:
SGPRS: 445574 -> 452638 (1.59 %)
VGPRS: 373392 -> 370436 (-0.79 %)
Spilled SGPRs: 77 -> 14 (-81.82 %)
Spilled VGPRs: 0 -> 0 (0.00 %)
Code Size: 14162288 -> 14413036 (1.77 %) bytes
Max Waves: 119999 -> 120509 (0.43 %)

v2: - fix vertex descriptors

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
---
 src/amd/vulkan/radv_cmd_buffer.c  | 32 +++++++++++++++++++------------
 src/amd/vulkan/radv_device.c      |  2 ++
 src/amd/vulkan/radv_nir_to_llvm.c | 21 +++++++++++++++++++-
 src/amd/vulkan/radv_private.h     |  1 +
 src/amd/vulkan/radv_shader.c      |  1 +
 src/amd/vulkan/radv_shader.h      |  1 +
 6 files changed, 45 insertions(+), 13 deletions(-)

Patch hide | download patch | download mbox

diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
index ae8f50d0348..0c8572bd1e5 100644
--- a/src/amd/vulkan/radv_cmd_buffer.c
+++ b/src/amd/vulkan/radv_cmd_buffer.c
@@ -1991,6 +1991,7 @@  radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer,
 	    cmd_buffer->state.pipeline->num_vertex_bindings &&
 	    radv_get_shader(cmd_buffer->state.pipeline, MESA_SHADER_VERTEX)->info.info.vs.has_vertex_buffers) {
 		struct radv_vertex_elements_info *velems = &cmd_buffer->state.pipeline->vertex_elements;
+		unsigned desc_size_bytes = cmd_buffer->device->robust_buffer_access ? 16 : 8;
 		unsigned vb_offset;
 		void *vb_ptr;
 		uint32_t i = 0;
@@ -1998,12 +1999,13 @@  radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer,
 		uint64_t va;
 
 		/* allocate some descriptor state for vertex buffers */
-		if (!radv_cmd_buffer_upload_alloc(cmd_buffer, count * 16, 256,
+		if (!radv_cmd_buffer_upload_alloc(cmd_buffer,
+						  count * desc_size_bytes, 256,
 						  &vb_offset, &vb_ptr))
 			return;
 
 		for (i = 0; i < count; i++) {
-			uint32_t *desc = &((uint32_t *)vb_ptr)[i * 4];
+			uint32_t *desc = &((uint32_t *)vb_ptr)[i * (desc_size_bytes / 4)];
 			uint32_t offset;
 			struct radv_buffer *buffer = cmd_buffer->vertex_bindings[i].buffer;
 			uint32_t stride = cmd_buffer->state.pipeline->binding_stride[i];
@@ -2017,16 +2019,22 @@  radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer,
 			va += offset + buffer->offset;
 			desc[0] = va;
 			desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride);
-			if (cmd_buffer->device->physical_device->rad_info.chip_class <= CIK && stride)
-				desc[2] = (buffer->size - offset - velems->format_size[i]) / stride + 1;
-			else
-				desc[2] = buffer->size - offset;
-			desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
-				  S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
-				  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
-				  S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
-				  S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
-				  S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
+
+			if (cmd_buffer->device->robust_buffer_access) {
+				/* Enable out of bounds checking only when
+				 * robust buffer access is requested.
+				 */
+				if (cmd_buffer->device->physical_device->rad_info.chip_class <= CIK && stride)
+					desc[2] = (buffer->size - offset - velems->format_size[i]) / stride + 1;
+				else
+					desc[2] = buffer->size - offset;
+				desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
+					  S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+					  S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
+					  S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
+					  S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
+					  S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
+			}
 		}
 
 		va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
index dc4346b7498..895e7d67d50 100644
--- a/src/amd/vulkan/radv_device.c
+++ b/src/amd/vulkan/radv_device.c
@@ -1798,6 +1798,8 @@  VkResult radv_CreateDevice(
 	device->has_distributed_tess =
 		device->physical_device->rad_info.chip_class >= VI &&
 		device->physical_device->rad_info.max_se >= 2;
+	device->robust_buffer_access = pCreateInfo->pEnabledFeatures &&
+		pCreateInfo->pEnabledFeatures->robustBufferAccess;
 
 	if (getenv("RADV_TRACE_FILE")) {
 		const char *filename = getenv("RADV_TRACE_FILE");
diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c
index 58a3cf18fe1..f0ed27e0a53 100644
--- a/src/amd/vulkan/radv_nir_to_llvm.c
+++ b/src/amd/vulkan/radv_nir_to_llvm.c
@@ -807,8 +807,11 @@  declare_vs_specific_input_sgprs(struct radv_shader_context *ctx,
 	    (stage == MESA_SHADER_VERTEX ||
 	     (has_previous_stage && previous_stage == MESA_SHADER_VERTEX))) {
 		if (ctx->shader_info->info.vs.has_vertex_buffers) {
+			LLVMTypeRef type =
+				ctx->options->robust_buffer_access ? ctx->ac.v4i32 : ctx->ac.v2i32;
+
 			add_arg(args, ARG_SGPR,
-				ac_array_in_const32_addr_space(ctx->ac.v4i32),
+				ac_array_in_const32_addr_space(type),
 				&ctx->vertex_buffers);
 		}
 		add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->abi.base_vertex);
@@ -2188,6 +2191,22 @@  handle_vs_input_decl(struct radv_shader_context *ctx,
 		t_offset = LLVMConstInt(ctx->ac.i32, attrib_binding, false);
 		t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset);
 
+		if (!ctx->options->robust_buffer_access) {
+			LLVMValueRef desc[4];
+			desc[0] = ac_llvm_extract_elem(&ctx->ac, t_list, 0);
+			desc[1] = ac_llvm_extract_elem(&ctx->ac, t_list, 1);
+			desc[2] = LLVMConstInt(ctx->ac.i32, 0xffffffff, 0);
+			desc[3] = LLVMConstInt(ctx->ac.i32,
+					       S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
+					       S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
+					       S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
+					       S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
+					       S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
+					       S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32), 0);
+
+			t_list = ac_build_gather_values(&ctx->ac, desc, 4);
+		}
+
 		input = ac_build_struct_tbuffer_load(&ctx->ac, t_list,
 						     buffer_index,
 						     LLVMConstInt(ctx->ac.i32, attrib_offset, false),
diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
index 39fa6110fde..f81ba4c602c 100644
--- a/src/amd/vulkan/radv_private.h
+++ b/src/amd/vulkan/radv_private.h
@@ -676,6 +676,7 @@  struct radv_device {
 
 	bool always_use_syncobj;
 	bool has_distributed_tess;
+	bool robust_buffer_access;
 	bool pbb_allowed;
 	bool dfsm_allowed;
 	uint32_t tess_offchip_block_dw_size;
diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
index bd045a0b92f..ae7e315652e 100644
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -606,6 +606,7 @@  shader_variant_create(struct radv_device *device,
 	options->check_ir = device->instance->debug_flags & RADV_DEBUG_CHECKIR;
 	options->tess_offchip_block_dw_size = device->tess_offchip_block_dw_size;
 	options->address32_hi = device->physical_device->rad_info.address32_hi;
+	options->robust_buffer_access = device->robust_buffer_access;
 
 	if (options->supports_spill)
 		tm_options |= AC_TM_SUPPORTS_SPILL;
diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h
index fe2f2868630..9e214457ad8 100644
--- a/src/amd/vulkan/radv_shader.h
+++ b/src/amd/vulkan/radv_shader.h
@@ -127,6 +127,7 @@  struct radv_nir_compiler_options {
 	bool dump_preoptir;
 	bool record_llvm_ir;
 	bool check_ir;
+	bool robust_buffer_access;
 	enum radeon_family family;
 	enum chip_class chip_class;
 	uint32_t tess_offchip_block_dw_size;

Comments

Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>

On Tue, Mar 19, 2019 at 9:42 AM Samuel Pitoiset
<samuel.pitoiset@gmail.com> wrote:
>
> This patch requires the typed vertex fetches series.
>
> Totals from affected shaders:
> SGPRS: 445574 -> 452638 (1.59 %)
> VGPRS: 373392 -> 370436 (-0.79 %)
> Spilled SGPRs: 77 -> 14 (-81.82 %)
> Spilled VGPRs: 0 -> 0 (0.00 %)
> Code Size: 14162288 -> 14413036 (1.77 %) bytes
> Max Waves: 119999 -> 120509 (0.43 %)
>
> v2: - fix vertex descriptors
>
> Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
> ---
>  src/amd/vulkan/radv_cmd_buffer.c  | 32 +++++++++++++++++++------------
>  src/amd/vulkan/radv_device.c      |  2 ++
>  src/amd/vulkan/radv_nir_to_llvm.c | 21 +++++++++++++++++++-
>  src/amd/vulkan/radv_private.h     |  1 +
>  src/amd/vulkan/radv_shader.c      |  1 +
>  src/amd/vulkan/radv_shader.h      |  1 +
>  6 files changed, 45 insertions(+), 13 deletions(-)
>
> diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
> index ae8f50d0348..0c8572bd1e5 100644
> --- a/src/amd/vulkan/radv_cmd_buffer.c
> +++ b/src/amd/vulkan/radv_cmd_buffer.c
> @@ -1991,6 +1991,7 @@ radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer,
>             cmd_buffer->state.pipeline->num_vertex_bindings &&
>             radv_get_shader(cmd_buffer->state.pipeline, MESA_SHADER_VERTEX)->info.info.vs.has_vertex_buffers) {
>                 struct radv_vertex_elements_info *velems = &cmd_buffer->state.pipeline->vertex_elements;
> +               unsigned desc_size_bytes = cmd_buffer->device->robust_buffer_access ? 16 : 8;
>                 unsigned vb_offset;
>                 void *vb_ptr;
>                 uint32_t i = 0;
> @@ -1998,12 +1999,13 @@ radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer,
>                 uint64_t va;
>
>                 /* allocate some descriptor state for vertex buffers */
> -               if (!radv_cmd_buffer_upload_alloc(cmd_buffer, count * 16, 256,
> +               if (!radv_cmd_buffer_upload_alloc(cmd_buffer,
> +                                                 count * desc_size_bytes, 256,
>                                                   &vb_offset, &vb_ptr))
>                         return;
>
>                 for (i = 0; i < count; i++) {
> -                       uint32_t *desc = &((uint32_t *)vb_ptr)[i * 4];
> +                       uint32_t *desc = &((uint32_t *)vb_ptr)[i * (desc_size_bytes / 4)];
>                         uint32_t offset;
>                         struct radv_buffer *buffer = cmd_buffer->vertex_bindings[i].buffer;
>                         uint32_t stride = cmd_buffer->state.pipeline->binding_stride[i];
> @@ -2017,16 +2019,22 @@ radv_flush_vertex_descriptors(struct radv_cmd_buffer *cmd_buffer,
>                         va += offset + buffer->offset;
>                         desc[0] = va;
>                         desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) | S_008F04_STRIDE(stride);
> -                       if (cmd_buffer->device->physical_device->rad_info.chip_class <= CIK && stride)
> -                               desc[2] = (buffer->size - offset - velems->format_size[i]) / stride + 1;
> -                       else
> -                               desc[2] = buffer->size - offset;
> -                       desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
> -                                 S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
> -                                 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
> -                                 S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
> -                                 S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
> -                                 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
> +
> +                       if (cmd_buffer->device->robust_buffer_access) {
> +                               /* Enable out of bounds checking only when
> +                                * robust buffer access is requested.
> +                                */
> +                               if (cmd_buffer->device->physical_device->rad_info.chip_class <= CIK && stride)
> +                                       desc[2] = (buffer->size - offset - velems->format_size[i]) / stride + 1;
> +                               else
> +                                       desc[2] = buffer->size - offset;
> +                               desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
> +                                         S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
> +                                         S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
> +                                         S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
> +                                         S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
> +                                         S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32);
> +                       }
>                 }
>
>                 va = radv_buffer_get_va(cmd_buffer->upload.upload_bo);
> diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c
> index dc4346b7498..895e7d67d50 100644
> --- a/src/amd/vulkan/radv_device.c
> +++ b/src/amd/vulkan/radv_device.c
> @@ -1798,6 +1798,8 @@ VkResult radv_CreateDevice(
>         device->has_distributed_tess =
>                 device->physical_device->rad_info.chip_class >= VI &&
>                 device->physical_device->rad_info.max_se >= 2;
> +       device->robust_buffer_access = pCreateInfo->pEnabledFeatures &&
> +               pCreateInfo->pEnabledFeatures->robustBufferAccess;
>
>         if (getenv("RADV_TRACE_FILE")) {
>                 const char *filename = getenv("RADV_TRACE_FILE");
> diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c
> index 58a3cf18fe1..f0ed27e0a53 100644
> --- a/src/amd/vulkan/radv_nir_to_llvm.c
> +++ b/src/amd/vulkan/radv_nir_to_llvm.c
> @@ -807,8 +807,11 @@ declare_vs_specific_input_sgprs(struct radv_shader_context *ctx,
>             (stage == MESA_SHADER_VERTEX ||
>              (has_previous_stage && previous_stage == MESA_SHADER_VERTEX))) {
>                 if (ctx->shader_info->info.vs.has_vertex_buffers) {
> +                       LLVMTypeRef type =
> +                               ctx->options->robust_buffer_access ? ctx->ac.v4i32 : ctx->ac.v2i32;
> +
>                         add_arg(args, ARG_SGPR,
> -                               ac_array_in_const32_addr_space(ctx->ac.v4i32),
> +                               ac_array_in_const32_addr_space(type),
>                                 &ctx->vertex_buffers);
>                 }
>                 add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->abi.base_vertex);
> @@ -2188,6 +2191,22 @@ handle_vs_input_decl(struct radv_shader_context *ctx,
>                 t_offset = LLVMConstInt(ctx->ac.i32, attrib_binding, false);
>                 t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset);
>
> +               if (!ctx->options->robust_buffer_access) {
> +                       LLVMValueRef desc[4];
> +                       desc[0] = ac_llvm_extract_elem(&ctx->ac, t_list, 0);
> +                       desc[1] = ac_llvm_extract_elem(&ctx->ac, t_list, 1);
> +                       desc[2] = LLVMConstInt(ctx->ac.i32, 0xffffffff, 0);
> +                       desc[3] = LLVMConstInt(ctx->ac.i32,
> +                                              S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) |
> +                                              S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) |
> +                                              S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) |
> +                                              S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W) |
> +                                              S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) |
> +                                              S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32), 0);
> +
> +                       t_list = ac_build_gather_values(&ctx->ac, desc, 4);
> +               }
> +
>                 input = ac_build_struct_tbuffer_load(&ctx->ac, t_list,
>                                                      buffer_index,
>                                                      LLVMConstInt(ctx->ac.i32, attrib_offset, false),
> diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
> index 39fa6110fde..f81ba4c602c 100644
> --- a/src/amd/vulkan/radv_private.h
> +++ b/src/amd/vulkan/radv_private.h
> @@ -676,6 +676,7 @@ struct radv_device {
>
>         bool always_use_syncobj;
>         bool has_distributed_tess;
> +       bool robust_buffer_access;
>         bool pbb_allowed;
>         bool dfsm_allowed;
>         uint32_t tess_offchip_block_dw_size;
> diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
> index bd045a0b92f..ae7e315652e 100644
> --- a/src/amd/vulkan/radv_shader.c
> +++ b/src/amd/vulkan/radv_shader.c
> @@ -606,6 +606,7 @@ shader_variant_create(struct radv_device *device,
>         options->check_ir = device->instance->debug_flags & RADV_DEBUG_CHECKIR;
>         options->tess_offchip_block_dw_size = device->tess_offchip_block_dw_size;
>         options->address32_hi = device->physical_device->rad_info.address32_hi;
> +       options->robust_buffer_access = device->robust_buffer_access;
>
>         if (options->supports_spill)
>                 tm_options |= AC_TM_SUPPORTS_SPILL;
> diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h
> index fe2f2868630..9e214457ad8 100644
> --- a/src/amd/vulkan/radv_shader.h
> +++ b/src/amd/vulkan/radv_shader.h
> @@ -127,6 +127,7 @@ struct radv_nir_compiler_options {
>         bool dump_preoptir;
>         bool record_llvm_ir;
>         bool check_ir;
> +       bool robust_buffer_access;
>         enum radeon_family family;
>         enum chip_class chip_class;
>         uint32_t tess_offchip_block_dw_size;
> --
> 2.21.0
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev