[2/3] radv: hardcode shader WAVE_LIMIT to the maximum value

Submitted by Andres Rodriguez on Oct. 20, 2017, 8:34 p.m.

Details

Message ID 20171020203425.20691-3-andresx7@gmail.com
State New
Headers show
Series "radv: enable VK_EXT_global_priority" ( rev: 1 ) in Mesa

Not browsing as part of any series.

Commit Message

Andres Rodriguez Oct. 20, 2017, 8:34 p.m.
When WAVE_LIMIT is set, a submission will opt-in for SPI based resource
scheduling. Because this mechanism is cooperative, we must ensure that
all submissions have this field set, otherwise they will bypass resource
arbitration.

We always hardcode the field to its maximum value, instead of
attempting to calculate an approximate usage. In testing, there were no
benefits to using anything other than the maximum.

Signed-off-by: Andres Rodriguez <andresx7@gmail.com>
---
 src/amd/vulkan/si_cmd_buffer.c          | 27 ++++++++++++++++++---------
 src/gallium/drivers/radeonsi/si_state.c | 21 ++++++++++++++-------
 2 files changed, 32 insertions(+), 16 deletions(-)

Patch hide | download patch | download mbox

diff --git a/src/amd/vulkan/si_cmd_buffer.c b/src/amd/vulkan/si_cmd_buffer.c
index de3b388..ac3dff8 100644
--- a/src/amd/vulkan/si_cmd_buffer.c
+++ b/src/amd/vulkan/si_cmd_buffer.c
@@ -179,7 +179,8 @@  si_emit_compute(struct radv_physical_device *physical_device,
 	radeon_emit(cs, 0);
 	radeon_emit(cs, 0);
 
-	radeon_set_sh_reg_seq(cs, R_00B854_COMPUTE_RESOURCE_LIMITS, 3);
+	radeon_set_sh_reg_seq(cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
+			      S_00B854_WAVES_PER_SH(0x3));
 	radeon_emit(cs, 0);
 	/* R_00B858_COMPUTE_STATIC_THREAD_MGMT_SE0 / SE1 */
 	radeon_emit(cs, S_00B858_SH0_CU_EN(0xffff) | S_00B858_SH1_CU_EN(0xffff));
@@ -432,11 +433,15 @@  si_emit_config(struct radv_physical_device *physical_device,
 
 	if (physical_device->rad_info.chip_class >= CIK) {
 		if (physical_device->rad_info.chip_class >= GFX9) {
-			radeon_set_sh_reg(cs, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, S_00B41C_CU_EN(0xffff));
+			radeon_set_sh_reg(cs, R_00B41C_SPI_SHADER_PGM_RSRC3_HS,
+					  S_00B41C_CU_EN(0xffff) | S_00B41C_WAVE_LIMIT(0x3F));
 		} else {
-			radeon_set_sh_reg(cs, R_00B51C_SPI_SHADER_PGM_RSRC3_LS, S_00B51C_CU_EN(0xffff));
-			radeon_set_sh_reg(cs, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, 0);
-			radeon_set_sh_reg(cs, R_00B31C_SPI_SHADER_PGM_RSRC3_ES, S_00B31C_CU_EN(0xffff));
+			radeon_set_sh_reg(cs, R_00B51C_SPI_SHADER_PGM_RSRC3_LS,
+					  S_00B51C_CU_EN(0xffff) | S_00B51C_WAVE_LIMIT(0x3F));
+			radeon_set_sh_reg(cs, R_00B41C_SPI_SHADER_PGM_RSRC3_HS,
+					  S_00B41C_WAVE_LIMIT(0x3F));
+			radeon_set_sh_reg(cs, R_00B31C_SPI_SHADER_PGM_RSRC3_ES,
+					  S_00B31C_CU_EN(0xffff) | S_00B31C_WAVE_LIMIT(0x3F));
 			/* If this is 0, Bonaire can hang even if GS isn't being used.
 			 * Other chips are unaffected. These are suboptimal values,
 			 * but we don't use on-chip GS.
@@ -445,7 +450,8 @@  si_emit_config(struct radv_physical_device *physical_device,
 					       S_028A44_ES_VERTS_PER_SUBGRP(64) |
 					       S_028A44_GS_PRIMS_PER_SUBGRP(4));
 		}
-		radeon_set_sh_reg(cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, S_00B21C_CU_EN(0xffff));
+		radeon_set_sh_reg(cs, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
+				  S_00B21C_CU_EN(0xffff) | S_00B21C_WAVE_LIMIT(0x3F));
 
 		if (physical_device->rad_info.num_good_compute_units /
 		    (physical_device->rad_info.max_se * physical_device->rad_info.max_sh_per_se) <= 4) {
@@ -455,7 +461,8 @@  si_emit_config(struct radv_physical_device *physical_device,
 			 *
 			 * LATE_ALLOC_VS = 2 is the highest safe number.
 			 */
-			radeon_set_sh_reg(cs, R_00B118_SPI_SHADER_PGM_RSRC3_VS, S_00B118_CU_EN(0xffff));
+			radeon_set_sh_reg(cs, R_00B118_SPI_SHADER_PGM_RSRC3_VS,
+					  S_00B118_CU_EN(0xffff) | S_00B118_WAVE_LIMIT(0x3F) );
 			radeon_set_sh_reg(cs, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(2));
 		} else {
 			/* Set LATE_ALLOC_VS == 31. It should be less than
@@ -463,11 +470,13 @@  si_emit_config(struct radv_physical_device *physical_device,
 			 * - VS can't execute on CU0.
 			 * - If HS writes outputs to LDS, LS can't execute on CU0.
 			 */
-			radeon_set_sh_reg(cs, R_00B118_SPI_SHADER_PGM_RSRC3_VS, S_00B118_CU_EN(0xfffe));
+			radeon_set_sh_reg(cs, R_00B118_SPI_SHADER_PGM_RSRC3_VS,
+					  S_00B118_CU_EN(0xfffe) | S_00B118_WAVE_LIMIT(0x3F));
 			radeon_set_sh_reg(cs, R_00B11C_SPI_SHADER_LATE_ALLOC_VS, S_00B11C_LIMIT(31));
 		}
 
-		radeon_set_sh_reg(cs, R_00B01C_SPI_SHADER_PGM_RSRC3_PS, S_00B01C_CU_EN(0xffff));
+		radeon_set_sh_reg(cs, R_00B01C_SPI_SHADER_PGM_RSRC3_PS,
+				  S_00B01C_CU_EN(0xffff) | S_00B01C_WAVE_LIMIT(0x3F));
 	}
 
 	if (physical_device->rad_info.chip_class >= VI) {
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index ae45e1a..8e3717e 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -4994,11 +4994,15 @@  static void si_init_config(struct si_context *sctx)
 
 	if (sctx->b.chip_class >= CIK) {
 		if (sctx->b.chip_class >= GFX9) {
-			si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, S_00B41C_CU_EN(0xffff));
+			si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS,
+				       S_00B41C_CU_EN(0xffff) | S_00B41C_WAVE_LIMIT(0x3F));
 		} else {
-			si_pm4_set_reg(pm4, R_00B51C_SPI_SHADER_PGM_RSRC3_LS, S_00B51C_CU_EN(0xffff));
-			si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS, 0);
-			si_pm4_set_reg(pm4, R_00B31C_SPI_SHADER_PGM_RSRC3_ES, S_00B31C_CU_EN(0xffff));
+			si_pm4_set_reg(pm4, R_00B51C_SPI_SHADER_PGM_RSRC3_LS,
+				       S_00B51C_CU_EN(0xffff) | S_00B51C_WAVE_LIMIT(0x3F));
+			si_pm4_set_reg(pm4, R_00B41C_SPI_SHADER_PGM_RSRC3_HS,
+				       S_00B41C_WAVE_LIMIT(0x3F));
+			si_pm4_set_reg(pm4, R_00B31C_SPI_SHADER_PGM_RSRC3_ES,
+				       S_00B31C_CU_EN(0xffff) | S_00B31C_WAVE_LIMIT(0x3F));
 
 			/* If this is 0, Bonaire can hang even if GS isn't being used.
 			 * Other chips are unaffected. These are suboptimal values,
@@ -5008,7 +5012,8 @@  static void si_init_config(struct si_context *sctx)
 				       S_028A44_ES_VERTS_PER_SUBGRP(64) |
 				       S_028A44_GS_PRIMS_PER_SUBGRP(4));
 		}
-		si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS, S_00B21C_CU_EN(0xffff));
+		si_pm4_set_reg(pm4, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
+			       S_00B21C_CU_EN(0xffff) | S_00B21C_WAVE_LIMIT(0x3F));
 
 		/* Compute LATE_ALLOC_VS.LIMIT. */
 		unsigned num_cu_per_sh = sscreen->b.info.num_good_compute_units /
@@ -5040,10 +5045,12 @@  static void si_init_config(struct si_context *sctx)
 
 		/* VS can't execute on one CU if the limit is > 2. */
 		si_pm4_set_reg(pm4, R_00B118_SPI_SHADER_PGM_RSRC3_VS,
-			       S_00B118_CU_EN(late_alloc_limit > 2 ? 0xfffe : 0xffff));
+			       S_00B118_CU_EN(late_alloc_limit > 2 ? 0xfffe : 0xffff) |
+			       S_00B118_WAVE_LIMIT(0x3F));
 		si_pm4_set_reg(pm4, R_00B11C_SPI_SHADER_LATE_ALLOC_VS,
 			       S_00B11C_LIMIT(late_alloc_limit));
-		si_pm4_set_reg(pm4, R_00B01C_SPI_SHADER_PGM_RSRC3_PS, S_00B01C_CU_EN(0xffff));
+		si_pm4_set_reg(pm4, R_00B01C_SPI_SHADER_PGM_RSRC3_PS,
+			       S_00B01C_CU_EN(0xffff) | S_00B01C_WAVE_LIMIT(0x3F));
 	}
 
 	if (sctx->b.chip_class >= VI) {

Comments

On Fri, Oct 20, 2017 at 4:34 PM, Andres Rodriguez <andresx7@gmail.com> wrote:
> When WAVE_LIMIT is set, a submission will opt-in for SPI based resource
> scheduling. Because this mechanism is cooperative, we must ensure that
> all submissions have this field set, otherwise they will bypass resource
> arbitration.
>
> We always hardcode the field to its maximum value, instead of
> attempting to calculate an approximate usage. In testing, there were no
> benefits to using anything other than the maximum.
>
> Signed-off-by: Andres Rodriguez <andresx7@gmail.com>
> ---
>  src/amd/vulkan/si_cmd_buffer.c          | 27 ++++++++++++++++++---------
>  src/gallium/drivers/radeonsi/si_state.c | 21 ++++++++++++++-------
>  2 files changed, 32 insertions(+), 16 deletions(-)
>
> diff --git a/src/amd/vulkan/si_cmd_buffer.c b/src/amd/vulkan/si_cmd_buffer.c
> index de3b388..ac3dff8 100644
> --- a/src/amd/vulkan/si_cmd_buffer.c
> +++ b/src/amd/vulkan/si_cmd_buffer.c
> @@ -179,7 +179,8 @@ si_emit_compute(struct radv_physical_device *physical_device,
>         radeon_emit(cs, 0);
>         radeon_emit(cs, 0);
>
> -       radeon_set_sh_reg_seq(cs, R_00B854_COMPUTE_RESOURCE_LIMITS, 3);
> +       radeon_set_sh_reg_seq(cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
> +                             S_00B854_WAVES_PER_SH(0x3));

This part doesn't set WAVES_PER_SH. Also, the number 3 would be wrong for it.

Marek
On 8/1/18 5:12 AM, Marek Olšák wrote:
> On Fri, Oct 20, 2017 at 4:34 PM, Andres Rodriguez <andresx7@gmail.com> wrote:
>> When WAVE_LIMIT is set, a submission will opt-in for SPI based resource
>> scheduling. Because this mechanism is cooperative, we must ensure that
>> all submissions have this field set, otherwise they will bypass resource
>> arbitration.
>>
>> We always hardcode the field to its maximum value, instead of
>> attempting to calculate an approximate usage. In testing, there were no
>> benefits to using anything other than the maximum.
>>
>> Signed-off-by: Andres Rodriguez <andresx7@gmail.com>
>> ---
>>   src/amd/vulkan/si_cmd_buffer.c          | 27 ++++++++++++++++++---------
>>   src/gallium/drivers/radeonsi/si_state.c | 21 ++++++++++++++-------
>>   2 files changed, 32 insertions(+), 16 deletions(-)
>>
>> diff --git a/src/amd/vulkan/si_cmd_buffer.c b/src/amd/vulkan/si_cmd_buffer.c
>> index de3b388..ac3dff8 100644
>> --- a/src/amd/vulkan/si_cmd_buffer.c
>> +++ b/src/amd/vulkan/si_cmd_buffer.c
>> @@ -179,7 +179,8 @@ si_emit_compute(struct radv_physical_device *physical_device,
>>          radeon_emit(cs, 0);
>>          radeon_emit(cs, 0);
>>
>> -       radeon_set_sh_reg_seq(cs, R_00B854_COMPUTE_RESOURCE_LIMITS, 3);
>> +       radeon_set_sh_reg_seq(cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
>> +                             S_00B854_WAVES_PER_SH(0x3));
> 
> This part doesn't set WAVES_PER_SH. Also, the number 3 would be wrong for it.

This is just a default value for that register. COMPUTE_RESOURCE_LIMITS 
is computed like RadeonSI when we generate the PM4 stuff for compute 
pipelines, and then emitted at dispatch time.

Do you think we should remove that default value?

> 
> Marek
> _______________________________________________
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
>
On Tue, Aug 14, 2018, 8:28 AM Samuel Pitoiset <samuel.pitoiset@gmail.com>
wrote:

>
>
> On 8/1/18 5:12 AM, Marek Olšák wrote:
> > On Fri, Oct 20, 2017 at 4:34 PM, Andres Rodriguez <andresx7@gmail.com>
> wrote:
> >> When WAVE_LIMIT is set, a submission will opt-in for SPI based resource
> >> scheduling. Because this mechanism is cooperative, we must ensure that
> >> all submissions have this field set, otherwise they will bypass resource
> >> arbitration.
> >>
> >> We always hardcode the field to its maximum value, instead of
> >> attempting to calculate an approximate usage. In testing, there were no
> >> benefits to using anything other than the maximum.
> >>
> >> Signed-off-by: Andres Rodriguez <andresx7@gmail.com>
> >> ---
> >>   src/amd/vulkan/si_cmd_buffer.c          | 27
> ++++++++++++++++++---------
> >>   src/gallium/drivers/radeonsi/si_state.c | 21 ++++++++++++++-------
> >>   2 files changed, 32 insertions(+), 16 deletions(-)
> >>
> >> diff --git a/src/amd/vulkan/si_cmd_buffer.c
> b/src/amd/vulkan/si_cmd_buffer.c
> >> index de3b388..ac3dff8 100644
> >> --- a/src/amd/vulkan/si_cmd_buffer.c
> >> +++ b/src/amd/vulkan/si_cmd_buffer.c
> >> @@ -179,7 +179,8 @@ si_emit_compute(struct radv_physical_device
> *physical_device,
> >>          radeon_emit(cs, 0);
> >>          radeon_emit(cs, 0);
> >>
> >> -       radeon_set_sh_reg_seq(cs, R_00B854_COMPUTE_RESOURCE_LIMITS, 3);
> >> +       radeon_set_sh_reg_seq(cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
> >> +                             S_00B854_WAVES_PER_SH(0x3));
> >
> > This part doesn't set WAVES_PER_SH. Also, the number 3 would be wrong
> for it.
>
> This is just a default value for that register. COMPUTE_RESOURCE_LIMITS
> is computed like RadeonSI when we generate the PM4 stuff for compute
> pipelines, and then emitted at dispatch time.
>
> Do you think we should remove that default value?
>

Please read the patch again. It's obvious if you know what the seq function
does.

Marek


> >
> > Marek
> > _______________________________________________
> > mesa-dev mailing list
> > mesa-dev@lists.freedesktop.org
> > https://lists.freedesktop.org/mailman/listinfo/mesa-dev
> >
>
On 8/14/18 5:46 PM, Marek Olšák wrote:
> 
> 
> On Tue, Aug 14, 2018, 8:28 AM Samuel Pitoiset <samuel.pitoiset@gmail.com 
> <mailto:samuel.pitoiset@gmail.com>> wrote:
> 
> 
> 
>     On 8/1/18 5:12 AM, Marek Olšák wrote:
>      > On Fri, Oct 20, 2017 at 4:34 PM, Andres Rodriguez
>     <andresx7@gmail.com <mailto:andresx7@gmail.com>> wrote:
>      >> When WAVE_LIMIT is set, a submission will opt-in for SPI based
>     resource
>      >> scheduling. Because this mechanism is cooperative, we must
>     ensure that
>      >> all submissions have this field set, otherwise they will bypass
>     resource
>      >> arbitration.
>      >>
>      >> We always hardcode the field to its maximum value, instead of
>      >> attempting to calculate an approximate usage. In testing, there
>     were no
>      >> benefits to using anything other than the maximum.
>      >>
>      >> Signed-off-by: Andres Rodriguez <andresx7@gmail.com
>     <mailto:andresx7@gmail.com>>
>      >> ---
>      >>   src/amd/vulkan/si_cmd_buffer.c          | 27
>     ++++++++++++++++++---------
>      >>   src/gallium/drivers/radeonsi/si_state.c | 21 ++++++++++++++-------
>      >>   2 files changed, 32 insertions(+), 16 deletions(-)
>      >>
>      >> diff --git a/src/amd/vulkan/si_cmd_buffer.c
>     b/src/amd/vulkan/si_cmd_buffer.c
>      >> index de3b388..ac3dff8 100644
>      >> --- a/src/amd/vulkan/si_cmd_buffer.c
>      >> +++ b/src/amd/vulkan/si_cmd_buffer.c
>      >> @@ -179,7 +179,8 @@ si_emit_compute(struct radv_physical_device
>     *physical_device,
>      >>          radeon_emit(cs, 0);
>      >>          radeon_emit(cs, 0);
>      >>
>      >> -       radeon_set_sh_reg_seq(cs,
>     R_00B854_COMPUTE_RESOURCE_LIMITS, 3);
>      >> +       radeon_set_sh_reg_seq(cs, R_00B854_COMPUTE_RESOURCE_LIMITS,
>      >> +                             S_00B854_WAVES_PER_SH(0x3));
>      >
>      > This part doesn't set WAVES_PER_SH. Also, the number 3 would be
>     wrong for it.
> 
>     This is just a default value for that register. COMPUTE_RESOURCE_LIMITS
>     is computed like RadeonSI when we generate the PM4 stuff for compute
>     pipelines, and then emitted at dispatch time.
> 
>     Do you think we should remove that default value?
> 
> 
> Please read the patch again. It's obvious if you know what the seq 
> function does.

Oh right, this is totally wrong...

> 
> Marek
> 
> 
>      >
>      > Marek
>      > _______________________________________________
>      > mesa-dev mailing list
>      > mesa-dev@lists.freedesktop.org
>     <mailto:mesa-dev@lists.freedesktop.org>
>      > https://lists.freedesktop.org/mailman/listinfo/mesa-dev
>      >
>