[3/3] intel/compiler: implement more algebraic optimizations

Submitted by Iago Toral Quiroga on Feb. 27, 2019, 12:45 p.m.

Details

Message ID 20190227124559.7436-4-itoral@igalia.com
State New
Headers show
Series "intel: propagate constants to first source of 2-src instructions" ( rev: 1 ) in Mesa

Not browsing as part of any series.

Commit Message

Iago Toral Quiroga Feb. 27, 2019, 12:45 p.m.
Now that we propagate constants to the first source of 2src instructions we
see more opportunities of constant folding in the backend.

Shader-db results on KBL:

total instructions in shared programs: 14965607 -> 14855983 (-0.73%)
instructions in affected programs: 3988102 -> 3878478 (-2.75%)
helped: 14292
HURT: 59

total cycles in shared programs: 344324295 -> 340656008 (-1.07%)
cycles in affected programs: 247527740 -> 243859453 (-1.48%)
helped: 14056
HURT: 3314

total loops in shared programs: 4283 -> 4283 (0.00%)
loops in affected programs: 0 -> 0
helped: 0
HURT: 0

total spills in shared programs: 27812 -> 24350 (-12.45%)
spills in affected programs: 24921 -> 21459 (-13.89%)
helped: 345
HURT: 19

total fills in shared programs: 24173 -> 22032 (-8.86%)
fills in affected programs: 21124 -> 18983 (-10.14%)
helped: 355
HURT: 25

LOST:   0
GAINED: 5
---
 src/intel/compiler/brw_fs.cpp | 203 ++++++++++++++++++++++++++++++++--
 1 file changed, 195 insertions(+), 8 deletions(-)

Patch hide | download patch | download mbox

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 2358acbeb59..b2b60237c82 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -2583,9 +2583,55 @@  fs_visitor::opt_algebraic()
          break;
 
       case BRW_OPCODE_MUL:
-         if (inst->src[1].file != IMM)
+         if (inst->src[0].file != IMM && inst->src[1].file != IMM)
             continue;
 
+         /* Constant folding */
+         if (inst->src[0].file == IMM && inst->src[1].file == IMM) {
+            assert(inst->src[0].type == inst->src[1].type);
+            bool local_progress = true;
+            switch (inst->src[0].type) {
+            case BRW_REGISTER_TYPE_HF: {
+               float v1 = _mesa_half_to_float(inst->src[0].ud & 0xffffu);
+               float v2 = _mesa_half_to_float(inst->src[1].ud & 0xffffu);
+               inst->src[0] = brw_imm_w(_mesa_float_to_half(v1 * v2));
+               break;
+            }
+            case BRW_REGISTER_TYPE_W: {
+               int16_t v1 = inst->src[0].ud & 0xffffu;
+               int16_t v2 = inst->src[1].ud & 0xffffu;
+               inst->src[0] = brw_imm_w(v1 * v2);
+               break;
+            }
+            case BRW_REGISTER_TYPE_UW: {
+               uint16_t v1 = inst->src[0].ud & 0xffffu;
+               uint16_t v2 = inst->src[1].ud & 0xffffu;
+               inst->src[0] = brw_imm_uw(v1 * v2);
+               break;
+            }
+            case BRW_REGISTER_TYPE_F:
+               inst->src[0].f *= inst->src[1].f;
+               break;
+            case BRW_REGISTER_TYPE_D:
+               inst->src[0].d *= inst->src[1].d;
+               break;
+            case BRW_REGISTER_TYPE_UD:
+               inst->src[0].ud *= inst->src[1].ud;
+               break;
+            default:
+               local_progress = false;
+               break;
+            };
+
+            if (local_progress) {
+               inst->opcode = BRW_OPCODE_MOV;
+               inst->src[1] = reg_undef;
+               progress = true;
+               break;
+            }
+         }
+
+
          /* a * 1.0 = a */
          if (inst->src[1].is_one()) {
             inst->opcode = BRW_OPCODE_MOV;
@@ -2594,6 +2640,14 @@  fs_visitor::opt_algebraic()
             break;
          }
 
+         if (inst->src[0].is_one()) {
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->src[0] = inst->src[1];
+            inst->src[1] = reg_undef;
+            progress = true;
+            break;
+         }
+
          /* a * -1.0 = -a */
          if (inst->src[1].is_negative_one()) {
             inst->opcode = BRW_OPCODE_MOV;
@@ -2603,27 +2657,160 @@  fs_visitor::opt_algebraic()
             break;
          }
 
-         if (inst->src[0].file == IMM) {
-            assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
+         if (inst->src[0].is_negative_one()) {
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->src[0] = inst->src[1];
+            inst->src[0].negate = !inst->src[1].negate;
+            inst->src[1] = reg_undef;
+            progress = true;
+            break;
+         }
+
+         /* a * 0 = 0 (this is not exact for floating point) */
+         if (inst->src[1].is_zero() &&
+             brw_reg_type_is_integer(inst->src[1].type)) {
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->src[0] = inst->src[1];
+            inst->src[1] = reg_undef;
+            progress = true;
+            break;
+         }
+
+         if (inst->src[0].is_zero() &&
+             brw_reg_type_is_integer(inst->src[0].type)) {
             inst->opcode = BRW_OPCODE_MOV;
-            inst->src[0].f *= inst->src[1].f;
             inst->src[1] = reg_undef;
             progress = true;
             break;
          }
          break;
       case BRW_OPCODE_ADD:
-         if (inst->src[1].file != IMM)
+         if (inst->src[0].file != IMM && inst->src[1].file != IMM)
             continue;
 
-         if (inst->src[0].file == IMM) {
-            assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
+         /* Constant folding */
+         if (inst->src[0].file == IMM && inst->src[1].file == IMM) {
+            assert(inst->src[0].type == inst->src[1].type);
+            bool local_progress = true;
+            switch (inst->src[0].type) {
+            case BRW_REGISTER_TYPE_HF: {
+               float v1 = _mesa_half_to_float(inst->src[0].ud & 0xffffu);
+               float v2 = _mesa_half_to_float(inst->src[1].ud & 0xffffu);
+               inst->src[0] = brw_imm_w(_mesa_float_to_half(v1 + v2));
+               break;
+            }
+            case BRW_REGISTER_TYPE_W: {
+               int16_t v1 = inst->src[0].ud & 0xffffu;
+               int16_t v2 = inst->src[1].ud & 0xffffu;
+               inst->src[0] = brw_imm_w(v1 + v2);
+               break;
+            }
+            case BRW_REGISTER_TYPE_UW: {
+               uint16_t v1 = inst->src[0].ud & 0xffffu;
+               uint16_t v2 = inst->src[1].ud & 0xffffu;
+               inst->src[0] = brw_imm_uw(v1 + v2);
+               break;
+            }
+            case BRW_REGISTER_TYPE_F:
+               inst->src[0].f += inst->src[1].f;
+               break;
+            case BRW_REGISTER_TYPE_D:
+               inst->src[0].d += inst->src[1].d;
+               break;
+            case BRW_REGISTER_TYPE_UD:
+               inst->src[0].ud += inst->src[1].ud;
+               break;
+            default:
+               local_progress = false;
+               break;
+            };
+
+            if (local_progress) {
+               inst->opcode = BRW_OPCODE_MOV;
+               inst->src[1] = reg_undef;
+               progress = true;
+               break;
+            }
+         }
+
+         /* a + 0 = a (this is not exact for floating point) */
+         if (inst->src[1].is_zero() &&
+             brw_reg_type_is_integer(inst->src[1].type)) {
             inst->opcode = BRW_OPCODE_MOV;
-            inst->src[0].f += inst->src[1].f;
             inst->src[1] = reg_undef;
             progress = true;
             break;
          }
+
+         if (inst->src[0].is_zero() &&
+             brw_reg_type_is_integer(inst->src[0].type)) {
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->src[0] = inst->src[1];
+            inst->src[1] = reg_undef;
+            progress = true;
+            break;
+         }
+         break;
+      case BRW_OPCODE_SHL:
+         if (inst->src[0].file == IMM && inst->src[1].file == IMM) {
+            bool local_progress = true;
+            switch (inst->src[0].type) {
+            case BRW_REGISTER_TYPE_D:
+            case BRW_REGISTER_TYPE_UD:
+               inst->src[0].ud <<= inst->src[1].ud;
+               break;
+            case BRW_REGISTER_TYPE_W:
+            case BRW_REGISTER_TYPE_UW: {
+               uint16_t v1 = inst->src[0].ud & 0xffffu;
+               uint16_t v2 = inst->src[1].ud & 0xffffu;
+               inst->src[0] = retype(brw_imm_uw(v1 << v2), inst->src[0].type);
+               break;
+            }
+            default:
+               local_progress = false;
+               break;
+            }
+            if (local_progress) {
+               inst->opcode = BRW_OPCODE_MOV;
+               inst->src[1] = reg_undef;
+               progress = true;
+               break;
+            }
+         }
+         break;
+      case BRW_OPCODE_SHR:
+         if (inst->src[0].file == IMM && inst->src[1].file == IMM) {
+            bool local_progress = true;
+            switch (inst->src[0].type) {
+            case BRW_REGISTER_TYPE_D:
+               inst->src[0].d >>= inst->src[1].ud;
+               break;
+            case BRW_REGISTER_TYPE_UD:
+               inst->src[0].ud >>= inst->src[1].ud;
+               break;
+            case BRW_REGISTER_TYPE_W: {
+               int16_t v1 = inst->src[0].ud & 0xffffu;
+               uint16_t v2 = inst->src[1].ud & 0xffffu;
+               inst->src[0] = brw_imm_w(v1 >> v2);
+               break;
+            }
+            case BRW_REGISTER_TYPE_UW: {
+               uint16_t v1 = inst->src[0].ud & 0xffffu;
+               uint16_t v2 = inst->src[1].ud & 0xffffu;
+               inst->src[0] = brw_imm_uw(v1 >> v2);
+               break;
+            }
+            default:
+               local_progress = false;
+               break;
+            }
+            if (local_progress) {
+               inst->opcode = BRW_OPCODE_MOV;
+               inst->src[1] = reg_undef;
+               progress = true;
+               break;
+            }
+         }
          break;
       case BRW_OPCODE_OR:
          if (inst->src[0].equals(inst->src[1]) ||

Comments

On 2/27/19 4:45 AM, Iago Toral Quiroga wrote:
> Now that we propagate constants to the first source of 2src instructions we
> see more opportunities of constant folding in the backend.

All the benefit of the series is from more constant folding?  Once upon
a time, I had a patch that added another call to
nir_opt_constant_folding after we call nir_opt_algebraic_late.  My
recollection is that it hurt vec4 shaders, but it helped scalar shaders
quite a bit.  How does doing that affect these results?

Hrm... I can collect that data.

> Shader-db results on KBL:
> 
> total instructions in shared programs: 14965607 -> 14855983 (-0.73%)
> instructions in affected programs: 3988102 -> 3878478 (-2.75%)
> helped: 14292
> HURT: 59
> 
> total cycles in shared programs: 344324295 -> 340656008 (-1.07%)
> cycles in affected programs: 247527740 -> 243859453 (-1.48%)
> helped: 14056
> HURT: 3314
> 
> total loops in shared programs: 4283 -> 4283 (0.00%)
> loops in affected programs: 0 -> 0
> helped: 0
> HURT: 0
> 
> total spills in shared programs: 27812 -> 24350 (-12.45%)
> spills in affected programs: 24921 -> 21459 (-13.89%)
> helped: 345
> HURT: 19
> 
> total fills in shared programs: 24173 -> 22032 (-8.86%)
> fills in affected programs: 21124 -> 18983 (-10.14%)
> helped: 355
> HURT: 25
> 
> LOST:   0
> GAINED: 5
> ---
>  src/intel/compiler/brw_fs.cpp | 203 ++++++++++++++++++++++++++++++++--
>  1 file changed, 195 insertions(+), 8 deletions(-)
> 
> diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
> index 2358acbeb59..b2b60237c82 100644
> --- a/src/intel/compiler/brw_fs.cpp
> +++ b/src/intel/compiler/brw_fs.cpp
> @@ -2583,9 +2583,55 @@ fs_visitor::opt_algebraic()
>           break;
>  
>        case BRW_OPCODE_MUL:
> -         if (inst->src[1].file != IMM)
> +         if (inst->src[0].file != IMM && inst->src[1].file != IMM)
>              continue;
>  
> +         /* Constant folding */
> +         if (inst->src[0].file == IMM && inst->src[1].file == IMM) {
> +            assert(inst->src[0].type == inst->src[1].type);
> +            bool local_progress = true;
> +            switch (inst->src[0].type) {
> +            case BRW_REGISTER_TYPE_HF: {
> +               float v1 = _mesa_half_to_float(inst->src[0].ud & 0xffffu);
> +               float v2 = _mesa_half_to_float(inst->src[1].ud & 0xffffu);
> +               inst->src[0] = brw_imm_w(_mesa_float_to_half(v1 * v2));
> +               break;
> +            }
> +            case BRW_REGISTER_TYPE_W: {
> +               int16_t v1 = inst->src[0].ud & 0xffffu;
> +               int16_t v2 = inst->src[1].ud & 0xffffu;
> +               inst->src[0] = brw_imm_w(v1 * v2);
> +               break;
> +            }
> +            case BRW_REGISTER_TYPE_UW: {
> +               uint16_t v1 = inst->src[0].ud & 0xffffu;
> +               uint16_t v2 = inst->src[1].ud & 0xffffu;
> +               inst->src[0] = brw_imm_uw(v1 * v2);
> +               break;
> +            }
> +            case BRW_REGISTER_TYPE_F:
> +               inst->src[0].f *= inst->src[1].f;
> +               break;
> +            case BRW_REGISTER_TYPE_D:
> +               inst->src[0].d *= inst->src[1].d;
> +               break;
> +            case BRW_REGISTER_TYPE_UD:
> +               inst->src[0].ud *= inst->src[1].ud;
> +               break;
> +            default:
> +               local_progress = false;
> +               break;
> +            };
> +
> +            if (local_progress) {
> +               inst->opcode = BRW_OPCODE_MOV;
> +               inst->src[1] = reg_undef;
> +               progress = true;
> +               break;
> +            }
> +         }
> +
> +
>           /* a * 1.0 = a */
>           if (inst->src[1].is_one()) {
>              inst->opcode = BRW_OPCODE_MOV;
> @@ -2594,6 +2640,14 @@ fs_visitor::opt_algebraic()
>              break;
>           }
>  
> +         if (inst->src[0].is_one()) {
> +            inst->opcode = BRW_OPCODE_MOV;
> +            inst->src[0] = inst->src[1];
> +            inst->src[1] = reg_undef;
> +            progress = true;
> +            break;
> +         }
> +
>           /* a * -1.0 = -a */
>           if (inst->src[1].is_negative_one()) {
>              inst->opcode = BRW_OPCODE_MOV;
> @@ -2603,27 +2657,160 @@ fs_visitor::opt_algebraic()
>              break;
>           }
>  
> -         if (inst->src[0].file == IMM) {
> -            assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
> +         if (inst->src[0].is_negative_one()) {
> +            inst->opcode = BRW_OPCODE_MOV;
> +            inst->src[0] = inst->src[1];
> +            inst->src[0].negate = !inst->src[1].negate;
> +            inst->src[1] = reg_undef;
> +            progress = true;
> +            break;
> +         }
> +
> +         /* a * 0 = 0 (this is not exact for floating point) */
> +         if (inst->src[1].is_zero() &&
> +             brw_reg_type_is_integer(inst->src[1].type)) {
> +            inst->opcode = BRW_OPCODE_MOV;
> +            inst->src[0] = inst->src[1];
> +            inst->src[1] = reg_undef;
> +            progress = true;
> +            break;
> +         }
> +
> +         if (inst->src[0].is_zero() &&
> +             brw_reg_type_is_integer(inst->src[0].type)) {
>              inst->opcode = BRW_OPCODE_MOV;
> -            inst->src[0].f *= inst->src[1].f;
>              inst->src[1] = reg_undef;
>              progress = true;
>              break;
>           }
>           break;
>        case BRW_OPCODE_ADD:
> -         if (inst->src[1].file != IMM)
> +         if (inst->src[0].file != IMM && inst->src[1].file != IMM)
>              continue;
>  
> -         if (inst->src[0].file == IMM) {
> -            assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
> +         /* Constant folding */
> +         if (inst->src[0].file == IMM && inst->src[1].file == IMM) {
> +            assert(inst->src[0].type == inst->src[1].type);
> +            bool local_progress = true;
> +            switch (inst->src[0].type) {
> +            case BRW_REGISTER_TYPE_HF: {
> +               float v1 = _mesa_half_to_float(inst->src[0].ud & 0xffffu);
> +               float v2 = _mesa_half_to_float(inst->src[1].ud & 0xffffu);
> +               inst->src[0] = brw_imm_w(_mesa_float_to_half(v1 + v2));
> +               break;
> +            }
> +            case BRW_REGISTER_TYPE_W: {
> +               int16_t v1 = inst->src[0].ud & 0xffffu;
> +               int16_t v2 = inst->src[1].ud & 0xffffu;
> +               inst->src[0] = brw_imm_w(v1 + v2);
> +               break;
> +            }
> +            case BRW_REGISTER_TYPE_UW: {
> +               uint16_t v1 = inst->src[0].ud & 0xffffu;
> +               uint16_t v2 = inst->src[1].ud & 0xffffu;
> +               inst->src[0] = brw_imm_uw(v1 + v2);
> +               break;
> +            }
> +            case BRW_REGISTER_TYPE_F:
> +               inst->src[0].f += inst->src[1].f;
> +               break;
> +            case BRW_REGISTER_TYPE_D:
> +               inst->src[0].d += inst->src[1].d;
> +               break;
> +            case BRW_REGISTER_TYPE_UD:
> +               inst->src[0].ud += inst->src[1].ud;
> +               break;
> +            default:
> +               local_progress = false;
> +               break;
> +            };
> +
> +            if (local_progress) {
> +               inst->opcode = BRW_OPCODE_MOV;
> +               inst->src[1] = reg_undef;
> +               progress = true;
> +               break;
> +            }
> +         }
> +
> +         /* a + 0 = a (this is not exact for floating point) */
> +         if (inst->src[1].is_zero() &&
> +             brw_reg_type_is_integer(inst->src[1].type)) {
>              inst->opcode = BRW_OPCODE_MOV;
> -            inst->src[0].f += inst->src[1].f;
>              inst->src[1] = reg_undef;
>              progress = true;
>              break;
>           }
> +
> +         if (inst->src[0].is_zero() &&
> +             brw_reg_type_is_integer(inst->src[0].type)) {
> +            inst->opcode = BRW_OPCODE_MOV;
> +            inst->src[0] = inst->src[1];
> +            inst->src[1] = reg_undef;
> +            progress = true;
> +            break;
> +         }
> +         break;
> +      case BRW_OPCODE_SHL:
> +         if (inst->src[0].file == IMM && inst->src[1].file == IMM) {
> +            bool local_progress = true;
> +            switch (inst->src[0].type) {
> +            case BRW_REGISTER_TYPE_D:
> +            case BRW_REGISTER_TYPE_UD:
> +               inst->src[0].ud <<= inst->src[1].ud;
> +               break;
> +            case BRW_REGISTER_TYPE_W:
> +            case BRW_REGISTER_TYPE_UW: {
> +               uint16_t v1 = inst->src[0].ud & 0xffffu;
> +               uint16_t v2 = inst->src[1].ud & 0xffffu;
> +               inst->src[0] = retype(brw_imm_uw(v1 << v2), inst->src[0].type);
> +               break;
> +            }
> +            default:
> +               local_progress = false;
> +               break;
> +            }
> +            if (local_progress) {
> +               inst->opcode = BRW_OPCODE_MOV;
> +               inst->src[1] = reg_undef;
> +               progress = true;
> +               break;
> +            }
> +         }
> +         break;
> +      case BRW_OPCODE_SHR:
> +         if (inst->src[0].file == IMM && inst->src[1].file == IMM) {
> +            bool local_progress = true;
> +            switch (inst->src[0].type) {
> +            case BRW_REGISTER_TYPE_D:
> +               inst->src[0].d >>= inst->src[1].ud;
> +               break;
> +            case BRW_REGISTER_TYPE_UD:
> +               inst->src[0].ud >>= inst->src[1].ud;
> +               break;
> +            case BRW_REGISTER_TYPE_W: {
> +               int16_t v1 = inst->src[0].ud & 0xffffu;
> +               uint16_t v2 = inst->src[1].ud & 0xffffu;
> +               inst->src[0] = brw_imm_w(v1 >> v2);
> +               break;
> +            }
> +            case BRW_REGISTER_TYPE_UW: {
> +               uint16_t v1 = inst->src[0].ud & 0xffffu;
> +               uint16_t v2 = inst->src[1].ud & 0xffffu;
> +               inst->src[0] = brw_imm_uw(v1 >> v2);
> +               break;
> +            }
> +            default:
> +               local_progress = false;
> +               break;
> +            }
> +            if (local_progress) {
> +               inst->opcode = BRW_OPCODE_MOV;
> +               inst->src[1] = reg_undef;
> +               progress = true;
> +               break;
> +            }
> +         }
>           break;
>        case BRW_OPCODE_OR:
>           if (inst->src[0].equals(inst->src[1]) ||
On 2/27/19 4:45 AM, Iago Toral Quiroga wrote:
> Now that we propagate constants to the first source of 2src instructions we
> see more opportunities of constant folding in the backend.
> 
> Shader-db results on KBL:
> 
> total instructions in shared programs: 14965607 -> 14855983 (-0.73%)
> instructions in affected programs: 3988102 -> 3878478 (-2.75%)
> helped: 14292
> HURT: 59
> 
> total cycles in shared programs: 344324295 -> 340656008 (-1.07%)
> cycles in affected programs: 247527740 -> 243859453 (-1.48%)
> helped: 14056
> HURT: 3314
> 
> total loops in shared programs: 4283 -> 4283 (0.00%)
> loops in affected programs: 0 -> 0
> helped: 0
> HURT: 0
> 
> total spills in shared programs: 27812 -> 24350 (-12.45%)
> spills in affected programs: 24921 -> 21459 (-13.89%)
> helped: 345
> HURT: 19
> 
> total fills in shared programs: 24173 -> 22032 (-8.86%)
> fills in affected programs: 21124 -> 18983 (-10.14%)
> helped: 355
> HURT: 25

Ignore my previous questions about nir_opt_constant_folding after
nir_opt_algebraic_late.  I had done that because I added a bunch of
things to nir_opt_algebraic_late that created my constant folding
opportunities.

This is the combined changes for this patch and the previous patch.  For
this patch alone, I got:

total instructions in shared programs: 15306213 -> 15221518 (-0.55%)
instructions in affected programs: 2911451 -> 2826756 (-2.91%)
helped: 13121
HURT: 44
helped stats (abs) min: 1 max: 51 x̄: 6.66 x̃: 6
helped stats (rel) min: <.01% max: 16.67% x̄: 4.27% x̃: 3.30%
HURT stats (abs)   min: 3 max: 453 x̄: 61.16 x̃: 5
HURT stats (rel)   min: 0.20% max: 151.00% x̄: 31.57% x̃: 19.23%
95% mean confidence interval for instructions value: -6.61 -6.26
95% mean confidence interval for instructions %-change: -4.23% -4.07%
Instructions are helped.

total cycles in shared programs: 375419164 -> 372829148 (-0.69%)
cycles in affected programs: 146769299 -> 144179283 (-1.76%)
helped: 10992
HURT: 1833
helped stats (abs) min: 1 max: 56127 x̄: 250.29 x̃: 18
helped stats (rel) min: <.01% max: 40.52% x̄: 3.11% x̃: 2.58%
HURT stats (abs)   min: 1 max: 1718 x̄: 87.93 x̃: 42
HURT stats (rel)   min: <.01% max: 139.33% x̄: 7.74% x̃: 3.08%
95% mean confidence interval for cycles value: -248.21 -155.69
95% mean confidence interval for cycles %-change: -1.67% -1.44%
Cycles are helped.

total spills in shared programs: 28828 -> 28888 (0.21%)
spills in affected programs: 2037 -> 2097 (2.95%)
helped: 0
HURT: 24

total fills in shared programs: 35542 -> 35639 (0.27%)
fills in affected programs: 3078 -> 3175 (3.15%)
helped: 2
HURT: 26

I decided to look at some of the hurt shaders... it looks like some of
the Unigine geometry shaders really took a beating (+150% instructions).
Note the "max" in the "instructions in affected programs" above.

More comments below by SHL...

> LOST:   0
> GAINED: 5
> ---
>  src/intel/compiler/brw_fs.cpp | 203 ++++++++++++++++++++++++++++++++--
>  1 file changed, 195 insertions(+), 8 deletions(-)
> 
> diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
> index 2358acbeb59..b2b60237c82 100644
> --- a/src/intel/compiler/brw_fs.cpp
> +++ b/src/intel/compiler/brw_fs.cpp
> @@ -2583,9 +2583,55 @@ fs_visitor::opt_algebraic()
>           break;
>  
>        case BRW_OPCODE_MUL:
> -         if (inst->src[1].file != IMM)
> +         if (inst->src[0].file != IMM && inst->src[1].file != IMM)
>              continue;
>  
> +         /* Constant folding */
> +         if (inst->src[0].file == IMM && inst->src[1].file == IMM) {
> +            assert(inst->src[0].type == inst->src[1].type);
> +            bool local_progress = true;
> +            switch (inst->src[0].type) {
> +            case BRW_REGISTER_TYPE_HF: {
> +               float v1 = _mesa_half_to_float(inst->src[0].ud & 0xffffu);
> +               float v2 = _mesa_half_to_float(inst->src[1].ud & 0xffffu);
> +               inst->src[0] = brw_imm_w(_mesa_float_to_half(v1 * v2));
> +               break;
> +            }
> +            case BRW_REGISTER_TYPE_W: {
> +               int16_t v1 = inst->src[0].ud & 0xffffu;
> +               int16_t v2 = inst->src[1].ud & 0xffffu;
> +               inst->src[0] = brw_imm_w(v1 * v2);
> +               break;
> +            }
> +            case BRW_REGISTER_TYPE_UW: {
> +               uint16_t v1 = inst->src[0].ud & 0xffffu;
> +               uint16_t v2 = inst->src[1].ud & 0xffffu;
> +               inst->src[0] = brw_imm_uw(v1 * v2);
> +               break;
> +            }
> +            case BRW_REGISTER_TYPE_F:
> +               inst->src[0].f *= inst->src[1].f;
> +               break;
> +            case BRW_REGISTER_TYPE_D:
> +               inst->src[0].d *= inst->src[1].d;
> +               break;
> +            case BRW_REGISTER_TYPE_UD:
> +               inst->src[0].ud *= inst->src[1].ud;
> +               break;
> +            default:
> +               local_progress = false;
> +               break;
> +            };
> +
> +            if (local_progress) {
> +               inst->opcode = BRW_OPCODE_MOV;
> +               inst->src[1] = reg_undef;
> +               progress = true;
> +               break;
> +            }
> +         }
> +
> +
>           /* a * 1.0 = a */
>           if (inst->src[1].is_one()) {
>              inst->opcode = BRW_OPCODE_MOV;
> @@ -2594,6 +2640,14 @@ fs_visitor::opt_algebraic()
>              break;
>           }
>  
> +         if (inst->src[0].is_one()) {
> +            inst->opcode = BRW_OPCODE_MOV;
> +            inst->src[0] = inst->src[1];
> +            inst->src[1] = reg_undef;
> +            progress = true;
> +            break;
> +         }
> +
>           /* a * -1.0 = -a */
>           if (inst->src[1].is_negative_one()) {
>              inst->opcode = BRW_OPCODE_MOV;
> @@ -2603,27 +2657,160 @@ fs_visitor::opt_algebraic()
>              break;
>           }
>  
> -         if (inst->src[0].file == IMM) {
> -            assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
> +         if (inst->src[0].is_negative_one()) {
> +            inst->opcode = BRW_OPCODE_MOV;
> +            inst->src[0] = inst->src[1];
> +            inst->src[0].negate = !inst->src[1].negate;
> +            inst->src[1] = reg_undef;
> +            progress = true;
> +            break;
> +         }
> +
> +         /* a * 0 = 0 (this is not exact for floating point) */
> +         if (inst->src[1].is_zero() &&
> +             brw_reg_type_is_integer(inst->src[1].type)) {
> +            inst->opcode = BRW_OPCODE_MOV;
> +            inst->src[0] = inst->src[1];
> +            inst->src[1] = reg_undef;
> +            progress = true;
> +            break;
> +         }
> +
> +         if (inst->src[0].is_zero() &&
> +             brw_reg_type_is_integer(inst->src[0].type)) {
>              inst->opcode = BRW_OPCODE_MOV;
> -            inst->src[0].f *= inst->src[1].f;
>              inst->src[1] = reg_undef;
>              progress = true;
>              break;
>           }
>           break;
>        case BRW_OPCODE_ADD:
> -         if (inst->src[1].file != IMM)
> +         if (inst->src[0].file != IMM && inst->src[1].file != IMM)
>              continue;
>  
> -         if (inst->src[0].file == IMM) {
> -            assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
> +         /* Constant folding */
> +         if (inst->src[0].file == IMM && inst->src[1].file == IMM) {
> +            assert(inst->src[0].type == inst->src[1].type);
> +            bool local_progress = true;
> +            switch (inst->src[0].type) {
> +            case BRW_REGISTER_TYPE_HF: {
> +               float v1 = _mesa_half_to_float(inst->src[0].ud & 0xffffu);
> +               float v2 = _mesa_half_to_float(inst->src[1].ud & 0xffffu);
> +               inst->src[0] = brw_imm_w(_mesa_float_to_half(v1 + v2));
> +               break;
> +            }
> +            case BRW_REGISTER_TYPE_W: {
> +               int16_t v1 = inst->src[0].ud & 0xffffu;
> +               int16_t v2 = inst->src[1].ud & 0xffffu;
> +               inst->src[0] = brw_imm_w(v1 + v2);
> +               break;
> +            }
> +            case BRW_REGISTER_TYPE_UW: {
> +               uint16_t v1 = inst->src[0].ud & 0xffffu;
> +               uint16_t v2 = inst->src[1].ud & 0xffffu;
> +               inst->src[0] = brw_imm_uw(v1 + v2);
> +               break;
> +            }
> +            case BRW_REGISTER_TYPE_F:
> +               inst->src[0].f += inst->src[1].f;
> +               break;
> +            case BRW_REGISTER_TYPE_D:
> +               inst->src[0].d += inst->src[1].d;
> +               break;
> +            case BRW_REGISTER_TYPE_UD:
> +               inst->src[0].ud += inst->src[1].ud;
> +               break;
> +            default:
> +               local_progress = false;
> +               break;
> +            };
> +
> +            if (local_progress) {
> +               inst->opcode = BRW_OPCODE_MOV;
> +               inst->src[1] = reg_undef;
> +               progress = true;
> +               break;
> +            }
> +         }
> +
> +         /* a + 0 = a (this is not exact for floating point) */
> +         if (inst->src[1].is_zero() &&
> +             brw_reg_type_is_integer(inst->src[1].type)) {
>              inst->opcode = BRW_OPCODE_MOV;
> -            inst->src[0].f += inst->src[1].f;
>              inst->src[1] = reg_undef;
>              progress = true;
>              break;
>           }
> +
> +         if (inst->src[0].is_zero() &&
> +             brw_reg_type_is_integer(inst->src[0].type)) {
> +            inst->opcode = BRW_OPCODE_MOV;
> +            inst->src[0] = inst->src[1];
> +            inst->src[1] = reg_undef;
> +            progress = true;
> +            break;
> +         }
> +         break;
> +      case BRW_OPCODE_SHL:
> +         if (inst->src[0].file == IMM && inst->src[1].file == IMM) {
> +            bool local_progress = true;
> +            switch (inst->src[0].type) {
> +            case BRW_REGISTER_TYPE_D:
> +            case BRW_REGISTER_TYPE_UD:
> +               inst->src[0].ud <<= inst->src[1].ud;
> +               break;

There's something fishy going on here.  In one of the less hurt Unigine
shaders, the before code was:

mov(1)          g2<1>UD         0x00000001UD                    { align1 WE_all 1N compacted };
mov(8)          g126<1>UD       g1<8,8,1>UD                     { align1 WE_all 1Q compacted };
shl(8)          g127<1>UD       g2<0,1,0>UD     0xffffffffUD    { align1 1Q compacted };

and the after code is

mov(8)          g126<1>UD       g1<8,8,1>UD                     { align1 WE_all 1Q compacted };
mov(8)          g127<1>UD       0x00000008UD                    { align1 WE_all 1Q compacted };

That doesn't seem right.  I thought 1U << 0xffffffffU should be 1u << 31
= 0x80000000.  Since Gen explicitly only uses the low 5 bits of src1,
maybe we need "& 0x1f" around all the shift counts?  I thought the CPU
did the same thing...

Does this series pass the CI?

> +            case BRW_REGISTER_TYPE_W:
> +            case BRW_REGISTER_TYPE_UW: {
> +               uint16_t v1 = inst->src[0].ud & 0xffffu;
> +               uint16_t v2 = inst->src[1].ud & 0xffffu;
> +               inst->src[0] = retype(brw_imm_uw(v1 << v2), inst->src[0].type);
> +               break;
> +            }
> +            default:
> +               local_progress = false;
> +               break;
> +            }
> +            if (local_progress) {
> +               inst->opcode = BRW_OPCODE_MOV;
> +               inst->src[1] = reg_undef;
> +               progress = true;
> +               break;
> +            }
> +         }
> +         break;
> +      case BRW_OPCODE_SHR:
> +         if (inst->src[0].file == IMM && inst->src[1].file == IMM) {
> +            bool local_progress = true;
> +            switch (inst->src[0].type) {
> +            case BRW_REGISTER_TYPE_D:
> +               inst->src[0].d >>= inst->src[1].ud;
> +               break;
> +            case BRW_REGISTER_TYPE_UD:
> +               inst->src[0].ud >>= inst->src[1].ud;
> +               break;
> +            case BRW_REGISTER_TYPE_W: {
> +               int16_t v1 = inst->src[0].ud & 0xffffu;
> +               uint16_t v2 = inst->src[1].ud & 0xffffu;
> +               inst->src[0] = brw_imm_w(v1 >> v2);
> +               break;
> +            }
> +            case BRW_REGISTER_TYPE_UW: {
> +               uint16_t v1 = inst->src[0].ud & 0xffffu;
> +               uint16_t v2 = inst->src[1].ud & 0xffffu;
> +               inst->src[0] = brw_imm_uw(v1 >> v2);
> +               break;
> +            }
> +            default:
> +               local_progress = false;
> +               break;
> +            }
> +            if (local_progress) {
> +               inst->opcode = BRW_OPCODE_MOV;
> +               inst->src[1] = reg_undef;
> +               progress = true;
> +               break;
> +            }
> +         }
>           break;
>        case BRW_OPCODE_OR:
>           if (inst->src[0].equals(inst->src[1]) ||
>
On Wed, 2019-02-27 at 17:04 -0800, Ian Romanick wrote:
> On 2/27/19 4:45 AM, Iago Toral Quiroga wrote:
> > Now that we propagate constants to the first source of 2src
> > instructions we
> > see more opportunities of constant folding in the backend.
> > 
> > Shader-db results on KBL:
> > 
> > total instructions in shared programs: 14965607 -> 14855983 (-
> > 0.73%)
> > instructions in affected programs: 3988102 -> 3878478 (-2.75%)
> > helped: 14292
> > HURT: 59
> > 
> > total cycles in shared programs: 344324295 -> 340656008 (-1.07%)
> > cycles in affected programs: 247527740 -> 243859453 (-1.48%)
> > helped: 14056
> > HURT: 3314
> > 
> > total loops in shared programs: 4283 -> 4283 (0.00%)
> > loops in affected programs: 0 -> 0
> > helped: 0
> > HURT: 0
> > 
> > total spills in shared programs: 27812 -> 24350 (-12.45%)
> > spills in affected programs: 24921 -> 21459 (-13.89%)
> > helped: 345
> > HURT: 19
> > 
> > total fills in shared programs: 24173 -> 22032 (-8.86%)
> > fills in affected programs: 21124 -> 18983 (-10.14%)
> > helped: 355
> > HURT: 25
> 
> Ignore my previous questions about nir_opt_constant_folding after
> nir_opt_algebraic_late.  I had done that because I added a bunch of
> things to nir_opt_algebraic_late that created my constant folding
> opportunities.
> 
> This is the combined changes for this patch and the previous
> patch.  For
> this patch alone, I got:
> 
> total instructions in shared programs: 15306213 -> 15221518 (-0.55%)
> instructions in affected programs: 2911451 -> 2826756 (-2.91%)
> helped: 13121
> HURT: 44
> helped stats (abs) min: 1 max: 51 x̄: 6.66 x̃: 6
> helped stats (rel) min: <.01% max: 16.67% x̄: 4.27% x̃: 3.30%
> HURT stats (abs)   min: 3 max: 453 x̄: 61.16 x̃: 5
> HURT stats (rel)   min: 0.20% max: 151.00% x̄: 31.57% x̃: 19.23%
> 95% mean confidence interval for instructions value: -6.61 -6.26
> 95% mean confidence interval for instructions %-change: -4.23% -4.07%
> Instructions are helped.
> 
> total cycles in shared programs: 375419164 -> 372829148 (-0.69%)
> cycles in affected programs: 146769299 -> 144179283 (-1.76%)
> helped: 10992
> HURT: 1833
> helped stats (abs) min: 1 max: 56127 x̄: 250.29 x̃: 18
> helped stats (rel) min: <.01% max: 40.52% x̄: 3.11% x̃: 2.58%
> HURT stats (abs)   min: 1 max: 1718 x̄: 87.93 x̃: 42
> HURT stats (rel)   min: <.01% max: 139.33% x̄: 7.74% x̃: 3.08%
> 95% mean confidence interval for cycles value: -248.21 -155.69
> 95% mean confidence interval for cycles %-change: -1.67% -1.44%
> Cycles are helped.
> 
> total spills in shared programs: 28828 -> 28888 (0.21%)
> spills in affected programs: 2037 -> 2097 (2.95%)
> helped: 0
> HURT: 24
> 
> total fills in shared programs: 35542 -> 35639 (0.27%)
> fills in affected programs: 3078 -> 3175 (3.15%)
> helped: 2
> HURT: 26
> 
> I decided to look at some of the hurt shaders... it looks like some
> of
> the Unigine geometry shaders really took a beating (+150%
> instructions).
> Note the "max" in the "instructions in affected programs" above.
> 
> More comments below by SHL...
> 
> > LOST:   0
> > GAINED: 5
> > ---
> >  src/intel/compiler/brw_fs.cpp | 203
> > ++++++++++++++++++++++++++++++++--
> >  1 file changed, 195 insertions(+), 8 deletions(-)
> > 
> > diff --git a/src/intel/compiler/brw_fs.cpp
> > b/src/intel/compiler/brw_fs.cpp
> > index 2358acbeb59..b2b60237c82 100644
> > --- a/src/intel/compiler/brw_fs.cpp
> > +++ b/src/intel/compiler/brw_fs.cpp
> > @@ -2583,9 +2583,55 @@ fs_visitor::opt_algebraic()
> >           break;
> >  
> >        case BRW_OPCODE_MUL:
> > -         if (inst->src[1].file != IMM)
> > +         if (inst->src[0].file != IMM && inst->src[1].file != IMM)
> >              continue;
> >  
> > +         /* Constant folding */
> > +         if (inst->src[0].file == IMM && inst->src[1].file == IMM)
> > {
> > +            assert(inst->src[0].type == inst->src[1].type);
> > +            bool local_progress = true;
> > +            switch (inst->src[0].type) {
> > +            case BRW_REGISTER_TYPE_HF: {
> > +               float v1 = _mesa_half_to_float(inst->src[0].ud &
> > 0xffffu);
> > +               float v2 = _mesa_half_to_float(inst->src[1].ud &
> > 0xffffu);
> > +               inst->src[0] = brw_imm_w(_mesa_float_to_half(v1 *
> > v2));
> > +               break;
> > +            }
> > +            case BRW_REGISTER_TYPE_W: {
> > +               int16_t v1 = inst->src[0].ud & 0xffffu;
> > +               int16_t v2 = inst->src[1].ud & 0xffffu;
> > +               inst->src[0] = brw_imm_w(v1 * v2);
> > +               break;
> > +            }
> > +            case BRW_REGISTER_TYPE_UW: {
> > +               uint16_t v1 = inst->src[0].ud & 0xffffu;
> > +               uint16_t v2 = inst->src[1].ud & 0xffffu;
> > +               inst->src[0] = brw_imm_uw(v1 * v2);
> > +               break;
> > +            }
> > +            case BRW_REGISTER_TYPE_F:
> > +               inst->src[0].f *= inst->src[1].f;
> > +               break;
> > +            case BRW_REGISTER_TYPE_D:
> > +               inst->src[0].d *= inst->src[1].d;
> > +               break;
> > +            case BRW_REGISTER_TYPE_UD:
> > +               inst->src[0].ud *= inst->src[1].ud;
> > +               break;
> > +            default:
> > +               local_progress = false;
> > +               break;
> > +            };
> > +
> > +            if (local_progress) {
> > +               inst->opcode = BRW_OPCODE_MOV;
> > +               inst->src[1] = reg_undef;
> > +               progress = true;
> > +               break;
> > +            }
> > +         }
> > +
> > +
> >           /* a * 1.0 = a */
> >           if (inst->src[1].is_one()) {
> >              inst->opcode = BRW_OPCODE_MOV;
> > @@ -2594,6 +2640,14 @@ fs_visitor::opt_algebraic()
> >              break;
> >           }
> >  
> > +         if (inst->src[0].is_one()) {
> > +            inst->opcode = BRW_OPCODE_MOV;
> > +            inst->src[0] = inst->src[1];
> > +            inst->src[1] = reg_undef;
> > +            progress = true;
> > +            break;
> > +         }
> > +
> >           /* a * -1.0 = -a */
> >           if (inst->src[1].is_negative_one()) {
> >              inst->opcode = BRW_OPCODE_MOV;
> > @@ -2603,27 +2657,160 @@ fs_visitor::opt_algebraic()
> >              break;
> >           }
> >  
> > -         if (inst->src[0].file == IMM) {
> > -            assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
> > +         if (inst->src[0].is_negative_one()) {
> > +            inst->opcode = BRW_OPCODE_MOV;
> > +            inst->src[0] = inst->src[1];
> > +            inst->src[0].negate = !inst->src[1].negate;
> > +            inst->src[1] = reg_undef;
> > +            progress = true;
> > +            break;
> > +         }
> > +
> > +         /* a * 0 = 0 (this is not exact for floating point) */
> > +         if (inst->src[1].is_zero() &&
> > +             brw_reg_type_is_integer(inst->src[1].type)) {
> > +            inst->opcode = BRW_OPCODE_MOV;
> > +            inst->src[0] = inst->src[1];
> > +            inst->src[1] = reg_undef;
> > +            progress = true;
> > +            break;
> > +         }
> > +
> > +         if (inst->src[0].is_zero() &&
> > +             brw_reg_type_is_integer(inst->src[0].type)) {
> >              inst->opcode = BRW_OPCODE_MOV;
> > -            inst->src[0].f *= inst->src[1].f;
> >              inst->src[1] = reg_undef;
> >              progress = true;
> >              break;
> >           }
> >           break;
> >        case BRW_OPCODE_ADD:
> > -         if (inst->src[1].file != IMM)
> > +         if (inst->src[0].file != IMM && inst->src[1].file != IMM)
> >              continue;
> >  
> > -         if (inst->src[0].file == IMM) {
> > -            assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
> > +         /* Constant folding */
> > +         if (inst->src[0].file == IMM && inst->src[1].file == IMM)
> > {
> > +            assert(inst->src[0].type == inst->src[1].type);
> > +            bool local_progress = true;
> > +            switch (inst->src[0].type) {
> > +            case BRW_REGISTER_TYPE_HF: {
> > +               float v1 = _mesa_half_to_float(inst->src[0].ud &
> > 0xffffu);
> > +               float v2 = _mesa_half_to_float(inst->src[1].ud &
> > 0xffffu);
> > +               inst->src[0] = brw_imm_w(_mesa_float_to_half(v1 +
> > v2));
> > +               break;
> > +            }
> > +            case BRW_REGISTER_TYPE_W: {
> > +               int16_t v1 = inst->src[0].ud & 0xffffu;
> > +               int16_t v2 = inst->src[1].ud & 0xffffu;
> > +               inst->src[0] = brw_imm_w(v1 + v2);
> > +               break;
> > +            }
> > +            case BRW_REGISTER_TYPE_UW: {
> > +               uint16_t v1 = inst->src[0].ud & 0xffffu;
> > +               uint16_t v2 = inst->src[1].ud & 0xffffu;
> > +               inst->src[0] = brw_imm_uw(v1 + v2);
> > +               break;
> > +            }
> > +            case BRW_REGISTER_TYPE_F:
> > +               inst->src[0].f += inst->src[1].f;
> > +               break;
> > +            case BRW_REGISTER_TYPE_D:
> > +               inst->src[0].d += inst->src[1].d;
> > +               break;
> > +            case BRW_REGISTER_TYPE_UD:
> > +               inst->src[0].ud += inst->src[1].ud;
> > +               break;
> > +            default:
> > +               local_progress = false;
> > +               break;
> > +            };
> > +
> > +            if (local_progress) {
> > +               inst->opcode = BRW_OPCODE_MOV;
> > +               inst->src[1] = reg_undef;
> > +               progress = true;
> > +               break;
> > +            }
> > +         }
> > +
> > +         /* a + 0 = a (this is not exact for floating point) */
> > +         if (inst->src[1].is_zero() &&
> > +             brw_reg_type_is_integer(inst->src[1].type)) {
> >              inst->opcode = BRW_OPCODE_MOV;
> > -            inst->src[0].f += inst->src[1].f;
> >              inst->src[1] = reg_undef;
> >              progress = true;
> >              break;
> >           }
> > +
> > +         if (inst->src[0].is_zero() &&
> > +             brw_reg_type_is_integer(inst->src[0].type)) {
> > +            inst->opcode = BRW_OPCODE_MOV;
> > +            inst->src[0] = inst->src[1];
> > +            inst->src[1] = reg_undef;
> > +            progress = true;
> > +            break;
> > +         }
> > +         break;
> > +      case BRW_OPCODE_SHL:
> > +         if (inst->src[0].file == IMM && inst->src[1].file == IMM)
> > {
> > +            bool local_progress = true;
> > +            switch (inst->src[0].type) {
> > +            case BRW_REGISTER_TYPE_D:
> > +            case BRW_REGISTER_TYPE_UD:
> > +               inst->src[0].ud <<= inst->src[1].ud;
> > +               break;
> 
> There's something fishy going on here.  In one of the less hurt
> Unigine
> shaders, the before code was:
> 
> mov(1)          g2<1>UD         0x00000001UD                    {
> align1 WE_all 1N compacted };
> mov(8)          g126<1>UD       g1<8,8,1>UD                     {
> align1 WE_all 1Q compacted };
> shl(8)          g127<1>UD       g2<0,1,0>UD     0xffffffffUD    {
> align1 1Q compacted };
> 
> and the after code is
> 
> mov(8)          g126<1>UD       g1<8,8,1>UD                     {
> align1 WE_all 1Q compacted };
> mov(8)          g127<1>UD       0x00000008UD                    {
> align1 WE_all 1Q compacted };
> 
> That doesn't seem right.  I thought 1U << 0xffffffffU should be 1u <<
> 31
> = 0x80000000.  Since Gen explicitly only uses the low 5 bits of src1,
> maybe we need "& 0x1f" around all the shift counts?  I thought the
> CPU
> did the same thing...

Oh, interesting, didn't know about the low 5 bits restriction. I
checked the docs and it seems that it does indeed only use 5 (or 6 if
dst or src0 is Q/UQ), so we should definitely only use those those
bits.

> Does this series pass the CI?

Yes, I sent this to CI before posting  the patches here... I was going
to double check it right now, just in case I messed up and didn't send
the right thing, but the CI website seems to be down at the moment. In
any case, I'll send the series again when the CI is working and I'll
fix SHL/SHR to only use the bits it should. I'll also look at some of
the hurt GS shaders to understand what is happening there.

Thanks Ian.

> > +            case BRW_REGISTER_TYPE_W:
> > +            case BRW_REGISTER_TYPE_UW: {
> > +               uint16_t v1 = inst->src[0].ud & 0xffffu;
> > +               uint16_t v2 = inst->src[1].ud & 0xffffu;
> > +               inst->src[0] = retype(brw_imm_uw(v1 << v2), inst-
> > >src[0].type);
> > +               break;
> > +            }
> > +            default:
> > +               local_progress = false;
> > +               break;
> > +            }
> > +            if (local_progress) {
> > +               inst->opcode = BRW_OPCODE_MOV;
> > +               inst->src[1] = reg_undef;
> > +               progress = true;
> > +               break;
> > +            }
> > +         }
> > +         break;
> > +      case BRW_OPCODE_SHR:
> > +         if (inst->src[0].file == IMM && inst->src[1].file == IMM)
> > {
> > +            bool local_progress = true;
> > +            switch (inst->src[0].type) {
> > +            case BRW_REGISTER_TYPE_D:
> > +               inst->src[0].d >>= inst->src[1].ud;
> > +               break;
> > +            case BRW_REGISTER_TYPE_UD:
> > +               inst->src[0].ud >>= inst->src[1].ud;
> > +               break;
> > +            case BRW_REGISTER_TYPE_W: {
> > +               int16_t v1 = inst->src[0].ud & 0xffffu;
> > +               uint16_t v2 = inst->src[1].ud & 0xffffu;
> > +               inst->src[0] = brw_imm_w(v1 >> v2);
> > +               break;
> > +            }
> > +            case BRW_REGISTER_TYPE_UW: {
> > +               uint16_t v1 = inst->src[0].ud & 0xffffu;
> > +               uint16_t v2 = inst->src[1].ud & 0xffffu;
> > +               inst->src[0] = brw_imm_uw(v1 >> v2);
> > +               break;
> > +            }
> > +            default:
> > +               local_progress = false;
> > +               break;
> > +            }
> > +            if (local_progress) {
> > +               inst->opcode = BRW_OPCODE_MOV;
> > +               inst->src[1] = reg_undef;
> > +               progress = true;
> > +               break;
> > +            }
> > +         }
> >           break;
> >        case BRW_OPCODE_OR:
> >           if (inst->src[0].equals(inst->src[1]) ||
> > 
> 
>
On Wed, 2019-02-27 at 17:04 -0800, Ian Romanick wrote:
> On 2/27/19 4:45 AM, Iago Toral Quiroga wrote:
> > Now that we propagate constants to the first source of 2src
> > instructions we
> > see more opportunities of constant folding in the backend.
> > 
> > Shader-db results on KBL:
> > 
> > total instructions in shared programs: 14965607 -> 14855983 (-
> > 0.73%)
> > instructions in affected programs: 3988102 -> 3878478 (-2.75%)
> > helped: 14292
> > HURT: 59
> > 
> > total cycles in shared programs: 344324295 -> 340656008 (-1.07%)
> > cycles in affected programs: 247527740 -> 243859453 (-1.48%)
> > helped: 14056
> > HURT: 3314
> > 
> > total loops in shared programs: 4283 -> 4283 (0.00%)
> > loops in affected programs: 0 -> 0
> > helped: 0
> > HURT: 0
> > 
> > total spills in shared programs: 27812 -> 24350 (-12.45%)
> > spills in affected programs: 24921 -> 21459 (-13.89%)
> > helped: 345
> > HURT: 19
> > 
> > total fills in shared programs: 24173 -> 22032 (-8.86%)
> > fills in affected programs: 21124 -> 18983 (-10.14%)
> > helped: 355
> > HURT: 25
> 
> Ignore my previous questions about nir_opt_constant_folding after
> nir_opt_algebraic_late.  I had done that because I added a bunch of
> things to nir_opt_algebraic_late that created my constant folding
> opportunities.
> 
> This is the combined changes for this patch and the previous
> patch.  For
> this patch alone, I got:
> 
> total instructions in shared programs: 15306213 -> 15221518 (-0.55%)
> instructions in affected programs: 2911451 -> 2826756 (-2.91%)
> helped: 13121
> HURT: 44
> helped stats (abs) min: 1 max: 51 x̄: 6.66 x̃: 6
> helped stats (rel) min: <.01% max: 16.67% x̄: 4.27% x̃: 3.30%
> HURT stats (abs)   min: 3 max: 453 x̄: 61.16 x̃: 5
> HURT stats (rel)   min: 0.20% max: 151.00% x̄: 31.57% x̃: 19.23%
> 95% mean confidence interval for instructions value: -6.61 -6.26
> 95% mean confidence interval for instructions %-change: -4.23% -4.07%
> Instructions are helped.
> 
> total cycles in shared programs: 375419164 -> 372829148 (-0.69%)
> cycles in affected programs: 146769299 -> 144179283 (-1.76%)
> helped: 10992
> HURT: 1833
> helped stats (abs) min: 1 max: 56127 x̄: 250.29 x̃: 18
> helped stats (rel) min: <.01% max: 40.52% x̄: 3.11% x̃: 2.58%
> HURT stats (abs)   min: 1 max: 1718 x̄: 87.93 x̃: 42
> HURT stats (rel)   min: <.01% max: 139.33% x̄: 7.74% x̃: 3.08%
> 95% mean confidence interval for cycles value: -248.21 -155.69
> 95% mean confidence interval for cycles %-change: -1.67% -1.44%
> Cycles are helped.
> 
> total spills in shared programs: 28828 -> 28888 (0.21%)
> spills in affected programs: 2037 -> 2097 (2.95%)
> helped: 0
> HURT: 24
> 
> total fills in shared programs: 35542 -> 35639 (0.27%)
> fills in affected programs: 3078 -> 3175 (3.15%)
> helped: 2
> HURT: 26
> 
> I decided to look at some of the hurt shaders... it looks like some
> of
> the Unigine geometry shaders really took a beating (+150%
> instructions).
> Note the "max" in the "instructions in affected programs" above.

I am seeing quite different results on my KBL laptop:

total instructions in shared programs: 14945933 -> 14858158 (-0.59%)
instructions in affected programs: 2842901 -> 2755126 (-3.09%)
helped: 13196
HURT: 5

instructions HURT:   shaders/closed/steam/deus-ex-mankind-
divided/274.shader_test CS SIMD8: 1535 -> 1538 (0.20%)
instructions HURT:   shaders/closed/steam/deus-ex-mankind-
divided/184.shader_test CS SIMD8: 1535 -> 1538 (0.20%)
instructions HURT:   shaders/dolphin/ubershaders/147.shader_test FS
SIMD8: 3481 -> 3491 (0.29%)
instructions HURT:   shaders/dolphin/ubershaders/156.shader_test FS
SIMD8: 3465 -> 3475 (0.29%)
instructions HURT:   shaders/dolphin/ubershaders/138.shader_test FS
SIMD8: 3465 -> 3475 (0.29%)

Did you test on a different gen? Can you paste here the paths of some
of the GS shaders where you see the big regressions so I can verify I
have them in my shader-db?

Also, how did you test this patch exactly? When I was going to capture
the reference shader-db results for patch 2 in this series so I could
extract the results for patch 3 by comparing against it, I noticed that
patch 2 would create constant folding scenarios (for example for ADD
and MUL) that, before this patch, would hit an assertion in the driver
since the algebraic pass only expects to find these opportunities for F
types and will assert on that, so I guess you noticed this and fixed it
before taking your numbers?

> More comments below by SHL...
> 
> > LOST:   0
> > GAINED: 5
> > ---
> >  src/intel/compiler/brw_fs.cpp | 203
> > ++++++++++++++++++++++++++++++++--
> >  1 file changed, 195 insertions(+), 8 deletions(-)
> > 
> > diff --git a/src/intel/compiler/brw_fs.cpp
> > b/src/intel/compiler/brw_fs.cpp
> > index 2358acbeb59..b2b60237c82 100644
> > --- a/src/intel/compiler/brw_fs.cpp
> > +++ b/src/intel/compiler/brw_fs.cpp
> > @@ -2583,9 +2583,55 @@ fs_visitor::opt_algebraic()
> >           break;
> >  
> >        case BRW_OPCODE_MUL:
> > -         if (inst->src[1].file != IMM)
> > +         if (inst->src[0].file != IMM && inst->src[1].file != IMM)
> >              continue;
> >  
> > +         /* Constant folding */
> > +         if (inst->src[0].file == IMM && inst->src[1].file == IMM)
> > {
> > +            assert(inst->src[0].type == inst->src[1].type);
> > +            bool local_progress = true;
> > +            switch (inst->src[0].type) {
> > +            case BRW_REGISTER_TYPE_HF: {
> > +               float v1 = _mesa_half_to_float(inst->src[0].ud &
> > 0xffffu);
> > +               float v2 = _mesa_half_to_float(inst->src[1].ud &
> > 0xffffu);
> > +               inst->src[0] = brw_imm_w(_mesa_float_to_half(v1 *
> > v2));
> > +               break;
> > +            }
> > +            case BRW_REGISTER_TYPE_W: {
> > +               int16_t v1 = inst->src[0].ud & 0xffffu;
> > +               int16_t v2 = inst->src[1].ud & 0xffffu;
> > +               inst->src[0] = brw_imm_w(v1 * v2);
> > +               break;
> > +            }
> > +            case BRW_REGISTER_TYPE_UW: {
> > +               uint16_t v1 = inst->src[0].ud & 0xffffu;
> > +               uint16_t v2 = inst->src[1].ud & 0xffffu;
> > +               inst->src[0] = brw_imm_uw(v1 * v2);
> > +               break;
> > +            }
> > +            case BRW_REGISTER_TYPE_F:
> > +               inst->src[0].f *= inst->src[1].f;
> > +               break;
> > +            case BRW_REGISTER_TYPE_D:
> > +               inst->src[0].d *= inst->src[1].d;
> > +               break;
> > +            case BRW_REGISTER_TYPE_UD:
> > +               inst->src[0].ud *= inst->src[1].ud;
> > +               break;
> > +            default:
> > +               local_progress = false;
> > +               break;
> > +            };
> > +
> > +            if (local_progress) {
> > +               inst->opcode = BRW_OPCODE_MOV;
> > +               inst->src[1] = reg_undef;
> > +               progress = true;
> > +               break;
> > +            }
> > +         }
> > +
> > +
> >           /* a * 1.0 = a */
> >           if (inst->src[1].is_one()) {
> >              inst->opcode = BRW_OPCODE_MOV;
> > @@ -2594,6 +2640,14 @@ fs_visitor::opt_algebraic()
> >              break;
> >           }
> >  
> > +         if (inst->src[0].is_one()) {
> > +            inst->opcode = BRW_OPCODE_MOV;
> > +            inst->src[0] = inst->src[1];
> > +            inst->src[1] = reg_undef;
> > +            progress = true;
> > +            break;
> > +         }
> > +
> >           /* a * -1.0 = -a */
> >           if (inst->src[1].is_negative_one()) {
> >              inst->opcode = BRW_OPCODE_MOV;
> > @@ -2603,27 +2657,160 @@ fs_visitor::opt_algebraic()
> >              break;
> >           }
> >  
> > -         if (inst->src[0].file == IMM) {
> > -            assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
> > +         if (inst->src[0].is_negative_one()) {
> > +            inst->opcode = BRW_OPCODE_MOV;
> > +            inst->src[0] = inst->src[1];
> > +            inst->src[0].negate = !inst->src[1].negate;
> > +            inst->src[1] = reg_undef;
> > +            progress = true;
> > +            break;
> > +         }
> > +
> > +         /* a * 0 = 0 (this is not exact for floating point) */
> > +         if (inst->src[1].is_zero() &&
> > +             brw_reg_type_is_integer(inst->src[1].type)) {
> > +            inst->opcode = BRW_OPCODE_MOV;
> > +            inst->src[0] = inst->src[1];
> > +            inst->src[1] = reg_undef;
> > +            progress = true;
> > +            break;
> > +         }
> > +
> > +         if (inst->src[0].is_zero() &&
> > +             brw_reg_type_is_integer(inst->src[0].type)) {
> >              inst->opcode = BRW_OPCODE_MOV;
> > -            inst->src[0].f *= inst->src[1].f;
> >              inst->src[1] = reg_undef;
> >              progress = true;
> >              break;
> >           }
> >           break;
> >        case BRW_OPCODE_ADD:
> > -         if (inst->src[1].file != IMM)
> > +         if (inst->src[0].file != IMM && inst->src[1].file != IMM)
> >              continue;
> >  
> > -         if (inst->src[0].file == IMM) {
> > -            assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
> > +         /* Constant folding */
> > +         if (inst->src[0].file == IMM && inst->src[1].file == IMM)
> > {
> > +            assert(inst->src[0].type == inst->src[1].type);
> > +            bool local_progress = true;
> > +            switch (inst->src[0].type) {
> > +            case BRW_REGISTER_TYPE_HF: {
> > +               float v1 = _mesa_half_to_float(inst->src[0].ud &
> > 0xffffu);
> > +               float v2 = _mesa_half_to_float(inst->src[1].ud &
> > 0xffffu);
> > +               inst->src[0] = brw_imm_w(_mesa_float_to_half(v1 +
> > v2));
> > +               break;
> > +            }
> > +            case BRW_REGISTER_TYPE_W: {
> > +               int16_t v1 = inst->src[0].ud & 0xffffu;
> > +               int16_t v2 = inst->src[1].ud & 0xffffu;
> > +               inst->src[0] = brw_imm_w(v1 + v2);
> > +               break;
> > +            }
> > +            case BRW_REGISTER_TYPE_UW: {
> > +               uint16_t v1 = inst->src[0].ud & 0xffffu;
> > +               uint16_t v2 = inst->src[1].ud & 0xffffu;
> > +               inst->src[0] = brw_imm_uw(v1 + v2);
> > +               break;
> > +            }
> > +            case BRW_REGISTER_TYPE_F:
> > +               inst->src[0].f += inst->src[1].f;
> > +               break;
> > +            case BRW_REGISTER_TYPE_D:
> > +               inst->src[0].d += inst->src[1].d;
> > +               break;
> > +            case BRW_REGISTER_TYPE_UD:
> > +               inst->src[0].ud += inst->src[1].ud;
> > +               break;
> > +            default:
> > +               local_progress = false;
> > +               break;
> > +            };
> > +
> > +            if (local_progress) {
> > +               inst->opcode = BRW_OPCODE_MOV;
> > +               inst->src[1] = reg_undef;
> > +               progress = true;
> > +               break;
> > +            }
> > +         }
> > +
> > +         /* a + 0 = a (this is not exact for floating point) */
> > +         if (inst->src[1].is_zero() &&
> > +             brw_reg_type_is_integer(inst->src[1].type)) {
> >              inst->opcode = BRW_OPCODE_MOV;
> > -            inst->src[0].f += inst->src[1].f;
> >              inst->src[1] = reg_undef;
> >              progress = true;
> >              break;
> >           }
> > +
> > +         if (inst->src[0].is_zero() &&
> > +             brw_reg_type_is_integer(inst->src[0].type)) {
> > +            inst->opcode = BRW_OPCODE_MOV;
> > +            inst->src[0] = inst->src[1];
> > +            inst->src[1] = reg_undef;
> > +            progress = true;
> > +            break;
> > +         }
> > +         break;
> > +      case BRW_OPCODE_SHL:
> > +         if (inst->src[0].file == IMM && inst->src[1].file == IMM)
> > {
> > +            bool local_progress = true;
> > +            switch (inst->src[0].type) {
> > +            case BRW_REGISTER_TYPE_D:
> > +            case BRW_REGISTER_TYPE_UD:
> > +               inst->src[0].ud <<= inst->src[1].ud;
> > +               break;
> 
> There's something fishy going on here.  In one of the less hurt
> Unigine
> shaders, the before code was:
> 
> mov(1)          g2<1>UD         0x00000001UD                    {
> align1 WE_all 1N compacted };
> mov(8)          g126<1>UD       g1<8,8,1>UD                     {
> align1 WE_all 1Q compacted };
> shl(8)          g127<1>UD       g2<0,1,0>UD     0xffffffffUD    {
> align1 1Q compacted };
> 
> and the after code is
> 
> mov(8)          g126<1>UD       g1<8,8,1>UD                     {
> align1 WE_all 1Q compacted };
> mov(8)          g127<1>UD       0x00000008UD                    {
> align1 WE_all 1Q compacted };
> 
> That doesn't seem right.  I thought 1U << 0xffffffffU should be 1u <<
> 31
> = 0x80000000.  Since Gen explicitly only uses the low 5 bits of src1,
> maybe we need "& 0x1f" around all the shift counts?  I thought the
> CPU
> did the same thing...
> 
> Does this series pass the CI?
> 
> > +            case BRW_REGISTER_TYPE_W:
> > +            case BRW_REGISTER_TYPE_UW: {
> > +               uint16_t v1 = inst->src[0].ud & 0xffffu;
> > +               uint16_t v2 = inst->src[1].ud & 0xffffu;
> > +               inst->src[0] = retype(brw_imm_uw(v1 << v2), inst-
> > >src[0].type);
> > +               break;
> > +            }
> > +            default:
> > +               local_progress = false;
> > +               break;
> > +            }
> > +            if (local_progress) {
> > +               inst->opcode = BRW_OPCODE_MOV;
> > +               inst->src[1] = reg_undef;
> > +               progress = true;
> > +               break;
> > +            }
> > +         }
> > +         break;
> > +      case BRW_OPCODE_SHR:
> > +         if (inst->src[0].file == IMM && inst->src[1].file == IMM)
> > {
> > +            bool local_progress = true;
> > +            switch (inst->src[0].type) {
> > +            case BRW_REGISTER_TYPE_D:
> > +               inst->src[0].d >>= inst->src[1].ud;
> > +               break;
> > +            case BRW_REGISTER_TYPE_UD:
> > +               inst->src[0].ud >>= inst->src[1].ud;
> > +               break;
> > +            case BRW_REGISTER_TYPE_W: {
> > +               int16_t v1 = inst->src[0].ud & 0xffffu;
> > +               uint16_t v2 = inst->src[1].ud & 0xffffu;
> > +               inst->src[0] = brw_imm_w(v1 >> v2);
> > +               break;
> > +            }
> > +            case BRW_REGISTER_TYPE_UW: {
> > +               uint16_t v1 = inst->src[0].ud & 0xffffu;
> > +               uint16_t v2 = inst->src[1].ud & 0xffffu;
> > +               inst->src[0] = brw_imm_uw(v1 >> v2);
> > +               break;
> > +            }
> > +            default:
> > +               local_progress = false;
> > +               break;
> > +            }
> > +            if (local_progress) {
> > +               inst->opcode = BRW_OPCODE_MOV;
> > +               inst->src[1] = reg_undef;
> > +               progress = true;
> > +               break;
> > +            }
> > +         }
> >           break;
> >        case BRW_OPCODE_OR:
> >           if (inst->src[0].equals(inst->src[1]) ||
> > 
> 
>
On 2/28/19 4:47 AM, Iago Toral wrote:
> On Wed, 2019-02-27 at 17:04 -0800, Ian Romanick wrote:
>> On 2/27/19 4:45 AM, Iago Toral Quiroga wrote:
>>> Now that we propagate constants to the first source of 2src
>>> instructions we
>>> see more opportunities of constant folding in the backend.
>>>
>>> Shader-db results on KBL:
>>>
>>> total instructions in shared programs: 14965607 -> 14855983 (-
>>> 0.73%)
>>> instructions in affected programs: 3988102 -> 3878478 (-2.75%)
>>> helped: 14292
>>> HURT: 59
>>>
>>> total cycles in shared programs: 344324295 -> 340656008 (-1.07%)
>>> cycles in affected programs: 247527740 -> 243859453 (-1.48%)
>>> helped: 14056
>>> HURT: 3314
>>>
>>> total loops in shared programs: 4283 -> 4283 (0.00%)
>>> loops in affected programs: 0 -> 0
>>> helped: 0
>>> HURT: 0
>>>
>>> total spills in shared programs: 27812 -> 24350 (-12.45%)
>>> spills in affected programs: 24921 -> 21459 (-13.89%)
>>> helped: 345
>>> HURT: 19
>>>
>>> total fills in shared programs: 24173 -> 22032 (-8.86%)
>>> fills in affected programs: 21124 -> 18983 (-10.14%)
>>> helped: 355
>>> HURT: 25
>>
>> Ignore my previous questions about nir_opt_constant_folding after
>> nir_opt_algebraic_late.  I had done that because I added a bunch of
>> things to nir_opt_algebraic_late that created my constant folding
>> opportunities.
>>
>> This is the combined changes for this patch and the previous
>> patch.  For
>> this patch alone, I got:
>>
>> total instructions in shared programs: 15306213 -> 15221518 (-0.55%)
>> instructions in affected programs: 2911451 -> 2826756 (-2.91%)
>> helped: 13121
>> HURT: 44
>> helped stats (abs) min: 1 max: 51 x̄: 6.66 x̃: 6
>> helped stats (rel) min: <.01% max: 16.67% x̄: 4.27% x̃: 3.30%
>> HURT stats (abs)   min: 3 max: 453 x̄: 61.16 x̃: 5
>> HURT stats (rel)   min: 0.20% max: 151.00% x̄: 31.57% x̃: 19.23%
>> 95% mean confidence interval for instructions value: -6.61 -6.26
>> 95% mean confidence interval for instructions %-change: -4.23% -4.07%
>> Instructions are helped.
>>
>> total cycles in shared programs: 375419164 -> 372829148 (-0.69%)
>> cycles in affected programs: 146769299 -> 144179283 (-1.76%)
>> helped: 10992
>> HURT: 1833
>> helped stats (abs) min: 1 max: 56127 x̄: 250.29 x̃: 18
>> helped stats (rel) min: <.01% max: 40.52% x̄: 3.11% x̃: 2.58%
>> HURT stats (abs)   min: 1 max: 1718 x̄: 87.93 x̃: 42
>> HURT stats (rel)   min: <.01% max: 139.33% x̄: 7.74% x̃: 3.08%
>> 95% mean confidence interval for cycles value: -248.21 -155.69
>> 95% mean confidence interval for cycles %-change: -1.67% -1.44%
>> Cycles are helped.
>>
>> total spills in shared programs: 28828 -> 28888 (0.21%)
>> spills in affected programs: 2037 -> 2097 (2.95%)
>> helped: 0
>> HURT: 24
>>
>> total fills in shared programs: 35542 -> 35639 (0.27%)
>> fills in affected programs: 3078 -> 3175 (3.15%)
>> helped: 2
>> HURT: 26
>>
>> I decided to look at some of the hurt shaders... it looks like some
>> of
>> the Unigine geometry shaders really took a beating (+150%
>> instructions).
>> Note the "max" in the "instructions in affected programs" above.
> 
> I am seeing quite different results on my KBL laptop:
> 
> total instructions in shared programs: 14945933 -> 14858158 (-0.59%)
> instructions in affected programs: 2842901 -> 2755126 (-3.09%)
> helped: 13196
> HURT: 5
> 
> instructions HURT:   shaders/closed/steam/deus-ex-mankind-
> divided/274.shader_test CS SIMD8: 1535 -> 1538 (0.20%)
> instructions HURT:   shaders/closed/steam/deus-ex-mankind-
> divided/184.shader_test CS SIMD8: 1535 -> 1538 (0.20%)
> instructions HURT:   shaders/dolphin/ubershaders/147.shader_test FS
> SIMD8: 3481 -> 3491 (0.29%)
> instructions HURT:   shaders/dolphin/ubershaders/156.shader_test FS
> SIMD8: 3465 -> 3475 (0.29%)
> instructions HURT:   shaders/dolphin/ubershaders/138.shader_test FS
> SIMD8: 3465 -> 3475 (0.29%)
> 
> Did you test on a different gen? Can you paste here the paths of some
> of the GS shaders where you see the big regressions so I can verify I
> have them in my shader-db?
> 
> Also, how did you test this patch exactly? When I was going to capture
> the reference shader-db results for patch 2 in this series so I could
> extract the results for patch 3 by comparing against it, I noticed that
> patch 2 would create constant folding scenarios (for example for ADD
> and MUL) that, before this patch, would hit an assertion in the driver
> since the algebraic pass only expects to find these opportunities for F
> types and will assert on that, so I guess you noticed this and fixed it
> before taking your numbers?

I ran it through my usual shader-db gauntlet that runs shader-db at each
commit for SKL, BDW, HSW, IVB, SNB, ILK, and GM45.  *But* since one pass
of that takes a really, really long time, I only run release builds with
-march=native and all the other tricks.  None of the assertions would
exist in that run.

If patch 2 creates possible assertion failures, the two patches should
probably be re-ordered or the previous patch should do something to
avoid the assertions.  Some day, someone will hit that patch in a
bisect, and they will be sad.

What I think happened is the previous patch helped a bunch of shaders by
breaking them, and this patch fixes them... making them "worse."  When I
look at the total across both patches, I get results much more similar
to yours, so this was a false alarm.

Skylake
total instructions in shared programs: 15328886 -> 15219343 (-0.71%)
instructions in affected programs: 3999638 -> 3890095 (-2.74%)
helped: 14311
HURT: 65
helped stats (abs) min: 1 max: 502 x̄: 7.67 x̃: 4
helped stats (rel) min: 0.04% max: 26.58% x̄: 4.07% x̃: 3.05%
HURT stats (abs)   min: 1 max: 22 x̄: 2.35 x̃: 1
HURT stats (rel)   min: 0.06% max: 1.96% x̄: 0.42% x̃: 0.34%
95% mean confidence interval for instructions value: -7.88 -7.36
95% mean confidence interval for instructions %-change: -4.10% -3.99%
Instructions are helped.

total cycles in shared programs: 376419186 -> 372742630 (-0.98%)
cycles in affected programs: 277719712 -> 274043156 (-1.32%)
helped: 14204
HURT: 3270
helped stats (abs) min: 1 max: 118186 x̄: 278.57 x̃: 17
helped stats (rel) min: <.01% max: 40.52% x̄: 2.71% x̃: 2.14%
HURT stats (abs)   min: 1 max: 5031 x̄: 85.69 x̃: 26
HURT stats (rel)   min: <.01% max: 65.74% x̄: 6.12% x̃: 2.00%
95% mean confidence interval for cycles value: -247.21 -173.60
95% mean confidence interval for cycles %-change: -1.15% -0.96%
Cycles are helped.

total spills in shared programs: 32328 -> 28888 (-10.64%)
spills in affected programs: 26262 -> 22822 (-13.10%)
helped: 345
HURT: 17

total fills in shared programs: 37768 -> 35639 (-5.64%)
fills in affected programs: 22394 -> 20265 (-9.51%)
helped: 354
HURT: 24

LOST:   0
GAINED: 5


>> More comments below by SHL...
>>
>>> LOST:   0
>>> GAINED: 5
>>> ---
>>>  src/intel/compiler/brw_fs.cpp | 203
>>> ++++++++++++++++++++++++++++++++--
>>>  1 file changed, 195 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/src/intel/compiler/brw_fs.cpp
>>> b/src/intel/compiler/brw_fs.cpp
>>> index 2358acbeb59..b2b60237c82 100644
>>> --- a/src/intel/compiler/brw_fs.cpp
>>> +++ b/src/intel/compiler/brw_fs.cpp
>>> @@ -2583,9 +2583,55 @@ fs_visitor::opt_algebraic()
>>>           break;
>>>  
>>>        case BRW_OPCODE_MUL:
>>> -         if (inst->src[1].file != IMM)
>>> +         if (inst->src[0].file != IMM && inst->src[1].file != IMM)
>>>              continue;
>>>  
>>> +         /* Constant folding */
>>> +         if (inst->src[0].file == IMM && inst->src[1].file == IMM)
>>> {
>>> +            assert(inst->src[0].type == inst->src[1].type);
>>> +            bool local_progress = true;
>>> +            switch (inst->src[0].type) {
>>> +            case BRW_REGISTER_TYPE_HF: {
>>> +               float v1 = _mesa_half_to_float(inst->src[0].ud &
>>> 0xffffu);
>>> +               float v2 = _mesa_half_to_float(inst->src[1].ud &
>>> 0xffffu);
>>> +               inst->src[0] = brw_imm_w(_mesa_float_to_half(v1 *
>>> v2));
>>> +               break;
>>> +            }
>>> +            case BRW_REGISTER_TYPE_W: {
>>> +               int16_t v1 = inst->src[0].ud & 0xffffu;
>>> +               int16_t v2 = inst->src[1].ud & 0xffffu;
>>> +               inst->src[0] = brw_imm_w(v1 * v2);
>>> +               break;
>>> +            }
>>> +            case BRW_REGISTER_TYPE_UW: {
>>> +               uint16_t v1 = inst->src[0].ud & 0xffffu;
>>> +               uint16_t v2 = inst->src[1].ud & 0xffffu;
>>> +               inst->src[0] = brw_imm_uw(v1 * v2);
>>> +               break;
>>> +            }
>>> +            case BRW_REGISTER_TYPE_F:
>>> +               inst->src[0].f *= inst->src[1].f;
>>> +               break;
>>> +            case BRW_REGISTER_TYPE_D:
>>> +               inst->src[0].d *= inst->src[1].d;
>>> +               break;
>>> +            case BRW_REGISTER_TYPE_UD:
>>> +               inst->src[0].ud *= inst->src[1].ud;
>>> +               break;
>>> +            default:
>>> +               local_progress = false;
>>> +               break;
>>> +            };
>>> +
>>> +            if (local_progress) {
>>> +               inst->opcode = BRW_OPCODE_MOV;
>>> +               inst->src[1] = reg_undef;
>>> +               progress = true;
>>> +               break;
>>> +            }
>>> +         }
>>> +
>>> +
>>>           /* a * 1.0 = a */
>>>           if (inst->src[1].is_one()) {
>>>              inst->opcode = BRW_OPCODE_MOV;
>>> @@ -2594,6 +2640,14 @@ fs_visitor::opt_algebraic()
>>>              break;
>>>           }
>>>  
>>> +         if (inst->src[0].is_one()) {
>>> +            inst->opcode = BRW_OPCODE_MOV;
>>> +            inst->src[0] = inst->src[1];
>>> +            inst->src[1] = reg_undef;
>>> +            progress = true;
>>> +            break;
>>> +         }
>>> +
>>>           /* a * -1.0 = -a */
>>>           if (inst->src[1].is_negative_one()) {
>>>              inst->opcode = BRW_OPCODE_MOV;
>>> @@ -2603,27 +2657,160 @@ fs_visitor::opt_algebraic()
>>>              break;
>>>           }
>>>  
>>> -         if (inst->src[0].file == IMM) {
>>> -            assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
>>> +         if (inst->src[0].is_negative_one()) {
>>> +            inst->opcode = BRW_OPCODE_MOV;
>>> +            inst->src[0] = inst->src[1];
>>> +            inst->src[0].negate = !inst->src[1].negate;
>>> +            inst->src[1] = reg_undef;
>>> +            progress = true;
>>> +            break;
>>> +         }
>>> +
>>> +         /* a * 0 = 0 (this is not exact for floating point) */
>>> +         if (inst->src[1].is_zero() &&
>>> +             brw_reg_type_is_integer(inst->src[1].type)) {
>>> +            inst->opcode = BRW_OPCODE_MOV;
>>> +            inst->src[0] = inst->src[1];
>>> +            inst->src[1] = reg_undef;
>>> +            progress = true;
>>> +            break;
>>> +         }
>>> +
>>> +         if (inst->src[0].is_zero() &&
>>> +             brw_reg_type_is_integer(inst->src[0].type)) {
>>>              inst->opcode = BRW_OPCODE_MOV;
>>> -            inst->src[0].f *= inst->src[1].f;
>>>              inst->src[1] = reg_undef;
>>>              progress = true;
>>>              break;
>>>           }
>>>           break;
>>>        case BRW_OPCODE_ADD:
>>> -         if (inst->src[1].file != IMM)
>>> +         if (inst->src[0].file != IMM && inst->src[1].file != IMM)
>>>              continue;
>>>  
>>> -         if (inst->src[0].file == IMM) {
>>> -            assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
>>> +         /* Constant folding */
>>> +         if (inst->src[0].file == IMM && inst->src[1].file == IMM)
>>> {
>>> +            assert(inst->src[0].type == inst->src[1].type);
>>> +            bool local_progress = true;
>>> +            switch (inst->src[0].type) {
>>> +            case BRW_REGISTER_TYPE_HF: {
>>> +               float v1 = _mesa_half_to_float(inst->src[0].ud &
>>> 0xffffu);
>>> +               float v2 = _mesa_half_to_float(inst->src[1].ud &
>>> 0xffffu);
>>> +               inst->src[0] = brw_imm_w(_mesa_float_to_half(v1 +
>>> v2));
>>> +               break;
>>> +            }
>>> +            case BRW_REGISTER_TYPE_W: {
>>> +               int16_t v1 = inst->src[0].ud & 0xffffu;
>>> +               int16_t v2 = inst->src[1].ud & 0xffffu;
>>> +               inst->src[0] = brw_imm_w(v1 + v2);
>>> +               break;
>>> +            }
>>> +            case BRW_REGISTER_TYPE_UW: {
>>> +               uint16_t v1 = inst->src[0].ud & 0xffffu;
>>> +               uint16_t v2 = inst->src[1].ud & 0xffffu;
>>> +               inst->src[0] = brw_imm_uw(v1 + v2);
>>> +               break;
>>> +            }
>>> +            case BRW_REGISTER_TYPE_F:
>>> +               inst->src[0].f += inst->src[1].f;
>>> +               break;
>>> +            case BRW_REGISTER_TYPE_D:
>>> +               inst->src[0].d += inst->src[1].d;
>>> +               break;
>>> +            case BRW_REGISTER_TYPE_UD:
>>> +               inst->src[0].ud += inst->src[1].ud;
>>> +               break;
>>> +            default:
>>> +               local_progress = false;
>>> +               break;
>>> +            };
>>> +
>>> +            if (local_progress) {
>>> +               inst->opcode = BRW_OPCODE_MOV;
>>> +               inst->src[1] = reg_undef;
>>> +               progress = true;
>>> +               break;
>>> +            }
>>> +         }
>>> +
>>> +         /* a + 0 = a (this is not exact for floating point) */
>>> +         if (inst->src[1].is_zero() &&
>>> +             brw_reg_type_is_integer(inst->src[1].type)) {
>>>              inst->opcode = BRW_OPCODE_MOV;
>>> -            inst->src[0].f += inst->src[1].f;
>>>              inst->src[1] = reg_undef;
>>>              progress = true;
>>>              break;
>>>           }
>>> +
>>> +         if (inst->src[0].is_zero() &&
>>> +             brw_reg_type_is_integer(inst->src[0].type)) {
>>> +            inst->opcode = BRW_OPCODE_MOV;
>>> +            inst->src[0] = inst->src[1];
>>> +            inst->src[1] = reg_undef;
>>> +            progress = true;
>>> +            break;
>>> +         }
>>> +         break;
>>> +      case BRW_OPCODE_SHL:
>>> +         if (inst->src[0].file == IMM && inst->src[1].file == IMM)
>>> {
>>> +            bool local_progress = true;
>>> +            switch (inst->src[0].type) {
>>> +            case BRW_REGISTER_TYPE_D:
>>> +            case BRW_REGISTER_TYPE_UD:
>>> +               inst->src[0].ud <<= inst->src[1].ud;
>>> +               break;
>>
>> There's something fishy going on here.  In one of the less hurt
>> Unigine
>> shaders, the before code was:
>>
>> mov(1)          g2<1>UD         0x00000001UD                    {
>> align1 WE_all 1N compacted };
>> mov(8)          g126<1>UD       g1<8,8,1>UD                     {
>> align1 WE_all 1Q compacted };
>> shl(8)          g127<1>UD       g2<0,1,0>UD     0xffffffffUD    {
>> align1 1Q compacted };
>>
>> and the after code is
>>
>> mov(8)          g126<1>UD       g1<8,8,1>UD                     {
>> align1 WE_all 1Q compacted };
>> mov(8)          g127<1>UD       0x00000008UD                    {
>> align1 WE_all 1Q compacted };
>>
>> That doesn't seem right.  I thought 1U << 0xffffffffU should be 1u <<
>> 31
>> = 0x80000000.  Since Gen explicitly only uses the low 5 bits of src1,
>> maybe we need "& 0x1f" around all the shift counts?  I thought the
>> CPU
>> did the same thing...
>>
>> Does this series pass the CI?
>>
>>> +            case BRW_REGISTER_TYPE_W:
>>> +            case BRW_REGISTER_TYPE_UW: {
>>> +               uint16_t v1 = inst->src[0].ud & 0xffffu;
>>> +               uint16_t v2 = inst->src[1].ud & 0xffffu;
>>> +               inst->src[0] = retype(brw_imm_uw(v1 << v2), inst-
>>>> src[0].type);
>>> +               break;
>>> +            }
>>> +            default:
>>> +               local_progress = false;
>>> +               break;
>>> +            }
>>> +            if (local_progress) {
>>> +               inst->opcode = BRW_OPCODE_MOV;
>>> +               inst->src[1] = reg_undef;
>>> +               progress = true;
>>> +               break;
>>> +            }
>>> +         }
>>> +         break;
>>> +      case BRW_OPCODE_SHR:
>>> +         if (inst->src[0].file == IMM && inst->src[1].file == IMM)
>>> {
>>> +            bool local_progress = true;
>>> +            switch (inst->src[0].type) {
>>> +            case BRW_REGISTER_TYPE_D:
>>> +               inst->src[0].d >>= inst->src[1].ud;
>>> +               break;
>>> +            case BRW_REGISTER_TYPE_UD:
>>> +               inst->src[0].ud >>= inst->src[1].ud;
>>> +               break;
>>> +            case BRW_REGISTER_TYPE_W: {
>>> +               int16_t v1 = inst->src[0].ud & 0xffffu;
>>> +               uint16_t v2 = inst->src[1].ud & 0xffffu;
>>> +               inst->src[0] = brw_imm_w(v1 >> v2);
>>> +               break;
>>> +            }
>>> +            case BRW_REGISTER_TYPE_UW: {
>>> +               uint16_t v1 = inst->src[0].ud & 0xffffu;
>>> +               uint16_t v2 = inst->src[1].ud & 0xffffu;
>>> +               inst->src[0] = brw_imm_uw(v1 >> v2);
>>> +               break;
>>> +            }
>>> +            default:
>>> +               local_progress = false;
>>> +               break;
>>> +            }
>>> +            if (local_progress) {
>>> +               inst->opcode = BRW_OPCODE_MOV;
>>> +               inst->src[1] = reg_undef;
>>> +               progress = true;
>>> +               break;
>>> +            }
>>> +         }
>>>           break;
>>>        case BRW_OPCODE_OR:
>>>           if (inst->src[0].equals(inst->src[1]) ||
On Thu, 2019-02-28 at 16:20 -0800, Ian Romanick wrote:
> On 2/28/19 4:47 AM, Iago Toral wrote:
> > On Wed, 2019-02-27 at 17:04 -0800, Ian Romanick wrote:
> > > On 2/27/19 4:45 AM, Iago Toral Quiroga wrote:
> > > > Now that we propagate constants to the first source of 2src
> > > > instructions we
> > > > see more opportunities of constant folding in the backend.
> > > > 
> > > > Shader-db results on KBL:
> > > > 
> > > > total instructions in shared programs: 14965607 -> 14855983 (-
> > > > 0.73%)
> > > > instructions in affected programs: 3988102 -> 3878478 (-2.75%)
> > > > helped: 14292
> > > > HURT: 59
> > > > 
> > > > total cycles in shared programs: 344324295 -> 340656008 (-
> > > > 1.07%)
> > > > cycles in affected programs: 247527740 -> 243859453 (-1.48%)
> > > > helped: 14056
> > > > HURT: 3314
> > > > 
> > > > total loops in shared programs: 4283 -> 4283 (0.00%)
> > > > loops in affected programs: 0 -> 0
> > > > helped: 0
> > > > HURT: 0
> > > > 
> > > > total spills in shared programs: 27812 -> 24350 (-12.45%)
> > > > spills in affected programs: 24921 -> 21459 (-13.89%)
> > > > helped: 345
> > > > HURT: 19
> > > > 
> > > > total fills in shared programs: 24173 -> 22032 (-8.86%)
> > > > fills in affected programs: 21124 -> 18983 (-10.14%)
> > > > helped: 355
> > > > HURT: 25
> > > 
> > > Ignore my previous questions about nir_opt_constant_folding after
> > > nir_opt_algebraic_late.  I had done that because I added a bunch
> > > of
> > > things to nir_opt_algebraic_late that created my constant folding
> > > opportunities.
> > > 
> > > This is the combined changes for this patch and the previous
> > > patch.  For
> > > this patch alone, I got:
> > > 
> > > total instructions in shared programs: 15306213 -> 15221518 (-
> > > 0.55%)
> > > instructions in affected programs: 2911451 -> 2826756 (-2.91%)
> > > helped: 13121
> > > HURT: 44
> > > helped stats (abs) min: 1 max: 51 x̄: 6.66 x̃: 6
> > > helped stats (rel) min: <.01% max: 16.67% x̄: 4.27% x̃: 3.30%
> > > HURT stats (abs)   min: 3 max: 453 x̄: 61.16 x̃: 5
> > > HURT stats (rel)   min: 0.20% max: 151.00% x̄: 31.57% x̃: 19.23%
> > > 95% mean confidence interval for instructions value: -6.61 -6.26
> > > 95% mean confidence interval for instructions %-change: -4.23%
> > > -4.07%
> > > Instructions are helped.
> > > 
> > > total cycles in shared programs: 375419164 -> 372829148 (-0.69%)
> > > cycles in affected programs: 146769299 -> 144179283 (-1.76%)
> > > helped: 10992
> > > HURT: 1833
> > > helped stats (abs) min: 1 max: 56127 x̄: 250.29 x̃: 18
> > > helped stats (rel) min: <.01% max: 40.52% x̄: 3.11% x̃: 2.58%
> > > HURT stats (abs)   min: 1 max: 1718 x̄: 87.93 x̃: 42
> > > HURT stats (rel)   min: <.01% max: 139.33% x̄: 7.74% x̃: 3.08%
> > > 95% mean confidence interval for cycles value: -248.21 -155.69
> > > 95% mean confidence interval for cycles %-change: -1.67% -1.44%
> > > Cycles are helped.
> > > 
> > > total spills in shared programs: 28828 -> 28888 (0.21%)
> > > spills in affected programs: 2037 -> 2097 (2.95%)
> > > helped: 0
> > > HURT: 24
> > > 
> > > total fills in shared programs: 35542 -> 35639 (0.27%)
> > > fills in affected programs: 3078 -> 3175 (3.15%)
> > > helped: 2
> > > HURT: 26
> > > 
> > > I decided to look at some of the hurt shaders... it looks like
> > > some
> > > of
> > > the Unigine geometry shaders really took a beating (+150%
> > > instructions).
> > > Note the "max" in the "instructions in affected programs" above.
> > 
> > I am seeing quite different results on my KBL laptop:
> > 
> > total instructions in shared programs: 14945933 -> 14858158 (-
> > 0.59%)
> > instructions in affected programs: 2842901 -> 2755126 (-3.09%)
> > helped: 13196
> > HURT: 5
> > 
> > instructions HURT:   shaders/closed/steam/deus-ex-mankind-
> > divided/274.shader_test CS SIMD8: 1535 -> 1538 (0.20%)
> > instructions HURT:   shaders/closed/steam/deus-ex-mankind-
> > divided/184.shader_test CS SIMD8: 1535 -> 1538 (0.20%)
> > instructions HURT:   shaders/dolphin/ubershaders/147.shader_test FS
> > SIMD8: 3481 -> 3491 (0.29%)
> > instructions HURT:   shaders/dolphin/ubershaders/156.shader_test FS
> > SIMD8: 3465 -> 3475 (0.29%)
> > instructions HURT:   shaders/dolphin/ubershaders/138.shader_test FS
> > SIMD8: 3465 -> 3475 (0.29%)
> > 
> > Did you test on a different gen? Can you paste here the paths of
> > some
> > of the GS shaders where you see the big regressions so I can verify
> > I
> > have them in my shader-db?
> > 
> > Also, how did you test this patch exactly? When I was going to
> > capture
> > the reference shader-db results for patch 2 in this series so I
> > could
> > extract the results for patch 3 by comparing against it, I noticed
> > that
> > patch 2 would create constant folding scenarios (for example for
> > ADD
> > and MUL) that, before this patch, would hit an assertion in the
> > driver
> > since the algebraic pass only expects to find these opportunities
> > for F
> > types and will assert on that, so I guess you noticed this and
> > fixed it
> > before taking your numbers?
> 
> I ran it through my usual shader-db gauntlet that runs shader-db at
> each
> commit for SKL, BDW, HSW, IVB, SNB, ILK, and GM45.  *But* since one
> pass
> of that takes a really, really long time, I only run release builds
> with
> -march=native and all the other tricks.  None of the assertions would
> exist in that run.
> 
> If patch 2 creates possible assertion failures, the two patches
> should
> probably be re-ordered or the previous patch should do something to
> avoid the assertions.  Some day, someone will hit that patch in a
> bisect, and they will be sad.

Yes, absolutely. I have already fixed that locally. I'll send a v2 for
review as soon as I verify the series in CI again.

Iago

> What I think happened is the previous patch helped a bunch of shaders
> by
> breaking them, and this patch fixes them... making them
> "worse."  When I
> look at the total across both patches, I get results much more
> similar
> to yours, so this was a false alarm.
> 
> Skylake
> total instructions in shared programs: 15328886 -> 15219343 (-0.71%)
> instructions in affected programs: 3999638 -> 3890095 (-2.74%)
> helped: 14311
> HURT: 65
> helped stats (abs) min: 1 max: 502 x̄: 7.67 x̃: 4
> helped stats (rel) min: 0.04% max: 26.58% x̄: 4.07% x̃: 3.05%
> HURT stats (abs)   min: 1 max: 22 x̄: 2.35 x̃: 1
> HURT stats (rel)   min: 0.06% max: 1.96% x̄: 0.42% x̃: 0.34%
> 95% mean confidence interval for instructions value: -7.88 -7.36
> 95% mean confidence interval for instructions %-change: -4.10% -3.99%
> Instructions are helped.
> 
> total cycles in shared programs: 376419186 -> 372742630 (-0.98%)
> cycles in affected programs: 277719712 -> 274043156 (-1.32%)
> helped: 14204
> HURT: 3270
> helped stats (abs) min: 1 max: 118186 x̄: 278.57 x̃: 17
> helped stats (rel) min: <.01% max: 40.52% x̄: 2.71% x̃: 2.14%
> HURT stats (abs)   min: 1 max: 5031 x̄: 85.69 x̃: 26
> HURT stats (rel)   min: <.01% max: 65.74% x̄: 6.12% x̃: 2.00%
> 95% mean confidence interval for cycles value: -247.21 -173.60
> 95% mean confidence interval for cycles %-change: -1.15% -0.96%
> Cycles are helped.
> 
> total spills in shared programs: 32328 -> 28888 (-10.64%)
> spills in affected programs: 26262 -> 22822 (-13.10%)
> helped: 345
> HURT: 17
> 
> total fills in shared programs: 37768 -> 35639 (-5.64%)
> fills in affected programs: 22394 -> 20265 (-9.51%)
> helped: 354
> HURT: 24
> 
> LOST:   0
> GAINED: 5
> 
> 
> > > More comments below by SHL...
> > > 
> > > > LOST:   0
> > > > GAINED: 5
> > > > ---
> > > >  src/intel/compiler/brw_fs.cpp | 203
> > > > ++++++++++++++++++++++++++++++++--
> > > >  1 file changed, 195 insertions(+), 8 deletions(-)
> > > > 
> > > > diff --git a/src/intel/compiler/brw_fs.cpp
> > > > b/src/intel/compiler/brw_fs.cpp
> > > > index 2358acbeb59..b2b60237c82 100644
> > > > --- a/src/intel/compiler/brw_fs.cpp
> > > > +++ b/src/intel/compiler/brw_fs.cpp
> > > > @@ -2583,9 +2583,55 @@ fs_visitor::opt_algebraic()
> > > >           break;
> > > >  
> > > >        case BRW_OPCODE_MUL:
> > > > -         if (inst->src[1].file != IMM)
> > > > +         if (inst->src[0].file != IMM && inst->src[1].file !=
> > > > IMM)
> > > >              continue;
> > > >  
> > > > +         /* Constant folding */
> > > > +         if (inst->src[0].file == IMM && inst->src[1].file ==
> > > > IMM)
> > > > {
> > > > +            assert(inst->src[0].type == inst->src[1].type);
> > > > +            bool local_progress = true;
> > > > +            switch (inst->src[0].type) {
> > > > +            case BRW_REGISTER_TYPE_HF: {
> > > > +               float v1 = _mesa_half_to_float(inst->src[0].ud
> > > > &
> > > > 0xffffu);
> > > > +               float v2 = _mesa_half_to_float(inst->src[1].ud
> > > > &
> > > > 0xffffu);
> > > > +               inst->src[0] = brw_imm_w(_mesa_float_to_half(v1
> > > > *
> > > > v2));
> > > > +               break;
> > > > +            }
> > > > +            case BRW_REGISTER_TYPE_W: {
> > > > +               int16_t v1 = inst->src[0].ud & 0xffffu;
> > > > +               int16_t v2 = inst->src[1].ud & 0xffffu;
> > > > +               inst->src[0] = brw_imm_w(v1 * v2);
> > > > +               break;
> > > > +            }
> > > > +            case BRW_REGISTER_TYPE_UW: {
> > > > +               uint16_t v1 = inst->src[0].ud & 0xffffu;
> > > > +               uint16_t v2 = inst->src[1].ud & 0xffffu;
> > > > +               inst->src[0] = brw_imm_uw(v1 * v2);
> > > > +               break;
> > > > +            }
> > > > +            case BRW_REGISTER_TYPE_F:
> > > > +               inst->src[0].f *= inst->src[1].f;
> > > > +               break;
> > > > +            case BRW_REGISTER_TYPE_D:
> > > > +               inst->src[0].d *= inst->src[1].d;
> > > > +               break;
> > > > +            case BRW_REGISTER_TYPE_UD:
> > > > +               inst->src[0].ud *= inst->src[1].ud;
> > > > +               break;
> > > > +            default:
> > > > +               local_progress = false;
> > > > +               break;
> > > > +            };
> > > > +
> > > > +            if (local_progress) {
> > > > +               inst->opcode = BRW_OPCODE_MOV;
> > > > +               inst->src[1] = reg_undef;
> > > > +               progress = true;
> > > > +               break;
> > > > +            }
> > > > +         }
> > > > +
> > > > +
> > > >           /* a * 1.0 = a */
> > > >           if (inst->src[1].is_one()) {
> > > >              inst->opcode = BRW_OPCODE_MOV;
> > > > @@ -2594,6 +2640,14 @@ fs_visitor::opt_algebraic()
> > > >              break;
> > > >           }
> > > >  
> > > > +         if (inst->src[0].is_one()) {
> > > > +            inst->opcode = BRW_OPCODE_MOV;
> > > > +            inst->src[0] = inst->src[1];
> > > > +            inst->src[1] = reg_undef;
> > > > +            progress = true;
> > > > +            break;
> > > > +         }
> > > > +
> > > >           /* a * -1.0 = -a */
> > > >           if (inst->src[1].is_negative_one()) {
> > > >              inst->opcode = BRW_OPCODE_MOV;
> > > > @@ -2603,27 +2657,160 @@ fs_visitor::opt_algebraic()
> > > >              break;
> > > >           }
> > > >  
> > > > -         if (inst->src[0].file == IMM) {
> > > > -            assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
> > > > +         if (inst->src[0].is_negative_one()) {
> > > > +            inst->opcode = BRW_OPCODE_MOV;
> > > > +            inst->src[0] = inst->src[1];
> > > > +            inst->src[0].negate = !inst->src[1].negate;
> > > > +            inst->src[1] = reg_undef;
> > > > +            progress = true;
> > > > +            break;
> > > > +         }
> > > > +
> > > > +         /* a * 0 = 0 (this is not exact for floating point)
> > > > */
> > > > +         if (inst->src[1].is_zero() &&
> > > > +             brw_reg_type_is_integer(inst->src[1].type)) {
> > > > +            inst->opcode = BRW_OPCODE_MOV;
> > > > +            inst->src[0] = inst->src[1];
> > > > +            inst->src[1] = reg_undef;
> > > > +            progress = true;
> > > > +            break;
> > > > +         }
> > > > +
> > > > +         if (inst->src[0].is_zero() &&
> > > > +             brw_reg_type_is_integer(inst->src[0].type)) {
> > > >              inst->opcode = BRW_OPCODE_MOV;
> > > > -            inst->src[0].f *= inst->src[1].f;
> > > >              inst->src[1] = reg_undef;
> > > >              progress = true;
> > > >              break;
> > > >           }
> > > >           break;
> > > >        case BRW_OPCODE_ADD:
> > > > -         if (inst->src[1].file != IMM)
> > > > +         if (inst->src[0].file != IMM && inst->src[1].file !=
> > > > IMM)
> > > >              continue;
> > > >  
> > > > -         if (inst->src[0].file == IMM) {
> > > > -            assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
> > > > +         /* Constant folding */
> > > > +         if (inst->src[0].file == IMM && inst->src[1].file ==
> > > > IMM)
> > > > {
> > > > +            assert(inst->src[0].type == inst->src[1].type);
> > > > +            bool local_progress = true;
> > > > +            switch (inst->src[0].type) {
> > > > +            case BRW_REGISTER_TYPE_HF: {
> > > > +               float v1 = _mesa_half_to_float(inst->src[0].ud
> > > > &
> > > > 0xffffu);
> > > > +               float v2 = _mesa_half_to_float(inst->src[1].ud
> > > > &
> > > > 0xffffu);
> > > > +               inst->src[0] = brw_imm_w(_mesa_float_to_half(v1
> > > > +
> > > > v2));
> > > > +               break;
> > > > +            }
> > > > +            case BRW_REGISTER_TYPE_W: {
> > > > +               int16_t v1 = inst->src[0].ud & 0xffffu;
> > > > +               int16_t v2 = inst->src[1].ud & 0xffffu;
> > > > +               inst->src[0] = brw_imm_w(v1 + v2);
> > > > +               break;
> > > > +            }
> > > > +            case BRW_REGISTER_TYPE_UW: {
> > > > +               uint16_t v1 = inst->src[0].ud & 0xffffu;
> > > > +               uint16_t v2 = inst->src[1].ud & 0xffffu;
> > > > +               inst->src[0] = brw_imm_uw(v1 + v2);
> > > > +               break;
> > > > +            }
> > > > +            case BRW_REGISTER_TYPE_F:
> > > > +               inst->src[0].f += inst->src[1].f;
> > > > +               break;
> > > > +            case BRW_REGISTER_TYPE_D:
> > > > +               inst->src[0].d += inst->src[1].d;
> > > > +               break;
> > > > +            case BRW_REGISTER_TYPE_UD:
> > > > +               inst->src[0].ud += inst->src[1].ud;
> > > > +               break;
> > > > +            default:
> > > > +               local_progress = false;
> > > > +               break;
> > > > +            };
> > > > +
> > > > +            if (local_progress) {
> > > > +               inst->opcode = BRW_OPCODE_MOV;
> > > > +               inst->src[1] = reg_undef;
> > > > +               progress = true;
> > > > +               break;
> > > > +            }
> > > > +         }
> > > > +
> > > > +         /* a + 0 = a (this is not exact for floating point)
> > > > */
> > > > +         if (inst->src[1].is_zero() &&
> > > > +             brw_reg_type_is_integer(inst->src[1].type)) {
> > > >              inst->opcode = BRW_OPCODE_MOV;
> > > > -            inst->src[0].f += inst->src[1].f;
> > > >              inst->src[1] = reg_undef;
> > > >              progress = true;
> > > >              break;
> > > >           }
> > > > +
> > > > +         if (inst->src[0].is_zero() &&
> > > > +             brw_reg_type_is_integer(inst->src[0].type)) {
> > > > +            inst->opcode = BRW_OPCODE_MOV;
> > > > +            inst->src[0] = inst->src[1];
> > > > +            inst->src[1] = reg_undef;
> > > > +            progress = true;
> > > > +            break;
> > > > +         }
> > > > +         break;
> > > > +      case BRW_OPCODE_SHL:
> > > > +         if (inst->src[0].file == IMM && inst->src[1].file ==
> > > > IMM)
> > > > {
> > > > +            bool local_progress = true;
> > > > +            switch (inst->src[0].type) {
> > > > +            case BRW_REGISTER_TYPE_D:
> > > > +            case BRW_REGISTER_TYPE_UD:
> > > > +               inst->src[0].ud <<= inst->src[1].ud;
> > > > +               break;
> > > 
> > > There's something fishy going on here.  In one of the less hurt
> > > Unigine
> > > shaders, the before code was:
> > > 
> > > mov(1)          g2<1>UD         0x00000001UD                    {
> > > align1 WE_all 1N compacted };
> > > mov(8)          g126<1>UD       g1<8,8,1>UD                     {
> > > align1 WE_all 1Q compacted };
> > > shl(8)          g127<1>UD       g2<0,1,0>UD     0xffffffffUD    {
> > > align1 1Q compacted };
> > > 
> > > and the after code is
> > > 
> > > mov(8)          g126<1>UD       g1<8,8,1>UD                     {
> > > align1 WE_all 1Q compacted };
> > > mov(8)          g127<1>UD       0x00000008UD                    {
> > > align1 WE_all 1Q compacted };
> > > 
> > > That doesn't seem right.  I thought 1U << 0xffffffffU should be
> > > 1u <<
> > > 31
> > > = 0x80000000.  Since Gen explicitly only uses the low 5 bits of
> > > src1,
> > > maybe we need "& 0x1f" around all the shift counts?  I thought
> > > the
> > > CPU
> > > did the same thing...
> > > 
> > > Does this series pass the CI?
> > > 
> > > > +            case BRW_REGISTER_TYPE_W:
> > > > +            case BRW_REGISTER_TYPE_UW: {
> > > > +               uint16_t v1 = inst->src[0].ud & 0xffffu;
> > > > +               uint16_t v2 = inst->src[1].ud & 0xffffu;
> > > > +               inst->src[0] = retype(brw_imm_uw(v1 << v2),
> > > > inst-
> > > > > src[0].type);
> > > > 
> > > > +               break;
> > > > +            }
> > > > +            default:
> > > > +               local_progress = false;
> > > > +               break;
> > > > +            }
> > > > +            if (local_progress) {
> > > > +               inst->opcode = BRW_OPCODE_MOV;
> > > > +               inst->src[1] = reg_undef;
> > > > +               progress = true;
> > > > +               break;
> > > > +            }
> > > > +         }
> > > > +         break;
> > > > +      case BRW_OPCODE_SHR:
> > > > +         if (inst->src[0].file == IMM && inst->src[1].file ==
> > > > IMM)
> > > > {
> > > > +            bool local_progress = true;
> > > > +            switch (inst->src[0].type) {
> > > > +            case BRW_REGISTER_TYPE_D:
> > > > +               inst->src[0].d >>= inst->src[1].ud;
> > > > +               break;
> > > > +            case BRW_REGISTER_TYPE_UD:
> > > > +               inst->src[0].ud >>= inst->src[1].ud;
> > > > +               break;
> > > > +            case BRW_REGISTER_TYPE_W: {
> > > > +               int16_t v1 = inst->src[0].ud & 0xffffu;
> > > > +               uint16_t v2 = inst->src[1].ud & 0xffffu;
> > > > +               inst->src[0] = brw_imm_w(v1 >> v2);
> > > > +               break;
> > > > +            }
> > > > +            case BRW_REGISTER_TYPE_UW: {
> > > > +               uint16_t v1 = inst->src[0].ud & 0xffffu;
> > > > +               uint16_t v2 = inst->src[1].ud & 0xffffu;
> > > > +               inst->src[0] = brw_imm_uw(v1 >> v2);
> > > > +               break;
> > > > +            }
> > > > +            default:
> > > > +               local_progress = false;
> > > > +               break;
> > > > +            }
> > > > +            if (local_progress) {
> > > > +               inst->opcode = BRW_OPCODE_MOV;
> > > > +               inst->src[1] = reg_undef;
> > > > +               progress = true;
> > > > +               break;
> > > > +            }
> > > > +         }
> > > >           break;
> > > >        case BRW_OPCODE_OR:
> > > >           if (inst->src[0].equals(inst->src[1]) ||
> 
>