[v3,40/42] intel/compiler: support half-float in the combine constants pass

Submitted by Iago Toral Quiroga on Jan. 15, 2019, 1:54 p.m.

Details

Message ID 20190115135414.2313-41-itoral@igalia.com
State New
Headers show
Series "intel: VK_KHR_shader_float16_int8 implementation" ( rev: 5 4 ) in Mesa

Not browsing as part of any series.

Commit Message

Iago Toral Quiroga Jan. 15, 2019, 1:54 p.m.
Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
---
 .../compiler/brw_fs_combine_constants.cpp     | 60 +++++++++++++++----
 1 file changed, 49 insertions(+), 11 deletions(-)

Patch hide | download patch | download mbox

diff --git a/src/intel/compiler/brw_fs_combine_constants.cpp b/src/intel/compiler/brw_fs_combine_constants.cpp
index 7343f77bb45..54017e5668b 100644
--- a/src/intel/compiler/brw_fs_combine_constants.cpp
+++ b/src/intel/compiler/brw_fs_combine_constants.cpp
@@ -36,6 +36,7 @@ 
 
 #include "brw_fs.h"
 #include "brw_cfg.h"
+#include "util/half_float.h"
 
 using namespace brw;
 
@@ -114,8 +115,9 @@  struct imm {
     */
    exec_list *uses;
 
-   /** The immediate value.  We currently only handle floats. */
+   /** The immediate value.  We currently only handle float and half-float. */
    float val;
+   brw_reg_type type;
 
    /**
     * The GRF register and subregister number where we've decided to store the
@@ -145,10 +147,10 @@  struct table {
 };
 
 static struct imm *
-find_imm(struct table *table, float val)
+find_imm(struct table *table, float val, brw_reg_type type)
 {
    for (int i = 0; i < table->len; i++) {
-      if (table->imm[i].val == val) {
+      if (table->imm[i].val == val && table->imm[i].type == type) {
          return &table->imm[i];
       }
    }
@@ -190,6 +192,20 @@  compare(const void *_a, const void *_b)
    return a->first_use_ip - b->first_use_ip;
 }
 
+static bool
+needs_negate(float reg_val, float imm_val, brw_reg_type type)
+{
+   /* reg_val represents the immediate value in the register in its original
+    * bit-size, while imm_val is always a valid 32-bit float value.
+    */
+   if (type == BRW_REGISTER_TYPE_HF) {
+      uint32_t reg_val_ud = *((uint32_t *) &reg_val);
+      reg_val = _mesa_half_to_float(reg_val_ud & 0xffff);
+   }
+
+   return signbit(imm_val) != signbit(reg_val);
+}
+
 bool
 fs_visitor::opt_combine_constants()
 {
@@ -215,12 +231,20 @@  fs_visitor::opt_combine_constants()
 
       for (int i = 0; i < inst->sources; i++) {
          if (inst->src[i].file != IMM ||
-             inst->src[i].type != BRW_REGISTER_TYPE_F)
+             (inst->src[i].type != BRW_REGISTER_TYPE_F &&
+              inst->src[i].type != BRW_REGISTER_TYPE_HF))
             continue;
 
-         float val = !inst->can_do_source_mods(devinfo) ? inst->src[i].f :
-                     fabs(inst->src[i].f);
-         struct imm *imm = find_imm(&table, val);
+         float val;
+         if (inst->src[i].type == BRW_REGISTER_TYPE_F) {
+            val = !inst->can_do_source_mods(devinfo) ? inst->src[i].f :
+                        fabs(inst->src[i].f);
+         } else {
+            val = !inst->can_do_source_mods(devinfo) ?
+               _mesa_half_to_float(inst->src[i].d & 0xffff) :
+               fabs(_mesa_half_to_float(inst->src[i].d & 0xffff));
+         }
+         struct imm *imm = find_imm(&table, val, inst->src[i].type);
 
          if (imm) {
             bblock_t *intersection = cfg_t::intersect(block, imm->block);
@@ -238,6 +262,7 @@  fs_visitor::opt_combine_constants()
             imm->uses = new(const_ctx) exec_list();
             imm->uses->push_tail(link(const_ctx, &inst->src[i]));
             imm->val = val;
+            imm->type = inst->src[i].type;
             imm->uses_by_coissue = could_coissue(devinfo, inst);
             imm->must_promote = must_promote_imm(devinfo, inst);
             imm->first_use_ip = ip;
@@ -278,12 +303,23 @@  fs_visitor::opt_combine_constants()
                       imm->block->last_non_control_flow_inst()->next);
       const fs_builder ibld = bld.at(imm->block, n).exec_all().group(1, 0);
 
-      ibld.MOV(reg, brw_imm_f(imm->val));
+      reg = retype(reg, imm->type);
+      if (imm->type == BRW_REGISTER_TYPE_F) {
+         ibld.MOV(reg, brw_imm_f(imm->val));
+      } else {
+         const uint16_t val_hf = _mesa_float_to_half(imm->val);
+         ibld.MOV(reg, retype(brw_imm_uw(val_hf), BRW_REGISTER_TYPE_HF));
+      }
       imm->nr = reg.nr;
       imm->subreg_offset = reg.offset;
 
+      /* Keep offsets 32-bit aligned since we are mixing 32-bit and 16-bit
+       * constants into the same register
+       *
+       * TODO: try to pack pairs of HF constants into each 32-bit slot
+       */
       reg.offset += sizeof(float);
-      if (reg.offset == 8 * sizeof(float)) {
+      if (reg.offset == REG_SIZE) {
          reg.nr = alloc.allocate(1);
          reg.offset = 0;
       }
@@ -295,12 +331,14 @@  fs_visitor::opt_combine_constants()
       foreach_list_typed(reg_link, link, link, table.imm[i].uses) {
          fs_reg *reg = link->reg;
          assert((isnan(reg->f) && isnan(table.imm[i].val)) ||
-                fabsf(reg->f) == fabs(table.imm[i].val));
+                fabsf(reg->f) == fabs(table.imm[i].val) ||
+                table.imm[i].type == BRW_REGISTER_TYPE_HF);
 
          reg->file = VGRF;
+         reg->type = table.imm[i].type;
          reg->offset = table.imm[i].subreg_offset;
          reg->stride = 0;
-         reg->negate = signbit(reg->f) != signbit(table.imm[i].val);
+         reg->negate = needs_negate(reg->f, table.imm[i].val, table.imm[i].type);
          reg->nr = table.imm[i].nr;
       }
    }

Comments

On Tue, Jan 15, 2019 at 7:55 AM Iago Toral Quiroga <itoral@igalia.com>
wrote:

> Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
> ---
>  .../compiler/brw_fs_combine_constants.cpp     | 60 +++++++++++++++----
>  1 file changed, 49 insertions(+), 11 deletions(-)
>
> diff --git a/src/intel/compiler/brw_fs_combine_constants.cpp
> b/src/intel/compiler/brw_fs_combine_constants.cpp
> index 7343f77bb45..54017e5668b 100644
> --- a/src/intel/compiler/brw_fs_combine_constants.cpp
> +++ b/src/intel/compiler/brw_fs_combine_constants.cpp
> @@ -36,6 +36,7 @@
>
>  #include "brw_fs.h"
>  #include "brw_cfg.h"
> +#include "util/half_float.h"
>
>  using namespace brw;
>
> @@ -114,8 +115,9 @@ struct imm {
>      */
>     exec_list *uses;
>
> -   /** The immediate value.  We currently only handle floats. */
> +   /** The immediate value.  We currently only handle float and
> half-float. */
>     float val;
> +   brw_reg_type type;
>

I had a brief chat with Matt today and I think that this may be going in
the wrong direction.  In particular, I'd like us to eventually (maybe we
can do it now?) generalize the combine_constants pass to more data types;
in particular, integers.  I recently came across a shader where the fact
that we couldn't do combine_constants on integers was causing significant
register pressure problems and spilling.  (The test was doing a bunch of
BFI2/BFE with constant sources.)  It could also be a huge win for 8-bit and
64-bit where we can't put immediates in regular 2-src instructions.

What does this mean for the pass?  I suspect we want a bit size instead of
a type and a simple char[8] for the data and just make it a blob of bits.
We may also want some sort of heuristic so we don't burn constant table
space for things that are only used once or maybe even twice.

Normally, I would say "do it as a fixup" but if we go the direction of
having a float and using _mesa_half_to_float and _mesa_float_to_half, I
suspect it'll be harder to go for the bag-of-bits approach.

Thoughts?

--Jason


>
>     /**
>      * The GRF register and subregister number where we've decided to
> store the
> @@ -145,10 +147,10 @@ struct table {
>  };
>
>  static struct imm *
> -find_imm(struct table *table, float val)
> +find_imm(struct table *table, float val, brw_reg_type type)
>  {
>     for (int i = 0; i < table->len; i++) {
> -      if (table->imm[i].val == val) {
> +      if (table->imm[i].val == val && table->imm[i].type == type) {
>           return &table->imm[i];
>        }
>     }
> @@ -190,6 +192,20 @@ compare(const void *_a, const void *_b)
>     return a->first_use_ip - b->first_use_ip;
>  }
>
> +static bool
> +needs_negate(float reg_val, float imm_val, brw_reg_type type)
> +{
> +   /* reg_val represents the immediate value in the register in its
> original
> +    * bit-size, while imm_val is always a valid 32-bit float value.
> +    */
> +   if (type == BRW_REGISTER_TYPE_HF) {
> +      uint32_t reg_val_ud = *((uint32_t *) &reg_val);
> +      reg_val = _mesa_half_to_float(reg_val_ud & 0xffff);
> +   }
> +
> +   return signbit(imm_val) != signbit(reg_val);
> +}
> +
>  bool
>  fs_visitor::opt_combine_constants()
>  {
> @@ -215,12 +231,20 @@ fs_visitor::opt_combine_constants()
>
>        for (int i = 0; i < inst->sources; i++) {
>           if (inst->src[i].file != IMM ||
> -             inst->src[i].type != BRW_REGISTER_TYPE_F)
> +             (inst->src[i].type != BRW_REGISTER_TYPE_F &&
> +              inst->src[i].type != BRW_REGISTER_TYPE_HF))
>              continue;
>
> -         float val = !inst->can_do_source_mods(devinfo) ? inst->src[i].f :
> -                     fabs(inst->src[i].f);
> -         struct imm *imm = find_imm(&table, val);
> +         float val;
> +         if (inst->src[i].type == BRW_REGISTER_TYPE_F) {
> +            val = !inst->can_do_source_mods(devinfo) ? inst->src[i].f :
> +                        fabs(inst->src[i].f);
> +         } else {
> +            val = !inst->can_do_source_mods(devinfo) ?
> +               _mesa_half_to_float(inst->src[i].d & 0xffff) :
> +               fabs(_mesa_half_to_float(inst->src[i].d & 0xffff));
> +         }
> +         struct imm *imm = find_imm(&table, val, inst->src[i].type);
>
>           if (imm) {
>              bblock_t *intersection = cfg_t::intersect(block, imm->block);
> @@ -238,6 +262,7 @@ fs_visitor::opt_combine_constants()
>              imm->uses = new(const_ctx) exec_list();
>              imm->uses->push_tail(link(const_ctx, &inst->src[i]));
>              imm->val = val;
> +            imm->type = inst->src[i].type;
>              imm->uses_by_coissue = could_coissue(devinfo, inst);
>              imm->must_promote = must_promote_imm(devinfo, inst);
>              imm->first_use_ip = ip;
> @@ -278,12 +303,23 @@ fs_visitor::opt_combine_constants()
>                        imm->block->last_non_control_flow_inst()->next);
>        const fs_builder ibld = bld.at(imm->block, n).exec_all().group(1,
> 0);
>
> -      ibld.MOV(reg, brw_imm_f(imm->val));
> +      reg = retype(reg, imm->type);
> +      if (imm->type == BRW_REGISTER_TYPE_F) {
> +         ibld.MOV(reg, brw_imm_f(imm->val));
> +      } else {
> +         const uint16_t val_hf = _mesa_float_to_half(imm->val);
> +         ibld.MOV(reg, retype(brw_imm_uw(val_hf), BRW_REGISTER_TYPE_HF));
> +      }
>        imm->nr = reg.nr;
>        imm->subreg_offset = reg.offset;
>
> +      /* Keep offsets 32-bit aligned since we are mixing 32-bit and 16-bit
> +       * constants into the same register
> +       *
> +       * TODO: try to pack pairs of HF constants into each 32-bit slot
> +       */
>        reg.offset += sizeof(float);
> -      if (reg.offset == 8 * sizeof(float)) {
> +      if (reg.offset == REG_SIZE) {
>           reg.nr = alloc.allocate(1);
>           reg.offset = 0;
>        }
> @@ -295,12 +331,14 @@ fs_visitor::opt_combine_constants()
>        foreach_list_typed(reg_link, link, link, table.imm[i].uses) {
>           fs_reg *reg = link->reg;
>           assert((isnan(reg->f) && isnan(table.imm[i].val)) ||
> -                fabsf(reg->f) == fabs(table.imm[i].val));
> +                fabsf(reg->f) == fabs(table.imm[i].val) ||
> +                table.imm[i].type == BRW_REGISTER_TYPE_HF);
>
>           reg->file = VGRF;
> +         reg->type = table.imm[i].type;
>           reg->offset = table.imm[i].subreg_offset;
>           reg->stride = 0;
> -         reg->negate = signbit(reg->f) != signbit(table.imm[i].val);
> +         reg->negate = needs_negate(reg->f, table.imm[i].val,
> table.imm[i].type);
>           reg->nr = table.imm[i].nr;
>        }
>     }
> --
> 2.17.1
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
>
On Thu, 2019-01-17 at 18:18 -0600, Jason Ekstrand wrote:
> On Tue, Jan 15, 2019 at 7:55 AM Iago Toral Quiroga <itoral@igalia.com
> > wrote:
> > Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
> > 
> > ---
> > 
> >  .../compiler/brw_fs_combine_constants.cpp     | 60
> > +++++++++++++++----
> > 
> >  1 file changed, 49 insertions(+), 11 deletions(-)
> > 
> > 
> > 
> > diff --git a/src/intel/compiler/brw_fs_combine_constants.cpp
> > b/src/intel/compiler/brw_fs_combine_constants.cpp
> > 
> > index 7343f77bb45..54017e5668b 100644
> > 
> > --- a/src/intel/compiler/brw_fs_combine_constants.cpp
> > 
> > +++ b/src/intel/compiler/brw_fs_combine_constants.cpp
> > 
> > @@ -36,6 +36,7 @@
> > 
> > 
> > 
> >  #include "brw_fs.h"
> > 
> >  #include "brw_cfg.h"
> > 
> > +#include "util/half_float.h"
> > 
> > 
> > 
> >  using namespace brw;
> > 
> > 
> > 
> > @@ -114,8 +115,9 @@ struct imm {
> > 
> >      */
> > 
> >     exec_list *uses;
> > 
> > 
> > 
> > -   /** The immediate value.  We currently only handle floats. */
> > 
> > +   /** The immediate value.  We currently only handle float and
> > half-float. */
> > 
> >     float val;
> > 
> > +   brw_reg_type type;
> 
> I had a brief chat with Matt today and I think that this may be going
> in the wrong direction.  In particular, I'd like us to eventually
> (maybe we can do it now?) generalize the combine_constants pass to
> more data types; in particular, integers.  I recently came across a
> shader where the fact that we couldn't do combine_constants on
> integers was causing significant register pressure problems and
> spilling.  (The test was doing a bunch of BFI2/BFE with constant
> sources.)  It could also be a huge win for 8-bit and 64-bit where we
> can't put immediates in regular 2-src instructions.
> What does this mean for the pass?  I suspect we want a bit size
> instead of a type and a simple char[8] for the data and just make it
> a blob of bits.  We may also want some sort of heuristic so we don't
> burn constant table space for things that are only used once or maybe
> even twice.
> 
> Normally, I would say "do it as a fixup" but if we go the direction
> of having a float and using _mesa_half_to_float and
> _mesa_float_to_half, I suspect it'll be harder to go for the bag-of-
> bits approach.
> 
> Thoughts?

Fair enough, I see the value in having this support more than just F
and HF. I'll think a bit about it and I'll send a different version
that tries to cover these cases as well.
> --Jason
>  
> > 
> >     /**
> > 
> >      * The GRF register and subregister number where we've decided
> > to store the
> > 
> > @@ -145,10 +147,10 @@ struct table {
> > 
> >  };
> > 
> > 
> > 
> >  static struct imm *
> > 
> > -find_imm(struct table *table, float val)
> > 
> > +find_imm(struct table *table, float val, brw_reg_type type)
> > 
> >  {
> > 
> >     for (int i = 0; i < table->len; i++) {
> > 
> > -      if (table->imm[i].val == val) {
> > 
> > +      if (table->imm[i].val == val && table->imm[i].type == type)
> > {
> > 
> >           return &table->imm[i];
> > 
> >        }
> > 
> >     }
> > 
> > @@ -190,6 +192,20 @@ compare(const void *_a, const void *_b)
> > 
> >     return a->first_use_ip - b->first_use_ip;
> > 
> >  }
> > 
> > 
> > 
> > +static bool
> > 
> > +needs_negate(float reg_val, float imm_val, brw_reg_type type)
> > 
> > +{
> > 
> > +   /* reg_val represents the immediate value in the register in
> > its original
> > 
> > +    * bit-size, while imm_val is always a valid 32-bit float
> > value.
> > 
> > +    */
> > 
> > +   if (type == BRW_REGISTER_TYPE_HF) {
> > 
> > +      uint32_t reg_val_ud = *((uint32_t *) &reg_val);
> > 
> > +      reg_val = _mesa_half_to_float(reg_val_ud & 0xffff);
> > 
> > +   }
> > 
> > +
> > 
> > +   return signbit(imm_val) != signbit(reg_val);
> > 
> > +}
> > 
> > +
> > 
> >  bool
> > 
> >  fs_visitor::opt_combine_constants()
> > 
> >  {
> > 
> > @@ -215,12 +231,20 @@ fs_visitor::opt_combine_constants()
> > 
> > 
> > 
> >        for (int i = 0; i < inst->sources; i++) {
> > 
> >           if (inst->src[i].file != IMM ||
> > 
> > -             inst->src[i].type != BRW_REGISTER_TYPE_F)
> > 
> > +             (inst->src[i].type != BRW_REGISTER_TYPE_F &&
> > 
> > +              inst->src[i].type != BRW_REGISTER_TYPE_HF))
> > 
> >              continue;
> > 
> > 
> > 
> > -         float val = !inst->can_do_source_mods(devinfo) ? inst-
> > >src[i].f :
> > 
> > -                     fabs(inst->src[i].f);
> > 
> > -         struct imm *imm = find_imm(&table, val);
> > 
> > +         float val;
> > 
> > +         if (inst->src[i].type == BRW_REGISTER_TYPE_F) {
> > 
> > +            val = !inst->can_do_source_mods(devinfo) ? inst-
> > >src[i].f :
> > 
> > +                        fabs(inst->src[i].f);
> > 
> > +         } else {
> > 
> > +            val = !inst->can_do_source_mods(devinfo) ?
> > 
> > +               _mesa_half_to_float(inst->src[i].d & 0xffff) :
> > 
> > +               fabs(_mesa_half_to_float(inst->src[i].d & 0xffff));
> > 
> > +         }
> > 
> > +         struct imm *imm = find_imm(&table, val, inst-
> > >src[i].type);
> > 
> > 
> > 
> >           if (imm) {
> > 
> >              bblock_t *intersection = cfg_t::intersect(block, imm-
> > >block);
> > 
> > @@ -238,6 +262,7 @@ fs_visitor::opt_combine_constants()
> > 
> >              imm->uses = new(const_ctx) exec_list();
> > 
> >              imm->uses->push_tail(link(const_ctx, &inst->src[i]));
> > 
> >              imm->val = val;
> > 
> > +            imm->type = inst->src[i].type;
> > 
> >              imm->uses_by_coissue = could_coissue(devinfo, inst);
> > 
> >              imm->must_promote = must_promote_imm(devinfo, inst);
> > 
> >              imm->first_use_ip = ip;
> > 
> > @@ -278,12 +303,23 @@ fs_visitor::opt_combine_constants()
> > 
> >                        imm->block->last_non_control_flow_inst()-
> > >next);
> > 
> >        const fs_builder ibld = bld.at(imm->block,
> > n).exec_all().group(1, 0);
> > 
> > 
> > 
> > -      ibld.MOV(reg, brw_imm_f(imm->val));
> > 
> > +      reg = retype(reg, imm->type);
> > 
> > +      if (imm->type == BRW_REGISTER_TYPE_F) {
> > 
> > +         ibld.MOV(reg, brw_imm_f(imm->val));
> > 
> > +      } else {
> > 
> > +         const uint16_t val_hf = _mesa_float_to_half(imm->val);
> > 
> > +         ibld.MOV(reg, retype(brw_imm_uw(val_hf),
> > BRW_REGISTER_TYPE_HF));
> > 
> > +      }
> > 
> >        imm->nr = reg.nr;
> > 
> >        imm->subreg_offset = reg.offset;
> > 
> > 
> > 
> > +      /* Keep offsets 32-bit aligned since we are mixing 32-bit
> > and 16-bit
> > 
> > +       * constants into the same register
> > 
> > +       *
> > 
> > +       * TODO: try to pack pairs of HF constants into each 32-bit
> > slot
> > 
> > +       */
> > 
> >        reg.offset += sizeof(float);
> > 
> > -      if (reg.offset == 8 * sizeof(float)) {
> > 
> > +      if (reg.offset == REG_SIZE) {
> > 
> >           reg.nr = alloc.allocate(1);
> > 
> >           reg.offset = 0;
> > 
> >        }
> > 
> > @@ -295,12 +331,14 @@ fs_visitor::opt_combine_constants()
> > 
> >        foreach_list_typed(reg_link, link, link, table.imm[i].uses)
> > {
> > 
> >           fs_reg *reg = link->reg;
> > 
> >           assert((isnan(reg->f) && isnan(table.imm[i].val)) ||
> > 
> > -                fabsf(reg->f) == fabs(table.imm[i].val));
> > 
> > +                fabsf(reg->f) == fabs(table.imm[i].val) ||
> > 
> > +                table.imm[i].type == BRW_REGISTER_TYPE_HF);
> > 
> > 
> > 
> >           reg->file = VGRF;
> > 
> > +         reg->type = table.imm[i].type;
> > 
> >           reg->offset = table.imm[i].subreg_offset;
> > 
> >           reg->stride = 0;
> > 
> > -         reg->negate = signbit(reg->f) !=
> > signbit(table.imm[i].val);
> > 
> > +         reg->negate = needs_negate(reg->f, table.imm[i].val,
> > table.imm[i].type);
> > 
> >           reg->nr = table.imm[i].nr;
> > 
> >        }
> > 
> >     }
> >
On Thu, 2019-01-17 at 18:18 -0600, Jason Ekstrand wrote:
> On Tue, Jan 15, 2019 at 7:55 AM Iago Toral Quiroga <itoral@igalia.com
> > wrote:
> > Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
> > 
> > ---
> > 
> >  .../compiler/brw_fs_combine_constants.cpp     | 60
> > +++++++++++++++----
> > 
> >  1 file changed, 49 insertions(+), 11 deletions(-)
> > 
> > 
> > 
> > diff --git a/src/intel/compiler/brw_fs_combine_constants.cpp
> > b/src/intel/compiler/brw_fs_combine_constants.cpp
> > 
> > index 7343f77bb45..54017e5668b 100644
> > 
> > --- a/src/intel/compiler/brw_fs_combine_constants.cpp
> > 
> > +++ b/src/intel/compiler/brw_fs_combine_constants.cpp
> > 
> > @@ -36,6 +36,7 @@
> > 
> > 
> > 
> >  #include "brw_fs.h"
> > 
> >  #include "brw_cfg.h"
> > 
> > +#include "util/half_float.h"
> > 
> > 
> > 
> >  using namespace brw;
> > 
> > 
> > 
> > @@ -114,8 +115,9 @@ struct imm {
> > 
> >      */
> > 
> >     exec_list *uses;
> > 
> > 
> > 
> > -   /** The immediate value.  We currently only handle floats. */
> > 
> > +   /** The immediate value.  We currently only handle float and
> > half-float. */
> > 
> >     float val;
> > 
> > +   brw_reg_type type;
> 
> I had a brief chat with Matt today and I think that this may be going
> in the wrong direction.  In particular, I'd like us to eventually
> (maybe we can do it now?) generalize the combine_constants pass to
> more data types; in particular, integers.  I recently came across a
> shader where the fact that we couldn't do combine_constants on
> integers was causing significant register pressure problems and
> spilling.  (The test was doing a bunch of BFI2/BFE with constant
> sources.)  It could also be a huge win for 8-bit and 64-bit where we
> can't put immediates in regular 2-src instructions.
> What does this mean for the pass?  I suspect we want a bit size
> instead of a type and a simple char[8] for the data and just make it
> a blob of bits.  We may also want some sort of heuristic so we don't
> burn constant table space for things that are only used once or maybe
> even twice.

I started working on it yesterday and have a skeleton that is currently
capable of handling F, HF, D, UD, W, UW immediates. Seems to be working
going by a bunch of selected tests from CTS I have around but I'd like
to discuss a few things before I move forward:

The combine constants pass is very specific about the instructions it
acts on. Currently it only acts in two scenarios:
  1. When we want to take advantage of co-issuing (this is gen7 only
apparently).
  2. When we need to fix instructions that can't take an immediate
argument (such as MAD or LRP)

In order to fix the issue you mention above with BFI2/BFE, we only need
to let constant propagation happen on these instructions and the add
them to the list of opcodes under must_promote() like we do for LRP and
MAD (plus add the code to support combinationof D/UD immediates of
course).  That's what I am doing right now.  The moment we decide to
let constant propagation happen for these instructions then we must
promote the immediates to GRF by the time we run the combine pass, so
heuristics won't play any role in this, as they don't for MAD or LRP.
However, you mentioned heuristics specifically, so that makes me wonder
if your idea was to have the pass promote immediates for any
instruction in the program (that support immediates, since that would
be the only case in which we're free ot make a choice), but I don't
think there is any benefit in doing that, right?

Also, I think we still need a type instead of a size, I see a few
reasons for this:
 - The pass stores the absolute value of the constants it finds and
then relies on the signbit function (which only works for 32-bit and
64-bit floats) to decide whether it should negate the source whe it
rewrites it. This allows us to promote an immediate and its negated
version at the same time. As we start to support more types, including
unsigned integer types, I think we need to know the ty¡pe to do the
right thing. I think that we should at least know if the type is
unsigned.
 - There is an assertion in the pass that is floating-point specific:
before it rewrites an immediate source to read from the promoted
constant, it checks that the absolute value of the original immediate
source matches what we stored in the table, or that both are NaN. This
assertion doesn't make sense as is for integers, so we need to either
know the type to have a type-specific assertion or drop the assertion.
 - Some restrictions take place only on specific types. For example,
there is a restriction on BDW that affects ALIGN16 with half-float, and
we need to account for that in the pass when we promote the constant,
but there is no such restriction for W or UW.

Thoughts?

Iago

> Normally, I would say "do it as a fixup" but if we go the direction
> of having a float and using _mesa_half_to_float and
> _mesa_float_to_half, I suspect it'll be harder to go for the bag-of-
> bits approach.
> 
> Thoughts?
> --Jason
>  
> > 
> >     /**
> > 
> >      * The GRF register and subregister number where we've decided
> > to store the
> > 
> > @@ -145,10 +147,10 @@ struct table {
> > 
> >  };
> > 
> > 
> > 
> >  static struct imm *
> > 
> > -find_imm(struct table *table, float val)
> > 
> > +find_imm(struct table *table, float val, brw_reg_type type)
> > 
> >  {
> > 
> >     for (int i = 0; i < table->len; i++) {
> > 
> > -      if (table->imm[i].val == val) {
> > 
> > +      if (table->imm[i].val == val && table->imm[i].type == type)
> > {
> > 
> >           return &table->imm[i];
> > 
> >        }
> > 
> >     }
> > 
> > @@ -190,6 +192,20 @@ compare(const void *_a, const void *_b)
> > 
> >     return a->first_use_ip - b->first_use_ip;
> > 
> >  }
> > 
> > 
> > 
> > +static bool
> > 
> > +needs_negate(float reg_val, float imm_val, brw_reg_type type)
> > 
> > +{
> > 
> > +   /* reg_val represents the immediate value in the register in
> > its original
> > 
> > +    * bit-size, while imm_val is always a valid 32-bit float
> > value.
> > 
> > +    */
> > 
> > +   if (type == BRW_REGISTER_TYPE_HF) {
> > 
> > +      uint32_t reg_val_ud = *((uint32_t *) &reg_val);
> > 
> > +      reg_val = _mesa_half_to_float(reg_val_ud & 0xffff);
> > 
> > +   }
> > 
> > +
> > 
> > +   return signbit(imm_val) != signbit(reg_val);
> > 
> > +}
> > 
> > +
> > 
> >  bool
> > 
> >  fs_visitor::opt_combine_constants()
> > 
> >  {
> > 
> > @@ -215,12 +231,20 @@ fs_visitor::opt_combine_constants()
> > 
> > 
> > 
> >        for (int i = 0; i < inst->sources; i++) {
> > 
> >           if (inst->src[i].file != IMM ||
> > 
> > -             inst->src[i].type != BRW_REGISTER_TYPE_F)
> > 
> > +             (inst->src[i].type != BRW_REGISTER_TYPE_F &&
> > 
> > +              inst->src[i].type != BRW_REGISTER_TYPE_HF))
> > 
> >              continue;
> > 
> > 
> > 
> > -         float val = !inst->can_do_source_mods(devinfo) ? inst-
> > >src[i].f :
> > 
> > -                     fabs(inst->src[i].f);
> > 
> > -         struct imm *imm = find_imm(&table, val);
> > 
> > +         float val;
> > 
> > +         if (inst->src[i].type == BRW_REGISTER_TYPE_F) {
> > 
> > +            val = !inst->can_do_source_mods(devinfo) ? inst-
> > >src[i].f :
> > 
> > +                        fabs(inst->src[i].f);
> > 
> > +         } else {
> > 
> > +            val = !inst->can_do_source_mods(devinfo) ?
> > 
> > +               _mesa_half_to_float(inst->src[i].d & 0xffff) :
> > 
> > +               fabs(_mesa_half_to_float(inst->src[i].d & 0xffff));
> > 
> > +         }
> > 
> > +         struct imm *imm = find_imm(&table, val, inst-
> > >src[i].type);
> > 
> > 
> > 
> >           if (imm) {
> > 
> >              bblock_t *intersection = cfg_t::intersect(block, imm-
> > >block);
> > 
> > @@ -238,6 +262,7 @@ fs_visitor::opt_combine_constants()
> > 
> >              imm->uses = new(const_ctx) exec_list();
> > 
> >              imm->uses->push_tail(link(const_ctx, &inst->src[i]));
> > 
> >              imm->val = val;
> > 
> > +            imm->type = inst->src[i].type;
> > 
> >              imm->uses_by_coissue = could_coissue(devinfo, inst);
> > 
> >              imm->must_promote = must_promote_imm(devinfo, inst);
> > 
> >              imm->first_use_ip = ip;
> > 
> > @@ -278,12 +303,23 @@ fs_visitor::opt_combine_constants()
> > 
> >                        imm->block->last_non_control_flow_inst()-
> > >next);
> > 
> >        const fs_builder ibld = bld.at(imm->block,
> > n).exec_all().group(1, 0);
> > 
> > 
> > 
> > -      ibld.MOV(reg, brw_imm_f(imm->val));
> > 
> > +      reg = retype(reg, imm->type);
> > 
> > +      if (imm->type == BRW_REGISTER_TYPE_F) {
> > 
> > +         ibld.MOV(reg, brw_imm_f(imm->val));
> > 
> > +      } else {
> > 
> > +         const uint16_t val_hf = _mesa_float_to_half(imm->val);
> > 
> > +         ibld.MOV(reg, retype(brw_imm_uw(val_hf),
> > BRW_REGISTER_TYPE_HF));
> > 
> > +      }
> > 
> >        imm->nr = reg.nr;
> > 
> >        imm->subreg_offset = reg.offset;
> > 
> > 
> > 
> > +      /* Keep offsets 32-bit aligned since we are mixing 32-bit
> > and 16-bit
> > 
> > +       * constants into the same register
> > 
> > +       *
> > 
> > +       * TODO: try to pack pairs of HF constants into each 32-bit
> > slot
> > 
> > +       */
> > 
> >        reg.offset += sizeof(float);
> > 
> > -      if (reg.offset == 8 * sizeof(float)) {
> > 
> > +      if (reg.offset == REG_SIZE) {
> > 
> >           reg.nr = alloc.allocate(1);
> > 
> >           reg.offset = 0;
> > 
> >        }
> > 
> > @@ -295,12 +331,14 @@ fs_visitor::opt_combine_constants()
> > 
> >        foreach_list_typed(reg_link, link, link, table.imm[i].uses)
> > {
> > 
> >           fs_reg *reg = link->reg;
> > 
> >           assert((isnan(reg->f) && isnan(table.imm[i].val)) ||
> > 
> > -                fabsf(reg->f) == fabs(table.imm[i].val));
> > 
> > +                fabsf(reg->f) == fabs(table.imm[i].val) ||
> > 
> > +                table.imm[i].type == BRW_REGISTER_TYPE_HF);
> > 
> > 
> > 
> >           reg->file = VGRF;
> > 
> > +         reg->type = table.imm[i].type;
> > 
> >           reg->offset = table.imm[i].subreg_offset;
> > 
> >           reg->stride = 0;
> > 
> > -         reg->negate = signbit(reg->f) !=
> > signbit(table.imm[i].val);
> > 
> > +         reg->negate = needs_negate(reg->f, table.imm[i].val,
> > table.imm[i].type);
> > 
> >           reg->nr = table.imm[i].nr;
> > 
> >        }
> > 
> >     }
> >
On January 29, 2019 05:27:50 Iago Toral <itoral@igalia.com> wrote:
> On Thu, 2019-01-17 at 18:18 -0600, Jason Ekstrand wrote:
>> On Tue, Jan 15, 2019 at 7:55 AM Iago Toral Quiroga <itoral@igalia.com> wrote:
>>> Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
>>> ---
>>> .../compiler/brw_fs_combine_constants.cpp     | 60 +++++++++++++++----
>>> 1 file changed, 49 insertions(+), 11 deletions(-)
>>>
>>> diff --git a/src/intel/compiler/brw_fs_combine_constants.cpp 
>>> b/src/intel/compiler/brw_fs_combine_constants.cpp
>>> index 7343f77bb45..54017e5668b 100644
>>> --- a/src/intel/compiler/brw_fs_combine_constants.cpp
>>> +++ b/src/intel/compiler/brw_fs_combine_constants.cpp
>>> @@ -36,6 +36,7 @@
>>>
>>> #include "brw_fs.h"
>>> #include "brw_cfg.h"
>>> +#include "util/half_float.h"
>>>
>>> using namespace brw;
>>>
>>> @@ -114,8 +115,9 @@ struct imm {
>>>     */
>>>    exec_list *uses;
>>>
>>> -   /** The immediate value.  We currently only handle floats. */
>>> +   /** The immediate value.  We currently only handle float and half-float. */
>>>    float val;
>>> +   brw_reg_type type;
>>
>> I had a brief chat with Matt today and I think that this may be going in 
>> the wrong direction.  In particular, I'd like us to eventually (maybe we 
>> can do it now?) generalize the combine_constants pass to more data types; 
>> in particular, integers.  I recently came across a shader where the fact 
>> that we couldn't do combine_constants on integers was causing significant 
>> register pressure problems and spilling.  (The test was doing a bunch of 
>> BFI2/BFE with constant sources.)  It could also be a huge win for 8-bit and 
>> 64-bit where we can't put immediates in regular 2-src instructions.
>> What does this mean for the pass?  I suspect we want a bit size instead of 
>> a type and a simple char[8] for the data and just make it a blob of bits.  
>> We may also want some sort of heuristic so we don't burn constant table 
>> space for things that are only used once or maybe even twice.
>
> I started working on it yesterday and have a skeleton that is currently 
> capable of handling F, HF, D, UD, W, UW immediates. Seems to be working 
> going by a bunch of selected tests from CTS I have around but I'd like to 
> discuss a few things before I move forward:
>
> The combine constants pass is very specific about the instructions it acts 
> on. Currently it only acts in two scenarios:
> 1. When we want to take advantage of co-issuing (this is gen7 only apparently).
> 2. When we need to fix instructions that can't take an immediate argument 
> (such as MAD or LRP)
>
> In order to fix the issue you mention above with BFI2/BFE, we only need to 
> let constant propagation happen on these instructions and the add them to 
> the list of opcodes under must_promote() like we do for LRP and MAD (plus 
> add the code to support combinationof D/UD immediates of course). That's 
> what I am doing right now. The moment we decide to let constant propagation 
> happen for these instructions then we must promote the immediates to GRF by 
> the time we run the combine pass, so heuristics won't play any role in 
> this, as they don't for MAD or LRP. However, you mentioned heuristics 
> specifically, so that makes me wonder if your idea was to have the pass 
> promote immediates for any instruction in the program (that support 
> immediates, since that would be the only case in which we're free ot make a 
> choice), but I don't think there is any benefit in doing that, right?

I see no reason why we shouldn't do constant promotion for all 3src 
instructions.  We could also do it for 2src where the constant is required 
to be in the source that doesn't such as 1 << x.  Of course, we'll have to 
run shader-db on whatever we do to see the effect.

> Also, I think we still need a type instead of a size, I see a few reasons 
> for this:
> - The pass stores the absolute value of the constants it finds and then 
> relies on the signbit function (which only works for 32-bit and 64-bit 
> floats) to decide whether it should negate the source whe it rewrites it. 
> This allows us to promote an immediate and its negated version at the same 
> time. As we start to support more types, including unsigned integer types, 
> I think we need to know the ty¡pe to do the right thing. I think that we 
> should at least know if the type is unsigned.

Really, you only need to consider the sign relative to the type being used 
to read the constant.  If someone actually wanted the integer 0x3f800000, 
the same constant could be used to store that, 1.0f, and -1.0f.

> - There is an assertion in the pass that is floating-point specific: before 
> it rewrites an immediate source to read from the promoted constant, it 
> checks that the absolute value of the original immediate source matches 
> what we stored in the table, or that both are NaN. This assertion doesn't 
> make sense as is for integers, so we need to either know the type to have a 
> type-specific assertion or drop the assertion.

This sounds like it's just a sanity check for "in this constant the thing I 
stored the first time through the pass" type of thing (I'm not looking at 
the code right now) to make sure it didn't messes up its internal 
bookkeeping. If that's the case, it really just needs to be updated.

> - Some restrictions take place only on specific types. For example, there 
> is a restriction on BDW that affects ALIGN16 with half-float, and we need 
> to account for that in the pass when we promote the constant, but there is 
> no such restriction for W or UW.

Uh... That's annoying.  My initial reaction is to apply the restriction to 
all 16-bit things and hope it doesn't hurt too much. We could also add a 
bit to the constant table entry that says "I need the half-float 
restriction" and set it whenever an entry is used as HF.

--Jason

> Thoughts?
>
> Iago
>
>> Normally, I would say "do it as a fixup" but if we go the direction of 
>> having a float and using _mesa_half_to_float and _mesa_float_to_half, I 
>> suspect it'll be harder to go for the bag-of-bits approach.
>>
>> Thoughts?
>>
>> --Jason
>>
>>>
>>>    /**
>>>     * The GRF register and subregister number where we've decided to store the
>>> @@ -145,10 +147,10 @@ struct table {
>>> };
>>>
>>> static struct imm *
>>> -find_imm(struct table *table, float val)
>>> +find_imm(struct table *table, float val, brw_reg_type type)
>>> {
>>>    for (int i = 0; i < table->len; i++) {
>>> -      if (table->imm[i].val == val) {
>>> +      if (table->imm[i].val == val && table->imm[i].type == type) {
>>>          return &table->imm[i];
>>>       }
>>>    }
>>> @@ -190,6 +192,20 @@ compare(const void *_a, const void *_b)
>>>    return a->first_use_ip - b->first_use_ip;
>>> }
>>>
>>> +static bool
>>> +needs_negate(float reg_val, float imm_val, brw_reg_type type)
>>> +{
>>> +   /* reg_val represents the immediate value in the register in its original
>>> +    * bit-size, while imm_val is always a valid 32-bit float value.
>>> +    */
>>> +   if (type == BRW_REGISTER_TYPE_HF) {
>>> +      uint32_t reg_val_ud = *((uint32_t *) &reg_val);
>>> +      reg_val = _mesa_half_to_float(reg_val_ud & 0xffff);
>>> +   }
>>> +
>>> +   return signbit(imm_val) != signbit(reg_val);
>>> +}
>>> +
>>> bool
>>> fs_visitor::opt_combine_constants()
>>> {
>>> @@ -215,12 +231,20 @@ fs_visitor::opt_combine_constants()
>>>
>>>       for (int i = 0; i < inst->sources; i++) {
>>>          if (inst->src[i].file != IMM ||
>>> -             inst->src[i].type != BRW_REGISTER_TYPE_F)
>>> +             (inst->src[i].type != BRW_REGISTER_TYPE_F &&
>>> +              inst->src[i].type != BRW_REGISTER_TYPE_HF))
>>>             continue;
>>>
>>> -         float val = !inst->can_do_source_mods(devinfo) ? inst->src[i].f :
>>> -                     fabs(inst->src[i].f);
>>> -         struct imm *imm = find_imm(&table, val);
>>> +         float val;
>>> +         if (inst->src[i].type == BRW_REGISTER_TYPE_F) {
>>> +            val = !inst->can_do_source_mods(devinfo) ? inst->src[i].f :
>>> +                        fabs(inst->src[i].f);
>>> +         } else {
>>> +            val = !inst->can_do_source_mods(devinfo) ?
>>> +               _mesa_half_to_float(inst->src[i].d & 0xffff) :
>>> +               fabs(_mesa_half_to_float(inst->src[i].d & 0xffff));
>>> +         }
>>> +         struct imm *imm = find_imm(&table, val, inst->src[i].type);
>>>
>>>          if (imm) {
>>>             bblock_t *intersection = cfg_t::intersect(block, imm->block);
>>> @@ -238,6 +262,7 @@ fs_visitor::opt_combine_constants()
>>>             imm->uses = new(const_ctx) exec_list();
>>>             imm->uses->push_tail(link(const_ctx, &inst->src[i]));
>>>             imm->val = val;
>>> +            imm->type = inst->src[i].type;
>>>             imm->uses_by_coissue = could_coissue(devinfo, inst);
>>>             imm->must_promote = must_promote_imm(devinfo, inst);
>>>             imm->first_use_ip = ip;
>>> @@ -278,12 +303,23 @@ fs_visitor::opt_combine_constants()
>>>                       imm->block->last_non_control_flow_inst()->next);
>>>       const fs_builder ibld = bld.at(imm->block, n).exec_all().group(1, 0);
>>>
>>> -      ibld.MOV(reg, brw_imm_f(imm->val));
>>> +      reg = retype(reg, imm->type);
>>> +      if (imm->type == BRW_REGISTER_TYPE_F) {
>>> +         ibld.MOV(reg, brw_imm_f(imm->val));
>>> +      } else {
>>> +         const uint16_t val_hf = _mesa_float_to_half(imm->val);
>>> +         ibld.MOV(reg, retype(brw_imm_uw(val_hf), BRW_REGISTER_TYPE_HF));
>>> +      }
>>>       imm->nr = reg.nr;
>>>       imm->subreg_offset = reg.offset;
>>>
>>> +      /* Keep offsets 32-bit aligned since we are mixing 32-bit and 16-bit
>>> +       * constants into the same register
>>> +       *
>>> +       * TODO: try to pack pairs of HF constants into each 32-bit slot
>>> +       */
>>>       reg.offset += sizeof(float);
>>> -      if (reg.offset == 8 * sizeof(float)) {
>>> +      if (reg.offset == REG_SIZE) {
>>>          reg.nr = alloc.allocate(1);
>>>          reg.offset = 0;
>>>       }
>>> @@ -295,12 +331,14 @@ fs_visitor::opt_combine_constants()
>>>       foreach_list_typed(reg_link, link, link, table.imm[i].uses) {
>>>          fs_reg *reg = link->reg;
>>>          assert((isnan(reg->f) && isnan(table.imm[i].val)) ||
>>> -                fabsf(reg->f) == fabs(table.imm[i].val));
>>> +                fabsf(reg->f) == fabs(table.imm[i].val) ||
>>> +                table.imm[i].type == BRW_REGISTER_TYPE_HF);
>>>
>>>          reg->file = VGRF;
>>> +         reg->type = table.imm[i].type;
>>>          reg->offset = table.imm[i].subreg_offset;
>>>          reg->stride = 0;
>>> -         reg->negate = signbit(reg->f) != signbit(table.imm[i].val);
>>> +         reg->negate = needs_negate(reg->f, table.imm[i].val, 
>>> table.imm[i].type);
>>>          reg->nr = table.imm[i].nr;
>>>       }
>>>    }
On Tue, 2019-01-29 at 07:20 -0800, Jason Ekstrand wrote:
> 
> 
> 
> 
> 
> On January 29, 2019 05:27:50 Iago Toral <itoral@igalia.com> wrote:
> > On Thu, 2019-01-17 at 18:18 -0600, Jason Ekstrand wrote:
> > > On Tue, Jan 15, 2019 at 7:55 AM Iago Toral Quiroga <
> > > itoral@igalia.com> wrote:
> > > > Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
> > > > 
> > > > ---
> > > > 
> > > >  .../compiler/brw_fs_combine_constants.cpp     | 60
> > > > +++++++++++++++----
> > > > 
> > > >  1 file changed, 49 insertions(+), 11 deletions(-)
> > > > 
> > > > 
> > > > 
> > > > diff --git a/src/intel/compiler/brw_fs_combine_constants.cpp
> > > > b/src/intel/compiler/brw_fs_combine_constants.cpp
> > > > 
> > > > index 7343f77bb45..54017e5668b 100644
> > > > 
> > > > --- a/src/intel/compiler/brw_fs_combine_constants.cpp
> > > > 
> > > > +++ b/src/intel/compiler/brw_fs_combine_constants.cpp
> > > > 
> > > > @@ -36,6 +36,7 @@
> > > > 
> > > > 
> > > > 
> > > >  #include "brw_fs.h"
> > > > 
> > > >  #include "brw_cfg.h"
> > > > 
> > > > +#include "util/half_float.h"
> > > > 
> > > > 
> > > > 
> > > >  using namespace brw;
> > > > 
> > > > 
> > > > 
> > > > @@ -114,8 +115,9 @@ struct imm {
> > > > 
> > > >      */
> > > > 
> > > >     exec_list *uses;
> > > > 
> > > > 
> > > > 
> > > > -   /** The immediate value.  We currently only handle floats.
> > > > */
> > > > 
> > > > +   /** The immediate value.  We currently only handle float
> > > > and half-float. */
> > > > 
> > > >     float val;
> > > > 
> > > > +   brw_reg_type type;
> > > 
> > > I had a brief chat with Matt today and I think that this may be
> > > going in the wrong direction.  In particular, I'd like us to
> > > eventually (maybe we can do it now?) generalize the
> > > combine_constants pass to more data types; in particular,
> > > integers.  I recently came across a shader where the fact that we
> > > couldn't do combine_constants on integers was causing significant
> > > register pressure problems and spilling.  (The test was doing a
> > > bunch of BFI2/BFE with constant sources.)  It could also be a
> > > huge win for 8-bit and 64-bit where we can't put immediates in
> > > regular 2-src instructions.
> > > What does this mean for the pass?  I suspect we want a bit size
> > > instead of a type and a simple char[8] for the data and just make
> > > it a blob of bits.  We may also want some sort of heuristic so we
> > > don't burn constant table space for things that are only used
> > > once or maybe even twice.
> > 
> > I started working on it yesterday and have a skeleton that is
> > currently capable of handling F, HF, D, UD, W, UW immediates. Seems
> > to be working going by a bunch of selected tests from CTS I have
> > around but I'd like to discuss a few things before I move forward:
> > 
> > The combine constants pass is very specific about the instructions
> > it acts on. Currently it only acts in two scenarios:
> >   1. When we want to take advantage of co-issuing (this is gen7
> > only apparently).
> >   2. When we need to fix instructions that can't take an immediate
> > argument (such as MAD or LRP)
> > 
> > In order to fix the issue you mention above with BFI2/BFE, we only
> > need to let constant propagation happen on these instructions and
> > the add them to the list of opcodes under must_promote() like we do
> > for LRP and MAD (plus add the code to support combinationof D/UD
> > immediates of course).  That's what I am doing right now.  The
> > moment we decide to let constant propagation happen for these
> > instructions then we must promote the immediates to GRF by the time
> > we run the combine pass, so heuristics won't play any role in this,
> > as they don't for MAD or LRP. However, you mentioned heuristics
> > specifically, so that makes me wonder if your idea was to have the
> > pass promote immediates for any instruction in the program (that
> > support immediates, since that would be the only case in which
> > we're free ot make a choice), but I don't think there is any
> > benefit in doing that, right?
> 
> I see no reason why we shouldn't do constant promotion for all 3src
> instructions.  We could also do it for 2src where the constant is
> required to be in the source that doesn't such as 1 << x.  Of course,
> we'll have to run shader-db on whatever we do to see the effect.

Sure, my point was that in all these cases there would be no heuristics
involved, since we would be doing this only for cases where the
instruction can't use immediates at all.
> > Also, I think we still need a type instead of a size, I see a few
> > reasons for this:
> >  - The pass stores the absolute value of the constants it finds and
> > then relies on the signbit function (which only works for 32-bit
> > and 64-bit floats) to decide whether it should negate the source
> > whe it rewrites it. This allows us to promote an immediate and its
> > negated version at the same time. As we start to support more
> > types, including unsigned integer types, I think we need to know
> > the ty¡pe to do the right thing. I think that we should at least
> > know if the type is unsigned.
> 
> Really, you only need to consider the sign relative to the type being
> used to read the constant.  If someone actually wanted the integer
> 0x3f800000, the same constant could be used to store that, 1.0f, and
> -1.0f.

True.
> >  - There is an assertion in the pass that is floating-point
> > specific: before it rewrites an immediate source to read from the
> > promoted constant, it checks that the absolute value of the
> > original immediate source matches what we stored in the table, or
> > that both are NaN. This assertion doesn't make sense as is for
> > integers, so we need to either know the type to have a type-
> > specific assertion or drop the assertion.
> 
> This sounds like it's just a sanity check for "in this constant the
> thing I stored the first time through the pass" type of thing (I'm
> not looking at the code right now) to make sure it didn't messes up
> its internal bookkeeping. If that's the case, it really just needs to
> be updated.

Yes, it just goes though all uses of a particular immediate that we
stored in the table and for each instruction that sourced from a
particular immediate it verifies that the constant matches what is
stored in the register, with the caveat that what we store in the table
is always the absiolute value, so you need to know the type to make
that check. We can take the type from the register that was reading the
immediate though so that is probably fine.
> >  - Some restrictions take place only on specific types. For
> > example, there is a restriction on BDW that affects ALIGN16 with
> > half-float, and we need to account for that in the pass when we
> > promote the constant, but there is no such restriction for W or UW.
> 
> Uh... That's annoying.  My initial reaction is to apply the
> restriction to all 16-bit things and hope it doesn't hurt too much.
> We could also add a bit to the constant table entry that says "I need
> the half-float restriction" and set it whenever an entry is used as
> HF.

I imagine we can find more restrictions like this when we expand the
pass to cover other types such as DF or B that also tend to have more
restrictions so I am not sure what is the best option. I think I'll try
to add that bit and see what happens when we add the remaining constant
types.
Thanks for the quick feedback!
> --Jason
> > Thoughts?
> > 
> > Iago
> > 
> > > Normally, I would say "do it as a fixup" but if we go the
> > > direction of having a float and using _mesa_half_to_float and
> > > _mesa_float_to_half, I suspect it'll be harder to go for the bag-
> > > of-bits approach.
> > > 
> > > Thoughts?
> > > --Jason
> > >  
> > > > 
> > > >     /**
> > > > 
> > > >      * The GRF register and subregister number where we've
> > > > decided to store the
> > > > 
> > > > @@ -145,10 +147,10 @@ struct table {
> > > > 
> > > >  };
> > > > 
> > > > 
> > > > 
> > > >  static struct imm *
> > > > 
> > > > -find_imm(struct table *table, float val)
> > > > 
> > > > +find_imm(struct table *table, float val, brw_reg_type type)
> > > > 
> > > >  {
> > > > 
> > > >     for (int i = 0; i < table->len; i++) {
> > > > 
> > > > -      if (table->imm[i].val == val) {
> > > > 
> > > > +      if (table->imm[i].val == val && table->imm[i].type ==
> > > > type) {
> > > > 
> > > >           return &table->imm[i];
> > > > 
> > > >        }
> > > > 
> > > >     }
> > > > 
> > > > @@ -190,6 +192,20 @@ compare(const void *_a, const void *_b)
> > > > 
> > > >     return a->first_use_ip - b->first_use_ip;
> > > > 
> > > >  }
> > > > 
> > > > 
> > > > 
> > > > +static bool
> > > > 
> > > > +needs_negate(float reg_val, float imm_val, brw_reg_type type)
> > > > 
> > > > +{
> > > > 
> > > > +   /* reg_val represents the immediate value in the register
> > > > in its original
> > > > 
> > > > +    * bit-size, while imm_val is always a valid 32-bit float
> > > > value.
> > > > 
> > > > +    */
> > > > 
> > > > +   if (type == BRW_REGISTER_TYPE_HF) {
> > > > 
> > > > +      uint32_t reg_val_ud = *((uint32_t *) &reg_val);
> > > > 
> > > > +      reg_val = _mesa_half_to_float(reg_val_ud & 0xffff);
> > > > 
> > > > +   }
> > > > 
> > > > +
> > > > 
> > > > +   return signbit(imm_val) != signbit(reg_val);
> > > > 
> > > > +}
> > > > 
> > > > +
> > > > 
> > > >  bool
> > > > 
> > > >  fs_visitor::opt_combine_constants()
> > > > 
> > > >  {
> > > > 
> > > > @@ -215,12 +231,20 @@ fs_visitor::opt_combine_constants()
> > > > 
> > > > 
> > > > 
> > > >        for (int i = 0; i < inst->sources; i++) {
> > > > 
> > > >           if (inst->src[i].file != IMM ||
> > > > 
> > > > -             inst->src[i].type != BRW_REGISTER_TYPE_F)
> > > > 
> > > > +             (inst->src[i].type != BRW_REGISTER_TYPE_F &&
> > > > 
> > > > +              inst->src[i].type != BRW_REGISTER_TYPE_HF))
> > > > 
> > > >              continue;
> > > > 
> > > > 
> > > > 
> > > > -         float val = !inst->can_do_source_mods(devinfo) ?
> > > > inst->src[i].f :
> > > > 
> > > > -                     fabs(inst->src[i].f);
> > > > 
> > > > -         struct imm *imm = find_imm(&table, val);
> > > > 
> > > > +         float val;
> > > > 
> > > > +         if (inst->src[i].type == BRW_REGISTER_TYPE_F) {
> > > > 
> > > > +            val = !inst->can_do_source_mods(devinfo) ? inst-
> > > > >src[i].f :
> > > > 
> > > > +                        fabs(inst->src[i].f);
> > > > 
> > > > +         } else {
> > > > 
> > > > +            val = !inst->can_do_source_mods(devinfo) ?
> > > > 
> > > > +               _mesa_half_to_float(inst->src[i].d & 0xffff) :
> > > > 
> > > > +               fabs(_mesa_half_to_float(inst->src[i].d &
> > > > 0xffff));
> > > > 
> > > > +         }
> > > > 
> > > > +         struct imm *imm = find_imm(&table, val, inst-
> > > > >src[i].type);
> > > > 
> > > > 
> > > > 
> > > >           if (imm) {
> > > > 
> > > >              bblock_t *intersection = cfg_t::intersect(block,
> > > > imm->block);
> > > > 
> > > > @@ -238,6 +262,7 @@ fs_visitor::opt_combine_constants()
> > > > 
> > > >              imm->uses = new(const_ctx) exec_list();
> > > > 
> > > >              imm->uses->push_tail(link(const_ctx, &inst-
> > > > >src[i]));
> > > > 
> > > >              imm->val = val;
> > > > 
> > > > +            imm->type = inst->src[i].type;
> > > > 
> > > >              imm->uses_by_coissue = could_coissue(devinfo,
> > > > inst);
> > > > 
> > > >              imm->must_promote = must_promote_imm(devinfo,
> > > > inst);
> > > > 
> > > >              imm->first_use_ip = ip;
> > > > 
> > > > @@ -278,12 +303,23 @@ fs_visitor::opt_combine_constants()
> > > > 
> > > >                        imm->block-
> > > > >last_non_control_flow_inst()->next);
> > > > 
> > > >        const fs_builder ibld = bld.at(imm->block,
> > > > n).exec_all().group(1, 0);
> > > > 
> > > > 
> > > > 
> > > > -      ibld.MOV(reg, brw_imm_f(imm->val));
> > > > 
> > > > +      reg = retype(reg, imm->type);
> > > > 
> > > > +      if (imm->type == BRW_REGISTER_TYPE_F) {
> > > > 
> > > > +         ibld.MOV(reg, brw_imm_f(imm->val));
> > > > 
> > > > +      } else {
> > > > 
> > > > +         const uint16_t val_hf = _mesa_float_to_half(imm-
> > > > >val);
> > > > 
> > > > +         ibld.MOV(reg, retype(brw_imm_uw(val_hf),
> > > > BRW_REGISTER_TYPE_HF));
> > > > 
> > > > +      }
> > > > 
> > > >        imm->nr = reg.nr;
> > > > 
> > > >        imm->subreg_offset = reg.offset;
> > > > 
> > > > 
> > > > 
> > > > +      /* Keep offsets 32-bit aligned since we are mixing 32-
> > > > bit and 16-bit
> > > > 
> > > > +       * constants into the same register
> > > > 
> > > > +       *
> > > > 
> > > > +       * TODO: try to pack pairs of HF constants into each 32-
> > > > bit slot
> > > > 
> > > > +       */
> > > > 
> > > >        reg.offset += sizeof(float);
> > > > 
> > > > -      if (reg.offset == 8 * sizeof(float)) {
> > > > 
> > > > +      if (reg.offset == REG_SIZE) {
> > > > 
> > > >           reg.nr = alloc.allocate(1);
> > > > 
> > > >           reg.offset = 0;
> > > > 
> > > >        }
> > > > 
> > > > @@ -295,12 +331,14 @@ fs_visitor::opt_combine_constants()
> > > > 
> > > >        foreach_list_typed(reg_link, link, link,
> > > > table.imm[i].uses) {
> > > > 
> > > >           fs_reg *reg = link->reg;
> > > > 
> > > >           assert((isnan(reg->f) && isnan(table.imm[i].val)) ||
> > > > 
> > > > -                fabsf(reg->f) == fabs(table.imm[i].val));
> > > > 
> > > > +                fabsf(reg->f) == fabs(table.imm[i].val) ||
> > > > 
> > > > +                table.imm[i].type == BRW_REGISTER_TYPE_HF);
> > > > 
> > > > 
> > > > 
> > > >           reg->file = VGRF;
> > > > 
> > > > +         reg->type = table.imm[i].type;
> > > > 
> > > >           reg->offset = table.imm[i].subreg_offset;
> > > > 
> > > >           reg->stride = 0;
> > > > 
> > > > -         reg->negate = signbit(reg->f) !=
> > > > signbit(table.imm[i].val);
> > > > 
> > > > +         reg->negate = needs_negate(reg->f, table.imm[i].val,
> > > > table.imm[i].type);
> > > > 
> > > >           reg->nr = table.imm[i].nr;
> > > > 
> > > >        }
> > > > 
> > > >     }
> > > > 
> 
> 
> 
> 
> 
> 
> 
>