[v4,27/40] intel/compiler: generalize the combine constants pass

Submitted by Iago Toral Quiroga on Feb. 12, 2019, 11:55 a.m.

Details

Message ID 20190212115607.21467-28-itoral@igalia.com
State New
Headers show
Series "intel: VK_KHR_shader_float16_int8 implementation" ( rev: 16 15 14 13 12 11 10 9 8 7 6 ) in Mesa

Not browsing as part of any series.

Commit Message

Iago Toral Quiroga Feb. 12, 2019, 11:55 a.m.
At the very least we need it to handle HF too, since we are doing
constant propagation for MAD and LRP, which relies on this pass
to promote the immediates to GRF in the end, but ideally
we want it to support even more types so we can take advantage
of it to improve register pressure in some scenarios.
---
 .../compiler/brw_fs_combine_constants.cpp     | 202 ++++++++++++++++--
 1 file changed, 180 insertions(+), 22 deletions(-)

Patch hide | download patch | download mbox

diff --git a/src/intel/compiler/brw_fs_combine_constants.cpp b/src/intel/compiler/brw_fs_combine_constants.cpp
index 7343f77bb45..5d79f1a0826 100644
--- a/src/intel/compiler/brw_fs_combine_constants.cpp
+++ b/src/intel/compiler/brw_fs_combine_constants.cpp
@@ -36,6 +36,7 @@ 
 
 #include "brw_fs.h"
 #include "brw_cfg.h"
+#include "util/half_float.h"
 
 using namespace brw;
 
@@ -114,8 +115,17 @@  struct imm {
     */
    exec_list *uses;
 
-   /** The immediate value.  We currently only handle floats. */
-   float val;
+   /** The immediate value */
+   union {
+      char bytes[8];
+      float f;
+      int32_t d;
+      int16_t w;
+   };
+   uint8_t size;
+
+   /** When promoting half-float we need to account for certain restrictions */
+   bool is_half_float;
 
    /**
     * The GRF register and subregister number where we've decided to store the
@@ -145,10 +155,11 @@  struct table {
 };
 
 static struct imm *
-find_imm(struct table *table, float val)
+find_imm(struct table *table, void *data, uint8_t size)
 {
    for (int i = 0; i < table->len; i++) {
-      if (table->imm[i].val == val) {
+      if (table->imm[i].size == size &&
+          !memcmp(table->imm[i].bytes, data, size)) {
          return &table->imm[i];
       }
    }
@@ -190,6 +201,96 @@  compare(const void *_a, const void *_b)
    return a->first_use_ip - b->first_use_ip;
 }
 
+static bool
+get_constant_value(const struct gen_device_info *devinfo,
+                   const fs_inst *inst, uint32_t src_idx,
+                   void *out, brw_reg_type *out_type)
+{
+   const bool can_do_source_mods = inst->can_do_source_mods(devinfo);
+   const fs_reg *src = &inst->src[src_idx];
+
+   *out_type = src->type;
+
+   switch (*out_type) {
+   case BRW_REGISTER_TYPE_F: {
+      float val = !can_do_source_mods ? src->f : fabsf(src->f);
+      memcpy(out, &val, 4);
+      break;
+   }
+   case BRW_REGISTER_TYPE_HF: {
+      uint16_t val = src->d & 0xffffu;
+      if (can_do_source_mods)
+         val = _mesa_float_to_half(fabsf(_mesa_half_to_float(val)));
+      memcpy(out, &val, 2);
+      break;
+   }
+   case BRW_REGISTER_TYPE_D: {
+      int32_t val = !can_do_source_mods ? src->d : abs(src->d);
+      memcpy(out, &val, 4);
+      break;
+   }
+   case BRW_REGISTER_TYPE_UD:
+      memcpy(out, &src->ud, 4);
+      break;
+   case BRW_REGISTER_TYPE_W: {
+      int16_t val = src->d & 0xffffu;
+      if (can_do_source_mods)
+         val = abs(val);
+      memcpy(out, &val, 2);
+      break;
+   }
+   case BRW_REGISTER_TYPE_UW:
+      memcpy(out, &src->ud, 2);
+      break;
+   default:
+      return false;
+   };
+
+   return true;
+}
+
+static struct brw_reg
+build_imm_reg_for_copy(struct imm *imm)
+{
+   switch (imm->size) {
+   case 4:
+      return brw_imm_d(imm->d);
+   case 2:
+      return brw_imm_w(imm->w);
+   default:
+      unreachable("not implemented");
+   }
+}
+
+static inline uint32_t
+get_alignment_for_imm(const struct imm *imm)
+{
+   if (imm->is_half_float)
+      return 4; /* At least MAD seems to require this */
+   else
+      return imm->size;
+}
+
+static bool
+needs_negate(const struct fs_reg *reg, const struct imm *imm)
+{
+   switch (reg->type) {
+   case BRW_REGISTER_TYPE_F:
+      return signbit(reg->f) != signbit(imm->f);
+   case BRW_REGISTER_TYPE_D:
+      return (reg->d < 0) != (imm->d < 0);
+   case BRW_REGISTER_TYPE_HF:
+      return (reg->d & 0x8000u) != (imm->w & 0x8000u);
+   case BRW_REGISTER_TYPE_W:
+      return ((reg->d & 0xffffu) < 0) != (imm->w < 0);
+   case BRW_REGISTER_TYPE_UD:
+   case BRW_REGISTER_TYPE_UW:
+      return false;
+   default:
+      unreachable("not implemented");
+   };
+}
+
 bool
 fs_visitor::opt_combine_constants()
 {
@@ -214,13 +315,17 @@  fs_visitor::opt_combine_constants()
          continue;
 
       for (int i = 0; i < inst->sources; i++) {
-         if (inst->src[i].file != IMM ||
-             inst->src[i].type != BRW_REGISTER_TYPE_F)
+         if (inst->src[i].file != IMM)
             continue;
 
-         float val = !inst->can_do_source_mods(devinfo) ? inst->src[i].f :
-                     fabs(inst->src[i].f);
-         struct imm *imm = find_imm(&table, val);
+         char data[8];
+         brw_reg_type type;
+         if (!get_constant_value(devinfo, inst, i, data, &type))
+            continue;
+
+         uint8_t size = type_sz(type);
+
+         struct imm *imm = find_imm(&table, data, size);
 
          if (imm) {
             bblock_t *intersection = cfg_t::intersect(block, imm->block);
@@ -237,7 +342,9 @@  fs_visitor::opt_combine_constants()
             imm->inst = inst;
             imm->uses = new(const_ctx) exec_list();
             imm->uses->push_tail(link(const_ctx, &inst->src[i]));
-            imm->val = val;
+            memcpy(imm->bytes, data, size);
+            imm->size = size;
+            imm->is_half_float = type == BRW_REGISTER_TYPE_HF;
             imm->uses_by_coissue = could_coissue(devinfo, inst);
             imm->must_promote = must_promote_imm(devinfo, inst);
             imm->first_use_ip = ip;
@@ -276,17 +383,40 @@  fs_visitor::opt_combine_constants()
        */
       exec_node *n = (imm->inst ? imm->inst :
                       imm->block->last_non_control_flow_inst()->next);
-      const fs_builder ibld = bld.at(imm->block, n).exec_all().group(1, 0);
 
-      ibld.MOV(reg, brw_imm_f(imm->val));
-      imm->nr = reg.nr;
-      imm->subreg_offset = reg.offset;
+      /* From the BDW and CHV PRM, 3D Media GPGPU, Special Restrictions:
+       *
+       *   "In Align16 mode, the channel selects and channel enables apply to a
+       *    pair of half-floats, because these parameters are defined for DWord
+       *    elements ONLY. This is applicable when both source and destination
+       *    are half-floats."
+       *
+       * This means that Align16 instructions that use promoted HF immediates
+       * and use a <0,1,0>:HF region would read 2 HF slots instead of
+       * replicating the single one we want. To avoid this, we always populate
+       * both HF slots within a DWord with the constant.
+       */
+      const uint32_t width = devinfo->gen == 8 && imm->is_half_float ? 2 : 1;
+      const fs_builder ibld = bld.at(imm->block, n).exec_all().group(width, 0);
+
+      /* Put the immediate in an offset aligned to its size. Some instructions
+       * seem to have additional alignment requirements, so account for that
+       * too.
+       */
+      reg.offset = ALIGN(reg.offset, get_alignment_for_imm(imm));
 
-      reg.offset += sizeof(float);
-      if (reg.offset == 8 * sizeof(float)) {
+      /* Ensure we have enough space in the register to copy the immediate */
+      struct brw_reg imm_reg = build_imm_reg_for_copy(imm);
+      if (reg.offset + type_sz(imm_reg.type) > REG_SIZE) {
          reg.nr = alloc.allocate(1);
          reg.offset = 0;
       }
+
+      ibld.MOV(retype(reg, imm_reg.type), imm_reg);
+      imm->nr = reg.nr;
+      imm->subreg_offset = reg.offset;
+
+      reg.offset += imm->size;
    }
    promoted_constants = table.len;
 
@@ -294,13 +424,41 @@  fs_visitor::opt_combine_constants()
    for (int i = 0; i < table.len; i++) {
       foreach_list_typed(reg_link, link, link, table.imm[i].uses) {
          fs_reg *reg = link->reg;
-         assert((isnan(reg->f) && isnan(table.imm[i].val)) ||
-                fabsf(reg->f) == fabs(table.imm[i].val));
+#ifdef DEBUG
+         switch (reg->type) {
+         case BRW_REGISTER_TYPE_F:
+            assert((isnan(reg->f) && isnan(table.imm[i].f)) ||
+                   (fabsf(reg->f) == fabsf(table.imm[i].f)));
+            break;
+         case BRW_REGISTER_TYPE_HF:
+            assert((isnan(_mesa_half_to_float(reg->d & 0xffffu)) &&
+                    isnan(_mesa_half_to_float(table.imm[i].w))) ||
+                   (fabsf(_mesa_half_to_float(reg->d & 0xffffu)) ==
+                    fabsf(_mesa_half_to_float(table.imm[i].w))));
+            break;
+         case BRW_REGISTER_TYPE_D:
+            assert(reg->type == BRW_REGISTER_TYPE_D &&
+                   abs(reg->d) == abs(table.imm[i].d));
+            break;
+         case BRW_REGISTER_TYPE_UD:
+            assert(reg->d == table.imm[i].d);
+            break;
+         case BRW_REGISTER_TYPE_W:
+            assert(abs((int16_t) (reg->d & 0xffff)) == table.imm[i].w);
+            break;
+         case BRW_REGISTER_TYPE_UW:
+            assert(reg->type == BRW_REGISTER_TYPE_UW &&
+                   (reg->ud & 0xffffu) == (uint16_t) table.imm[i].w);
+            break;
+         default:
+            break;
+         }
+#endif
 
          reg->file = VGRF;
          reg->offset = table.imm[i].subreg_offset;
          reg->stride = 0;
-         reg->negate = signbit(reg->f) != signbit(table.imm[i].val);
+         reg->negate = needs_negate(reg, &table.imm[i]);
          reg->nr = table.imm[i].nr;
       }
    }
@@ -309,9 +467,9 @@  fs_visitor::opt_combine_constants()
       for (int i = 0; i < table.len; i++) {
          struct imm *imm = &table.imm[i];
 
-         printf("%.3fF - block %3d, reg %3d sub %2d, Uses: (%2d, %2d), "
-                "IP: %4d to %4d, length %4d\n",
-                imm->val,
+         printf("0x%016" PRIx64 " - block %3d, reg %3d sub %2d, "
+                "Uses: (%2d, %2d), IP: %4d to %4d, length %4d\n",
+                (uint64_t)(imm->d & BITFIELD64_MASK(imm->size * 8)),
                 imm->block->num,
                 imm->nr,
                 imm->subreg_offset,

Comments

On Tue, Feb 12, 2019 at 5:57 AM Iago Toral Quiroga <itoral@igalia.com>
wrote:

> At the very least we need it to handle HF too, since we are doing
> constant propagation for MAD and LRP, which relies on this pass
> to promote the immediates to GRF in the end, but ideally
> we want it to support even more types so we can take advantage
> of it to improve register pressure in some scenarios.
> ---
>  .../compiler/brw_fs_combine_constants.cpp     | 202 ++++++++++++++++--
>  1 file changed, 180 insertions(+), 22 deletions(-)
>
> diff --git a/src/intel/compiler/brw_fs_combine_constants.cpp
> b/src/intel/compiler/brw_fs_combine_constants.cpp
> index 7343f77bb45..5d79f1a0826 100644
> --- a/src/intel/compiler/brw_fs_combine_constants.cpp
> +++ b/src/intel/compiler/brw_fs_combine_constants.cpp
> @@ -36,6 +36,7 @@
>
>  #include "brw_fs.h"
>  #include "brw_cfg.h"
> +#include "util/half_float.h"
>
>  using namespace brw;
>
> @@ -114,8 +115,17 @@ struct imm {
>      */
>     exec_list *uses;
>
> -   /** The immediate value.  We currently only handle floats. */
> -   float val;
> +   /** The immediate value */
> +   union {
> +      char bytes[8];
> +      float f;
> +      int32_t d;
> +      int16_t w;
> +   };
> +   uint8_t size;
> +
> +   /** When promoting half-float we need to account for certain
> restrictions */
> +   bool is_half_float;
>
>     /**
>      * The GRF register and subregister number where we've decided to
> store the
> @@ -145,10 +155,11 @@ struct table {
>  };
>
>  static struct imm *
> -find_imm(struct table *table, float val)
> +find_imm(struct table *table, void *data, uint8_t size)
>  {
>     for (int i = 0; i < table->len; i++) {
> -      if (table->imm[i].val == val) {
> +      if (table->imm[i].size == size &&
> +          !memcmp(table->imm[i].bytes, data, size)) {
>           return &table->imm[i];
>        }
>     }
> @@ -190,6 +201,96 @@ compare(const void *_a, const void *_b)
>     return a->first_use_ip - b->first_use_ip;
>  }
>
> +static bool
> +get_constant_value(const struct gen_device_info *devinfo,
> +                   const fs_inst *inst, uint32_t src_idx,
> +                   void *out, brw_reg_type *out_type)
> +{
> +   const bool can_do_source_mods = inst->can_do_source_mods(devinfo);
> +   const fs_reg *src = &inst->src[src_idx];
> +
> +   *out_type = src->type;
> +
> +   switch (*out_type) {
> +   case BRW_REGISTER_TYPE_F: {
> +      float val = !can_do_source_mods ? src->f : fabsf(src->f);
> +      memcpy(out, &val, 4);
> +      break;
> +   }
> +   case BRW_REGISTER_TYPE_HF: {
> +      uint16_t val = src->d & 0xffffu;
> +      if (can_do_source_mods)
> +         val = _mesa_float_to_half(fabsf(_mesa_half_to_float(val)));
> +      memcpy(out, &val, 2);
> +      break;
> +   }
> +   case BRW_REGISTER_TYPE_D: {
> +      int32_t val = !can_do_source_mods ? src->d : abs(src->d);
> +      memcpy(out, &val, 4);
> +      break;
> +   }
> +   case BRW_REGISTER_TYPE_UD:
> +      memcpy(out, &src->ud, 4);
> +      break;
> +   case BRW_REGISTER_TYPE_W: {
> +      int16_t val = src->d & 0xffffu;
> +      if (can_do_source_mods)
> +         val = abs(val);
> +      memcpy(out, &val, 2);
> +      break;
> +   }
> +   case BRW_REGISTER_TYPE_UW:
> +      memcpy(out, &src->ud, 2);
> +      break;
>

You could also throw in DF and Q types.  This is probably sufficient for
now though.


> +   default:
> +      return false;
> +   };
> +
> +   return true;
> +}
> +
> +static struct brw_reg
> +build_imm_reg_for_copy(struct imm *imm)
> +{
> +   switch (imm->size) {
> +   case 4:
> +      return brw_imm_d(imm->d);
> +   case 2:
> +      return brw_imm_w(imm->w);
> +   default:
> +      unreachable("not implemented");
> +   }
> +}
> +
> +static inline uint32_t
> +get_alignment_for_imm(const struct imm *imm)
> +{
> +   if (imm->is_half_float)
> +      return 4; /* At least MAD seems to require this */
> +   else
> +      return imm->size;
> +}
> +
> +static bool
> +needs_negate(const struct fs_reg *reg, const struct imm *imm)
> +{
> +   switch (reg->type) {
> +   case BRW_REGISTER_TYPE_F:
> +      return signbit(reg->f) != signbit(imm->f);
> +   case BRW_REGISTER_TYPE_D:
> +      return (reg->d < 0) != (imm->d < 0);
> +   case BRW_REGISTER_TYPE_HF:
> +      return (reg->d & 0x8000u) != (imm->w & 0x8000u);
> +   case BRW_REGISTER_TYPE_W:
> +      return ((reg->d & 0xffffu) < 0) != (imm->w < 0);
> +   case BRW_REGISTER_TYPE_UD:
> +   case BRW_REGISTER_TYPE_UW:
> +      return false;
> +   default:
> +      unreachable("not implemented");
> +   };
> +}
> +
>  bool
>  fs_visitor::opt_combine_constants()
>  {
> @@ -214,13 +315,17 @@ fs_visitor::opt_combine_constants()
>           continue;
>
>        for (int i = 0; i < inst->sources; i++) {
> -         if (inst->src[i].file != IMM ||
> -             inst->src[i].type != BRW_REGISTER_TYPE_F)
> +         if (inst->src[i].file != IMM)
>              continue;
>
> -         float val = !inst->can_do_source_mods(devinfo) ? inst->src[i].f :
> -                     fabs(inst->src[i].f);
> -         struct imm *imm = find_imm(&table, val);
> +         char data[8];
> +         brw_reg_type type;
> +         if (!get_constant_value(devinfo, inst, i, data, &type))
> +            continue;
> +
> +         uint8_t size = type_sz(type);
> +
> +         struct imm *imm = find_imm(&table, data, size);
>
>           if (imm) {
>              bblock_t *intersection = cfg_t::intersect(block, imm->block);
> @@ -237,7 +342,9 @@ fs_visitor::opt_combine_constants()
>              imm->inst = inst;
>              imm->uses = new(const_ctx) exec_list();
>              imm->uses->push_tail(link(const_ctx, &inst->src[i]));
> -            imm->val = val;
> +            memcpy(imm->bytes, data, size);
> +            imm->size = size;
> +            imm->is_half_float = type == BRW_REGISTER_TYPE_HF;
>

In the "if (imm)" case above, I think we want to do

if (type == BRW_REGISTER_TYPE_HF)
   imm->is_half_float = true;

in case we get a constant that's used first as a uint16_t (which won't set
is_half_float) and then later as a float16_t.


>              imm->uses_by_coissue = could_coissue(devinfo, inst);
>              imm->must_promote = must_promote_imm(devinfo, inst);
>              imm->first_use_ip = ip;
> @@ -276,17 +383,40 @@ fs_visitor::opt_combine_constants()
>         */
>        exec_node *n = (imm->inst ? imm->inst :
>                        imm->block->last_non_control_flow_inst()->next);
> -      const fs_builder ibld = bld.at(imm->block, n).exec_all().group(1,
> 0);
>
> -      ibld.MOV(reg, brw_imm_f(imm->val));
> -      imm->nr = reg.nr;
> -      imm->subreg_offset = reg.offset;
> +      /* From the BDW and CHV PRM, 3D Media GPGPU, Special Restrictions:
> +       *
> +       *   "In Align16 mode, the channel selects and channel enables
> apply to a
> +       *    pair of half-floats, because these parameters are defined for
> DWord
> +       *    elements ONLY. This is applicable when both source and
> destination
> +       *    are half-floats."
> +       *
> +       * This means that Align16 instructions that use promoted HF
> immediates
> +       * and use a <0,1,0>:HF region would read 2 HF slots instead of
> +       * replicating the single one we want. To avoid this, we always
> populate
> +       * both HF slots within a DWord with the constant.
> +       */
> +      const uint32_t width = devinfo->gen == 8 && imm->is_half_float ? 2
> : 1;
> +      const fs_builder ibld = bld.at(imm->block,
> n).exec_all().group(width, 0);
> +
> +      /* Put the immediate in an offset aligned to its size. Some
> instructions
> +       * seem to have additional alignment requirements, so account for
> that
> +       * too.
> +       */
> +      reg.offset = ALIGN(reg.offset, get_alignment_for_imm(imm));
>
> -      reg.offset += sizeof(float);
> -      if (reg.offset == 8 * sizeof(float)) {
> +      /* Ensure we have enough space in the register to copy the
> immediate */
> +      struct brw_reg imm_reg = build_imm_reg_for_copy(imm);
> +      if (reg.offset + type_sz(imm_reg.type) > REG_SIZE) {
>

I think you need to multiply by width here as well.


>           reg.nr = alloc.allocate(1);
>           reg.offset = 0;
>        }
> +
> +      ibld.MOV(retype(reg, imm_reg.type), imm_reg);
> +      imm->nr = reg.nr;
> +      imm->subreg_offset = reg.offset;
> +
> +      reg.offset += imm->size;
>

Multiply by width.

You can ignore the comment about DF and Q types if you'd like.  With all
the others addressed, this is

Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>


>     }
>     promoted_constants = table.len;
>
> @@ -294,13 +424,41 @@ fs_visitor::opt_combine_constants()
>     for (int i = 0; i < table.len; i++) {
>        foreach_list_typed(reg_link, link, link, table.imm[i].uses) {
>           fs_reg *reg = link->reg;
> -         assert((isnan(reg->f) && isnan(table.imm[i].val)) ||
> -                fabsf(reg->f) == fabs(table.imm[i].val));
> +#ifdef DEBUG
> +         switch (reg->type) {
> +         case BRW_REGISTER_TYPE_F:
> +            assert((isnan(reg->f) && isnan(table.imm[i].f)) ||
> +                   (fabsf(reg->f) == fabsf(table.imm[i].f)));
> +            break;
> +         case BRW_REGISTER_TYPE_HF:
> +            assert((isnan(_mesa_half_to_float(reg->d & 0xffffu)) &&
> +                    isnan(_mesa_half_to_float(table.imm[i].w))) ||
> +                   (fabsf(_mesa_half_to_float(reg->d & 0xffffu)) ==
> +                    fabsf(_mesa_half_to_float(table.imm[i].w))));
> +            break;
> +         case BRW_REGISTER_TYPE_D:
> +            assert(reg->type == BRW_REGISTER_TYPE_D &&
> +                   abs(reg->d) == abs(table.imm[i].d));
> +            break;
> +         case BRW_REGISTER_TYPE_UD:
> +            assert(reg->d == table.imm[i].d);
> +            break;
> +         case BRW_REGISTER_TYPE_W:
> +            assert(abs((int16_t) (reg->d & 0xffff)) == table.imm[i].w);
> +            break;
> +         case BRW_REGISTER_TYPE_UW:
> +            assert(reg->type == BRW_REGISTER_TYPE_UW &&
> +                   (reg->ud & 0xffffu) == (uint16_t) table.imm[i].w);
> +            break;
> +         default:
> +            break;
> +         }
> +#endif
>
>           reg->file = VGRF;
>           reg->offset = table.imm[i].subreg_offset;
>           reg->stride = 0;
> -         reg->negate = signbit(reg->f) != signbit(table.imm[i].val);
> +         reg->negate = needs_negate(reg, &table.imm[i]);
>           reg->nr = table.imm[i].nr;
>        }
>     }
> @@ -309,9 +467,9 @@ fs_visitor::opt_combine_constants()
>        for (int i = 0; i < table.len; i++) {
>           struct imm *imm = &table.imm[i];
>
> -         printf("%.3fF - block %3d, reg %3d sub %2d, Uses: (%2d, %2d), "
> -                "IP: %4d to %4d, length %4d\n",
> -                imm->val,
> +         printf("0x%016" PRIx64 " - block %3d, reg %3d sub %2d, "
> +                "Uses: (%2d, %2d), IP: %4d to %4d, length %4d\n",
> +                (uint64_t)(imm->d & BITFIELD64_MASK(imm->size * 8)),
>                  imm->block->num,
>                  imm->nr,
>                  imm->subreg_offset,
> --
> 2.17.1
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
On Sat, 2019-02-16 at 09:29 -0600, Jason Ekstrand wrote:
> On Tue, Feb 12, 2019 at 5:57 AM Iago Toral Quiroga <itoral@igalia.com
> > wrote:
> > At the very least we need it to handle HF too, since we are doing
> > 
> > constant propagation for MAD and LRP, which relies on this pass
> > 
> > to promote the immediates to GRF in the end, but ideally
> > 
> > we want it to support even more types so we can take advantage
> > 
> > of it to improve register pressure in some scenarios.
> > 
> > ---
> > 
> >  .../compiler/brw_fs_combine_constants.cpp     | 202
> > ++++++++++++++++--
> > 
> >  1 file changed, 180 insertions(+), 22 deletions(-)
> > 
> > 
> > 
> > diff --git a/src/intel/compiler/brw_fs_combine_constants.cpp
> > b/src/intel/compiler/brw_fs_combine_constants.cpp
> > 
> > index 7343f77bb45..5d79f1a0826 100644
> > 
> > --- a/src/intel/compiler/brw_fs_combine_constants.cpp
> > 
> > +++ b/src/intel/compiler/brw_fs_combine_constants.cpp
> > 
> > @@ -36,6 +36,7 @@
> > 
> > 
> > 
> >  #include "brw_fs.h"
> > 
> >  #include "brw_cfg.h"
> > 
> > +#include "util/half_float.h"
> > 
> > 
> > 
> >  using namespace brw;
> > 
> > 
> > 
> > @@ -114,8 +115,17 @@ struct imm {
> > 
> >      */
> > 
> >     exec_list *uses;
> > 
> > 
> > 
> > -   /** The immediate value.  We currently only handle floats. */
> > 
> > -   float val;
> > 
> > +   /** The immediate value */
> > 
> > +   union {
> > 
> > +      char bytes[8];
> > 
> > +      float f;
> > 
> > +      int32_t d;
> > 
> > +      int16_t w;
> > 
> > +   };
> > 
> > +   uint8_t size;
> > 
> > +
> > 
> > +   /** When promoting half-float we need to account for certain
> > restrictions */
> > 
> > +   bool is_half_float;
> > 
> > 
> > 
> >     /**
> > 
> >      * The GRF register and subregister number where we've decided
> > to store the
> > 
> > @@ -145,10 +155,11 @@ struct table {
> > 
> >  };
> > 
> > 
> > 
> >  static struct imm *
> > 
> > -find_imm(struct table *table, float val)
> > 
> > +find_imm(struct table *table, void *data, uint8_t size)
> > 
> >  {
> > 
> >     for (int i = 0; i < table->len; i++) {
> > 
> > -      if (table->imm[i].val == val) {
> > 
> > +      if (table->imm[i].size == size &&
> > 
> > +          !memcmp(table->imm[i].bytes, data, size)) {
> > 
> >           return &table->imm[i];
> > 
> >        }
> > 
> >     }
> > 
> > @@ -190,6 +201,96 @@ compare(const void *_a, const void *_b)
> > 
> >     return a->first_use_ip - b->first_use_ip;
> > 
> >  }
> > 
> > 
> > 
> > +static bool
> > 
> > +get_constant_value(const struct gen_device_info *devinfo,
> > 
> > +                   const fs_inst *inst, uint32_t src_idx,
> > 
> > +                   void *out, brw_reg_type *out_type)
> > 
> > +{
> > 
> > +   const bool can_do_source_mods = inst-
> > >can_do_source_mods(devinfo);
> > 
> > +   const fs_reg *src = &inst->src[src_idx];
> > 
> > +
> > 
> > +   *out_type = src->type;
> > 
> > +
> > 
> > +   switch (*out_type) {
> > 
> > +   case BRW_REGISTER_TYPE_F: {
> > 
> > +      float val = !can_do_source_mods ? src->f : fabsf(src->f);
> > 
> > +      memcpy(out, &val, 4);
> > 
> > +      break;
> > 
> > +   }
> > 
> > +   case BRW_REGISTER_TYPE_HF: {
> > 
> > +      uint16_t val = src->d & 0xffffu;
> > 
> > +      if (can_do_source_mods)
> > 
> > +         val =
> > _mesa_float_to_half(fabsf(_mesa_half_to_float(val)));
> > 
> > +      memcpy(out, &val, 2);
> > 
> > +      break;
> > 
> > +   }
> > 
> > +   case BRW_REGISTER_TYPE_D: {
> > 
> > +      int32_t val = !can_do_source_mods ? src->d : abs(src->d);
> > 
> > +      memcpy(out, &val, 4);
> > 
> > +      break;
> > 
> > +   }
> > 
> > +   case BRW_REGISTER_TYPE_UD:
> > 
> > +      memcpy(out, &src->ud, 4);
> > 
> > +      break;
> > 
> > +   case BRW_REGISTER_TYPE_W: {
> > 
> > +      int16_t val = src->d & 0xffffu;
> > 
> > +      if (can_do_source_mods)
> > 
> > +         val = abs(val);
> > 
> > +      memcpy(out, &val, 2);
> > 
> > +      break;
> > 
> > +   }
> > 
> > +   case BRW_REGISTER_TYPE_UW:
> > 
> > +      memcpy(out, &src->ud, 2);
> > 
> > +      break;
> 
> You could also throw in DF and Q types.  This is probably sufficient
> for now though.

Sure, I was waiting to do that until I started enabling constant
propagation of 64-bit types but there is no reason why we can't leave
the pass prepared for that I guess. It seems that for platforms that
don't support 64-bit types we should not be seeing 64-bit constants
here, so I guess there is no risk.
>  
> > +   default:
> > 
> > +      return false;
> > 
> > +   };
> > 
> > +
> > 
> > +   return true;
> > 
> > +}
> > 
> > +
> > 
> > +static struct brw_reg
> > 
> > +build_imm_reg_for_copy(struct imm *imm)
> > 
> > +{
> > 
> > +   switch (imm->size) {
> > 
> > +   case 4:
> > 
> > +      return brw_imm_d(imm->d);
> > 
> > +   case 2:
> > 
> > +      return brw_imm_w(imm->w);
> > 
> > +   default:
> > 
> > +      unreachable("not implemented");
> > 
> > +   }
> > 
> > +}
> > 
> > +
> > 
> > +static inline uint32_t
> > 
> > +get_alignment_for_imm(const struct imm *imm)
> > 
> > +{
> > 
> > +   if (imm->is_half_float)
> > 
> > +      return 4; /* At least MAD seems to require this */
> > 
> > +   else
> > 
> > +      return imm->size;
> > 
> > +}
> > 
> > +
> > 
> > +static bool
> > 
> > +needs_negate(const struct fs_reg *reg, const struct imm *imm)
> > 
> > +{
> > 
> > +   switch (reg->type) {
> > 
> > +   case BRW_REGISTER_TYPE_F:
> > 
> > +      return signbit(reg->f) != signbit(imm->f);
> > 
> > +   case BRW_REGISTER_TYPE_D:
> > 
> > +      return (reg->d < 0) != (imm->d < 0);
> > 
> > +   case BRW_REGISTER_TYPE_HF:
> > 
> > +      return (reg->d & 0x8000u) != (imm->w & 0x8000u);
> > 
> > +   case BRW_REGISTER_TYPE_W:
> > 
> > +      return ((reg->d & 0xffffu) < 0) != (imm->w < 0);
> > 
> > +   case BRW_REGISTER_TYPE_UD:
> > 
> > +   case BRW_REGISTER_TYPE_UW:
> > 
> > +      return false;
> > 
> > +   default:
> > 
> > +      unreachable("not implemented");
> > 
> > +   };
> > 
> > +}
> > 
> > +
> > 
> >  bool
> > 
> >  fs_visitor::opt_combine_constants()
> > 
> >  {
> > 
> > @@ -214,13 +315,17 @@ fs_visitor::opt_combine_constants()
> > 
> >           continue;
> > 
> > 
> > 
> >        for (int i = 0; i < inst->sources; i++) {
> > 
> > -         if (inst->src[i].file != IMM ||
> > 
> > -             inst->src[i].type != BRW_REGISTER_TYPE_F)
> > 
> > +         if (inst->src[i].file != IMM)
> > 
> >              continue;
> > 
> > 
> > 
> > -         float val = !inst->can_do_source_mods(devinfo) ? inst-
> > >src[i].f :
> > 
> > -                     fabs(inst->src[i].f);
> > 
> > -         struct imm *imm = find_imm(&table, val);
> > 
> > +         char data[8];
> > 
> > +         brw_reg_type type;
> > 
> > +         if (!get_constant_value(devinfo, inst, i, data, &type))
> > 
> > +            continue;
> > 
> > +
> > 
> > +         uint8_t size = type_sz(type);
> > 
> > +
> > 
> > +         struct imm *imm = find_imm(&table, data, size);
> > 
> > 
> > 
> >           if (imm) {
> > 
> >              bblock_t *intersection = cfg_t::intersect(block, imm-
> > >block);
> > 
> > @@ -237,7 +342,9 @@ fs_visitor::opt_combine_constants()
> > 
> >              imm->inst = inst;
> > 
> >              imm->uses = new(const_ctx) exec_list();
> > 
> >              imm->uses->push_tail(link(const_ctx, &inst->src[i]));
> > 
> > -            imm->val = val;
> > 
> > +            memcpy(imm->bytes, data, size);
> > 
> > +            imm->size = size;
> > 
> > +            imm->is_half_float = type == BRW_REGISTER_TYPE_HF;
> 
> In the "if (imm)" case above, I think we want to do
> 
> if (type == BRW_REGISTER_TYPE_HF)
>    imm->is_half_float = true;
> 
> in case we get a constant that's used first as a uint16_t (which
> won't set is_half_float) and then later as a float16_t.

Yes, good catch.
>  
> >              imm->uses_by_coissue = could_coissue(devinfo, inst);
> > 
> >              imm->must_promote = must_promote_imm(devinfo, inst);
> > 
> >              imm->first_use_ip = ip;
> > 
> > @@ -276,17 +383,40 @@ fs_visitor::opt_combine_constants()
> > 
> >         */
> > 
> >        exec_node *n = (imm->inst ? imm->inst :
> > 
> >                        imm->block->last_non_control_flow_inst()-
> > >next);
> > 
> > -      const fs_builder ibld = bld.at(imm->block,
> > n).exec_all().group(1, 0);
> > 
> > 
> > 
> > -      ibld.MOV(reg, brw_imm_f(imm->val));
> > 
> > -      imm->nr = reg.nr;
> > 
> > -      imm->subreg_offset = reg.offset;
> > 
> > +      /* From the BDW and CHV PRM, 3D Media GPGPU, Special
> > Restrictions:
> > 
> > +       *
> > 
> > +       *   "In Align16 mode, the channel selects and channel
> > enables apply to a
> > 
> > +       *    pair of half-floats, because these parameters are
> > defined for DWord
> > 
> > +       *    elements ONLY. This is applicable when both source and
> > destination
> > 
> > +       *    are half-floats."
> > 
> > +       *
> > 
> > +       * This means that Align16 instructions that use promoted HF
> > immediates
> > 
> > +       * and use a <0,1,0>:HF region would read 2 HF slots instead
> > of
> > 
> > +       * replicating the single one we want. To avoid this, we
> > always populate
> > 
> > +       * both HF slots within a DWord with the constant.
> > 
> > +       */
> > 
> > +      const uint32_t width = devinfo->gen == 8 && imm-
> > >is_half_float ? 2 : 1;
> > 
> > +      const fs_builder ibld = bld.at(imm->block,
> > n).exec_all().group(width, 0);
> > 
> > +
> > 
> > +      /* Put the immediate in an offset aligned to its size. Some
> > instructions
> > 
> > +       * seem to have additional alignment requirements, so
> > account for that
> > 
> > +       * too.
> > 
> > +       */
> > 
> > +      reg.offset = ALIGN(reg.offset, get_alignment_for_imm(imm));
> > 
> > 
> > 
> > -      reg.offset += sizeof(float);
> > 
> > -      if (reg.offset == 8 * sizeof(float)) {
> > 
> > +      /* Ensure we have enough space in the register to copy the
> > immediate */
> > 
> > +      struct brw_reg imm_reg = build_imm_reg_for_copy(imm);
> > 
> > +      if (reg.offset + type_sz(imm_reg.type) > REG_SIZE) {
> 
> I think you need to multiply by width here as well.
>  
> >           reg.nr = alloc.allocate(1);
> > 
> >           reg.offset = 0;
> > 
> >        }
> > 
> > +
> > 
> > +      ibld.MOV(retype(reg, imm_reg.type), imm_reg);
> > 
> > +      imm->nr = reg.nr;
> > 
> > +      imm->subreg_offset = reg.offset;
> > 
> > +
> > 
> > +      reg.offset += imm->size;
> 
> Multiply by width.

Yes.
> You can ignore the comment about DF and Q types if you'd like.  With
> all the others addressed, this is
> Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
>  
> >     }
> > 
> >     promoted_constants = table.len;
> > 
> > 
> > 
> > @@ -294,13 +424,41 @@ fs_visitor::opt_combine_constants()
> > 
> >     for (int i = 0; i < table.len; i++) {
> > 
> >        foreach_list_typed(reg_link, link, link, table.imm[i].uses)
> > {
> > 
> >           fs_reg *reg = link->reg;
> > 
> > -         assert((isnan(reg->f) && isnan(table.imm[i].val)) ||
> > 
> > -                fabsf(reg->f) == fabs(table.imm[i].val));
> > 
> > +#ifdef DEBUG
> > 
> > +         switch (reg->type) {
> > 
> > +         case BRW_REGISTER_TYPE_F:
> > 
> > +            assert((isnan(reg->f) && isnan(table.imm[i].f)) ||
> > 
> > +                   (fabsf(reg->f) == fabsf(table.imm[i].f)));
> > 
> > +            break;
> > 
> > +         case BRW_REGISTER_TYPE_HF:
> > 
> > +            assert((isnan(_mesa_half_to_float(reg->d & 0xffffu))
> > &&
> > 
> > +                    isnan(_mesa_half_to_float(table.imm[i].w))) ||
> > 
> > +                   (fabsf(_mesa_half_to_float(reg->d & 0xffffu))
> > ==
> > 
> > +                    fabsf(_mesa_half_to_float(table.imm[i].w))));
> > 
> > +            break;
> > 
> > +         case BRW_REGISTER_TYPE_D:
> > 
> > +            assert(reg->type == BRW_REGISTER_TYPE_D &&
> > 
> > +                   abs(reg->d) == abs(table.imm[i].d));
> > 
> > +            break;
> > 
> > +         case BRW_REGISTER_TYPE_UD:
> > 
> > +            assert(reg->d == table.imm[i].d);
> > 
> > +            break;
> > 
> > +         case BRW_REGISTER_TYPE_W:
> > 
> > +            assert(abs((int16_t) (reg->d & 0xffff)) ==
> > table.imm[i].w);
> > 
> > +            break;
> > 
> > +         case BRW_REGISTER_TYPE_UW:
> > 
> > +            assert(reg->type == BRW_REGISTER_TYPE_UW &&
> > 
> > +                   (reg->ud & 0xffffu) == (uint16_t)
> > table.imm[i].w);
> > 
> > +            break;
> > 
> > +         default:
> > 
> > +            break;
> > 
> > +         }
> > 
> > +#endif
> > 
> > 
> > 
> >           reg->file = VGRF;
> > 
> >           reg->offset = table.imm[i].subreg_offset;
> > 
> >           reg->stride = 0;
> > 
> > -         reg->negate = signbit(reg->f) !=
> > signbit(table.imm[i].val);
> > 
> > +         reg->negate = needs_negate(reg, &table.imm[i]);
> > 
> >           reg->nr = table.imm[i].nr;
> > 
> >        }
> > 
> >     }
> > 
> > @@ -309,9 +467,9 @@ fs_visitor::opt_combine_constants()
> > 
> >        for (int i = 0; i < table.len; i++) {
> > 
> >           struct imm *imm = &table.imm[i];
> > 
> > 
> > 
> > -         printf("%.3fF - block %3d, reg %3d sub %2d, Uses: (%2d,
> > %2d), "
> > 
> > -                "IP: %4d to %4d, length %4d\n",
> > 
> > -                imm->val,
> > 
> > +         printf("0x%016" PRIx64 " - block %3d, reg %3d sub %2d, "
> > 
> > +                "Uses: (%2d, %2d), IP: %4d to %4d, length %4d\n",
> > 
> > +                (uint64_t)(imm->d & BITFIELD64_MASK(imm->size *
> > 8)),
> > 
> >                  imm->block->num,
> > 
> >                  imm->nr,
> > 
> >                  imm->subreg_offset,
> >