[07/10] intel/fs: Introduce regioning lowering pass.

Submitted by Francisco Jerez on Dec. 29, 2018, 8:39 p.m.

Details

Message ID 20181229203904.18847-8-currojerez@riseup.net
State New
Headers show
Series "Series without cover letter" ( rev: 1 ) in Mesa

Not browsing as part of any series.

Commit Message

Francisco Jerez Dec. 29, 2018, 8:39 p.m.
This legalization pass is meant to handle situations where the source
or destination regioning controls of an instruction are unsupported by
the hardware and need to be lowered away into separate instructions.
This should be more reliable and future-proof than the current
approach of handling CHV/BXT restrictions manually all over the
visitor.  The same mechanism is leveraged to lower unsupported type
conversions easily, which obsoletes the lower_conversions pass.
---
 src/intel/Makefile.sources                    |   1 +
 src/intel/compiler/brw_fs.cpp                 |   5 +-
 src/intel/compiler/brw_fs.h                   |  21 +-
 src/intel/compiler/brw_fs_lower_regioning.cpp | 382 ++++++++++++++++++
 src/intel/compiler/brw_ir_fs.h                |  10 +
 src/intel/compiler/meson.build                |   1 +
 6 files changed, 401 insertions(+), 19 deletions(-)
 create mode 100644 src/intel/compiler/brw_fs_lower_regioning.cpp

Patch hide | download patch | download mbox

diff --git a/src/intel/Makefile.sources b/src/intel/Makefile.sources
index 5e7d32293b7..6b9874d2b80 100644
--- a/src/intel/Makefile.sources
+++ b/src/intel/Makefile.sources
@@ -64,6 +64,7 @@  COMPILER_FILES = \
 	compiler/brw_fs_live_variables.h \
 	compiler/brw_fs_lower_conversions.cpp \
 	compiler/brw_fs_lower_pack.cpp \
+	compiler/brw_fs_lower_regioning.cpp \
 	compiler/brw_fs_nir.cpp \
 	compiler/brw_fs_reg_allocate.cpp \
 	compiler/brw_fs_register_coalesce.cpp \
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 889509badab..caa7a798332 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -6471,7 +6471,10 @@  fs_visitor::optimize()
       OPT(dead_code_eliminate);
    }
 
-   if (OPT(lower_conversions)) {
+   progress = false;
+   OPT(lower_conversions);
+   OPT(lower_regioning);
+   if (progress) {
       OPT(opt_copy_propagation);
       OPT(dead_code_eliminate);
       OPT(lower_simd_width);
diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index dc36ecc21ac..36825754931 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -164,6 +164,7 @@  public:
    void lower_uniform_pull_constant_loads();
    bool lower_load_payload();
    bool lower_pack();
+   bool lower_regioning();
    bool lower_conversions();
    bool lower_logical_sends();
    bool lower_integer_multiplication();
@@ -536,24 +537,8 @@  namespace brw {
       }
    }
 
-   /**
-    * Remove any modifiers from the \p i-th source region of the instruction,
-    * including negate, abs and any implicit type conversion to the execution
-    * type.  Instead any source modifiers will be implemented as a separate
-    * MOV instruction prior to the original instruction.
-    */
-   inline bool
-   lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)
-   {
-      assert(inst->components_read(i) == 1);
-      const fs_builder ibld(v, block, inst);
-      const fs_reg tmp = ibld.vgrf(get_exec_type(inst));
-
-      ibld.MOV(tmp, inst->src[i]);
-      inst->src[i] = tmp;
-
-      return true;
-   }
+   bool
+   lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i);
 }
 
 void shuffle_from_32bit_read(const brw::fs_builder &bld,
diff --git a/src/intel/compiler/brw_fs_lower_regioning.cpp b/src/intel/compiler/brw_fs_lower_regioning.cpp
new file mode 100644
index 00000000000..9578622401d
--- /dev/null
+++ b/src/intel/compiler/brw_fs_lower_regioning.cpp
@@ -0,0 +1,382 @@ 
+/*
+ * Copyright © 2018 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_fs.h"
+#include "brw_cfg.h"
+#include "brw_fs_builder.h"
+
+using namespace brw;
+
+namespace {
+   /* From the SKL PRM Vol 2a, "Move":
+    *
+    * "A mov with the same source and destination type, no source modifier,
+    *  and no saturation is a raw move. A packed byte destination region (B
+    *  or UB type with HorzStride == 1 and ExecSize > 1) can only be written
+    *  using raw move."
+    */
+   bool
+   is_byte_raw_mov(const fs_inst *inst)
+   {
+      return type_sz(inst->dst.type) == 1 &&
+             inst->opcode == BRW_OPCODE_MOV &&
+             inst->src[0].type == inst->dst.type &&
+             !inst->saturate &&
+             !inst->src[0].negate &&
+             !inst->src[0].abs;
+   }
+
+   /*
+    * Return an acceptable byte stride for the destination of an instruction
+    * that requires it to have some particular alignment.
+    */
+   unsigned
+   required_dst_byte_stride(const fs_inst *inst)
+   {
+      if (type_sz(inst->dst.type) < get_exec_type_size(inst) &&
+          !is_byte_raw_mov(inst)) {
+         return get_exec_type_size(inst);
+      } else {
+         unsigned stride = inst->dst.stride * type_sz(inst->dst.type);
+
+         for (unsigned i = 0; i < inst->sources; i++) {
+            if (!is_uniform(inst->src[i]))
+               stride = MAX2(stride, inst->src[i].stride *
+                             type_sz(inst->src[i].type));
+         }
+
+         return stride;
+      }
+   }
+
+   /*
+    * Return an acceptable byte sub-register offset for the destination of an
+    * instruction that requires it to be aligned to the sub-register offset of
+    * the sources.
+    */
+   unsigned
+   required_dst_byte_offset(const fs_inst *inst)
+   {
+      for (unsigned i = 0; i < inst->sources; i++) {
+         if (!is_uniform(inst->src[i]))
+            if (reg_offset(inst->src[i]) % REG_SIZE !=
+                reg_offset(inst->dst) % REG_SIZE)
+               return 0;
+      }
+
+      return reg_offset(inst->dst) % REG_SIZE;
+   }
+
+   /*
+    * Return whether the instruction has an unsupported channel bit layout
+    * specified for the i-th source region.
+    */
+   bool
+   has_invalid_src_region(const gen_device_info *devinfo, const fs_inst *inst,
+                          unsigned i)
+   {
+      if (is_unordered(inst)) {
+         return false;
+      } else {
+         const unsigned dst_byte_stride = inst->dst.stride * type_sz(inst->dst.type);
+         const unsigned src_byte_stride = inst->src[i].stride *
+            type_sz(inst->src[i].type);
+         const unsigned dst_byte_offset = reg_offset(inst->dst) % REG_SIZE;
+         const unsigned src_byte_offset = reg_offset(inst->src[i]) % REG_SIZE;
+
+         return has_dst_aligned_region_restriction(devinfo, inst) &&
+                !is_uniform(inst->src[i]) &&
+                (src_byte_stride != dst_byte_stride ||
+                 src_byte_offset != dst_byte_offset);
+      }
+   }
+
+   /*
+    * Return whether the instruction has an unsupported channel bit layout
+    * specified for the destination region.
+    */
+   bool
+   has_invalid_dst_region(const gen_device_info *devinfo,
+                          const fs_inst *inst)
+   {
+      if (is_unordered(inst)) {
+         return false;
+      } else {
+         const brw_reg_type exec_type = get_exec_type(inst);
+         const unsigned dst_byte_offset = reg_offset(inst->dst) % REG_SIZE;
+         const unsigned dst_byte_stride = inst->dst.stride * type_sz(inst->dst.type);
+         const bool is_narrowing_conversion = !is_byte_raw_mov(inst) &&
+            type_sz(inst->dst.type) < type_sz(exec_type);
+
+         return (has_dst_aligned_region_restriction(devinfo, inst) &&
+                 (required_dst_byte_stride(inst) != dst_byte_stride ||
+                  required_dst_byte_offset(inst) != dst_byte_offset)) ||
+                (is_narrowing_conversion &&
+                 required_dst_byte_stride(inst) != dst_byte_stride);
+      }
+   }
+
+   /*
+    * Return whether the instruction has unsupported source modifiers
+    * specified for the i-th source region.
+    */
+   bool
+   has_invalid_src_modifiers(const gen_device_info *devinfo, const fs_inst *inst,
+                             unsigned i)
+   {
+      return !inst->can_do_source_mods(devinfo) &&
+             (inst->src[i].negate || inst->src[i].abs);
+   }
+
+   /*
+    * Return whether the instruction has an unsupported type conversion
+    * specified for the destination.
+    */
+   bool
+   has_invalid_conversion(const gen_device_info *devinfo, const fs_inst *inst)
+   {
+      switch (inst->opcode) {
+      case BRW_OPCODE_MOV:
+         return false;
+      case BRW_OPCODE_SEL:
+         return inst->dst.type != get_exec_type(inst);
+      case SHADER_OPCODE_BROADCAST:
+      case SHADER_OPCODE_MOV_INDIRECT:
+         /* The source and destination types of these may are hard-coded to
+          * integer at codegen time due to hardware limitations of 64-bit
+          * types.
+          */
+         return ((devinfo->gen == 7 && !devinfo->is_haswell) ||
+                 devinfo->is_cherryview || gen_device_info_is_9lp(devinfo)) &&
+                type_sz(inst->src[0].type) > 4 &&
+                inst->dst.type != inst->src[0].type;
+      default:
+         /* FIXME: We assume the opcodes don't explicitly mentioned before
+          * just work fine with arbitrary conversions.
+          */
+         return false;
+      }
+   }
+
+   bool
+   lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst);
+}
+
+namespace brw {
+   /**
+    * Remove any modifiers from the \p i-th source region of the instruction,
+    * including negate, abs and any implicit type conversion to the execution
+    * type.  Instead any source modifiers will be implemented as a separate
+    * MOV instruction prior to the original instruction.
+    */
+   bool
+   lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)
+   {
+      assert(inst->components_read(i) == 1);
+      const fs_builder ibld(v, block, inst);
+      const fs_reg tmp = ibld.vgrf(get_exec_type(inst));
+
+      lower_instruction(v, block, ibld.MOV(tmp, inst->src[i]));
+      inst->src[i] = tmp;
+
+      return true;
+   }
+}
+
+namespace {
+   /**
+    * Remove any modifiers from the destination region of the instruction,
+    * including saturate, conditional mod and any implicit type conversion
+    * from the execution type.  Instead any destination modifiers will be
+    * implemented as a separate MOV instruction after the original
+    * instruction.
+    */
+   bool
+   lower_dst_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst)
+   {
+      const fs_builder ibld(v, block, inst);
+      const brw_reg_type type = get_exec_type(inst);
+      /* Not strictly necessary, but if possible use a temporary with the same
+       * channel alignment as the current destination in order to avoid
+       * violating the restrictions enforced later on by lower_src_region()
+       * and lower_dst_region(), which would introduce additional copy
+       * instructions into the program unnecessarily.
+       */
+      const unsigned stride =
+         type_sz(inst->dst.type) * inst->dst.stride <= type_sz(type) ? 1 :
+         type_sz(inst->dst.type) * inst->dst.stride / type_sz(type);
+      const fs_reg tmp = horiz_stride(ibld.vgrf(type, stride), stride);
+
+      /* Emit a MOV taking care of all the destination modifiers. */
+      fs_inst *mov = ibld.at(block, inst->next).MOV(inst->dst, tmp);
+      mov->saturate = inst->saturate;
+      mov->conditional_mod = inst->conditional_mod;
+      if (inst->opcode != BRW_OPCODE_SEL) {
+         mov->predicate = inst->predicate;
+         mov->predicate_inverse = inst->predicate_inverse;
+      }
+      mov->flag_subreg = inst->flag_subreg;
+      lower_instruction(v, block, mov);
+
+      /* Point the original instruction at the temporary, and clean up any
+       * destination modifiers.
+       */
+      assert(inst->size_written == inst->dst.component_size(inst->exec_size));
+      inst->dst = tmp;
+      inst->size_written = inst->dst.component_size(inst->exec_size);
+      inst->saturate = false;
+      inst->conditional_mod = BRW_CONDITIONAL_NONE;
+
+      return true;
+   }
+
+   /**
+    * Remove any non-trivial shuffling of data from the \p i-th source region
+    * of the instruction.  Instead implement the region as a series of integer
+    * copies into a temporary with the same channel layout as the destination.
+    */
+   bool
+   lower_src_region(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)
+   {
+      assert(inst->components_read(i) == 1);
+      const fs_builder ibld(v, block, inst);
+      const unsigned stride = type_sz(inst->dst.type) * inst->dst.stride /
+                              type_sz(inst->src[i].type);
+      assert(stride > 0);
+      const fs_reg tmp = horiz_stride(ibld.vgrf(inst->src[i].type, stride),
+                                      stride);
+
+      /* Emit a series of 32-bit integer copies with any source modifiers
+       * cleaned up (because their semantics are dependent on the type).
+       */
+      const brw_reg_type raw_type = brw_int_type(MIN2(type_sz(tmp.type), 4),
+                                                 false);
+      const unsigned n = type_sz(tmp.type) / type_sz(raw_type);
+      fs_reg raw_src = inst->src[i];
+      raw_src.negate = false;
+      raw_src.abs = false;
+
+      for (unsigned j = 0; j < n; j++)
+         ibld.MOV(subscript(tmp, raw_type, j), subscript(raw_src, raw_type, j));
+
+      /* Point the original instruction at the temporary, making sure to keep
+       * any source modifiers in the instruction.
+       */
+      fs_reg lower_src = tmp;
+      lower_src.negate = inst->src[i].negate;
+      lower_src.abs = inst->src[i].abs;
+      inst->src[i] = lower_src;
+
+      return true;
+   }
+
+   /**
+    * Remove any non-trivial shuffling of data from the destination region of
+    * the instruction.  Instead implement the region as a series of integer
+    * copies from a temporary with a channel layout compatible with the
+    * sources.
+    */
+   bool
+   lower_dst_region(fs_visitor *v, bblock_t *block, fs_inst *inst)
+   {
+      const fs_builder ibld(v, block, inst);
+      const unsigned stride = required_dst_byte_stride(inst) /
+                              type_sz(inst->dst.type);
+      assert(stride > 0);
+      const fs_reg tmp = horiz_stride(ibld.vgrf(inst->dst.type, stride),
+                                      stride);
+
+      /* Emit a series of 32-bit integer copies from the temporary into the
+       * original destination.
+       */
+      const brw_reg_type raw_type = brw_int_type(MIN2(type_sz(tmp.type), 4),
+                                                 false);
+      const unsigned n = type_sz(tmp.type) / type_sz(raw_type);
+
+      if (inst->predicate && inst->opcode != BRW_OPCODE_SEL) {
+         /* Note that in general we cannot simply predicate the copies on the
+          * same flag register as the original instruction, since it may have
+          * been overwritten by the instruction itself.  Instead initialize
+          * the temporary with the previous contents of the destination
+          * register.
+          */
+         for (unsigned j = 0; j < n; j++)
+            ibld.MOV(subscript(tmp, raw_type, j),
+                     subscript(inst->dst, raw_type, j));
+      }
+
+      for (unsigned j = 0; j < n; j++)
+         ibld.at(block, inst->next).MOV(subscript(inst->dst, raw_type, j),
+                                        subscript(tmp, raw_type, j));
+
+      /* Point the original instruction at the temporary, making sure to keep
+       * any destination modifiers in the instruction.
+       */
+      assert(inst->size_written == inst->dst.component_size(inst->exec_size));
+      inst->dst = tmp;
+      inst->size_written = inst->dst.component_size(inst->exec_size);
+
+      return true;
+   }
+
+   /**
+    * Legalize the source and destination regioning controls of the specified
+    * instruction.
+    */
+   bool
+   lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst)
+   {
+      const gen_device_info *devinfo = v->devinfo;
+      bool progress = false;
+
+      if (has_invalid_conversion(devinfo, inst))
+         progress |= lower_dst_modifiers(v, block, inst);
+
+      if (has_invalid_dst_region(devinfo, inst))
+         progress |= lower_dst_region(v, block, inst);
+
+      for (unsigned i = 0; i < inst->sources; i++) {
+         if (has_invalid_src_modifiers(devinfo, inst, i))
+            progress |= lower_src_modifiers(v, block, inst, i);
+
+         if (has_invalid_src_region(devinfo, inst, i))
+            progress |= lower_src_region(v, block, inst, i);
+      }
+
+      return progress;
+   }
+}
+
+bool
+fs_visitor::lower_regioning()
+{
+   bool progress = false;
+
+   foreach_block_and_inst(block, fs_inst, inst, cfg)
+      progress |= lower_instruction(this, block, inst);
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
diff --git a/src/intel/compiler/brw_ir_fs.h b/src/intel/compiler/brw_ir_fs.h
index 5bb92e4cc86..3c23fb375e4 100644
--- a/src/intel/compiler/brw_ir_fs.h
+++ b/src/intel/compiler/brw_ir_fs.h
@@ -486,6 +486,16 @@  get_exec_type_size(const fs_inst *inst)
    return type_sz(get_exec_type(inst));
 }
 
+/**
+ * Return whether the instruction isn't an ALU instruction and cannot be
+ * assumed to complete in-order.
+ */
+static inline bool
+is_unordered(const fs_inst *inst)
+{
+   return inst->mlen || inst->is_send_from_grf() || inst->is_math();
+}
+
 /**
  * Return whether the following regioning restriction applies to the specified
  * instruction.  From the Cherryview PRM Vol 7. "Register Region
diff --git a/src/intel/compiler/meson.build b/src/intel/compiler/meson.build
index 69ce2eab4cf..4af134b418e 100644
--- a/src/intel/compiler/meson.build
+++ b/src/intel/compiler/meson.build
@@ -57,6 +57,7 @@  libintel_compiler_files = files(
   'brw_fs_live_variables.h',
   'brw_fs_lower_conversions.cpp',
   'brw_fs_lower_pack.cpp',
+  'brw_fs_lower_regioning.cpp',
   'brw_fs_nir.cpp',
   'brw_fs_reg_allocate.cpp',
   'brw_fs_register_coalesce.cpp',

Comments

On Sat, 2018-12-29 at 12:39 -0800, Francisco Jerez wrote:
> This legalization pass is meant to handle situations where the source
> or destination regioning controls of an instruction are unsupported
> by
> the hardware and need to be lowered away into separate instructions.
> This should be more reliable and future-proof than the current
> approach of handling CHV/BXT restrictions manually all over the
> visitor.  The same mechanism is leveraged to lower unsupported type
> conversions easily, which obsoletes the lower_conversions pass.
> ---
>  src/intel/Makefile.sources                    |   1 +
>  src/intel/compiler/brw_fs.cpp                 |   5 +-
>  src/intel/compiler/brw_fs.h                   |  21 +-
>  src/intel/compiler/brw_fs_lower_regioning.cpp | 382
> ++++++++++++++++++
>  src/intel/compiler/brw_ir_fs.h                |  10 +
>  src/intel/compiler/meson.build                |   1 +
>  6 files changed, 401 insertions(+), 19 deletions(-)
>  create mode 100644 src/intel/compiler/brw_fs_lower_regioning.cpp
> 
> diff --git a/src/intel/Makefile.sources b/src/intel/Makefile.sources
> index 5e7d32293b7..6b9874d2b80 100644
> --- a/src/intel/Makefile.sources
> +++ b/src/intel/Makefile.sources
> @@ -64,6 +64,7 @@ COMPILER_FILES = \
>  	compiler/brw_fs_live_variables.h \
>  	compiler/brw_fs_lower_conversions.cpp \
>  	compiler/brw_fs_lower_pack.cpp \
> +	compiler/brw_fs_lower_regioning.cpp \
>  	compiler/brw_fs_nir.cpp \
>  	compiler/brw_fs_reg_allocate.cpp \
>  	compiler/brw_fs_register_coalesce.cpp \
> diff --git a/src/intel/compiler/brw_fs.cpp
> b/src/intel/compiler/brw_fs.cpp
> index 889509badab..caa7a798332 100644
> --- a/src/intel/compiler/brw_fs.cpp
> +++ b/src/intel/compiler/brw_fs.cpp
> @@ -6471,7 +6471,10 @@ fs_visitor::optimize()
>        OPT(dead_code_eliminate);
>     }
>  
> -   if (OPT(lower_conversions)) {
> +   progress = false;
> +   OPT(lower_conversions);
> +   OPT(lower_regioning);
> +   if (progress) {
>        OPT(opt_copy_propagation);
>        OPT(dead_code_eliminate);
>        OPT(lower_simd_width);
> diff --git a/src/intel/compiler/brw_fs.h
> b/src/intel/compiler/brw_fs.h
> index dc36ecc21ac..36825754931 100644
> --- a/src/intel/compiler/brw_fs.h
> +++ b/src/intel/compiler/brw_fs.h
> @@ -164,6 +164,7 @@ public:
>     void lower_uniform_pull_constant_loads();
>     bool lower_load_payload();
>     bool lower_pack();
> +   bool lower_regioning();
>     bool lower_conversions();
>     bool lower_logical_sends();
>     bool lower_integer_multiplication();
> @@ -536,24 +537,8 @@ namespace brw {
>        }
>     }
>  
> -   /**
> -    * Remove any modifiers from the \p i-th source region of the
> instruction,
> -    * including negate, abs and any implicit type conversion to the
> execution
> -    * type.  Instead any source modifiers will be implemented as a
> separate
> -    * MOV instruction prior to the original instruction.
> -    */
> -   inline bool
> -   lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst
> *inst, unsigned i)
> -   {
> -      assert(inst->components_read(i) == 1);
> -      const fs_builder ibld(v, block, inst);
> -      const fs_reg tmp = ibld.vgrf(get_exec_type(inst));
> -
> -      ibld.MOV(tmp, inst->src[i]);
> -      inst->src[i] = tmp;
> -
> -      return true;
> -   }
> +   bool
> +   lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst
> *inst, unsigned i);
>  }
>  
>  void shuffle_from_32bit_read(const brw::fs_builder &bld,
> diff --git a/src/intel/compiler/brw_fs_lower_regioning.cpp
> b/src/intel/compiler/brw_fs_lower_regioning.cpp
> new file mode 100644
> index 00000000000..9578622401d
> --- /dev/null
> +++ b/src/intel/compiler/brw_fs_lower_regioning.cpp
> @@ -0,0 +1,382 @@
> +/*
> + * Copyright © 2018 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person
> obtaining a
> + * copy of this software and associated documentation files (the
> "Software"),
> + * to deal in the Software without restriction, including without
> limitation
> + * the rights to use, copy, modify, merge, publish, distribute,
> sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom
> the
> + * Software is furnished to do so, subject to the following
> conditions:
> + *
> + * The above copyright notice and this permission notice (including
> the next
> + * paragraph) shall be included in all copies or substantial
> portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO
> EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
> OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> OTHER DEALINGS
> + * IN THE SOFTWARE.
> + */
> +
> +#include "brw_fs.h"
> +#include "brw_cfg.h"
> +#include "brw_fs_builder.h"
> +
> +using namespace brw;
> +
> +namespace {
> +   /* From the SKL PRM Vol 2a, "Move":
> +    *
> +    * "A mov with the same source and destination type, no source
> modifier,
> +    *  and no saturation is a raw move. A packed byte destination
> region (B
> +    *  or UB type with HorzStride == 1 and ExecSize > 1) can only be
> written
> +    *  using raw move."
> +    */
> +   bool
> +   is_byte_raw_mov(const fs_inst *inst)
> +   {
> +      return type_sz(inst->dst.type) == 1 &&
> +             inst->opcode == BRW_OPCODE_MOV &&
> +             inst->src[0].type == inst->dst.type &&
> +             !inst->saturate &&
> +             !inst->src[0].negate &&
> +             !inst->src[0].abs;
> +   }
> +
> +   /*
> +    * Return an acceptable byte stride for the destination of an
> instruction
> +    * that requires it to have some particular alignment.
> +    */
> +   unsigned
> +   required_dst_byte_stride(const fs_inst *inst)
> +   {
> +      if (type_sz(inst->dst.type) < get_exec_type_size(inst) &&
> +          !is_byte_raw_mov(inst)) {
> +         return get_exec_type_size(inst);
> +      } else {
> +         unsigned stride = inst->dst.stride * type_sz(inst-
> >dst.type);
> +
> +         for (unsigned i = 0; i < inst->sources; i++) {
> +            if (!is_uniform(inst->src[i]))
> +               stride = MAX2(stride, inst->src[i].stride *
> +                             type_sz(inst->src[i].type));
> +         }
> +
> +         return stride;
> +      }
> +   }
> +
> +   /*
> +    * Return an acceptable byte sub-register offset for the
> destination of an
> +    * instruction that requires it to be aligned to the sub-register 
> offset of
> +    * the sources.
> +    */
> +   unsigned
> +   required_dst_byte_offset(const fs_inst *inst)
> +   {
> +      for (unsigned i = 0; i < inst->sources; i++) {
> +         if (!is_uniform(inst->src[i]))
> +            if (reg_offset(inst->src[i]) % REG_SIZE !=
> +                reg_offset(inst->dst) % REG_SIZE)
> +               return 0;
> +      }
> +
> +      return reg_offset(inst->dst) % REG_SIZE;
> +   }
> +
> +   /*
> +    * Return whether the instruction has an unsupported channel bit
> layout
> +    * specified for the i-th source region.
> +    */
> +   bool
> +   has_invalid_src_region(const gen_device_info *devinfo, const
> fs_inst *inst,
> +                          unsigned i)
> +   {
> +      if (is_unordered(inst)) {
> +         return false;
> +      } else {
> +         const unsigned dst_byte_stride = inst->dst.stride *
> type_sz(inst->dst.type);
> +         const unsigned src_byte_stride = inst->src[i].stride *
> +            type_sz(inst->src[i].type);
> +         const unsigned dst_byte_offset = reg_offset(inst->dst) %
> REG_SIZE;
> +         const unsigned src_byte_offset = reg_offset(inst->src[i]) %
> REG_SIZE;
> +
> +         return has_dst_aligned_region_restriction(devinfo, inst) &&
> +                !is_uniform(inst->src[i]) &&
> +                (src_byte_stride != dst_byte_stride ||
> +                 src_byte_offset != dst_byte_offset);
> +      }
> +   }
> +
> +   /*
> +    * Return whether the instruction has an unsupported channel bit
> layout
> +    * specified for the destination region.
> +    */
> +   bool
> +   has_invalid_dst_region(const gen_device_info *devinfo,
> +                          const fs_inst *inst)
> +   {
> +      if (is_unordered(inst)) {
> +         return false;
> +      } else {
> +         const brw_reg_type exec_type = get_exec_type(inst);
> +         const unsigned dst_byte_offset = reg_offset(inst->dst) %
> REG_SIZE;
> +         const unsigned dst_byte_stride = inst->dst.stride *
> type_sz(inst->dst.type);
> +         const bool is_narrowing_conversion = !is_byte_raw_mov(inst)
> &&
> +            type_sz(inst->dst.type) < type_sz(exec_type);
> +
> +         return (has_dst_aligned_region_restriction(devinfo, inst)
> &&
> +                 (required_dst_byte_stride(inst) != dst_byte_stride
> ||
> +                  required_dst_byte_offset(inst) !=
> dst_byte_offset)) ||
> +                (is_narrowing_conversion &&
> +                 required_dst_byte_stride(inst) != dst_byte_stride);
> +      }
> +   }
> +
> +   /*
> +    * Return whether the instruction has unsupported source
> modifiers
> +    * specified for the i-th source region.
> +    */
> +   bool
> +   has_invalid_src_modifiers(const gen_device_info *devinfo, const
> fs_inst *inst,
> +                             unsigned i)
> +   {
> +      return !inst->can_do_source_mods(devinfo) &&
> +             (inst->src[i].negate || inst->src[i].abs);
> +   }
> +
> +   /*
> +    * Return whether the instruction has an unsupported type
> conversion
> +    * specified for the destination.
> +    */
> +   bool
> +   has_invalid_conversion(const gen_device_info *devinfo, const
> fs_inst *inst)
> +   {
> +      switch (inst->opcode) {
> +      case BRW_OPCODE_MOV:
> +         return false;
> +      case BRW_OPCODE_SEL:
> +         return inst->dst.type != get_exec_type(inst);
> +      case SHADER_OPCODE_BROADCAST:
> +      case SHADER_OPCODE_MOV_INDIRECT:
> +         /* The source and destination types of these may are hard-
> coded to
> +          * integer at codegen time due to hardware limitations of
> 64-bit
> +          * types.
> +          */
> +         return ((devinfo->gen == 7 && !devinfo->is_haswell) ||
> +                 devinfo->is_cherryview ||
> gen_device_info_is_9lp(devinfo)) &&
> +                type_sz(inst->src[0].type) > 4 &&
> +                inst->dst.type != inst->src[0].type;
> +      default:
> +         /* FIXME: We assume the opcodes don't explicitly mentioned
> before
> +          * just work fine with arbitrary conversions.
> +          */
> +         return false;
> +      }
> +   }
> +
> +   bool
> +   lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst);
> +}
> +
> +namespace brw {
> +   /**
> +    * Remove any modifiers from the \p i-th source region of the
> instruction,
> +    * including negate, abs and any implicit type conversion to the
> execution
> +    * type.  Instead any source modifiers will be implemented as a
> separate
> +    * MOV instruction prior to the original instruction.
> +    */
> +   bool
> +   lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst
> *inst, unsigned i)
> +   {
> +      assert(inst->components_read(i) == 1);
> +      const fs_builder ibld(v, block, inst);
> +      const fs_reg tmp = ibld.vgrf(get_exec_type(inst));
> +
> +      lower_instruction(v, block, ibld.MOV(tmp, inst->src[i]));
> +      inst->src[i] = tmp;
> +
> +      return true;
> +   }
> +}
> +
> +namespace {
> +   /**
> +    * Remove any modifiers from the destination region of the
> instruction,
> +    * including saturate, conditional mod and any implicit type
> conversion
> +    * from the execution type.  Instead any destination modifiers
> will be
> +    * implemented as a separate MOV instruction after the original
> +    * instruction.
> +    */
> +   bool
> +   lower_dst_modifiers(fs_visitor *v, bblock_t *block, fs_inst
> *inst)
> +   {
> +      const fs_builder ibld(v, block, inst);
> +      const brw_reg_type type = get_exec_type(inst);
> +      /* Not strictly necessary, but if possible use a temporary
> with the same
> +       * channel alignment as the current destination in order to
> avoid
> +       * violating the restrictions enforced later on by
> lower_src_region()
> +       * and lower_dst_region(), which would introduce additional
> copy
> +       * instructions into the program unnecessarily.
> +       */
> +      const unsigned stride =
> +         type_sz(inst->dst.type) * inst->dst.stride <= type_sz(type)
> ? 1 :
> +         type_sz(inst->dst.type) * inst->dst.stride / type_sz(type);
> +      const fs_reg tmp = horiz_stride(ibld.vgrf(type, stride),
> stride);
> +
> +      /* Emit a MOV taking care of all the destination modifiers. */
> +      fs_inst *mov = ibld.at(block, inst->next).MOV(inst->dst, tmp);
> +      mov->saturate = inst->saturate;
> +      mov->conditional_mod = inst->conditional_mod;
> +      if (inst->opcode != BRW_OPCODE_SEL) {
> +         mov->predicate = inst->predicate;
> +         mov->predicate_inverse = inst->predicate_inverse;
> +      }
> +      mov->flag_subreg = inst->flag_subreg;
> +      lower_instruction(v, block, mov);
> +
> +      /* Point the original instruction at the temporary, and clean
> up any
> +       * destination modifiers.
> +       */
> +      assert(inst->size_written == inst->dst.component_size(inst-
> >exec_size));
> +      inst->dst = tmp;
> +      inst->size_written = inst->dst.component_size(inst-
> >exec_size);
> +      inst->saturate = false;
> +      inst->conditional_mod = BRW_CONDITIONAL_NONE;

I think this is not correct if if inst is a SEL instruction. Here is an
example that is causing a regresion for me:

With the original lower_conversions pass I have:

sel.l(8) vgrf15+0.0:W, vgrf3+0.0:B, vgrf5+0.0:B 
mov(8) vgrf16+0.0<2>:B, vgrf15+0.0:W 

Now, with this pass this is turned into:

sel(8) vgrf15+0.0:W, vgrf3+0.0:B, vgrf5+0.0:B 
mov.l.f0.0(8) vgrf16+0.0<2>:B, vgrf15+0.0:W 

So I guess the SEL instruction no longer works since it is not
predicated and we have removed the conditional modifier as well.

Maybe only move the conditional modifier to the MOV when the
instruction is not a SEL like we do for the predicate?

> +      return true;
> +   }
> +
> +   /**
> +    * Remove any non-trivial shuffling of data from the \p i-th
> source region
> +    * of the instruction.  Instead implement the region as a series
> of integer
> +    * copies into a temporary with the same channel layout as the
> destination.
> +    */
> +   bool
> +   lower_src_region(fs_visitor *v, bblock_t *block, fs_inst *inst,
> unsigned i)
> +   {
> +      assert(inst->components_read(i) == 1);
> +      const fs_builder ibld(v, block, inst);
> +      const unsigned stride = type_sz(inst->dst.type) * inst-
> >dst.stride /
> +                              type_sz(inst->src[i].type);
> +      assert(stride > 0);
> +      const fs_reg tmp = horiz_stride(ibld.vgrf(inst->src[i].type,
> stride),
> +                                      stride);
> +
> +      /* Emit a series of 32-bit integer copies with any source
> modifiers
> +       * cleaned up (because their semantics are dependent on the
> type).
> +       */
> +      const brw_reg_type raw_type =
> brw_int_type(MIN2(type_sz(tmp.type), 4),
> +                                                 false);
> +      const unsigned n = type_sz(tmp.type) / type_sz(raw_type);
> +      fs_reg raw_src = inst->src[i];
> +      raw_src.negate = false;
> +      raw_src.abs = false;
> +
> +      for (unsigned j = 0; j < n; j++)
> +         ibld.MOV(subscript(tmp, raw_type, j), subscript(raw_src,
> raw_type, j));
> +
> +      /* Point the original instruction at the temporary, making
> sure to keep
> +       * any source modifiers in the instruction.
> +       */
> +      fs_reg lower_src = tmp;
> +      lower_src.negate = inst->src[i].negate;
> +      lower_src.abs = inst->src[i].abs;
> +      inst->src[i] = lower_src;
> +
> +      return true;
> +   }
> +
> +   /**
> +    * Remove any non-trivial shuffling of data from the destination
> region of
> +    * the instruction.  Instead implement the region as a series of
> integer
> +    * copies from a temporary with a channel layout compatible with
> the
> +    * sources.
> +    */
> +   bool
> +   lower_dst_region(fs_visitor *v, bblock_t *block, fs_inst *inst)
> +   {
> +      const fs_builder ibld(v, block, inst);
> +      const unsigned stride = required_dst_byte_stride(inst) /
> +                              type_sz(inst->dst.type);
> +      assert(stride > 0);
> +      const fs_reg tmp = horiz_stride(ibld.vgrf(inst->dst.type,
> stride),
> +                                      stride);
> +
> +      /* Emit a series of 32-bit integer copies from the temporary
> into the
> +       * original destination.
> +       */
> +      const brw_reg_type raw_type =
> brw_int_type(MIN2(type_sz(tmp.type), 4),
> +                                                 false);
> +      const unsigned n = type_sz(tmp.type) / type_sz(raw_type);
> +
> +      if (inst->predicate && inst->opcode != BRW_OPCODE_SEL) {
> +         /* Note that in general we cannot simply predicate the
> copies on the
> +          * same flag register as the original instruction, since it
> may have
> +          * been overwritten by the instruction itself.  Instead
> initialize
> +          * the temporary with the previous contents of the
> destination
> +          * register.
> +          */
> +         for (unsigned j = 0; j < n; j++)
> +            ibld.MOV(subscript(tmp, raw_type, j),
> +                     subscript(inst->dst, raw_type, j));
> +      }
> +
> +      for (unsigned j = 0; j < n; j++)
> +         ibld.at(block, inst->next).MOV(subscript(inst->dst,
> raw_type, j),
> +                                        subscript(tmp, raw_type,
> j));
> +
> +      /* Point the original instruction at the temporary, making
> sure to keep
> +       * any destination modifiers in the instruction.
> +       */
> +      assert(inst->size_written == inst->dst.component_size(inst-
> >exec_size));
> +      inst->dst = tmp;
> +      inst->size_written = inst->dst.component_size(inst-
> >exec_size);
> +
> +      return true;
> +   }
> +
> +   /**
> +    * Legalize the source and destination regioning controls of the
> specified
> +    * instruction.
> +    */
> +   bool
> +   lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst)
> +   {
> +      const gen_device_info *devinfo = v->devinfo;
> +      bool progress = false;
> +
> +      if (has_invalid_conversion(devinfo, inst))
> +         progress |= lower_dst_modifiers(v, block, inst);
> +
> +      if (has_invalid_dst_region(devinfo, inst))
> +         progress |= lower_dst_region(v, block, inst);
> +
> +      for (unsigned i = 0; i < inst->sources; i++) {
> +         if (has_invalid_src_modifiers(devinfo, inst, i))
> +            progress |= lower_src_modifiers(v, block, inst, i);
> +
> +         if (has_invalid_src_region(devinfo, inst, i))
> +            progress |= lower_src_region(v, block, inst, i);
> +      }
> +
> +      return progress;
> +   }
> +}
> +
> +bool
> +fs_visitor::lower_regioning()
> +{
> +   bool progress = false;
> +
> +   foreach_block_and_inst(block, fs_inst, inst, cfg)
> +      progress |= lower_instruction(this, block, inst);
> +
> +   if (progress)
> +      invalidate_live_intervals();
> +
> +   return progress;
> +}
> diff --git a/src/intel/compiler/brw_ir_fs.h
> b/src/intel/compiler/brw_ir_fs.h
> index 5bb92e4cc86..3c23fb375e4 100644
> --- a/src/intel/compiler/brw_ir_fs.h
> +++ b/src/intel/compiler/brw_ir_fs.h
> @@ -486,6 +486,16 @@ get_exec_type_size(const fs_inst *inst)
>     return type_sz(get_exec_type(inst));
>  }
>  
> +/**
> + * Return whether the instruction isn't an ALU instruction and
> cannot be
> + * assumed to complete in-order.
> + */
> +static inline bool
> +is_unordered(const fs_inst *inst)
> +{
> +   return inst->mlen || inst->is_send_from_grf() || inst->is_math();
> +}
> +
>  /**
>   * Return whether the following regioning restriction applies to the
> specified
>   * instruction.  From the Cherryview PRM Vol 7. "Register Region
> diff --git a/src/intel/compiler/meson.build
> b/src/intel/compiler/meson.build
> index 69ce2eab4cf..4af134b418e 100644
> --- a/src/intel/compiler/meson.build
> +++ b/src/intel/compiler/meson.build
> @@ -57,6 +57,7 @@ libintel_compiler_files = files(
>    'brw_fs_live_variables.h',
>    'brw_fs_lower_conversions.cpp',
>    'brw_fs_lower_pack.cpp',
> +  'brw_fs_lower_regioning.cpp',
>    'brw_fs_nir.cpp',
>    'brw_fs_reg_allocate.cpp',
>    'brw_fs_register_coalesce.cpp',
Iago Toral <itoral@igalia.com> writes:

> On Sat, 2018-12-29 at 12:39 -0800, Francisco Jerez wrote:
>> This legalization pass is meant to handle situations where the source
>> or destination regioning controls of an instruction are unsupported
>> by
>> the hardware and need to be lowered away into separate instructions.
>> This should be more reliable and future-proof than the current
>> approach of handling CHV/BXT restrictions manually all over the
>> visitor.  The same mechanism is leveraged to lower unsupported type
>> conversions easily, which obsoletes the lower_conversions pass.
>> ---
>>  src/intel/Makefile.sources                    |   1 +
>>  src/intel/compiler/brw_fs.cpp                 |   5 +-
>>  src/intel/compiler/brw_fs.h                   |  21 +-
>>  src/intel/compiler/brw_fs_lower_regioning.cpp | 382
>> ++++++++++++++++++
>>  src/intel/compiler/brw_ir_fs.h                |  10 +
>>  src/intel/compiler/meson.build                |   1 +
>>  6 files changed, 401 insertions(+), 19 deletions(-)
>>  create mode 100644 src/intel/compiler/brw_fs_lower_regioning.cpp
>> 
>> diff --git a/src/intel/Makefile.sources b/src/intel/Makefile.sources
>> index 5e7d32293b7..6b9874d2b80 100644
>> --- a/src/intel/Makefile.sources
>> +++ b/src/intel/Makefile.sources
>> @@ -64,6 +64,7 @@ COMPILER_FILES = \
>>  	compiler/brw_fs_live_variables.h \
>>  	compiler/brw_fs_lower_conversions.cpp \
>>  	compiler/brw_fs_lower_pack.cpp \
>> +	compiler/brw_fs_lower_regioning.cpp \
>>  	compiler/brw_fs_nir.cpp \
>>  	compiler/brw_fs_reg_allocate.cpp \
>>  	compiler/brw_fs_register_coalesce.cpp \
>> diff --git a/src/intel/compiler/brw_fs.cpp
>> b/src/intel/compiler/brw_fs.cpp
>> index 889509badab..caa7a798332 100644
>> --- a/src/intel/compiler/brw_fs.cpp
>> +++ b/src/intel/compiler/brw_fs.cpp
>> @@ -6471,7 +6471,10 @@ fs_visitor::optimize()
>>        OPT(dead_code_eliminate);
>>     }
>>  
>> -   if (OPT(lower_conversions)) {
>> +   progress = false;
>> +   OPT(lower_conversions);
>> +   OPT(lower_regioning);
>> +   if (progress) {
>>        OPT(opt_copy_propagation);
>>        OPT(dead_code_eliminate);
>>        OPT(lower_simd_width);
>> diff --git a/src/intel/compiler/brw_fs.h
>> b/src/intel/compiler/brw_fs.h
>> index dc36ecc21ac..36825754931 100644
>> --- a/src/intel/compiler/brw_fs.h
>> +++ b/src/intel/compiler/brw_fs.h
>> @@ -164,6 +164,7 @@ public:
>>     void lower_uniform_pull_constant_loads();
>>     bool lower_load_payload();
>>     bool lower_pack();
>> +   bool lower_regioning();
>>     bool lower_conversions();
>>     bool lower_logical_sends();
>>     bool lower_integer_multiplication();
>> @@ -536,24 +537,8 @@ namespace brw {
>>        }
>>     }
>>  
>> -   /**
>> -    * Remove any modifiers from the \p i-th source region of the
>> instruction,
>> -    * including negate, abs and any implicit type conversion to the
>> execution
>> -    * type.  Instead any source modifiers will be implemented as a
>> separate
>> -    * MOV instruction prior to the original instruction.
>> -    */
>> -   inline bool
>> -   lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst
>> *inst, unsigned i)
>> -   {
>> -      assert(inst->components_read(i) == 1);
>> -      const fs_builder ibld(v, block, inst);
>> -      const fs_reg tmp = ibld.vgrf(get_exec_type(inst));
>> -
>> -      ibld.MOV(tmp, inst->src[i]);
>> -      inst->src[i] = tmp;
>> -
>> -      return true;
>> -   }
>> +   bool
>> +   lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst
>> *inst, unsigned i);
>>  }
>>  
>>  void shuffle_from_32bit_read(const brw::fs_builder &bld,
>> diff --git a/src/intel/compiler/brw_fs_lower_regioning.cpp
>> b/src/intel/compiler/brw_fs_lower_regioning.cpp
>> new file mode 100644
>> index 00000000000..9578622401d
>> --- /dev/null
>> +++ b/src/intel/compiler/brw_fs_lower_regioning.cpp
>> @@ -0,0 +1,382 @@
>> +/*
>> + * Copyright © 2018 Intel Corporation
>> + *
>> + * Permission is hereby granted, free of charge, to any person
>> obtaining a
>> + * copy of this software and associated documentation files (the
>> "Software"),
>> + * to deal in the Software without restriction, including without
>> limitation
>> + * the rights to use, copy, modify, merge, publish, distribute,
>> sublicense,
>> + * and/or sell copies of the Software, and to permit persons to whom
>> the
>> + * Software is furnished to do so, subject to the following
>> conditions:
>> + *
>> + * The above copyright notice and this permission notice (including
>> the next
>> + * paragraph) shall be included in all copies or substantial
>> portions of the
>> + * Software.
>> + *
>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
>> EXPRESS OR
>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
>> MERCHANTABILITY,
>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO
>> EVENT SHALL
>> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
>> OR OTHER
>> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
>> ARISING
>> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
>> OTHER DEALINGS
>> + * IN THE SOFTWARE.
>> + */
>> +
>> +#include "brw_fs.h"
>> +#include "brw_cfg.h"
>> +#include "brw_fs_builder.h"
>> +
>> +using namespace brw;
>> +
>> +namespace {
>> +   /* From the SKL PRM Vol 2a, "Move":
>> +    *
>> +    * "A mov with the same source and destination type, no source
>> modifier,
>> +    *  and no saturation is a raw move. A packed byte destination
>> region (B
>> +    *  or UB type with HorzStride == 1 and ExecSize > 1) can only be
>> written
>> +    *  using raw move."
>> +    */
>> +   bool
>> +   is_byte_raw_mov(const fs_inst *inst)
>> +   {
>> +      return type_sz(inst->dst.type) == 1 &&
>> +             inst->opcode == BRW_OPCODE_MOV &&
>> +             inst->src[0].type == inst->dst.type &&
>> +             !inst->saturate &&
>> +             !inst->src[0].negate &&
>> +             !inst->src[0].abs;
>> +   }
>> +
>> +   /*
>> +    * Return an acceptable byte stride for the destination of an
>> instruction
>> +    * that requires it to have some particular alignment.
>> +    */
>> +   unsigned
>> +   required_dst_byte_stride(const fs_inst *inst)
>> +   {
>> +      if (type_sz(inst->dst.type) < get_exec_type_size(inst) &&
>> +          !is_byte_raw_mov(inst)) {
>> +         return get_exec_type_size(inst);
>> +      } else {
>> +         unsigned stride = inst->dst.stride * type_sz(inst-
>> >dst.type);
>> +
>> +         for (unsigned i = 0; i < inst->sources; i++) {
>> +            if (!is_uniform(inst->src[i]))
>> +               stride = MAX2(stride, inst->src[i].stride *
>> +                             type_sz(inst->src[i].type));
>> +         }
>> +
>> +         return stride;
>> +      }
>> +   }
>> +
>> +   /*
>> +    * Return an acceptable byte sub-register offset for the
>> destination of an
>> +    * instruction that requires it to be aligned to the sub-register 
>> offset of
>> +    * the sources.
>> +    */
>> +   unsigned
>> +   required_dst_byte_offset(const fs_inst *inst)
>> +   {
>> +      for (unsigned i = 0; i < inst->sources; i++) {
>> +         if (!is_uniform(inst->src[i]))
>> +            if (reg_offset(inst->src[i]) % REG_SIZE !=
>> +                reg_offset(inst->dst) % REG_SIZE)
>> +               return 0;
>> +      }
>> +
>> +      return reg_offset(inst->dst) % REG_SIZE;
>> +   }
>> +
>> +   /*
>> +    * Return whether the instruction has an unsupported channel bit
>> layout
>> +    * specified for the i-th source region.
>> +    */
>> +   bool
>> +   has_invalid_src_region(const gen_device_info *devinfo, const
>> fs_inst *inst,
>> +                          unsigned i)
>> +   {
>> +      if (is_unordered(inst)) {
>> +         return false;
>> +      } else {
>> +         const unsigned dst_byte_stride = inst->dst.stride *
>> type_sz(inst->dst.type);
>> +         const unsigned src_byte_stride = inst->src[i].stride *
>> +            type_sz(inst->src[i].type);
>> +         const unsigned dst_byte_offset = reg_offset(inst->dst) %
>> REG_SIZE;
>> +         const unsigned src_byte_offset = reg_offset(inst->src[i]) %
>> REG_SIZE;
>> +
>> +         return has_dst_aligned_region_restriction(devinfo, inst) &&
>> +                !is_uniform(inst->src[i]) &&
>> +                (src_byte_stride != dst_byte_stride ||
>> +                 src_byte_offset != dst_byte_offset);
>> +      }
>> +   }
>> +
>> +   /*
>> +    * Return whether the instruction has an unsupported channel bit
>> layout
>> +    * specified for the destination region.
>> +    */
>> +   bool
>> +   has_invalid_dst_region(const gen_device_info *devinfo,
>> +                          const fs_inst *inst)
>> +   {
>> +      if (is_unordered(inst)) {
>> +         return false;
>> +      } else {
>> +         const brw_reg_type exec_type = get_exec_type(inst);
>> +         const unsigned dst_byte_offset = reg_offset(inst->dst) %
>> REG_SIZE;
>> +         const unsigned dst_byte_stride = inst->dst.stride *
>> type_sz(inst->dst.type);
>> +         const bool is_narrowing_conversion = !is_byte_raw_mov(inst)
>> &&
>> +            type_sz(inst->dst.type) < type_sz(exec_type);
>> +
>> +         return (has_dst_aligned_region_restriction(devinfo, inst)
>> &&
>> +                 (required_dst_byte_stride(inst) != dst_byte_stride
>> ||
>> +                  required_dst_byte_offset(inst) !=
>> dst_byte_offset)) ||
>> +                (is_narrowing_conversion &&
>> +                 required_dst_byte_stride(inst) != dst_byte_stride);
>> +      }
>> +   }
>> +
>> +   /*
>> +    * Return whether the instruction has unsupported source
>> modifiers
>> +    * specified for the i-th source region.
>> +    */
>> +   bool
>> +   has_invalid_src_modifiers(const gen_device_info *devinfo, const
>> fs_inst *inst,
>> +                             unsigned i)
>> +   {
>> +      return !inst->can_do_source_mods(devinfo) &&
>> +             (inst->src[i].negate || inst->src[i].abs);
>> +   }
>> +
>> +   /*
>> +    * Return whether the instruction has an unsupported type
>> conversion
>> +    * specified for the destination.
>> +    */
>> +   bool
>> +   has_invalid_conversion(const gen_device_info *devinfo, const
>> fs_inst *inst)
>> +   {
>> +      switch (inst->opcode) {
>> +      case BRW_OPCODE_MOV:
>> +         return false;
>> +      case BRW_OPCODE_SEL:
>> +         return inst->dst.type != get_exec_type(inst);
>> +      case SHADER_OPCODE_BROADCAST:
>> +      case SHADER_OPCODE_MOV_INDIRECT:
>> +         /* The source and destination types of these may are hard-
>> coded to
>> +          * integer at codegen time due to hardware limitations of
>> 64-bit
>> +          * types.
>> +          */
>> +         return ((devinfo->gen == 7 && !devinfo->is_haswell) ||
>> +                 devinfo->is_cherryview ||
>> gen_device_info_is_9lp(devinfo)) &&
>> +                type_sz(inst->src[0].type) > 4 &&
>> +                inst->dst.type != inst->src[0].type;
>> +      default:
>> +         /* FIXME: We assume the opcodes don't explicitly mentioned
>> before
>> +          * just work fine with arbitrary conversions.
>> +          */
>> +         return false;
>> +      }
>> +   }
>> +
>> +   bool
>> +   lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst);
>> +}
>> +
>> +namespace brw {
>> +   /**
>> +    * Remove any modifiers from the \p i-th source region of the
>> instruction,
>> +    * including negate, abs and any implicit type conversion to the
>> execution
>> +    * type.  Instead any source modifiers will be implemented as a
>> separate
>> +    * MOV instruction prior to the original instruction.
>> +    */
>> +   bool
>> +   lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst
>> *inst, unsigned i)
>> +   {
>> +      assert(inst->components_read(i) == 1);
>> +      const fs_builder ibld(v, block, inst);
>> +      const fs_reg tmp = ibld.vgrf(get_exec_type(inst));
>> +
>> +      lower_instruction(v, block, ibld.MOV(tmp, inst->src[i]));
>> +      inst->src[i] = tmp;
>> +
>> +      return true;
>> +   }
>> +}
>> +
>> +namespace {
>> +   /**
>> +    * Remove any modifiers from the destination region of the
>> instruction,
>> +    * including saturate, conditional mod and any implicit type
>> conversion
>> +    * from the execution type.  Instead any destination modifiers
>> will be
>> +    * implemented as a separate MOV instruction after the original
>> +    * instruction.
>> +    */
>> +   bool
>> +   lower_dst_modifiers(fs_visitor *v, bblock_t *block, fs_inst
>> *inst)
>> +   {
>> +      const fs_builder ibld(v, block, inst);
>> +      const brw_reg_type type = get_exec_type(inst);
>> +      /* Not strictly necessary, but if possible use a temporary
>> with the same
>> +       * channel alignment as the current destination in order to
>> avoid
>> +       * violating the restrictions enforced later on by
>> lower_src_region()
>> +       * and lower_dst_region(), which would introduce additional
>> copy
>> +       * instructions into the program unnecessarily.
>> +       */
>> +      const unsigned stride =
>> +         type_sz(inst->dst.type) * inst->dst.stride <= type_sz(type)
>> ? 1 :
>> +         type_sz(inst->dst.type) * inst->dst.stride / type_sz(type);
>> +      const fs_reg tmp = horiz_stride(ibld.vgrf(type, stride),
>> stride);
>> +
>> +      /* Emit a MOV taking care of all the destination modifiers. */
>> +      fs_inst *mov = ibld.at(block, inst->next).MOV(inst->dst, tmp);
>> +      mov->saturate = inst->saturate;
>> +      mov->conditional_mod = inst->conditional_mod;
>> +      if (inst->opcode != BRW_OPCODE_SEL) {
>> +         mov->predicate = inst->predicate;
>> +         mov->predicate_inverse = inst->predicate_inverse;
>> +      }
>> +      mov->flag_subreg = inst->flag_subreg;
>> +      lower_instruction(v, block, mov);
>> +
>> +      /* Point the original instruction at the temporary, and clean
>> up any
>> +       * destination modifiers.
>> +       */
>> +      assert(inst->size_written == inst->dst.component_size(inst-
>> >exec_size));
>> +      inst->dst = tmp;
>> +      inst->size_written = inst->dst.component_size(inst-
>> >exec_size);
>> +      inst->saturate = false;
>> +      inst->conditional_mod = BRW_CONDITIONAL_NONE;
>
> I think this is not correct if if inst is a SEL instruction. Here is an
> example that is causing a regresion for me:
>
> With the original lower_conversions pass I have:
>
> sel.l(8) vgrf15+0.0:W, vgrf3+0.0:B, vgrf5+0.0:B 
> mov(8) vgrf16+0.0<2>:B, vgrf15+0.0:W 
>
> Now, with this pass this is turned into:
>
> sel(8) vgrf15+0.0:W, vgrf3+0.0:B, vgrf5+0.0:B 
> mov.l.f0.0(8) vgrf16+0.0<2>:B, vgrf15+0.0:W 
>
> So I guess the SEL instruction no longer works since it is not
> predicated and we have removed the conditional modifier as well.
>
> Maybe only move the conditional modifier to the MOV when the
> instruction is not a SEL like we do for the predicate?
>

Yeah good point, the semantics of conditional mods are inconsistent for
the SEL instruction -- Fixed up locally in the same way predicates are
handled for SEL in the same function.

>> +      return true;
>> +   }
>> +
>> +   /**
>> +    * Remove any non-trivial shuffling of data from the \p i-th
>> source region
>> +    * of the instruction.  Instead implement the region as a series
>> of integer
>> +    * copies into a temporary with the same channel layout as the
>> destination.
>> +    */
>> +   bool
>> +   lower_src_region(fs_visitor *v, bblock_t *block, fs_inst *inst,
>> unsigned i)
>> +   {
>> +      assert(inst->components_read(i) == 1);
>> +      const fs_builder ibld(v, block, inst);
>> +      const unsigned stride = type_sz(inst->dst.type) * inst-
>> >dst.stride /
>> +                              type_sz(inst->src[i].type);
>> +      assert(stride > 0);
>> +      const fs_reg tmp = horiz_stride(ibld.vgrf(inst->src[i].type,
>> stride),
>> +                                      stride);
>> +
>> +      /* Emit a series of 32-bit integer copies with any source
>> modifiers
>> +       * cleaned up (because their semantics are dependent on the
>> type).
>> +       */
>> +      const brw_reg_type raw_type =
>> brw_int_type(MIN2(type_sz(tmp.type), 4),
>> +                                                 false);
>> +      const unsigned n = type_sz(tmp.type) / type_sz(raw_type);
>> +      fs_reg raw_src = inst->src[i];
>> +      raw_src.negate = false;
>> +      raw_src.abs = false;
>> +
>> +      for (unsigned j = 0; j < n; j++)
>> +         ibld.MOV(subscript(tmp, raw_type, j), subscript(raw_src,
>> raw_type, j));
>> +
>> +      /* Point the original instruction at the temporary, making
>> sure to keep
>> +       * any source modifiers in the instruction.
>> +       */
>> +      fs_reg lower_src = tmp;
>> +      lower_src.negate = inst->src[i].negate;
>> +      lower_src.abs = inst->src[i].abs;
>> +      inst->src[i] = lower_src;
>> +
>> +      return true;
>> +   }
>> +
>> +   /**
>> +    * Remove any non-trivial shuffling of data from the destination
>> region of
>> +    * the instruction.  Instead implement the region as a series of
>> integer
>> +    * copies from a temporary with a channel layout compatible with
>> the
>> +    * sources.
>> +    */
>> +   bool
>> +   lower_dst_region(fs_visitor *v, bblock_t *block, fs_inst *inst)
>> +   {
>> +      const fs_builder ibld(v, block, inst);
>> +      const unsigned stride = required_dst_byte_stride(inst) /
>> +                              type_sz(inst->dst.type);
>> +      assert(stride > 0);
>> +      const fs_reg tmp = horiz_stride(ibld.vgrf(inst->dst.type,
>> stride),
>> +                                      stride);
>> +
>> +      /* Emit a series of 32-bit integer copies from the temporary
>> into the
>> +       * original destination.
>> +       */
>> +      const brw_reg_type raw_type =
>> brw_int_type(MIN2(type_sz(tmp.type), 4),
>> +                                                 false);
>> +      const unsigned n = type_sz(tmp.type) / type_sz(raw_type);
>> +
>> +      if (inst->predicate && inst->opcode != BRW_OPCODE_SEL) {
>> +         /* Note that in general we cannot simply predicate the
>> copies on the
>> +          * same flag register as the original instruction, since it
>> may have
>> +          * been overwritten by the instruction itself.  Instead
>> initialize
>> +          * the temporary with the previous contents of the
>> destination
>> +          * register.
>> +          */
>> +         for (unsigned j = 0; j < n; j++)
>> +            ibld.MOV(subscript(tmp, raw_type, j),
>> +                     subscript(inst->dst, raw_type, j));
>> +      }
>> +
>> +      for (unsigned j = 0; j < n; j++)
>> +         ibld.at(block, inst->next).MOV(subscript(inst->dst,
>> raw_type, j),
>> +                                        subscript(tmp, raw_type,
>> j));
>> +
>> +      /* Point the original instruction at the temporary, making
>> sure to keep
>> +       * any destination modifiers in the instruction.
>> +       */
>> +      assert(inst->size_written == inst->dst.component_size(inst-
>> >exec_size));
>> +      inst->dst = tmp;
>> +      inst->size_written = inst->dst.component_size(inst-
>> >exec_size);
>> +
>> +      return true;
>> +   }
>> +
>> +   /**
>> +    * Legalize the source and destination regioning controls of the
>> specified
>> +    * instruction.
>> +    */
>> +   bool
>> +   lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst)
>> +   {
>> +      const gen_device_info *devinfo = v->devinfo;
>> +      bool progress = false;
>> +
>> +      if (has_invalid_conversion(devinfo, inst))
>> +         progress |= lower_dst_modifiers(v, block, inst);
>> +
>> +      if (has_invalid_dst_region(devinfo, inst))
>> +         progress |= lower_dst_region(v, block, inst);
>> +
>> +      for (unsigned i = 0; i < inst->sources; i++) {
>> +         if (has_invalid_src_modifiers(devinfo, inst, i))
>> +            progress |= lower_src_modifiers(v, block, inst, i);
>> +
>> +         if (has_invalid_src_region(devinfo, inst, i))
>> +            progress |= lower_src_region(v, block, inst, i);
>> +      }
>> +
>> +      return progress;
>> +   }
>> +}
>> +
>> +bool
>> +fs_visitor::lower_regioning()
>> +{
>> +   bool progress = false;
>> +
>> +   foreach_block_and_inst(block, fs_inst, inst, cfg)
>> +      progress |= lower_instruction(this, block, inst);
>> +
>> +   if (progress)
>> +      invalidate_live_intervals();
>> +
>> +   return progress;
>> +}
>> diff --git a/src/intel/compiler/brw_ir_fs.h
>> b/src/intel/compiler/brw_ir_fs.h
>> index 5bb92e4cc86..3c23fb375e4 100644
>> --- a/src/intel/compiler/brw_ir_fs.h
>> +++ b/src/intel/compiler/brw_ir_fs.h
>> @@ -486,6 +486,16 @@ get_exec_type_size(const fs_inst *inst)
>>     return type_sz(get_exec_type(inst));
>>  }
>>  
>> +/**
>> + * Return whether the instruction isn't an ALU instruction and
>> cannot be
>> + * assumed to complete in-order.
>> + */
>> +static inline bool
>> +is_unordered(const fs_inst *inst)
>> +{
>> +   return inst->mlen || inst->is_send_from_grf() || inst->is_math();
>> +}
>> +
>>  /**
>>   * Return whether the following regioning restriction applies to the
>> specified
>>   * instruction.  From the Cherryview PRM Vol 7. "Register Region
>> diff --git a/src/intel/compiler/meson.build
>> b/src/intel/compiler/meson.build
>> index 69ce2eab4cf..4af134b418e 100644
>> --- a/src/intel/compiler/meson.build
>> +++ b/src/intel/compiler/meson.build
>> @@ -57,6 +57,7 @@ libintel_compiler_files = files(
>>    'brw_fs_live_variables.h',
>>    'brw_fs_lower_conversions.cpp',
>>    'brw_fs_lower_pack.cpp',
>> +  'brw_fs_lower_regioning.cpp',
>>    'brw_fs_nir.cpp',
>>    'brw_fs_reg_allocate.cpp',
>>    'brw_fs_register_coalesce.cpp',
Francisco Jerez <currojerez@riseup.net> writes:

> Iago Toral <itoral@igalia.com> writes:
>
>> On Sat, 2018-12-29 at 12:39 -0800, Francisco Jerez wrote:
>>> This legalization pass is meant to handle situations where the source
>>> or destination regioning controls of an instruction are unsupported
>>> by
>>> the hardware and need to be lowered away into separate instructions.
>>> This should be more reliable and future-proof than the current
>>> approach of handling CHV/BXT restrictions manually all over the
>>> visitor.  The same mechanism is leveraged to lower unsupported type
>>> conversions easily, which obsoletes the lower_conversions pass.
>>> ---
>>>  src/intel/Makefile.sources                    |   1 +
>>>  src/intel/compiler/brw_fs.cpp                 |   5 +-
>>>  src/intel/compiler/brw_fs.h                   |  21 +-
>>>  src/intel/compiler/brw_fs_lower_regioning.cpp | 382
>>> ++++++++++++++++++
>>>  src/intel/compiler/brw_ir_fs.h                |  10 +
>>>  src/intel/compiler/meson.build                |   1 +
>>>  6 files changed, 401 insertions(+), 19 deletions(-)
>>>  create mode 100644 src/intel/compiler/brw_fs_lower_regioning.cpp
>>> 
>>> diff --git a/src/intel/Makefile.sources b/src/intel/Makefile.sources
>>> index 5e7d32293b7..6b9874d2b80 100644
>>> --- a/src/intel/Makefile.sources
>>> +++ b/src/intel/Makefile.sources
>>> @@ -64,6 +64,7 @@ COMPILER_FILES = \
>>>  	compiler/brw_fs_live_variables.h \
>>>  	compiler/brw_fs_lower_conversions.cpp \
>>>  	compiler/brw_fs_lower_pack.cpp \
>>> +	compiler/brw_fs_lower_regioning.cpp \
>>>  	compiler/brw_fs_nir.cpp \
>>>  	compiler/brw_fs_reg_allocate.cpp \
>>>  	compiler/brw_fs_register_coalesce.cpp \
>>> diff --git a/src/intel/compiler/brw_fs.cpp
>>> b/src/intel/compiler/brw_fs.cpp
>>> index 889509badab..caa7a798332 100644
>>> --- a/src/intel/compiler/brw_fs.cpp
>>> +++ b/src/intel/compiler/brw_fs.cpp
>>> @@ -6471,7 +6471,10 @@ fs_visitor::optimize()
>>>        OPT(dead_code_eliminate);
>>>     }
>>>  
>>> -   if (OPT(lower_conversions)) {
>>> +   progress = false;
>>> +   OPT(lower_conversions);
>>> +   OPT(lower_regioning);
>>> +   if (progress) {
>>>        OPT(opt_copy_propagation);
>>>        OPT(dead_code_eliminate);
>>>        OPT(lower_simd_width);
>>> diff --git a/src/intel/compiler/brw_fs.h
>>> b/src/intel/compiler/brw_fs.h
>>> index dc36ecc21ac..36825754931 100644
>>> --- a/src/intel/compiler/brw_fs.h
>>> +++ b/src/intel/compiler/brw_fs.h
>>> @@ -164,6 +164,7 @@ public:
>>>     void lower_uniform_pull_constant_loads();
>>>     bool lower_load_payload();
>>>     bool lower_pack();
>>> +   bool lower_regioning();
>>>     bool lower_conversions();
>>>     bool lower_logical_sends();
>>>     bool lower_integer_multiplication();
>>> @@ -536,24 +537,8 @@ namespace brw {
>>>        }
>>>     }
>>>  
>>> -   /**
>>> -    * Remove any modifiers from the \p i-th source region of the
>>> instruction,
>>> -    * including negate, abs and any implicit type conversion to the
>>> execution
>>> -    * type.  Instead any source modifiers will be implemented as a
>>> separate
>>> -    * MOV instruction prior to the original instruction.
>>> -    */
>>> -   inline bool
>>> -   lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst
>>> *inst, unsigned i)
>>> -   {
>>> -      assert(inst->components_read(i) == 1);
>>> -      const fs_builder ibld(v, block, inst);
>>> -      const fs_reg tmp = ibld.vgrf(get_exec_type(inst));
>>> -
>>> -      ibld.MOV(tmp, inst->src[i]);
>>> -      inst->src[i] = tmp;
>>> -
>>> -      return true;
>>> -   }
>>> +   bool
>>> +   lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst
>>> *inst, unsigned i);
>>>  }
>>>  
>>>  void shuffle_from_32bit_read(const brw::fs_builder &bld,
>>> diff --git a/src/intel/compiler/brw_fs_lower_regioning.cpp
>>> b/src/intel/compiler/brw_fs_lower_regioning.cpp
>>> new file mode 100644
>>> index 00000000000..9578622401d
>>> --- /dev/null
>>> +++ b/src/intel/compiler/brw_fs_lower_regioning.cpp
>>> @@ -0,0 +1,382 @@
>>> +/*
>>> + * Copyright © 2018 Intel Corporation
>>> + *
>>> + * Permission is hereby granted, free of charge, to any person
>>> obtaining a
>>> + * copy of this software and associated documentation files (the
>>> "Software"),
>>> + * to deal in the Software without restriction, including without
>>> limitation
>>> + * the rights to use, copy, modify, merge, publish, distribute,
>>> sublicense,
>>> + * and/or sell copies of the Software, and to permit persons to whom
>>> the
>>> + * Software is furnished to do so, subject to the following
>>> conditions:
>>> + *
>>> + * The above copyright notice and this permission notice (including
>>> the next
>>> + * paragraph) shall be included in all copies or substantial
>>> portions of the
>>> + * Software.
>>> + *
>>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
>>> EXPRESS OR
>>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
>>> MERCHANTABILITY,
>>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO
>>> EVENT SHALL
>>> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
>>> OR OTHER
>>> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
>>> ARISING
>>> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
>>> OTHER DEALINGS
>>> + * IN THE SOFTWARE.
>>> + */
>>> +
>>> +#include "brw_fs.h"
>>> +#include "brw_cfg.h"
>>> +#include "brw_fs_builder.h"
>>> +
>>> +using namespace brw;
>>> +
>>> +namespace {
>>> +   /* From the SKL PRM Vol 2a, "Move":
>>> +    *
>>> +    * "A mov with the same source and destination type, no source
>>> modifier,
>>> +    *  and no saturation is a raw move. A packed byte destination
>>> region (B
>>> +    *  or UB type with HorzStride == 1 and ExecSize > 1) can only be
>>> written
>>> +    *  using raw move."
>>> +    */
>>> +   bool
>>> +   is_byte_raw_mov(const fs_inst *inst)
>>> +   {
>>> +      return type_sz(inst->dst.type) == 1 &&
>>> +             inst->opcode == BRW_OPCODE_MOV &&
>>> +             inst->src[0].type == inst->dst.type &&
>>> +             !inst->saturate &&
>>> +             !inst->src[0].negate &&
>>> +             !inst->src[0].abs;
>>> +   }
>>> +
>>> +   /*
>>> +    * Return an acceptable byte stride for the destination of an
>>> instruction
>>> +    * that requires it to have some particular alignment.
>>> +    */
>>> +   unsigned
>>> +   required_dst_byte_stride(const fs_inst *inst)
>>> +   {
>>> +      if (type_sz(inst->dst.type) < get_exec_type_size(inst) &&
>>> +          !is_byte_raw_mov(inst)) {
>>> +         return get_exec_type_size(inst);
>>> +      } else {
>>> +         unsigned stride = inst->dst.stride * type_sz(inst-
>>> >dst.type);
>>> +
>>> +         for (unsigned i = 0; i < inst->sources; i++) {
>>> +            if (!is_uniform(inst->src[i]))
>>> +               stride = MAX2(stride, inst->src[i].stride *
>>> +                             type_sz(inst->src[i].type));
>>> +         }
>>> +
>>> +         return stride;
>>> +      }
>>> +   }
>>> +
>>> +   /*
>>> +    * Return an acceptable byte sub-register offset for the
>>> destination of an
>>> +    * instruction that requires it to be aligned to the sub-register 
>>> offset of
>>> +    * the sources.
>>> +    */
>>> +   unsigned
>>> +   required_dst_byte_offset(const fs_inst *inst)
>>> +   {
>>> +      for (unsigned i = 0; i < inst->sources; i++) {
>>> +         if (!is_uniform(inst->src[i]))
>>> +            if (reg_offset(inst->src[i]) % REG_SIZE !=
>>> +                reg_offset(inst->dst) % REG_SIZE)
>>> +               return 0;
>>> +      }
>>> +
>>> +      return reg_offset(inst->dst) % REG_SIZE;
>>> +   }
>>> +
>>> +   /*
>>> +    * Return whether the instruction has an unsupported channel bit
>>> layout
>>> +    * specified for the i-th source region.
>>> +    */
>>> +   bool
>>> +   has_invalid_src_region(const gen_device_info *devinfo, const
>>> fs_inst *inst,
>>> +                          unsigned i)
>>> +   {
>>> +      if (is_unordered(inst)) {
>>> +         return false;
>>> +      } else {
>>> +         const unsigned dst_byte_stride = inst->dst.stride *
>>> type_sz(inst->dst.type);
>>> +         const unsigned src_byte_stride = inst->src[i].stride *
>>> +            type_sz(inst->src[i].type);
>>> +         const unsigned dst_byte_offset = reg_offset(inst->dst) %
>>> REG_SIZE;
>>> +         const unsigned src_byte_offset = reg_offset(inst->src[i]) %
>>> REG_SIZE;
>>> +
>>> +         return has_dst_aligned_region_restriction(devinfo, inst) &&
>>> +                !is_uniform(inst->src[i]) &&
>>> +                (src_byte_stride != dst_byte_stride ||
>>> +                 src_byte_offset != dst_byte_offset);
>>> +      }
>>> +   }
>>> +
>>> +   /*
>>> +    * Return whether the instruction has an unsupported channel bit
>>> layout
>>> +    * specified for the destination region.
>>> +    */
>>> +   bool
>>> +   has_invalid_dst_region(const gen_device_info *devinfo,
>>> +                          const fs_inst *inst)
>>> +   {
>>> +      if (is_unordered(inst)) {
>>> +         return false;
>>> +      } else {
>>> +         const brw_reg_type exec_type = get_exec_type(inst);
>>> +         const unsigned dst_byte_offset = reg_offset(inst->dst) %
>>> REG_SIZE;
>>> +         const unsigned dst_byte_stride = inst->dst.stride *
>>> type_sz(inst->dst.type);
>>> +         const bool is_narrowing_conversion = !is_byte_raw_mov(inst)
>>> &&
>>> +            type_sz(inst->dst.type) < type_sz(exec_type);
>>> +
>>> +         return (has_dst_aligned_region_restriction(devinfo, inst)
>>> &&
>>> +                 (required_dst_byte_stride(inst) != dst_byte_stride
>>> ||
>>> +                  required_dst_byte_offset(inst) !=
>>> dst_byte_offset)) ||
>>> +                (is_narrowing_conversion &&
>>> +                 required_dst_byte_stride(inst) != dst_byte_stride);
>>> +      }
>>> +   }
>>> +
>>> +   /*
>>> +    * Return whether the instruction has unsupported source
>>> modifiers
>>> +    * specified for the i-th source region.
>>> +    */
>>> +   bool
>>> +   has_invalid_src_modifiers(const gen_device_info *devinfo, const
>>> fs_inst *inst,
>>> +                             unsigned i)
>>> +   {
>>> +      return !inst->can_do_source_mods(devinfo) &&
>>> +             (inst->src[i].negate || inst->src[i].abs);
>>> +   }
>>> +
>>> +   /*
>>> +    * Return whether the instruction has an unsupported type
>>> conversion
>>> +    * specified for the destination.
>>> +    */
>>> +   bool
>>> +   has_invalid_conversion(const gen_device_info *devinfo, const
>>> fs_inst *inst)
>>> +   {
>>> +      switch (inst->opcode) {
>>> +      case BRW_OPCODE_MOV:
>>> +         return false;
>>> +      case BRW_OPCODE_SEL:
>>> +         return inst->dst.type != get_exec_type(inst);
>>> +      case SHADER_OPCODE_BROADCAST:
>>> +      case SHADER_OPCODE_MOV_INDIRECT:
>>> +         /* The source and destination types of these may are hard-
>>> coded to
>>> +          * integer at codegen time due to hardware limitations of
>>> 64-bit
>>> +          * types.
>>> +          */
>>> +         return ((devinfo->gen == 7 && !devinfo->is_haswell) ||
>>> +                 devinfo->is_cherryview ||
>>> gen_device_info_is_9lp(devinfo)) &&
>>> +                type_sz(inst->src[0].type) > 4 &&
>>> +                inst->dst.type != inst->src[0].type;
>>> +      default:
>>> +         /* FIXME: We assume the opcodes don't explicitly mentioned
>>> before
>>> +          * just work fine with arbitrary conversions.
>>> +          */
>>> +         return false;
>>> +      }
>>> +   }
>>> +
>>> +   bool
>>> +   lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst);
>>> +}
>>> +
>>> +namespace brw {
>>> +   /**
>>> +    * Remove any modifiers from the \p i-th source region of the
>>> instruction,
>>> +    * including negate, abs and any implicit type conversion to the
>>> execution
>>> +    * type.  Instead any source modifiers will be implemented as a
>>> separate
>>> +    * MOV instruction prior to the original instruction.
>>> +    */
>>> +   bool
>>> +   lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst
>>> *inst, unsigned i)
>>> +   {
>>> +      assert(inst->components_read(i) == 1);
>>> +      const fs_builder ibld(v, block, inst);
>>> +      const fs_reg tmp = ibld.vgrf(get_exec_type(inst));
>>> +
>>> +      lower_instruction(v, block, ibld.MOV(tmp, inst->src[i]));
>>> +      inst->src[i] = tmp;
>>> +
>>> +      return true;
>>> +   }
>>> +}
>>> +
>>> +namespace {
>>> +   /**
>>> +    * Remove any modifiers from the destination region of the
>>> instruction,
>>> +    * including saturate, conditional mod and any implicit type
>>> conversion
>>> +    * from the execution type.  Instead any destination modifiers
>>> will be
>>> +    * implemented as a separate MOV instruction after the original
>>> +    * instruction.
>>> +    */
>>> +   bool
>>> +   lower_dst_modifiers(fs_visitor *v, bblock_t *block, fs_inst
>>> *inst)
>>> +   {
>>> +      const fs_builder ibld(v, block, inst);
>>> +      const brw_reg_type type = get_exec_type(inst);
>>> +      /* Not strictly necessary, but if possible use a temporary
>>> with the same
>>> +       * channel alignment as the current destination in order to
>>> avoid
>>> +       * violating the restrictions enforced later on by
>>> lower_src_region()
>>> +       * and lower_dst_region(), which would introduce additional
>>> copy
>>> +       * instructions into the program unnecessarily.
>>> +       */
>>> +      const unsigned stride =
>>> +         type_sz(inst->dst.type) * inst->dst.stride <= type_sz(type)
>>> ? 1 :
>>> +         type_sz(inst->dst.type) * inst->dst.stride / type_sz(type);
>>> +      const fs_reg tmp = horiz_stride(ibld.vgrf(type, stride),
>>> stride);
>>> +
>>> +      /* Emit a MOV taking care of all the destination modifiers. */
>>> +      fs_inst *mov = ibld.at(block, inst->next).MOV(inst->dst, tmp);
>>> +      mov->saturate = inst->saturate;
>>> +      mov->conditional_mod = inst->conditional_mod;
>>> +      if (inst->opcode != BRW_OPCODE_SEL) {
>>> +         mov->predicate = inst->predicate;
>>> +         mov->predicate_inverse = inst->predicate_inverse;
>>> +      }
>>> +      mov->flag_subreg = inst->flag_subreg;
>>> +      lower_instruction(v, block, mov);
>>> +
>>> +      /* Point the original instruction at the temporary, and clean
>>> up any
>>> +       * destination modifiers.
>>> +       */
>>> +      assert(inst->size_written == inst->dst.component_size(inst-
>>> >exec_size));
>>> +      inst->dst = tmp;
>>> +      inst->size_written = inst->dst.component_size(inst-
>>> >exec_size);
>>> +      inst->saturate = false;
>>> +      inst->conditional_mod = BRW_CONDITIONAL_NONE;
>>
>> I think this is not correct if if inst is a SEL instruction. Here is an
>> example that is causing a regresion for me:
>>
>> With the original lower_conversions pass I have:
>>
>> sel.l(8) vgrf15+0.0:W, vgrf3+0.0:B, vgrf5+0.0:B 
>> mov(8) vgrf16+0.0<2>:B, vgrf15+0.0:W 
>>
>> Now, with this pass this is turned into:
>>
>> sel(8) vgrf15+0.0:W, vgrf3+0.0:B, vgrf5+0.0:B 
>> mov.l.f0.0(8) vgrf16+0.0<2>:B, vgrf15+0.0:W 
>>
>> So I guess the SEL instruction no longer works since it is not
>> predicated and we have removed the conditional modifier as well.
>>
>> Maybe only move the conditional modifier to the MOV when the
>> instruction is not a SEL like we do for the predicate?
>>
>
> Yeah good point, the semantics of conditional mods are inconsistent for
> the SEL instruction -- Fixed up locally in the same way predicates are
> handled for SEL in the same function.
>

It just occurred to me that there are a couple other ISA instructions
with non-standard semantics for the conditional mod that need to be
special-cased here as well.   I'll reply with a fix in a minute.

>>> +      return true;
>>> +   }
>>> +
>>> +   /**
>>> +    * Remove any non-trivial shuffling of data from the \p i-th
>>> source region
>>> +    * of the instruction.  Instead implement the region as a series
>>> of integer
>>> +    * copies into a temporary with the same channel layout as the
>>> destination.
>>> +    */
>>> +   bool
>>> +   lower_src_region(fs_visitor *v, bblock_t *block, fs_inst *inst,
>>> unsigned i)
>>> +   {
>>> +      assert(inst->components_read(i) == 1);
>>> +      const fs_builder ibld(v, block, inst);
>>> +      const unsigned stride = type_sz(inst->dst.type) * inst-
>>> >dst.stride /
>>> +                              type_sz(inst->src[i].type);
>>> +      assert(stride > 0);
>>> +      const fs_reg tmp = horiz_stride(ibld.vgrf(inst->src[i].type,
>>> stride),
>>> +                                      stride);
>>> +
>>> +      /* Emit a series of 32-bit integer copies with any source
>>> modifiers
>>> +       * cleaned up (because their semantics are dependent on the
>>> type).
>>> +       */
>>> +      const brw_reg_type raw_type =
>>> brw_int_type(MIN2(type_sz(tmp.type), 4),
>>> +                                                 false);
>>> +      const unsigned n = type_sz(tmp.type) / type_sz(raw_type);
>>> +      fs_reg raw_src = inst->src[i];
>>> +      raw_src.negate = false;
>>> +      raw_src.abs = false;
>>> +
>>> +      for (unsigned j = 0; j < n; j++)
>>> +         ibld.MOV(subscript(tmp, raw_type, j), subscript(raw_src,
>>> raw_type, j));
>>> +
>>> +      /* Point the original instruction at the temporary, making
>>> sure to keep
>>> +       * any source modifiers in the instruction.
>>> +       */
>>> +      fs_reg lower_src = tmp;
>>> +      lower_src.negate = inst->src[i].negate;
>>> +      lower_src.abs = inst->src[i].abs;
>>> +      inst->src[i] = lower_src;
>>> +
>>> +      return true;
>>> +   }
>>> +
>>> +   /**
>>> +    * Remove any non-trivial shuffling of data from the destination
>>> region of
>>> +    * the instruction.  Instead implement the region as a series of
>>> integer
>>> +    * copies from a temporary with a channel layout compatible with
>>> the
>>> +    * sources.
>>> +    */
>>> +   bool
>>> +   lower_dst_region(fs_visitor *v, bblock_t *block, fs_inst *inst)
>>> +   {
>>> +      const fs_builder ibld(v, block, inst);
>>> +      const unsigned stride = required_dst_byte_stride(inst) /
>>> +                              type_sz(inst->dst.type);
>>> +      assert(stride > 0);
>>> +      const fs_reg tmp = horiz_stride(ibld.vgrf(inst->dst.type,
>>> stride),
>>> +                                      stride);
>>> +
>>> +      /* Emit a series of 32-bit integer copies from the temporary
>>> into the
>>> +       * original destination.
>>> +       */
>>> +      const brw_reg_type raw_type =
>>> brw_int_type(MIN2(type_sz(tmp.type), 4),
>>> +                                                 false);
>>> +      const unsigned n = type_sz(tmp.type) / type_sz(raw_type);
>>> +
>>> +      if (inst->predicate && inst->opcode != BRW_OPCODE_SEL) {
>>> +         /* Note that in general we cannot simply predicate the
>>> copies on the
>>> +          * same flag register as the original instruction, since it
>>> may have
>>> +          * been overwritten by the instruction itself.  Instead
>>> initialize
>>> +          * the temporary with the previous contents of the
>>> destination
>>> +          * register.
>>> +          */
>>> +         for (unsigned j = 0; j < n; j++)
>>> +            ibld.MOV(subscript(tmp, raw_type, j),
>>> +                     subscript(inst->dst, raw_type, j));
>>> +      }
>>> +
>>> +      for (unsigned j = 0; j < n; j++)
>>> +         ibld.at(block, inst->next).MOV(subscript(inst->dst,
>>> raw_type, j),
>>> +                                        subscript(tmp, raw_type,
>>> j));
>>> +
>>> +      /* Point the original instruction at the temporary, making
>>> sure to keep
>>> +       * any destination modifiers in the instruction.
>>> +       */
>>> +      assert(inst->size_written == inst->dst.component_size(inst-
>>> >exec_size));
>>> +      inst->dst = tmp;
>>> +      inst->size_written = inst->dst.component_size(inst-
>>> >exec_size);
>>> +
>>> +      return true;
>>> +   }
>>> +
>>> +   /**
>>> +    * Legalize the source and destination regioning controls of the
>>> specified
>>> +    * instruction.
>>> +    */
>>> +   bool
>>> +   lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst)
>>> +   {
>>> +      const gen_device_info *devinfo = v->devinfo;
>>> +      bool progress = false;
>>> +
>>> +      if (has_invalid_conversion(devinfo, inst))
>>> +         progress |= lower_dst_modifiers(v, block, inst);
>>> +
>>> +      if (has_invalid_dst_region(devinfo, inst))
>>> +         progress |= lower_dst_region(v, block, inst);
>>> +
>>> +      for (unsigned i = 0; i < inst->sources; i++) {
>>> +         if (has_invalid_src_modifiers(devinfo, inst, i))
>>> +            progress |= lower_src_modifiers(v, block, inst, i);
>>> +
>>> +         if (has_invalid_src_region(devinfo, inst, i))
>>> +            progress |= lower_src_region(v, block, inst, i);
>>> +      }
>>> +
>>> +      return progress;
>>> +   }
>>> +}
>>> +
>>> +bool
>>> +fs_visitor::lower_regioning()
>>> +{
>>> +   bool progress = false;
>>> +
>>> +   foreach_block_and_inst(block, fs_inst, inst, cfg)
>>> +      progress |= lower_instruction(this, block, inst);
>>> +
>>> +   if (progress)
>>> +      invalidate_live_intervals();
>>> +
>>> +   return progress;
>>> +}
>>> diff --git a/src/intel/compiler/brw_ir_fs.h
>>> b/src/intel/compiler/brw_ir_fs.h
>>> index 5bb92e4cc86..3c23fb375e4 100644
>>> --- a/src/intel/compiler/brw_ir_fs.h
>>> +++ b/src/intel/compiler/brw_ir_fs.h
>>> @@ -486,6 +486,16 @@ get_exec_type_size(const fs_inst *inst)
>>>     return type_sz(get_exec_type(inst));
>>>  }
>>>  
>>> +/**
>>> + * Return whether the instruction isn't an ALU instruction and
>>> cannot be
>>> + * assumed to complete in-order.
>>> + */
>>> +static inline bool
>>> +is_unordered(const fs_inst *inst)
>>> +{
>>> +   return inst->mlen || inst->is_send_from_grf() || inst->is_math();
>>> +}
>>> +
>>>  /**
>>>   * Return whether the following regioning restriction applies to the
>>> specified
>>>   * instruction.  From the Cherryview PRM Vol 7. "Register Region
>>> diff --git a/src/intel/compiler/meson.build
>>> b/src/intel/compiler/meson.build
>>> index 69ce2eab4cf..4af134b418e 100644
>>> --- a/src/intel/compiler/meson.build
>>> +++ b/src/intel/compiler/meson.build
>>> @@ -57,6 +57,7 @@ libintel_compiler_files = files(
>>>    'brw_fs_live_variables.h',
>>>    'brw_fs_lower_conversions.cpp',
>>>    'brw_fs_lower_pack.cpp',
>>> +  'brw_fs_lower_regioning.cpp',
>>>    'brw_fs_nir.cpp',
>>>    'brw_fs_reg_allocate.cpp',
>>>    'brw_fs_register_coalesce.cpp',
> _______________________________________________
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev