[1/5] intel/fs: Exclude control sources from execution type and region alignment calculations.

Submitted by Francisco Jerez on Jan. 19, 2019, 12:08 a.m.

Details

Message ID 20190119000902.11597-1-currojerez@riseup.net
State New
Headers show
Series "Series without cover letter" ( rev: 1 ) in Mesa

Not browsing as part of any series.

Commit Message

Francisco Jerez Jan. 19, 2019, 12:08 a.m.
Currently the execution type calculation will return a bogus value in
cases like:

  mov_indirect(8) vgrf0:w, vgrf1:w, vgrf2:ud, 32u

Which will be considered to have a 32-bit integer execution type even
though the actual indirect move operation will be carried out with
16-bit precision.

Similarly there's no need to apply the CHV/BXT double-precision region
alignment restrictions to such control sources, since they aren't
directly involved in the double-precision arithmetic operations
emitted by these virtual instructions.  Applying the CHV/BXT
restrictions to control sources was expected to be harmless if mildly
inefficient, but unfortunately it exposed problems at codegen level
for virtual instructions (namely the SHUFFLE instruction used for the
Vulkan 1.1 subgroup feature) that weren't prepared to accept control
sources with an arbitrary strided region.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=109328
Reported-by: Mark Janes <mark.a.janes@intel.com>
Fixes: efa4e4bc5fc "intel/fs: Introduce regioning lowering pass."
---
 src/intel/compiler/brw_fs.cpp                 | 54 +++++++++++++++++++
 src/intel/compiler/brw_fs_lower_regioning.cpp |  6 +--
 src/intel/compiler/brw_ir_fs.h                | 10 +++-
 3 files changed, 66 insertions(+), 4 deletions(-)

Patch hide | download patch | download mbox

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 0359eb079f7..f475b617df2 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -271,6 +271,60 @@  fs_inst::is_send_from_grf() const
    }
 }
 
+bool
+fs_inst::is_control_source(unsigned arg) const
+{
+   switch (opcode) {
+   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
+   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
+   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4:
+   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
+      return arg == 0;
+
+   case SHADER_OPCODE_BROADCAST:
+   case SHADER_OPCODE_SHUFFLE:
+   case SHADER_OPCODE_QUAD_SWIZZLE:
+   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
+   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
+   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
+   case SHADER_OPCODE_IMAGE_SIZE:
+   case SHADER_OPCODE_GET_BUFFER_SIZE:
+      return arg == 1;
+
+   case SHADER_OPCODE_MOV_INDIRECT:
+   case SHADER_OPCODE_CLUSTER_BROADCAST:
+   case SHADER_OPCODE_TEX:
+   case FS_OPCODE_TXB:
+   case SHADER_OPCODE_TXD:
+   case SHADER_OPCODE_TXF:
+   case SHADER_OPCODE_TXF_LZ:
+   case SHADER_OPCODE_TXF_CMS:
+   case SHADER_OPCODE_TXF_CMS_W:
+   case SHADER_OPCODE_TXF_UMS:
+   case SHADER_OPCODE_TXF_MCS:
+   case SHADER_OPCODE_TXL:
+   case SHADER_OPCODE_TXL_LZ:
+   case SHADER_OPCODE_TXS:
+   case SHADER_OPCODE_LOD:
+   case SHADER_OPCODE_TG4:
+   case SHADER_OPCODE_TG4_OFFSET:
+   case SHADER_OPCODE_SAMPLEINFO:
+   case SHADER_OPCODE_UNTYPED_ATOMIC:
+   case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT:
+   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
+   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
+   case SHADER_OPCODE_BYTE_SCATTERED_READ:
+   case SHADER_OPCODE_BYTE_SCATTERED_WRITE:
+   case SHADER_OPCODE_TYPED_ATOMIC:
+   case SHADER_OPCODE_TYPED_SURFACE_READ:
+   case SHADER_OPCODE_TYPED_SURFACE_WRITE:
+      return arg == 1 || arg == 2;
+
+   default:
+      return false;
+   }
+}
+
 /**
  * Returns true if this instruction's sources and destinations cannot
  * safely be the same register.
diff --git a/src/intel/compiler/brw_fs_lower_regioning.cpp b/src/intel/compiler/brw_fs_lower_regioning.cpp
index df50993dee6..6a3c23892b4 100644
--- a/src/intel/compiler/brw_fs_lower_regioning.cpp
+++ b/src/intel/compiler/brw_fs_lower_regioning.cpp
@@ -74,7 +74,7 @@  namespace {
          unsigned stride = inst->dst.stride * type_sz(inst->dst.type);
 
          for (unsigned i = 0; i < inst->sources; i++) {
-            if (!is_uniform(inst->src[i]))
+            if (!is_uniform(inst->src[i]) && !inst->is_control_source(i))
                stride = MAX2(stride, inst->src[i].stride *
                              type_sz(inst->src[i].type));
          }
@@ -92,7 +92,7 @@  namespace {
    required_dst_byte_offset(const fs_inst *inst)
    {
       for (unsigned i = 0; i < inst->sources; i++) {
-         if (!is_uniform(inst->src[i]))
+         if (!is_uniform(inst->src[i]) && !inst->is_control_source(i))
             if (reg_offset(inst->src[i]) % REG_SIZE !=
                 reg_offset(inst->dst) % REG_SIZE)
                return 0;
@@ -109,7 +109,7 @@  namespace {
    has_invalid_src_region(const gen_device_info *devinfo, const fs_inst *inst,
                           unsigned i)
    {
-      if (is_unordered(inst)) {
+      if (is_unordered(inst) || inst->is_control_source(i)) {
          return false;
       } else {
          const unsigned dst_byte_stride = inst->dst.stride * type_sz(inst->dst.type);
diff --git a/src/intel/compiler/brw_ir_fs.h b/src/intel/compiler/brw_ir_fs.h
index 08e3d83d910..0a0ba1d363a 100644
--- a/src/intel/compiler/brw_ir_fs.h
+++ b/src/intel/compiler/brw_ir_fs.h
@@ -358,6 +358,13 @@  public:
    bool can_change_types() const;
    bool has_source_and_destination_hazard() const;
 
+   /**
+    * Return whether \p arg is a control source of a virtual instruction which
+    * shouldn't contribute to the execution type and usual regioning
+    * restriction calculations of arithmetic instructions.
+    */
+   bool is_control_source(unsigned arg) const;
+
    /**
     * Return the subset of flag registers read by the instruction as a bitset
     * with byte granularity.
@@ -462,7 +469,8 @@  get_exec_type(const fs_inst *inst)
    brw_reg_type exec_type = BRW_REGISTER_TYPE_B;
 
    for (int i = 0; i < inst->sources; i++) {
-      if (inst->src[i].file != BAD_FILE) {
+      if (inst->src[i].file != BAD_FILE &&
+          !inst->is_control_source(i)) {
          const brw_reg_type t = get_exec_type(inst->src[i].type);
          if (type_sz(t) > type_sz(exec_type))
             exec_type = t;

Comments

Fixes all subgroup test failures in vulkancts on Icelake.
Series is:
Tested-by: Anuj Phogat <anuj.phogat@gmail.com>
On Fri, Jan 18, 2019 at 4:09 PM Francisco Jerez <currojerez@riseup.net> wrote:
>
> Currently the execution type calculation will return a bogus value in
> cases like:
>
>   mov_indirect(8) vgrf0:w, vgrf1:w, vgrf2:ud, 32u
>
> Which will be considered to have a 32-bit integer execution type even
> though the actual indirect move operation will be carried out with
> 16-bit precision.
>
> Similarly there's no need to apply the CHV/BXT double-precision region
> alignment restrictions to such control sources, since they aren't
> directly involved in the double-precision arithmetic operations
> emitted by these virtual instructions.  Applying the CHV/BXT
> restrictions to control sources was expected to be harmless if mildly
> inefficient, but unfortunately it exposed problems at codegen level
> for virtual instructions (namely the SHUFFLE instruction used for the
> Vulkan 1.1 subgroup feature) that weren't prepared to accept control
> sources with an arbitrary strided region.
>
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=109328
> Reported-by: Mark Janes <mark.a.janes@intel.com>
> Fixes: efa4e4bc5fc "intel/fs: Introduce regioning lowering pass."
> ---
>  src/intel/compiler/brw_fs.cpp                 | 54 +++++++++++++++++++
>  src/intel/compiler/brw_fs_lower_regioning.cpp |  6 +--
>  src/intel/compiler/brw_ir_fs.h                | 10 +++-
>  3 files changed, 66 insertions(+), 4 deletions(-)
>
> diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
> index 0359eb079f7..f475b617df2 100644
> --- a/src/intel/compiler/brw_fs.cpp
> +++ b/src/intel/compiler/brw_fs.cpp
> @@ -271,6 +271,60 @@ fs_inst::is_send_from_grf() const
>     }
>  }
>
> +bool
> +fs_inst::is_control_source(unsigned arg) const
> +{
> +   switch (opcode) {
> +   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
> +   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
> +   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4:
> +   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 is now removed from
master. This series need a rebase.
> +      return arg == 0;
> +
> +   case SHADER_OPCODE_BROADCAST:
> +   case SHADER_OPCODE_SHUFFLE:
> +   case SHADER_OPCODE_QUAD_SWIZZLE:
> +   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
> +   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
> +   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
> +   case SHADER_OPCODE_IMAGE_SIZE:
> +   case SHADER_OPCODE_GET_BUFFER_SIZE:
> +      return arg == 1;
> +
> +   case SHADER_OPCODE_MOV_INDIRECT:
> +   case SHADER_OPCODE_CLUSTER_BROADCAST:
> +   case SHADER_OPCODE_TEX:
> +   case FS_OPCODE_TXB:
> +   case SHADER_OPCODE_TXD:
> +   case SHADER_OPCODE_TXF:
> +   case SHADER_OPCODE_TXF_LZ:
> +   case SHADER_OPCODE_TXF_CMS:
> +   case SHADER_OPCODE_TXF_CMS_W:
> +   case SHADER_OPCODE_TXF_UMS:
> +   case SHADER_OPCODE_TXF_MCS:
> +   case SHADER_OPCODE_TXL:
> +   case SHADER_OPCODE_TXL_LZ:
> +   case SHADER_OPCODE_TXS:
> +   case SHADER_OPCODE_LOD:
> +   case SHADER_OPCODE_TG4:
> +   case SHADER_OPCODE_TG4_OFFSET:
> +   case SHADER_OPCODE_SAMPLEINFO:
> +   case SHADER_OPCODE_UNTYPED_ATOMIC:
> +   case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT:
> +   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
> +   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
> +   case SHADER_OPCODE_BYTE_SCATTERED_READ:
> +   case SHADER_OPCODE_BYTE_SCATTERED_WRITE:
> +   case SHADER_OPCODE_TYPED_ATOMIC:
> +   case SHADER_OPCODE_TYPED_SURFACE_READ:
> +   case SHADER_OPCODE_TYPED_SURFACE_WRITE:
> +      return arg == 1 || arg == 2;
> +
> +   default:
> +      return false;
> +   }
> +}
> +
>  /**
>   * Returns true if this instruction's sources and destinations cannot
>   * safely be the same register.
> diff --git a/src/intel/compiler/brw_fs_lower_regioning.cpp b/src/intel/compiler/brw_fs_lower_regioning.cpp
> index df50993dee6..6a3c23892b4 100644
> --- a/src/intel/compiler/brw_fs_lower_regioning.cpp
> +++ b/src/intel/compiler/brw_fs_lower_regioning.cpp
> @@ -74,7 +74,7 @@ namespace {
>           unsigned stride = inst->dst.stride * type_sz(inst->dst.type);
>
>           for (unsigned i = 0; i < inst->sources; i++) {
> -            if (!is_uniform(inst->src[i]))
> +            if (!is_uniform(inst->src[i]) && !inst->is_control_source(i))
>                 stride = MAX2(stride, inst->src[i].stride *
>                               type_sz(inst->src[i].type));
>           }
> @@ -92,7 +92,7 @@ namespace {
>     required_dst_byte_offset(const fs_inst *inst)
>     {
>        for (unsigned i = 0; i < inst->sources; i++) {
> -         if (!is_uniform(inst->src[i]))
> +         if (!is_uniform(inst->src[i]) && !inst->is_control_source(i))
>              if (reg_offset(inst->src[i]) % REG_SIZE !=
>                  reg_offset(inst->dst) % REG_SIZE)
>                 return 0;
> @@ -109,7 +109,7 @@ namespace {
>     has_invalid_src_region(const gen_device_info *devinfo, const fs_inst *inst,
>                            unsigned i)
>     {
> -      if (is_unordered(inst)) {
> +      if (is_unordered(inst) || inst->is_control_source(i)) {
>           return false;
>        } else {
>           const unsigned dst_byte_stride = inst->dst.stride * type_sz(inst->dst.type);
> diff --git a/src/intel/compiler/brw_ir_fs.h b/src/intel/compiler/brw_ir_fs.h
> index 08e3d83d910..0a0ba1d363a 100644
> --- a/src/intel/compiler/brw_ir_fs.h
> +++ b/src/intel/compiler/brw_ir_fs.h
> @@ -358,6 +358,13 @@ public:
>     bool can_change_types() const;
>     bool has_source_and_destination_hazard() const;
>
> +   /**
> +    * Return whether \p arg is a control source of a virtual instruction which
> +    * shouldn't contribute to the execution type and usual regioning
> +    * restriction calculations of arithmetic instructions.
> +    */
> +   bool is_control_source(unsigned arg) const;
> +
>     /**
>      * Return the subset of flag registers read by the instruction as a bitset
>      * with byte granularity.
> @@ -462,7 +469,8 @@ get_exec_type(const fs_inst *inst)
>     brw_reg_type exec_type = BRW_REGISTER_TYPE_B;
>
>     for (int i = 0; i < inst->sources; i++) {
> -      if (inst->src[i].file != BAD_FILE) {
> +      if (inst->src[i].file != BAD_FILE &&
> +          !inst->is_control_source(i)) {
>           const brw_reg_type t = get_exec_type(inst->src[i].type);
>           if (type_sz(t) > type_sz(exec_type))
>              exec_type = t;
> --
> 2.19.2
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
On Fri, Jan 18, 2019 at 6:09 PM Francisco Jerez <currojerez@riseup.net>
wrote:

> Currently the execution type calculation will return a bogus value in
> cases like:
>
>   mov_indirect(8) vgrf0:w, vgrf1:w, vgrf2:ud, 32u
>
> Which will be considered to have a 32-bit integer execution type even
> though the actual indirect move operation will be carried out with
> 16-bit precision.
>
> Similarly there's no need to apply the CHV/BXT double-precision region
> alignment restrictions to such control sources, since they aren't
> directly involved in the double-precision arithmetic operations
> emitted by these virtual instructions.  Applying the CHV/BXT
> restrictions to control sources was expected to be harmless if mildly
> inefficient, but unfortunately it exposed problems at codegen level
> for virtual instructions (namely the SHUFFLE instruction used for the
> Vulkan 1.1 subgroup feature) that weren't prepared to accept control
> sources with an arbitrary strided region.
>
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=109328
> Reported-by: Mark Janes <mark.a.janes@intel.com>
> Fixes: efa4e4bc5fc "intel/fs: Introduce regioning lowering pass."
> ---
>  src/intel/compiler/brw_fs.cpp                 | 54 +++++++++++++++++++
>  src/intel/compiler/brw_fs_lower_regioning.cpp |  6 +--
>  src/intel/compiler/brw_ir_fs.h                | 10 +++-
>  3 files changed, 66 insertions(+), 4 deletions(-)
>
> diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
> index 0359eb079f7..f475b617df2 100644
> --- a/src/intel/compiler/brw_fs.cpp
> +++ b/src/intel/compiler/brw_fs.cpp
> @@ -271,6 +271,60 @@ fs_inst::is_send_from_grf() const
>     }
>  }
>
> +bool
> +fs_inst::is_control_source(unsigned arg) const
> +{
> +   switch (opcode) {
> +   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
> +   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
> +   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4:
> +   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
> +      return arg == 0;
> +
> +   case SHADER_OPCODE_BROADCAST:
> +   case SHADER_OPCODE_SHUFFLE:
> +   case SHADER_OPCODE_QUAD_SWIZZLE:
> +   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
> +   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
> +   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
> +   case SHADER_OPCODE_IMAGE_SIZE:
> +   case SHADER_OPCODE_GET_BUFFER_SIZE:
> +      return arg == 1;
> +
> +   case SHADER_OPCODE_MOV_INDIRECT:
> +   case SHADER_OPCODE_CLUSTER_BROADCAST:
> +   case SHADER_OPCODE_TEX:
> +   case FS_OPCODE_TXB:
> +   case SHADER_OPCODE_TXD:
> +   case SHADER_OPCODE_TXF:
> +   case SHADER_OPCODE_TXF_LZ:
> +   case SHADER_OPCODE_TXF_CMS:
> +   case SHADER_OPCODE_TXF_CMS_W:
> +   case SHADER_OPCODE_TXF_UMS:
> +   case SHADER_OPCODE_TXF_MCS:
> +   case SHADER_OPCODE_TXL:
> +   case SHADER_OPCODE_TXL_LZ:
> +   case SHADER_OPCODE_TXS:
> +   case SHADER_OPCODE_LOD:
> +   case SHADER_OPCODE_TG4:
> +   case SHADER_OPCODE_TG4_OFFSET:
> +   case SHADER_OPCODE_SAMPLEINFO:
> +   case SHADER_OPCODE_UNTYPED_ATOMIC:
> +   case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT:
> +   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
> +   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
> +   case SHADER_OPCODE_BYTE_SCATTERED_READ:
> +   case SHADER_OPCODE_BYTE_SCATTERED_WRITE:
> +   case SHADER_OPCODE_TYPED_ATOMIC:
> +   case SHADER_OPCODE_TYPED_SURFACE_READ:
> +   case SHADER_OPCODE_TYPED_SURFACE_WRITE:
>

As of b284d222db, we are no longer using many of the opcodes in this list
(gen7 pull constant loads, [un]typed surface reads/writes, etc.)  It will
need to be rebased and we need to add SHADER_OPCODE_SEND to the list.
Fortunately, the changes to add SHADER_OPCODE_SEND landed before the 19.0
cut-off so there is no need to make two versions for backporting.

Other than that, this patch seems perfectly reasonable to me

Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>

If you want me to hand-review the new list of opcodes, feel free to send a
v2 and cc me.


> +      return arg == 1 || arg == 2;
> +
> +   default:
> +      return false;
> +   }
> +}
> +
>  /**
>   * Returns true if this instruction's sources and destinations cannot
>   * safely be the same register.
> diff --git a/src/intel/compiler/brw_fs_lower_regioning.cpp
> b/src/intel/compiler/brw_fs_lower_regioning.cpp
> index df50993dee6..6a3c23892b4 100644
> --- a/src/intel/compiler/brw_fs_lower_regioning.cpp
> +++ b/src/intel/compiler/brw_fs_lower_regioning.cpp
> @@ -74,7 +74,7 @@ namespace {
>           unsigned stride = inst->dst.stride * type_sz(inst->dst.type);
>
>           for (unsigned i = 0; i < inst->sources; i++) {
> -            if (!is_uniform(inst->src[i]))
> +            if (!is_uniform(inst->src[i]) && !inst->is_control_source(i))
>                 stride = MAX2(stride, inst->src[i].stride *
>                               type_sz(inst->src[i].type));
>           }
> @@ -92,7 +92,7 @@ namespace {
>     required_dst_byte_offset(const fs_inst *inst)
>     {
>        for (unsigned i = 0; i < inst->sources; i++) {
> -         if (!is_uniform(inst->src[i]))
> +         if (!is_uniform(inst->src[i]) && !inst->is_control_source(i))
>              if (reg_offset(inst->src[i]) % REG_SIZE !=
>                  reg_offset(inst->dst) % REG_SIZE)
>                 return 0;
> @@ -109,7 +109,7 @@ namespace {
>     has_invalid_src_region(const gen_device_info *devinfo, const fs_inst
> *inst,
>                            unsigned i)
>     {
> -      if (is_unordered(inst)) {
> +      if (is_unordered(inst) || inst->is_control_source(i)) {
>           return false;
>        } else {
>           const unsigned dst_byte_stride = inst->dst.stride *
> type_sz(inst->dst.type);
> diff --git a/src/intel/compiler/brw_ir_fs.h
> b/src/intel/compiler/brw_ir_fs.h
> index 08e3d83d910..0a0ba1d363a 100644
> --- a/src/intel/compiler/brw_ir_fs.h
> +++ b/src/intel/compiler/brw_ir_fs.h
> @@ -358,6 +358,13 @@ public:
>     bool can_change_types() const;
>     bool has_source_and_destination_hazard() const;
>
> +   /**
> +    * Return whether \p arg is a control source of a virtual instruction
> which
> +    * shouldn't contribute to the execution type and usual regioning
> +    * restriction calculations of arithmetic instructions.
> +    */
> +   bool is_control_source(unsigned arg) const;
> +
>     /**
>      * Return the subset of flag registers read by the instruction as a
> bitset
>      * with byte granularity.
> @@ -462,7 +469,8 @@ get_exec_type(const fs_inst *inst)
>     brw_reg_type exec_type = BRW_REGISTER_TYPE_B;
>
>     for (int i = 0; i < inst->sources; i++) {
> -      if (inst->src[i].file != BAD_FILE) {
> +      if (inst->src[i].file != BAD_FILE &&
> +          !inst->is_control_source(i)) {
>           const brw_reg_type t = get_exec_type(inst->src[i].type);
>           if (type_sz(t) > type_sz(exec_type))
>              exec_type = t;
> --
> 2.19.2
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
>
Jason Ekstrand <jason@jlekstrand.net> writes:

> On Fri, Jan 18, 2019 at 6:09 PM Francisco Jerez <currojerez@riseup.net>
> wrote:
>
>> Currently the execution type calculation will return a bogus value in
>> cases like:
>>
>>   mov_indirect(8) vgrf0:w, vgrf1:w, vgrf2:ud, 32u
>>
>> Which will be considered to have a 32-bit integer execution type even
>> though the actual indirect move operation will be carried out with
>> 16-bit precision.
>>
>> Similarly there's no need to apply the CHV/BXT double-precision region
>> alignment restrictions to such control sources, since they aren't
>> directly involved in the double-precision arithmetic operations
>> emitted by these virtual instructions.  Applying the CHV/BXT
>> restrictions to control sources was expected to be harmless if mildly
>> inefficient, but unfortunately it exposed problems at codegen level
>> for virtual instructions (namely the SHUFFLE instruction used for the
>> Vulkan 1.1 subgroup feature) that weren't prepared to accept control
>> sources with an arbitrary strided region.
>>
>> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=109328
>> Reported-by: Mark Janes <mark.a.janes@intel.com>
>> Fixes: efa4e4bc5fc "intel/fs: Introduce regioning lowering pass."
>> ---
>>  src/intel/compiler/brw_fs.cpp                 | 54 +++++++++++++++++++
>>  src/intel/compiler/brw_fs_lower_regioning.cpp |  6 +--
>>  src/intel/compiler/brw_ir_fs.h                | 10 +++-
>>  3 files changed, 66 insertions(+), 4 deletions(-)
>>
>> diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
>> index 0359eb079f7..f475b617df2 100644
>> --- a/src/intel/compiler/brw_fs.cpp
>> +++ b/src/intel/compiler/brw_fs.cpp
>> @@ -271,6 +271,60 @@ fs_inst::is_send_from_grf() const
>>     }
>>  }
>>
>> +bool
>> +fs_inst::is_control_source(unsigned arg) const
>> +{
>> +   switch (opcode) {
>> +   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
>> +   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
>> +   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4:
>> +   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
>> +      return arg == 0;
>> +
>> +   case SHADER_OPCODE_BROADCAST:
>> +   case SHADER_OPCODE_SHUFFLE:
>> +   case SHADER_OPCODE_QUAD_SWIZZLE:
>> +   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
>> +   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
>> +   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
>> +   case SHADER_OPCODE_IMAGE_SIZE:
>> +   case SHADER_OPCODE_GET_BUFFER_SIZE:
>> +      return arg == 1;
>> +
>> +   case SHADER_OPCODE_MOV_INDIRECT:
>> +   case SHADER_OPCODE_CLUSTER_BROADCAST:
>> +   case SHADER_OPCODE_TEX:
>> +   case FS_OPCODE_TXB:
>> +   case SHADER_OPCODE_TXD:
>> +   case SHADER_OPCODE_TXF:
>> +   case SHADER_OPCODE_TXF_LZ:
>> +   case SHADER_OPCODE_TXF_CMS:
>> +   case SHADER_OPCODE_TXF_CMS_W:
>> +   case SHADER_OPCODE_TXF_UMS:
>> +   case SHADER_OPCODE_TXF_MCS:
>> +   case SHADER_OPCODE_TXL:
>> +   case SHADER_OPCODE_TXL_LZ:
>> +   case SHADER_OPCODE_TXS:
>> +   case SHADER_OPCODE_LOD:
>> +   case SHADER_OPCODE_TG4:
>> +   case SHADER_OPCODE_TG4_OFFSET:
>> +   case SHADER_OPCODE_SAMPLEINFO:
>> +   case SHADER_OPCODE_UNTYPED_ATOMIC:
>> +   case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT:
>> +   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
>> +   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
>> +   case SHADER_OPCODE_BYTE_SCATTERED_READ:
>> +   case SHADER_OPCODE_BYTE_SCATTERED_WRITE:
>> +   case SHADER_OPCODE_TYPED_ATOMIC:
>> +   case SHADER_OPCODE_TYPED_SURFACE_READ:
>> +   case SHADER_OPCODE_TYPED_SURFACE_WRITE:
>>
>
> As of b284d222db, we are no longer using many of the opcodes in this list
> (gen7 pull constant loads, [un]typed surface reads/writes, etc.)  It will
> need to be rebased and we need to add SHADER_OPCODE_SEND to the list.
> Fortunately, the changes to add SHADER_OPCODE_SEND landed before the 19.0
> cut-off so there is no need to make two versions for backporting.
>

Yes, that's roughly what I had done during one of my previous rebases of
this series, see:

https://cgit.freedesktop.org/~currojerez/mesa/commit/?h=jenkins&id=30f8f3ff48b02ead688705e0679a98c0d6c9c87e

> Other than that, this patch seems perfectly reasonable to me
>
> Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
>
> If you want me to hand-review the new list of opcodes, feel free to send a
> v2 and cc me.
>
>
>> +      return arg == 1 || arg == 2;
>> +
>> +   default:
>> +      return false;
>> +   }
>> +}
>> +
>>  /**
>>   * Returns true if this instruction's sources and destinations cannot
>>   * safely be the same register.
>> diff --git a/src/intel/compiler/brw_fs_lower_regioning.cpp
>> b/src/intel/compiler/brw_fs_lower_regioning.cpp
>> index df50993dee6..6a3c23892b4 100644
>> --- a/src/intel/compiler/brw_fs_lower_regioning.cpp
>> +++ b/src/intel/compiler/brw_fs_lower_regioning.cpp
>> @@ -74,7 +74,7 @@ namespace {
>>           unsigned stride = inst->dst.stride * type_sz(inst->dst.type);
>>
>>           for (unsigned i = 0; i < inst->sources; i++) {
>> -            if (!is_uniform(inst->src[i]))
>> +            if (!is_uniform(inst->src[i]) && !inst->is_control_source(i))
>>                 stride = MAX2(stride, inst->src[i].stride *
>>                               type_sz(inst->src[i].type));
>>           }
>> @@ -92,7 +92,7 @@ namespace {
>>     required_dst_byte_offset(const fs_inst *inst)
>>     {
>>        for (unsigned i = 0; i < inst->sources; i++) {
>> -         if (!is_uniform(inst->src[i]))
>> +         if (!is_uniform(inst->src[i]) && !inst->is_control_source(i))
>>              if (reg_offset(inst->src[i]) % REG_SIZE !=
>>                  reg_offset(inst->dst) % REG_SIZE)
>>                 return 0;
>> @@ -109,7 +109,7 @@ namespace {
>>     has_invalid_src_region(const gen_device_info *devinfo, const fs_inst
>> *inst,
>>                            unsigned i)
>>     {
>> -      if (is_unordered(inst)) {
>> +      if (is_unordered(inst) || inst->is_control_source(i)) {
>>           return false;
>>        } else {
>>           const unsigned dst_byte_stride = inst->dst.stride *
>> type_sz(inst->dst.type);
>> diff --git a/src/intel/compiler/brw_ir_fs.h
>> b/src/intel/compiler/brw_ir_fs.h
>> index 08e3d83d910..0a0ba1d363a 100644
>> --- a/src/intel/compiler/brw_ir_fs.h
>> +++ b/src/intel/compiler/brw_ir_fs.h
>> @@ -358,6 +358,13 @@ public:
>>     bool can_change_types() const;
>>     bool has_source_and_destination_hazard() const;
>>
>> +   /**
>> +    * Return whether \p arg is a control source of a virtual instruction
>> which
>> +    * shouldn't contribute to the execution type and usual regioning
>> +    * restriction calculations of arithmetic instructions.
>> +    */
>> +   bool is_control_source(unsigned arg) const;
>> +
>>     /**
>>      * Return the subset of flag registers read by the instruction as a
>> bitset
>>      * with byte granularity.
>> @@ -462,7 +469,8 @@ get_exec_type(const fs_inst *inst)
>>     brw_reg_type exec_type = BRW_REGISTER_TYPE_B;
>>
>>     for (int i = 0; i < inst->sources; i++) {
>> -      if (inst->src[i].file != BAD_FILE) {
>> +      if (inst->src[i].file != BAD_FILE &&
>> +          !inst->is_control_source(i)) {
>>           const brw_reg_type t = get_exec_type(inst->src[i].type);
>>           if (type_sz(t) > type_sz(exec_type))
>>              exec_type = t;
>> --
>> 2.19.2
>>
>> _______________________________________________
>> mesa-dev mailing list
>> mesa-dev@lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
>>
On Thu, Feb 14, 2019 at 9:33 PM Francisco Jerez <currojerez@riseup.net>
wrote:

> Jason Ekstrand <jason@jlekstrand.net> writes:
>
> > On Fri, Jan 18, 2019 at 6:09 PM Francisco Jerez <currojerez@riseup.net>
> > wrote:
> >
> >> Currently the execution type calculation will return a bogus value in
> >> cases like:
> >>
> >>   mov_indirect(8) vgrf0:w, vgrf1:w, vgrf2:ud, 32u
> >>
> >> Which will be considered to have a 32-bit integer execution type even
> >> though the actual indirect move operation will be carried out with
> >> 16-bit precision.
> >>
> >> Similarly there's no need to apply the CHV/BXT double-precision region
> >> alignment restrictions to such control sources, since they aren't
> >> directly involved in the double-precision arithmetic operations
> >> emitted by these virtual instructions.  Applying the CHV/BXT
> >> restrictions to control sources was expected to be harmless if mildly
> >> inefficient, but unfortunately it exposed problems at codegen level
> >> for virtual instructions (namely the SHUFFLE instruction used for the
> >> Vulkan 1.1 subgroup feature) that weren't prepared to accept control
> >> sources with an arbitrary strided region.
> >>
> >> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=109328
> >> Reported-by: Mark Janes <mark.a.janes@intel.com>
> >> Fixes: efa4e4bc5fc "intel/fs: Introduce regioning lowering pass."
> >> ---
> >>  src/intel/compiler/brw_fs.cpp                 | 54 +++++++++++++++++++
> >>  src/intel/compiler/brw_fs_lower_regioning.cpp |  6 +--
> >>  src/intel/compiler/brw_ir_fs.h                | 10 +++-
> >>  3 files changed, 66 insertions(+), 4 deletions(-)
> >>
> >> diff --git a/src/intel/compiler/brw_fs.cpp
> b/src/intel/compiler/brw_fs.cpp
> >> index 0359eb079f7..f475b617df2 100644
> >> --- a/src/intel/compiler/brw_fs.cpp
> >> +++ b/src/intel/compiler/brw_fs.cpp
> >> @@ -271,6 +271,60 @@ fs_inst::is_send_from_grf() const
> >>     }
> >>  }
> >>
> >> +bool
> >> +fs_inst::is_control_source(unsigned arg) const
> >> +{
> >> +   switch (opcode) {
> >> +   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
> >> +   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
> >> +   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4:
> >> +   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
> >> +      return arg == 0;
> >> +
> >> +   case SHADER_OPCODE_BROADCAST:
> >> +   case SHADER_OPCODE_SHUFFLE:
> >> +   case SHADER_OPCODE_QUAD_SWIZZLE:
> >> +   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
> >> +   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
> >> +   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
> >> +   case SHADER_OPCODE_IMAGE_SIZE:
> >> +   case SHADER_OPCODE_GET_BUFFER_SIZE:
> >> +      return arg == 1;
> >> +
> >> +   case SHADER_OPCODE_MOV_INDIRECT:
> >> +   case SHADER_OPCODE_CLUSTER_BROADCAST:
> >> +   case SHADER_OPCODE_TEX:
> >> +   case FS_OPCODE_TXB:
> >> +   case SHADER_OPCODE_TXD:
> >> +   case SHADER_OPCODE_TXF:
> >> +   case SHADER_OPCODE_TXF_LZ:
> >> +   case SHADER_OPCODE_TXF_CMS:
> >> +   case SHADER_OPCODE_TXF_CMS_W:
> >> +   case SHADER_OPCODE_TXF_UMS:
> >> +   case SHADER_OPCODE_TXF_MCS:
> >> +   case SHADER_OPCODE_TXL:
> >> +   case SHADER_OPCODE_TXL_LZ:
> >> +   case SHADER_OPCODE_TXS:
> >> +   case SHADER_OPCODE_LOD:
> >> +   case SHADER_OPCODE_TG4:
> >> +   case SHADER_OPCODE_TG4_OFFSET:
> >> +   case SHADER_OPCODE_SAMPLEINFO:
> >> +   case SHADER_OPCODE_UNTYPED_ATOMIC:
> >> +   case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT:
> >> +   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
> >> +   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
> >> +   case SHADER_OPCODE_BYTE_SCATTERED_READ:
> >> +   case SHADER_OPCODE_BYTE_SCATTERED_WRITE:
> >> +   case SHADER_OPCODE_TYPED_ATOMIC:
> >> +   case SHADER_OPCODE_TYPED_SURFACE_READ:
> >> +   case SHADER_OPCODE_TYPED_SURFACE_WRITE:
> >>
> >
> > As of b284d222db, we are no longer using many of the opcodes in this list
> > (gen7 pull constant loads, [un]typed surface reads/writes, etc.)  It will
> > need to be rebased and we need to add SHADER_OPCODE_SEND to the list.
> > Fortunately, the changes to add SHADER_OPCODE_SEND landed before the 19.0
> > cut-off so there is no need to make two versions for backporting.
> >
>
> Yes, that's roughly what I had done during one of my previous rebases of
> this series, see:
>
>
> https://cgit.freedesktop.org/~currojerez/mesa/commit/?h=jenkins&id=30f8f3ff48b02ead688705e0679a98c0d6c9c87e
>

Looks good.  Some of those opcodes are no longer used.  I sent out a MR to
clean some of it up but I think it's better if this lands first so that it
can be more cleanly back-ported

Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>


> > Other than that, this patch seems perfectly reasonable to me
> >
> > Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
> >
> > If you want me to hand-review the new list of opcodes, feel free to send
> a
> > v2 and cc me.
> >
> >
> >> +      return arg == 1 || arg == 2;
> >> +
> >> +   default:
> >> +      return false;
> >> +   }
> >> +}
> >> +
> >>  /**
> >>   * Returns true if this instruction's sources and destinations cannot
> >>   * safely be the same register.
> >> diff --git a/src/intel/compiler/brw_fs_lower_regioning.cpp
> >> b/src/intel/compiler/brw_fs_lower_regioning.cpp
> >> index df50993dee6..6a3c23892b4 100644
> >> --- a/src/intel/compiler/brw_fs_lower_regioning.cpp
> >> +++ b/src/intel/compiler/brw_fs_lower_regioning.cpp
> >> @@ -74,7 +74,7 @@ namespace {
> >>           unsigned stride = inst->dst.stride * type_sz(inst->dst.type);
> >>
> >>           for (unsigned i = 0; i < inst->sources; i++) {
> >> -            if (!is_uniform(inst->src[i]))
> >> +            if (!is_uniform(inst->src[i]) &&
> !inst->is_control_source(i))
> >>                 stride = MAX2(stride, inst->src[i].stride *
> >>                               type_sz(inst->src[i].type));
> >>           }
> >> @@ -92,7 +92,7 @@ namespace {
> >>     required_dst_byte_offset(const fs_inst *inst)
> >>     {
> >>        for (unsigned i = 0; i < inst->sources; i++) {
> >> -         if (!is_uniform(inst->src[i]))
> >> +         if (!is_uniform(inst->src[i]) && !inst->is_control_source(i))
> >>              if (reg_offset(inst->src[i]) % REG_SIZE !=
> >>                  reg_offset(inst->dst) % REG_SIZE)
> >>                 return 0;
> >> @@ -109,7 +109,7 @@ namespace {
> >>     has_invalid_src_region(const gen_device_info *devinfo, const fs_inst
> >> *inst,
> >>                            unsigned i)
> >>     {
> >> -      if (is_unordered(inst)) {
> >> +      if (is_unordered(inst) || inst->is_control_source(i)) {
> >>           return false;
> >>        } else {
> >>           const unsigned dst_byte_stride = inst->dst.stride *
> >> type_sz(inst->dst.type);
> >> diff --git a/src/intel/compiler/brw_ir_fs.h
> >> b/src/intel/compiler/brw_ir_fs.h
> >> index 08e3d83d910..0a0ba1d363a 100644
> >> --- a/src/intel/compiler/brw_ir_fs.h
> >> +++ b/src/intel/compiler/brw_ir_fs.h
> >> @@ -358,6 +358,13 @@ public:
> >>     bool can_change_types() const;
> >>     bool has_source_and_destination_hazard() const;
> >>
> >> +   /**
> >> +    * Return whether \p arg is a control source of a virtual
> instruction
> >> which
> >> +    * shouldn't contribute to the execution type and usual regioning
> >> +    * restriction calculations of arithmetic instructions.
> >> +    */
> >> +   bool is_control_source(unsigned arg) const;
> >> +
> >>     /**
> >>      * Return the subset of flag registers read by the instruction as a
> >> bitset
> >>      * with byte granularity.
> >> @@ -462,7 +469,8 @@ get_exec_type(const fs_inst *inst)
> >>     brw_reg_type exec_type = BRW_REGISTER_TYPE_B;
> >>
> >>     for (int i = 0; i < inst->sources; i++) {
> >> -      if (inst->src[i].file != BAD_FILE) {
> >> +      if (inst->src[i].file != BAD_FILE &&
> >> +          !inst->is_control_source(i)) {
> >>           const brw_reg_type t = get_exec_type(inst->src[i].type);
> >>           if (type_sz(t) > type_sz(exec_type))
> >>              exec_type = t;
> >> --
> >> 2.19.2
> >>
> >> _______________________________________________
> >> mesa-dev mailing list
> >> mesa-dev@lists.freedesktop.org
> >> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
> >>
>