[02/10] intel/fs: Implement quad swizzles on ICL+.

Submitted by Francisco Jerez on Dec. 29, 2018, 8:38 p.m.

Details

Message ID 20181229203904.18847-3-currojerez@riseup.net
State New
Headers show
Series "Series without cover letter" ( rev: 2 1 ) in Mesa

Not browsing as part of any series.

Commit Message

Francisco Jerez Dec. 29, 2018, 8:38 p.m.
Align16 is no longer a thing, so a new implementation is provided
using Align1 instead.  Not all possible swizzles can be represented as
a single Align1 region, but some fast paths are provided for
frequently used swizzles that can be represented efficiently in Align1
mode.

Fixes ~90 subgroup quad swap Vulkan CTS tests.

Cc: mesa-stable@lists.freedesktop.org
---
 src/intel/compiler/brw_fs.cpp           | 25 +++++++-
 src/intel/compiler/brw_fs.h             |  4 ++
 src/intel/compiler/brw_fs_generator.cpp | 82 ++++++++++++++++++++-----
 3 files changed, 93 insertions(+), 18 deletions(-)

Patch hide | download patch | download mbox

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 2f0f0151219..97544fdf465 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -315,6 +315,20 @@  fs_inst::has_source_and_destination_hazard() const
        * may stomp all over it.
        */
       return true;
+   case SHADER_OPCODE_QUAD_SWIZZLE:
+      switch (src[1].ud) {
+      case BRW_SWIZZLE_XXXX:
+      case BRW_SWIZZLE_YYYY:
+      case BRW_SWIZZLE_ZZZZ:
+      case BRW_SWIZZLE_WWWW:
+      case BRW_SWIZZLE_XXZZ:
+      case BRW_SWIZZLE_YYWW:
+      case BRW_SWIZZLE_XYXY:
+      case BRW_SWIZZLE_ZWZW:
+         return false;
+      default:
+         return !is_uniform(src[0]);
+      }
    default:
       /* The SIMD16 compressed instruction
        *
@@ -5579,9 +5593,14 @@  get_lowered_simd_width(const struct gen_device_info *devinfo,
    case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
       return MIN2(8, inst->exec_size);
 
-   case SHADER_OPCODE_QUAD_SWIZZLE:
-      return 8;
-
+   case SHADER_OPCODE_QUAD_SWIZZLE: {
+      const unsigned swiz = inst->src[1].ud;
+      return (is_uniform(inst->src[0]) ?
+                 get_fpu_lowered_simd_width(devinfo, inst) :
+              devinfo->gen < 11 && type_sz(inst->src[0].type) == 4 ? 8 :
+              swiz == BRW_SWIZZLE_XYXY || swiz == BRW_SWIZZLE_ZWZW ? 4 :
+              get_fpu_lowered_simd_width(devinfo, inst));
+   }
    case SHADER_OPCODE_MOV_INDIRECT: {
       /* From IVB and HSW PRMs:
        *
diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index 53d9b6ce7bf..dc36ecc21ac 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -480,6 +480,10 @@  private:
                          struct brw_reg src,
                          struct brw_reg idx);
 
+   void generate_quad_swizzle(const fs_inst *inst,
+                              struct brw_reg dst, struct brw_reg src,
+                              unsigned swiz);
+
    bool patch_discard_jumps_to_fb_writes();
 
    const struct brw_compiler *compiler;
diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp
index 08dd83dded7..84627e83132 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -582,6 +582,72 @@  fs_generator::generate_shuffle(fs_inst *inst,
    }
 }
 
+void
+fs_generator::generate_quad_swizzle(const fs_inst *inst,
+                                    struct brw_reg dst, struct brw_reg src,
+                                    unsigned swiz)
+{
+   /* Requires a quad. */
+   assert(inst->exec_size >= 4);
+
+   if (src.file == BRW_IMMEDIATE_VALUE ||
+       has_scalar_region(src)) {
+      /* The value is uniform across all channels */
+      brw_MOV(p, dst, src);
+
+   } else if (devinfo->gen < 11 && type_sz(src.type) == 4) {
+      /* This only works on 8-wide 32-bit values */
+      assert(inst->exec_size == 8);
+      assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
+      assert(src.vstride == src.width + 1);
+      brw_set_default_access_mode(p, BRW_ALIGN_16);
+      struct brw_reg swiz_src = stride(src, 4, 4, 1);
+      swiz_src.swizzle = swiz;
+      brw_MOV(p, dst, swiz_src);
+
+   } else {
+      assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
+      assert(src.vstride == src.width + 1);
+      const struct brw_reg src_0 = suboffset(src, BRW_GET_SWZ(swiz, 0));
+
+      switch (swiz) {
+      case BRW_SWIZZLE_XXXX:
+      case BRW_SWIZZLE_YYYY:
+      case BRW_SWIZZLE_ZZZZ:
+      case BRW_SWIZZLE_WWWW:
+         brw_MOV(p, dst, stride(src_0, 4, 4, 0));
+         break;
+
+      case BRW_SWIZZLE_XXZZ:
+      case BRW_SWIZZLE_YYWW:
+         brw_MOV(p, dst, stride(src_0, 2, 2, 0));
+         break;
+
+      case BRW_SWIZZLE_XYXY:
+      case BRW_SWIZZLE_ZWZW:
+         assert(inst->exec_size == 4);
+         brw_MOV(p, dst, stride(src_0, 0, 2, 1));
+         break;
+
+      default:
+         assert(inst->force_writemask_all);
+         brw_set_default_exec_size(p, cvt(inst->exec_size / 4) - 1);
+
+         for (unsigned c = 0; c < 4; c++) {
+            brw_inst *insn = brw_MOV(
+               p, stride(suboffset(dst, c),
+                         4 * inst->dst.stride, 1, 4 * inst->dst.stride),
+               stride(suboffset(src, BRW_GET_SWZ(swiz, c)), 4, 1, 0));
+
+            brw_inst_set_no_dd_clear(devinfo, insn, c < 3);
+            brw_inst_set_no_dd_check(devinfo, insn, c > 0);
+         }
+
+         break;
+      }
+   }
+}
+
 void
 fs_generator::generate_urb_read(fs_inst *inst,
                                 struct brw_reg dst,
@@ -2303,23 +2369,9 @@  fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
          break;
 
       case SHADER_OPCODE_QUAD_SWIZZLE:
-         /* This only works on 8-wide 32-bit values */
-         assert(inst->exec_size == 8);
-         assert(type_sz(src[0].type) == 4);
-         assert(inst->force_writemask_all);
          assert(src[1].file == BRW_IMMEDIATE_VALUE);
          assert(src[1].type == BRW_REGISTER_TYPE_UD);
-
-         if (src[0].file == BRW_IMMEDIATE_VALUE ||
-             (src[0].vstride == 0 && src[0].hstride == 0)) {
-            /* The value is uniform across all channels */
-            brw_MOV(p, dst, src[0]);
-         } else {
-            brw_set_default_access_mode(p, BRW_ALIGN_16);
-            struct brw_reg swiz_src = stride(src[0], 4, 4, 1);
-            swiz_src.swizzle = inst->src[1].ud;
-            brw_MOV(p, dst, swiz_src);
-         }
+         generate_quad_swizzle(inst, dst, src[0], src[1].ud);
          break;
 
       case SHADER_OPCODE_CLUSTER_BROADCAST: {

Comments

On Sat, 2018-12-29 at 12:38 -0800, Francisco Jerez wrote:
> Align16 is no longer a thing, so a new implementation is provided
> using Align1 instead.  Not all possible swizzles can be represented
> as
> a single Align1 region, but some fast paths are provided for
> frequently used swizzles that can be represented efficiently in
> Align1
> mode.
> 
> Fixes ~90 subgroup quad swap Vulkan CTS tests.
> 
> Cc: mesa-stable@lists.freedesktop.org
> ---
>  src/intel/compiler/brw_fs.cpp           | 25 +++++++-
>  src/intel/compiler/brw_fs.h             |  4 ++
>  src/intel/compiler/brw_fs_generator.cpp | 82 ++++++++++++++++++++---
> --
>  3 files changed, 93 insertions(+), 18 deletions(-)
> 
> diff --git a/src/intel/compiler/brw_fs.cpp
> b/src/intel/compiler/brw_fs.cpp
> index 2f0f0151219..97544fdf465 100644
> --- a/src/intel/compiler/brw_fs.cpp
> +++ b/src/intel/compiler/brw_fs.cpp
> @@ -315,6 +315,20 @@ fs_inst::has_source_and_destination_hazard()
> const
>         * may stomp all over it.
>         */
>        return true;
> +   case SHADER_OPCODE_QUAD_SWIZZLE:
> +      switch (src[1].ud) {

Maybe it is worth adding a small comment here indicating that these are
the cases where we implement the opcode as a single instruction and
refer to the generator for details?

> +      case BRW_SWIZZLE_XXXX:
> +      case BRW_SWIZZLE_YYYY:
> +      case BRW_SWIZZLE_ZZZZ:
> +      case BRW_SWIZZLE_WWWW:
> +      case BRW_SWIZZLE_XXZZ:
> +      case BRW_SWIZZLE_YYWW:
> +      case BRW_SWIZZLE_XYXY:
> +      case BRW_SWIZZLE_ZWZW:
> +         return false;
> +      default:
> +         return !is_uniform(src[0]);

Shouldn't this be:

return !is_uniform(src[0]) ||
       (devinfo->gen < 11 && type_sz(src.type) == 4);

Since in that case we also implement the opcode with a single ALIGN16
instruction.

> +      }
>     default:
>        /* The SIMD16 compressed instruction
>         *
> @@ -5579,9 +5593,14 @@ get_lowered_simd_width(const struct
> gen_device_info *devinfo,
>     case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
>        return MIN2(8, inst->exec_size);
>  
> -   case SHADER_OPCODE_QUAD_SWIZZLE:
> -      return 8;
> -
> +   case SHADER_OPCODE_QUAD_SWIZZLE: {
> +      const unsigned swiz = inst->src[1].ud;
> +      return (is_uniform(inst->src[0]) ?
> +                 get_fpu_lowered_simd_width(devinfo, inst) :
> +              devinfo->gen < 11 && type_sz(inst->src[0].type) == 4 ?
> 8 :
> +              swiz == BRW_SWIZZLE_XYXY || swiz == BRW_SWIZZLE_ZWZW ?
> 4 :
> +              get_fpu_lowered_simd_width(devinfo, inst));
> +   }
>     case SHADER_OPCODE_MOV_INDIRECT: {
>        /* From IVB and HSW PRMs:
>         *
> diff --git a/src/intel/compiler/brw_fs.h
> b/src/intel/compiler/brw_fs.h
> index 53d9b6ce7bf..dc36ecc21ac 100644
> --- a/src/intel/compiler/brw_fs.h
> +++ b/src/intel/compiler/brw_fs.h
> @@ -480,6 +480,10 @@ private:
>                           struct brw_reg src,
>                           struct brw_reg idx);
>  
> +   void generate_quad_swizzle(const fs_inst *inst,
> +                              struct brw_reg dst, struct brw_reg
> src,
> +                              unsigned swiz);
> +
>     bool patch_discard_jumps_to_fb_writes();
>  
>     const struct brw_compiler *compiler;
> diff --git a/src/intel/compiler/brw_fs_generator.cpp
> b/src/intel/compiler/brw_fs_generator.cpp
> index 08dd83dded7..84627e83132 100644
> --- a/src/intel/compiler/brw_fs_generator.cpp
> +++ b/src/intel/compiler/brw_fs_generator.cpp
> @@ -582,6 +582,72 @@ fs_generator::generate_shuffle(fs_inst *inst,
>     }
>  }
>  
> +void
> +fs_generator::generate_quad_swizzle(const fs_inst *inst,
> +                                    struct brw_reg dst, struct
> brw_reg src,
> +                                    unsigned swiz)
> +{
> +   /* Requires a quad. */
> +   assert(inst->exec_size >= 4);
> +
> +   if (src.file == BRW_IMMEDIATE_VALUE ||
> +       has_scalar_region(src)) {
> +      /* The value is uniform across all channels */
> +      brw_MOV(p, dst, src);
> +
> +   } else if (devinfo->gen < 11 && type_sz(src.type) == 4) {
> +      /* This only works on 8-wide 32-bit values */
> +      assert(inst->exec_size == 8);
> +      assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
> +      assert(src.vstride == src.width + 1);
> +      brw_set_default_access_mode(p, BRW_ALIGN_16);
> +      struct brw_reg swiz_src = stride(src, 4, 4, 1);
> +      swiz_src.swizzle = swiz;
> +      brw_MOV(p, dst, swiz_src);
> +

Extra blank line.

> +   } else {
> +      assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
> +      assert(src.vstride == src.width + 1);
> +      const struct brw_reg src_0 = suboffset(src, BRW_GET_SWZ(swiz,
> 0));
> +
> +      switch (swiz) {
> +      case BRW_SWIZZLE_XXXX:
> +      case BRW_SWIZZLE_YYYY:
> +      case BRW_SWIZZLE_ZZZZ:
> +      case BRW_SWIZZLE_WWWW:
> +         brw_MOV(p, dst, stride(src_0, 4, 4, 0));
> +         break;
> +
> +      case BRW_SWIZZLE_XXZZ:
> +      case BRW_SWIZZLE_YYWW:
> +         brw_MOV(p, dst, stride(src_0, 2, 2, 0));
> +         break;
> +
> +      case BRW_SWIZZLE_XYXY:
> +      case BRW_SWIZZLE_ZWZW:
> +         assert(inst->exec_size == 4);
> +         brw_MOV(p, dst, stride(src_0, 0, 2, 1));
> +         break;
> +
> +      default:
> +         assert(inst->force_writemask_all);
> +         brw_set_default_exec_size(p, cvt(inst->exec_size / 4) - 1);
> +
> +         for (unsigned c = 0; c < 4; c++) {
> +            brw_inst *insn = brw_MOV(
> +               p, stride(suboffset(dst, c),
> +                         4 * inst->dst.stride, 1, 4 * inst-
> >dst.stride),
> +               stride(suboffset(src, BRW_GET_SWZ(swiz, c)), 4, 1,
> 0));
> +
> +            brw_inst_set_no_dd_clear(devinfo, insn, c < 3);
> +            brw_inst_set_no_dd_check(devinfo, insn, c > 0);
> +         }
> +
> +         break;
> +      }
> +   }
> +}
> +
>  void
>  fs_generator::generate_urb_read(fs_inst *inst,
>                                  struct brw_reg dst,
> @@ -2303,23 +2369,9 @@ fs_generator::generate_code(const cfg_t *cfg,
> int dispatch_width)
>           break;
>  
>        case SHADER_OPCODE_QUAD_SWIZZLE:
> -         /* This only works on 8-wide 32-bit values */
> -         assert(inst->exec_size == 8);
> -         assert(type_sz(src[0].type) == 4);
> -         assert(inst->force_writemask_all);
>           assert(src[1].file == BRW_IMMEDIATE_VALUE);
>           assert(src[1].type == BRW_REGISTER_TYPE_UD);
> -
> -         if (src[0].file == BRW_IMMEDIATE_VALUE ||
> -             (src[0].vstride == 0 && src[0].hstride == 0)) {
> -            /* The value is uniform across all channels */
> -            brw_MOV(p, dst, src[0]);
> -         } else {
> -            brw_set_default_access_mode(p, BRW_ALIGN_16);
> -            struct brw_reg swiz_src = stride(src[0], 4, 4, 1);
> -            swiz_src.swizzle = inst->src[1].ud;
> -            brw_MOV(p, dst, swiz_src);
> -         }
> +         generate_quad_swizzle(inst, dst, src[0], src[1].ud);
>           break;
>  
>        case SHADER_OPCODE_CLUSTER_BROADCAST: {
Iago Toral <itoral@igalia.com> writes:

> On Sat, 2018-12-29 at 12:38 -0800, Francisco Jerez wrote:
>> Align16 is no longer a thing, so a new implementation is provided
>> using Align1 instead.  Not all possible swizzles can be represented
>> as
>> a single Align1 region, but some fast paths are provided for
>> frequently used swizzles that can be represented efficiently in
>> Align1
>> mode.
>> 
>> Fixes ~90 subgroup quad swap Vulkan CTS tests.
>> 
>> Cc: mesa-stable@lists.freedesktop.org
>> ---
>>  src/intel/compiler/brw_fs.cpp           | 25 +++++++-
>>  src/intel/compiler/brw_fs.h             |  4 ++
>>  src/intel/compiler/brw_fs_generator.cpp | 82 ++++++++++++++++++++---
>> --
>>  3 files changed, 93 insertions(+), 18 deletions(-)
>> 
>> diff --git a/src/intel/compiler/brw_fs.cpp
>> b/src/intel/compiler/brw_fs.cpp
>> index 2f0f0151219..97544fdf465 100644
>> --- a/src/intel/compiler/brw_fs.cpp
>> +++ b/src/intel/compiler/brw_fs.cpp
>> @@ -315,6 +315,20 @@ fs_inst::has_source_and_destination_hazard()
>> const
>>         * may stomp all over it.
>>         */
>>        return true;
>> +   case SHADER_OPCODE_QUAD_SWIZZLE:
>> +      switch (src[1].ud) {
>
> Maybe it is worth adding a small comment here indicating that these are
> the cases where we implement the opcode as a single instruction and
> refer to the generator for details?
>

Yeah, fixed up locally.

>> +      case BRW_SWIZZLE_XXXX:
>> +      case BRW_SWIZZLE_YYYY:
>> +      case BRW_SWIZZLE_ZZZZ:
>> +      case BRW_SWIZZLE_WWWW:
>> +      case BRW_SWIZZLE_XXZZ:
>> +      case BRW_SWIZZLE_YYWW:
>> +      case BRW_SWIZZLE_XYXY:
>> +      case BRW_SWIZZLE_ZWZW:
>> +         return false;
>> +      default:
>> +         return !is_uniform(src[0]);
>
> Shouldn't this be:
>
> return !is_uniform(src[0]) ||
>        (devinfo->gen < 11 && type_sz(src.type) == 4);
>
> Since in that case we also implement the opcode with a single ALIGN16
> instruction.
>

Not really.  Maybe you mean "!is_uniform(src[0]) &&
(devinfo->gen >= 11 || type_sz(src.type) != 4)" instead?  That would be
somewhat more accurate than the expression in my patch, but
unfortunately the devinfo pointer is not available here.  I wouldn't
mind plumbing it through but patch is meant for mesa-stable, and it
shouldn't affect correctness to be more strict than necessary regarding
source/destination hazards.

>> +      }
>>     default:
>>        /* The SIMD16 compressed instruction
>>         *
>> @@ -5579,9 +5593,14 @@ get_lowered_simd_width(const struct
>> gen_device_info *devinfo,
>>     case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
>>        return MIN2(8, inst->exec_size);
>>  
>> -   case SHADER_OPCODE_QUAD_SWIZZLE:
>> -      return 8;
>> -
>> +   case SHADER_OPCODE_QUAD_SWIZZLE: {
>> +      const unsigned swiz = inst->src[1].ud;
>> +      return (is_uniform(inst->src[0]) ?
>> +                 get_fpu_lowered_simd_width(devinfo, inst) :
>> +              devinfo->gen < 11 && type_sz(inst->src[0].type) == 4 ?
>> 8 :
>> +              swiz == BRW_SWIZZLE_XYXY || swiz == BRW_SWIZZLE_ZWZW ?
>> 4 :
>> +              get_fpu_lowered_simd_width(devinfo, inst));
>> +   }
>>     case SHADER_OPCODE_MOV_INDIRECT: {
>>        /* From IVB and HSW PRMs:
>>         *
>> diff --git a/src/intel/compiler/brw_fs.h
>> b/src/intel/compiler/brw_fs.h
>> index 53d9b6ce7bf..dc36ecc21ac 100644
>> --- a/src/intel/compiler/brw_fs.h
>> +++ b/src/intel/compiler/brw_fs.h
>> @@ -480,6 +480,10 @@ private:
>>                           struct brw_reg src,
>>                           struct brw_reg idx);
>>  
>> +   void generate_quad_swizzle(const fs_inst *inst,
>> +                              struct brw_reg dst, struct brw_reg
>> src,
>> +                              unsigned swiz);
>> +
>>     bool patch_discard_jumps_to_fb_writes();
>>  
>>     const struct brw_compiler *compiler;
>> diff --git a/src/intel/compiler/brw_fs_generator.cpp
>> b/src/intel/compiler/brw_fs_generator.cpp
>> index 08dd83dded7..84627e83132 100644
>> --- a/src/intel/compiler/brw_fs_generator.cpp
>> +++ b/src/intel/compiler/brw_fs_generator.cpp
>> @@ -582,6 +582,72 @@ fs_generator::generate_shuffle(fs_inst *inst,
>>     }
>>  }
>>  
>> +void
>> +fs_generator::generate_quad_swizzle(const fs_inst *inst,
>> +                                    struct brw_reg dst, struct
>> brw_reg src,
>> +                                    unsigned swiz)
>> +{
>> +   /* Requires a quad. */
>> +   assert(inst->exec_size >= 4);
>> +
>> +   if (src.file == BRW_IMMEDIATE_VALUE ||
>> +       has_scalar_region(src)) {
>> +      /* The value is uniform across all channels */
>> +      brw_MOV(p, dst, src);
>> +
>> +   } else if (devinfo->gen < 11 && type_sz(src.type) == 4) {
>> +      /* This only works on 8-wide 32-bit values */
>> +      assert(inst->exec_size == 8);
>> +      assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
>> +      assert(src.vstride == src.width + 1);
>> +      brw_set_default_access_mode(p, BRW_ALIGN_16);
>> +      struct brw_reg swiz_src = stride(src, 4, 4, 1);
>> +      swiz_src.swizzle = swiz;
>> +      brw_MOV(p, dst, swiz_src);
>> +
>
> Extra blank line.
>
>> +   } else {
>> +      assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
>> +      assert(src.vstride == src.width + 1);
>> +      const struct brw_reg src_0 = suboffset(src, BRW_GET_SWZ(swiz,
>> 0));
>> +
>> +      switch (swiz) {
>> +      case BRW_SWIZZLE_XXXX:
>> +      case BRW_SWIZZLE_YYYY:
>> +      case BRW_SWIZZLE_ZZZZ:
>> +      case BRW_SWIZZLE_WWWW:
>> +         brw_MOV(p, dst, stride(src_0, 4, 4, 0));
>> +         break;
>> +
>> +      case BRW_SWIZZLE_XXZZ:
>> +      case BRW_SWIZZLE_YYWW:
>> +         brw_MOV(p, dst, stride(src_0, 2, 2, 0));
>> +         break;
>> +
>> +      case BRW_SWIZZLE_XYXY:
>> +      case BRW_SWIZZLE_ZWZW:
>> +         assert(inst->exec_size == 4);
>> +         brw_MOV(p, dst, stride(src_0, 0, 2, 1));
>> +         break;
>> +
>> +      default:
>> +         assert(inst->force_writemask_all);
>> +         brw_set_default_exec_size(p, cvt(inst->exec_size / 4) - 1);
>> +
>> +         for (unsigned c = 0; c < 4; c++) {
>> +            brw_inst *insn = brw_MOV(
>> +               p, stride(suboffset(dst, c),
>> +                         4 * inst->dst.stride, 1, 4 * inst-
>> >dst.stride),
>> +               stride(suboffset(src, BRW_GET_SWZ(swiz, c)), 4, 1,
>> 0));
>> +
>> +            brw_inst_set_no_dd_clear(devinfo, insn, c < 3);
>> +            brw_inst_set_no_dd_check(devinfo, insn, c > 0);
>> +         }
>> +
>> +         break;
>> +      }
>> +   }
>> +}
>> +
>>  void
>>  fs_generator::generate_urb_read(fs_inst *inst,
>>                                  struct brw_reg dst,
>> @@ -2303,23 +2369,9 @@ fs_generator::generate_code(const cfg_t *cfg,
>> int dispatch_width)
>>           break;
>>  
>>        case SHADER_OPCODE_QUAD_SWIZZLE:
>> -         /* This only works on 8-wide 32-bit values */
>> -         assert(inst->exec_size == 8);
>> -         assert(type_sz(src[0].type) == 4);
>> -         assert(inst->force_writemask_all);
>>           assert(src[1].file == BRW_IMMEDIATE_VALUE);
>>           assert(src[1].type == BRW_REGISTER_TYPE_UD);
>> -
>> -         if (src[0].file == BRW_IMMEDIATE_VALUE ||
>> -             (src[0].vstride == 0 && src[0].hstride == 0)) {
>> -            /* The value is uniform across all channels */
>> -            brw_MOV(p, dst, src[0]);
>> -         } else {
>> -            brw_set_default_access_mode(p, BRW_ALIGN_16);
>> -            struct brw_reg swiz_src = stride(src[0], 4, 4, 1);
>> -            swiz_src.swizzle = inst->src[1].ud;
>> -            brw_MOV(p, dst, swiz_src);
>> -         }
>> +         generate_quad_swizzle(inst, dst, src[0], src[1].ud);
>>           break;
>>  
>>        case SHADER_OPCODE_CLUSTER_BROADCAST: {
>
> _______________________________________________
> mesa-stable mailing list
> mesa-stable@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-stable
On Mon, 2019-01-07 at 11:58 -0800, Francisco Jerez wrote:
> Iago Toral <itoral@igalia.com> writes:
> 
> > On Sat, 2018-12-29 at 12:38 -0800, Francisco Jerez wrote:
> > > Align16 is no longer a thing, so a new implementation is provided
> > > using Align1 instead.  Not all possible swizzles can be
> > > represented
> > > as
> > > a single Align1 region, but some fast paths are provided for
> > > frequently used swizzles that can be represented efficiently in
> > > Align1
> > > mode.
> > > 
> > > Fixes ~90 subgroup quad swap Vulkan CTS tests.
> > > 
> > > Cc: mesa-stable@lists.freedesktop.org
> > > ---
> > >  src/intel/compiler/brw_fs.cpp           | 25 +++++++-
> > >  src/intel/compiler/brw_fs.h             |  4 ++
> > >  src/intel/compiler/brw_fs_generator.cpp | 82
> > > ++++++++++++++++++++---
> > > --
> > >  3 files changed, 93 insertions(+), 18 deletions(-)
> > > 
> > > diff --git a/src/intel/compiler/brw_fs.cpp
> > > b/src/intel/compiler/brw_fs.cpp
> > > index 2f0f0151219..97544fdf465 100644
> > > --- a/src/intel/compiler/brw_fs.cpp
> > > +++ b/src/intel/compiler/brw_fs.cpp
> > > @@ -315,6 +315,20 @@ fs_inst::has_source_and_destination_hazard()
> > > const
> > >         * may stomp all over it.
> > >         */
> > >        return true;
> > > +   case SHADER_OPCODE_QUAD_SWIZZLE:
> > > +      switch (src[1].ud) {
> > 
> > Maybe it is worth adding a small comment here indicating that these
> > are
> > the cases where we implement the opcode as a single instruction and
> > refer to the generator for details?
> > 
> 
> Yeah, fixed up locally.
> 
> > > +      case BRW_SWIZZLE_XXXX:
> > > +      case BRW_SWIZZLE_YYYY:
> > > +      case BRW_SWIZZLE_ZZZZ:
> > > +      case BRW_SWIZZLE_WWWW:
> > > +      case BRW_SWIZZLE_XXZZ:
> > > +      case BRW_SWIZZLE_YYWW:
> > > +      case BRW_SWIZZLE_XYXY:
> > > +      case BRW_SWIZZLE_ZWZW:
> > > +         return false;
> > > +      default:
> > > +         return !is_uniform(src[0]);
> > 
> > Shouldn't this be:
> > 
> > return !is_uniform(src[0]) ||
> >        (devinfo->gen < 11 && type_sz(src.type) == 4);
> > 
> > Since in that case we also implement the opcode with a single
> > ALIGN16
> > instruction.
> > 
> 
> Not really.  Maybe you mean "!is_uniform(src[0]) &&
> (devinfo->gen >= 11 || type_sz(src.type) != 4)" instead? 

Ah yes, that's what I meant.

>  That would be
> somewhat more accurate than the expression in my patch, but
> unfortunately the devinfo pointer is not available here.  I wouldn't
> mind plumbing it through but patch is meant for mesa-stable, and it
> shouldn't affect correctness to be more strict than necessary
> regarding
> source/destination hazards.

Right, I didn't realize we didn't have devinfo available here. I guess
it is fine, the only drawback is that we might add a little bit more of
register pressure in (probably very few) cases in those gens, but that
is a very small issue I guess and it can be improved in a future patch
if we want.

> > > +      }
> > >     default:
> > >        /* The SIMD16 compressed instruction
> > >         *
> > > @@ -5579,9 +5593,14 @@ get_lowered_simd_width(const struct
> > > gen_device_info *devinfo,
> > >     case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
> > >        return MIN2(8, inst->exec_size);
> > >  
> > > -   case SHADER_OPCODE_QUAD_SWIZZLE:
> > > -      return 8;
> > > -
> > > +   case SHADER_OPCODE_QUAD_SWIZZLE: {
> > > +      const unsigned swiz = inst->src[1].ud;
> > > +      return (is_uniform(inst->src[0]) ?
> > > +                 get_fpu_lowered_simd_width(devinfo, inst) :
> > > +              devinfo->gen < 11 && type_sz(inst->src[0].type) ==
> > > 4 ?
> > > 8 :
> > > +              swiz == BRW_SWIZZLE_XYXY || swiz ==
> > > BRW_SWIZZLE_ZWZW ?
> > > 4 :
> > > +              get_fpu_lowered_simd_width(devinfo, inst));
> > > +   }
> > >     case SHADER_OPCODE_MOV_INDIRECT: {
> > >        /* From IVB and HSW PRMs:
> > >         *
> > > diff --git a/src/intel/compiler/brw_fs.h
> > > b/src/intel/compiler/brw_fs.h
> > > index 53d9b6ce7bf..dc36ecc21ac 100644
> > > --- a/src/intel/compiler/brw_fs.h
> > > +++ b/src/intel/compiler/brw_fs.h
> > > @@ -480,6 +480,10 @@ private:
> > >                           struct brw_reg src,
> > >                           struct brw_reg idx);
> > >  
> > > +   void generate_quad_swizzle(const fs_inst *inst,
> > > +                              struct brw_reg dst, struct brw_reg
> > > src,
> > > +                              unsigned swiz);
> > > +
> > >     bool patch_discard_jumps_to_fb_writes();
> > >  
> > >     const struct brw_compiler *compiler;
> > > diff --git a/src/intel/compiler/brw_fs_generator.cpp
> > > b/src/intel/compiler/brw_fs_generator.cpp
> > > index 08dd83dded7..84627e83132 100644
> > > --- a/src/intel/compiler/brw_fs_generator.cpp
> > > +++ b/src/intel/compiler/brw_fs_generator.cpp
> > > @@ -582,6 +582,72 @@ fs_generator::generate_shuffle(fs_inst
> > > *inst,
> > >     }
> > >  }
> > >  
> > > +void
> > > +fs_generator::generate_quad_swizzle(const fs_inst *inst,
> > > +                                    struct brw_reg dst, struct
> > > brw_reg src,
> > > +                                    unsigned swiz)
> > > +{
> > > +   /* Requires a quad. */
> > > +   assert(inst->exec_size >= 4);
> > > +
> > > +   if (src.file == BRW_IMMEDIATE_VALUE ||
> > > +       has_scalar_region(src)) {
> > > +      /* The value is uniform across all channels */
> > > +      brw_MOV(p, dst, src);
> > > +
> > > +   } else if (devinfo->gen < 11 && type_sz(src.type) == 4) {
> > > +      /* This only works on 8-wide 32-bit values */
> > > +      assert(inst->exec_size == 8);
> > > +      assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
> > > +      assert(src.vstride == src.width + 1);
> > > +      brw_set_default_access_mode(p, BRW_ALIGN_16);
> > > +      struct brw_reg swiz_src = stride(src, 4, 4, 1);
> > > +      swiz_src.swizzle = swiz;
> > > +      brw_MOV(p, dst, swiz_src);
> > > +
> > 
> > Extra blank line.
> > 
> > > +   } else {
> > > +      assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
> > > +      assert(src.vstride == src.width + 1);
> > > +      const struct brw_reg src_0 = suboffset(src,
> > > BRW_GET_SWZ(swiz,
> > > 0));
> > > +
> > > +      switch (swiz) {
> > > +      case BRW_SWIZZLE_XXXX:
> > > +      case BRW_SWIZZLE_YYYY:
> > > +      case BRW_SWIZZLE_ZZZZ:
> > > +      case BRW_SWIZZLE_WWWW:
> > > +         brw_MOV(p, dst, stride(src_0, 4, 4, 0));
> > > +         break;
> > > +
> > > +      case BRW_SWIZZLE_XXZZ:
> > > +      case BRW_SWIZZLE_YYWW:
> > > +         brw_MOV(p, dst, stride(src_0, 2, 2, 0));
> > > +         break;
> > > +
> > > +      case BRW_SWIZZLE_XYXY:
> > > +      case BRW_SWIZZLE_ZWZW:
> > > +         assert(inst->exec_size == 4);
> > > +         brw_MOV(p, dst, stride(src_0, 0, 2, 1));
> > > +         break;
> > > +
> > > +      default:
> > > +         assert(inst->force_writemask_all);
> > > +         brw_set_default_exec_size(p, cvt(inst->exec_size / 4) -
> > > 1);
> > > +
> > > +         for (unsigned c = 0; c < 4; c++) {
> > > +            brw_inst *insn = brw_MOV(
> > > +               p, stride(suboffset(dst, c),
> > > +                         4 * inst->dst.stride, 1, 4 * inst-
> > > > dst.stride),
> > > 
> > > +               stride(suboffset(src, BRW_GET_SWZ(swiz, c)), 4,
> > > 1,
> > > 0));
> > > +
> > > +            brw_inst_set_no_dd_clear(devinfo, insn, c < 3);
> > > +            brw_inst_set_no_dd_check(devinfo, insn, c > 0);
> > > +         }
> > > +
> > > +         break;
> > > +      }
> > > +   }
> > > +}
> > > +
> > >  void
> > >  fs_generator::generate_urb_read(fs_inst *inst,
> > >                                  struct brw_reg dst,
> > > @@ -2303,23 +2369,9 @@ fs_generator::generate_code(const cfg_t
> > > *cfg,
> > > int dispatch_width)
> > >           break;
> > >  
> > >        case SHADER_OPCODE_QUAD_SWIZZLE:
> > > -         /* This only works on 8-wide 32-bit values */
> > > -         assert(inst->exec_size == 8);
> > > -         assert(type_sz(src[0].type) == 4);
> > > -         assert(inst->force_writemask_all);
> > >           assert(src[1].file == BRW_IMMEDIATE_VALUE);
> > >           assert(src[1].type == BRW_REGISTER_TYPE_UD);
> > > -
> > > -         if (src[0].file == BRW_IMMEDIATE_VALUE ||
> > > -             (src[0].vstride == 0 && src[0].hstride == 0)) {
> > > -            /* The value is uniform across all channels */
> > > -            brw_MOV(p, dst, src[0]);
> > > -         } else {
> > > -            brw_set_default_access_mode(p, BRW_ALIGN_16);
> > > -            struct brw_reg swiz_src = stride(src[0], 4, 4, 1);
> > > -            swiz_src.swizzle = inst->src[1].ud;
> > > -            brw_MOV(p, dst, swiz_src);
> > > -         }
> > > +         generate_quad_swizzle(inst, dst, src[0], src[1].ud);
> > >           break;
> > >  
> > >        case SHADER_OPCODE_CLUSTER_BROADCAST: {
> > 
> > _______________________________________________
> > mesa-stable mailing list
> > mesa-stable@lists.freedesktop.org
> > https://lists.freedesktop.org/mailman/listinfo/mesa-stable