[v3,25/42] intel/compiler: workaround for SIMD8 half-float MAD in gen8

Submitted by Iago Toral Quiroga on Jan. 15, 2019, 1:53 p.m.

Details

Message ID 20190115135414.2313-26-itoral@igalia.com
State New
Headers show
Series "intel: VK_KHR_shader_float16_int8 implementation" ( rev: 5 4 ) in Mesa

Not browsing as part of any series.

Commit Message

Iago Toral Quiroga Jan. 15, 2019, 1:53 p.m.
Broadwell hardware has a bug that manifests in SIMD8 executions of
16-bit MAD instructions when any of the sources is a Y or W component.
We pack these components in the same SIMD register as components X and
Z respectively, but starting at offset 16B (so they live in the second
half of the register). The problem does not exist in SKL or later.

We work around this issue by moving any such sources to a temporary
starting at offset 0B. We want to do this after the main optimization loop
to prevent copy-propagation and friends to undo the fix.

Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
---
 src/intel/compiler/brw_fs.cpp | 48 +++++++++++++++++++++++++++++++++++
 src/intel/compiler/brw_fs.h   |  1 +
 2 files changed, 49 insertions(+)

Patch hide | download patch | download mbox

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 0b3ec94e2d2..d6096cd667d 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -6540,6 +6540,48 @@  fs_visitor::optimize()
    validate();
 }
 
+/**
+ * Broadwell hardware has a bug that manifests in SIMD8 executions of 16-bit
+ * MAD instructions when any of the sources is a Y or W component. We pack
+ * these components in the same SIMD register as components X and Z
+ * respectively, but starting at offset 16B (so they live in the second half
+ * of the register).
+ *
+ * We work around this issue by moving any such sources to a temporary
+ * starting at offset 0B. We want to do this after the main optimization loop
+ * to prevent copy-propagation and friends to undo the fix.
+ */
+void
+fs_visitor::fixup_hf_mad()
+{
+   if (devinfo->gen != 8)
+      return;
+
+   bool progress = false;
+
+   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
+      if (inst->opcode != BRW_OPCODE_MAD ||
+          inst->dst.type != BRW_REGISTER_TYPE_HF ||
+          inst->exec_size > 8)
+         continue;
+
+      for (int i = 0; i < 3; i++) {
+         if (inst->src[i].offset > 0) {
+            assert(inst->src[i].type == BRW_REGISTER_TYPE_HF);
+            const fs_builder ibld =
+               bld.at(block, inst).exec_all().group(inst->exec_size, 0);
+            fs_reg tmp = ibld.vgrf(inst->src[i].type);
+            ibld.MOV(tmp, inst->src[i]);
+            inst->src[i] = tmp;
+            progress = true;
+         }
+      }
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+}
+
 /**
  * Three source instruction must have a GRF/MRF destination register.
  * ARF NULL is not allowed.  Fix that up by allocating a temporary GRF.
@@ -6698,6 +6740,7 @@  fs_visitor::run_vs()
    assign_curb_setup();
    assign_vs_urb_setup();
 
+   fixup_hf_mad();
    fixup_3src_null_dest();
    allocate_registers(8, true);
 
@@ -6782,6 +6825,7 @@  fs_visitor::run_tcs_single_patch()
    assign_curb_setup();
    assign_tcs_single_patch_urb_setup();
 
+   fixup_hf_mad();
    fixup_3src_null_dest();
    allocate_registers(8, true);
 
@@ -6816,6 +6860,7 @@  fs_visitor::run_tes()
    assign_curb_setup();
    assign_tes_urb_setup();
 
+   fixup_hf_mad();
    fixup_3src_null_dest();
    allocate_registers(8, true);
 
@@ -6865,6 +6910,7 @@  fs_visitor::run_gs()
    assign_curb_setup();
    assign_gs_urb_setup();
 
+   fixup_hf_mad();
    fixup_3src_null_dest();
    allocate_registers(8, true);
 
@@ -6965,6 +7011,7 @@  fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
 
       assign_urb_setup();
 
+      fixup_hf_mad();
       fixup_3src_null_dest();
       allocate_registers(8, allow_spilling);
 
@@ -7009,6 +7056,7 @@  fs_visitor::run_cs(unsigned min_dispatch_width)
 
    assign_curb_setup();
 
+   fixup_hf_mad();
    fixup_3src_null_dest();
    allocate_registers(min_dispatch_width, true);
 
diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index 68287bcdcea..1879d4bc7f7 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -103,6 +103,7 @@  public:
    void setup_vs_payload();
    void setup_gs_payload();
    void setup_cs_payload();
+   void fixup_hf_mad();
    void fixup_3src_null_dest();
    void assign_curb_setup();
    void calculate_urb_setup();

Comments

On Tue, Jan 15, 2019 at 7:55 AM Iago Toral Quiroga <itoral@igalia.com>
wrote:

> Broadwell hardware has a bug that manifests in SIMD8 executions of
> 16-bit MAD instructions when any of the sources is a Y or W component.
> We pack these components in the same SIMD register as components X and
> Z respectively, but starting at offset 16B (so they live in the second
> half of the register). The problem does not exist in SKL or later.
>
> We work around this issue by moving any such sources to a temporary
> starting at offset 0B. We want to do this after the main optimization loop
> to prevent copy-propagation and friends to undo the fix.
>
> Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
> ---
>  src/intel/compiler/brw_fs.cpp | 48 +++++++++++++++++++++++++++++++++++
>  src/intel/compiler/brw_fs.h   |  1 +
>  2 files changed, 49 insertions(+)
>
> diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
> index 0b3ec94e2d2..d6096cd667d 100644
> --- a/src/intel/compiler/brw_fs.cpp
> +++ b/src/intel/compiler/brw_fs.cpp
> @@ -6540,6 +6540,48 @@ fs_visitor::optimize()
>     validate();
>  }
>
> +/**
> + * Broadwell hardware has a bug that manifests in SIMD8 executions of
> 16-bit
> + * MAD instructions when any of the sources is a Y or W component. We pack
> + * these components in the same SIMD register as components X and Z
> + * respectively, but starting at offset 16B (so they live in the second
> half
> + * of the register).
>

What exactly do you mean by a Y or W component?  Is this for the case where
you have a scalar that happens to land at certain offsets?  Or does it
apply to regular stride == 1 MADs?  If it applied in the stride == 1 case,
then I really don't see what this is doing to fix it.  It might help if you
provided some before and after assembly example.

Also, this seems like something that should go in the new region
restrictions pass as a special case in has_invalid_src_region.

--Jason


> + *
> + * We work around this issue by moving any such sources to a temporary
> + * starting at offset 0B. We want to do this after the main optimization
> loop
> + * to prevent copy-propagation and friends to undo the fix.
> + */
> +void
> +fs_visitor::fixup_hf_mad()
> +{
> +   if (devinfo->gen != 8)
> +      return;
> +
> +   bool progress = false;
> +
> +   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
> +      if (inst->opcode != BRW_OPCODE_MAD ||
> +          inst->dst.type != BRW_REGISTER_TYPE_HF ||
> +          inst->exec_size > 8)
> +         continue;
> +
> +      for (int i = 0; i < 3; i++) {
> +         if (inst->src[i].offset > 0) {
> +            assert(inst->src[i].type == BRW_REGISTER_TYPE_HF);
> +            const fs_builder ibld =
> +               bld.at(block, inst).exec_all().group(inst->exec_size, 0);
> +            fs_reg tmp = ibld.vgrf(inst->src[i].type);
> +            ibld.MOV(tmp, inst->src[i]);
> +            inst->src[i] = tmp;
> +            progress = true;
> +         }
> +      }
> +   }
> +
> +   if (progress)
> +      invalidate_live_intervals();
> +}
> +
>  /**
>   * Three source instruction must have a GRF/MRF destination register.
>   * ARF NULL is not allowed.  Fix that up by allocating a temporary GRF.
> @@ -6698,6 +6740,7 @@ fs_visitor::run_vs()
>     assign_curb_setup();
>     assign_vs_urb_setup();
>
> +   fixup_hf_mad();
>     fixup_3src_null_dest();
>     allocate_registers(8, true);
>
> @@ -6782,6 +6825,7 @@ fs_visitor::run_tcs_single_patch()
>     assign_curb_setup();
>     assign_tcs_single_patch_urb_setup();
>
> +   fixup_hf_mad();
>     fixup_3src_null_dest();
>     allocate_registers(8, true);
>
> @@ -6816,6 +6860,7 @@ fs_visitor::run_tes()
>     assign_curb_setup();
>     assign_tes_urb_setup();
>
> +   fixup_hf_mad();
>     fixup_3src_null_dest();
>     allocate_registers(8, true);
>
> @@ -6865,6 +6910,7 @@ fs_visitor::run_gs()
>     assign_curb_setup();
>     assign_gs_urb_setup();
>
> +   fixup_hf_mad();
>     fixup_3src_null_dest();
>     allocate_registers(8, true);
>
> @@ -6965,6 +7011,7 @@ fs_visitor::run_fs(bool allow_spilling, bool
> do_rep_send)
>
>        assign_urb_setup();
>
> +      fixup_hf_mad();
>        fixup_3src_null_dest();
>        allocate_registers(8, allow_spilling);
>
> @@ -7009,6 +7056,7 @@ fs_visitor::run_cs(unsigned min_dispatch_width)
>
>     assign_curb_setup();
>
> +   fixup_hf_mad();
>     fixup_3src_null_dest();
>     allocate_registers(min_dispatch_width, true);
>
> diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
> index 68287bcdcea..1879d4bc7f7 100644
> --- a/src/intel/compiler/brw_fs.h
> +++ b/src/intel/compiler/brw_fs.h
> @@ -103,6 +103,7 @@ public:
>     void setup_vs_payload();
>     void setup_gs_payload();
>     void setup_cs_payload();
> +   void fixup_hf_mad();
>     void fixup_3src_null_dest();
>     void assign_curb_setup();
>     void calculate_urb_setup();
> --
> 2.17.1
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
>
On Fri, 2019-01-18 at 11:51 -0600, Jason Ekstrand wrote:
> On Tue, Jan 15, 2019 at 7:55 AM Iago Toral Quiroga <itoral@igalia.com
> > wrote:
> > Broadwell hardware has a bug that manifests in SIMD8 executions of
> > 
> > 16-bit MAD instructions when any of the sources is a Y or W
> > component.
> > 
> > We pack these components in the same SIMD register as components X
> > and
> > 
> > Z respectively, but starting at offset 16B (so they live in the
> > second
> > 
> > half of the register). The problem does not exist in SKL or later.
> > 
> > 
> > 
> > We work around this issue by moving any such sources to a temporary
> > 
> > starting at offset 0B. We want to do this after the main
> > optimization loop
> > 
> > to prevent copy-propagation and friends to undo the fix.
> > 
> > 
> > 
> > Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
> > 
> > ---
> > 
> >  src/intel/compiler/brw_fs.cpp | 48
> > +++++++++++++++++++++++++++++++++++
> > 
> >  src/intel/compiler/brw_fs.h   |  1 +
> > 
> >  2 files changed, 49 insertions(+)
> > 
> > 
> > 
> > diff --git a/src/intel/compiler/brw_fs.cpp
> > b/src/intel/compiler/brw_fs.cpp
> > 
> > index 0b3ec94e2d2..d6096cd667d 100644
> > 
> > --- a/src/intel/compiler/brw_fs.cpp
> > 
> > +++ b/src/intel/compiler/brw_fs.cpp
> > 
> > @@ -6540,6 +6540,48 @@ fs_visitor::optimize()
> > 
> >     validate();
> > 
> >  }
> > 
> > 
> > 
> > +/**
> > 
> > + * Broadwell hardware has a bug that manifests in SIMD8 executions
> > of 16-bit
> > 
> > + * MAD instructions when any of the sources is a Y or W component.
> > We pack
> > 
> > + * these components in the same SIMD register as components X and
> > Z
> > 
> > + * respectively, but starting at offset 16B (so they live in the
> > second half
> > 
> > + * of the register).
> 
> What exactly do you mean by a Y or W component?  Is this for the case
> where you have a scalar that happens to land at certain offsets?  Or
> does it apply to regular stride == 1 MADs?  If it applied in the
> stride == 1 case, then I really don't see what this is doing to fix
> it.  It might help if you provided some before and after assembly
> example.

This happens when a source to a half-float MAD instruction starts at
offset 16B, which at the time I wrote this patch,  happened when we
were packing the Y component or the W component of a 16-bit vec2/vec4
into the second half of a SIMD register. I have an old version of that
branch and the CTS tests and was able to reproduce the problem. Here is
a code trace which is not working as expected, making some CTS tests
fail:
send(8)         g16<1>UW        g19<8,8,1>UD                           
 data ( DC byte scattered read, 0, 4) mlen 1 rlen 1 { align1 1Q
};mov(8)          g14.8<1>HF      g16<16,8,2>HF                   {
align1 1Q };mad(8)          g18<1>HF        -
g17<4,4,1>HF   g14.8<4,4,1>HF  g11<4,4,1>HF { align16 1Q
};mov(8)          g21<2>W         g18<8,8,1>W                     {
align1 1Q };
If we run this pass, we would produce the same code, only that we would
replace the MAD instruction with this:
mov(8)          g22<1>HF        g14.8<8,8,1>HF                  {
align1 WE_all 1Q };mad(8)          g18<1>HF        -
g17<4,4,1>HF   g22<4,4,1>HF    g11<4,4,1>HF { align16 1Q };
Which makes the test pass.
It seems our compiler produces different code now than when I found
this and these same tests now pass without this pass because we simply
don't hit that scenario any more. It seems as if our shuffling code
after a load is not attempting to pack 2 16-bit vector components in
each VGRF anymore as we used to do when we implemented 16-bit storage
and therefore we no longer hit this scenario. Independently of whether
this change was intended or a bug, the hardware bug is there so I think
we still want to have code to dea with it.
> Also, this seems like something that should go in the new region
> restrictions pass as a special case in has_invalid_src_region.

Yes, I guess that makes sense now that we have this pass. I'll put this
there.
> --Jason
>  
> > + *
> > 
> > + * We work around this issue by moving any such sources to a
> > temporary
> > 
> > + * starting at offset 0B. We want to do this after the main
> > optimization loop
> > 
> > + * to prevent copy-propagation and friends to undo the fix.
> > 
> > + */
> > 
> > +void
> > 
> > +fs_visitor::fixup_hf_mad()
> > 
> > +{
> > 
> > +   if (devinfo->gen != 8)
> > 
> > +      return;
> > 
> > +
> > 
> > +   bool progress = false;
> > 
> > +
> > 
> > +   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
> > 
> > +      if (inst->opcode != BRW_OPCODE_MAD ||
> > 
> > +          inst->dst.type != BRW_REGISTER_TYPE_HF ||
> > 
> > +          inst->exec_size > 8)
> > 
> > +         continue;
> > 
> > +
> > 
> > +      for (int i = 0; i < 3; i++) {
> > 
> > +         if (inst->src[i].offset > 0) {
> > 
> > +            assert(inst->src[i].type == BRW_REGISTER_TYPE_HF);
> > 
> > +            const fs_builder ibld =
> > 
> > +               bld.at(block, inst).exec_all().group(inst-
> > >exec_size, 0);
> > 
> > +            fs_reg tmp = ibld.vgrf(inst->src[i].type);
> > 
> > +            ibld.MOV(tmp, inst->src[i]);
> > 
> > +            inst->src[i] = tmp;
> > 
> > +            progress = true;
> > 
> > +         }
> > 
> > +      }
> > 
> > +   }
> > 
> > +
> > 
> > +   if (progress)
> > 
> > +      invalidate_live_intervals();
> > 
> > +}
> > 
> > +
> > 
> >  /**
> > 
> >   * Three source instruction must have a GRF/MRF destination
> > register.
> > 
> >   * ARF NULL is not allowed.  Fix that up by allocating a temporary
> > GRF.
> > 
> > @@ -6698,6 +6740,7 @@ fs_visitor::run_vs()
> > 
> >     assign_curb_setup();
> > 
> >     assign_vs_urb_setup();
> > 
> > 
> > 
> > +   fixup_hf_mad();
> > 
> >     fixup_3src_null_dest();
> > 
> >     allocate_registers(8, true);
> > 
> > 
> > 
> > @@ -6782,6 +6825,7 @@ fs_visitor::run_tcs_single_patch()
> > 
> >     assign_curb_setup();
> > 
> >     assign_tcs_single_patch_urb_setup();
> > 
> > 
> > 
> > +   fixup_hf_mad();
> > 
> >     fixup_3src_null_dest();
> > 
> >     allocate_registers(8, true);
> > 
> > 
> > 
> > @@ -6816,6 +6860,7 @@ fs_visitor::run_tes()
> > 
> >     assign_curb_setup();
> > 
> >     assign_tes_urb_setup();
> > 
> > 
> > 
> > +   fixup_hf_mad();
> > 
> >     fixup_3src_null_dest();
> > 
> >     allocate_registers(8, true);
> > 
> > 
> > 
> > @@ -6865,6 +6910,7 @@ fs_visitor::run_gs()
> > 
> >     assign_curb_setup();
> > 
> >     assign_gs_urb_setup();
> > 
> > 
> > 
> > +   fixup_hf_mad();
> > 
> >     fixup_3src_null_dest();
> > 
> >     allocate_registers(8, true);
> > 
> > 
> > 
> > @@ -6965,6 +7011,7 @@ fs_visitor::run_fs(bool allow_spilling, bool
> > do_rep_send)
> > 
> > 
> > 
> >        assign_urb_setup();
> > 
> > 
> > 
> > +      fixup_hf_mad();
> > 
> >        fixup_3src_null_dest();
> > 
> >        allocate_registers(8, allow_spilling);
> > 
> > 
> > 
> > @@ -7009,6 +7056,7 @@ fs_visitor::run_cs(unsigned
> > min_dispatch_width)
> > 
> > 
> > 
> >     assign_curb_setup();
> > 
> > 
> > 
> > +   fixup_hf_mad();
> > 
> >     fixup_3src_null_dest();
> > 
> >     allocate_registers(min_dispatch_width, true);
> > 
> > 
> > 
> > diff --git a/src/intel/compiler/brw_fs.h
> > b/src/intel/compiler/brw_fs.h
> > 
> > index 68287bcdcea..1879d4bc7f7 100644
> > 
> > --- a/src/intel/compiler/brw_fs.h
> > 
> > +++ b/src/intel/compiler/brw_fs.h
> > 
> > @@ -103,6 +103,7 @@ public:
> > 
> >     void setup_vs_payload();
> > 
> >     void setup_gs_payload();
> > 
> >     void setup_cs_payload();
> > 
> > +   void fixup_hf_mad();
> > 
> >     void fixup_3src_null_dest();
> > 
> >     void assign_curb_setup();
> > 
> >     void calculate_urb_setup();
> >
On Mon, Jan 21, 2019 at 4:55 AM Iago Toral <itoral@igalia.com> wrote:

> On Fri, 2019-01-18 at 11:51 -0600, Jason Ekstrand wrote:
>
> On Tue, Jan 15, 2019 at 7:55 AM Iago Toral Quiroga <itoral@igalia.com>
> wrote:
>
> Broadwell hardware has a bug that manifests in SIMD8 executions of
> 16-bit MAD instructions when any of the sources is a Y or W component.
> We pack these components in the same SIMD register as components X and
> Z respectively, but starting at offset 16B (so they live in the second
> half of the register). The problem does not exist in SKL or later.
>
> We work around this issue by moving any such sources to a temporary
> starting at offset 0B. We want to do this after the main optimization loop
> to prevent copy-propagation and friends to undo the fix.
>
> Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
> ---
>  src/intel/compiler/brw_fs.cpp | 48 +++++++++++++++++++++++++++++++++++
>  src/intel/compiler/brw_fs.h   |  1 +
>  2 files changed, 49 insertions(+)
>
> diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
> index 0b3ec94e2d2..d6096cd667d 100644
> --- a/src/intel/compiler/brw_fs.cpp
> +++ b/src/intel/compiler/brw_fs.cpp
> @@ -6540,6 +6540,48 @@ fs_visitor::optimize()
>     validate();
>  }
>
> +/**
> + * Broadwell hardware has a bug that manifests in SIMD8 executions of
> 16-bit
> + * MAD instructions when any of the sources is a Y or W component. We pack
> + * these components in the same SIMD register as components X and Z
> + * respectively, but starting at offset 16B (so they live in the second
> half
> + * of the register).
>
>
> What exactly do you mean by a Y or W component?  Is this for the case
> where you have a scalar that happens to land at certain offsets?  Or does
> it apply to regular stride == 1 MADs?  If it applied in the stride == 1
> case, then I really don't see what this is doing to fix it.  It might help
> if you provided some before and after assembly example.
>
>
> This happens when a source to a half-float MAD instruction starts at
> offset 16B, which at the time I wrote this patch, happened when we were
> packing the Y component or the W component of a 16-bit vec2/vec4 into the
> second half of a SIMD register. I have an old version of that branch and
> the CTS tests and was able to reproduce the problem. Here is a code trace
> which is not working as expected, making some CTS tests fail:
>
> send(8)         g16<1>UW        g19<8,8,1>UD
>                             data ( DC byte scattered read, 0, 4) mlen 1
> rlen 1 { align1 1Q };
> mov(8)          g14.8<1>HF      g16<16,8,2>HF                   { align1
> 1Q };
> mad(8)          g18<1>HF        -g17<4,4,1>HF   g14.8<4,4,1>HF  g11<4,4,1>HF
> { align16 1Q };
> mov(8)          g21<2>W         g18<8,8,1>W                     { align1
> 1Q };
>
> If we run this pass, we would produce the same code, only that we would
> replace the MAD instruction with this:
>
> mov(8)          g22<1>HF        g14.8<8,8,1>HF                  { align1
> WE_all 1Q };
> mad(8)          g18<1>HF        -g17<4,4,1>HF   g22<4,4,1>HF    g11<4,4,1>HF
> { align16 1Q };
>
> Which makes the test pass.
>
> It seems our compiler produces different code now than when I found this
> and these same tests now pass without this pass because we simply don't hit
> that scenario any more. It seems as if our shuffling code after a load is
> not attempting to pack 2 16-bit vector components in each VGRF anymore as
> we used to do when we implemented 16-bit storage and therefore we no longer
> hit this scenario. Independently of whether this change was intended or a
> bug, the hardware bug is there so I think we still want to have code to dea
> with it.
>

Thanks for the info!  It makes a lot more sense now.  Does this only apply
to wide reads or does it also fail with a stride of 0?  Also, is it only 16
or is it all non-zero values or everything >= 16?  It'd be good if we could
get more data.


>
> Also, this seems like something that should go in the new region
> restrictions pass as a special case in has_invalid_src_region.
>
>
> Yes, I guess that makes sense now that we have this pass. I'll put this
> there.
>
> --Jason
>
>
> + *
> + * We work around this issue by moving any such sources to a temporary
> + * starting at offset 0B. We want to do this after the main optimization
> loop
> + * to prevent copy-propagation and friends to undo the fix.
> + */
> +void
> +fs_visitor::fixup_hf_mad()
> +{
> +   if (devinfo->gen != 8)
> +      return;
> +
> +   bool progress = false;
> +
> +   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
> +      if (inst->opcode != BRW_OPCODE_MAD ||
> +          inst->dst.type != BRW_REGISTER_TYPE_HF ||
> +          inst->exec_size > 8)
> +         continue;
> +
> +      for (int i = 0; i < 3; i++) {
> +         if (inst->src[i].offset > 0) {
> +            assert(inst->src[i].type == BRW_REGISTER_TYPE_HF);
> +            const fs_builder ibld =
> +               bld.at(block, inst).exec_all().group(inst->exec_size, 0);
> +            fs_reg tmp = ibld.vgrf(inst->src[i].type);
> +            ibld.MOV(tmp, inst->src[i]);
> +            inst->src[i] = tmp;
> +            progress = true;
> +         }
> +      }
> +   }
> +
> +   if (progress)
> +      invalidate_live_intervals();
> +}
> +
>  /**
>   * Three source instruction must have a GRF/MRF destination register.
>   * ARF NULL is not allowed.  Fix that up by allocating a temporary GRF.
> @@ -6698,6 +6740,7 @@ fs_visitor::run_vs()
>     assign_curb_setup();
>     assign_vs_urb_setup();
>
> +   fixup_hf_mad();
>     fixup_3src_null_dest();
>     allocate_registers(8, true);
>
> @@ -6782,6 +6825,7 @@ fs_visitor::run_tcs_single_patch()
>     assign_curb_setup();
>     assign_tcs_single_patch_urb_setup();
>
> +   fixup_hf_mad();
>     fixup_3src_null_dest();
>     allocate_registers(8, true);
>
> @@ -6816,6 +6860,7 @@ fs_visitor::run_tes()
>     assign_curb_setup();
>     assign_tes_urb_setup();
>
> +   fixup_hf_mad();
>     fixup_3src_null_dest();
>     allocate_registers(8, true);
>
> @@ -6865,6 +6910,7 @@ fs_visitor::run_gs()
>     assign_curb_setup();
>     assign_gs_urb_setup();
>
> +   fixup_hf_mad();
>     fixup_3src_null_dest();
>     allocate_registers(8, true);
>
> @@ -6965,6 +7011,7 @@ fs_visitor::run_fs(bool allow_spilling, bool
> do_rep_send)
>
>        assign_urb_setup();
>
> +      fixup_hf_mad();
>        fixup_3src_null_dest();
>        allocate_registers(8, allow_spilling);
>
> @@ -7009,6 +7056,7 @@ fs_visitor::run_cs(unsigned min_dispatch_width)
>
>     assign_curb_setup();
>
> +   fixup_hf_mad();
>     fixup_3src_null_dest();
>     allocate_registers(min_dispatch_width, true);
>
> diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
> index 68287bcdcea..1879d4bc7f7 100644
> --- a/src/intel/compiler/brw_fs.h
> +++ b/src/intel/compiler/brw_fs.h
> @@ -103,6 +103,7 @@ public:
>     void setup_vs_payload();
>     void setup_gs_payload();
>     void setup_cs_payload();
> +   void fixup_hf_mad();
>     void fixup_3src_null_dest();
>     void assign_curb_setup();
>     void calculate_urb_setup();
>
>
On Mon, 2019-01-21 at 18:48 -0600, Jason Ekstrand wrote:
> On Mon, Jan 21, 2019 at 4:55 AM Iago Toral <itoral@igalia.com> wrote:
> > On Fri, 2019-01-18 at 11:51 -0600, Jason Ekstrand wrote:
> > > On Tue, Jan 15, 2019 at 7:55 AM Iago Toral Quiroga <
> > > itoral@igalia.com> wrote:
> > > > Broadwell hardware has a bug that manifests in SIMD8 executions
> > > > of
> > > > 
> > > > 16-bit MAD instructions when any of the sources is a Y or W
> > > > component.
> > > > 
> > > > We pack these components in the same SIMD register as
> > > > components X and
> > > > 
> > > > Z respectively, but starting at offset 16B (so they live in the
> > > > second
> > > > 
> > > > half of the register). The problem does not exist in SKL or
> > > > later.
> > > > 
> > > > 
> > > > 
> > > > We work around this issue by moving any such sources to a
> > > > temporary
> > > > 
> > > > starting at offset 0B. We want to do this after the main
> > > > optimization loop
> > > > 
> > > > to prevent copy-propagation and friends to undo the fix.
> > > > 
> > > > 
> > > > 
> > > > Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
> > > > 
> > > > ---
> > > > 
> > > >  src/intel/compiler/brw_fs.cpp | 48
> > > > +++++++++++++++++++++++++++++++++++
> > > > 
> > > >  src/intel/compiler/brw_fs.h   |  1 +
> > > > 
> > > >  2 files changed, 49 insertions(+)
> > > > 
> > > > 
> > > > 
> > > > diff --git a/src/intel/compiler/brw_fs.cpp
> > > > b/src/intel/compiler/brw_fs.cpp
> > > > 
> > > > index 0b3ec94e2d2..d6096cd667d 100644
> > > > 
> > > > --- a/src/intel/compiler/brw_fs.cpp
> > > > 
> > > > +++ b/src/intel/compiler/brw_fs.cpp
> > > > 
> > > > @@ -6540,6 +6540,48 @@ fs_visitor::optimize()
> > > > 
> > > >     validate();
> > > > 
> > > >  }
> > > > 
> > > > 
> > > > 
> > > > +/**
> > > > 
> > > > + * Broadwell hardware has a bug that manifests in SIMD8
> > > > executions of 16-bit
> > > > 
> > > > + * MAD instructions when any of the sources is a Y or W
> > > > component. We pack
> > > > 
> > > > + * these components in the same SIMD register as components X
> > > > and Z
> > > > 
> > > > + * respectively, but starting at offset 16B (so they live in
> > > > the second half
> > > > 
> > > > + * of the register).
> > > 
> > > What exactly do you mean by a Y or W component?  Is this for the
> > > case where you have a scalar that happens to land at certain
> > > offsets?  Or does it apply to regular stride == 1 MADs?  If it
> > > applied in the stride == 1 case, then I really don't see what
> > > this is doing to fix it.  It might help if you provided some
> > > before and after assembly example.
> > 
> > This happens when a source to a half-float MAD instruction starts
> > at offset 16B, which at the time I wrote this patch,  happened when
> > we were packing the Y component or the W component of a 16-bit
> > vec2/vec4 into the second half of a SIMD register. I have an old
> > version of that branch and the CTS tests and was able to reproduce
> > the problem. Here is a code trace which is not working as expected,
> > making some CTS tests fail:
> > 
> > send(8)         g16<1>UW        g19<8,8,1>UD
> >                             data ( DC byte scattered read, 0, 4)
> > mlen 1 rlen 1 { align1 1Q };
> > mov(8)          g14.8<1>HF      g16<16,8,2>HF                   {
> > align1 1Q };
> > mad(8)          g18<1>HF        -
> > g17<4,4,1>HF   g14.8<4,4,1>HF  g11<4,4,1>HF { align16 1Q };
> > mov(8)          g21<2>W         g18<8,8,1>W                     {
> > align1 1Q };
> > 
> > If we run this pass, we would produce the same code, only that we
> > would replace the MAD instruction with this:
> > 
> > mov(8)          g22<1>HF        g14.8<8,8,1>HF                  {
> > align1 WE_all 1Q };
> > mad(8)          g18<1>HF        -
> > g17<4,4,1>HF   g22<4,4,1>HF    g11<4,4,1>HF { align16 1Q };
> > 
> > Which makes the test pass.
> > 
> > It seems our compiler produces different code now than when I found
> > this and these same tests now pass without this pass because we
> > simply don't hit that scenario any more. It seems as if our
> > shuffling code after a load is not attempting to pack 2 16-bit
> > vector components in each VGRF anymore as we used to do when we
> > implemented 16-bit storage and therefore we no longer hit this
> > scenario. Independently of whether this change was intended or a
> > bug, the hardware bug is there so I think we still want to have
> > code to dea with it.
> 
> Thanks for the info!  It makes a lot more sense now.  Does this only
> apply to wide reads or does it also fail with a stride of 0?  Also,
> is it only 16 or is it all non-zero values or everything >= 16?  It'd
> be good if we could get more data.

Sure, I have hacked a bit the tests and the old branch/driver code that
triggered the problem and experimented with more scenarios. The
conclusions are:
1. All offsets but 0B fail (tested with 16B, 8B, 4B, 3B, 2B, 1B).2. A
stride of 0 seems to works fine with any offset.
When I port this to the regioning lowering pass I'll make it so we
leave sources with a stride of 0 untouched.
>  
> > > Also, this seems like something that should go in the new region
> > > restrictions pass as a special case in has_invalid_src_region.
> > 
> > Yes, I guess that makes sense now that we have this pass. I'll put
> > this there.
> > > --Jason
> > >  
> > > > + *
> > > > 
> > > > + * We work around this issue by moving any such sources to a
> > > > temporary
> > > > 
> > > > + * starting at offset 0B. We want to do this after the main
> > > > optimization loop
> > > > 
> > > > + * to prevent copy-propagation and friends to undo the fix.
> > > > 
> > > > + */
> > > > 
> > > > +void
> > > > 
> > > > +fs_visitor::fixup_hf_mad()
> > > > 
> > > > +{
> > > > 
> > > > +   if (devinfo->gen != 8)
> > > > 
> > > > +      return;
> > > > 
> > > > +
> > > > 
> > > > +   bool progress = false;
> > > > 
> > > > +
> > > > 
> > > > +   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
> > > > 
> > > > +      if (inst->opcode != BRW_OPCODE_MAD ||
> > > > 
> > > > +          inst->dst.type != BRW_REGISTER_TYPE_HF ||
> > > > 
> > > > +          inst->exec_size > 8)
> > > > 
> > > > +         continue;
> > > > 
> > > > +
> > > > 
> > > > +      for (int i = 0; i < 3; i++) {
> > > > 
> > > > +         if (inst->src[i].offset > 0) {
> > > > 
> > > > +            assert(inst->src[i].type == BRW_REGISTER_TYPE_HF);
> > > > 
> > > > +            const fs_builder ibld =
> > > > 
> > > > +               bld.at(block, inst).exec_all().group(inst-
> > > > >exec_size, 0);
> > > > 
> > > > +            fs_reg tmp = ibld.vgrf(inst->src[i].type);
> > > > 
> > > > +            ibld.MOV(tmp, inst->src[i]);
> > > > 
> > > > +            inst->src[i] = tmp;
> > > > 
> > > > +            progress = true;
> > > > 
> > > > +         }
> > > > 
> > > > +      }
> > > > 
> > > > +   }
> > > > 
> > > > +
> > > > 
> > > > +   if (progress)
> > > > 
> > > > +      invalidate_live_intervals();
> > > > 
> > > > +}
> > > > 
> > > > +
> > > > 
> > > >  /**
> > > > 
> > > >   * Three source instruction must have a GRF/MRF destination
> > > > register.
> > > > 
> > > >   * ARF NULL is not allowed.  Fix that up by allocating a
> > > > temporary GRF.
> > > > 
> > > > @@ -6698,6 +6740,7 @@ fs_visitor::run_vs()
> > > > 
> > > >     assign_curb_setup();
> > > > 
> > > >     assign_vs_urb_setup();
> > > > 
> > > > 
> > > > 
> > > > +   fixup_hf_mad();
> > > > 
> > > >     fixup_3src_null_dest();
> > > > 
> > > >     allocate_registers(8, true);
> > > > 
> > > > 
> > > > 
> > > > @@ -6782,6 +6825,7 @@ fs_visitor::run_tcs_single_patch()
> > > > 
> > > >     assign_curb_setup();
> > > > 
> > > >     assign_tcs_single_patch_urb_setup();
> > > > 
> > > > 
> > > > 
> > > > +   fixup_hf_mad();
> > > > 
> > > >     fixup_3src_null_dest();
> > > > 
> > > >     allocate_registers(8, true);
> > > > 
> > > > 
> > > > 
> > > > @@ -6816,6 +6860,7 @@ fs_visitor::run_tes()
> > > > 
> > > >     assign_curb_setup();
> > > > 
> > > >     assign_tes_urb_setup();
> > > > 
> > > > 
> > > > 
> > > > +   fixup_hf_mad();
> > > > 
> > > >     fixup_3src_null_dest();
> > > > 
> > > >     allocate_registers(8, true);
> > > > 
> > > > 
> > > > 
> > > > @@ -6865,6 +6910,7 @@ fs_visitor::run_gs()
> > > > 
> > > >     assign_curb_setup();
> > > > 
> > > >     assign_gs_urb_setup();
> > > > 
> > > > 
> > > > 
> > > > +   fixup_hf_mad();
> > > > 
> > > >     fixup_3src_null_dest();
> > > > 
> > > >     allocate_registers(8, true);
> > > > 
> > > > 
> > > > 
> > > > @@ -6965,6 +7011,7 @@ fs_visitor::run_fs(bool allow_spilling,
> > > > bool do_rep_send)
> > > > 
> > > > 
> > > > 
> > > >        assign_urb_setup();
> > > > 
> > > > 
> > > > 
> > > > +      fixup_hf_mad();
> > > > 
> > > >        fixup_3src_null_dest();
> > > > 
> > > >        allocate_registers(8, allow_spilling);
> > > > 
> > > > 
> > > > 
> > > > @@ -7009,6 +7056,7 @@ fs_visitor::run_cs(unsigned
> > > > min_dispatch_width)
> > > > 
> > > > 
> > > > 
> > > >     assign_curb_setup();
> > > > 
> > > > 
> > > > 
> > > > +   fixup_hf_mad();
> > > > 
> > > >     fixup_3src_null_dest();
> > > > 
> > > >     allocate_registers(min_dispatch_width, true);
> > > > 
> > > > 
> > > > 
> > > > diff --git a/src/intel/compiler/brw_fs.h
> > > > b/src/intel/compiler/brw_fs.h
> > > > 
> > > > index 68287bcdcea..1879d4bc7f7 100644
> > > > 
> > > > --- a/src/intel/compiler/brw_fs.h
> > > > 
> > > > +++ b/src/intel/compiler/brw_fs.h
> > > > 
> > > > @@ -103,6 +103,7 @@ public:
> > > > 
> > > >     void setup_vs_payload();
> > > > 
> > > >     void setup_gs_payload();
> > > > 
> > > >     void setup_cs_payload();
> > > > 
> > > > +   void fixup_hf_mad();
> > > > 
> > > >     void fixup_3src_null_dest();
> > > > 
> > > >     void assign_curb_setup();
> > > > 
> > > >     void calculate_urb_setup();
> > > >