[v2,35/53] intel/compiler: workaround for SIMD8 half-float MAD in gen < 9

Submitted by Iago Toral Quiroga on Dec. 19, 2018, 11:51 a.m.

Details

Message ID 20181219115121.20815-36-itoral@igalia.com
State New
Headers show
Series "intel: VK_KHR_shader_float16_int8 implementation" ( rev: 3 ) in Mesa

Not browsing as part of any series.

Commit Message

Iago Toral Quiroga Dec. 19, 2018, 11:51 a.m.
Broadwell hardware has a bug that manifests in SIMD8 executions of
16-bit MAD instructions when any of the sources is a Y or W component.
We pack these components in the same SIMD register as components X and
Z respectively, but starting at offset 16B (so they live in the second
half of the register). The problem does not exist in SKL or later.

We work around this issue by moving any such sources to a temporary
starting at offset 0B. We want to do this after the main optimization loop
to prevent copy-propagation and friends to undo the fix.
---
 src/intel/compiler/brw_fs.cpp | 48 +++++++++++++++++++++++++++++++++++
 src/intel/compiler/brw_fs.h   |  1 +
 2 files changed, 49 insertions(+)

Patch hide | download patch | download mbox

diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 933b0b6ffc4..1343c2f4993 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -6449,6 +6449,48 @@  fs_visitor::optimize()
    validate();
 }
 
+/**
+ * Broadwell hardware has a bug that manifests in SIMD8 executions of 16-bit
+ * MAD instructions when any of the sources is a Y or W component. We pack
+ * these components in the same SIMD register as components X and Z
+ * respectively, but starting at offset 16B (so they live in the second half
+ * of the register).
+ *
+ * We work around this issue by moving any such sources to a temporary
+ * starting at offset 0B. We want to do this after the main optimization loop
+ * to prevent copy-propagation and friends to undo the fix.
+ */
+void
+fs_visitor::fixup_hf_mad()
+{
+   if (devinfo->gen > 8)
+      return;
+
+   bool progress = false;
+
+   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
+      if (inst->opcode != BRW_OPCODE_MAD ||
+          inst->dst.type != BRW_REGISTER_TYPE_HF ||
+          inst->exec_size > 8)
+         continue;
+
+      for (int i = 0; i < 3; i++) {
+         if (inst->src[i].offset > 0) {
+            assert(inst->src[i].type == BRW_REGISTER_TYPE_HF);
+            const fs_builder ibld =
+               bld.at(block, inst).exec_all().group(inst->exec_size, 0);
+            fs_reg tmp = ibld.vgrf(inst->src[i].type);
+            ibld.MOV(tmp, inst->src[i]);
+            inst->src[i] = tmp;
+            progress = true;
+         }
+      }
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+}
+
 /**
  * Three source instruction must have a GRF/MRF destination register.
  * ARF NULL is not allowed.  Fix that up by allocating a temporary GRF.
@@ -6607,6 +6649,7 @@  fs_visitor::run_vs()
    assign_curb_setup();
    assign_vs_urb_setup();
 
+   fixup_hf_mad();
    fixup_3src_null_dest();
    allocate_registers(8, true);
 
@@ -6691,6 +6734,7 @@  fs_visitor::run_tcs_single_patch()
    assign_curb_setup();
    assign_tcs_single_patch_urb_setup();
 
+   fixup_hf_mad();
    fixup_3src_null_dest();
    allocate_registers(8, true);
 
@@ -6725,6 +6769,7 @@  fs_visitor::run_tes()
    assign_curb_setup();
    assign_tes_urb_setup();
 
+   fixup_hf_mad();
    fixup_3src_null_dest();
    allocate_registers(8, true);
 
@@ -6774,6 +6819,7 @@  fs_visitor::run_gs()
    assign_curb_setup();
    assign_gs_urb_setup();
 
+   fixup_hf_mad();
    fixup_3src_null_dest();
    allocate_registers(8, true);
 
@@ -6874,6 +6920,7 @@  fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
 
       assign_urb_setup();
 
+      fixup_hf_mad();
       fixup_3src_null_dest();
       allocate_registers(8, allow_spilling);
 
@@ -6918,6 +6965,7 @@  fs_visitor::run_cs(unsigned min_dispatch_width)
 
    assign_curb_setup();
 
+   fixup_hf_mad();
    fixup_3src_null_dest();
    allocate_registers(min_dispatch_width, true);
 
diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index 163c0008820..f79f8554fb9 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -103,6 +103,7 @@  public:
    void setup_vs_payload();
    void setup_gs_payload();
    void setup_cs_payload();
+   void fixup_hf_mad();
    void fixup_3src_null_dest();
    void assign_curb_setup();
    void calculate_urb_setup();

Comments

On Wed, Dec 19, 2018 at 12:51:03PM +0100, Iago Toral Quiroga wrote:
> Broadwell hardware has a bug that manifests in SIMD8 executions of
> 16-bit MAD instructions when any of the sources is a Y or W component.
> We pack these components in the same SIMD register as components X and
> Z respectively, but starting at offset 16B (so they live in the second
> half of the register). The problem does not exist in SKL or later.
> 
> We work around this issue by moving any such sources to a temporary
> starting at offset 0B. We want to do this after the main optimization loop
> to prevent copy-propagation and friends to undo the fix.
> ---
>  src/intel/compiler/brw_fs.cpp | 48 +++++++++++++++++++++++++++++++++++
>  src/intel/compiler/brw_fs.h   |  1 +
>  2 files changed, 49 insertions(+)
> 
> diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
> index 933b0b6ffc4..1343c2f4993 100644
> --- a/src/intel/compiler/brw_fs.cpp
> +++ b/src/intel/compiler/brw_fs.cpp
> @@ -6449,6 +6449,48 @@ fs_visitor::optimize()
>     validate();
>  }
>  
> +/**
> + * Broadwell hardware has a bug that manifests in SIMD8 executions of 16-bit
> + * MAD instructions when any of the sources is a Y or W component. We pack
> + * these components in the same SIMD register as components X and Z
> + * respectively, but starting at offset 16B (so they live in the second half
> + * of the register).
> + *
> + * We work around this issue by moving any such sources to a temporary
> + * starting at offset 0B. We want to do this after the main optimization loop
> + * to prevent copy-propagation and friends to undo the fix.
> + */
> +void
> +fs_visitor::fixup_hf_mad()
> +{
> +   if (devinfo->gen > 8)

We don't want to run this for gen < 8 either as it would iterate the
instructions in vain. So just:

      if (devinfo->gen == 8)

Otherwise:

Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>

> +      return;
> +
> +   bool progress = false;
> +
> +   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
> +      if (inst->opcode != BRW_OPCODE_MAD ||
> +          inst->dst.type != BRW_REGISTER_TYPE_HF ||
> +          inst->exec_size > 8)
> +         continue;
> +
> +      for (int i = 0; i < 3; i++) {
> +         if (inst->src[i].offset > 0) {
> +            assert(inst->src[i].type == BRW_REGISTER_TYPE_HF);
> +            const fs_builder ibld =
> +               bld.at(block, inst).exec_all().group(inst->exec_size, 0);
> +            fs_reg tmp = ibld.vgrf(inst->src[i].type);
> +            ibld.MOV(tmp, inst->src[i]);
> +            inst->src[i] = tmp;
> +            progress = true;
> +         }
> +      }
> +   }
> +
> +   if (progress)
> +      invalidate_live_intervals();
> +}
> +
>  /**
>   * Three source instruction must have a GRF/MRF destination register.
>   * ARF NULL is not allowed.  Fix that up by allocating a temporary GRF.
> @@ -6607,6 +6649,7 @@ fs_visitor::run_vs()
>     assign_curb_setup();
>     assign_vs_urb_setup();
>  
> +   fixup_hf_mad();
>     fixup_3src_null_dest();
>     allocate_registers(8, true);
>  
> @@ -6691,6 +6734,7 @@ fs_visitor::run_tcs_single_patch()
>     assign_curb_setup();
>     assign_tcs_single_patch_urb_setup();
>  
> +   fixup_hf_mad();
>     fixup_3src_null_dest();
>     allocate_registers(8, true);
>  
> @@ -6725,6 +6769,7 @@ fs_visitor::run_tes()
>     assign_curb_setup();
>     assign_tes_urb_setup();
>  
> +   fixup_hf_mad();
>     fixup_3src_null_dest();
>     allocate_registers(8, true);
>  
> @@ -6774,6 +6819,7 @@ fs_visitor::run_gs()
>     assign_curb_setup();
>     assign_gs_urb_setup();
>  
> +   fixup_hf_mad();
>     fixup_3src_null_dest();
>     allocate_registers(8, true);
>  
> @@ -6874,6 +6920,7 @@ fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
>  
>        assign_urb_setup();
>  
> +      fixup_hf_mad();
>        fixup_3src_null_dest();
>        allocate_registers(8, allow_spilling);
>  
> @@ -6918,6 +6965,7 @@ fs_visitor::run_cs(unsigned min_dispatch_width)
>  
>     assign_curb_setup();
>  
> +   fixup_hf_mad();
>     fixup_3src_null_dest();
>     allocate_registers(min_dispatch_width, true);
>  
> diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
> index 163c0008820..f79f8554fb9 100644
> --- a/src/intel/compiler/brw_fs.h
> +++ b/src/intel/compiler/brw_fs.h
> @@ -103,6 +103,7 @@ public:
>     void setup_vs_payload();
>     void setup_gs_payload();
>     void setup_cs_payload();
> +   void fixup_hf_mad();
>     void fixup_3src_null_dest();
>     void assign_curb_setup();
>     void calculate_urb_setup();
> -- 
> 2.17.1
> 
> _______________________________________________
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
On Wed, 2019-01-02 at 11:46 +0200, Pohjolainen, Topi wrote:
> On Wed, Dec 19, 2018 at 12:51:03PM +0100, Iago Toral Quiroga wrote:
> > Broadwell hardware has a bug that manifests in SIMD8 executions of
> > 16-bit MAD instructions when any of the sources is a Y or W
> > component.
> > We pack these components in the same SIMD register as components X
> > and
> > Z respectively, but starting at offset 16B (so they live in the
> > second
> > half of the register). The problem does not exist in SKL or later.
> > 
> > We work around this issue by moving any such sources to a temporary
> > starting at offset 0B. We want to do this after the main
> > optimization loop
> > to prevent copy-propagation and friends to undo the fix.
> > ---
> >  src/intel/compiler/brw_fs.cpp | 48
> > +++++++++++++++++++++++++++++++++++
> >  src/intel/compiler/brw_fs.h   |  1 +
> >  2 files changed, 49 insertions(+)
> > 
> > diff --git a/src/intel/compiler/brw_fs.cpp
> > b/src/intel/compiler/brw_fs.cpp
> > index 933b0b6ffc4..1343c2f4993 100644
> > --- a/src/intel/compiler/brw_fs.cpp
> > +++ b/src/intel/compiler/brw_fs.cpp
> > @@ -6449,6 +6449,48 @@ fs_visitor::optimize()
> >     validate();
> >  }
> >  
> > +/**
> > + * Broadwell hardware has a bug that manifests in SIMD8 executions
> > of 16-bit
> > + * MAD instructions when any of the sources is a Y or W component.
> > We pack
> > + * these components in the same SIMD register as components X and
> > Z
> > + * respectively, but starting at offset 16B (so they live in the
> > second half
> > + * of the register).
> > + *
> > + * We work around this issue by moving any such sources to a
> > temporary
> > + * starting at offset 0B. We want to do this after the main
> > optimization loop
> > + * to prevent copy-propagation and friends to undo the fix.
> > + */
> > +void
> > +fs_visitor::fixup_hf_mad()
> > +{
> > +   if (devinfo->gen > 8)
> 
> We don't want to run this for gen < 8 either as it would iterate the
> instructions in vain. So just:
> 
>       if (devinfo->gen == 8)

Right, good point.

It should be "if (devinfo->gen != 8) though. I'll fix this, thanks!

> Otherwise:
> 
> Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
> 
> > +      return;
> > +
> > +   bool progress = false;
> > +
> > +   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
> > +      if (inst->opcode != BRW_OPCODE_MAD ||
> > +          inst->dst.type != BRW_REGISTER_TYPE_HF ||
> > +          inst->exec_size > 8)
> > +         continue;
> > +
> > +      for (int i = 0; i < 3; i++) {
> > +         if (inst->src[i].offset > 0) {
> > +            assert(inst->src[i].type == BRW_REGISTER_TYPE_HF);
> > +            const fs_builder ibld =
> > +               bld.at(block, inst).exec_all().group(inst-
> > >exec_size, 0);
> > +            fs_reg tmp = ibld.vgrf(inst->src[i].type);
> > +            ibld.MOV(tmp, inst->src[i]);
> > +            inst->src[i] = tmp;
> > +            progress = true;
> > +         }
> > +      }
> > +   }
> > +
> > +   if (progress)
> > +      invalidate_live_intervals();
> > +}
> > +
> >  /**
> >   * Three source instruction must have a GRF/MRF destination
> > register.
> >   * ARF NULL is not allowed.  Fix that up by allocating a temporary
> > GRF.
> > @@ -6607,6 +6649,7 @@ fs_visitor::run_vs()
> >     assign_curb_setup();
> >     assign_vs_urb_setup();
> >  
> > +   fixup_hf_mad();
> >     fixup_3src_null_dest();
> >     allocate_registers(8, true);
> >  
> > @@ -6691,6 +6734,7 @@ fs_visitor::run_tcs_single_patch()
> >     assign_curb_setup();
> >     assign_tcs_single_patch_urb_setup();
> >  
> > +   fixup_hf_mad();
> >     fixup_3src_null_dest();
> >     allocate_registers(8, true);
> >  
> > @@ -6725,6 +6769,7 @@ fs_visitor::run_tes()
> >     assign_curb_setup();
> >     assign_tes_urb_setup();
> >  
> > +   fixup_hf_mad();
> >     fixup_3src_null_dest();
> >     allocate_registers(8, true);
> >  
> > @@ -6774,6 +6819,7 @@ fs_visitor::run_gs()
> >     assign_curb_setup();
> >     assign_gs_urb_setup();
> >  
> > +   fixup_hf_mad();
> >     fixup_3src_null_dest();
> >     allocate_registers(8, true);
> >  
> > @@ -6874,6 +6920,7 @@ fs_visitor::run_fs(bool allow_spilling, bool
> > do_rep_send)
> >  
> >        assign_urb_setup();
> >  
> > +      fixup_hf_mad();
> >        fixup_3src_null_dest();
> >        allocate_registers(8, allow_spilling);
> >  
> > @@ -6918,6 +6965,7 @@ fs_visitor::run_cs(unsigned
> > min_dispatch_width)
> >  
> >     assign_curb_setup();
> >  
> > +   fixup_hf_mad();
> >     fixup_3src_null_dest();
> >     allocate_registers(min_dispatch_width, true);
> >  
> > diff --git a/src/intel/compiler/brw_fs.h
> > b/src/intel/compiler/brw_fs.h
> > index 163c0008820..f79f8554fb9 100644
> > --- a/src/intel/compiler/brw_fs.h
> > +++ b/src/intel/compiler/brw_fs.h
> > @@ -103,6 +103,7 @@ public:
> >     void setup_vs_payload();
> >     void setup_gs_payload();
> >     void setup_cs_payload();
> > +   void fixup_hf_mad();
> >     void fixup_3src_null_dest();
> >     void assign_curb_setup();
> >     void calculate_urb_setup();
> > -- 
> > 2.17.1
> > 
> > _______________________________________________
> > mesa-dev mailing list
> > mesa-dev@lists.freedesktop.org
> > https://lists.freedesktop.org/mailman/listinfo/mesa-dev
> 
>