[v4,37/40] intel/compiler: validate region restrictions for mixed float mode

Submitted by Iago Toral Quiroga on Feb. 12, 2019, 11:56 a.m.

Details

Message ID 20190212115607.21467-38-itoral@igalia.com
State New
Headers show
Series "intel: VK_KHR_shader_float16_int8 implementation" ( rev: 11 10 9 8 7 6 ) in Mesa

Not browsing as part of any series.

Commit Message

Iago Toral Quiroga Feb. 12, 2019, 11:56 a.m.
---
 src/intel/compiler/brw_eu_validate.c    | 256 ++++++++++
 src/intel/compiler/test_eu_validate.cpp | 618 ++++++++++++++++++++++++
 2 files changed, 874 insertions(+)

Patch hide | download patch | download mbox

diff --git a/src/intel/compiler/brw_eu_validate.c b/src/intel/compiler/brw_eu_validate.c
index ed9c8fe59dd..a61d4c46e81 100644
--- a/src/intel/compiler/brw_eu_validate.c
+++ b/src/intel/compiler/brw_eu_validate.c
@@ -170,6 +170,13 @@  src1_is_null(const struct gen_device_info *devinfo, const brw_inst *inst)
           brw_inst_src1_da_reg_nr(devinfo, inst) == BRW_ARF_NULL;
 }
 
+static bool
+src0_is_acc(const struct gen_device_info *devinfo, const brw_inst *inst)
+{
+   return brw_inst_src0_reg_file(devinfo, inst) == BRW_ARCHITECTURE_REGISTER_FILE &&
+          brw_inst_src0_da_reg_nr(devinfo, inst) == BRW_ARF_ACCUMULATOR;
+}
+
 static bool
 src0_is_grf(const struct gen_device_info *devinfo, const brw_inst *inst)
 {
@@ -847,6 +854,254 @@  general_restrictions_on_region_parameters(const struct gen_device_info *devinfo,
    return error_msg;
 }
 
+static struct string
+special_restrictions_for_mixed_float_mode(const struct gen_device_info *devinfo,
+                                          const brw_inst *inst)
+{
+   struct string error_msg = { .str = NULL, .len = 0 };
+
+   unsigned opcode = brw_inst_opcode(devinfo, inst);
+   unsigned num_sources = num_sources_from_inst(devinfo, inst);
+   if (num_sources >= 3)
+      return error_msg;
+
+   if (!is_mixed_float(devinfo, inst))
+      return error_msg;
+
+   unsigned exec_size = 1 << brw_inst_exec_size(devinfo, inst);
+   bool is_align16 = brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_16;
+
+   enum brw_reg_type src0_type = brw_inst_src0_type(devinfo, inst);
+   enum brw_reg_type src1_type = brw_inst_src1_type(devinfo, inst);
+   enum brw_reg_type dst_type = brw_inst_dst_type(devinfo, inst);
+
+   unsigned dst_stride = STRIDE(brw_inst_dst_hstride(devinfo, inst));
+   bool dst_is_packed = is_packed(exec_size * dst_stride, exec_size, dst_stride);
+
+   /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
+    * Float Operations:
+    *
+    *    "Indirect addressing on source is not supported when source and
+    *     destination data types are mixed float."
+    *
+    * Indirect addressing is only supported on the first source, so we only
+    * check that.
+    */
+   ERROR_IF(types_are_mixed_float(dst_type, src0_type) &&
+            brw_inst_src0_address_mode(devinfo, inst) != BRW_ADDRESS_DIRECT,
+            "Indirect addressing on source is not supported when source and "
+            "destination data types are mixed float");
+
+   /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
+    * Float Operations:
+    *
+    *    "No SIMD16 in mixed mode when destination is f32. Instruction
+    *     execution size must be no more than 8."
+    */
+   ERROR_IF(exec_size > 8 && dst_type == BRW_REGISTER_TYPE_F,
+            "Mixed float mode with 32-bit float destination is limited "
+            "to SIMD8");
+
+   if (is_align16) {
+      /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
+       * Float Operations:
+       *
+       *   "In Align16 mode, when half float and float data types are mixed
+       *    between source operands OR between source and destination operands,
+       *    the register content are assumed to be packed."
+       *
+       * Since Align16 doesn't have a concept of horizontal stride (or width),
+       * it means that vertical stride must always be 4, since 0 and 2 would
+       * lead to replicated data, and any other value is disallowed in Align16.
+       * However, the PRM also says:
+       *
+       *   "In Align16, vertical stride can never be zero for f16"
+       *
+       * Which is oddly redundant and specific considering the more general
+       * assumption that all operands are assumed to be packed, so we
+       * understand that this might be hinting that there may be an exception
+       * for f32 operands with a vstride of 0, so we don't validate this for
+       * them while we don't have empirical evidence that it is forbidden.
+       */
+      ERROR_IF(brw_inst_src0_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_4 &&
+               (src0_type != BRW_REGISTER_TYPE_F ||
+                brw_inst_src0_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_0),
+               "Align16 mixed float mode assumes packed data (vstride must "
+               "be 4 -or 0 for f32 operands-)");
+
+      ERROR_IF(num_sources >= 2 &&
+               brw_inst_src1_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_4 &&
+               (src1_type != BRW_REGISTER_TYPE_F ||
+                brw_inst_src1_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_0),
+               "Align16 mixed float mode assumes packed data (vstride must "
+               "be 4 -or 0 for f32 operands-)");
+
+      /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
+       * Float Operations:
+       *
+       *   "For Align16 mixed mode, both input and output packed f16 data
+       *    must be oword aligned, no oword crossing in packed f16."
+       *
+       * The previous rule requires that Align16 operands are always packed,
+       * and since there is only one bit for Align16 subnr, which represents
+       * offsets 0B and 16B, this rule is always enforced and we don't need to
+       * validate it.
+       */
+
+      /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
+       * Float Operations:
+       *
+       *    "No SIMD16 in mixed mode when destination is packed f16 for both
+       *     Align1 and Align16."
+       *
+       * And:
+       *
+       *   "In Align16 mode, when half float and float data types are mixed
+       *    between source operands OR between source and destination operands,
+       *    the register content are assumed to be packed."
+       *
+       * Which implies that SIMD16 is not available in Align16. This is further
+       * confirmed by:
+       *
+       *    "For Align16 mixed mode, both input and output packed f16 data
+       *     must be oword aligned, no oword crossing in packed f16"
+       *
+       * Since oword-aligned packed f16 data would cross oword boundaries when
+       * the execution size is larger than 8.
+       */
+      ERROR_IF(exec_size > 8, "Align16 mixed float mode is limited to SIMD8");
+
+      /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
+       * Float Operations:
+       *
+       *    "Math operations for mixed mode:
+       *     - In Align16, only packed format is supported"
+       *
+       * It is not clear what this is restricting since as stated in previous
+       * spec quotes, Align16 always assumes packed data. However, since
+       * we are allowing vstride of 0 on f32, we check again here without that
+       * exception.
+       */
+      if (opcode == BRW_OPCODE_MATH) {
+         ERROR_IF(brw_inst_src0_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_4,
+                  "Align16 mixed float mode math only supports packed format");
+
+         ERROR_IF(num_sources >= 2 &&
+                  brw_inst_src1_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_4,
+                  "Align16 mixed float mode math only supports packed format");
+      }
+
+      /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
+       * Float Operations:
+       *
+       *    "No accumulator read access for Align16 mixed float."
+       *
+       * Explicit accumulator reads are only supported on the first source,
+       * so we only check that.
+       */
+      ERROR_IF(src0_is_acc(devinfo, inst),
+               "No accumulator read access for Align16 mixed float");
+   } else {
+      assert(!is_align16);
+
+      /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
+       * Float Operations:
+       *
+       *    "No SIMD16 in mixed mode when destination is packed f16 for both
+       *     Align1 and Align16."
+       */
+      ERROR_IF(exec_size > 8 && dst_is_packed &&
+               dst_type == BRW_REGISTER_TYPE_HF,
+               "Align1 mixed float mode is limited to SIMD8 when destination "
+               "is packed half-float");
+
+      /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
+       * Float Operations:
+       *
+       *    "Math operations for mixed mode:
+       *     - In Align1, f16 inputs need to be strided"
+       */
+      if (opcode == BRW_OPCODE_MATH) {
+         if (src0_type == BRW_REGISTER_TYPE_HF) {
+            ERROR_IF(STRIDE(brw_inst_src0_hstride(devinfo, inst)) <= 1,
+                     "Align1 mixed mode math needs strided half-float inputs");
+         }
+
+         if (num_sources >= 2 && src1_type == BRW_REGISTER_TYPE_HF) {
+            ERROR_IF(STRIDE(brw_inst_src1_hstride(devinfo, inst)) <= 1,
+                     "Align1 mixed mode math needs strided half-float inputs");
+         }
+      }
+
+      /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
+       * Float Operations:
+       *
+       *    "In Align1, destination stride can be smaller than execution
+       *     type. When destination is stride of 1, 16 bit packed data is
+       *     updated on the destination. However, output packed f16 data
+       *     must be oword aligned, no oword crossing in packed f16."
+       *
+       * The requirement of not crossing oword boundaries for 16-bit oword
+       * aligned data means that execution size is limited to 8.
+       */
+      if (dst_type == BRW_REGISTER_TYPE_HF && dst_stride == 1) {
+         unsigned subreg;
+         if (brw_inst_dst_address_mode(devinfo, inst) == BRW_ADDRESS_DIRECT)
+            subreg = brw_inst_dst_da1_subreg_nr(devinfo, inst);
+         else
+            subreg = brw_inst_dst_ia_subreg_nr(devinfo, inst);
+         ERROR_IF(subreg % 16 != 0,
+                  "Align1 mixed mode packed half-float output must be "
+                  "oword aligned");
+         ERROR_IF(exec_size > 8,
+                  "Align1 mixed mode packed half-float output must not "
+                  "cross oword boundaries (max exec size is 8)");
+      }
+
+      /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
+       * Float Operations:
+       *
+       *    "When source is float or half float from accumulator register and
+       *     destination is half float with a stride of 1, the source must
+       *     register aligned. i.e., source must have offset zero."
+       *
+       * Align16 mixed float mode doesn't allow accumulator access on sources,
+       * so we only need to check this for Align1. Also, explicit accumulator
+       * reads are only supported on the first source, so we only check that.
+       */
+      if (dst_type == BRW_REGISTER_TYPE_HF && dst_stride == 1) {
+         if (src0_is_acc(devinfo, inst) &&
+             (src0_type == BRW_REGISTER_TYPE_F ||
+              src0_type == BRW_REGISTER_TYPE_HF)) {
+            ERROR_IF(brw_inst_src0_da1_subreg_nr(devinfo, inst) != 0,
+                     "Mixed float mode requires register-aligned accumulator "
+                     "source reads when destination is packed half-float");
+         }
+      }
+
+      /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
+       * Float Operations:
+       *
+       *    "No swizzle is allowed when an accumulator is used as an implicit
+       *     source or an explicit source in an instruction. i.e. when
+       *     destination is half float with an implicit accumulator source,
+       *     destination stride needs to be 2."
+       *
+       * FIXME: it is not quite clear what the first sentence actually means
+       *        or its link to the implication described after it, so we only
+       *        validate the explicit implication, which is clearly described.
+       */
+      if (dst_type == BRW_REGISTER_TYPE_HF && opcode == BRW_OPCODE_MAC) {
+         ERROR_IF(dst_stride != 2,
+                  "Mixed float mode with implicit accumulator source and "
+                  "half-float destination requires a stride of 2 on the "
+                  "destination");
+      }
+   }
+
+   return error_msg;
+}
+
 /**
  * Creates an \p access_mask for an \p exec_size, \p element_size, and a region
  *
@@ -1485,6 +1740,7 @@  brw_validate_instructions(const struct gen_device_info *devinfo,
          CHECK(send_restrictions);
          CHECK(general_restrictions_based_on_operand_types);
          CHECK(general_restrictions_on_region_parameters);
+         CHECK(special_restrictions_for_mixed_float_mode);
          CHECK(region_alignment_rules);
          CHECK(vector_immediate_restrictions);
          CHECK(special_requirements_for_handling_double_precision_data_types);
diff --git a/src/intel/compiler/test_eu_validate.cpp b/src/intel/compiler/test_eu_validate.cpp
index 06beb53eb5d..23f2363f19e 100644
--- a/src/intel/compiler/test_eu_validate.cpp
+++ b/src/intel/compiler/test_eu_validate.cpp
@@ -1016,6 +1016,624 @@  TEST_P(validation_test, half_float_conversion)
    clear_instructions(p);
 }
 
+TEST_P(validation_test, mixed_float_source_indirect_addressing)
+{
+   static const struct {
+      enum brw_reg_type dst_type;
+      enum brw_reg_type src0_type;
+      enum brw_reg_type src1_type;
+      bool dst_indirect;
+      bool src0_indirect;
+      bool expected_result;
+   } inst[] = {
+#define INST(dst_type, src0_type, src1_type,                              \
+             dst_indirect, src0_indirect, expected_result)                \
+      {                                                                   \
+         BRW_REGISTER_TYPE_##dst_type,                                    \
+         BRW_REGISTER_TYPE_##src0_type,                                   \
+         BRW_REGISTER_TYPE_##src1_type,                                   \
+         dst_indirect,                                                    \
+         src0_indirect,                                                   \
+         expected_result,                                                 \
+      }
+
+      /* Source and dest are mixed float: indirect src addressing not allowed */
+      INST(HF,  F,  F, false, false, true),
+      INST(HF,  F,  F, true,  false, true),
+      INST(HF,  F,  F, false, true,  false),
+      INST(HF,  F,  F, true,  true,  false),
+      INST( F, HF,  F, false, false, true),
+      INST( F, HF,  F, true,  false, true),
+      INST( F, HF,  F, false, true,  false),
+      INST( F, HF,  F, true,  true,  false),
+
+      /* Source and dest are not mixed float: indirect src addressing allowed */
+      INST(HF, HF,  F, false, false, true),
+      INST(HF, HF,  F, true,  false, true),
+      INST(HF, HF,  F, false, true,  true),
+      INST(HF, HF,  F, true,  true,  true),
+      INST( F,  F, HF, false, false, true),
+      INST( F,  F, HF, true,  false, true),
+      INST( F,  F, HF, false, true,  true),
+      INST( F,  F, HF, true,  true,  true),
+
+#undef INST
+   };
+
+   if (devinfo.gen < 8)
+      return;
+
+   for (unsigned i = 0; i < sizeof(inst) / sizeof(inst[0]); i++) {
+      brw_ADD(p, retype(g0, inst[i].dst_type),
+                 retype(g0, inst[i].src0_type),
+                 retype(g0, inst[i].src1_type));
+
+      brw_inst_set_dst_address_mode(&devinfo, last_inst, inst[i].dst_indirect);
+      brw_inst_set_src0_address_mode(&devinfo, last_inst, inst[i].src0_indirect);
+
+      EXPECT_EQ(inst[i].expected_result, validate(p));
+
+      clear_instructions(p);
+   }
+}
+
+TEST_P(validation_test, mixed_float_align1_simd16)
+{
+   static const struct {
+      unsigned exec_size;
+      enum brw_reg_type dst_type;
+      enum brw_reg_type src0_type;
+      enum brw_reg_type src1_type;
+      unsigned dst_stride;
+      bool expected_result;
+   } inst[] = {
+#define INST(exec_size, dst_type, src0_type, src1_type,                   \
+             dst_stride, expected_result)                                 \
+      {                                                                   \
+         BRW_EXECUTE_##exec_size,                                         \
+         BRW_REGISTER_TYPE_##dst_type,                                    \
+         BRW_REGISTER_TYPE_##src0_type,                                   \
+         BRW_REGISTER_TYPE_##src1_type,                                   \
+         BRW_HORIZONTAL_STRIDE_##dst_stride,                              \
+         expected_result,                                                 \
+      }
+
+      /* No SIMD16 in mixed mode when destination is packed f16 */
+      INST( 8, HF,  F,  F, 1, true),
+      INST( 8, HF,  F, HF, 2, true),
+      INST(16, HF,  F, HF, 1, false),
+      INST(16, HF,  F, HF, 2, true),
+
+      /* No SIMD16 in mixed mode when destination is f32 */
+      INST( 8,  F, HF,  F, 1, true),
+      INST( 8,  F,  F, HF, 1, true),
+      INST(16,  F, HF,  F, 1, false),
+      INST(16,  F,  F, HF, 1, false),
+
+#undef INST
+   };
+
+   if (devinfo.gen < 8)
+      return;
+
+   for (unsigned i = 0; i < sizeof(inst) / sizeof(inst[0]); i++) {
+      brw_ADD(p, retype(g0, inst[i].dst_type),
+                 retype(g0, inst[i].src0_type),
+                 retype(g0, inst[i].src1_type));
+
+      brw_inst_set_exec_size(&devinfo, last_inst, inst[i].exec_size);
+
+      brw_inst_set_dst_hstride(&devinfo, last_inst, inst[i].dst_stride);
+
+      EXPECT_EQ(inst[i].expected_result, validate(p));
+
+      clear_instructions(p);
+   }
+}
+
+TEST_P(validation_test, mixed_float_align1_packed_fp16_dst_acc_read_offset_0)
+{
+   static const struct {
+      enum brw_reg_type dst_type;
+      enum brw_reg_type src0_type;
+      enum brw_reg_type src1_type;
+      unsigned dst_stride;
+      bool read_acc;
+      unsigned subnr;
+      bool expected_result;
+   } inst[] = {
+#define INST(dst_type, src0_type, src1_type, dst_stride,                  \
+             read_acc, subnr, expected_result)                            \
+      {                                                                   \
+         BRW_REGISTER_TYPE_##dst_type,                                    \
+         BRW_REGISTER_TYPE_##src0_type,                                   \
+         BRW_REGISTER_TYPE_##src1_type,                                   \
+         BRW_HORIZONTAL_STRIDE_##dst_stride,                              \
+         read_acc,                                                        \
+         subnr,                                                           \
+         expected_result,                                                 \
+      }
+
+      /* Destination is not packed */
+      INST(HF, HF,  F, 2, true,  0, true),
+      INST(HF, HF,  F, 2, true,  2, true),
+      INST(HF, HF,  F, 2, true,  4, true),
+      INST(HF, HF,  F, 2, true,  8, true),
+      INST(HF, HF,  F, 2, true, 16, true),
+
+      /* Destination is packed, we don't read acc */
+      INST(HF, HF,  F, 1, false,  0, true),
+      INST(HF, HF,  F, 1, false,  2, true),
+      INST(HF, HF,  F, 1, false,  4, true),
+      INST(HF, HF,  F, 1, false,  8, true),
+      INST(HF, HF,  F, 1, false, 16, true),
+
+      /* Destination is packed, we read acc */
+      INST(HF, HF,  F, 1, true,  0, true),
+      INST(HF, HF,  F, 1, true,  2, false),
+      INST(HF, HF,  F, 1, true,  4, false),
+      INST(HF, HF,  F, 1, true,  8, false),
+      INST(HF, HF,  F, 1, true, 16, false),
+
+#undef INST
+   };
+
+   if (devinfo.gen < 8)
+      return;
+
+   for (unsigned i = 0; i < sizeof(inst) / sizeof(inst[0]); i++) {
+      brw_ADD(p, retype(g0, inst[i].dst_type),
+                 retype(inst[i].read_acc ? acc0 : g0, inst[i].src0_type),
+                 retype(g0, inst[i].src1_type));
+
+      brw_inst_set_dst_hstride(&devinfo, last_inst, inst[i].dst_stride);
+
+      brw_inst_set_src0_da1_subreg_nr(&devinfo, last_inst, inst[i].subnr);
+
+      EXPECT_EQ(inst[i].expected_result, validate(p));
+
+      clear_instructions(p);
+   }
+}
+
+TEST_P(validation_test, mixed_float_fp16_dest_with_implicit_acc)
+{
+   static const struct {
+      unsigned exec_size;
+      unsigned opcode;
+      enum brw_reg_type dst_type;
+      enum brw_reg_type src0_type;
+      enum brw_reg_type src1_type;
+      unsigned dst_stride;
+      bool expected_result;
+   } inst[] = {
+#define INST(exec_size, opcode, dst_type, src0_type, src1_type,           \
+             dst_stride, expected_result)                                 \
+      {                                                                   \
+         BRW_EXECUTE_##exec_size,                                         \
+         BRW_OPCODE_##opcode,                                             \
+         BRW_REGISTER_TYPE_##dst_type,                                    \
+         BRW_REGISTER_TYPE_##src0_type,                                   \
+         BRW_REGISTER_TYPE_##src1_type,                                   \
+         BRW_HORIZONTAL_STRIDE_##dst_stride,                              \
+         expected_result,                                                 \
+      }
+
+      /* Packed fp16 dest with implicit acc needs hstride=2 */
+      INST(8, MAC, HF, HF,  F, 1, false),
+      INST(8, MAC, HF, HF,  F, 2, true),
+      INST(8, MAC, HF, HF,  F, 4, false),
+      INST(8, MAC, HF,  F, HF, 1, false),
+      INST(8, MAC, HF,  F, HF, 2, true),
+      INST(8, MAC, HF,  F, HF, 4, false),
+
+      /* If destination is not fp16, restriction doesn't apply */
+      INST(8, MAC,  F, HF,  F, 1, true),
+      INST(8, MAC,  F, HF,  F, 2, true),
+
+      /* If there is no implicit acc, restriction doesn't apply */
+      INST(8, ADD, HF, HF,  F, 1, true),
+      INST(8, ADD, HF, HF,  F, 2, true),
+      INST(8, ADD, HF, HF,  F, 4, true),
+      INST(8, ADD, HF,  F, HF, 1, true),
+      INST(8, ADD, HF,  F, HF, 2, true),
+      INST(8, ADD, HF,  F, HF, 4, true),
+      INST(8, ADD,  F, HF,  F, 1, true),
+      INST(8, ADD,  F, HF,  F, 2, true),
+
+#undef INST
+   };
+
+   if (devinfo.gen < 8)
+      return;
+
+   for (unsigned i = 0; i < sizeof(inst) / sizeof(inst[0]); i++) {
+      if (inst[i].opcode == BRW_OPCODE_MAC) {
+         brw_MAC(p, retype(g0, inst[i].dst_type),
+                    retype(g0, inst[i].src0_type),
+                    retype(g0, inst[i].src1_type));
+      } else {
+         assert(inst[i].opcode == BRW_OPCODE_ADD);
+         brw_ADD(p, retype(g0, inst[i].dst_type),
+                    retype(g0, inst[i].src0_type),
+                    retype(g0, inst[i].src1_type));
+      }
+
+      brw_inst_set_exec_size(&devinfo, last_inst, inst[i].exec_size);
+
+      brw_inst_set_dst_hstride(&devinfo, last_inst, inst[i].dst_stride);
+
+      EXPECT_EQ(inst[i].expected_result, validate(p));
+
+      clear_instructions(p);
+   }
+}
+
+TEST_P(validation_test, mixed_float_align1_math_strided_fp16_inputs)
+{
+   static const struct {
+      enum brw_reg_type dst_type;
+      enum brw_reg_type src0_type;
+      enum brw_reg_type src1_type;
+      unsigned dst_stride;
+      unsigned src0_stride;
+      unsigned src1_stride;
+      bool expected_result;
+   } inst[] = {
+#define INST(dst_type, src0_type, src1_type,                              \
+             dst_stride, src0_stride, src1_stride, expected_result)       \
+      {                                                                   \
+         BRW_REGISTER_TYPE_##dst_type,                                    \
+         BRW_REGISTER_TYPE_##src0_type,                                   \
+         BRW_REGISTER_TYPE_##src1_type,                                   \
+         BRW_HORIZONTAL_STRIDE_##dst_stride,                              \
+         BRW_HORIZONTAL_STRIDE_##src0_stride,                             \
+         BRW_HORIZONTAL_STRIDE_##src1_stride,                             \
+         expected_result,                                                 \
+      }
+
+      INST(HF, HF,  F, 2, 2, 1, true),
+      INST(HF,  F, HF, 2, 1, 2, true),
+      INST(HF,  F, HF, 1, 1, 2, true),
+      INST(HF,  F, HF, 2, 1, 1, false),
+      INST(HF, HF,  F, 2, 1, 1, false),
+      INST(HF, HF,  F, 1, 1, 1, false),
+      INST(HF, HF,  F, 2, 1, 1, false),
+      INST( F, HF,  F, 1, 1, 1, false),
+      INST( F,  F, HF, 1, 1, 2, true),
+      INST( F, HF, HF, 1, 2, 1, false),
+      INST( F, HF, HF, 1, 2, 2, true),
+
+#undef INST
+   };
+
+   /* No half-float math in gen8 */
+   if (devinfo.gen < 9)
+      return;
+
+   for (unsigned i = 0; i < sizeof(inst) / sizeof(inst[0]); i++) {
+      gen6_math(p, retype(g0, inst[i].dst_type),
+                   BRW_MATH_FUNCTION_POW,
+                   retype(g0, inst[i].src0_type),
+                   retype(g0, inst[i].src1_type));
+
+      brw_inst_set_dst_hstride(&devinfo, last_inst, inst[i].dst_stride);
+
+      brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
+      brw_inst_set_src0_width(&devinfo, last_inst, BRW_WIDTH_4);
+      brw_inst_set_src0_hstride(&devinfo, last_inst, inst[i].src0_stride);
+
+      brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
+      brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_4);
+      brw_inst_set_src1_hstride(&devinfo, last_inst, inst[i].src1_stride);
+
+      EXPECT_EQ(inst[i].expected_result, validate(p));
+
+      clear_instructions(p);
+   }
+}
+
+TEST_P(validation_test, mixed_float_align1_packed_fp16_dst)
+{
+   static const struct {
+      unsigned exec_size;
+      enum brw_reg_type dst_type;
+      enum brw_reg_type src0_type;
+      enum brw_reg_type src1_type;
+      unsigned dst_stride;
+      unsigned dst_subnr;
+      bool expected_result;
+   } inst[] = {
+#define INST(exec_size, dst_type, src0_type, src1_type,                   \
+             dst_stride, dst_subnr, expected_result)                      \
+      {                                                                   \
+         BRW_EXECUTE_##exec_size,                                         \
+         BRW_REGISTER_TYPE_##dst_type,                                    \
+         BRW_REGISTER_TYPE_##src0_type,                                   \
+         BRW_REGISTER_TYPE_##src1_type,                                   \
+         BRW_HORIZONTAL_STRIDE_##dst_stride,                              \
+         dst_subnr,                                                       \
+         expected_result,                                                 \
+      }
+
+      /* SIMD8 packed fp16 dst won't cross oword boundaries if region is
+       * oword-aligned
+       */
+      INST( 8, HF, HF,  F, 1,  0, true),
+      INST( 8, HF, HF,  F, 1,  2, false),
+      INST( 8, HF, HF,  F, 1,  4, false),
+      INST( 8, HF, HF,  F, 1,  8, false),
+      INST( 8, HF, HF,  F, 1, 16, true),
+
+      /* SIMD16 packed fp16 always crosses oword boundaries */
+      INST(16, HF, HF,  F, 1,  0, false),
+      INST(16, HF, HF,  F, 1,  2, false),
+      INST(16, HF, HF,  F, 1,  4, false),
+      INST(16, HF, HF,  F, 1,  8, false),
+      INST(16, HF, HF,  F, 1, 16, false),
+
+      /* If destination is not packed (or not fp16) we can cross oword
+       * boundaries
+       */
+      INST( 8, HF, HF,  F, 2,  0, true),
+      INST( 8,  F, HF,  F, 1,  0, true),
+
+#undef INST
+   };
+
+   if (devinfo.gen < 8)
+      return;
+
+   for (unsigned i = 0; i < sizeof(inst) / sizeof(inst[0]); i++) {
+      brw_ADD(p, retype(g0, inst[i].dst_type),
+                 retype(g0, inst[i].src0_type),
+                 retype(g0, inst[i].src1_type));
+
+      brw_inst_set_dst_hstride(&devinfo, last_inst, inst[i].dst_stride);
+      brw_inst_set_dst_da1_subreg_nr(&devinfo, last_inst, inst[i].dst_subnr);
+
+      brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
+      brw_inst_set_src0_width(&devinfo, last_inst, BRW_WIDTH_4);
+      brw_inst_set_src0_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_1);
+
+      brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
+      brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_4);
+      brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_1);
+
+      brw_inst_set_exec_size(&devinfo, last_inst, inst[i].exec_size);
+
+      EXPECT_EQ(inst[i].expected_result, validate(p));
+
+      clear_instructions(p);
+   }
+}
+
+TEST_P(validation_test, mixed_float_align16_packed_data)
+{
+   static const struct {
+      enum brw_reg_type dst_type;
+      enum brw_reg_type src0_type;
+      enum brw_reg_type src1_type;
+      unsigned src0_vstride;
+      unsigned src1_vstride;
+      bool expected_result;
+   } inst[] = {
+#define INST(dst_type, src0_type, src1_type,                              \
+             src0_vstride, src1_vstride, expected_result)                 \
+      {                                                                   \
+         BRW_REGISTER_TYPE_##dst_type,                                    \
+         BRW_REGISTER_TYPE_##src0_type,                                   \
+         BRW_REGISTER_TYPE_##src1_type,                                   \
+         BRW_VERTICAL_STRIDE_##src0_vstride,                              \
+         BRW_VERTICAL_STRIDE_##src1_vstride,                              \
+         expected_result,                                                 \
+      }
+
+      INST(HF,  F, HF, 4, 4, true),
+      INST(HF,  F, HF, 2, 4, false),
+      INST(HF,  F, HF, 4, 2, false),
+      INST(HF,  F, HF, 0, 4, true),
+      INST(HF,  F, HF, 4, 0, false),
+      INST(HF, HF,  F, 4, 4, true),
+      INST(HF, HF,  F, 4, 2, false),
+      INST(HF, HF,  F, 2, 4, false),
+      INST(HF, HF,  F, 0, 4, false),
+      INST(HF, HF,  F, 4, 0, true),
+      INST( F, HF,  F, 4, 0, true),
+      INST( F, HF, HF, 4, 4, true),
+      INST( F,  F, HF, 4, 0, false),
+      INST( F,  F, HF, 2, 4, false),
+      INST( F,  F, HF, 4, 2, false),
+      INST( F, HF, HF, 0, 4, false),
+
+#undef INST
+   };
+
+   if (devinfo.gen < 8 || devinfo.gen >= 11)
+      return;
+
+   brw_set_default_access_mode(p, BRW_ALIGN_16);
+
+   for (unsigned i = 0; i < sizeof(inst) / sizeof(inst[0]); i++) {
+      brw_ADD(p, retype(g0, inst[i].dst_type),
+                 retype(g0, inst[i].src0_type),
+                 retype(g0, inst[i].src1_type));
+
+      brw_inst_set_src0_vstride(&devinfo, last_inst, inst[i].src0_vstride);
+      brw_inst_set_src1_vstride(&devinfo, last_inst, inst[i].src1_vstride);
+
+      EXPECT_EQ(inst[i].expected_result, validate(p));
+
+      clear_instructions(p);
+   }
+}
+
+TEST_P(validation_test, mixed_float_align16_no_simd16)
+{
+   static const struct {
+      unsigned exec_size;
+      enum brw_reg_type dst_type;
+      enum brw_reg_type src0_type;
+      enum brw_reg_type src1_type;
+      bool expected_result;
+   } inst[] = {
+#define INST(exec_size, dst_type, src0_type, src1_type, expected_result)  \
+      {                                                                   \
+         BRW_EXECUTE_##exec_size,                                         \
+         BRW_REGISTER_TYPE_##dst_type,                                    \
+         BRW_REGISTER_TYPE_##src0_type,                                   \
+         BRW_REGISTER_TYPE_##src1_type,                                   \
+         expected_result,                                                 \
+      }
+
+      INST( 8, HF,  F, HF, true),
+      INST( 8, HF, HF,  F, true),
+      INST( 8, HF,  F,  F, true),
+      INST( 8,  F,  F, HF, true),
+      INST( 8,  F, HF,  F, true),
+      INST( 8,  F,  F, HF, true),
+      INST(16, HF,  F, HF, false),
+      INST(16, HF, HF,  F, false),
+      INST(16, HF,  F,  F, false),
+      INST(16,  F,  F, HF, false),
+      INST(16,  F, HF,  F, false),
+      INST(16,  F,  F, HF, false),
+
+#undef INST
+   };
+
+   if (devinfo.gen < 8 || devinfo.gen >= 11)
+      return;
+
+   brw_set_default_access_mode(p, BRW_ALIGN_16);
+
+   for (unsigned i = 0; i < sizeof(inst) / sizeof(inst[0]); i++) {
+      brw_ADD(p, retype(g0, inst[i].dst_type),
+                 retype(g0, inst[i].src0_type),
+                 retype(g0, inst[i].src1_type));
+
+      brw_inst_set_exec_size(&devinfo, last_inst, inst[i].exec_size);
+
+      brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
+      brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
+
+      EXPECT_EQ(inst[i].expected_result, validate(p));
+
+      clear_instructions(p);
+   }
+}
+
+TEST_P(validation_test, mixed_float_align16_no_acc_read)
+{
+   static const struct {
+      enum brw_reg_type dst_type;
+      enum brw_reg_type src0_type;
+      enum brw_reg_type src1_type;
+      bool read_acc;
+      bool expected_result;
+   } inst[] = {
+#define INST(dst_type, src0_type, src1_type, read_acc, expected_result)   \
+      {                                                                   \
+         BRW_REGISTER_TYPE_##dst_type,                                    \
+         BRW_REGISTER_TYPE_##src0_type,                                   \
+         BRW_REGISTER_TYPE_##src1_type,                                   \
+         read_acc,                                                        \
+         expected_result,                                                 \
+      }
+
+      INST(HF,  F, HF, false, true),
+      INST(HF,  F, HF, true,  false),
+      INST(HF, HF,  F, false, true),
+      INST(HF, HF,  F, true,  false),
+      INST( F,  F, HF, false, true),
+      INST( F,  F, HF, true,  false),
+      INST( F, HF,  F, false, true),
+      INST( F, HF,  F, true,  false),
+
+#undef INST
+   };
+
+   if (devinfo.gen < 8 || devinfo.gen >= 11)
+      return;
+
+   brw_set_default_access_mode(p, BRW_ALIGN_16);
+
+   for (unsigned i = 0; i < sizeof(inst) / sizeof(inst[0]); i++) {
+      brw_ADD(p, retype(g0, inst[i].dst_type),
+                 retype(inst[i].read_acc ? acc0 : g0, inst[i].src0_type),
+                 retype(g0, inst[i].src1_type));
+
+      brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
+      brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
+
+      EXPECT_EQ(inst[i].expected_result, validate(p));
+
+      clear_instructions(p);
+   }
+}
+
+TEST_P(validation_test, mixed_float_align16_math_packed_format)
+{
+   static const struct {
+      enum brw_reg_type dst_type;
+      enum brw_reg_type src0_type;
+      enum brw_reg_type src1_type;
+      unsigned src0_vstride;
+      unsigned src1_vstride;
+      bool expected_result;
+   } inst[] = {
+#define INST(dst_type, src0_type, src1_type,                              \
+             src0_vstride, src1_vstride, expected_result)                 \
+      {                                                                   \
+         BRW_REGISTER_TYPE_##dst_type,                                    \
+         BRW_REGISTER_TYPE_##src0_type,                                   \
+         BRW_REGISTER_TYPE_##src1_type,                                   \
+         BRW_VERTICAL_STRIDE_##src0_vstride,                              \
+         BRW_VERTICAL_STRIDE_##src1_vstride,                              \
+         expected_result,                                                 \
+      }
+
+      INST(HF,  F, HF, 4, 4, true),
+      INST(HF,  F, HF, 2, 4, false),
+      INST(HF,  F, HF, 4, 2, false),
+      INST(HF,  F, HF, 0, 4, false),
+      INST(HF,  F, HF, 4, 0, false),
+      INST(HF, HF,  F, 4, 4, true),
+      INST(HF, HF,  F, 4, 2, false),
+      INST(HF, HF,  F, 2, 4, false),
+      INST(HF, HF,  F, 0, 4, false),
+      INST(HF, HF,  F, 4, 0, false),
+      INST( F, HF,  F, 4, 0, false),
+      INST( F, HF, HF, 4, 4, true),
+      INST( F,  F, HF, 4, 0, false),
+      INST( F,  F, HF, 2, 4, false),
+      INST( F,  F, HF, 4, 2, false),
+      INST( F, HF, HF, 0, 4, false),
+
+#undef INST
+   };
+
+   /* Align16 Math for mixed float mode is not supported in gen8 */
+   if (devinfo.gen < 9 || devinfo.gen >= 11)
+      return;
+
+   brw_set_default_access_mode(p, BRW_ALIGN_16);
+
+   for (unsigned i = 0; i < sizeof(inst) / sizeof(inst[0]); i++) {
+      gen6_math(p, retype(g0, inst[i].dst_type),
+                   BRW_MATH_FUNCTION_POW,
+                   retype(g0, inst[i].src0_type),
+                   retype(g0, inst[i].src1_type));
+
+      brw_inst_set_src0_vstride(&devinfo, last_inst, inst[i].src0_vstride);
+      brw_inst_set_src1_vstride(&devinfo, last_inst, inst[i].src1_vstride);
+
+      EXPECT_EQ(inst[i].expected_result, validate(p));
+
+      clear_instructions(p);
+   }
+}
+
 TEST_P(validation_test, vector_immediate_destination_alignment)
 {
    static const struct {