[2/2] Use the Byte Gather after HSW when reading byte/short.

Submitted by Yang, Rong R on June 15, 2015, 6:45 a.m.

Details

Message ID 1434350735-32468-2-git-send-email-rong.r.yang@intel.com
State New
Headers show

Not browsing as part of any series.

Commit Message

Yang, Rong R June 15, 2015, 6:45 a.m.
After HSW, the byte gather's performance issue is gone, so there is no
need to read a dword and extract the byte/short from it.
For multi-dst loads, however, the combined read reduces the address
calculation but requires extracting each dst, so the performance is
probably about the same; those loads still use the old logic.

Signed-off-by: Yang Rong <rong.r.yang@intel.com>
---
 backend/src/backend/gen_insn_selection.cpp | 36 ++++++++++++++++++++++++++++--
 1 file changed, 34 insertions(+), 2 deletions(-)

Patch hide | download patch | download mbox

diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index d63c7e3..d289e8e 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -365,6 +365,8 @@  namespace gbe
     void setLongRegRestrict(bool b) { bLongRegRestrict = b; }
     void setLdMsgOrder(uint32_t type)  { ldMsgOrder = type; }
     uint32_t getLdMsgOrder()  const { return ldMsgOrder; }
+    void setSlowByteGather(bool b) { slowByteGather = b; }
+    bool getSlowByteGather() { return slowByteGather; }
     /*! indicate whether a register is a scalar/uniform register. */
     INLINE bool isPartialWrite(const ir::Register &reg) const {
       return partialWriteRegs.find(reg.value()) != partialWriteRegs.end();
@@ -740,6 +742,7 @@  namespace gbe
     bool bHasLongType;
     bool bLongRegRestrict;
     uint32_t ldMsgOrder;
+    bool slowByteGather;
     INLINE ir::LabelIndex newAuxLabel()
     {
       currAuxLabel++;
@@ -779,7 +782,8 @@  namespace gbe
     curr(ctx.getSimdWidth()), file(ctx.getFunction().getRegisterFile()),
     maxInsnNum(ctx.getFunction().getLargestBlockSize()), dagPool(maxInsnNum),
     stateNum(0), vectorNum(0), bwdCodeGeneration(false), currAuxLabel(ctx.getFunction().labelNum()),
-    bHas32X32Mul(false), bHasLongType(false), bLongRegRestrict(false), ldMsgOrder(LD_MSG_ORDER_IVB)
+    bHas32X32Mul(false), bHasLongType(false), bLongRegRestrict(false), ldMsgOrder(LD_MSG_ORDER_IVB),
+    slowByteGather(false)
   {
     const ir::Function &fn = ctx.getFunction();
     this->regNum = fn.regNum();
@@ -2025,26 +2029,31 @@  namespace gbe
   Selection::Selection(GenContext &ctx) {
     this->blockList = NULL;
     this->opaque = GBE_NEW(Selection::Opaque, ctx);
+    this->opaque->setSlowByteGather(true);
   }
 
   Selection75::Selection75(GenContext &ctx) : Selection(ctx) {
+    this->opaque->setSlowByteGather(false);
   }
 
   Selection8::Selection8(GenContext &ctx) : Selection(ctx) {
     this->opaque->setHas32X32Mul(true);
     this->opaque->setHasLongType(true);
+    this->opaque->setSlowByteGather(false);
   }
 
   SelectionChv::SelectionChv(GenContext &ctx) : Selection(ctx) {
     this->opaque->setHas32X32Mul(true);
     this->opaque->setHasLongType(true);
     this->opaque->setLongRegRestrict(true);
+    this->opaque->setSlowByteGather(false);
   }
 
   Selection9::Selection9(GenContext &ctx) : Selection(ctx) {
     this->opaque->setHas32X32Mul(true);
     this->opaque->setHasLongType(true);
     this->opaque->setLdMsgOrder(LD_MSG_ORDER_SKL);
+    this->opaque->setSlowByteGather(false);
   }
 
   void Selection::Opaque::TYPED_WRITE(GenRegister *msgs, uint32_t msgNum,
@@ -3519,8 +3528,31 @@  namespace gbe
         GBE_ASSERT(insn.getValueNum() == 1);
         const GenRegister value = sel.selReg(insn.getValue(0), insn.getValueType());
         GBE_ASSERT(elemSize == GEN_BYTE_SCATTER_WORD || elemSize == GEN_BYTE_SCATTER_BYTE);
+        if(sel.getSlowByteGather())
+          readByteAsDWord(sel, elemSize, address, value, isUniform, bti);
+        else {
+          GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
+          GenRegister tmpFlag = sel.selReg(sel.reg(ir::FAMILY_WORD, true), ir::TYPE_U16);
+
+          // We need a temporary register if we read bytes or words
+          Register dst = sel.reg(FAMILY_DWORD, isUniform);
+          sel.push();
+            if (isUniform)
+              sel.curr.noMask = 1;
+            sel.BYTE_GATHER(sel.selReg(dst, ir::TYPE_U32), address, elemSize, b, bti.isConst ? NULL : & tmpFlag);
+          sel.pop();
 
-        readByteAsDWord(sel, elemSize, address, value, isUniform, bti);
+          sel.push();
+            if (isUniform) {
+              sel.curr.noMask = 1;
+              sel.curr.execWidth = 1;
+            }
+            if (elemSize == GEN_BYTE_SCATTER_WORD)
+              sel.MOV(GenRegister::retype(value, GEN_TYPE_UW), GenRegister::unpacked_uw(dst));
+            else if (elemSize == GEN_BYTE_SCATTER_BYTE)
+              sel.MOV(GenRegister::retype(value, GEN_TYPE_UB), GenRegister::unpacked_ub(dst));
+          sel.pop();
+        }
       }
     }
 

Comments

Should the unaligned optimization we did in vload/vstore also go away after HSW?

Thanks
Zou Nanhai

> -----Original Message-----

> From: Beignet [mailto:beignet-bounces@lists.freedesktop.org] On Behalf Of

> Yang Rong

> Sent: Monday, June 15, 2015 2:46 PM

> To: beignet@lists.freedesktop.org

> Cc: Yang, Rong R

> Subject: [Beignet] [PATCH 2/2] Use the Byte Gather after HSW when read

> byte/shor.

> 

> After HSW, the byte gather's performance issue has gone, so needn't read

> dword and extract.

> But for multi dst load, the combine reduce the address calc, but need the

> extract the dst, maybe performance is approximate, so still use the old logic.

> 

> Signed-off-by: Yang Rong <rong.r.yang@intel.com>

> ---

>  backend/src/backend/gen_insn_selection.cpp | 36

> ++++++++++++++++++++++++++++--

>  1 file changed, 34 insertions(+), 2 deletions(-)

> 

> diff --git a/backend/src/backend/gen_insn_selection.cpp

> b/backend/src/backend/gen_insn_selection.cpp

> index d63c7e3..d289e8e 100644

> --- a/backend/src/backend/gen_insn_selection.cpp

> +++ b/backend/src/backend/gen_insn_selection.cpp

> @@ -365,6 +365,8 @@ namespace gbe

>      void setLongRegRestrict(bool b) { bLongRegRestrict = b; }

>      void setLdMsgOrder(uint32_t type)  { ldMsgOrder = type; }

>      uint32_t getLdMsgOrder()  const { return ldMsgOrder; }

> +    void setSlowByteGather(bool b) { slowByteGather = b; }

> +    bool getSlowByteGather() { return slowByteGather; }

>      /*! indicate whether a register is a scalar/uniform register. */

>      INLINE bool isPartialWrite(const ir::Register &reg) const {

>        return partialWriteRegs.find(reg.value()) != partialWriteRegs.end();

> @@ -740,6 +742,7 @@ namespace gbe

>      bool bHasLongType;

>      bool bLongRegRestrict;

>      uint32_t ldMsgOrder;

> +    bool slowByteGather;

>      INLINE ir::LabelIndex newAuxLabel()

>      {

>        currAuxLabel++;

> @@ -779,7 +782,8 @@ namespace gbe

>      curr(ctx.getSimdWidth()), file(ctx.getFunction().getRegisterFile()),

>      maxInsnNum(ctx.getFunction().getLargestBlockSize()),

> dagPool(maxInsnNum),

>      stateNum(0), vectorNum(0), bwdCodeGeneration(false),

> currAuxLabel(ctx.getFunction().labelNum()),

> -    bHas32X32Mul(false), bHasLongType(false), bLongRegRestrict(false),

> ldMsgOrder(LD_MSG_ORDER_IVB)

> +    bHas32X32Mul(false), bHasLongType(false), bLongRegRestrict(false),

> ldMsgOrder(LD_MSG_ORDER_IVB),

> +    slowByteGather(false)

>    {

>      const ir::Function &fn = ctx.getFunction();

>      this->regNum = fn.regNum();

> @@ -2025,26 +2029,31 @@ namespace gbe

>    Selection::Selection(GenContext &ctx) {

>      this->blockList = NULL;

>      this->opaque = GBE_NEW(Selection::Opaque, ctx);

> +    this->opaque->setSlowByteGather(true);

>    }

> 

>    Selection75::Selection75(GenContext &ctx) : Selection(ctx) {

> +    this->opaque->setSlowByteGather(false);

>    }

> 

>    Selection8::Selection8(GenContext &ctx) : Selection(ctx) {

>      this->opaque->setHas32X32Mul(true);

>      this->opaque->setHasLongType(true);

> +    this->opaque->setSlowByteGather(false);

>    }

> 

>    SelectionChv::SelectionChv(GenContext &ctx) : Selection(ctx) {

>      this->opaque->setHas32X32Mul(true);

>      this->opaque->setHasLongType(true);

>      this->opaque->setLongRegRestrict(true);

> +    this->opaque->setSlowByteGather(false);

>    }

> 

>    Selection9::Selection9(GenContext &ctx) : Selection(ctx) {

>      this->opaque->setHas32X32Mul(true);

>      this->opaque->setHasLongType(true);

>      this->opaque->setLdMsgOrder(LD_MSG_ORDER_SKL);

> +    this->opaque->setSlowByteGather(false);

>    }

> 

>    void Selection::Opaque::TYPED_WRITE(GenRegister *msgs, uint32_t

> msgNum, @@ -3519,8 +3528,31 @@ namespace gbe

>          GBE_ASSERT(insn.getValueNum() == 1);

>          const GenRegister value = sel.selReg(insn.getValue(0),

> insn.getValueType());

>          GBE_ASSERT(elemSize == GEN_BYTE_SCATTER_WORD || elemSize

> == GEN_BYTE_SCATTER_BYTE);

> +        if(sel.getSlowByteGather())

> +          readByteAsDWord(sel, elemSize, address, value, isUniform, bti);

> +        else {

> +          GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) :

> sel.selReg(bti.reg, ir::TYPE_U32);

> +          GenRegister tmpFlag = sel.selReg(sel.reg(ir::FAMILY_WORD,

> + true), ir::TYPE_U16);

> +

> +          // We need a temporary register if we read bytes or words

> +          Register dst = sel.reg(FAMILY_DWORD, isUniform);

> +          sel.push();

> +            if (isUniform)

> +              sel.curr.noMask = 1;

> +            sel.BYTE_GATHER(sel.selReg(dst, ir::TYPE_U32), address,

> elemSize, b, bti.isConst ? NULL : & tmpFlag);

> +          sel.pop();

> 

> -        readByteAsDWord(sel, elemSize, address, value, isUniform, bti);

> +          sel.push();

> +            if (isUniform) {

> +              sel.curr.noMask = 1;

> +              sel.curr.execWidth = 1;

> +            }

> +            if (elemSize == GEN_BYTE_SCATTER_WORD)

> +              sel.MOV(GenRegister::retype(value, GEN_TYPE_UW),

> GenRegister::unpacked_uw(dst));

> +            else if (elemSize == GEN_BYTE_SCATTER_BYTE)

> +              sel.MOV(GenRegister::retype(value, GEN_TYPE_UB),

> GenRegister::unpacked_ub(dst));

> +          sel.pop();

> +        }

>        }

>      }

> 

> --

> 1.8.3.2

> 

> _______________________________________________

> Beignet mailing list

> Beignet@lists.freedesktop.org

> http://lists.freedesktop.org/mailman/listinfo/beignet
Yes. With the merged vector-load optimization, an aligned byte/short vector load may perform the same as a split load, while for an unaligned byte/short vector load the split load may be better than the merged load.

I will send a new patch to handle unaligned byte/short vector load.

> -----Original Message-----

> From: Zou, Nanhai

> Sent: Tuesday, June 16, 2015 07:01

> To: Yang, Rong R; beignet@lists.freedesktop.org

> Cc: Yang, Rong R

> Subject: RE: [Beignet] [PATCH 2/2] Use the Byte Gather after HSW when

> read byte/shor.

> 

> Should the unaligned optimization we did in vload/vstore also gone after

> HSW?

> 

> Thanks

> Zou Nanhai

> 

> > -----Original Message-----

> > From: Beignet [mailto:beignet-bounces@lists.freedesktop.org] On Behalf

> > Of Yang Rong

> > Sent: Monday, June 15, 2015 2:46 PM

> > To: beignet@lists.freedesktop.org

> > Cc: Yang, Rong R

> > Subject: [Beignet] [PATCH 2/2] Use the Byte Gather after HSW when read

> > byte/shor.

> >

> > After HSW, the byte gather's performance issue has gone, so needn't

> > read dword and extract.

> > But for multi dst load, the combine reduce the address calc, but need

> > the extract the dst, maybe performance is approximate, so still use the old

> logic.

> >

> > Signed-off-by: Yang Rong <rong.r.yang@intel.com>

> > ---

> >  backend/src/backend/gen_insn_selection.cpp | 36

> > ++++++++++++++++++++++++++++--

> >  1 file changed, 34 insertions(+), 2 deletions(-)

> >

> > diff --git a/backend/src/backend/gen_insn_selection.cpp

> > b/backend/src/backend/gen_insn_selection.cpp

> > index d63c7e3..d289e8e 100644

> > --- a/backend/src/backend/gen_insn_selection.cpp

> > +++ b/backend/src/backend/gen_insn_selection.cpp

> > @@ -365,6 +365,8 @@ namespace gbe

> >      void setLongRegRestrict(bool b) { bLongRegRestrict = b; }

> >      void setLdMsgOrder(uint32_t type)  { ldMsgOrder = type; }

> >      uint32_t getLdMsgOrder()  const { return ldMsgOrder; }

> > +    void setSlowByteGather(bool b) { slowByteGather = b; }

> > +    bool getSlowByteGather() { return slowByteGather; }

> >      /*! indicate whether a register is a scalar/uniform register. */

> >      INLINE bool isPartialWrite(const ir::Register &reg) const {

> >        return partialWriteRegs.find(reg.value()) !=

> > partialWriteRegs.end(); @@ -740,6 +742,7 @@ namespace gbe

> >      bool bHasLongType;

> >      bool bLongRegRestrict;

> >      uint32_t ldMsgOrder;

> > +    bool slowByteGather;

> >      INLINE ir::LabelIndex newAuxLabel()

> >      {

> >        currAuxLabel++;

> > @@ -779,7 +782,8 @@ namespace gbe

> >      curr(ctx.getSimdWidth()), file(ctx.getFunction().getRegisterFile()),

> >      maxInsnNum(ctx.getFunction().getLargestBlockSize()),

> > dagPool(maxInsnNum),

> >      stateNum(0), vectorNum(0), bwdCodeGeneration(false),

> > currAuxLabel(ctx.getFunction().labelNum()),

> > -    bHas32X32Mul(false), bHasLongType(false), bLongRegRestrict(false),

> > ldMsgOrder(LD_MSG_ORDER_IVB)

> > +    bHas32X32Mul(false), bHasLongType(false),

> > + bLongRegRestrict(false),

> > ldMsgOrder(LD_MSG_ORDER_IVB),

> > +    slowByteGather(false)

> >    {

> >      const ir::Function &fn = ctx.getFunction();

> >      this->regNum = fn.regNum();

> > @@ -2025,26 +2029,31 @@ namespace gbe

> >    Selection::Selection(GenContext &ctx) {

> >      this->blockList = NULL;

> >      this->opaque = GBE_NEW(Selection::Opaque, ctx);

> > +    this->opaque->setSlowByteGather(true);

> >    }

> >

> >    Selection75::Selection75(GenContext &ctx) : Selection(ctx) {

> > +    this->opaque->setSlowByteGather(false);

> >    }

> >

> >    Selection8::Selection8(GenContext &ctx) : Selection(ctx) {

> >      this->opaque->setHas32X32Mul(true);

> >      this->opaque->setHasLongType(true);

> > +    this->opaque->setSlowByteGather(false);

> >    }

> >

> >    SelectionChv::SelectionChv(GenContext &ctx) : Selection(ctx) {

> >      this->opaque->setHas32X32Mul(true);

> >      this->opaque->setHasLongType(true);

> >      this->opaque->setLongRegRestrict(true);

> > +    this->opaque->setSlowByteGather(false);

> >    }

> >

> >    Selection9::Selection9(GenContext &ctx) : Selection(ctx) {

> >      this->opaque->setHas32X32Mul(true);

> >      this->opaque->setHasLongType(true);

> >      this->opaque->setLdMsgOrder(LD_MSG_ORDER_SKL);

> > +    this->opaque->setSlowByteGather(false);

> >    }

> >

> >    void Selection::Opaque::TYPED_WRITE(GenRegister *msgs, uint32_t

> > msgNum, @@ -3519,8 +3528,31 @@ namespace gbe

> >          GBE_ASSERT(insn.getValueNum() == 1);

> >          const GenRegister value = sel.selReg(insn.getValue(0),

> > insn.getValueType());

> >          GBE_ASSERT(elemSize == GEN_BYTE_SCATTER_WORD || elemSize ==

> > GEN_BYTE_SCATTER_BYTE);

> > +        if(sel.getSlowByteGather())

> > +          readByteAsDWord(sel, elemSize, address, value, isUniform, bti);

> > +        else {

> > +          GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) :

> > sel.selReg(bti.reg, ir::TYPE_U32);

> > +          GenRegister tmpFlag = sel.selReg(sel.reg(ir::FAMILY_WORD,

> > + true), ir::TYPE_U16);

> > +

> > +          // We need a temporary register if we read bytes or words

> > +          Register dst = sel.reg(FAMILY_DWORD, isUniform);

> > +          sel.push();

> > +            if (isUniform)

> > +              sel.curr.noMask = 1;

> > +            sel.BYTE_GATHER(sel.selReg(dst, ir::TYPE_U32), address,

> > elemSize, b, bti.isConst ? NULL : & tmpFlag);

> > +          sel.pop();

> >

> > -        readByteAsDWord(sel, elemSize, address, value, isUniform, bti);

> > +          sel.push();

> > +            if (isUniform) {

> > +              sel.curr.noMask = 1;

> > +              sel.curr.execWidth = 1;

> > +            }

> > +            if (elemSize == GEN_BYTE_SCATTER_WORD)

> > +              sel.MOV(GenRegister::retype(value, GEN_TYPE_UW),

> > GenRegister::unpacked_uw(dst));

> > +            else if (elemSize == GEN_BYTE_SCATTER_BYTE)

> > +              sel.MOV(GenRegister::retype(value, GEN_TYPE_UB),

> > GenRegister::unpacked_ub(dst));

> > +          sel.pop();

> > +        }

> >        }

> >      }

> >

> > --

> > 1.8.3.2

> >

> > _______________________________________________

> > Beignet mailing list

> > Beignet@lists.freedesktop.org

> > http://lists.freedesktop.org/mailman/listinfo/beignet