[V2,2/2] Use the Byte Gather after HSW when reading byte/short.

Submitted by Yang, Rong R on June 24, 2015, 1:58 a.m.

Details

Message ID 1435111106-10385-2-git-send-email-rong.r.yang@intel.com
State New
Headers show

Not browsing as part of any series.

Commit Message

Yang, Rong R June 24, 2015, 1:58 a.m.
After HSW, the byte gather's performance issue is gone, so there is no need
to read a dword and extract the byte/short from it.
For multi-dst loads, however, combining the reads reduces the address
calculation but requires extracting each dst, so the performance is probably
about the same; those loads still use the old logic.

V2: Only HSW benefits from this patch, so only enable it on HSW. Still need to find the root cause.
Signed-off-by: Yang Rong <rong.r.yang@intel.com>
---
 backend/src/backend/gen_insn_selection.cpp | 36 ++++++++++++++++++++++++++++--
 1 file changed, 34 insertions(+), 2 deletions(-)

Patch hide | download patch | download mbox

diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index d63c7e3..181325a 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -365,6 +365,8 @@  namespace gbe
     void setLongRegRestrict(bool b) { bLongRegRestrict = b; }
     void setLdMsgOrder(uint32_t type)  { ldMsgOrder = type; }
     uint32_t getLdMsgOrder()  const { return ldMsgOrder; }
+    void setSlowByteGather(bool b) { slowByteGather = b; }
+    bool getSlowByteGather() { return slowByteGather; }
     /*! indicate whether a register is a scalar/uniform register. */
     INLINE bool isPartialWrite(const ir::Register &reg) const {
       return partialWriteRegs.find(reg.value()) != partialWriteRegs.end();
@@ -740,6 +742,7 @@  namespace gbe
     bool bHasLongType;
     bool bLongRegRestrict;
     uint32_t ldMsgOrder;
+    bool slowByteGather;
     INLINE ir::LabelIndex newAuxLabel()
     {
       currAuxLabel++;
@@ -779,7 +782,8 @@  namespace gbe
     curr(ctx.getSimdWidth()), file(ctx.getFunction().getRegisterFile()),
     maxInsnNum(ctx.getFunction().getLargestBlockSize()), dagPool(maxInsnNum),
     stateNum(0), vectorNum(0), bwdCodeGeneration(false), currAuxLabel(ctx.getFunction().labelNum()),
-    bHas32X32Mul(false), bHasLongType(false), bLongRegRestrict(false), ldMsgOrder(LD_MSG_ORDER_IVB)
+    bHas32X32Mul(false), bHasLongType(false), bLongRegRestrict(false), ldMsgOrder(LD_MSG_ORDER_IVB),
+    slowByteGather(false)
   {
     const ir::Function &fn = ctx.getFunction();
     this->regNum = fn.regNum();
@@ -2025,26 +2029,31 @@  namespace gbe
   Selection::Selection(GenContext &ctx) {
     this->blockList = NULL;
     this->opaque = GBE_NEW(Selection::Opaque, ctx);
+    this->opaque->setSlowByteGather(true);
   }
 
   Selection75::Selection75(GenContext &ctx) : Selection(ctx) {
+    this->opaque->setSlowByteGather(false);
   }
 
   Selection8::Selection8(GenContext &ctx) : Selection(ctx) {
     this->opaque->setHas32X32Mul(true);
     this->opaque->setHasLongType(true);
+    this->opaque->setSlowByteGather(true);
   }
 
   SelectionChv::SelectionChv(GenContext &ctx) : Selection(ctx) {
     this->opaque->setHas32X32Mul(true);
     this->opaque->setHasLongType(true);
     this->opaque->setLongRegRestrict(true);
+    this->opaque->setSlowByteGather(true);
   }
 
   Selection9::Selection9(GenContext &ctx) : Selection(ctx) {
     this->opaque->setHas32X32Mul(true);
     this->opaque->setHasLongType(true);
     this->opaque->setLdMsgOrder(LD_MSG_ORDER_SKL);
+    this->opaque->setSlowByteGather(true);
   }
 
   void Selection::Opaque::TYPED_WRITE(GenRegister *msgs, uint32_t msgNum,
@@ -3519,8 +3528,31 @@  namespace gbe
         GBE_ASSERT(insn.getValueNum() == 1);
         const GenRegister value = sel.selReg(insn.getValue(0), insn.getValueType());
         GBE_ASSERT(elemSize == GEN_BYTE_SCATTER_WORD || elemSize == GEN_BYTE_SCATTER_BYTE);
+        if(sel.getSlowByteGather())
+          readByteAsDWord(sel, elemSize, address, value, isUniform, bti);
+        else {
+          GenRegister b = bti.isConst ? GenRegister::immud(bti.imm) : sel.selReg(bti.reg, ir::TYPE_U32);
+          GenRegister tmpFlag = sel.selReg(sel.reg(ir::FAMILY_WORD, true), ir::TYPE_U16);
+
+          // We need a temporary register if we read bytes or words
+          Register dst = sel.reg(FAMILY_DWORD, isUniform);
+          sel.push();
+            if (isUniform)
+              sel.curr.noMask = 1;
+            sel.BYTE_GATHER(sel.selReg(dst, ir::TYPE_U32), address, elemSize, b, bti.isConst ? NULL : & tmpFlag);
+          sel.pop();
 
-        readByteAsDWord(sel, elemSize, address, value, isUniform, bti);
+          sel.push();
+            if (isUniform) {
+              sel.curr.noMask = 1;
+              sel.curr.execWidth = 1;
+            }
+            if (elemSize == GEN_BYTE_SCATTER_WORD)
+              sel.MOV(GenRegister::retype(value, GEN_TYPE_UW), GenRegister::unpacked_uw(dst));
+            else if (elemSize == GEN_BYTE_SCATTER_BYTE)
+              sel.MOV(GenRegister::retype(value, GEN_TYPE_UB), GenRegister::unpacked_ub(dst));
+          sel.pop();
+        }
       }
     }