[05/13,V3] Backend: Establishing the thread/TID-EUID map.

Submitted by junyan.he@inbox.com on Dec. 10, 2015, 7:04 a.m.

Details

Message ID 1449731065-18807-1-git-send-email-junyan.he@inbox.com
State New
Headers show
Series "Series without cover letter" ( rev: 1 ) in Beignet

Not browsing as part of any series.

Commit Message

junyan.he@inbox.com Dec. 10, 2015, 7:04 a.m.
From: Junyan He <junyan.he@linux.intel.com>

We need to use forward messages to send data and sync
threads within the same work group. The HW lacks the
ability to get the TID and EUID of other threads, so
we need to establish a map for this purpose.

Signed-off-by: Junyan He <junyan.he@linux.intel.com>
---
 backend/src/backend/gen_insn_selection.cpp |  124 +++++++++++++++++++++++++++-
 backend/src/llvm/llvm_gen_backend.cpp      |   35 +++++++-
 2 files changed, 151 insertions(+), 8 deletions(-)

Patch hide | download patch | download mbox

diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index cd7b2eb..a32433b 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -500,6 +500,8 @@  namespace gbe
     DebugInfo DBGInfo;
     /*! To make function prototypes more readable */
     typedef const GenRegister &Reg;
+    /*! If true, the thread map has already been stored */
+    bool storeThreadMap;
 
     /*! Check for destination register. Major purpose is to find
         out partially updated dst registers. These registers will
@@ -809,8 +811,9 @@  namespace gbe
     ctx(ctx), block(NULL),
     curr(ctx.getSimdWidth()), file(ctx.getFunction().getRegisterFile()),
     maxInsnNum(ctx.getFunction().getLargestBlockSize()), dagPool(maxInsnNum),
-    stateNum(0), vectorNum(0), bwdCodeGeneration(false), currAuxLabel(ctx.getFunction().labelNum()),
-    bHas32X32Mul(false), bHasLongType(false), bHasDoubleType(false), bHasHalfType(false), bLongRegRestrict(false),
+    stateNum(0), vectorNum(0), bwdCodeGeneration(false), storeThreadMap(false),
+    currAuxLabel(ctx.getFunction().labelNum()), bHas32X32Mul(false), bHasLongType(false),
+    bHasDoubleType(false), bHasHalfType(false), bLongRegRestrict(false),
     ldMsgOrder(LD_MSG_ORDER_IVB), slowByteGather(false)
   {
     const ir::Function &fn = ctx.getFunction();
@@ -5978,6 +5981,106 @@  extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
   /*! WorkGroup instruction pattern */
   DECL_PATTERN(WorkGroupInstruction)
   {
+    INLINE bool storeThreadID(Selection::Opaque &sel, uint32_t slmAddr) const // Emits the code that byte-scatters this thread's sr0.0 value to address slmAddr + 2 * threadid; always returns true.
+    {
+      using namespace ir;
+      GenRegister sr0_0 = GenRegister::retype(GenRegister::sr(0), GEN_TYPE_UW); // sr(0) viewed as unsigned words; presumably the state register carrying the TID/EUID mentioned in the commit message -- TODO confirm
+      const uint32_t simdWidth = sel.ctx.getSimdWidth();
+      GenRegister tmp; // receives the copy of sr0_0 that gets stored
+      GenRegister addr; // receives the address of this thread's map entry
+      vector<GenRegister> fakeTemps; // handed to BYTE_SCATTER empty
+
+      if (simdWidth == 16) { // SIMD16: WORD-family temporaries, viewed as UD
+        tmp = GenRegister::retype(sel.selReg(sel.reg(FAMILY_WORD), ir::TYPE_U16), GEN_TYPE_UD);
+        addr = GenRegister::retype(sel.selReg(sel.reg(FAMILY_WORD), ir::TYPE_U16), GEN_TYPE_UD);
+      } else { // any other width: DWORD-family temporaries, viewed as UD
+        tmp = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32), GEN_TYPE_UD);
+        addr = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32), GEN_TYPE_UD);
+      }
+
+      sr0_0 = GenRegister::vec1(sr0_0); // use the vec1 form of sr0_0 as the MOV source
+      sel.push(); { // everything in this scope is emitted unpredicated, with noMask, at exec width 8
+        sel.curr.predicate = GEN_PREDICATE_NONE;
+        sel.curr.noMask = 1;
+        sel.curr.execWidth = 8;
+
+        sel.MOV(tmp, sr0_0); // tmp = sr0.0
+
+        sel.MUL(addr, sel.selReg(ocl::threadid, ir::TYPE_U32), GenRegister::immud(2)); // addr = threadid * 2 (two bytes per map entry)
+        sel.ADD(addr, addr, GenRegister::immud(slmAddr)); // addr += slmAddr (base of the map)
+
+        sel.push(); {
+          sel.curr.predicate = GEN_PREDICATE_NONE;
+          sel.curr.noMask = 1;
+          sel.push(); {
+            sel.curr.execWidth = 1;
+            sel.MOV(GenRegister::flag(0, 1), GenRegister::immuw(0x01)); // flag 0.1 = 0x01, i.e. only bit 0 set
+          } sel.pop();
+          sel.curr.flag = 0; // predicate the scatter on flag 0.1 loaded just above
+          sel.curr.subFlag = 1;
+          sel.curr.predicate = GEN_PREDICATE_NORMAL; // presumably restricts the store to channel 0 -- TODO confirm
+          sel.BYTE_SCATTER(addr, tmp, 1, GenRegister::immw(0xfe), fakeTemps); // store tmp at addr; NOTE(review): 0xfe is presumably the SLM binding table index and 1 the element-size selector -- confirm against BYTE_SCATTER
+        } sel.pop();
+      } sel.pop();
+      return true;
+    }
+
+    INLINE GenRegister getNextThreadID(Selection::Opaque &sel, uint32_t slmAddr) const // Emits the code that computes threadid + 1 (reset to 0 when it equals threadn) and byte-gathers the map entry at slmAddr + 2 * that index; returns the register the gather writes.
+    {
+      using namespace ir;
+      const uint32_t simdWidth = sel.ctx.getSimdWidth();
+      GenRegister addr; // receives the address of the next thread's map entry
+      GenRegister nextThread; // receives the wrapped index threadid + 1
+      GenRegister tid; // receives the gathered map entry; this is the return value
+      vector<GenRegister> fakeTemps; // handed to BYTE_GATHER empty
+
+      if (simdWidth == 16) { // SIMD16: WORD-family temporaries, viewed as UD
+        addr = GenRegister::retype(sel.selReg(sel.reg(FAMILY_WORD), ir::TYPE_U16), GEN_TYPE_UD);
+        nextThread = GenRegister::retype(sel.selReg(sel.reg(FAMILY_WORD), ir::TYPE_U16), GEN_TYPE_UD);
+        tid = GenRegister::retype(sel.selReg(sel.reg(FAMILY_WORD), ir::TYPE_U16), GEN_TYPE_UD);
+      } else { // any other width: DWORD-family temporaries selected as U32
+        addr = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
+        nextThread = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
+        tid = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
+      }
+
+      sel.push(); { // everything in this scope is emitted unpredicated, with noMask, at exec width 8
+        sel.curr.execWidth = 8;
+        sel.curr.predicate = GEN_PREDICATE_NONE;
+        sel.curr.noMask = 1;
+        sel.ADD(nextThread, sel.selReg(ocl::threadid, ir::TYPE_U32), GenRegister::immud(1)); // nextThread = threadid + 1
+
+        /* Wrap the next thread id. */
+        sel.push(); {
+          sel.curr.predicate = GEN_PREDICATE_NONE;
+          sel.curr.noMask = 1;
+          sel.curr.flag = 0;
+          sel.curr.subFlag = 1;
+          sel.CMP(GEN_CONDITIONAL_EQ, nextThread, sel.selReg(ocl::threadn, ir::TYPE_U32), GenRegister::null()); // compare nextThread == threadn; result goes to flag 0.1, destination is null
+          sel.curr.predicate = GEN_PREDICATE_NORMAL; // the MOV below is predicated on that comparison
+          sel.MOV(nextThread, GenRegister::immud(0)); // where equal, nextThread = 0
+        } sel.pop();
+
+        sel.MUL(addr, nextThread, GenRegister::immud(2)); // addr = nextThread * 2 (two bytes per map entry)
+        sel.ADD(addr, addr, GenRegister::immud(slmAddr)); // addr += slmAddr (base of the map)
+
+        sel.push(); {
+          sel.curr.predicate = GEN_PREDICATE_NONE;
+          sel.curr.noMask = 1;
+          sel.push(); {
+            sel.curr.execWidth = 1;
+            sel.MOV(GenRegister::flag(0, 1), GenRegister::immuw(0x010)); // flag 0.1 = 0x010 (bit 4); NOTE(review): storeThreadID loads 0x01 (bit 0) for its scatter -- confirm 0x010 is intended and not a typo
+          } sel.pop();
+          sel.curr.flag = 0; // predicate the gather on flag 0.1 loaded just above
+          sel.curr.subFlag = 1;
+          sel.curr.predicate = GEN_PREDICATE_NORMAL;
+          sel.BYTE_GATHER(tid, addr, 1, GenRegister::immw(0xfe), fakeTemps); // load tid from addr; NOTE(review): 0xfe is presumably the SLM binding table index and 1 the element-size selector -- confirm against BYTE_GATHER
+        } sel.pop();
+
+      } sel.pop();
+      return tid;
+    }
+
     INLINE bool emitWGBroadcast(Selection::Opaque &sel, const ir::WorkGroupInstruction &insn) const {
       /*  1. BARRIER    Ensure all the threads have set the correct value for the var which will be broadcasted.
           2. CMP IDs    Compare the local IDs with the specified ones in the function call.
@@ -5993,8 +6096,6 @@  extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
       const uint32_t slmAddr = insn.getSlmAddr();
       GenRegister addr = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);
       vector<GenRegister> fakeTemps;
-      fakeTemps.push_back(GenRegister::null());
-      fakeTemps.push_back(GenRegister::null());
 
       /* Then we insert a barrier to make sure all the var we are interested in
          have been assigned the final value. */
@@ -6053,6 +6154,21 @@  extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp
 
       if (workGroupOp == WORKGROUP_OP_BROADCAST) {
         return emitWGBroadcast(sel, insn);
+      } else if (workGroupOp >= WORKGROUP_OP_REDUCE_ADD && workGroupOp <= WORKGROUP_OP_EXCLUSIVE_MAX) {
+        const uint32_t slmAddr = insn.getSlmAddr();
+        /* First, we create the ThreadID/localID map, in order to find out which thread holds the next 16 workitems. */
+
+        if (!sel.storeThreadMap) {
+          this->storeThreadID(sel, slmAddr);
+          sel.storeThreadMap = true;
+        }
+
+        /* Then we insert a barrier to make sure all the var we are interested in
+           have been assigned the final value. */
+        sel.BARRIER(GenRegister::ud8grf(sel.reg(FAMILY_DWORD)), sel.selReg(sel.reg(FAMILY_DWORD)), syncLocalBarrier);
+
+        /* Third, get the next thread ID which we will Forward MSG to. */
+        GenRegister nextThreadID = getNextThreadID(sel, slmAddr);
       } else {
         GBE_ASSERT(0);
       }
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index add3db4..3ce3c8d 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -3815,6 +3815,20 @@  namespace gbe
       GBE_ASSERT(f.getwgBroadcastSLM() >= 0);
     }
 
+    if (f.gettidMapSLM() < 0 && opcode >= ir::WORKGROUP_OP_REDUCE_ADD && opcode <= ir::WORKGROUP_OP_EXCLUSIVE_MAX) {
+      /* Because we cannot know, before run time, the thread ID and the EUID of
+         every physical thread on which the work items execute, we need to
+         sync the thread execution order when using work group functions. We
+         create the workitems/threadID map table in SLM.
+         By the time we get here, the global thread local vars should all have
+         been allocated, so it is safe for us to steal a piece of SLM for this usage. */
+      uint32_t mapSize = sizeof(uint16_t) * 64;// at most 64 thread for one subslice.
+      f.setUseSLM(true);
+      uint32_t oldSlm = f.getSLMSize();
+      f.setSLMSize(oldSlm + mapSize);
+      f.settidMapSLM(oldSlm);
+      GBE_ASSERT(f.gettidMapSLM() >= 0);
+    }
 
     CallSite::arg_iterator AI = CS.arg_begin();
     CallSite::arg_iterator AE = CS.arg_end();
@@ -3835,10 +3849,23 @@  namespace gbe
       ctx.WORKGROUP(ir::WORKGROUP_OP_BROADCAST, (uint32_t)f.getwgBroadcastSLM(), getRegister(&I), srcTuple, argNum,
           getType(ctx, (*CS.arg_begin())->getType()));
     } else {
-      const ir::Register src = this->getRegister(*(AI++));
-      const ir::Tuple srcTuple = ctx.arrayTuple(&src, 1);
-      ctx.WORKGROUP(opcode, (uint32_t)0, getRegister(&I), srcTuple, 1,
-                    getType(ctx, (*CS.arg_begin())->getType()));
+      ConstantInt *sign = dyn_cast<ConstantInt>(AI);
+      GBE_ASSERT(sign);
+      bool isSign = sign->getZExtValue();
+      AI++;
+      ir::Type ty;
+      if (isSign) {
+        ty = getType(ctx, (*AI)->getType());
+      } else {
+        ty = getUnsignedType(ctx, (*AI)->getType());
+      }
+
+      ir::Register src[3];
+      src[0] = ir::ocl::threadn;
+      src[1] = ir::ocl::threadid;
+      src[2] = this->getRegister(*(AI++));
+      const ir::Tuple srcTuple = ctx.arrayTuple(&src[0], 3);
+      ctx.WORKGROUP(opcode, (uint32_t)f.gettidMapSLM(), getRegister(&I), srcTuple, 3, ty);
     }
 
     GBE_ASSERT(AI == AE);

Comments

The patchset has been pushed, thanks.
The OCL20 branch also needs these patches; can you send a new patchset rebased onto OCL20?

> -----Original Message-----

> From: Beignet [mailto:beignet-bounces@lists.freedesktop.org] On Behalf Of

> junyan.he@inbox.com

> Sent: Thursday, December 10, 2015 15:04

> To: beignet@lists.freedesktop.org

> Subject: [Beignet] [PATCH 05/13 V3] Backend: Establishing the thread/TID-

> EUID map.

> 

> From: Junyan He <junyan.he@linux.intel.com>

> 

> We need to use forward message to send data and sync threads within the

> same work group. The HW lack the feature to get the TID and EUID of other

> threads. So we need to establish a map for this usage.

> 

> Signed-off-by: Junyan He <junyan.he@linux.intel.com>

> ---

>  backend/src/backend/gen_insn_selection.cpp |  124

> +++++++++++++++++++++++++++-

>  backend/src/llvm/llvm_gen_backend.cpp      |   35 +++++++-

>  2 files changed, 151 insertions(+), 8 deletions(-)

> 

> diff --git a/backend/src/backend/gen_insn_selection.cpp

> b/backend/src/backend/gen_insn_selection.cpp

> index cd7b2eb..a32433b 100644

> --- a/backend/src/backend/gen_insn_selection.cpp

> +++ b/backend/src/backend/gen_insn_selection.cpp

> @@ -500,6 +500,8 @@ namespace gbe

>      DebugInfo DBGInfo;

>      /*! To make function prototypes more readable */

>      typedef const GenRegister &Reg;

> +    /*! If true, the thread map has already been stored */

> +    bool storeThreadMap;

> 

>      /*! Check for destination register. Major purpose is to find

>          out partially updated dst registers. These registers will @@ -809,8 +811,9

> @@ namespace gbe

>      ctx(ctx), block(NULL),

>      curr(ctx.getSimdWidth()), file(ctx.getFunction().getRegisterFile()),

>      maxInsnNum(ctx.getFunction().getLargestBlockSize()),

> dagPool(maxInsnNum),

> -    stateNum(0), vectorNum(0), bwdCodeGeneration(false),

> currAuxLabel(ctx.getFunction().labelNum()),

> -    bHas32X32Mul(false), bHasLongType(false), bHasDoubleType(false),

> bHasHalfType(false), bLongRegRestrict(false),

> +    stateNum(0), vectorNum(0), bwdCodeGeneration(false),

> storeThreadMap(false),

> +    currAuxLabel(ctx.getFunction().labelNum()), bHas32X32Mul(false),

> bHasLongType(false),

> +    bHasDoubleType(false), bHasHalfType(false),

> + bLongRegRestrict(false),

>      ldMsgOrder(LD_MSG_ORDER_IVB), slowByteGather(false)

>    {

>      const ir::Function &fn = ctx.getFunction(); @@ -5978,6 +5981,106 @@

> extern bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp

>    /*! WorkGroup instruction pattern */

>    DECL_PATTERN(WorkGroupInstruction)

>    {

> +    INLINE bool storeThreadID(Selection::Opaque &sel, uint32_t slmAddr)

> const

> +    {

> +      using namespace ir;

> +      GenRegister sr0_0 = GenRegister::retype(GenRegister::sr(0),

> GEN_TYPE_UW);

> +      const uint32_t simdWidth = sel.ctx.getSimdWidth();

> +      GenRegister tmp;

> +      GenRegister addr;

> +      vector<GenRegister> fakeTemps;

> +

> +      if (simdWidth == 16) {

> +        tmp = GenRegister::retype(sel.selReg(sel.reg(FAMILY_WORD),

> ir::TYPE_U16), GEN_TYPE_UD);

> +        addr = GenRegister::retype(sel.selReg(sel.reg(FAMILY_WORD),

> ir::TYPE_U16), GEN_TYPE_UD);

> +      } else {

> +        tmp = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD),

> ir::TYPE_U32), GEN_TYPE_UD);

> +        addr = GenRegister::retype(sel.selReg(sel.reg(FAMILY_DWORD),

> ir::TYPE_U32), GEN_TYPE_UD);

> +      }

> +

> +      sr0_0 = GenRegister::vec1(sr0_0);

> +      sel.push(); {

> +        sel.curr.predicate = GEN_PREDICATE_NONE;

> +        sel.curr.noMask = 1;

> +        sel.curr.execWidth = 8;

> +

> +        sel.MOV(tmp, sr0_0);

> +

> +        sel.MUL(addr, sel.selReg(ocl::threadid, ir::TYPE_U32),

> GenRegister::immud(2));

> +        sel.ADD(addr, addr, GenRegister::immud(slmAddr));

> +

> +        sel.push(); {

> +          sel.curr.predicate = GEN_PREDICATE_NONE;

> +          sel.curr.noMask = 1;

> +          sel.push(); {

> +            sel.curr.execWidth = 1;

> +            sel.MOV(GenRegister::flag(0, 1), GenRegister::immuw(0x01));

> +          } sel.pop();

> +          sel.curr.flag = 0;

> +          sel.curr.subFlag = 1;

> +          sel.curr.predicate = GEN_PREDICATE_NORMAL;

> +          sel.BYTE_SCATTER(addr, tmp, 1, GenRegister::immw(0xfe),

> fakeTemps);

> +        } sel.pop();

> +      } sel.pop();

> +      return true;

> +    }

> +

> +    INLINE GenRegister getNextThreadID(Selection::Opaque &sel, uint32_t

> slmAddr) const

> +    {

> +      using namespace ir;

> +      const uint32_t simdWidth = sel.ctx.getSimdWidth();

> +      GenRegister addr;

> +      GenRegister nextThread;

> +      GenRegister tid;

> +      vector<GenRegister> fakeTemps;

> +

> +      if (simdWidth == 16) {

> +        addr = GenRegister::retype(sel.selReg(sel.reg(FAMILY_WORD),

> ir::TYPE_U16), GEN_TYPE_UD);

> +        nextThread = GenRegister::retype(sel.selReg(sel.reg(FAMILY_WORD),

> ir::TYPE_U16), GEN_TYPE_UD);

> +        tid = GenRegister::retype(sel.selReg(sel.reg(FAMILY_WORD),

> ir::TYPE_U16), GEN_TYPE_UD);

> +      } else {

> +        addr = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);

> +        nextThread = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);

> +        tid = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);

> +      }

> +

> +      sel.push(); {

> +        sel.curr.execWidth = 8;

> +        sel.curr.predicate = GEN_PREDICATE_NONE;

> +        sel.curr.noMask = 1;

> +        sel.ADD(nextThread, sel.selReg(ocl::threadid, ir::TYPE_U32),

> + GenRegister::immud(1));

> +

> +        /* Wrap the next thread id. */

> +        sel.push(); {

> +          sel.curr.predicate = GEN_PREDICATE_NONE;

> +          sel.curr.noMask = 1;

> +          sel.curr.flag = 0;

> +          sel.curr.subFlag = 1;

> +          sel.CMP(GEN_CONDITIONAL_EQ, nextThread, sel.selReg(ocl::threadn,

> ir::TYPE_U32), GenRegister::null());

> +          sel.curr.predicate = GEN_PREDICATE_NORMAL;

> +          sel.MOV(nextThread, GenRegister::immud(0));

> +        } sel.pop();

> +

> +        sel.MUL(addr, nextThread, GenRegister::immud(2));

> +        sel.ADD(addr, addr, GenRegister::immud(slmAddr));

> +

> +        sel.push(); {

> +          sel.curr.predicate = GEN_PREDICATE_NONE;

> +          sel.curr.noMask = 1;

> +          sel.push(); {

> +            sel.curr.execWidth = 1;

> +            sel.MOV(GenRegister::flag(0, 1), GenRegister::immuw(0x010));

> +          } sel.pop();

> +          sel.curr.flag = 0;

> +          sel.curr.subFlag = 1;

> +          sel.curr.predicate = GEN_PREDICATE_NORMAL;

> +          sel.BYTE_GATHER(tid, addr, 1, GenRegister::immw(0xfe), fakeTemps);

> +        } sel.pop();

> +

> +      } sel.pop();

> +      return tid;

> +    }

> +

>      INLINE bool emitWGBroadcast(Selection::Opaque &sel, const

> ir::WorkGroupInstruction &insn) const {

>        /*  1. BARRIER    Ensure all the threads have set the correct value for the

> var which will be broadcasted.

>            2. CMP IDs    Compare the local IDs with the specified ones in the

> function call.

> @@ -5993,8 +6096,6 @@ extern bool OCL_DEBUGINFO; // first defined by

> calling BVAR in program.cpp

>        const uint32_t slmAddr = insn.getSlmAddr();

>        GenRegister addr = sel.selReg(sel.reg(FAMILY_DWORD), ir::TYPE_U32);

>        vector<GenRegister> fakeTemps;

> -      fakeTemps.push_back(GenRegister::null());

> -      fakeTemps.push_back(GenRegister::null());

> 

>        /* Then we insert a barrier to make sure all the var we are interested in

>           have been assigned the final value. */ @@ -6053,6 +6154,21 @@ extern

> bool OCL_DEBUGINFO; // first defined by calling BVAR in program.cpp

> 

>        if (workGroupOp == WORKGROUP_OP_BROADCAST) {

>          return emitWGBroadcast(sel, insn);

> +      } else if (workGroupOp >= WORKGROUP_OP_REDUCE_ADD &&

> workGroupOp <= WORKGROUP_OP_EXCLUSIVE_MAX) {

> +        const uint32_t slmAddr = insn.getSlmAddr();

> +        /* First, we create the TheadID/localID map, in order to get

> + which thread hold the next 16 workitems. */

> +

> +        if (!sel.storeThreadMap) {

> +          this->storeThreadID(sel, slmAddr);

> +          sel.storeThreadMap = true;

> +        }

> +

> +        /* Then we insert a barrier to make sure all the var we are interested in

> +           have been assigned the final value. */

> +        sel.BARRIER(GenRegister::ud8grf(sel.reg(FAMILY_DWORD)),

> + sel.selReg(sel.reg(FAMILY_DWORD)), syncLocalBarrier);

> +

> +        /* Third, get the next thread ID which we will Forward MSG to. */

> +        GenRegister nextThreadID = getNextThreadID(sel, slmAddr);

>        } else {

>          GBE_ASSERT(0);

>        }

> diff --git a/backend/src/llvm/llvm_gen_backend.cpp

> b/backend/src/llvm/llvm_gen_backend.cpp

> index add3db4..3ce3c8d 100644

> --- a/backend/src/llvm/llvm_gen_backend.cpp

> +++ b/backend/src/llvm/llvm_gen_backend.cpp

> @@ -3815,6 +3815,20 @@ namespace gbe

>        GBE_ASSERT(f.getwgBroadcastSLM() >= 0);

>      }

> 

> +    if (f.gettidMapSLM() < 0 && opcode >=

> ir::WORKGROUP_OP_REDUCE_ADD && opcode <=

> ir::WORKGROUP_OP_EXCLUSIVE_MAX) {

> +      /* Because we can not know the thread ID and the EUID for every

> physical

> +         thead which the work items execute on before the run time. We need

> to

> +         sync the thread execution order when using work group functions. We

> +         create the workitems/threadID map table in slm.

> +         When we come to here, the global thread local vars should have all

> been

> +         allocated, so it's safe for us to steal a piece of SLM for this usage. */

> +      uint32_t mapSize = sizeof(uint16_t) * 64;// at most 64 thread for one

> subslice.

> +      f.setUseSLM(true);

> +      uint32_t oldSlm = f.getSLMSize();

> +      f.setSLMSize(oldSlm + mapSize);

> +      f.settidMapSLM(oldSlm);

> +      GBE_ASSERT(f.gettidMapSLM() >= 0);

> +    }

> 

>      CallSite::arg_iterator AI = CS.arg_begin();

>      CallSite::arg_iterator AE = CS.arg_end(); @@ -3835,10 +3849,23 @@

> namespace gbe

>        ctx.WORKGROUP(ir::WORKGROUP_OP_BROADCAST,

> (uint32_t)f.getwgBroadcastSLM(), getRegister(&I), srcTuple, argNum,

>            getType(ctx, (*CS.arg_begin())->getType()));

>      } else {

> -      const ir::Register src = this->getRegister(*(AI++));

> -      const ir::Tuple srcTuple = ctx.arrayTuple(&src, 1);

> -      ctx.WORKGROUP(opcode, (uint32_t)0, getRegister(&I), srcTuple, 1,

> -                    getType(ctx, (*CS.arg_begin())->getType()));

> +      ConstantInt *sign = dyn_cast<ConstantInt>(AI);

> +      GBE_ASSERT(sign);

> +      bool isSign = sign->getZExtValue();

> +      AI++;

> +      ir::Type ty;

> +      if (isSign) {

> +        ty = getType(ctx, (*AI)->getType());

> +      } else {

> +        ty = getUnsignedType(ctx, (*AI)->getType());

> +      }

> +

> +      ir::Register src[3];

> +      src[0] = ir::ocl::threadn;

> +      src[1] = ir::ocl::threadid;

> +      src[2] = this->getRegister(*(AI++));

> +      const ir::Tuple srcTuple = ctx.arrayTuple(&src[0], 3);

> +      ctx.WORKGROUP(opcode, (uint32_t)f.gettidMapSLM(),

> + getRegister(&I), srcTuple, 3, ty);

>      }

> 

>      GBE_ASSERT(AI == AE);

> --

> 1.7.9.5

> 

> 

> 

> _______________________________________________

> Beignet mailing list

> Beignet@lists.freedesktop.org

> http://lists.freedesktop.org/mailman/listinfo/beignet