[v6,1/4] Add built-in function __gen_ocl_vme.

Submitted by Chuanbo Weng on Nov. 6, 2015, 3:50 a.m.

Details

Message ID 5A0E318D73C83C40A09BDBBE131796D702DD6C8D@shsmsx102.ccr.corp.intel.com
State New
Headers show
Series "Series without cover letter" ( rev: 2 ) in Beignet

Not browsing as part of any series.

Commit Message

Chuanbo Weng Nov. 6, 2015, 3:50 a.m.
Hi Ruiling,
	As we discussed before, I have refined the code to handle both simd8 and simd16 in the backend and in the OCL kernel. Please
confirm that there are no problems so that this patchset can be pushed. Thanks! 

-----Original Message-----
From: Weng, Chuanbo 
Sent: Friday, November 06, 2015 11:28
To: beignet@lists.freedesktop.org
Cc: Weng, Chuanbo
Subject: [PATCH v6 1/4] Add built-in function __gen_ocl_vme.

__gen_ocl_vme is used for hardware-accelerated video motion estimation.
It takes the payload values as parameters and uses MOV instructions to copy them into the payload GRFs of the VME SEND message. The int8 return value is used to store the writeback of the SEND message.

v2:
Remove the 5 unnecessary parameters (src_grf*) of the built-in function (we only need to allocate the related registers in the gen_insn_selection step).

v3:
Remove redundant code and change MAX_SRC_NUM to 40.

v4:
Choose the message response length by message type instead of hard-coding it.

v5:
Choose message response length by message type in the whole backend pipeline.

v6:
Treat simd8 and simd16 differently when moving the payload values to consecutive payload GRFs.

Signed-off-by: Chuanbo Weng <chuanbo.weng@intel.com>
---
 backend/src/backend/gen/gen_mesa_disasm.c          | 14 ++++
 backend/src/backend/gen7_instruction.hpp           | 15 ++++
 backend/src/backend/gen_context.cpp                | 98 ++++++++++++++++++++++
 backend/src/backend/gen_context.hpp                |  1 +
 backend/src/backend/gen_defs.hpp                   | 15 ++++
 backend/src/backend/gen_encoder.cpp                | 44 ++++++++++
 backend/src/backend/gen_encoder.hpp                | 13 +++
 .../src/backend/gen_insn_gen7_schedule_info.hxx    |  1 +
 backend/src/backend/gen_insn_selection.cpp         | 73 ++++++++++++++++
 backend/src/backend/gen_insn_selection.hpp         | 14 +++-
 backend/src/backend/gen_insn_selection.hxx         |  1 +
 backend/src/ir/instruction.cpp                     | 66 +++++++++++++++
 backend/src/ir/instruction.hpp                     | 17 +++-
 backend/src/ir/instruction.hxx                     |  1 +
 backend/src/libocl/include/ocl_misc.h              | 15 ++++
 backend/src/llvm/llvm_gen_backend.cpp              | 47 +++++++++++
 backend/src/llvm/llvm_gen_ocl_function.hxx         |  2 +
 backend/src/llvm/llvm_scalarize.cpp                |  4 +
 18 files changed, 436 insertions(+), 5 deletions(-)

--
1.9.1

Patch hide | download patch | download mbox

diff --git a/backend/src/backend/gen/gen_mesa_disasm.c b/backend/src/backend/gen/gen_mesa_disasm.c
index 5b71cfa..3198da7 100644
--- a/backend/src/backend/gen/gen_mesa_disasm.c
+++ b/backend/src/backend/gen/gen_mesa_disasm.c
@@ -476,6 +476,13 @@  static int column;
 
 static int gen_version;
 
+#define GEN7_BITS_FIELD(inst, gen7) \
+  ({                                                            \
+    int bits;                                                   \
+      bits = ((const union Gen7NativeInstruction *)inst)->gen7; \
+    bits;                                                       \
+  })
+
 #define GEN_BITS_FIELD(inst, gen)                               \
   ({                                                            \
     int bits;                                                   \
@@ -530,6 +537,8 @@  static int gen_version;
 #define EXECUTION_SIZE(inst)       GEN_BITS_FIELD(inst, header.execution_size)
 #define BRANCH_JIP(inst)           GEN_BITS_FIELD2(inst, bits3.gen7_branch.jip, bits3.gen8_branch.jip/8)
 #define BRANCH_UIP(inst)           GEN_BITS_FIELD2(inst, bits3.gen7_branch.uip, bits2.gen8_branch.uip/8)
+#define VME_BTI(inst)              GEN7_BITS_FIELD(inst, bits3.vme_gen7.bti)
+#define VME_MSG_TYPE(inst)         GEN7_BITS_FIELD(inst, bits3.vme_gen7.msg_type)
 #define SAMPLE_BTI(inst)           GEN_BITS_FIELD(inst, bits3.sampler_gen7.bti)
 #define SAMPLER(inst)              GEN_BITS_FIELD(inst, bits3.sampler_gen7.sampler)
 #define SAMPLER_MSG_TYPE(inst)     GEN_BITS_FIELD(inst, bits3.sampler_gen7.msg_type)
@@ -1431,6 +1440,11 @@  int gen_disasm (FILE *file, const void *inst, uint32_t deviceID, uint32_t compac
 
     if (GEN_BITS_FIELD2(inst, bits1.da1.src1_reg_file, bits2.da1.src1_reg_file) == GEN_IMMEDIATE_VALUE) {
       switch (target) {
+        case GEN_SFID_VIDEO_MOTION_EST:
+          format(file, " (bti: %d, msg_type: %d)",
+                 VME_BTI(inst),
+                 VME_MSG_TYPE(inst));
+          break;
         case GEN_SFID_SAMPLER:
           format(file, " (%d, %d, %d, %d)",
                  SAMPLE_BTI(inst),
diff --git a/backend/src/backend/gen7_instruction.hpp b/backend/src/backend/gen7_instruction.hpp
index 51f342b..258dd24 100644
--- a/backend/src/backend/gen7_instruction.hpp
+++ b/backend/src/backend/gen7_instruction.hpp
@@ -350,6 +350,21 @@  union Gen7NativeInstruction
         uint32_t end_of_thread:1;
       } sampler_gen7;
 
+      struct {
+        uint32_t bti:8;
+        uint32_t vme_search_path_lut:3;
+        uint32_t lut_sub:2;
+        uint32_t msg_type:2;
+        uint32_t stream_in:1;
+        uint32_t stream_out:1;
+        uint32_t reserved_mbz:2;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad1:2;
+        uint32_t end_of_thread:1;
+      } vme_gen7;
+
       /**
        * Message for the Sandybridge Sampler Cache or Constant Cache Data Port.
        *
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 4e2ebfb..ccc9f17 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -2266,6 +2266,104 @@  namespace gbe
     p->SAMPLE(dst, msgPayload, msgLen, false, bti, sampler, simdWidth, -1, 0, insn.extra.isLD, insn.extra.isUniform);
   }
 
+  /*! Encode one VME (video motion estimation) SEND message.
+   *
+   *  The payload values are the sources of insn. They are copied with scalar
+   *  MOVs into the payload registers, which are the selection destinations
+   *  starting at index rsp_len; insn.dst(0) is passed on as the destination
+   *  of the SEND itself.
+   */
+  void GenContext::emitVmeInstruction(const SelectionInstruction &insn) {
+    const GenRegister dst = ra->genReg(insn.dst(0));
+    const unsigned int msg_type = insn.extra.msg_type;
+
+    GBE_ASSERT(msg_type == 1);
+    /* Start from a defined value: should the assertion above not stop
+     * execution for another message type, the dst() indexing below must not
+     * read an indeterminate rsp_len. */
+    int rsp_len = 0;
+    if(msg_type == 1)
+      rsp_len = 6;
+    uint32_t execWidth_org = p->curr.execWidth;
+    p->push();
+    p->curr.predicate = GEN_PREDICATE_NONE;
+    p->curr.noMask = 1;
+    p->curr.execWidth = 1;
+    /* Use MOV to Setup bits of payload: mov payload value stored in insn.src(x) to
+     * 5 consecutive payload grf.
+     * In simd8 mode, one virtual grf register map to one physical grf register. But
+     * in simd16 mode, one virtual grf register map to two physical grf registers.
+     * So we should treat them differently.
+     * */
+    if(execWidth_org == 8){
+      /* One payload register per group of 8 sources. */
+      for(int i=0; i < 5; i++){
+        GenRegister payload_grf = ra->genReg(insn.dst(rsp_len+i));
+        payload_grf.vstride = GEN_VERTICAL_STRIDE_0;
+        payload_grf.width = GEN_WIDTH_1;
+        payload_grf.hstride = GEN_HORIZONTAL_STRIDE_0;
+        payload_grf.subphysical = 1;
+        for(int j=0; j < 8; j++){
+          /* Source j of the group is written to dword (7 - j) of the register. */
+          payload_grf.subnr = (7 - j) * typeSize(GEN_TYPE_UD);
+          GenRegister payload_val = ra->genReg(insn.src(i*8+j));
+          payload_val.vstride = GEN_VERTICAL_STRIDE_0;
+          payload_val.width = GEN_WIDTH_1;
+          payload_val.hstride = GEN_HORIZONTAL_STRIDE_0;
+
+          p->MOV(payload_grf, payload_val);
+        }
+      }
+    }
+    else if(execWidth_org == 16){
+      /* Payload registers 0 and 1 each take 16 sources: k selects the half
+       * by bumping the register number, 8 sources per half. */
+      for(int i=0; i < 2; i++){
+        for(int k = 0; k < 2; k++){
+          GenRegister payload_grf = ra->genReg(insn.dst(rsp_len+i));
+          payload_grf.nr += k;
+          payload_grf.vstride = GEN_VERTICAL_STRIDE_0;
+          payload_grf.width = GEN_WIDTH_1;
+          payload_grf.hstride = GEN_HORIZONTAL_STRIDE_0;
+          payload_grf.subphysical = 1;
+          for(int j=0; j < 8; j++){
+            payload_grf.subnr = (7 - j) * typeSize(GEN_TYPE_UD);
+            GenRegister payload_val = ra->genReg(insn.src(i*16+k*8+j));
+            payload_val.vstride = GEN_VERTICAL_STRIDE_0;
+            payload_val.width = GEN_WIDTH_1;
+            payload_val.hstride = GEN_HORIZONTAL_STRIDE_0;
+
+            p->MOV(payload_grf, payload_val);
+          }
+        }
+      }
+      /* Payload register 2 takes the remaining 8 sources (32..39). */
+      {
+        int i = 2;
+        GenRegister payload_grf = ra->genReg(insn.dst(rsp_len+i));
+        payload_grf.vstride = GEN_VERTICAL_STRIDE_0;
+        payload_grf.width = GEN_WIDTH_1;
+        payload_grf.hstride = GEN_HORIZONTAL_STRIDE_0;
+        payload_grf.subphysical = 1;
+        for(int j=0; j < 8; j++){
+          payload_grf.subnr = (7 - j) * typeSize(GEN_TYPE_UD);
+          GenRegister payload_val = ra->genReg(insn.src(i*16+j));
+          payload_val.vstride = GEN_VERTICAL_STRIDE_0;
+          payload_val.width = GEN_WIDTH_1;
+          payload_val.hstride = GEN_HORIZONTAL_STRIDE_0;
+
+          p->MOV(payload_grf, payload_val);
+        }
+      }
+    }
+    p->pop();
+
+    /* Copy one byte from the register built by GenRegister::ub1grf(0, 20)
+     * into byte element 20 of the first payload register. */
+    p->push();
+    p->curr.predicate = GEN_PREDICATE_NONE;
+    p->curr.noMask = 1;
+    p->curr.execWidth = 1;
+    GenRegister payload_did = GenRegister::retype(ra->genReg(insn.dst(rsp_len)), GEN_TYPE_UB);
+    payload_did.vstride = GEN_VERTICAL_STRIDE_0;
+    payload_did.width = GEN_WIDTH_1;
+    payload_did.hstride = GEN_HORIZONTAL_STRIDE_0;
+    payload_did.subphysical = 1;
+    payload_did.subnr = 20 * typeSize(GEN_TYPE_UB);
+    GenRegister grf0 = GenRegister::ub1grf(0, 20);
+    p->MOV(payload_did, grf0);
+    p->pop();
+
+    /* The SEND reads its payload starting at the first payload register. */
+    const GenRegister msgPayload = ra->genReg(insn.dst(rsp_len));
+    const unsigned char bti = insn.getbti();
+    const unsigned int vme_search_path_lut = insn.extra.vme_search_path_lut;
+    const unsigned int lut_sub = insn.extra.lut_sub;
+    p->VME(bti, dst, msgPayload, msg_type, vme_search_path_lut, lut_sub);
+  }
+
   void GenContext::scratchWrite(const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode) {
     p->push();
     uint32_t simdWidth = p->curr.execWidth;
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index 4044694..870266c 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -167,6 +167,7 @@  namespace gbe
     virtual void emitUnpackLongInstruction(const SelectionInstruction &insn);
     void emitDWordGatherInstruction(const SelectionInstruction &insn);
     void emitSampleInstruction(const SelectionInstruction &insn);
+    void emitVmeInstruction(const SelectionInstruction &insn);
     void emitTypedWriteInstruction(const SelectionInstruction &insn);
     void emitSpillRegInstruction(const SelectionInstruction &insn);
     void emitUnSpillRegInstruction(const SelectionInstruction &insn);
diff --git a/backend/src/backend/gen_defs.hpp b/backend/src/backend/gen_defs.hpp
index 1b550ac..09cb2ba 100644
--- a/backend/src/backend/gen_defs.hpp
+++ b/backend/src/backend/gen_defs.hpp
@@ -615,6 +615,21 @@  union GenNativeInstruction
         uint32_t end_of_thread:1;
       } sampler_gen7;
 
+      struct {
+        uint32_t bti:8;
+        uint32_t vme_search_path_lut:3;
+        uint32_t lut_sub:2;
+        uint32_t msg_type:2;
+        uint32_t stream_in:1;
+        uint32_t stream_out:1;
+        uint32_t reserved_mbz:2;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad1:2;
+        uint32_t end_of_thread:1;
+      } vme_gen7;
+
       /**
        * Message for the Sandybridge Sampler Cache or Constant Cache Data Port.
        *
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index 2cc51cc..be38cef 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -1191,6 +1191,50 @@  namespace gbe
                        simd_mode, return_format);
   }
 
+  /*! Fill the message descriptor of a VME SEND instruction.
+   *  \param insn instruction whose bits3.vme_gen7 descriptor is written
+   *  \param bti binding table index stored in the descriptor
+   *  \param response_length forwarded to setMessageDescriptor
+   *  \param msg_length forwarded to setMessageDescriptor
+   *  \param msg_type VME message type
+   *  \param vme_search_path_lut value of the vme_search_path_lut field
+   *  \param lut_sub value of the lut_sub field
+   */
+  void GenEncoder::setVmeMessage(GenNativeInstruction *insn,
+                                unsigned char bti,
+                                uint32_t response_length,
+                                uint32_t msg_length,
+                                uint32_t msg_type,
+                                unsigned char vme_search_path_lut,
+                                unsigned char lut_sub)
+  {
+    // The generic descriptor part is set up first, the VME specific fields
+    // are written afterwards.
+    const GenMessageTarget target = GEN_SFID_VIDEO_MOTION_EST;
+    setMessageDescriptor(insn, target, msg_length, response_length, true);
+
+    // These fields are always written as 0.
+    insn->bits3.vme_gen7.reserved_mbz = 0;
+    insn->bits3.vme_gen7.stream_out = 0;
+    insn->bits3.vme_gen7.stream_in = 0;
+
+    // Caller supplied fields.
+    insn->bits3.vme_gen7.msg_type = msg_type;
+    insn->bits3.vme_gen7.lut_sub = lut_sub;
+    insn->bits3.vme_gen7.vme_search_path_lut = vme_search_path_lut;
+    insn->bits3.vme_gen7.bti = bti;
+  }
+
+  /*! Encode a VME (video motion estimation) SEND instruction.
+   *  \param bti binding table index written into the message descriptor
+   *  \param dest destination register of the SEND
+   *  \param msg source 0 of the SEND, i.e. the message payload
+   *  \param msg_type VME message type; only 1 is handled
+   *  \param vme_search_path_lut forwarded to setVmeMessage
+   *  \param lut_sub forwarded to setVmeMessage
+   */
+  void GenEncoder::VME(unsigned char bti,
+                       GenRegister dest,
+                       GenRegister msg,
+                       uint32_t msg_type,
+                       uint32_t vme_search_path_lut,
+                       uint32_t lut_sub)
+  {
+    /* Currently we just support inter search only, we will support other
+     * modes in future.
+     */
+    GBE_ASSERT(msg_type == 1);
+    /* Zero-initialised so that the descriptor is never built from
+     * indeterminate lengths should the assertion above not stop execution. */
+    uint32_t msg_length = 0, response_length = 0;
+    if(msg_type == 1){
+      msg_length = 5;
+      response_length = 6;
+    }
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+    this->setHeader(insn);
+    this->setDst(insn, dest);
+    this->setSrc0(insn, msg);
+    setVmeMessage(insn, bti, response_length, msg_length,
+                  msg_type, vme_search_path_lut, lut_sub);
+  }
+
   void GenEncoder::TYPED_WRITE(GenRegister msg, bool header_present, unsigned char bti)
   {
      GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
index f2bb5ab..6df7087 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -203,6 +203,19 @@  namespace gbe
                            bool header_present,
                            uint32_t simd_mode,
                            uint32_t return_format);
+    virtual void VME(unsigned char bti,
+                         GenRegister dest,
+                         GenRegister msg,
+                         uint32_t msg_type,
+                         uint32_t vme_search_path_lut,
+                         uint32_t lut_sub);
+    void setVmeMessage(GenNativeInstruction *insn,
+                          unsigned char bti,
+                          uint32_t response_length,
+                          uint32_t msg_length,
+                          uint32_t msg_type,
+                          unsigned char vme_search_path_lut,
+                          unsigned char lut_sub);
 
     /*! TypedWrite instruction for texture */
     virtual void TYPED_WRITE(GenRegister header,
diff --git a/backend/src/backend/gen_insn_gen7_schedule_info.hxx b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
index 9b60c17..878e0e7 100644
--- a/backend/src/backend/gen_insn_gen7_schedule_info.hxx
+++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
@@ -36,6 +36,7 @@  DECL_GEN7_SCHEDULE(UnpackByte,      40,        1,        1)
 DECL_GEN7_SCHEDULE(PackLong,        40,        1,        1)
 DECL_GEN7_SCHEDULE(UnpackLong,      40,        1,        1)
 DECL_GEN7_SCHEDULE(Sample,          160,       1,        1)
+DECL_GEN7_SCHEDULE(Vme,             320,       1,        1)
 DECL_GEN7_SCHEDULE(TypedWrite,      80,        1,        1)
 DECL_GEN7_SCHEDULE(SpillReg,        20,        1,        1)
 DECL_GEN7_SCHEDULE(UnSpillReg,      160,       1,        1)
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 2452aea..cfaa792 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -187,6 +187,7 @@  namespace gbe
            this->opcode == SEL_OP_ATOMIC       ||
            this->opcode == SEL_OP_BYTE_GATHER  ||
            this->opcode == SEL_OP_SAMPLE ||
+           this->opcode == SEL_OP_VME ||
            this->opcode == SEL_OP_DWORD_GATHER;
   }
 
@@ -661,6 +662,8 @@  namespace gbe
     void ALU3(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, Reg src2);
     /*! Encode sample instructions */
     void SAMPLE(GenRegister *dst, uint32_t dstNum, GenRegister *msgPayloads, uint32_t msgNum, uint32_t bti, uint32_t sampler, bool isLD, bool isUniform);
+    /*! Encode vme instructions */
+    void VME(uint32_t bti, GenRegister *dst, GenRegister *payloadVal, uint32_t dstNum, uint32_t srcNum, uint32_t msg_type, uint32_t vme_search_path_lut, uint32_t lut_sub);
     /*! Encode typed write instructions */
     void TYPED_WRITE(GenRegister *msgs, uint32_t msgNum, uint32_t bti, bool is3D);
     /*! Get image information */
@@ -2120,6 +2123,34 @@  namespace gbe
     insn->extra.isUniform = isUniform;
   }
 
+  /*! Append a SEL_OP_VME selection instruction.
+   *  dst holds dstNum destination registers and payloadVal holds srcNum
+   *  source registers; both are copied into the new instruction and each
+   *  set is additionally described by its own selection vector. bti,
+   *  msg_type, vme_search_path_lut and lut_sub are stored in the
+   *  instruction's extra state.
+   */
+  void Selection::Opaque::VME(uint32_t bti, GenRegister *dst, GenRegister *payloadVal,
+                              uint32_t dstNum, uint32_t srcNum, uint32_t msg_type,
+                              uint32_t vme_search_path_lut, uint32_t lut_sub) {
+    SelectionInstruction *vme = this->appendInsn(SEL_OP_VME, dstNum, srcNum);
+    SelectionVector *dstRegs = this->appendVector();
+    SelectionVector *srcRegs = this->appendVector();
+
+    // Message parameters kept in the instruction's extra state
+    vme->setbti(bti);
+    vme->extra.msg_type = msg_type;
+    vme->extra.vme_search_path_lut = vme_search_path_lut;
+    vme->extra.lut_sub = lut_sub;
+
+    // Destinations and the vector covering them
+    for (uint32_t i = 0; i < dstNum; ++i)
+      vme->dst(i) = dst[i];
+    dstRegs->reg = &vme->dst(0);
+    dstRegs->regNum = dstNum;
+    dstRegs->offsetID = 0;
+    dstRegs->isSrc = 0;
+
+    // Sources and the vector covering them
+    for (uint32_t i = 0; i < srcNum; ++i)
+      vme->src(i) = payloadVal[i];
+    srcRegs->reg = &vme->src(0);
+    srcRegs->regNum = srcNum;
+    srcRegs->offsetID = 0;
+    srcRegs->isSrc = 1;
+  }
+
   ///////////////////////////////////////////////////////////////////////////
   // Code selection public implementation
   ///////////////////////////////////////////////////////////////////////////
@@ -5126,6 +5157,47 @@  namespace gbe
     DECL_CTOR(SampleInstruction, 1, 1);
   };
 
+  /*! Vme instruction pattern: turns an ir::VmeInstruction into one
+   *  SEL_OP_VME selection instruction. */
+  DECL_PATTERN(VmeInstruction)
+  {
+    INLINE bool emitOne(Selection::Opaque &sel, const ir::VmeInstruction &insn, bool &markChildren) const
+    {
+      using namespace ir;
+      const uint32_t msg_type = insn.getMsgType();
+      // Both look-up table parameters are always passed on as 0 here.
+      const uint32_t vme_search_path_lut = 0;
+      const uint32_t lut_sub = 0;
+      GBE_ASSERT(msg_type == 1);
+      // For message type 1, 5 virtual payload grfs are appended to the
+      // selection destination registers.
+      const uint32_t payloadLen = (msg_type == 1) ? 5 : 0;
+      const uint32_t irDstNum = insn.getDstNum();
+      const uint32_t selDstNum = irDstNum + payloadLen;
+      const uint32_t srcNum = insn.getSrcNum();
+      vector<GenRegister> dst(selDstNum);
+      vector<GenRegister> payloadVal(srcNum);
+
+      // First the destinations of the IR instruction, then freshly created
+      // dword registers for the payload.
+      for (uint32_t dstID = 0; dstID < selDstNum; ++dstID) {
+        if (dstID < irDstNum)
+          dst[dstID] = sel.selReg(insn.getDst(dstID), insn.getDstType());
+        else
+          dst[dstID] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+      }
+
+      for (uint32_t srcID = 0; srcID < srcNum; ++srcID)
+        payloadVal[srcID] = sel.selReg(insn.getSrc(srcID), insn.getSrcType());
+
+      const uint32_t bti = insn.getImageIndex();
+      if (bti > BTI_MAX_ID) {
+        std::cerr << "Too large bti " << bti;
+        return false;
+      }
+
+      sel.VME(bti, dst.data(), payloadVal.data(), selDstNum, srcNum,
+              msg_type, vme_search_path_lut, lut_sub);
+
+      return true;
+    }
+    DECL_CTOR(VmeInstruction, 1, 1);
+  };
+
   /*! Typed write instruction pattern. */
   DECL_PATTERN(TypedWriteInstruction)
   {
@@ -5591,6 +5663,7 @@  namespace gbe
     this->insert<MulAddInstructionPattern>();
     this->insert<SelectModifierInstructionPattern>();
     this->insert<SampleInstructionPattern>();
+    this->insert<VmeInstructionPattern>();
     this->insert<GetImageInfoInstructionPattern>();
     this->insert<ReadARFInstructionPattern>();
     this->insert<RegionInstructionPattern>();
diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp
index f51c905..578db41 100644
--- a/backend/src/backend/gen_insn_selection.hpp
+++ b/backend/src/backend/gen_insn_selection.hpp
@@ -90,8 +90,8 @@  namespace gbe
     const GenRegister &dst(uint32_t dstID) const { return regs[dstID]; }
     /*! Damn C++ */
     const GenRegister &src(uint32_t srcID) const { return regs[dstNum+srcID]; }
-    /*! No more than 9 sources (used by typed writes on simd8 mode.) */
-    enum { MAX_SRC_NUM = 9 };
+    /*! No more than 40 sources (40 sources are used by vme for payload passing and setting) */
+    enum { MAX_SRC_NUM = 40 };
     /*! No more than 16 destinations (15 used by I64DIV/I64REM) */
     enum { MAX_DST_NUM = 16 };
     /*! State of the instruction (extra fields neeed for the encoding) */
@@ -129,6 +129,12 @@ namespace gbe
         bool     isLD;  // is this a ld message?
         bool     isUniform;
       };
+      struct {
+        uint16_t vme_bti:8;
+        uint16_t msg_type:2;
+        uint16_t vme_search_path_lut:3;
+        uint16_t lut_sub:2;
+      };
       uint32_t barrierType;
       bool longjmp;
       uint32_t indirect_offset;
@@ -138,7 +144,7 @@  namespace gbe
     /*! Number of destinations */
     uint8_t dstNum:5;
     /*! Number of sources */
-    uint8_t srcNum:4;
+    uint8_t srcNum:6;
     /*! To store various indices */
     uint32_t index;
     /*! For BRC/IF to store the UIP */
@@ -152,6 +158,7 @@  namespace gbe
       switch (opcode) {
         case SEL_OP_DWORD_GATHER: return extra.function;
         case SEL_OP_SAMPLE: return extra.rdbti;
+        case SEL_OP_VME: return extra.vme_bti;
         case SEL_OP_TYPED_WRITE: return extra.bti;
         default:
           GBE_ASSERT(0);
@@ -164,6 +171,7 @@  namespace gbe
       switch (opcode) {
         case SEL_OP_DWORD_GATHER: extra.function = bti; return;
         case SEL_OP_SAMPLE: extra.rdbti = bti; return;
+        case SEL_OP_VME: extra.vme_bti = bti; return;
         case SEL_OP_TYPED_WRITE: extra.bti = bti; return;
         default:
           GBE_ASSERT(0);
diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx
index 479398b..4d3e921 100644
--- a/backend/src/backend/gen_insn_selection.hxx
+++ b/backend/src/backend/gen_insn_selection.hxx
@@ -65,6 +65,7 @@  DECL_SELECTION_IR(UNPACK_BYTE, UnpackByteInstruction)
 DECL_SELECTION_IR(PACK_LONG, PackLongInstruction)
 DECL_SELECTION_IR(UNPACK_LONG, UnpackLongInstruction)
 DECL_SELECTION_IR(SAMPLE, SampleInstruction)
+DECL_SELECTION_IR(VME, VmeInstruction)
 DECL_SELECTION_IR(TYPED_WRITE, TypedWriteInstruction)
 DECL_SELECTION_IR(SPILL_REG, SpillRegInstruction)
 DECL_SELECTION_IR(UNSPILL_REG, UnSpillRegInstruction)
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index f93c528..7bf787e 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -595,6 +595,58 @@  namespace ir {
       static const uint32_t dstNum = 4;
     };
 
+    /*! IR instruction for video motion estimation (OP_VME). It stores an
+     *  image index, a tuple of destinations, a tuple of sources and the
+     *  three message parameters msg_type, vme_search_path_lut and lut_sub. */
+    class ALIGNED_INSTRUCTION VmeInstruction :
+      public BasePolicy,
+      public TupleSrcPolicy<VmeInstruction>,
+      public TupleDstPolicy<VmeInstruction>
+    {
+    public:
+      VmeInstruction(uint8_t imageIdx, Tuple dstTuple, Tuple srcTuple,
+                     uint32_t dstNum, uint32_t srcNum, int msg_type,
+                     int vme_search_path_lut, int lut_sub) {
+        this->opcode = OP_VME;
+        // Image and message parameters
+        this->imageIdx = imageIdx;
+        this->msg_type = msg_type;
+        this->vme_search_path_lut = vme_search_path_lut;
+        this->lut_sub = lut_sub;
+        // Destination and source tuples with their sizes
+        this->dst = dstTuple;
+        this->dstNum = dstNum;
+        this->src = srcTuple;
+        this->srcNum = srcNum;
+      }
+
+      INLINE uint8_t getImageIndex(void) const { return this->imageIdx; }
+      INLINE uint8_t getMsgType(void) const { return this->msg_type; }
+      /*! Sources and destinations are always reported as TYPE_U32 */
+      INLINE Type getSrcType(void) const { return TYPE_U32; }
+      INLINE Type getDstType(void) const { return TYPE_U32; }
+
+      INLINE bool wellFormed(const Function &fn, std::string &why) const;
+
+      /*! Print the opcode, the two surface ids (image index and image
+       *  index + 1), every destination, every source and finally the
+       *  message parameters */
+      INLINE void out(std::ostream &out, const Function &fn) const {
+        const int srcSurface = (int)this->getImageIndex();
+        this->outOpcode(out);
+        out << " src_surface id " << srcSurface
+            << " ref_surface id " << srcSurface + 1;
+        for (uint32_t dstID = 0; dstID < dstNum; ++dstID)
+          out << " %" << this->getDst(fn, dstID);
+        for (uint32_t srcID = 0; srcID < srcNum; ++srcID)
+          out << " %" << this->getSrc(fn, srcID);
+        out << " msg_type " << (int)this->getMsgType();
+        out << " vme_search_path_lut " << (int)this->vme_search_path_lut;
+        out << " lut_sub " << (int)this->lut_sub;
+      }
+
+      Tuple src;
+      Tuple dst;
+      uint8_t imageIdx;
+      uint8_t msg_type;
+      uint8_t vme_search_path_lut;
+      uint8_t lut_sub;
+      uint32_t srcNum;
+      uint32_t dstNum;
+    };
+
+
     class ALIGNED_INSTRUCTION TypedWriteInstruction : // TODO
       public BasePolicy,
       public TupleSrcPolicy<TypedWriteInstruction>,
@@ -1111,6 +1163,8 @@  namespace ir {
     // TODO
     INLINE bool SampleInstruction::wellFormed(const Function &fn, std::string &why) const
     { return true; }
+    INLINE bool VmeInstruction::wellFormed(const Function &fn, std::string &why) const
+    { return true; }
     INLINE bool TypedWriteInstruction::wellFormed(const Function &fn, std::string &why) const
     { return true; }
     INLINE bool GetImageInfoInstruction::wellFormed(const Function &fn, std::string &why) const
@@ -1502,6 +1556,10 @@ START_INTROSPECTION(LabelInstruction)
 #include "ir/instruction.hxx"
 END_INTROSPECTION(LabelInstruction)
 
+START_INTROSPECTION(VmeInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(VmeInstruction)
+
 #undef END_INTROSPECTION
 #undef START_INTROSPECTION
 #undef DECL_INSN
@@ -1694,6 +1752,10 @@  DECL_MEM_FN(SampleInstruction, Type, getDstType(void), getDstType())
 DECL_MEM_FN(SampleInstruction, uint8_t, getSamplerIndex(void), getSamplerIndex())
 DECL_MEM_FN(SampleInstruction, uint8_t, getSamplerOffset(void), getSamplerOffset())
 DECL_MEM_FN(SampleInstruction, uint8_t, getImageIndex(void), getImageIndex())
+DECL_MEM_FN(VmeInstruction, Type, getSrcType(void), getSrcType())
+DECL_MEM_FN(VmeInstruction, Type, getDstType(void), getDstType())
+DECL_MEM_FN(VmeInstruction, uint8_t, getImageIndex(void), getImageIndex())
+DECL_MEM_FN(VmeInstruction, uint8_t, getMsgType(void), getMsgType())
 DECL_MEM_FN(TypedWriteInstruction, Type, getSrcType(void), getSrcType())
 DECL_MEM_FN(TypedWriteInstruction, Type, getCoordType(void), getCoordType())
 DECL_MEM_FN(TypedWriteInstruction, uint8_t, getImageIndex(void), getImageIndex())
@@ -1932,6 +1994,10 @@ DECL_MEM_FN(GetImageInfoInstruction, uint8_t, getImageIndex(void), getImageIndex
     return internal::SampleInstruction(imageIndex, dst, src, srcNum, dstIsFloat, srcIsFloat, sampler, samplerOffset).convert();
   }
 
+  Instruction VME(uint8_t imageIndex, Tuple dst, Tuple src, uint32_t dstNum, uint32_t srcNum, int msg_type, int vme_search_path_lut, int lut_sub) {
+    return internal::VmeInstruction(imageIndex, dst, src, dstNum, srcNum, msg_type, vme_search_path_lut, lut_sub).convert();
+  }
+
   Instruction TYPED_WRITE(uint8_t imageIndex, Tuple src, uint8_t srcNum, Type srcType, Type coordType) {
     return internal::TypedWriteInstruction(imageIndex, src, srcNum, srcType, coordType).convert();
   }
diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
index 3f3c655..c8da416 100644
--- a/backend/src/ir/instruction.hpp
+++ b/backend/src/ir/instruction.hpp
@@ -191,8 +191,8 @@  namespace ir {
     template <typename T> INLINE bool isMemberOf(void) const {
       return T::isClassOf(*this);
     }
-    /*! max_src for store instruction (vec16 + addr) */
-    static const uint32_t MAX_SRC_NUM = 32;
+    /*! max_src used by vme for payload passing and setting */
+    static const uint32_t MAX_SRC_NUM = 40;
     static const uint32_t MAX_DST_NUM = 32;
   protected:
     BasicBlock *parent;      //!< The basic block containing the instruction
@@ -399,6 +399,17 @@  namespace ir {
     static bool isClassOf(const Instruction &insn);
   };
 
+  /*! Video motion estimation */
+  class VmeInstruction : public Instruction {
+  public:
+    uint8_t getImageIndex() const;
+    uint8_t getMsgType() const;
+    Type getSrcType(void) const;
+    Type getDstType(void) const;
+    /*! Return true if the given instruction is an instance of this class */
+    static bool isClassOf(const Instruction &insn);
+  };
+
   typedef union _ImageInfoKey{
     _ImageInfoKey(uint8_t i, uint8_t t) : index(i), type(t) {};
     _ImageInfoKey(int key) : data(key) {};
@@ -756,6 +767,8 @@ namespace ir {
   Instruction TYPED_WRITE(uint8_t imageIndex, Tuple src, uint8_t srcNum, Type srcType, Type coordType);
   /*! sample textures */
   Instruction SAMPLE(uint8_t imageIndex, Tuple dst, Tuple src, uint8_t srcNum, bool dstIsFloat, bool srcIsFloat, uint8_t sampler, uint8_t samplerOffset);
+  /*! video motion estimation */
+  Instruction VME(uint8_t imageIndex, Tuple dst, Tuple src, uint32_t dstNum, uint32_t srcNum, int msg_type, int vme_search_path_lut, int lut_sub);
   /*! get image information , such as width/height/depth/... */
   Instruction GET_IMAGE_INFO(int infoType, Register dst, uint8_t imageIndex, Register infoReg);
   /*! label labelIndex */
diff --git a/backend/src/ir/instruction.hxx b/backend/src/ir/instruction.hxx
index 81548c9..27d59a9 100644
--- a/backend/src/ir/instruction.hxx
+++ b/backend/src/ir/instruction.hxx
@@ -85,6 +85,7 @@  DECL_INSN(SYNC, SyncInstruction)
 DECL_INSN(LABEL, LabelInstruction)
 DECL_INSN(READ_ARF, ReadARFInstruction)
 DECL_INSN(REGION, RegionInstruction)
+DECL_INSN(VME, VmeInstruction)
 DECL_INSN(INDIRECT_MOV, IndirectMovInstruction)
 DECL_INSN(GET_IMAGE_INFO, GetImageInfoInstruction)
 DECL_INSN(MUL_HI, BinaryInstruction)
diff --git a/backend/src/libocl/include/ocl_misc.h b/backend/src/libocl/include/ocl_misc.h
index 359025b..7d4abab 100644
--- a/backend/src/libocl/include/ocl_misc.h
+++ b/backend/src/libocl/include/ocl_misc.h
@@ -136,5 +136,20 @@  struct time_stamp {
   uint event;
 };
 
+uint __gen_ocl_region(ushort offset, uint data);
+
 struct time_stamp __gen_ocl_get_timestamp(void);
+
+uint8 __gen_ocl_vme(image2d_t, image2d_t,
+                   uint, uint, uint, uint,
+                   uint, uint, uint, uint,
+                   uint, uint, uint, uint,
+                   uint, uint, uint, uint,
+                   uint, uint, uint, uint,
+                   uint, uint, uint, uint,
+                   uint, uint, uint, uint,
+                   uint, uint, uint, uint,
+                   uint, uint, uint, uint,
+                   uint, uint, uint, uint,
+                   int, int, int);
 #endif
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 7299d53..19927ba 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -3541,6 +3541,7 @@  namespace gbe
       case GEN_OCL_REGION:
       case GEN_OCL_SIMD_ID:
       case GEN_OCL_SIMD_SHUFFLE:
+      case GEN_OCL_VME:
         this->newRegister(&I);
         break;
       case GEN_OCL_PRINTF:
@@ -3839,6 +3840,52 @@  namespace gbe
             ctx.READ_ARF(ir::TYPE_U32, dst, ir::ARF_TM);
             break;
           }
+          case GEN_OCL_VME:
+          {
+
+            const uint8_t imageID = getImageID(I);
+
+            AI++;
+            AI++;
+
+            uint32_t src_length = 40;
+
+            vector<ir::Register> dstTupleData, srcTupleData;
+            for (uint32_t i = 0; i < src_length; i++, AI++){
+              srcTupleData.push_back(this->getRegister(*AI));
+            }
+
+            const ir::Tuple srcTuple = ctx.arrayTuple(&srcTupleData[0], src_length);
+
+            Constant *msg_type_cpv = dyn_cast<Constant>(*AI);
+            assert(msg_type_cpv);
+            const ir::Immediate &msg_type_x = processConstantImm(msg_type_cpv);
+            int msg_type = msg_type_x.getIntegerValue();
+            uint32_t dst_length;
+            //msg_type = 1 indicates the inter-search-only mode of the gen vme shared function
+            GBE_ASSERT(msg_type == 1);
+            if(msg_type == 1)
+              dst_length = 6;
+            for (uint32_t elemID = 0; elemID < dst_length; ++elemID) {
+              const ir::Register reg = this->getRegister(&I, elemID);
+              dstTupleData.push_back(reg);
+            }
+            const ir::Tuple dstTuple = ctx.arrayTuple(&dstTupleData[0], dst_length);
+            ++AI;
+            Constant *vme_search_path_lut_cpv = dyn_cast<Constant>(*AI);
+            assert(vme_search_path_lut_cpv);
+            const ir::Immediate &vme_search_path_lut_x = processConstantImm(vme_search_path_lut_cpv);
+            ++AI;
+            Constant *lut_sub_cpv = dyn_cast<Constant>(*AI);
+            assert(lut_sub_cpv);
+            const ir::Immediate &lut_sub_x = processConstantImm(lut_sub_cpv);
+
+            ctx.VME(imageID, dstTuple, srcTuple, dst_length, src_length,
+                    msg_type, vme_search_path_lut_x.getIntegerValue(),
+                    lut_sub_x.getIntegerValue());
+
+            break;
+          }
           case GEN_OCL_REGION:
           {
             const ir::Register dst = this->getRegister(&I);
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index cabb225..3fbf847 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -168,5 +168,7 @@  DECL_LLVM_GEN_FUNCTION(SIMD_SHUFFLE, intel_sub_group_shuffle)
 DECL_LLVM_GEN_FUNCTION(READ_TM, __gen_ocl_read_tm)
 DECL_LLVM_GEN_FUNCTION(REGION, __gen_ocl_region)
 
+DECL_LLVM_GEN_FUNCTION(VME, __gen_ocl_vme)
+
 // printf function
 DECL_LLVM_GEN_FUNCTION(PRINTF, __gen_ocl_printf)
diff --git a/backend/src/llvm/llvm_scalarize.cpp b/backend/src/llvm/llvm_scalarize.cpp
index 7ee5259..dc1d8ab 100644
--- a/backend/src/llvm/llvm_scalarize.cpp
+++ b/backend/src/llvm/llvm_scalarize.cpp
@@ -671,6 +671,10 @@  namespace gbe {
             *CI = InsertToVector(call, *CI);
             break;
           }
+          case GEN_OCL_VME:
+            setAppendPoint(call);
+            extractFromVector(call);
+            break;
         }
       }
     }