[v5,1/4] Add built-in function __gen_ocl_vme.

Submitted by Chuanbo Weng on Nov. 2, 2015, 2:57 a.m.

Details

Message ID 1446433030-13644-1-git-send-email-chuanbo.weng@intel.com
State New
Headers show
Series "Series without cover letter" ( rev: 1 ) in Beignet

Not browsing as part of any series.

Commit Message

Chuanbo Weng Nov. 2, 2015, 2:57 a.m.
__gen_ocl_vme is used for hardware-accelerated video motion estimation.
It takes the payload values as parameters and uses MOVs to copy them into
the payload GRFs of the VME SEND message. The uint8 return value is used
to store the SEND message writeback.

v2:
Remove 5 unnecessary parameters (src_grf*) of the built-in function (we just
need to allocate the related registers in the gen_insn_selection step).

v3:
Remove redundant code and change MAX_SRC_NUM to 40.

v4:
Choose the message response length by message type instead of hard-coding it.

v5:
Choose message response length by message type in the whole backend
pipeline.

Signed-off-by: Chuanbo Weng <chuanbo.weng@intel.com>
---
 backend/src/backend/gen/gen_mesa_disasm.c          | 14 +++++
 backend/src/backend/gen7_instruction.hpp           | 15 +++++
 backend/src/backend/gen_context.cpp                | 72 +++++++++++++++++++++
 backend/src/backend/gen_context.hpp                |  1 +
 backend/src/backend/gen_defs.hpp                   | 19 ++++++
 backend/src/backend/gen_encoder.cpp                | 44 +++++++++++++
 backend/src/backend/gen_encoder.hpp                | 13 ++++
 .../src/backend/gen_insn_gen7_schedule_info.hxx    |  1 +
 backend/src/backend/gen_insn_selection.cpp         | 73 ++++++++++++++++++++++
 backend/src/backend/gen_insn_selection.hpp         | 14 ++++-
 backend/src/backend/gen_insn_selection.hxx         |  1 +
 backend/src/ir/instruction.cpp                     | 66 +++++++++++++++++++
 backend/src/ir/instruction.hpp                     | 17 ++++-
 backend/src/ir/instruction.hxx                     |  1 +
 backend/src/libocl/include/ocl_misc.h              | 15 +++++
 backend/src/llvm/llvm_gen_backend.cpp              | 47 ++++++++++++++
 backend/src/llvm/llvm_gen_ocl_function.hxx         |  2 +
 backend/src/llvm/llvm_scalarize.cpp                |  4 ++
 18 files changed, 414 insertions(+), 5 deletions(-)

Patch hide | download patch | download mbox

diff --git a/backend/src/backend/gen/gen_mesa_disasm.c b/backend/src/backend/gen/gen_mesa_disasm.c
index 5b71cfa..3198da7 100644
--- a/backend/src/backend/gen/gen_mesa_disasm.c
+++ b/backend/src/backend/gen/gen_mesa_disasm.c
@@ -476,6 +476,13 @@  static int column;
 
 static int gen_version;
 
+#define GEN7_BITS_FIELD(inst, gen7) \
+  ({                                                            \
+    int bits;                                                   \
+      bits = ((const union Gen7NativeInstruction *)inst)->gen7; \
+    bits;                                                       \
+  })
+
 #define GEN_BITS_FIELD(inst, gen)                               \
   ({                                                            \
     int bits;                                                   \
@@ -530,6 +537,8 @@  static int gen_version;
 #define EXECUTION_SIZE(inst)       GEN_BITS_FIELD(inst, header.execution_size)
 #define BRANCH_JIP(inst)           GEN_BITS_FIELD2(inst, bits3.gen7_branch.jip, bits3.gen8_branch.jip/8)
 #define BRANCH_UIP(inst)           GEN_BITS_FIELD2(inst, bits3.gen7_branch.uip, bits2.gen8_branch.uip/8)
+#define VME_BTI(inst)              GEN7_BITS_FIELD(inst, bits3.vme_gen7.bti)
+#define VME_MSG_TYPE(inst)         GEN7_BITS_FIELD(inst, bits3.vme_gen7.msg_type)
 #define SAMPLE_BTI(inst)           GEN_BITS_FIELD(inst, bits3.sampler_gen7.bti)
 #define SAMPLER(inst)              GEN_BITS_FIELD(inst, bits3.sampler_gen7.sampler)
 #define SAMPLER_MSG_TYPE(inst)     GEN_BITS_FIELD(inst, bits3.sampler_gen7.msg_type)
@@ -1431,6 +1440,11 @@  int gen_disasm (FILE *file, const void *inst, uint32_t deviceID, uint32_t compac
 
     if (GEN_BITS_FIELD2(inst, bits1.da1.src1_reg_file, bits2.da1.src1_reg_file) == GEN_IMMEDIATE_VALUE) {
       switch (target) {
+        case GEN_SFID_VIDEO_MOTION_EST:
+          format(file, " (bti: %d, msg_type: %d)",
+                 VME_BTI(inst),
+                 VME_MSG_TYPE(inst));
+          break;
         case GEN_SFID_SAMPLER:
           format(file, " (%d, %d, %d, %d)",
                  SAMPLE_BTI(inst),
diff --git a/backend/src/backend/gen7_instruction.hpp b/backend/src/backend/gen7_instruction.hpp
index 51f342b..258dd24 100644
--- a/backend/src/backend/gen7_instruction.hpp
+++ b/backend/src/backend/gen7_instruction.hpp
@@ -350,6 +350,21 @@  union Gen7NativeInstruction
         uint32_t end_of_thread:1;
       } sampler_gen7;
 
+      struct {
+        uint32_t bti:8;
+        uint32_t vme_search_path_lut:3;
+        uint32_t lut_sub:2;
+        uint32_t msg_type:2;
+        uint32_t stream_in:1;
+        uint32_t stream_out:1;
+        uint32_t reserved_mbz:2;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad1:2;
+        uint32_t end_of_thread:1;
+      } vme_gen7;
+
       /**
        * Message for the Sandybridge Sampler Cache or Constant Cache Data Port.
        *
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 4e2ebfb..64fae16 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -2266,6 +2266,78 @@  namespace gbe
     p->SAMPLE(dst, msgPayload, msgLen, false, bti, sampler, simdWidth, -1, 0, insn.extra.isLD, insn.extra.isUniform);
   }
 
+  void GenContext::emitVmeInstruction(const SelectionInstruction &insn) {
+    /* Fill the VME payload GRFs with scalar MOVs, then emit the VME SEND. */
+    const GenRegister dst = ra->genReg(insn.dst(0));
+    const unsigned int msg_type = insn.extra.msg_type;
+    GBE_ASSERT(msg_type == VME_INTER_SEARCH_ONLY);
+    /* Response length in GRFs. Always initialized, so a build in which
+     * GBE_ASSERT does not abort never reads an indeterminate value. */
+    const int rsp_len = (msg_type == VME_INTER_SEARCH_ONLY) ? 6 : 0;
+    /* Use MOV to Setup bits of payload */
+    p->push();
+    p->curr.predicate = GEN_PREDICATE_NONE;
+    p->curr.noMask = 1;
+    p->curr.execWidth = 1;
+    for(int i=0; i < 2; i++){
+      for(int k = 0; k < 2; k++){
+        GenRegister payload_grf = ra->genReg(insn.dst(rsp_len+i));
+        payload_grf.nr += k;
+        payload_grf.vstride = GEN_VERTICAL_STRIDE_0;
+        payload_grf.width = GEN_WIDTH_1;
+        payload_grf.hstride = GEN_HORIZONTAL_STRIDE_0;
+        payload_grf.subphysical = 1;
+        for(int j=0; j < 8; j++){
+          payload_grf.subnr = (7 - j) * typeSize(GEN_TYPE_UD);
+          GenRegister payload_val = ra->genReg(insn.src(i*16+k*8+j));
+          payload_val.vstride = GEN_VERTICAL_STRIDE_0;
+          payload_val.width = GEN_WIDTH_1;
+          payload_val.hstride = GEN_HORIZONTAL_STRIDE_0;
+
+          p->MOV(payload_grf, payload_val);
+        }
+      }
+    }
+    {
+      int i = 2;
+      GenRegister payload_grf = ra->genReg(insn.dst(rsp_len+i));
+      payload_grf.vstride = GEN_VERTICAL_STRIDE_0;
+      payload_grf.width = GEN_WIDTH_1;
+      payload_grf.hstride = GEN_HORIZONTAL_STRIDE_0;
+      payload_grf.subphysical = 1;
+      for(int j=0; j < 8; j++){
+        payload_grf.subnr = (7 - j) * typeSize(GEN_TYPE_UD);
+        GenRegister payload_val = ra->genReg(insn.src(i*16+j));
+        payload_val.vstride = GEN_VERTICAL_STRIDE_0;
+        payload_val.width = GEN_WIDTH_1;
+        payload_val.hstride = GEN_HORIZONTAL_STRIDE_0;
+
+        p->MOV(payload_grf, payload_val);
+      }
+    }
+    p->pop();
+
+    p->push();
+    p->curr.predicate = GEN_PREDICATE_NONE;
+    p->curr.noMask = 1;
+    p->curr.execWidth = 1;
+    GenRegister payload_did = GenRegister::retype(ra->genReg(insn.dst(rsp_len)), GEN_TYPE_UB);
+    payload_did.vstride = GEN_VERTICAL_STRIDE_0;
+    payload_did.width = GEN_WIDTH_1;
+    payload_did.hstride = GEN_HORIZONTAL_STRIDE_0;
+    payload_did.subphysical = 1;
+    payload_did.subnr = 20 * typeSize(GEN_TYPE_UB);
+    GenRegister grf0 = GenRegister::ub1grf(0, 20);
+    p->MOV(payload_did, grf0);
+    p->pop();
+
+    const GenRegister msgPayload = ra->genReg(insn.dst(rsp_len));
+    const unsigned char bti = insn.getbti();
+    const unsigned int vme_search_path_lut = insn.extra.vme_search_path_lut;
+    const unsigned int lut_sub = insn.extra.lut_sub;
+    p->VME(bti, dst, msgPayload, msg_type, vme_search_path_lut, lut_sub);
+  }
+
   void GenContext::scratchWrite(const GenRegister header, uint32_t offset, uint32_t reg_num, uint32_t reg_type, uint32_t channel_mode) {
     p->push();
     uint32_t simdWidth = p->curr.execWidth;
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index 4044694..870266c 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -167,6 +167,7 @@  namespace gbe
     virtual void emitUnpackLongInstruction(const SelectionInstruction &insn);
     void emitDWordGatherInstruction(const SelectionInstruction &insn);
     void emitSampleInstruction(const SelectionInstruction &insn);
+    void emitVmeInstruction(const SelectionInstruction &insn);
     void emitTypedWriteInstruction(const SelectionInstruction &insn);
     void emitSpillRegInstruction(const SelectionInstruction &insn);
     void emitUnSpillRegInstruction(const SelectionInstruction &insn);
diff --git a/backend/src/backend/gen_defs.hpp b/backend/src/backend/gen_defs.hpp
index 1b550ac..50e1e90 100644
--- a/backend/src/backend/gen_defs.hpp
+++ b/backend/src/backend/gen_defs.hpp
@@ -424,6 +424,10 @@  enum GenMessageTarget {
 #define GEN_SAMPLER_SIMD_MODE_SIMD16                    2
 #define GEN_SAMPLER_SIMD_MODE_SIMD32_64                 3
 
+#define VME_INTER_SEARCH_ONLY                              1
+#define VME_INTRA_SEARCH_ONLY                              2
+#define VME_INTER_AND_INTRA_SEARCH                         3
+
 #define GEN_MATH_FUNCTION_INV                              1
 #define GEN_MATH_FUNCTION_LOG                              2
 #define GEN_MATH_FUNCTION_EXP                              3
@@ -615,6 +619,21 @@  union GenNativeInstruction
         uint32_t end_of_thread:1;
       } sampler_gen7;
 
+      struct {
+        uint32_t bti:8;
+        uint32_t vme_search_path_lut:3;
+        uint32_t lut_sub:2;
+        uint32_t msg_type:2;
+        uint32_t stream_in:1;
+        uint32_t stream_out:1;
+        uint32_t reserved_mbz:2;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad1:2;
+        uint32_t end_of_thread:1;
+      } vme_gen7;
+
       /**
        * Message for the Sandybridge Sampler Cache or Constant Cache Data Port.
        *
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index 2cc51cc..be38cef 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -1191,6 +1191,50 @@  namespace gbe
                        simd_mode, return_format);
   }
 
+  void GenEncoder::setVmeMessage(GenNativeInstruction *insn,
+                                unsigned char bti,
+                                uint32_t response_length,
+                                uint32_t msg_length,
+                                uint32_t msg_type,
+                                unsigned char vme_search_path_lut,
+                                unsigned char lut_sub)
+  {
+     const GenMessageTarget sfid = GEN_SFID_VIDEO_MOTION_EST;
+     setMessageDescriptor(insn, sfid, msg_length, response_length, true);
+     insn->bits3.vme_gen7.bti = bti;
+     insn->bits3.vme_gen7.vme_search_path_lut = vme_search_path_lut;
+     insn->bits3.vme_gen7.lut_sub = lut_sub;
+     insn->bits3.vme_gen7.msg_type = msg_type;
+     insn->bits3.vme_gen7.stream_in = 0;
+     insn->bits3.vme_gen7.stream_out = 0;
+     insn->bits3.vme_gen7.reserved_mbz = 0;
+
+  }
+
+  void GenEncoder::VME(unsigned char bti,
+                       GenRegister dest,
+                       GenRegister msg,
+                       uint32_t msg_type,
+                       uint32_t vme_search_path_lut,
+                       uint32_t lut_sub)
+  {
+    /* Emit the VME SEND. Currently only inter search is supported; other
+     * modes will be added in the future.
+     */
+    GBE_ASSERT(msg_type == VME_INTER_SEARCH_ONLY);
+    uint32_t msg_length = 0, response_length = 0; /* defined even if the assert does not abort */
+    if(msg_type == VME_INTER_SEARCH_ONLY){
+      msg_length = 5;
+      response_length = 6;
+    }
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+    this->setHeader(insn);
+    this->setDst(insn, dest);
+    this->setSrc0(insn, msg);
+    setVmeMessage(insn, bti, response_length, msg_length,
+                  msg_type, vme_search_path_lut, lut_sub);
+  }
+
   void GenEncoder::TYPED_WRITE(GenRegister msg, bool header_present, unsigned char bti)
   {
      GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
index f2bb5ab..6df7087 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -203,6 +203,19 @@  namespace gbe
                            bool header_present,
                            uint32_t simd_mode,
                            uint32_t return_format);
+    virtual void VME(unsigned char bti,
+                         GenRegister dest,
+                         GenRegister msg,
+                         uint32_t msg_type,
+                         uint32_t vme_search_path_lut,
+                         uint32_t lut_sub);
+    void setVmeMessage(GenNativeInstruction *insn,
+                          unsigned char bti,
+                          uint32_t response_length,
+                          uint32_t msg_length,
+                          uint32_t msg_type,
+                          unsigned char vme_search_path_lut,
+                          unsigned char lut_sub);
 
     /*! TypedWrite instruction for texture */
     virtual void TYPED_WRITE(GenRegister header,
diff --git a/backend/src/backend/gen_insn_gen7_schedule_info.hxx b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
index 9b60c17..878e0e7 100644
--- a/backend/src/backend/gen_insn_gen7_schedule_info.hxx
+++ b/backend/src/backend/gen_insn_gen7_schedule_info.hxx
@@ -36,6 +36,7 @@  DECL_GEN7_SCHEDULE(UnpackByte,      40,        1,        1)
 DECL_GEN7_SCHEDULE(PackLong,        40,        1,        1)
 DECL_GEN7_SCHEDULE(UnpackLong,      40,        1,        1)
 DECL_GEN7_SCHEDULE(Sample,          160,       1,        1)
+DECL_GEN7_SCHEDULE(Vme,             320,       1,        1)
 DECL_GEN7_SCHEDULE(TypedWrite,      80,        1,        1)
 DECL_GEN7_SCHEDULE(SpillReg,        20,        1,        1)
 DECL_GEN7_SCHEDULE(UnSpillReg,      160,       1,        1)
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index fafd927..563b7f4 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -187,6 +187,7 @@  namespace gbe
            this->opcode == SEL_OP_ATOMIC       ||
            this->opcode == SEL_OP_BYTE_GATHER  ||
            this->opcode == SEL_OP_SAMPLE ||
+           this->opcode == SEL_OP_VME ||
            this->opcode == SEL_OP_DWORD_GATHER;
   }
 
@@ -661,6 +662,8 @@  namespace gbe
     void ALU3(SelectionOpcode opcode, Reg dst, Reg src0, Reg src1, Reg src2);
     /*! Encode sample instructions */
     void SAMPLE(GenRegister *dst, uint32_t dstNum, GenRegister *msgPayloads, uint32_t msgNum, uint32_t bti, uint32_t sampler, bool isLD, bool isUniform);
+    /*! Encode vme instructions */
+    void VME(uint32_t bti, GenRegister *dst, GenRegister *payloadVal, uint32_t dstNum, uint32_t srcNum, uint32_t msg_type, uint32_t vme_search_path_lut, uint32_t lut_sub);
     /*! Encode typed write instructions */
     void TYPED_WRITE(GenRegister *msgs, uint32_t msgNum, uint32_t bti, bool is3D);
     /*! Get image information */
@@ -2120,6 +2123,34 @@  namespace gbe
     insn->extra.isUniform = isUniform;
   }
 
+  void Selection::Opaque::VME(uint32_t bti, GenRegister *dst, GenRegister *payloadVal,
+                              uint32_t dstNum, uint32_t srcNum, uint32_t msg_type,
+                              uint32_t vme_search_path_lut, uint32_t lut_sub) {
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_VME, dstNum, srcNum);
+    SelectionVector *dstVector = this->appendVector();
+    SelectionVector *msgVector = this->appendVector();
+
+    for (uint32_t elemID = 0; elemID < dstNum; ++elemID)
+      insn->dst(elemID) = dst[elemID];
+    for (uint32_t elemID = 0; elemID < srcNum; ++elemID)
+      insn->src(elemID) = payloadVal[elemID];
+
+    dstVector->regNum = dstNum;
+    dstVector->isSrc = 0;
+    dstVector->offsetID = 0;
+    dstVector->reg = &insn->dst(0);
+
+    msgVector->regNum = srcNum;
+    msgVector->isSrc = 1;
+    msgVector->offsetID = 0;
+    msgVector->reg = &insn->src(0);
+
+    insn->setbti(bti);
+    insn->extra.msg_type = msg_type;
+    insn->extra.vme_search_path_lut = vme_search_path_lut;
+    insn->extra.lut_sub = lut_sub;
+  }
+
   ///////////////////////////////////////////////////////////////////////////
   // Code selection public implementation
   ///////////////////////////////////////////////////////////////////////////
@@ -5126,6 +5157,47 @@  namespace gbe
     DECL_CTOR(SampleInstruction, 1, 1);
   };
 
+  DECL_PATTERN(VmeInstruction)
+  {
+    INLINE bool emitOne(Selection::Opaque &sel, const ir::VmeInstruction &insn, bool &markChildren) const
+    {
+      using namespace ir;
+      uint32_t msg_type, vme_search_path_lut, lut_sub;
+      msg_type = insn.getMsgType();
+      vme_search_path_lut = 0;
+      lut_sub = 0;
+      GBE_ASSERT(msg_type == 1);
+      uint32_t payloadLen = 0;
+      //We allocate 5 virtual payload grfs to selection dst register.
+      if(msg_type == 1){
+        payloadLen = 5;
+      }
+      uint32_t selDstNum = insn.getDstNum() + payloadLen;
+      uint32_t srcNum = insn.getSrcNum();
+      vector<GenRegister> dst(selDstNum);
+      vector<GenRegister> payloadVal(srcNum);
+      uint32_t valueID = 0;
+      for (valueID = 0; valueID < insn.getDstNum(); ++valueID)
+        dst[valueID] = sel.selReg(insn.getDst(valueID), insn.getDstType());
+      for (valueID = insn.getDstNum(); valueID < selDstNum; ++valueID)
+        dst[valueID] = sel.selReg(sel.reg(FAMILY_DWORD), TYPE_U32);
+
+      for (valueID = 0; valueID < srcNum; ++valueID)
+        payloadVal[valueID] = sel.selReg(insn.getSrc(valueID), insn.getSrcType());
+
+      uint32_t bti = insn.getImageIndex();
+      if (bti > BTI_MAX_ID) {
+        std::cerr << "Too large bti " << bti;
+        return false;
+      }
+
+      sel.VME(bti, dst.data(), payloadVal.data(), selDstNum, srcNum, msg_type, vme_search_path_lut, lut_sub);
+
+      return true;
+    }
+    DECL_CTOR(VmeInstruction, 1, 1);
+  };
+
   /*! Typed write instruction pattern. */
   DECL_PATTERN(TypedWriteInstruction)
   {
@@ -5591,6 +5663,7 @@  namespace gbe
     this->insert<MulAddInstructionPattern>();
     this->insert<SelectModifierInstructionPattern>();
     this->insert<SampleInstructionPattern>();
+    this->insert<VmeInstructionPattern>();
     this->insert<GetImageInfoInstructionPattern>();
     this->insert<ReadARFInstructionPattern>();
     this->insert<RegionInstructionPattern>();
diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp
index 4efb80b..1de8986 100644
--- a/backend/src/backend/gen_insn_selection.hpp
+++ b/backend/src/backend/gen_insn_selection.hpp
@@ -90,8 +90,8 @@  namespace gbe
     const GenRegister &dst(uint32_t dstID) const { return regs[dstID]; }
     /*! Damn C++ */
     const GenRegister &src(uint32_t srcID) const { return regs[dstNum+srcID]; }
-    /*! No more than 9 sources (used by typed writes on simd8 mode.) */
-    enum { MAX_SRC_NUM = 9 };
+    /*! No more than 40 sources (40 sources are used by vme for payload passing and setting) */
+    enum { MAX_SRC_NUM = 40 };
     /*! No more than 16 destinations (15 used by I64DIV/I64REM) */
     enum { MAX_DST_NUM = 16 };
     /*! State of the instruction (extra fields neeed for the encoding) */
@@ -129,6 +129,12 @@  namespace gbe
         bool     isLD;  // is this a ld message?
         bool     isUniform;
       };
+      struct {
+        uint16_t vme_bti:8;
+        uint16_t msg_type:2;
+        uint16_t vme_search_path_lut:3;
+        uint16_t lut_sub:2;
+      };
       uint32_t barrierType;
       bool longjmp;
       uint32_t indirect_offset;
@@ -138,7 +144,7 @@  namespace gbe
     /*! Number of destinations */
     uint8_t dstNum:5;
     /*! Number of sources */
-    uint8_t srcNum:4;
+    uint8_t srcNum:6;
     /*! To store various indices */
     uint32_t index;
     /*! For BRC/IF to store the UIP */
@@ -152,6 +158,7 @@  namespace gbe
       switch (opcode) {
         case SEL_OP_DWORD_GATHER: return extra.function;
         case SEL_OP_SAMPLE: return extra.rdbti;
+        case SEL_OP_VME: return extra.vme_bti;
         case SEL_OP_TYPED_WRITE: return extra.bti;
         default:
           GBE_ASSERT(0);
@@ -164,6 +171,7 @@  namespace gbe
       switch (opcode) {
         case SEL_OP_DWORD_GATHER: extra.function = bti; return;
         case SEL_OP_SAMPLE: extra.rdbti = bti; return;
+        case SEL_OP_VME: extra.vme_bti = bti; return;
         case SEL_OP_TYPED_WRITE: extra.bti = bti; return;
         default:
           GBE_ASSERT(0);
diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx
index 479398b..4d3e921 100644
--- a/backend/src/backend/gen_insn_selection.hxx
+++ b/backend/src/backend/gen_insn_selection.hxx
@@ -65,6 +65,7 @@  DECL_SELECTION_IR(UNPACK_BYTE, UnpackByteInstruction)
 DECL_SELECTION_IR(PACK_LONG, PackLongInstruction)
 DECL_SELECTION_IR(UNPACK_LONG, UnpackLongInstruction)
 DECL_SELECTION_IR(SAMPLE, SampleInstruction)
+DECL_SELECTION_IR(VME, VmeInstruction)
 DECL_SELECTION_IR(TYPED_WRITE, TypedWriteInstruction)
 DECL_SELECTION_IR(SPILL_REG, SpillRegInstruction)
 DECL_SELECTION_IR(UNSPILL_REG, UnSpillRegInstruction)
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index f93c528..7bf787e 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -595,6 +595,58 @@  namespace ir {
       static const uint32_t dstNum = 4;
     };
 
+    class ALIGNED_INSTRUCTION VmeInstruction :
+      public BasePolicy,
+      public TupleSrcPolicy<VmeInstruction>,
+      public TupleDstPolicy<VmeInstruction>
+    {
+    public:
+      VmeInstruction(uint8_t imageIdx, Tuple dstTuple, Tuple srcTuple,
+                     uint32_t dstNum, uint32_t srcNum, int msg_type,
+                     int vme_search_path_lut, int lut_sub) {
+        this->opcode = OP_VME;
+        this->dst = dstTuple;
+        this->src = srcTuple;
+        this->dstNum = dstNum;
+        this->srcNum = srcNum;
+        this->imageIdx = imageIdx;
+        this->msg_type = msg_type;
+        this->vme_search_path_lut = vme_search_path_lut;
+        this->lut_sub = lut_sub;
+      }
+      INLINE bool wellFormed(const Function &fn, std::string &why) const;
+      INLINE void out(std::ostream &out, const Function &fn) const {
+        this->outOpcode(out);
+        out << " src_surface id " << (int)this->getImageIndex()
+            << " ref_surface id " << (int)this->getImageIndex() + 1;
+        for(uint32_t i = 0; i < dstNum; i++){
+          out<< " %" << this->getDst(fn, i);
+        }
+        for(uint32_t i = 0; i < srcNum; i++){
+          out<< " %" << this->getSrc(fn, i);
+        }
+        out
+            << " msg_type " << (int)this->getMsgType()
+            << " vme_search_path_lut " << (int)this->vme_search_path_lut
+            << " lut_sub " << (int)this->lut_sub;
+      }
+      Tuple src;
+      Tuple dst;
+
+      INLINE uint8_t getImageIndex(void) const { return this->imageIdx; }
+      INLINE uint8_t getMsgType(void) const { return this->msg_type; }
+
+      INLINE Type getSrcType(void) const { return TYPE_U32; }
+      INLINE Type getDstType(void) const { return TYPE_U32; }
+      uint8_t imageIdx;
+      uint8_t msg_type;
+      uint8_t vme_search_path_lut;
+      uint8_t lut_sub;
+      uint32_t srcNum;
+      uint32_t dstNum;
+    };
+
+
     class ALIGNED_INSTRUCTION TypedWriteInstruction : // TODO
       public BasePolicy,
       public TupleSrcPolicy<TypedWriteInstruction>,
@@ -1111,6 +1163,8 @@  namespace ir {
     // TODO
     INLINE bool SampleInstruction::wellFormed(const Function &fn, std::string &why) const
     { return true; }
+    INLINE bool VmeInstruction::wellFormed(const Function &fn, std::string &why) const
+    { return true; }
     INLINE bool TypedWriteInstruction::wellFormed(const Function &fn, std::string &why) const
     { return true; }
     INLINE bool GetImageInfoInstruction::wellFormed(const Function &fn, std::string &why) const
@@ -1502,6 +1556,10 @@  START_INTROSPECTION(LabelInstruction)
 #include "ir/instruction.hxx"
 END_INTROSPECTION(LabelInstruction)
 
+START_INTROSPECTION(VmeInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(VmeInstruction)
+
 #undef END_INTROSPECTION
 #undef START_INTROSPECTION
 #undef DECL_INSN
@@ -1694,6 +1752,10 @@  DECL_MEM_FN(SampleInstruction, Type, getDstType(void), getDstType())
 DECL_MEM_FN(SampleInstruction, uint8_t, getSamplerIndex(void), getSamplerIndex())
 DECL_MEM_FN(SampleInstruction, uint8_t, getSamplerOffset(void), getSamplerOffset())
 DECL_MEM_FN(SampleInstruction, uint8_t, getImageIndex(void), getImageIndex())
+DECL_MEM_FN(VmeInstruction, Type, getSrcType(void), getSrcType())
+DECL_MEM_FN(VmeInstruction, Type, getDstType(void), getDstType())
+DECL_MEM_FN(VmeInstruction, uint8_t, getImageIndex(void), getImageIndex())
+DECL_MEM_FN(VmeInstruction, uint8_t, getMsgType(void), getMsgType())
 DECL_MEM_FN(TypedWriteInstruction, Type, getSrcType(void), getSrcType())
 DECL_MEM_FN(TypedWriteInstruction, Type, getCoordType(void), getCoordType())
 DECL_MEM_FN(TypedWriteInstruction, uint8_t, getImageIndex(void), getImageIndex())
@@ -1932,6 +1994,10 @@  DECL_MEM_FN(GetImageInfoInstruction, uint8_t, getImageIndex(void), getImageIndex
     return internal::SampleInstruction(imageIndex, dst, src, srcNum, dstIsFloat, srcIsFloat, sampler, samplerOffset).convert();
   }
 
+  Instruction VME(uint8_t imageIndex, Tuple dst, Tuple src, uint32_t dstNum, uint32_t srcNum, int msg_type, int vme_search_path_lut, int lut_sub) {
+    return internal::VmeInstruction(imageIndex, dst, src, dstNum, srcNum, msg_type, vme_search_path_lut, lut_sub).convert();
+  }
+
   Instruction TYPED_WRITE(uint8_t imageIndex, Tuple src, uint8_t srcNum, Type srcType, Type coordType) {
     return internal::TypedWriteInstruction(imageIndex, src, srcNum, srcType, coordType).convert();
   }
diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
index 3f3c655..c8da416 100644
--- a/backend/src/ir/instruction.hpp
+++ b/backend/src/ir/instruction.hpp
@@ -191,8 +191,8 @@  namespace ir {
     template <typename T> INLINE bool isMemberOf(void) const {
       return T::isClassOf(*this);
     }
-    /*! max_src for store instruction (vec16 + addr) */
-    static const uint32_t MAX_SRC_NUM = 32;
+    /*! max_src used by vme for payload passing and setting */
+    static const uint32_t MAX_SRC_NUM = 40;
     static const uint32_t MAX_DST_NUM = 32;
   protected:
     BasicBlock *parent;      //!< The basic block containing the instruction
@@ -399,6 +399,17 @@  namespace ir {
     static bool isClassOf(const Instruction &insn);
   };
 
+  /*! Video motion estimation */
+  class VmeInstruction : public Instruction {
+  public:
+    uint8_t getImageIndex() const;
+    uint8_t getMsgType() const;
+    Type getSrcType(void) const;
+    Type getDstType(void) const;
+    /*! Return true if the given instruction is an instance of this class */
+    static bool isClassOf(const Instruction &insn);
+  };
+
   typedef union _ImageInfoKey{
     _ImageInfoKey(uint8_t i, uint8_t t) : index(i), type(t) {};
     _ImageInfoKey(int key) : data(key) {};
@@ -756,6 +767,8 @@  namespace ir {
   Instruction TYPED_WRITE(uint8_t imageIndex, Tuple src, uint8_t srcNum, Type srcType, Type coordType);
   /*! sample textures */
   Instruction SAMPLE(uint8_t imageIndex, Tuple dst, Tuple src, uint8_t srcNum, bool dstIsFloat, bool srcIsFloat, uint8_t sampler, uint8_t samplerOffset);
+  /*! video motion estimation */
+  Instruction VME(uint8_t imageIndex, Tuple dst, Tuple src, uint32_t dstNum, uint32_t srcNum, int msg_type, int vme_search_path_lut, int lut_sub);
   /*! get image information , such as width/height/depth/... */
   Instruction GET_IMAGE_INFO(int infoType, Register dst, uint8_t imageIndex, Register infoReg);
   /*! label labelIndex */
diff --git a/backend/src/ir/instruction.hxx b/backend/src/ir/instruction.hxx
index 81548c9..27d59a9 100644
--- a/backend/src/ir/instruction.hxx
+++ b/backend/src/ir/instruction.hxx
@@ -85,6 +85,7 @@  DECL_INSN(SYNC, SyncInstruction)
 DECL_INSN(LABEL, LabelInstruction)
 DECL_INSN(READ_ARF, ReadARFInstruction)
 DECL_INSN(REGION, RegionInstruction)
+DECL_INSN(VME, VmeInstruction)
 DECL_INSN(INDIRECT_MOV, IndirectMovInstruction)
 DECL_INSN(GET_IMAGE_INFO, GetImageInfoInstruction)
 DECL_INSN(MUL_HI, BinaryInstruction)
diff --git a/backend/src/libocl/include/ocl_misc.h b/backend/src/libocl/include/ocl_misc.h
index 359025b..7d4abab 100644
--- a/backend/src/libocl/include/ocl_misc.h
+++ b/backend/src/libocl/include/ocl_misc.h
@@ -136,5 +136,20 @@  struct time_stamp {
   uint event;
 };
 
+uint __gen_ocl_region(ushort offset, uint data);
+
 struct time_stamp __gen_ocl_get_timestamp(void);
+
+uint8 __gen_ocl_vme(image2d_t, image2d_t,
+                   uint, uint, uint, uint,
+                   uint, uint, uint, uint,
+                   uint, uint, uint, uint,
+                   uint, uint, uint, uint,
+                   uint, uint, uint, uint,
+                   uint, uint, uint, uint,
+                   uint, uint, uint, uint,
+                   uint, uint, uint, uint,
+                   uint, uint, uint, uint,
+                   uint, uint, uint, uint,
+                   int, int, int);
 #endif
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 3d76265..bfbd072 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -3541,6 +3541,7 @@  namespace gbe
       case GEN_OCL_REGION:
       case GEN_OCL_SIMD_ID:
       case GEN_OCL_SIMD_SHUFFLE:
+      case GEN_OCL_VME:
         this->newRegister(&I);
         break;
       case GEN_OCL_PRINTF:
@@ -3839,6 +3840,52 @@  namespace gbe
             ctx.READ_ARF(ir::TYPE_U32, dst, ir::ARF_TM);
             break;
           }
+          case GEN_OCL_VME:
+          {
+
+            const uint8_t imageID = getImageID(I);
+            // Skip the two image2d_t arguments of __gen_ocl_vme.
+            AI++;
+            AI++;
+
+            uint32_t src_length = 40;
+
+            vector<ir::Register> dstTupleData, srcTupleData;
+            for (uint32_t i = 0; i < src_length; i++, AI++){
+              srcTupleData.push_back(this->getRegister(*AI));
+            }
+
+            const ir::Tuple srcTuple = ctx.arrayTuple(&srcTupleData[0], src_length);
+
+            Constant *msg_type_cpv = dyn_cast<Constant>(*AI);
+            assert(msg_type_cpv);
+            const ir::Immediate &msg_type_x = processConstantImm(msg_type_cpv);
+            int msg_type = msg_type_x.getIntegerValue();
+            uint32_t dst_length = 0; // stays defined even if the assert below does not abort
+            //msg_type == 1 indicates inter search only of the gen vme shared function
+            GBE_ASSERT(msg_type == 1);
+            if(msg_type == 1)
+              dst_length = 6;
+            for (uint32_t elemID = 0; elemID < dst_length; ++elemID) {
+              const ir::Register reg = this->getRegister(&I, elemID);
+              dstTupleData.push_back(reg);
+            }
+            const ir::Tuple dstTuple = ctx.arrayTuple(&dstTupleData[0], dst_length);
+            ++AI;
+            Constant *vme_search_path_lut_cpv = dyn_cast<Constant>(*AI);
+            assert(vme_search_path_lut_cpv);
+            const ir::Immediate &vme_search_path_lut_x = processConstantImm(vme_search_path_lut_cpv);
+            ++AI;
+            Constant *lut_sub_cpv = dyn_cast<Constant>(*AI);
+            assert(lut_sub_cpv);
+            const ir::Immediate &lut_sub_x = processConstantImm(lut_sub_cpv);
+
+            ctx.VME(imageID, dstTuple, srcTuple, dst_length, src_length,
+                    msg_type, vme_search_path_lut_x.getIntegerValue(),
+                    lut_sub_x.getIntegerValue());
+
+            break;
+          }
           case GEN_OCL_REGION:
           {
             const ir::Register dst = this->getRegister(&I);
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index cabb225..3fbf847 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -168,5 +168,7 @@  DECL_LLVM_GEN_FUNCTION(SIMD_SHUFFLE, intel_sub_group_shuffle)
 DECL_LLVM_GEN_FUNCTION(READ_TM, __gen_ocl_read_tm)
 DECL_LLVM_GEN_FUNCTION(REGION, __gen_ocl_region)
 
+DECL_LLVM_GEN_FUNCTION(VME, __gen_ocl_vme)
+
 // printf function
 DECL_LLVM_GEN_FUNCTION(PRINTF, __gen_ocl_printf)
diff --git a/backend/src/llvm/llvm_scalarize.cpp b/backend/src/llvm/llvm_scalarize.cpp
index 7ee5259..dc1d8ab 100644
--- a/backend/src/llvm/llvm_scalarize.cpp
+++ b/backend/src/llvm/llvm_scalarize.cpp
@@ -671,6 +671,10 @@  namespace gbe {
             *CI = InsertToVector(call, *CI);
             break;
           }
+          case GEN_OCL_VME:
+            setAppendPoint(call);
+            extractFromVector(call);
+            break;
         }
       }
     }