[2/4] add extensions intel_accelerator and basic intel_motion_estimation

Submitted by Chuanbo Weng on Aug. 10, 2015, 5:48 a.m.

Details

Message ID 1439185738-6437-1-git-send-email-chuanbo.weng@intel.com
State New
Headers show

Not browsing as part of any series.

Commit Message

Chuanbo Weng Aug. 10, 2015, 5:48 a.m.
From: Guo Yejun <yejun.guo@intel.com>

Signed-off-by: Guo Yejun <yejun.guo@intel.com>
Signed-off-by: Chuanbo Weng <chuanbo.weng@intel.com>
---
 include/CL/cl_ext.h                                | 103 +++++++++
 src/CMakeLists.txt                                 |   4 +-
 src/cl_accelerator_intel.c                         |  86 ++++++++
 src/cl_accelerator_intel.h                         |  29 +++
 src/cl_api.c                                       | 106 +++++++++-
 src/cl_command_queue.c                             |  14 +-
 src/cl_command_queue_gen7.c                        |   5 +-
 src/cl_context.c                                   |   1 +
 src/cl_context.h                                   |   3 +
 src/cl_driver.h                                    |  21 ++
 src/cl_driver_defs.c                               |   2 +
 src/cl_extensions.c                                |   6 +-
 src/cl_extensions.h                                |   8 +
 src/cl_gen7_device.h                               |   5 +-
 src/cl_gt_device.h                                 |   6 +-
 src/cl_internals.h                                 |   1 +
 src/cl_kernel.c                                    |  55 ++++-
 src/cl_kernel.h                                    |   6 +-
 src/cl_utils.h                                     |  12 ++
 src/intel/intel_gpgpu.c                            | 219 ++++++++++++++++++-
 src/intel/intel_structs.h                          | 120 +++++++++++
 .../cl_internal_block_motion_estimate_intel.cl     | 231 +++++++++++++++++++++
 22 files changed, 1011 insertions(+), 32 deletions(-)
 create mode 100644 src/cl_accelerator_intel.c
 create mode 100644 src/cl_accelerator_intel.h
 create mode 100644 src/kernels/cl_internal_block_motion_estimate_intel.cl

Patch hide | download patch | download mbox

diff --git a/include/CL/cl_ext.h b/include/CL/cl_ext.h
index 710bea8..0a66d70 100644
--- a/include/CL/cl_ext.h
+++ b/include/CL/cl_ext.h
@@ -184,6 +184,109 @@  typedef CL_API_ENTRY cl_int (CL_API_CALL *clTerminateContextKHR_fn)(cl_context /
 #define CL_PRINTF_CALLBACK_ARM                      0x40B0
 #define CL_PRINTF_BUFFERSIZE_ARM                    0x40B1
 
+/*********************************
+* cl_intel_accelerator extension *
+*********************************/
+#define cl_intel_accelerator 1
+#define cl_intel_motion_estimation 1
+
+typedef struct _cl_accelerator_intel*     cl_accelerator_intel;
+typedef cl_uint                           cl_accelerator_type_intel;
+typedef cl_uint                           cl_accelerator_info_intel;
+
+typedef struct _cl_motion_estimation_desc_intel {
+    cl_uint mb_block_type;
+    cl_uint subpixel_mode;
+    cl_uint sad_adjust_mode;
+    cl_uint search_path_type;
+} cl_motion_estimation_desc_intel;
+
+/* Error Codes */
+#define CL_INVALID_ACCELERATOR_INTEL            -1094
+#define CL_INVALID_ACCELERATOR_TYPE_INTEL       -1095
+#define CL_INVALID_ACCELERATOR_DESCRIPTOR_INTEL -1096
+#define CL_ACCELERATOR_TYPE_NOT_SUPPORTED_INTEL -1097
+
+/* Deprecated Error Codes */
+#define CL_INVALID_ACCELERATOR_INTEL_DEPRECATED            -6000
+#define CL_INVALID_ACCELERATOR_TYPE_INTEL_DEPRECATED       -6001
+#define CL_INVALID_ACCELERATOR_DESCRIPTOR_INTEL_DEPRECATED -6002
+#define CL_ACCELERATOR_TYPE_NOT_SUPPORTED_INTEL_DEPRECATED -6003
+
+/* cl_accelerator_type_intel */
+#define CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL     0x0
+
+/* cl_accelerator_info_intel */
+#define CL_ACCELERATOR_DESCRIPTOR_INTEL                 0x4090
+#define CL_ACCELERATOR_REFERENCE_COUNT_INTEL            0x4091
+#define CL_ACCELERATOR_CONTEXT_INTEL                    0x4092
+#define CL_ACCELERATOR_TYPE_INTEL                       0x4093
+
+/*cl_motion_detect_desc_intel flags */
+#define CL_ME_MB_TYPE_16x16_INTEL                       0x0
+#define CL_ME_MB_TYPE_8x8_INTEL                         0x1
+#define CL_ME_MB_TYPE_4x4_INTEL                         0x2
+
+#define CL_ME_SUBPIXEL_MODE_INTEGER_INTEL               0x0
+#define CL_ME_SUBPIXEL_MODE_HPEL_INTEL                  0x1
+#define CL_ME_SUBPIXEL_MODE_QPEL_INTEL                  0x2
+
+#define CL_ME_SAD_ADJUST_MODE_NONE_INTEL                0x0
+#define CL_ME_SAD_ADJUST_MODE_HAAR_INTEL                0x1
+
+#define CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL              0x0
+#define CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL              0x1
+#define CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL            0x5
+
+extern CL_API_ENTRY cl_accelerator_intel CL_API_CALL
+clCreateAcceleratorINTEL(
+    cl_context                  /* context */,
+    cl_accelerator_type_intel   /* accelerator_type */,
+    size_t                      /* descriptor_size */,
+    const void*                 /* descriptor */,
+    cl_int*                     /* errcode_ret */ ) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_accelerator_intel
+    (CL_API_CALL *clCreateAcceleratorINTEL_fn)(
+    cl_context                  /* context */,
+    cl_accelerator_type_intel   /* accelerator_type */,
+    size_t                      /* descriptor_size */,
+    const void*                 /* descriptor */,
+    cl_int*                     /* errcode_ret */ ) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetAcceleratorInfoINTEL
+(
+    cl_accelerator_intel        /* accelerator */,
+    cl_accelerator_info_intel   /* param_name */,
+    size_t                      /* param_value_size */,
+    void*                       /* param_value */,
+    size_t*                     /* param_value_size_ret */ ) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int
+    (CL_API_CALL *clGetAcceleratorInfoINTEL_fn)(
+    cl_accelerator_intel        /* accelerator */,
+    cl_accelerator_info_intel   /* param_name */,
+    size_t                      /* param_value_size */,
+    void*                       /* param_value */,
+    size_t*                     /* param_value_size_ret */ ) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainAcceleratorINTEL(
+    cl_accelerator_intel        /* accelerator */ ) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int
+    (CL_API_CALL *clRetainAcceleratorINTEL_fn)(
+    cl_accelerator_intel        /* accelerator */ ) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseAcceleratorINTEL(
+    cl_accelerator_intel        /* accelerator */ ) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int
+    (CL_API_CALL *clReleaseAcceleratorINTEL_fn)(
+    cl_accelerator_intel        /* accelerator */ ) CL_EXT_SUFFIX__VERSION_1_2;
+
 #ifdef CL_VERSION_1_1
    /***********************************
     * cl_ext_device_fission extension *
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 40a9afb..c917e76 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -56,7 +56,8 @@  cl_internal_fill_buf_align8 cl_internal_fill_buf_align4
 cl_internal_fill_buf_align2 cl_internal_fill_buf_unalign
 cl_internal_fill_buf_align128 cl_internal_fill_image_1d
 cl_internal_fill_image_1d_array cl_internal_fill_image_2d
-cl_internal_fill_image_2d_array cl_internal_fill_image_3d)
+cl_internal_fill_image_2d_array cl_internal_fill_image_3d
+cl_internal_block_motion_estimate_intel)
 set (BUILT_IN_NAME  cl_internal_built_in_kernel)
 MakeBuiltInKernelStr ("${CMAKE_CURRENT_SOURCE_DIR}/kernels/" "${KERNEL_NAMES}")
 MakeKernelBinStr ("${CMAKE_CURRENT_SOURCE_DIR}/kernels/" "${KERNEL_NAMES}")
@@ -70,6 +71,7 @@  set(OPENCL_SRC
     cl_program.c
     cl_gbe_loader.cpp
     cl_sampler.c
+    cl_accelerator_intel.c
     cl_event.c
     cl_enqueue.c
     cl_image.c
diff --git a/src/cl_accelerator_intel.c b/src/cl_accelerator_intel.c
new file mode 100644
index 0000000..cda8963
--- /dev/null
+++ b/src/cl_accelerator_intel.c
@@ -0,0 +1,86 @@ 
+#include "cl_context.h"
+#include "cl_accelerator_intel.h"
+#include "cl_utils.h"
+#include "cl_alloc.h"
+#include "cl_khr_icd.h"
+#include "cl_kernel.h"
+
+#include <assert.h>
+
+LOCAL cl_accelerator_intel
+cl_accelerator_intel_new(cl_context ctx,
+                         cl_accelerator_type_intel accel_type,
+                         size_t desc_sz,
+                         const void* desc,
+                         cl_int* errcode_ret)
+{
+  cl_accelerator_intel accel = NULL;
+  cl_int err = CL_SUCCESS;
+
+  /* Allocate and inialize the structure itself */
+  TRY_ALLOC(accel, CALLOC(struct _cl_accelerator_intel));
+  SET_ICD(accel->dispatch)
+  accel->ref_n = 1;
+  accel->magic = CL_MAGIC_ACCELERATOR_INTEL_HEADER;
+
+  if (accel_type != CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL) {
+    err = CL_INVALID_ACCELERATOR_TYPE_INTEL;
+    goto error;
+  }
+  accel->type = accel_type;
+
+  if (desc == NULL) {   //  and check inside desc
+    err = CL_INVALID_ACCELERATOR_DESCRIPTOR_INTEL;
+    goto error;
+  }
+  accel->desc.me = *(cl_motion_estimation_desc_intel*)desc;
+
+  /* Append the accelerator_intel in the context accelerator_intel list */
+  /* does this really needed? */
+  pthread_mutex_lock(&ctx->accelerator_intel_lock);
+    accel->next = ctx->accels;
+    if (ctx->accels != NULL)
+      ctx->accels->prev = accel;
+    ctx->accels = accel;
+  pthread_mutex_unlock(&ctx->accelerator_intel_lock);
+
+  accel->ctx = ctx;
+  cl_context_add_ref(ctx);
+
+exit:
+  if (errcode_ret)
+    *errcode_ret = err;
+  return accel;
+error:
+  cl_accelerator_intel_delete(accel);
+  accel = NULL;
+  goto exit;
+}
+
+LOCAL void
+cl_accelerator_intel_add_ref(cl_accelerator_intel accel)
+{
+  atomic_inc(&accel->ref_n);
+}
+
+LOCAL void
+cl_accelerator_intel_delete(cl_accelerator_intel accel)
+{
+  if (UNLIKELY(accel == NULL))
+    return;
+  if (atomic_dec(&accel->ref_n) > 1)
+    return;
+
+  /* Remove the accelerator_intel in the context accelerator_intel list */
+  pthread_mutex_lock(&accel->ctx->accelerator_intel_lock);
+    if (accel->prev)
+      accel->prev->next = accel->next;
+    if (accel->next)
+      accel->next->prev = accel->prev;
+    if (accel->ctx->accels == accel)
+      accel->ctx->accels = accel->next;
+  pthread_mutex_unlock(&accel->ctx->accelerator_intel_lock);
+
+  cl_context_delete(accel->ctx);
+  cl_free(accel);
+}
diff --git a/src/cl_accelerator_intel.h b/src/cl_accelerator_intel.h
new file mode 100644
index 0000000..cecfd2a
--- /dev/null
+++ b/src/cl_accelerator_intel.h
@@ -0,0 +1,29 @@ 
+#ifndef __CL_ACCELERATOR_INTEL_H__
+#define __CL_ACCELERATOR_INTEL_H__
+
+#include "CL/cl.h"
+#include "CL/cl_ext.h"
+#include <stdint.h>
+
+struct _cl_accelerator_intel {
+  DEFINE_ICD(dispatch)
+  uint64_t magic;            /* To identify it as a accelerator_intel object */
+  volatile int ref_n;        /* This object is reference counted */
+  cl_accelerator_intel prev, next;     /* We chain in the allocator, why chain? */
+  cl_context ctx;            /* Context it belongs to */
+  cl_accelerator_type_intel type;
+  union {
+    cl_motion_estimation_desc_intel me;
+  }desc;                     /* save desc before we decide how to handle it */
+};
+
+cl_accelerator_intel cl_accelerator_intel_new(cl_context ctx,
+                         cl_accelerator_type_intel accel_type,
+                         size_t desc_sz,
+                         const void* desc,
+                         cl_int* errcode_ret);
+
+void cl_accelerator_intel_add_ref(cl_accelerator_intel accel);
+void cl_accelerator_intel_delete(cl_accelerator_intel accel);
+
+#endif
diff --git a/src/cl_api.c b/src/cl_api.c
index 69eb0bc..5d9d79d 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -28,6 +28,7 @@ 
 #include "cl_mem.h"
 #include "cl_image.h"
 #include "cl_sampler.h"
+#include "cl_accelerator_intel.h"
 #include "cl_alloc.h"
 #include "cl_utils.h"
 
@@ -2904,6 +2905,17 @@  clEnqueueNDRangeKernel(cl_command_queue  command_queue,
     goto error;
   }
 
+  if (kernel->vme) {
+    if (work_dim != 2) {
+      err = CL_INVALID_WORK_DIMENSION;
+      goto error;
+    }
+    if (local_work_size != NULL) {
+      err = CL_INVALID_WORK_GROUP_SIZE;
+      goto error;
+    }
+  }
+
   if (global_work_offset != NULL)
     for (i = 0; i < work_dim; ++i) {
       if (UNLIKELY(global_work_offset[i] + global_work_size[i] > (size_t)-1)) {
@@ -2937,22 +2949,31 @@  clEnqueueNDRangeKernel(cl_command_queue  command_queue,
     for (i = 0; i < work_dim; ++i)
       fixed_local_sz[i] = local_work_size[i];
   } else {
-    uint j, maxDimSize = 64 /* from 64? */, maxGroupSize = 256; //MAX_WORK_GROUP_SIZE may too large
-    for (i = 0; i< work_dim; i++) {
-      for (j = maxDimSize; j > 1; j--) {
-        if (global_work_size[i] % j == 0 && j <= maxGroupSize) {
-          fixed_local_sz[i] = j;
-          maxGroupSize = maxGroupSize /j;
-          maxDimSize = maxGroupSize > maxDimSize ? maxDimSize : maxGroupSize;
-          break;  //choose next work_dim
+    if (kernel->vme) {
+        fixed_local_sz[0] = 16;
+        fixed_local_sz[1] = 1;
+    } else {
+      uint j, maxDimSize = 64 /* from 64? */, maxGroupSize = 256; //MAX_WORK_GROUP_SIZE may too large
+      for (i = 0; i< work_dim; i++) {
+        for (j = maxDimSize; j > 1; j--) {
+          if (global_work_size[i] % j == 0 && j <= maxGroupSize) {
+            fixed_local_sz[i] = j;
+            maxGroupSize = maxGroupSize /j;
+            maxDimSize = maxGroupSize > maxDimSize ? maxDimSize : maxGroupSize;
+            break;  //choose next work_dim
+          }
         }
       }
     }
   }
 
-  if (global_work_size != NULL)
+  if (kernel->vme) {
+    fixed_global_sz[0] = (global_work_size[0]+15) / 16 * 16;
+    fixed_global_sz[1] = (global_work_size[1]+15) / 16;
+  } else {
     for (i = 0; i < work_dim; ++i)
       fixed_global_sz[i] = global_work_size[i];
+  }
   if (global_work_offset != NULL)
     for (i = 0; i < work_dim; ++i)
       fixed_global_off[i] = global_work_offset[i];
@@ -3181,6 +3202,10 @@  internal_clGetExtensionFunctionAddress(const char *func_name)
   EXTFUNC(clCreateBufferFromLibvaIntel)
   EXTFUNC(clCreateImageFromLibvaIntel)
   EXTFUNC(clGetMemObjectFdIntel)
+  EXTFUNC(clCreateAcceleratorINTEL)
+  EXTFUNC(clRetainAcceleratorINTEL)
+  EXTFUNC(clReleaseAcceleratorINTEL)
+  EXTFUNC(clGetAcceleratorInfoINTEL)
   return NULL;
 }
 
@@ -3349,3 +3374,66 @@  clGetMemObjectFdIntel(cl_context context,
 error:
   return err;
 }
+
+cl_accelerator_intel
+clCreateAcceleratorINTEL(cl_context context,
+                         cl_accelerator_type_intel accel_type,
+                         size_t desc_sz,
+                         const void* desc,
+                         cl_int* errcode_ret)
+{
+  cl_accelerator_intel accel = NULL;
+  cl_int err = CL_SUCCESS;
+  CHECK_CONTEXT(context);
+  accel = cl_accelerator_intel_new(context, accel_type, desc_sz, desc, &err);
+error:
+  if (errcode_ret)
+    *errcode_ret = err;
+  return accel;
+}
+
+cl_int
+clRetainAcceleratorINTEL(cl_accelerator_intel accel)
+{
+  cl_int err = CL_SUCCESS;
+  CHECK_ACCELERATOR_INTEL(accel);
+  cl_accelerator_intel_add_ref(accel);
+error:
+  return err;
+}
+
+cl_int
+clReleaseAcceleratorINTEL(cl_accelerator_intel accel)
+{
+  cl_int err = CL_SUCCESS;
+  CHECK_ACCELERATOR_INTEL(accel);
+  cl_accelerator_intel_delete(accel);
+error:
+  return err;
+}
+
+cl_int
+clGetAcceleratorInfoINTEL(cl_accelerator_intel           accel,
+                            cl_accelerator_info_intel    param_name,
+                            size_t                       param_value_size,
+                            void*                        param_value,
+                            size_t*                      param_value_size_ret)
+{
+  cl_int err = CL_SUCCESS;
+  CHECK_ACCELERATOR_INTEL(accel);
+
+  if (param_name == CL_ACCELERATOR_REFERENCE_COUNT_INTEL) {
+    FILL_GETINFO_RET (cl_uint, 1, (cl_uint*)&accel->ref_n, CL_SUCCESS);
+  } else if (param_name == CL_ACCELERATOR_CONTEXT_INTEL) {
+    FILL_GETINFO_RET (cl_context, 1, &accel->ctx, CL_SUCCESS);
+  } else if (param_name == CL_ACCELERATOR_TYPE_INTEL) {
+    FILL_GETINFO_RET (cl_uint, 1, &accel->type, CL_SUCCESS);
+  } else if (param_name == CL_ACCELERATOR_DESCRIPTOR_INTEL) {
+    FILL_GETINFO_RET (cl_motion_estimation_desc_intel, 1, &(accel->desc.me), CL_SUCCESS);
+  } else{
+    return CL_INVALID_VALUE;
+  }
+
+error:
+  return err;
+}
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index 4e4ebfb..787ee45 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -140,10 +140,16 @@  cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k)
 
     image = cl_mem_image(k->args[id].mem);
     set_image_info(k->curbe, &k->images[i], image);
-    cl_gpgpu_bind_image(gpgpu, k->images[i].idx, image->base.bo, image->offset,
-                        image->intel_fmt, image->image_type, image->bpp,
-                        image->w, image->h, image->depth,
-                        image->row_pitch, image->slice_pitch, (cl_gpgpu_tiling)image->tiling);
+    if(k->vme)
+      cl_gpgpu_bind_image_for_vme(gpgpu, k->images[i].idx, image->base.bo, image->offset,
+                          image->intel_fmt, image->image_type, image->bpp,
+                          image->w, image->h, image->depth,
+                          image->row_pitch, image->slice_pitch, (cl_gpgpu_tiling)image->tiling);
+    else
+      cl_gpgpu_bind_image(gpgpu, k->images[i].idx, image->base.bo, image->offset,
+                          image->intel_fmt, image->image_type, image->bpp,
+                          image->w, image->h, image->depth,
+                          image->row_pitch, image->slice_pitch, (cl_gpgpu_tiling)image->tiling);
     // TODO, this workaround is for GEN7/GEN75 only, we may need to do it in the driver layer
     // on demand.
     if (image->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 89f39b3..82a7b56 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -378,7 +378,10 @@  cl_command_queue_ND_range_gen7(cl_command_queue queue,
   /* Bind user images */
   cl_command_queue_bind_image(queue, ker);
   /* Bind all samplers */
-  cl_gpgpu_bind_sampler(gpgpu, ker->samplers, ker->sampler_sz);
+  if (ker->vme)
+    cl_gpgpu_bind_vme_state(gpgpu, ker->accel);
+  else
+    cl_gpgpu_bind_sampler(gpgpu, ker->samplers, ker->sampler_sz);
 
   if (cl_gpgpu_set_scratch(gpgpu, scratch_sz) != 0)
     goto error;
diff --git a/src/cl_context.c b/src/cl_context.c
index c45e0aa..a6bde7d 100644
--- a/src/cl_context.c
+++ b/src/cl_context.c
@@ -177,6 +177,7 @@  cl_context_new(struct _cl_context_prop *props)
   pthread_mutex_init(&ctx->queue_lock, NULL);
   pthread_mutex_init(&ctx->buffer_lock, NULL);
   pthread_mutex_init(&ctx->sampler_lock, NULL);
+  pthread_mutex_init(&ctx->accelerator_intel_lock, NULL);
 
 exit:
   return ctx;
diff --git a/src/cl_context.h b/src/cl_context.h
index ef94823..489e5d7 100644
--- a/src/cl_context.h
+++ b/src/cl_context.h
@@ -21,6 +21,7 @@ 
 #define __CL_CONTEXT_H__
 
 #include "CL/cl.h"
+#include "CL/cl_ext.h"
 #include "cl_internals.h"
 #include "cl_driver.h"
 #include "cl_khr_icd.h"
@@ -107,11 +108,13 @@  struct _cl_context {
   cl_program programs;              /* All programs currently allocated */
   cl_mem buffers;                   /* All memory object currently allocated */
   cl_sampler samplers;              /* All sampler object currently allocated */
+  cl_accelerator_intel accels;      /* All accelerator_intel object currently allocated */
   cl_event   events;                /* All event object currently allocated */
   pthread_mutex_t queue_lock;       /* To allocate and deallocate queues */
   pthread_mutex_t program_lock;     /* To allocate and deallocate programs */
   pthread_mutex_t buffer_lock;      /* To allocate and deallocate buffers */
   pthread_mutex_t sampler_lock;     /* To allocate and deallocate samplers */
+  pthread_mutex_t accelerator_intel_lock;     /* To allocate and deallocate accelerator_intel */
   pthread_mutex_t event_lock;       /* To allocate and deallocate events */
   cl_program internal_prgs[CL_INTERNAL_KERNEL_MAX];
                                     /* All programs internal used, for example clEnqueuexxx api use */
diff --git a/src/cl_driver.h b/src/cl_driver.h
index 1ab4dff..48049a5 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -23,9 +23,11 @@ 
 #include <stdint.h>
 #include <stdlib.h>
 #include "cl_driver_type.h"
+#include "CL/cl_ext.h"
 /* Various limitations we should remove actually */
 #define GEN_MAX_SURFACES 256
 #define GEN_MAX_SAMPLERS 16
+#define GEN_MAX_VME_STATES 8
 
 /**************************************************************************
  * cl_driver:
@@ -145,6 +147,9 @@  extern cl_gpgpu_bind_buf_cb *cl_gpgpu_bind_buf;
 typedef void (cl_gpgpu_bind_sampler_cb)(cl_gpgpu, uint32_t *samplers, size_t sampler_sz);
 extern cl_gpgpu_bind_sampler_cb *cl_gpgpu_bind_sampler;
 
+typedef void (cl_gpgpu_bind_vme_state_cb)(cl_gpgpu, cl_accelerator_intel accel);
+extern cl_gpgpu_bind_vme_state_cb *cl_gpgpu_bind_vme_state;
+
 /* get the default cache control value. */
 typedef uint32_t (cl_gpgpu_get_cache_ctrl_cb)();
 extern cl_gpgpu_get_cache_ctrl_cb *cl_gpgpu_get_cache_ctrl;
@@ -165,6 +170,22 @@  typedef void (cl_gpgpu_bind_image_cb)(cl_gpgpu state,
 
 extern cl_gpgpu_bind_image_cb *cl_gpgpu_bind_image;
 
+typedef void (cl_gpgpu_bind_image_for_vme_cb)(cl_gpgpu state,
+                                              uint32_t id,
+                                              cl_buffer obj_bo,
+                                              uint32_t obj_bo_offset,
+                                              uint32_t format,
+                                              uint32_t bpp,
+                                              uint32_t type,
+                                              int32_t w,
+                                              int32_t h,
+                                              int32_t depth,
+                                              int pitch,
+                                              int32_t slice_pitch,
+                                              cl_gpgpu_tiling tiling);
+
+extern cl_gpgpu_bind_image_for_vme_cb *cl_gpgpu_bind_image_for_vme;
+
 /* Setup a stack */
 typedef void (cl_gpgpu_set_stack_cb)(cl_gpgpu, uint32_t offset, uint32_t size, uint32_t cchint);
 extern cl_gpgpu_set_stack_cb *cl_gpgpu_set_stack;
diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c
index b77acdc..4472373 100644
--- a/src/cl_driver_defs.c
+++ b/src/cl_driver_defs.c
@@ -69,6 +69,7 @@  LOCAL cl_gpgpu_bind_buf_cb *cl_gpgpu_bind_buf = NULL;
 LOCAL cl_gpgpu_set_stack_cb *cl_gpgpu_set_stack = NULL;
 LOCAL cl_gpgpu_set_scratch_cb *cl_gpgpu_set_scratch = NULL;
 LOCAL cl_gpgpu_bind_image_cb *cl_gpgpu_bind_image = NULL;
+LOCAL cl_gpgpu_bind_image_cb *cl_gpgpu_bind_image_for_vme = NULL;
 LOCAL cl_gpgpu_get_cache_ctrl_cb *cl_gpgpu_get_cache_ctrl = NULL;
 LOCAL cl_gpgpu_state_init_cb *cl_gpgpu_state_init = NULL;
 LOCAL cl_gpgpu_alloc_constant_buffer_cb * cl_gpgpu_alloc_constant_buffer = NULL;
@@ -82,6 +83,7 @@  LOCAL cl_gpgpu_batch_end_cb *cl_gpgpu_batch_end = NULL;
 LOCAL cl_gpgpu_flush_cb *cl_gpgpu_flush = NULL;
 LOCAL cl_gpgpu_walker_cb *cl_gpgpu_walker = NULL;
 LOCAL cl_gpgpu_bind_sampler_cb *cl_gpgpu_bind_sampler = NULL;
+LOCAL cl_gpgpu_bind_vme_state_cb *cl_gpgpu_bind_vme_state = NULL;
 LOCAL cl_gpgpu_event_new_cb *cl_gpgpu_event_new = NULL;
 LOCAL cl_gpgpu_event_update_status_cb *cl_gpgpu_event_update_status = NULL;
 LOCAL cl_gpgpu_event_flush_cb *cl_gpgpu_event_flush = NULL;
diff --git a/src/cl_extensions.c b/src/cl_extensions.c
index 3eb303f..8ca5f11 100644
--- a/src/cl_extensions.c
+++ b/src/cl_extensions.c
@@ -40,7 +40,7 @@  void check_opt1_extension(cl_extensions_t *extensions)
   int id;
   for(id = OPT1_EXT_START_ID; id <= OPT1_EXT_END_ID; id++)
   {
-    if (id == EXT_ID(khr_icd))
+    if (id == EXT_ID(khr_icd) || id == EXT_ID(intel_accelerator) || id == EXT_ID(intel_motion_estimation))
       extensions->extensions[id].base.ext_enabled = 1;
 #if  LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
     if (id == EXT_ID(khr_spir))
@@ -63,7 +63,9 @@  check_gl_extension(cl_extensions_t *extensions) {
 void
 check_intel_extension(cl_extensions_t *extensions)
 {
-  /* Should put those map/unmap extensions here. */
+  int id;
+  for(id = INTEL_EXT_START_ID; id <= INTEL_EXT_END_ID; id++)
+    extensions->extensions[id].base.ext_enabled = 1;
 }
 
 void
diff --git a/src/cl_extensions.h b/src/cl_extensions.h
index b4544e2..27041c3 100644
--- a/src/cl_extensions.h
+++ b/src/cl_extensions.h
@@ -23,6 +23,10 @@ 
   DECL_EXT(khr_spir) \
   DECL_EXT(khr_icd)
 
+#define DECL_INTEL_EXTENSIONS \
+  DECL_EXT(intel_accelerator) \
+  DECL_EXT(intel_motion_estimation)
+
 #define DECL_GL_EXTENSIONS \
   DECL_EXT(khr_gl_sharing)\
   DECL_EXT(khr_gl_event)\
@@ -37,6 +41,7 @@ 
 #define DECL_ALL_EXTENSIONS \
   DECL_BASE_EXTENSIONS \
   DECL_OPT1_EXTENSIONS \
+  DECL_INTEL_EXTENSIONS \
   DECL_GL_EXTENSIONS \
   DECL_D3D_EXTENSIONS
 
@@ -54,6 +59,8 @@  cl_khr_extension_id_max
 #define BASE_EXT_END_ID EXT_ID(khr_fp64)
 #define OPT1_EXT_START_ID EXT_ID(khr_int64_base_atomics)
 #define OPT1_EXT_END_ID EXT_ID(khr_icd)
+#define INTEL_EXT_START_ID EXT_ID(intel_accelerator)
+#define INTEL_EXT_END_ID EXT_ID(intel_motion_estimation)
 #define GL_EXT_START_ID EXT_ID(khr_gl_sharing)
 #define GL_EXT_END_ID EXT_ID(khr_gl_msaa_sharing)
 
@@ -75,6 +82,7 @@  struct EXT_STRUCT_NAME(name) { \
 
 DECL_BASE_EXTENSIONS
 DECL_OPT1_EXTENSIONS
+DECL_INTEL_EXTENSIONS
 DECL_D3D_EXTENSIONS
 DECL_GL_EXTENSIONS
 #undef DECL_EXT
diff --git a/src/cl_gen7_device.h b/src/cl_gen7_device.h
index 4ad5d96..648e84e 100644
--- a/src/cl_gen7_device.h
+++ b/src/cl_gen7_device.h
@@ -25,5 +25,8 @@ 
 .local_mem_size = 64 << 10,
 .scratch_mem_size = 12 << 10,
 
+//temporarily define to only export builtin kernel block_motion_estimate_intel only for Gen7
+//will remove after HSW+ also support
+#define GEN7_DEVICE
 #include "cl_gt_device.h"
-
+#undef GEN7_DEVICE
diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h
index 4b43c20..2414470 100644
--- a/src/cl_gt_device.h
+++ b/src/cl_gt_device.h
@@ -115,7 +115,11 @@  DECL_INFO_STRING(built_in_kernels, "__cl_copy_region_align4;"
                                    "__cl_fill_image_1d_array;"
                                    "__cl_fill_image_2d;"
                                    "__cl_fill_image_2d_array;"
-                                   "__cl_fill_image_3d;")
+                                   "__cl_fill_image_3d;"
+#ifdef GEN7_DEVICE
+                                   "block_motion_estimate_intel;"
+#endif
+                                   )
 
 DECL_INFO_STRING(driver_version, LIBCL_DRIVER_VERSION_STRING)
 #undef DECL_INFO_STRING
diff --git a/src/cl_internals.h b/src/cl_internals.h
index cb3fc23..9aeb8c1 100644
--- a/src/cl_internals.h
+++ b/src/cl_internals.h
@@ -31,6 +31,7 @@ 
 #define CL_MAGIC_EVENT_HEADER     0x8324a9c810ebf90fLL
 #define CL_MAGIC_MEM_HEADER       0x381a27b9ce6504dfLL
 #define CL_MAGIC_DEAD_HEADER      0xdeaddeaddeaddeadLL
+#define CL_MAGIC_ACCELERATOR_INTEL_HEADER   0x7c6a08c9a7ac3e3fLL
 
 #endif /* __CL_INTERNALS_H__ */
 
diff --git a/src/cl_kernel.c b/src/cl_kernel.c
index 28d88b6..ba77cea 100644
--- a/src/cl_kernel.c
+++ b/src/cl_kernel.c
@@ -27,6 +27,7 @@ 
 #include "cl_khr_icd.h"
 #include "CL/cl.h"
 #include "cl_sampler.h"
+#include "cl_accelerator_intel.h"
 
 #include <stdio.h>
 #include <string.h>
@@ -113,10 +114,22 @@  cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
   arg_type = interp_kernel_get_arg_type(k->opaque, index);
   arg_sz = interp_kernel_get_arg_size(k->opaque, index);
 
-  if (UNLIKELY(arg_type != GBE_ARG_LOCAL_PTR && arg_sz != sz)) {
-    if (arg_type != GBE_ARG_SAMPLER ||
-        (arg_type == GBE_ARG_SAMPLER && sz != sizeof(cl_sampler)))
+  if (k->vme && index == 0) {
+    //the best method is to return the arg type of GBE_ARG_ACCELERATOR_INTEL
+    //but it is not straightforward since clang does not support it now
+    //the easy way is to consider typedef accelerator_intel_t as a struct,
+    //this easy way makes the size mismatched, so use another size check method.
+    if (sz != sizeof(cl_accelerator_intel) || arg_sz != sizeof(cl_motion_estimation_desc_intel))
       return CL_INVALID_ARG_SIZE;
+    cl_accelerator_intel* accel = (cl_accelerator_intel*)value;
+    if ((*accel)->type != CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL)
+      return CL_INVALID_ACCELERATOR_TYPE_INTEL;
+  } else {
+    if (UNLIKELY(arg_type != GBE_ARG_LOCAL_PTR && arg_sz != sz)) {
+      if (arg_type != GBE_ARG_SAMPLER ||
+          (arg_type == GBE_ARG_SAMPLER && sz != sizeof(cl_sampler)))
+        return CL_INVALID_ARG_SIZE;
+    }
   }
 
   if(UNLIKELY(arg_type == GBE_ARG_LOCAL_PTR && sz == 0))
@@ -150,15 +163,30 @@  cl_kernel_set_arg(cl_kernel k, cl_uint index, size_t sz, const void *value)
     }
   }
 
+  memset(&k->args[index], 0, sizeof(k->args[index]));
+
   /* Copy the structure or the value directly into the curbe */
   if (arg_type == GBE_ARG_VALUE) {
-    offset = interp_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
-    assert(offset + sz <= k->curbe_sz);
-    memcpy(k->curbe + offset, value, sz);
-    k->args[index].local_sz = 0;
-    k->args[index].is_set = 1;
-    k->args[index].mem = NULL;
-    return CL_SUCCESS;
+    if (k->vme && index == 0) {
+      cl_accelerator_intel accel;
+      memcpy(&accel, value, sz);
+      offset = interp_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
+      assert(offset + sz <= k->curbe_sz);
+      memcpy(k->curbe + offset, &(accel->desc.me), arg_sz);
+      k->args[index].local_sz = 0;
+      k->args[index].is_set = 1;
+      k->args[index].mem = NULL;
+      k->accel = accel;
+      return CL_SUCCESS;
+    } else {
+      offset = interp_kernel_get_curbe_offset(k->opaque, GBE_CURBE_KERNEL_ARGUMENT, index);
+      assert(offset + sz <= k->curbe_sz);
+      memcpy(k->curbe + offset, value, sz);
+      k->args[index].local_sz = 0;
+      k->args[index].is_set = 1;
+      k->args[index].mem = NULL;
+      return CL_SUCCESS;
+    }
   }
 
   /* For a local pointer just save the size */
@@ -327,6 +355,12 @@  cl_kernel_setup(cl_kernel k, gbe_kernel opaque)
   cl_buffer_subdata(k->bo, 0, code_sz, code);
   k->opaque = opaque;
 
+  const char* kname = cl_kernel_get_name(k);
+  if (strncmp(kname, "block_motion_estimate_intel", sizeof("block_motion_estimate_intel")) == 0)
+    k->vme = 1;
+  else
+    k->vme = 0;
+
   /* Create the curbe */
   k->curbe_sz = interp_kernel_get_curbe_size(k->opaque);
 
@@ -363,6 +397,7 @@  cl_kernel_dup(cl_kernel from)
   SET_ICD(to->dispatch)
   to->bo = from->bo;
   to->opaque = from->opaque;
+  to->vme = from->vme;
   to->ref_n = 1;
   to->magic = CL_MAGIC_KERNEL_HEADER;
   to->program = from->program;
diff --git a/src/cl_kernel.h b/src/cl_kernel.h
index 140bbb1..7f59162 100644
--- a/src/cl_kernel.h
+++ b/src/cl_kernel.h
@@ -24,6 +24,7 @@ 
 #include "cl_driver.h"
 #include "cl_gbe_loader.h"
 #include "CL/cl.h"
+#include "CL/cl_ext.h"
 
 #include <stdint.h>
 #include <stdlib.h>
@@ -37,6 +38,7 @@  struct _gbe_kernel;
 typedef struct cl_argument {
   cl_mem mem;           /* For image and regular buffers */
   cl_sampler sampler;   /* For sampler. */
+  cl_accelerator_intel accel;
   unsigned char bti;
   uint32_t local_sz:31; /* For __local size specification */
   uint32_t is_set:1;    /* All args must be set before NDRange */
@@ -50,6 +52,7 @@  struct _cl_kernel {
   cl_buffer bo;               /* The code itself */
   cl_program program;         /* Owns this structure (and pointers) */
   gbe_kernel opaque;          /* (Opaque) compiler structure for the OCL kernel */
+  cl_accelerator_intel accel;     /* accelerator */
   char *curbe;                /* One curbe per kernel */
   size_t curbe_sz;            /* Size of it */
   uint32_t samplers[GEN_MAX_SAMPLERS]; /* samplers defined in kernel & kernel args */
@@ -63,8 +66,9 @@  struct _cl_kernel {
                                 (i.e. global_work_size argument to clEnqueueNDRangeKernel.)*/
   size_t stack_size;          /* stack size per work item. */
   cl_argument *args;          /* To track argument setting */
-  uint32_t arg_n:31;          /* Number of arguments */
+  uint32_t arg_n:30;          /* Number of arguments */
   uint32_t ref_its_program:1; /* True only for the user kernel (created by clCreateKernel) */
+  uint32_t vme:1;             /* True only if it is a built-in kernel for VME */
 };
 
 /* Allocate an empty kernel */
diff --git a/src/cl_utils.h b/src/cl_utils.h
index 28fdef6..ee9d614 100644
--- a/src/cl_utils.h
+++ b/src/cl_utils.h
@@ -202,6 +202,18 @@  do {                                                        \
   }                                                         \
 } while (0)
 
+#define CHECK_ACCELERATOR_INTEL(ACCELERATOR_INTEL)                              \
+do {                                                                            \
+  if (UNLIKELY(ACCELERATOR_INTEL == NULL)) {                                    \
+    err = CL_INVALID_ACCELERATOR_INTEL;                                         \
+    goto error;                                                                 \
+  }                                                                             \
+  if (UNLIKELY(ACCELERATOR_INTEL->magic != CL_MAGIC_ACCELERATOR_INTEL_HEADER)) {\
+    err = CL_INVALID_ACCELERATOR_INTEL;                                         \
+    goto error;                                                                 \
+  }                                                                             \
+} while (0)
+
 #define CHECK_KERNEL(KERNEL)                                \
 do {                                                        \
   if (UNLIKELY(KERNEL == NULL)) {                           \
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index 901bd98..cd68b07 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -38,6 +38,7 @@ 
 #include "cl_alloc.h"
 #include "cl_utils.h"
 #include "cl_sampler.h"
+#include "cl_accelerator_intel.h"
 
 #ifndef CL_VERSION_1_2
 #define CL_MEM_OBJECT_IMAGE1D                       0x10F4
@@ -941,10 +942,12 @@  intel_gpgpu_state_init(intel_gpgpu_t *gpgpu,
   gpgpu->aux_offset.idrt_offset = size_aux;
   size_aux += MAX_IF_DESC * sizeof(struct gen6_interface_descriptor);
 
-  //sampler state must be 32 bytes aligned
+  //must be 32 bytes aligned
+  //sampler state and vme state share the same buffer,
   size_aux = ALIGN(size_aux, 32);
   gpgpu->aux_offset.sampler_state_offset = size_aux;
-  size_aux += GEN_MAX_SAMPLERS * sizeof(gen6_sampler_state_t);
+  size_aux += MAX(GEN_MAX_SAMPLERS * sizeof(gen6_sampler_state_t),
+                  GEN_MAX_VME_STATES * sizeof(gen7_vme_state_t));
 
   //sampler border color state must be 32 bytes aligned
   size_aux = ALIGN(size_aux, 32);
@@ -985,6 +988,22 @@  intel_gpgpu_set_buf_reloc_gen7(intel_gpgpu_t *gpgpu, int32_t index, dri_bo* obj_
                     obj_bo);
 }
 
+static void
+intel_gpgpu_set_buf_reloc_for_vme_gen7(intel_gpgpu_t *gpgpu, int32_t index, dri_bo* obj_bo, uint32_t obj_bo_offset)
+{
+  surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
+  heap->binding_table[index] = offsetof(surface_heap_t, surface) +
+                               index * sizeof(gen7_surface_state_t);
+  dri_bo_emit_reloc(gpgpu->aux_buf.bo,
+                    I915_GEM_DOMAIN_RENDER,
+                    I915_GEM_DOMAIN_RENDER,
+                    obj_bo_offset,
+                    gpgpu->aux_offset.surface_heap_offset +
+                    heap->binding_table[index] +
+                    offsetof(gen7_media_surface_state_t, ss0),
+                    obj_bo);
+}
+
 static dri_bo*
 intel_gpgpu_alloc_constant_buffer(intel_gpgpu_t *gpgpu, uint32_t size, uint8_t bti)
 {
@@ -1201,6 +1220,55 @@  intel_gpgpu_bind_image_gen7(intel_gpgpu_t *gpgpu,
 }
 
 static void
+intel_gpgpu_bind_image_for_vme_gen7(intel_gpgpu_t *gpgpu,
+                                    uint32_t index,
+                                    dri_bo* obj_bo,
+                                    uint32_t obj_bo_offset,
+                                    uint32_t format,
+                                    cl_mem_object_type type,
+                                    uint32_t bpp,
+                                    int32_t w,
+                                    int32_t h,
+                                    int32_t depth,
+                                    int32_t pitch,
+                                    int32_t slice_pitch,
+                                    int32_t tiling)
+{
+  surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
+  gen7_media_surface_state_t *ss = (gen7_media_surface_state_t *) &heap->surface[index * sizeof(gen7_surface_state_t)];
+
+  memset(ss, 0, sizeof(*ss));
+  ss->ss0.base_addr = obj_bo->offset + obj_bo_offset;
+  ss->ss1.uv_offset_v_direction = 0;
+  ss->ss1.pic_struct = 0;
+  ss->ss1.width = w - 1;
+  ss->ss1.height = h - 1;
+  if (tiling == GPGPU_NO_TILE) {
+    ss->ss2.tile_mode = 0;
+  }
+  else if (tiling == GPGPU_TILE_X){
+    ss->ss2.tile_mode = 2;
+  }
+  else if (tiling == GPGPU_TILE_Y){
+    ss->ss2.tile_mode = 3;
+  }
+  ss->ss2.half_pitch_for_chroma = 0;
+  ss->ss2.surface_pitch = pitch - 1;
+  ss->ss2.surface_object_control_state = cl_gpgpu_get_cache_ctrl();
+  ss->ss2.interleave_chroma = 0;
+  ss->ss2.surface_format = 12; //Y8_UNORM
+  ss->ss3.y_offset_for_u = 0;
+  ss->ss3.x_offset_for_u = 0;
+  ss->ss4.y_offset_for_u = 0;
+  ss->ss4.x_offset_for_v = 0;
+
+  intel_gpgpu_set_buf_reloc_for_vme_gen7(gpgpu, index, obj_bo, obj_bo_offset);
+
+  assert(index < GEN_MAX_SURFACES);
+}
+
+
+static void
 intel_gpgpu_bind_image_gen75(intel_gpgpu_t *gpgpu,
                               uint32_t index,
                               dri_bo* obj_bo,
@@ -1636,6 +1704,151 @@  int translate_wrap_mode(uint32_t cl_address_mode, int using_nearest)
    }
 }
 
+static void intel_gpgpu_insert_vme_state_gen7(intel_gpgpu_t *gpgpu, cl_accelerator_intel accel, uint32_t index)
+{
+    gen7_vme_state_t* vme = (gen7_vme_state_t*)(gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.sampler_state_offset)  + index;
+    memset(vme, 0, sizeof(*vme));
+    gen7_vme_search_path_state_t* sp = vme->sp;
+
+    if(accel->desc.me.search_path_type == CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL){
+      sp[0].dw0.SPD_0_X = 0;
+      sp[0].dw0.SPD_0_Y = 0;
+      sp[0].dw0.SPD_1_X = 0;
+      sp[0].dw0.SPD_1_Y = 0;
+      sp[0].dw0.SPD_2_X = 0;
+      sp[0].dw0.SPD_2_Y = 0;
+      sp[0].dw0.SPD_3_X = 0;
+      sp[0].dw0.SPD_3_Y = 0;
+    }
+    else if(accel->desc.me.search_path_type == CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL){
+      sp[0].dw0.SPD_0_X = 1;
+      sp[0].dw0.SPD_0_Y = 0;
+      sp[0].dw0.SPD_1_X = 0;
+      sp[0].dw0.SPD_1_Y = 1;
+      sp[0].dw0.SPD_2_X = -1;
+      sp[0].dw0.SPD_2_Y = 0;
+      sp[0].dw0.SPD_3_X = 0;
+      sp[0].dw0.SPD_3_Y = 0;
+    }
+    else if(accel->desc.me.search_path_type == CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL){
+      sp[0].dw0.SPD_0_X = 1;
+      sp[0].dw0.SPD_0_Y = 0;
+      sp[0].dw0.SPD_1_X = 1;
+      sp[0].dw0.SPD_1_Y = 0;
+      sp[0].dw0.SPD_2_X = 1;
+      sp[0].dw0.SPD_2_Y = 0;
+      sp[0].dw0.SPD_3_X = 1;
+      sp[0].dw0.SPD_3_Y = 0;
+
+      sp[1].dw0.SPD_0_X = 1;
+      sp[1].dw0.SPD_0_Y = 0;
+      sp[1].dw0.SPD_1_X = 1;
+      sp[1].dw0.SPD_1_Y = 0;
+      sp[1].dw0.SPD_2_X = 1;
+      sp[1].dw0.SPD_2_Y = 0;
+      sp[1].dw0.SPD_3_X = 0;
+      sp[1].dw0.SPD_3_Y = 1;
+
+      sp[2].dw0.SPD_0_X = -1;
+      sp[2].dw0.SPD_0_Y = 0;
+      sp[2].dw0.SPD_1_X = -1;
+      sp[2].dw0.SPD_1_Y = 0;
+      sp[2].dw0.SPD_2_X = -1;
+      sp[2].dw0.SPD_2_Y = 0;
+      sp[2].dw0.SPD_3_X = -1;
+      sp[2].dw0.SPD_3_Y = 0;
+
+      sp[3].dw0.SPD_0_X = -1;
+      sp[3].dw0.SPD_0_Y = 0;
+      sp[3].dw0.SPD_1_X = -1;
+      sp[3].dw0.SPD_1_Y = 0;
+      sp[3].dw0.SPD_2_X = -1;
+      sp[3].dw0.SPD_2_Y = 0;
+      sp[3].dw0.SPD_3_X = 0;
+      sp[3].dw0.SPD_3_Y = 1;
+
+      sp[4].dw0.SPD_0_X = 1;
+      sp[4].dw0.SPD_0_Y = 0;
+      sp[4].dw0.SPD_1_X = 1;
+      sp[4].dw0.SPD_1_Y = 0;
+      sp[4].dw0.SPD_2_X = 1;
+      sp[4].dw0.SPD_2_Y = 0;
+      sp[4].dw0.SPD_3_X = 1;
+      sp[4].dw0.SPD_3_Y = 0;
+
+      sp[5].dw0.SPD_0_X = 1;
+      sp[5].dw0.SPD_0_Y = 0;
+      sp[5].dw0.SPD_1_X = 1;
+      sp[5].dw0.SPD_1_Y = 0;
+      sp[5].dw0.SPD_2_X = 1;
+      sp[5].dw0.SPD_2_Y = 0;
+      sp[5].dw0.SPD_3_X = 0;
+      sp[5].dw0.SPD_3_Y = 1;
+
+      sp[6].dw0.SPD_0_X = -1;
+      sp[6].dw0.SPD_0_Y = 0;
+      sp[6].dw0.SPD_1_X = -1;
+      sp[6].dw0.SPD_1_Y = 0;
+      sp[6].dw0.SPD_2_X = -1;
+      sp[6].dw0.SPD_2_Y = 0;
+      sp[6].dw0.SPD_3_X = -1;
+      sp[6].dw0.SPD_3_Y = 0;
+
+      sp[7].dw0.SPD_0_X = -1;
+      sp[7].dw0.SPD_0_Y = 0;
+      sp[7].dw0.SPD_1_X = -1;
+      sp[7].dw0.SPD_1_Y = 0;
+      sp[7].dw0.SPD_2_X = -1;
+      sp[7].dw0.SPD_2_Y = 0;
+      sp[7].dw0.SPD_3_X = 0;
+      sp[7].dw0.SPD_3_Y = 1;
+
+      sp[8].dw0.SPD_0_X = 1;
+      sp[8].dw0.SPD_0_Y = 0;
+      sp[8].dw0.SPD_1_X = 1;
+      sp[8].dw0.SPD_1_Y = 0;
+      sp[8].dw0.SPD_2_X = 1;
+      sp[8].dw0.SPD_2_Y = 0;
+      sp[8].dw0.SPD_3_X = 1;
+      sp[8].dw0.SPD_3_Y = 0;
+
+      sp[9].dw0.SPD_0_X = 1;
+      sp[9].dw0.SPD_0_Y = 0;
+      sp[9].dw0.SPD_1_X = 1;
+      sp[9].dw0.SPD_1_Y = 0;
+      sp[9].dw0.SPD_2_X = 1;
+      sp[9].dw0.SPD_2_Y = 0;
+      sp[9].dw0.SPD_3_X = 0;
+      sp[9].dw0.SPD_3_Y = 1;
+
+      sp[10].dw0.SPD_0_X = -1;
+      sp[10].dw0.SPD_0_Y = 0;
+      sp[10].dw0.SPD_1_X = -1;
+      sp[10].dw0.SPD_1_Y = 0;
+      sp[10].dw0.SPD_2_X = -1;
+      sp[10].dw0.SPD_2_Y = 0;
+      sp[10].dw0.SPD_3_X = -1;
+      sp[10].dw0.SPD_3_Y = 0;
+
+      sp[11].dw0.SPD_0_X = -1;
+      sp[11].dw0.SPD_0_Y = 0;
+      sp[11].dw0.SPD_1_X = -1;
+      sp[11].dw0.SPD_1_Y = 0;
+      sp[11].dw0.SPD_2_X = -1;
+      sp[11].dw0.SPD_2_Y = 0;
+      sp[11].dw0.SPD_3_X = 0;
+      sp[11].dw0.SPD_3_Y = 0;
+    }
+}
+
+static void
+intel_gpgpu_bind_vme_state_gen7(intel_gpgpu_t *gpgpu, cl_accelerator_intel accel)
+{
+  int index;
+  for(index = 0; index < GEN_MAX_VME_STATES; index++)
+    intel_gpgpu_insert_vme_state_gen7(gpgpu, accel, index);
+}
+
 static void
 intel_gpgpu_insert_sampler_gen7(intel_gpgpu_t *gpgpu, uint32_t index, uint32_t clk_sampler)
 {
@@ -2141,6 +2354,7 @@  intel_set_gpgpu_callbacks(int device_id)
   cl_gpgpu_batch_end = (cl_gpgpu_batch_end_cb *) intel_gpgpu_batch_end;
   cl_gpgpu_flush = (cl_gpgpu_flush_cb *) intel_gpgpu_flush;
   cl_gpgpu_bind_sampler = (cl_gpgpu_bind_sampler_cb *) intel_gpgpu_bind_sampler_gen7;
+  cl_gpgpu_bind_vme_state = (cl_gpgpu_bind_vme_state_cb *) intel_gpgpu_bind_vme_state_gen7;
   cl_gpgpu_set_scratch = (cl_gpgpu_set_scratch_cb *) intel_gpgpu_set_scratch;
   cl_gpgpu_event_new = (cl_gpgpu_event_new_cb *)intel_gpgpu_event_new;
   cl_gpgpu_event_flush = (cl_gpgpu_event_flush_cb *)intel_gpgpu_event_flush;
@@ -2218,6 +2432,7 @@  intel_set_gpgpu_callbacks(int device_id)
   }
   else if (IS_IVYBRIDGE(device_id)) {
     cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen7;
+    cl_gpgpu_bind_image_for_vme = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_for_vme_gen7;
     if (IS_BAYTRAIL_T(device_id)) {
       intel_gpgpu_set_L3 = intel_gpgpu_set_L3_baytrail;
       intel_gpgpu_read_ts_reg = intel_gpgpu_read_ts_reg_baytrail;
diff --git a/src/intel/intel_structs.h b/src/intel/intel_structs.h
index fd6a82b..26ab735 100644
--- a/src/intel/intel_structs.h
+++ b/src/intel/intel_structs.h
@@ -381,6 +381,57 @@  typedef struct gen8_surface_state
   } ss15;
 } gen8_surface_state_t;
 
+typedef struct gen7_media_surface_state
+{
+  struct {
+    uint32_t base_addr;
+  } ss0;
+
+  struct {
+    uint32_t uv_offset_v_direction:2;
+    uint32_t pic_struct:2;
+    uint32_t width:14;
+    uint32_t height:14;
+  } ss1;
+
+  struct {
+    uint32_t tile_mode:2;
+    uint32_t half_pitch_for_chroma:1;
+    uint32_t surface_pitch:18;
+    uint32_t pad1:1;
+    uint32_t surface_object_control_state:4;
+    uint32_t pad0:1;
+    uint32_t interleave_chroma:1;
+    uint32_t surface_format:4;
+  } ss2;
+
+  struct {
+    uint32_t y_offset_for_u:14;
+    uint32_t pad1:2;
+    uint32_t x_offset_for_u:14;
+    uint32_t pad0:2;
+  } ss3;
+
+  struct {
+    uint32_t y_offset_for_u:15; // error in bspec?
+    uint32_t pad1:1;  // error in bpec?
+    uint32_t x_offset_for_v:14;
+    uint32_t pad0:2;
+  } ss4;
+
+  struct {
+    uint32_t pad0;  //error in bspec?
+  } ss5;
+
+  struct {
+    uint32_t pad0;
+  } ss6;
+
+  struct {
+    uint32_t pad0;
+  } ss7;
+} gen7_media_surface_state_t;
+
 typedef union gen_surface_state
 {
   gen7_surface_state_t gen7_surface_state;
@@ -555,6 +606,75 @@  typedef struct gen8_pipe_control
   } dw5;
 } gen8_pipe_control_t;
 
+#define GEN7_NUM_VME_SEARCH_PATH_STATES 14
+#define GEN7_NUM_VME_RD_LUT_SETS 4
+
+typedef struct gen7_vme_search_path_state
+{
+    struct {
+        uint32_t SPD_0_X : BITFIELD_RANGE(0, 3);        //search path distance
+        uint32_t SPD_0_Y : BITFIELD_RANGE(4, 7);
+        uint32_t SPD_1_X : BITFIELD_RANGE(8, 11);
+        uint32_t SPD_1_Y : BITFIELD_RANGE(12, 15);
+        uint32_t SPD_2_X : BITFIELD_RANGE(16, 19);
+        uint32_t SPD_2_Y : BITFIELD_RANGE(20, 23);
+        uint32_t SPD_3_X : BITFIELD_RANGE(24, 27);
+        uint32_t SPD_3_Y : BITFIELD_RANGE(28, 31);
+    }dw0;
+}gen7_vme_search_path_state_t;
+
+typedef struct gen7_vme_rd_lut_set
+{
+    struct {
+        uint32_t LUT_MbMode_0 : BITFIELD_RANGE(0, 7);
+        uint32_t LUT_MbMode_1 : BITFIELD_RANGE(8, 15);
+        uint32_t LUT_MbMode_2 : BITFIELD_RANGE(16, 23);
+        uint32_t LUT_MbMode_3 : BITFIELD_RANGE(24, 31);
+    }dw0;
+
+    struct {
+        uint32_t LUT_MbMode_4 : BITFIELD_RANGE(0, 7);
+        uint32_t LUT_MbMode_5 : BITFIELD_RANGE(8, 15);
+        uint32_t LUT_MbMode_6 : BITFIELD_RANGE(16, 23);
+        uint32_t LUT_MbMode_7 : BITFIELD_RANGE(24, 31);
+    }dw1;
+
+    struct {
+        uint32_t LUT_MV_0 : BITFIELD_RANGE(0, 7);
+        uint32_t LUT_MV_1 : BITFIELD_RANGE(8, 15);
+        uint32_t LUT_MV_2 : BITFIELD_RANGE(16, 23);
+        uint32_t LUT_MV_3 : BITFIELD_RANGE(24, 31);
+    }dw2;
+
+    struct {
+        uint32_t LUT_MV_4 : BITFIELD_RANGE(0, 7);
+        uint32_t LUT_MV_5 : BITFIELD_RANGE(8, 15);
+        uint32_t LUT_MV_6 : BITFIELD_RANGE(16, 23);
+        uint32_t LUT_MV_7 : BITFIELD_RANGE(24, 31);
+    }dw3;
+}gen7_vme_rd_lut_set_t;
+
+typedef struct gen7_vme_state
+{
+    gen7_vme_search_path_state_t sp[GEN7_NUM_VME_SEARCH_PATH_STATES];
+
+    struct {
+        uint32_t LUT_MbMode_8_0 : BITFIELD_RANGE(0, 7);
+        uint32_t LUT_MbMode_9_0 : BITFIELD_RANGE(8, 15);
+        uint32_t LUT_MbMode_8_1 : BITFIELD_RANGE(16, 23);
+        uint32_t LUT_MbMode_9_1 : BITFIELD_RANGE(24, 31);
+    }dw14;
+
+    struct {
+        uint32_t LUT_MbMode_8_2 : BITFIELD_RANGE(0, 7);
+        uint32_t LUT_MbMode_9_2 : BITFIELD_RANGE(8, 15);
+        uint32_t LUT_MbMode_8_3 : BITFIELD_RANGE(16, 23);
+        uint32_t LUT_MbMode_9_3 : BITFIELD_RANGE(24, 31);
+    }dw15;
+
+    gen7_vme_rd_lut_set_t lut[GEN7_NUM_VME_RD_LUT_SETS];
+}gen7_vme_state_t;
+
 typedef struct gen6_sampler_state
 {
   struct {
diff --git a/src/kernels/cl_internal_block_motion_estimate_intel.cl b/src/kernels/cl_internal_block_motion_estimate_intel.cl
new file mode 100644
index 0000000..beef55a
--- /dev/null
+++ b/src/kernels/cl_internal_block_motion_estimate_intel.cl
@@ -0,0 +1,231 @@ 
+typedef struct _motion_estimation_desc_intel {
+  uint mb_block_type;
+  uint subpixel_mode;
+  uint sad_adjust_mode;
+  uint search_path_type;
+} accelerator_intel_t;
+
+__kernel __attribute__((reqd_work_group_size(16,1,1)))
+void block_motion_estimate_intel(accelerator_intel_t accel,
+                                 __read_only  image2d_t src_image,
+                                 __read_only  image2d_t ref_image,
+                                 __global short2 * prediction_motion_vector_buffer,
+                                 __global short2 * motion_vector_buffer,
+                                 __global ushort * residuals){
+
+  uint src_grf0_dw7;
+  uint src_grf0_dw6;
+  uint src_grf0_dw5;
+  uint src_grf0_dw4;
+  uint src_grf0_dw3;
+  uint src_grf0_dw2;
+  uint src_grf0_dw1;
+  uint src_grf0_dw0;
+  uint src_grf1_dw7;
+  uint src_grf1_dw6;
+  uint src_grf1_dw5;
+  uint src_grf1_dw4;
+  uint src_grf1_dw3;
+  uint src_grf1_dw2;
+  uint src_grf1_dw1;
+  uint src_grf1_dw0;
+  uint src_grf2_dw7;
+  uint src_grf2_dw6;
+  uint src_grf2_dw5;
+  uint src_grf2_dw4;
+  uint src_grf2_dw3;
+  uint src_grf2_dw2;
+  uint src_grf2_dw1;
+  uint src_grf2_dw0;
+  uint src_grf3_dw7;
+  uint src_grf3_dw6;
+  uint src_grf3_dw5;
+  uint src_grf3_dw4;
+  uint src_grf3_dw3;
+  uint src_grf3_dw2;
+  uint src_grf3_dw1;
+  uint src_grf3_dw0;
+  uint src_grf4_dw7;
+  uint src_grf4_dw6;
+  uint src_grf4_dw5;
+  uint src_grf4_dw4;
+  uint src_grf4_dw3;
+  uint src_grf4_dw2;
+  uint src_grf4_dw1;
+  uint src_grf4_dw0;
+
+  uint src_grf0, src_grf1, src_grf2, src_grf3, src_grf4;
+
+  uint8 vme_result = (0, 0, 0, 0, 0, 0, 0, 0);
+  int x_id = get_global_id(0);
+  int y_id = get_global_id(1);
+
+  int gid_0 = get_group_id(0);
+  int gid_1 = get_group_id(1);
+
+  uint2 srcCoord = 0;
+
+  /*srcCoord.x = gid_0 * 16 + get_global_offset(0);
+  srcCoord.y = gid_1 * 16 + get_global_offset(1); */
+  srcCoord.x = gid_0 * 16;
+  srcCoord.y = gid_1 * 16;
+
+  //CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL
+  if(accel.search_path_type == 0x0){
+    //src_grf0_dw5 = (Ref_Height << 24) | (Ref_Width << 16) | (Ignored << 8) | (Dispatch_Id?);
+    src_grf0_dw5 =   (20 << 24)         | (20 << 16)        | (0 << 8)       | (0);
+    //src_grf0_dw1 = (Ref1Y << 16)  | (Ref1X);
+    src_grf0_dw1 =   0xfffefffe;
+    //src_grf0_dw0 = (Ref0Y << 16)  | (Ref0X);
+    src_grf0_dw0 =   0xfffefffe;
+    //src_grf1_dw2 = (Start1Y << 28)                  | (Start1X << 24)                | (Start0Y << 20)
+    src_grf1_dw2 =   (0 << 28)                        | (0 << 24)                      | (0 << 20)
+                   //| (Start0X << 16)               | (Max_Num_SU << 8)              | (LenSP);
+                     | (0 << 16)                     | (2 << 8)                       | (2);
+  }
+  //CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL
+  else if(accel.search_path_type == 0x1){
+    src_grf0_dw5 =   (24 << 24)         | (24 << 16)        | (0 << 8)       | (0);
+    src_grf0_dw1 =   0xfffcfffc;
+    src_grf0_dw0 =   0xfffcfffc;
+    src_grf1_dw2 =   (0 << 28)                        | (0 << 24)                      | (0 << 20)
+                     | (0 << 16)                     | (48 << 8)                       | (48);
+  }
+  //CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL
+  else if(accel.search_path_type == 0x5){
+    src_grf0_dw5 =   (40 << 24)         | (48 << 16)        | (0 << 8)       | (0);
+    src_grf0_dw1 =   0xfff4fff0;
+    src_grf0_dw0 =   0xfff4fff0;
+    src_grf1_dw2 =   (0 << 28)                        | (0 << 24)                      | (0 << 20)
+                     | (0 << 16)                     | (48 << 8)                       | (48);
+  }
+
+  //src_grf0_dw7 = Debug;
+  src_grf0_dw7 = 0;
+  //src_grf0_dw6 = Debug;
+  src_grf0_dw6 = 0;
+  //src_grf0_dw5 = (Ref_Height << 24) | (Ref_Width << 16) | (Ignored << 8) | (Dispatch_Id?);
+  //src_grf0_dw4 = Ignored;
+  src_grf0_dw4 = 0;
+  //src_grf0_dw3 = (Reserved << 31)                 | (Sub_Mb_Part_Mask << 24)       | (Intra_SAD << 22)
+  src_grf0_dw3 =   (0 << 31)                 | (0x7e << 24)                   | (0 << 22)
+                 //| (Inter_SAD << 20)             | (BB_Skip_Enabled << 19)        | (Reserverd << 18)
+                   | (0 << 20)                     | (0 << 19)                      | (0 << 18)
+                 //| (Dis_Aligned_Src_Fetch << 17) | (Dis_Aligned_Ref_Fetch << 16)  | (Dis_Field_Cache_Alloc << 15)
+                   | (0 << 17)                     | (0 << 16)                      | (0 << 15)
+                 //| (Skip_Type << 14)             | (Sub_Pel_Mode << 12)           | (Dual_Search_Path_Opt << 11)
+                   | (0 << 14)                     | (0 << 12)                      | (0 << 11)
+                 //| (Search_Ctrl << 8)            | (Ref_Access << 7)              | (SrcAccess << 6)
+                   | (0 << 8)                      | (0 << 7)                       | (0 << 6)
+                 //| (Mb_Type_Remap << 4)          | (Reserved_Workaround << 3)     | (Reserved_Workaround << 2)
+                   | (0 << 4)                      | (0 << 3)                       | (0 << 2)
+                 //| (Src_Size);
+                   | (0);
+
+  //src_grf0_dw2 = (SrcY << 16) | (SrcX);
+  src_grf0_dw2 = (srcCoord.y << 16)  | (srcCoord.x);
+  //src_grf0_dw1 = (Ref1Y << 16)  | (Ref1X);
+  //src_grf0_dw0 = (Ref0Y << 16)  | (Ref0X);
+  /*src_grf1_dw7 = (Skip_Center_Mask << 24)         | (Reserved << 22)               | (Ref1_Field_Polarity << 21)
+                 | (Ref0_Field_Polarity << 20)   | (Src_Field_Polarity << 19)     | (Bilinear_Enable << 18)
+                 | (MV_Cost_Scale_Factor << 16)  | (Mb_Intra_Struct << 8)         | (Intra_Corner_Swap << 7)
+                 | (Non_Skip_Mode_Added << 6)    | (Non_Skip_ZMv_Added << 5)      | (IntraPartMask);*/
+  src_grf1_dw7 = 0;
+  //src_grf1_dw6 = Reserved;
+  src_grf1_dw6 = 0;
+  /*src_grf1_dw5 = (Cost_Center1Y << 16)  | (Cost_Center1X);
+  src_grf1_dw4 = (Cost_Center0Y << 16)  | (Cost_Center0X);
+  src_grf1_dw3 = (Ime_Too_Good << 24 )  | (Ime_Too_Bad << 16)  | (Part_Tolerance_Thrhd << 8) | (FBPrunThrhd);*/
+  src_grf1_dw5 = 0;
+  src_grf1_dw4 = 0;
+  src_grf1_dw3 = 0;
+  //src_grf1_dw2 = (Start1Y << 28)                  | (Start1X << 24)                | (Start0Y << 20)
+                 //| (Start0X << 16)               | (Max_Num_SU << 8)              | (LenSP);
+  /*src_grf1_dw1 = (RepartEn << 31)                 | (FBPrunEn << 30)               | (AdaptiveValidationControl << 29)
+                 | (Uni_Mix_Disable << 28)       | (Bi_Sub_Mb_Part_Mask << 24)    | (Reserverd << 22)
+                 | (Bi_Weight << 16)             | (Reserved << 6)                | (MaxNumMVs);*/
+  src_grf1_dw1 = (0 << 24) | (2);
+  /*src_grf1_dw0 = (Early_Ime_Stop << 24)           | (Early_Fme_Success << 16)      | (Skip_Success << 8)
+                 | (T8x8_Flag_For_Inter_En << 7) | (Quit_Inter_En << 6)           | (Early_Ime_Success_En << 5)
+                 | (Early_Success_En << 4)       | (Part_Candidate_En << 3)       | (Bi_Mix_Dis << 2)
+                 | (Adaptive_En  << 1)           | (SkipModeEn);*/
+  src_grf1_dw0 = 0;
+  /*src_grf2_dw7 = Ref1_SkipCenter_3_Delta_XY;
+  src_grf2_dw6 = Ref0_SkipCenter_3_Delta_XY;
+  src_grf2_dw5 = Ref1_SkipCenter_2_Delta_XY;
+  src_grf2_dw4 = Ref0_SkipCenter_3_Delta_XY;
+  src_grf2_dw3 = Ref1_SkipCenter_1_Delta_XY;
+  src_grf2_dw2 = Ref0_SkipCenter_1_Delta_XY;
+  src_grf2_dw1 = Ref1_SkipCenter_0_Delta_XY;
+  src_grf2_dw0 = (Ref0_Skip_Center_0_Delta_Y << 16)  | (Ref0_Skip_Center_0_Delta_X);
+  src_grf3_dw7 = Neighbor pixel Luma value [23, -1] to [20, -1];
+  src_grf3_dw6 = Neighbor pixel Luma value [19, -1] to [16, -1];
+  src_grf3_dw5 = Neighbor pixel Luma value [15, -1] to [12, -1];
+  src_grf3_dw4 = Neighbor pixel Luma value [11, -1] to [8, -1];
+  src_grf3_dw3 = Neighbor pixel Luma value [7, -1] to [4, -1];
+  src_grf3_dw2 = (Neighbor pixel Luma value [3, -1] << 24)    | (Neighbor pixel Luma value [2, -1] << 16)
+                 | (Neighbor pixel Luma value [1, -1] << 8)  | (Neighbor pixel Luma value [0, -1]);
+  //src_grf3_dw1 = (?)  | (Reserved)  | ((Intra_16x16_Mode_Mask);
+  src_grf3_dw0 = (Reserved<<25)  | (Intra_16x16_Mode_Mask << 16)  | (Reserved)  | (Intra_16x16_Mode_Mask);
+  src_grf4_dw7 = Reserved;
+  src_grf4_dw6 = Reserved;
+  src_grf4_dw5 = Reserved;
+  src_grf4_dw4 = (Intra_MxM_Pred_Mode_B15 << 28)    | (Intra_MxM_Pred_Mode_B14 << 24)  | (Intra_MxM_Pred_Mode_B11 << 20)
+                 | (Intra_MxM_Pred_Mode_B10 << 16) | (Intra_MxM_Pred_Mode_A15 << 12)  | (Intra_MxM_Pred_Mode_A13 << 8)
+                 | (Intra_MxM_Pred_Mode_A7 << 4)   | (Intra_MxM_Pred_Mode_A5);
+  //src_grf4_dw3 = (?)  | (Neighbor pixel Luma value [-1, 14] to [-1, 12]);
+  src_grf4_dw2 = Neighbor pixel Luma value [-1, 11] to [-1, 8];
+  src_grf4_dw1 = Neighbor pixel Luma value [-1, 7] to [-1, 4];
+  src_grf4_dw0 = (Neighbor pixel Luma value [-1, 3] << 24)    | (Neighbor pixel Luma value [-1, 2] << 16)
+                 | (Neighbor pixel Luma value [-1, 1] << 8)  | (Neighbor pixel Luma value [-1, 0]);*/
+  src_grf2_dw7 = 0;
+  src_grf2_dw6 = 0;
+  src_grf2_dw5 = 0;
+  src_grf2_dw4 = 0;
+  src_grf2_dw3 = 0;
+  src_grf2_dw2 = 0;
+  src_grf2_dw1 = 0;
+  src_grf2_dw0 = 0;
+  src_grf3_dw7 = 0;
+  src_grf3_dw6 = 0;
+  src_grf3_dw5 = 0;
+  src_grf3_dw4 = 0;
+  src_grf3_dw3 = 0;
+  src_grf3_dw2 = 0;
+  src_grf3_dw1 = 0;
+  src_grf3_dw0 = 0;
+  src_grf4_dw7 = 0;
+  src_grf4_dw6 = 0;
+  src_grf4_dw5 = 0;
+  src_grf4_dw4 = 0;
+  src_grf4_dw3 = 0;
+  src_grf4_dw2 = 0;
+  src_grf4_dw1 = 0;
+  src_grf4_dw0 = 0;
+
+  vme_result = __gen_ocl_vme(src_image, ref_image,
+                src_grf0, src_grf1, src_grf2, src_grf3, src_grf4,
+                src_grf0_dw7, src_grf0_dw6, src_grf0_dw5, src_grf0_dw4,
+                src_grf0_dw3, src_grf0_dw2, src_grf0_dw1, src_grf0_dw0,
+                src_grf1_dw7, src_grf1_dw6, src_grf1_dw5, src_grf1_dw4,
+                src_grf1_dw3, src_grf1_dw2, src_grf1_dw1, src_grf1_dw0,
+                src_grf2_dw7, src_grf2_dw6, src_grf2_dw5, src_grf2_dw4,
+                src_grf2_dw3, src_grf2_dw2, src_grf2_dw1, src_grf2_dw0,
+                src_grf3_dw7, src_grf3_dw6, src_grf3_dw5, src_grf3_dw4,
+                src_grf3_dw3, src_grf3_dw2, src_grf3_dw1, src_grf3_dw0,
+                src_grf4_dw7, src_grf4_dw6, src_grf4_dw5, src_grf4_dw4,
+                src_grf4_dw3, src_grf4_dw2, src_grf4_dw1, src_grf4_dw0,
+                //msg_type, vme_search_path_lut, lut_sub,
+                1, 0, 0);
+
+  barrier(CLK_LOCAL_MEM_FENCE);
+
+  int lid_x = get_local_id(0);
+  uint write_back_grf1_dw0 = __gen_ocl_region(0, vme_result.s1);
+  short2 val = as_short2( write_back_grf1_dw0 );
+  int index = gid_1 * get_num_groups(0) + gid_0;
+  if( lid_x == 0 ){
+    motion_vector_buffer[index] = val;
+  }
+
+}

Comments

> +    if (kernel->vme) {

> +        fixed_local_sz[0] = 16;

> +        fixed_local_sz[1] = 1;


Why it is 16? Does it work for all cases?

> -  if (global_work_size != NULL)

> +  if (kernel->vme) {

> +    fixed_global_sz[0] = (global_work_size[0]+15) / 16 * 16;

> +    fixed_global_sz[1] = (global_work_size[1]+15) / 16;

> +  } else {

>      for (i = 0; i < work_dim; ++i)

>        fixed_global_sz[i] = global_work_size[i];

> +  }

>    if (global_work_offset != NULL)

>      for (i = 0; i < work_dim; ++i)

>        fixed_global_off[i] = global_work_offset[i];




> @@ -140,10 +140,16 @@

> cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k)

> 

>      image = cl_mem_image(k->args[id].mem);

>      set_image_info(k->curbe, &k->images[i], image);

> -    cl_gpgpu_bind_image(gpgpu, k->images[i].idx, image->base.bo, image-

> >offset,

> -                        image->intel_fmt, image->image_type, image->bpp,

> -                        image->w, image->h, image->depth,

> -                        image->row_pitch, image->slice_pitch, (cl_gpgpu_tiling)image-

> >tiling);


Please check for invalid image format.

> +    if(k->vme)

> +      cl_gpgpu_bind_image_for_vme(gpgpu, k->images[i].idx, image->base.bo,

> image->offset,

> +                          image->intel_fmt, image->image_type, image->bpp,

> +                          image->w, image->h, image->depth,

> +                          image->row_pitch, image->slice_pitch, (cl_gpgpu_tiling)image-

> >tiling);



>    /* Bind all samplers */

> -  cl_gpgpu_bind_sampler(gpgpu, ker->samplers, ker->sampler_sz);

> +  if (ker->vme)

> +    cl_gpgpu_bind_vme_state(gpgpu, ker->accel);

> +  else

> +    cl_gpgpu_bind_sampler(gpgpu, ker->samplers, ker->sampler_sz);

Like sampler state, I hope we can gather the vme count used in the kernel, and don't need to set the state of all VMEs. 


> diff --git a/src/cl_context.h b/src/cl_context.h

> --- a/src/cl_extensions.c

> +++ b/src/cl_extensions.c

> @@ -40,7 +40,7 @@ void check_opt1_extension(cl_extensions_t *extensions)

>    int id;

>    for(id = OPT1_EXT_START_ID; id <= OPT1_EXT_END_ID; id++)

>    {

> -    if (id == EXT_ID(khr_icd))

> +    if (id == EXT_ID(khr_icd) || id == EXT_ID(intel_accelerator) || id ==

> EXT_ID(intel_motion_estimation))

>        extensions->extensions[id].base.ext_enabled = 1;

>  #if  LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5

>      if (id == EXT_ID(khr_spir))

> @@ -63,7 +63,9 @@ check_gl_extension(cl_extensions_t *extensions) {

>  void

>  check_intel_extension(cl_extensions_t *extensions)

>  {

> -  /* Should put those map/unmap extensions here. */

> +  int id;

> +  for(id = INTEL_EXT_START_ID; id <= INTEL_EXT_END_ID; id++)

> +    extensions->extensions[id].base.ext_enabled = 1;


Is this a duplicate of code in check_op1_extension()?

> +

> +static void

> +intel_gpgpu_bind_vme_state_gen7(intel_gpgpu_t *gpgpu, cl_accelerator_intel

> accel)

> +{

> +  int index;


As I said, setting all the vme states is not a good idea, please refine this in next step.

> +  for(index = 0; index < GEN_MAX_VME_STATES; index++)

> +    intel_gpgpu_insert_vme_state_gen7(gpgpu, accel, index);

> +}

> +

>  static void


I have discussed with Chuanbo, src_grf0, src_grf1,... src_grf4 don't need to be passed in to __gen_ocl_vme(), they are not real arguments.

Thanks!
Ruiling
> +  vme_result = __gen_ocl_vme(src_image, ref_image,

> +                src_grf0, src_grf1, src_grf2, src_grf3, src_grf4,

> +                src_grf0_dw7, src_grf0_dw6, src_grf0_dw5, src_grf0_dw4,

> +                src_grf0_dw3, src_grf0_dw2, src_grf0_dw1, src_grf0_dw0,

> +                src_grf1_dw7, src_grf1_dw6, src_grf1_dw5, src_grf1_dw4,

> +                src_grf1_dw3, src_grf1_dw2, src_grf1_dw1, src_grf1_dw0,

> +                src_grf2_dw7, src_grf2_dw6, src_grf2_dw5, src_grf2_dw4,

> +                src_grf2_dw3, src_grf2_dw2, src_grf2_dw1, src_grf2_dw0,

> +                src_grf3_dw7, src_grf3_dw6, src_grf3_dw5, src_grf3_dw4,

> +                src_grf3_dw3, src_grf3_dw2, src_grf3_dw1, src_grf3_dw0,

> +                src_grf4_dw7, src_grf4_dw6, src_grf4_dw5, src_grf4_dw4,

> +                src_grf4_dw3, src_grf4_dw2, src_grf4_dw1, src_grf4_dw0,

> +                //msg_type, vme_search_path_lut, lut_sub,

> +                1, 0, 0);

> +

> +  barrier(CLK_LOCAL_MEM_FENCE);

> +

> +  int lid_x = get_local_id(0);

> +  uint write_back_grf1_dw0 = __gen_ocl_region(0, vme_result.s1);

> +  short2 val = as_short2( write_back_grf1_dw0 );

> +  int index = gid_1 * get_num_groups(0) + gid_0;

> +  if( lid_x == 0 ){

> +    motion_vector_buffer[index] = val;

> +  }

> +

> +}

> --

> 1.9.1

> 

> _______________________________________________

> Beignet mailing list

> Beignet@lists.freedesktop.org

> http://lists.freedesktop.org/mailman/listinfo/beignet
Regarding "fixed_local_sz[0] = 16", the reason is that the basic unit of VME hardware is 16*16 pixels, and our design is to handle 1*16 pixels in a work item, and use 16*1 as local size, so, each group is a basic unit of VME.

For the extension concern "Is this a duplicate of code in check_op1_extension()?",  yes, it is a duplicate code, will be removed in v2.

For others, Chuanbo will refine and send out the v2 patch.

-----Original Message-----
From: Song, Ruiling 

Sent: Sunday, September 06, 2015 3:02 PM
To: Weng, Chuanbo; beignet@lists.freedesktop.org
Cc: Guo, Yejun; Weng, Chuanbo
Subject: RE: [Beignet] [PATCH 2/4] add extensions intel_accelerator and basic intel_motion_estimation

> +    if (kernel->vme) {

> +        fixed_local_sz[0] = 16;

> +        fixed_local_sz[1] = 1;


Why it is 16? Does it work for all cases?

> -  if (global_work_size != NULL)

> +  if (kernel->vme) {

> +    fixed_global_sz[0] = (global_work_size[0]+15) / 16 * 16;

> +    fixed_global_sz[1] = (global_work_size[1]+15) / 16;  } else {

>      for (i = 0; i < work_dim; ++i)

>        fixed_global_sz[i] = global_work_size[i];

> +  }

>    if (global_work_offset != NULL)

>      for (i = 0; i < work_dim; ++i)

>        fixed_global_off[i] = global_work_offset[i];




> @@ -140,10 +140,16 @@

> cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k)

> 

>      image = cl_mem_image(k->args[id].mem);

>      set_image_info(k->curbe, &k->images[i], image);

> -    cl_gpgpu_bind_image(gpgpu, k->images[i].idx, image->base.bo, image-

> >offset,

> -                        image->intel_fmt, image->image_type, image->bpp,

> -                        image->w, image->h, image->depth,

> -                        image->row_pitch, image->slice_pitch, (cl_gpgpu_tiling)image-

> >tiling);


Please check for invalid image format.

> +    if(k->vme)

> +      cl_gpgpu_bind_image_for_vme(gpgpu, k->images[i].idx, 

> + image->base.bo,

> image->offset,

> +                          image->intel_fmt, image->image_type, image->bpp,

> +                          image->w, image->h, image->depth,

> +                          image->row_pitch, image->slice_pitch, 

> + (cl_gpgpu_tiling)image-

> >tiling);



>    /* Bind all samplers */

> -  cl_gpgpu_bind_sampler(gpgpu, ker->samplers, ker->sampler_sz);

> +  if (ker->vme)

> +    cl_gpgpu_bind_vme_state(gpgpu, ker->accel);  else

> +    cl_gpgpu_bind_sampler(gpgpu, ker->samplers, ker->sampler_sz);

Like sampler state, I hope we can gather the vme count used in the kernel, and don't need to set the state of all VMEs. 


> diff --git a/src/cl_context.h b/src/cl_context.h

> --- a/src/cl_extensions.c

> +++ b/src/cl_extensions.c

> @@ -40,7 +40,7 @@ void check_opt1_extension(cl_extensions_t *extensions)

>    int id;

>    for(id = OPT1_EXT_START_ID; id <= OPT1_EXT_END_ID; id++)

>    {

> -    if (id == EXT_ID(khr_icd))

> +    if (id == EXT_ID(khr_icd) || id == EXT_ID(intel_accelerator) || 

> + id ==

> EXT_ID(intel_motion_estimation))

>        extensions->extensions[id].base.ext_enabled = 1;  #if  

> LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5

>      if (id == EXT_ID(khr_spir))

> @@ -63,7 +63,9 @@ check_gl_extension(cl_extensions_t *extensions) {  

> void  check_intel_extension(cl_extensions_t *extensions)  {

> -  /* Should put those map/unmap extensions here. */

> +  int id;

> +  for(id = INTEL_EXT_START_ID; id <= INTEL_EXT_END_ID; id++)

> +    extensions->extensions[id].base.ext_enabled = 1;


Is this a duplicate of code in check_op1_extension()?

> +

> +static void

> +intel_gpgpu_bind_vme_state_gen7(intel_gpgpu_t *gpgpu, 

> +cl_accelerator_intel

> accel)

> +{

> +  int index;


As I said, setting all the vme states is not a good idea, please refine this in next step.

> +  for(index = 0; index < GEN_MAX_VME_STATES; index++)

> +    intel_gpgpu_insert_vme_state_gen7(gpgpu, accel, index); }

> +

>  static void


I have discussed with Chuanbo, src_grf0, src_grf1,... src_grf4 don't need to be passed in to __gen_ocl_vme(), they are not real arguments.

Thanks!
Ruiling
> +  vme_result = __gen_ocl_vme(src_image, ref_image,

> +                src_grf0, src_grf1, src_grf2, src_grf3, src_grf4,

> +                src_grf0_dw7, src_grf0_dw6, src_grf0_dw5, src_grf0_dw4,

> +                src_grf0_dw3, src_grf0_dw2, src_grf0_dw1, src_grf0_dw0,

> +                src_grf1_dw7, src_grf1_dw6, src_grf1_dw5, src_grf1_dw4,

> +                src_grf1_dw3, src_grf1_dw2, src_grf1_dw1, src_grf1_dw0,

> +                src_grf2_dw7, src_grf2_dw6, src_grf2_dw5, src_grf2_dw4,

> +                src_grf2_dw3, src_grf2_dw2, src_grf2_dw1, src_grf2_dw0,

> +                src_grf3_dw7, src_grf3_dw6, src_grf3_dw5, src_grf3_dw4,

> +                src_grf3_dw3, src_grf3_dw2, src_grf3_dw1, src_grf3_dw0,

> +                src_grf4_dw7, src_grf4_dw6, src_grf4_dw5, src_grf4_dw4,

> +                src_grf4_dw3, src_grf4_dw2, src_grf4_dw1, src_grf4_dw0,

> +                //msg_type, vme_search_path_lut, lut_sub,

> +                1, 0, 0);

> +

> +  barrier(CLK_LOCAL_MEM_FENCE);

> +

> +  int lid_x = get_local_id(0);

> +  uint write_back_grf1_dw0 = __gen_ocl_region(0, vme_result.s1);

> +  short2 val = as_short2( write_back_grf1_dw0 );  int index = gid_1 * 

> + get_num_groups(0) + gid_0;  if( lid_x == 0 ){

> +    motion_vector_buffer[index] = val;  }

> +

> +}

> --

> 1.9.1

> 

> _______________________________________________

> Beignet mailing list

> Beignet@lists.freedesktop.org

> http://lists.freedesktop.org/mailman/listinfo/beignet